def parse_subject(self, response):
    """Parse a movie subject page and schedule its comment/review crawls.

    The API record attached to the originating request (``request.info``) is
    combined with fields scraped from the HTML page and with the dictionary
    returned by ``self.parse_subject_one``. The merged record is then
    attached to two follow-up requests: one for short comments and one for
    long reviews.

    Args:
        response: Response for the subject page; ``response.request.info``
            must hold the API record with the keys ``title``, ``id``,
            ``directors``, ``casts``, ``cover``, ``url`` and ``rate``.

    Yields:
        Request: the short-comment request (callback ``self.parse_short``)
        followed by the long-review request (callback ``self.parse_long``).
        Both carry the same ``item`` and the same ``info`` dictionary.
    """
    item = DoubanItem()
    api_info = response.request.info
    actors = response.css(".actor a::text").extract()
    page = response.text
    e = pq(page)  # presumably a PyQuery document -- confirm against imports
    link_report = e("#link-report").text()
    related_pic = [p.attr("href") for p in e(".related-pic-bd a").items()]
    rating_people_text = e(".rating_people span").text()

    # Parse the page into a dictionary of additional fields.
    d = self.parse_subject_one(page)

    # An empty or non-numeric rating would make float()/int() raise and
    # abort the whole callback; fall back to zero so the subject is kept.
    try:
        rating_value = float(api_info["rate"])
    except (TypeError, ValueError):
        rating_value = 0.0
    try:
        rating_people = int(rating_people_text)
    except (TypeError, ValueError):
        rating_people = 0

    sub_info = {
        "title": api_info["title"],
        "movieId": api_info["id"],
        "directors": api_info["directors"],
        "casts": api_info["casts"],
        "actors": actors,
        "category": "电影",
        "coverUrl": api_info["cover"],
        "movieUrl": api_info["url"],
        "ratingValue": rating_value,
        "introduction": link_report,
        "ratingPeople": rating_people,
        "comments": {
            "shortComment": [],
            "longComment": [],
        },
        "commentStatus": {
            # Set to True once everything has been crawled.
            "short": False,
            "long": False,
            "full": True,
        },
        "etc": {
            "relatedPictures": related_pic,
        },
    }
    # Keys built here take precedence over those from parse_subject_one.
    sub_info = dict(d, **sub_info)
    movie_id = api_info["id"]

    # Short comments.
    short_url = "https://movie.douban.com/subject/{}/comments?start=0&limit=20&sort=new_score&status=P".format(
        movie_id)
    request = Request(short_url, callback=self.parse_short)
    request.item = item
    request.info = sub_info
    yield request

    # Long reviews.
    long_url = "https://movie.douban.com/subject/{}/reviews?start=0".format(
        movie_id)
    request2 = Request(long_url, callback=self.parse_long)
    request2.info = sub_info
    request2.item = item
    yield request2
    # Pause after both requests have been handed out.
    time.sleep(self.SLEEP)
def parse(self, response):
    """Handle one page of the subject-search JSON listing.

    Each entry of the ``data`` array becomes a request for the entry's
    ``url`` (callback ``self.parse_subject``) with the entry attached as
    ``request.info``. After a non-empty page the page counter
    ``self.count`` is incremented and the next listing page is requested.
    An empty page ends the pagination.

    Args:
        response: Response whose ``text`` is a JSON object.

    Yields:
        Request: one per listed subject, then one for the next listing page.
    """
    payload = json.loads(response.text)
    # A payload without "data" (or with a null value) is treated as an
    # empty page instead of raising KeyError/TypeError.
    data = payload.get('data') or []
    if not data:
        return

    for d in data:
        request = Request(url=d.get('url'), callback=self.parse_subject)
        request.info = d
        yield request

    self.count += 1
    # The listing is paged in steps of 20 entries.
    url = 'https://movie.douban.com/j/new_search_subjects?range=0,10&start={}'.format(
        self.count * 20)
    yield Request(url=url, callback=self.parse)
def start_requests(self):
    """Match every iqiyi movie against Douban, by database or by search.

    For each document in ``self.db.iqiyi_movie``:

    1. Look for a document with the same title in ``self.db.v_doubanMovie``.
    2. If one exists, attach the iqiyi reference under ``relation`` and
       upsert it into ``self.write_db.v_doubanMovie`` -- no request is sent.
    3. Otherwise send a title search to the Douban API; the response is
       handled by ``self.parse`` with the iqiyi reference attached as
       ``request.info``.

    Yields:
        Request: one search request per title that had no local match.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    for res in self.db.iqiyi_movie.find():
        title = res.get("title")
        # Tolerate documents without an "info" sub-document instead of
        # failing with AttributeError and ending the whole generator.
        info = res.get("info") or {}
        iqiyi_info = {
            "title": title,
            "videoId": info.get("videoId"),
            "mongoId": res.get("_id"),
            "mongoDBName": "xiaociwei",
            "tableName": "v_iqiyi_movie",
        }

        # Check the Douban collection for a title match first.
        r = self.db.v_doubanMovie.find_one({"title": title})
        if r is not None:
            # Title matches a stored Douban movie: store the relation and
            # skip the search request. "_id" must not be part of "$set".
            r.pop("_id", None)
            query2 = {"movieId": r["movieId"]}
            r['relation'] = iqiyi_info
            print("爱奇艺名字和豆瓣库的匹配, 正在入库", r)
            self.count += 1
            print("匹配成功总数", self.count)
            self.write_db.v_doubanMovie.update(query2, {"$set": r}, upsert=True)
        else:
            # Percent-encode the title so characters such as "&", "#", "+"
            # or spaces cannot truncate or corrupt the query string.
            url = "http://api.douban.com/v2/movie/search?q={}".format(
                quote(str(title), safe=""))
            request = Request(url, callback=self.parse)
            request.info = iqiyi_info
            yield request
            # Throttle outgoing search requests.
            time.sleep(1)
def parse(self, response):
    """Compare the first Douban search hit with the iqiyi title.

    When the first subject's title equals the iqiyi title, the iqiyi
    reference is upserted as ``relation`` on the Douban movie with that id.
    Otherwise the subject's ``alt`` page is requested (callback
    ``self.parse_sub``) for a closer check. A search without any subjects
    produces nothing.

    Args:
        response: Search API response; ``response.request.info`` holds the
            iqiyi reference dictionary built by the requesting code.

    Yields:
        Request: at most one request for the first subject's page, carrying
        ``info`` (the iqiyi reference) and ``doubanId``.
    """
    iqiyi_info = response.request.info
    result = json.loads(response.text)

    # No hits (missing, null or empty "subjects"): nothing to compare.
    # Indexing [0] unconditionally would raise here.
    subjects = result.get("subjects") or []
    if not subjects:
        return
    first = subjects[0]

    if first.get("title") == iqiyi_info.get("title"):
        # Titles match: store the relation.
        query2 = {"movieId": first["id"]}
        insert = {
            'relation': iqiyi_info,
        }
        print("搜索结果第一个名字匹配成功,正在入库", first.get("title"))
        self.count += 1
        print("匹配成功总数", self.count)
        self.write_db.v_doubanMovie.update(query2, {"$set": insert}, upsert=True)
    else:
        # Titles differ: fetch the subject page for a closer check.
        url = first.get("alt")
        request = Request(url, callback=self.parse_sub)
        request.info = iqiyi_info
        request.doubanId = first.get("id")
        print("搜索结果第一个名字不匹配,正在发起请求", first.get("title"))
        yield request