コード例 #1
0
 def parse_subject(self, response):
     """Parse a movie subject page and schedule its comment/review crawls.

     Merges the search-API record attached to the originating request
     (``response.request.info``) with fields scraped from the page, then
     yields two requests: the first page of short comments (handled by
     ``parse_short``) and the first page of long reviews (handled by
     ``parse_long``). Both requests carry the same ``item`` object and the
     same ``info`` dict.

     A missing or non-numeric rating / vote count is stored as ``0.0`` /
     ``0`` instead of raising, so a page without rating data still gets
     its comment requests scheduled.
     """
     item = DoubanItem()
     api_info = response.request.info
     actors = response.css(".actor a::text").extract()
     page = response.text
     doc = pq(page)
     introduction = doc("#link-report").text()
     related_pic = [a.attr("href") for a in doc(".related-pic-bd a").items()]
     rating_people_text = doc(".rating_people span").text()

     # Either value may be empty or non-numeric; fall back to zero rather
     # than aborting the whole callback with a ValueError.
     try:
         rating_value = float(api_info["rate"])
     except (TypeError, ValueError):
         rating_value = 0.0
     try:
         rating_people = int(rating_people_text)
     except (TypeError, ValueError):
         rating_people = 0

     # Parse the page into a dict of additional fields.
     page_fields = self.parse_subject_one(page)
     sub_info = {
         "title": api_info["title"],
         "movieId": api_info["id"],
         "directors": api_info["directors"],
         "casts": api_info["casts"],
         "actors": actors,
         "category": "电影",
         "coverUrl": api_info["cover"],
         "movieUrl": api_info["url"],
         "ratingValue": rating_value,
         "introduction": introduction,
         "ratingPeople": rating_people,
         "comments": {
             "shortComment": [],
             "longComment": [],
         },
         "commentStatus": {
             # Set to True once everything has been fetched.
             "short": False,
             "long": False,
             "full": True,
         },
         "etc": {
             "relatedPictures": related_pic,
         },
     }
     # Page-derived fields go in first; the explicit keys above win on conflict.
     sub_info = dict(page_fields, **sub_info)
     movie_id = api_info["id"]

     # Short comments.
     short_url = "https://movie.douban.com/subject/{}/comments?start=0&limit=20&sort=new_score&status=P".format(
         movie_id)
     short_request = Request(short_url, callback=self.parse_short)
     short_request.item = item
     short_request.info = sub_info
     yield short_request

     # Long reviews.
     long_url = "https://movie.douban.com/subject/{}/reviews?start=0".format(
         movie_id)
     long_request = Request(long_url, callback=self.parse_long)
     long_request.info = sub_info
     long_request.item = item
     yield long_request

     # Crude throttle; runs when the generator is resumed after the second
     # request has been consumed.
     # NOTE(review): time.sleep blocks the calling thread. If this callback
     # runs inside an event loop, a non-blocking download delay setting
     # would be the safer option; confirm against the crawler setup.
     time.sleep(self.SLEEP)
コード例 #2
0
 def parse(self, response):
     """Fan out one page of search results and request the next page.

     The response body is expected to be JSON with a ``data`` list. Each
     entry that has a ``url`` yields a request handled by
     ``parse_subject``, with the entry attached as ``request.info``. After
     a non-empty page, ``self.count`` is incremented and the next page is
     requested with this method as callback. Pagination stops on an empty
     page.

     A payload without a usable ``data`` list (for example an error
     object) stops pagination with a warning instead of raising
     ``KeyError`` / ``TypeError``.
     """
     import logging

     payload = json.loads(response.text)
     data = payload.get('data') if isinstance(payload, dict) else None
     if not data:
         if data is None:
             logging.getLogger(__name__).warning(
                 "search response has no 'data' list; stopping pagination")
         return

     for entry in data:
         url = entry.get('url')
         if not url:
             # Skip entries without a URL so one bad entry does not abort
             # the rest of the page and the next-page request.
             continue
         request = Request(url=url, callback=self.parse_subject)
         request.info = entry
         yield request

     self.count += 1
     # The offset advances by 20 per page; presumably the endpoint's page
     # size (confirm against the API).
     next_url = 'https://movie.douban.com/j/new_search_subjects?range=0,10&start={}'.format(
         self.count * 20)
     yield Request(url=next_url, callback=self.parse)
コード例 #3
0
 def start_requests(self):
     """Relate stored iQiyi movies to Douban records, searching when needed.

     For every document in ``self.db.iqiyi_movie``:

     1. Look for a Douban record with exactly the same title in
        ``self.db.v_doubanMovie``.
     2. If one exists, attach the iQiyi reference under ``relation`` and
        upsert it into ``self.write_db.v_doubanMovie``; no search request
        is made.
     3. Otherwise yield a Douban search request for the title, handled by
        ``parse``, with the iQiyi reference attached as ``request.info``.

     Documents without a title are skipped: a ``{"title": None}`` query
     could match unrelated records, and the search request would look for
     the literal text "None".
     """
     from urllib.parse import quote

     for res in self.db.iqiyi_movie.find():
         title = res.get("title")
         if not title:
             continue

         iqiyi_info = {
             "title": title,
             # Tolerate documents that have no "info" sub-document.
             "videoId": (res.get("info") or {}).get("videoId"),
             "mongoId": res.get("_id"),
             "mongoDBName": "xiaociwei",
             "tableName": "v_iqiyi_movie",
         }

         # Check the Douban collection for an exact title match.
         r = self.db.v_doubanMovie.find_one({"title": title})
         if r is not None:
             # Title matches a stored Douban record: store the relation
             # without issuing a search request.
             # Drop the source document's _id so it is not part of $set.
             del r["_id"]
             query2 = {"movieId": r["movieId"]}
             r['relation'] = iqiyi_info
             print("爱奇艺名字和豆瓣库的匹配, 正在入库", r)
             self.count += 1
             print("匹配成功总数", self.count)
             self.write_db.v_doubanMovie.update(query2, {"$set": r},
                                                upsert=True)
         else:
             # Percent-encode the title so characters such as "&", "#" or
             # "+" cannot truncate or alter the query string.
             url = "http://api.douban.com/v2/movie/search?q={}".format(
                 quote(str(title)))
             request = Request(url, callback=self.parse)
             request.info = iqiyi_info
             yield request
         # Pause one second between source documents.
         time.sleep(1)
コード例 #4
0
 def parse(self, response):
     """Handle a Douban search response for one iQiyi title.

     Reads the iQiyi reference from ``response.request.info`` and looks at
     the first entry of the JSON ``subjects`` list:

     * If its title equals the iQiyi title, upsert the relation into
       ``self.write_db.v_doubanMovie`` keyed by the Douban id.
     * Otherwise yield a request for the entry's ``alt`` URL, handled by
       ``parse_sub``, carrying the iQiyi reference and the Douban id.

     A response with no ``subjects`` (empty list, missing key, or a
     non-object payload) is reported and skipped instead of raising
     ``IndexError`` / ``TypeError``.
     """
     iqiyi_info = response.request.info
     result = json.loads(response.text)
     subjects = result.get("subjects") if isinstance(result, dict) else None
     if not subjects:
         print("搜索无结果,跳过", iqiyi_info.get("title"))
         return

     first = subjects[0]
     if first.get("title") == iqiyi_info.get("title"):
         # Titles match: store the relation.
         query2 = {"movieId": first["id"]}
         insert = {
             'relation': iqiyi_info,
         }
         print("搜索结果第一个名字匹配成功,正在入库", first.get("title"))
         self.count += 1
         print("匹配成功总数", self.count)
         self.write_db.v_doubanMovie.update(query2, {"$set": insert},
                                            upsert=True)
     else:
         # Titles differ: fetch the first result's own page for a closer check.
         url = first.get("alt")
         request = Request(url, callback=self.parse_sub)
         request.info = iqiyi_info
         request.doubanId = first.get("id")
         print("搜索结果第一个名字不匹配,正在发起请求", first.get("title"))
         yield request