コード例 #1
0
 def parse_subject(self, response):
     """Parse a movie subject page and schedule its comment crawls.

     Combines the API record attached to the originating request
     (``response.request.info``) with data scraped from the page itself,
     then yields two follow-up requests: the short-comment listing
     (callback ``parse_short``) and the review listing (callback
     ``parse_long``). Both requests carry the same ``DoubanItem`` and the
     same merged info dict, attached as ``item`` / ``info`` attributes.

     Once the second request has been handed over, the generator calls
     ``time.sleep(self.SLEEP)`` before finishing.
     """
     item = DoubanItem()
     api_info = response.request.info
     actors = response.css(".actor a::text").extract()

     html = response.text
     doc = pq(html)
     introduction = doc("#link-report").text()
     related_pictures = [
         anchor.attr("href")
         for anchor in doc(".related-pic-bd a").items()
     ]
     rating_people = doc(".rating_people span").text()

     # Parse the page into a dict; it serves as the base of the record.
     page_fields = self.parse_subject_one(html)

     # NOTE(review): float()/int() below raise ValueError when the rate or
     # the rating-count text is empty or non-numeric -- confirm both are
     # always present for the pages this spider visits.
     scraped_fields = {
         "title": api_info["title"],
         "movieId": api_info["id"],
         "directors": api_info["directors"],
         "casts": api_info["casts"],
         "actors": actors,
         "category": "电影",
         "coverUrl": api_info["cover"],
         "movieUrl": api_info["url"],
         "ratingValue": float(api_info["rate"]),
         "introduction": introduction,
         "ratingPeople": int(rating_people),
         "comments": {
             "shortComment": [],
             "longComment": [],
         },
         "commentStatus": {
             # Set to True once everything has been crawled.
             "short": False,
             "long": False,
             "full": True,
         },
         "etc": {
             "relatedPictures": related_pictures,
         }
     }
     # On a key clash the values assembled here win over the page parser's.
     merged_info = dict(page_fields)
     merged_info.update(scraped_fields)

     movie_id = api_info["id"]
     follow_ups = (
         # Short comments.
         ("https://movie.douban.com/subject/{}/comments?start=0&limit=20&sort=new_score&status=P",
          self.parse_short),
         # Reviews.
         ("https://movie.douban.com/subject/{}/reviews?start=0",
          self.parse_long),
     )
     for url_template, callback in follow_ups:
         follow_up = Request(url_template.format(movie_id), callback=callback)
         # Both callbacks receive the very same item and info objects.
         follow_up.item = item
         follow_up.info = merged_info
         yield follow_up

     # NOTE(review): time.sleep blocks the calling thread; if this runs on
     # the crawler's event-loop thread it pauses all other work -- confirm
     # this throttling is intended.
     time.sleep(self.SLEEP)
コード例 #2
0
 def get_media_requests(self, item, info):
     """Yield one download request per URL listed in ``item['大图']``.

     Each request uses the image URL itself as the ``referer`` header and
     carries two extra attributes: ``item`` (the item being processed) and
     ``name`` (a numeric file name such as ``"2.jpg"``). ``info`` is not
     used here.
     """
     # File names are sequential numbers; the first image is named "2.jpg".
     for number, image_url in enumerate(item['大图'], start=2):
         # NOTE(review): this writes into the shared self.default_headers
         # dict, so the last referer set here stays on it afterwards.
         self.default_headers['referer'] = image_url
         image_request = Request(image_url, headers=self.default_headers)
         # Pass the item and target file name along with the request.
         image_request.item = item
         image_request.name = str(number) + ".jpg"
         yield image_request
コード例 #3
0
 def parseAndroid(self, response):
     """Build an ``UptodownItem`` from an app page and request its download page.

     Stores the raw HTML and the page URL on the item, copies over every
     value produced by ``self.__parseAndroid`` whose key is one of the
     item's fields, and yields a request for ``<page url>/download`` with
     ``parseDownload`` as callback. The item travels on the request as its
     ``item`` attribute.
     """
     item = UptodownItem()
     page_html = response.text
     item["android_html"] = page_html
     print("包含描述信息的页面", response.url)
     item["android_url"] = response.url

     # Run the page parser, then keep only the keys the item has a field for.
     parsed = self.__parseAndroid(page_html)
     for field in item.fields:
         if field not in parsed.keys():
             continue
         item[field] = parsed[field]

     # dont_filter=True: request the download page even if the duplicate
     # filter has seen this URL before.
     download_request = Request(
         response.url + "/download",
         callback=self.parseDownload,
         dont_filter=True,
     )
     download_request.item = item
     yield download_request
コード例 #4
0
    def parse_sina(self, response):
        """Parse the Sina ranking page and request its ranking scripts.

        Walks every ``.loopblk`` block except those at positions 1 and 2
        (the picture and video rankings). Inside each remaining block only
        the first three ``.Cons`` containers are used; by position they are
        labelled as the click, comment and share rankings. For each
        container one ``<script>`` ``src`` is requested with
        ``self._parse_script`` as callback: the first script for the click
        and comment rankings, the second script for the share ranking. The
        pre-filled ``SougoutopItem`` (``url``, ``missionCreateTime``,
        ``info``) travels on the request as its ``item`` attribute.

        Whether parsing succeeds or raises, the ``finally`` clause yields a
        request for this same URL again (``dont_filter=True``) and then
        sleeps for ``self.avg_sleep`` seconds.
        """
        # Label suffix per .Cons position: clicks, comments, shares.
        rank_suffixes = {0: "点击量排行", 1: "评论数排行", 2: "分享数排行"}
        try:
            doc = pq(response.text)
            for block_pos, block in enumerate(doc(".loopblk").items()):
                if block_pos in (1, 2):
                    # Drop the picture ranking and the video ranking.
                    continue

                # Heading of this block, e.g. the overall news ranking.
                rank_name = block(".lbti h2").text()
                for index, cons in enumerate(block(".Cons").items()):
                    if index > 2:
                        # Only the first three containers are wanted; no
                        # request is ever built for any later one.
                        break

                    item = SougoutopItem()
                    item['url'] = response.url
                    item["missionCreateTime"] = datetime.now()
                    item["info"] = {
                        'news_type': "新浪" + rank_name + rank_suffixes[index],
                    }

                    # Click and comment rankings load from the first
                    # <script>; the share ranking loads from the second.
                    wanted_script = 1 if index == 2 else 0
                    for script_pos, script in enumerate(cons("script").items()):
                        if script_pos != wanted_script:
                            continue
                        request = Request(
                            script.attr("src"),
                            callback=self._parse_script,
                            dont_filter=True)
                        request.item = item
                        yield request
                        break
        finally:
            # Re-queue this page so it is fetched again, even when parsing
            # above raised.
            print("sina 正在添加新任务至队列头部")
            yield Request(url=response.url,
                          callback=self.parse_sina,
                          dont_filter=True)
            # NOTE(review): time.sleep blocks the calling thread; if this
            # runs on the crawler's event-loop thread it pauses all other
            # work -- confirm this throttling is intended.
            time.sleep(self.avg_sleep)