Example #1
    def parse_json(self, response):
        print("json")
        div_xpath = '//div[@class="news_li xh-highlight" or @class="news_li"]'
        title_xpath = 'h2/a/text()'
        url_xpath = 'h2/a/@href'
        summary_xpath = 'p/text()'
        image_url_xpath = 'div[@class="news_tu"]/a/img/@src'
        news_time_xpath = 'div[@class="pdtt_trbs"]/span[1]/text()'

        cnt = 0
        re_selectors = response.xpath(div_xpath)
        for re_selector in re_selectors:
            title = re_selector.xpath(title_xpath).extract()
            url = re_selector.xpath(url_xpath).extract()
            # url_md5 = get_md5(url)
            summary = re_selector.xpath(summary_xpath).extract()
            img_urls = re_selector.xpath(image_url_xpath).extract()
            category = response.meta.get("category", "")
            from_platform = self.from_platform
            news_time = re_selector.xpath(news_time_xpath).extract()
            crawl_time = datetime.datetime.now().date()  # date the item was crawled

            # skip special rows (e.g. the "亲爱的会员" members-only notice)
            if not (title and url and news_time and img_urls):
                continue

            # set a default score to avoid errors downstream
            if len(news_time) >= 1:
                news_score = get_score(news_time[0])
            else:
                news_score = 1.0
            base_url = 'https://www.thepaper.cn/'
            img_url = make_str('http:', img_urls)
            news_itemloader = ItemLoader(item=NewsItem(), response=response)
            news_itemloader.add_value("title", title)
            news_itemloader.add_value("image_urls", img_url)
            news_itemloader.add_value("url", parse.urljoin(base_url, url[0]))
            news_itemloader.add_value("url_md5", get_md5(url[0]))
            news_itemloader.add_value("category", category)
            news_itemloader.add_value("summary", summary)
            news_itemloader.add_value("from_platform", from_platform)
            news_itemloader.add_value("news_time", news_time)
            news_itemloader.add_value("crawl_time", crawl_time)
            news_itemloader.add_value("news_score", news_score)
            news_item = news_itemloader.load_item()

            # # limit the item count when testing the JSON output
            # cnt = cnt + 1
            # if cnt == 2:
            #     break

            yield news_item

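All of these examples depend on imports and project helpers that are never shown (get_md5, get_score, make_str, NewsItem, the urllib parse module). Below is a minimal sketch of the imports, plus plausible bodies for two of the helpers inferred only from how they are called above; the real project's versions may differ.

import datetime
import hashlib
from urllib import parse

from scrapy import Request
from scrapy.loader import ItemLoader


def get_md5(url):
    # assumed helper: hash a URL into a stable hex digest for use as a key
    if isinstance(url, str):
        url = url.encode('utf-8')
    return hashlib.md5(url).hexdigest()


def make_str(scheme, urls):
    # assumed helper: prefix protocol-relative image URLs ("//img...") with a scheme
    return [scheme + u if u.startswith('//') else u for u in urls]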
Example #2
    def parse_detail(self, response):
        print("detail")
        # html = Selector(response)
        cnt = 0
        div_xpath = '//div/div[@class="item-inner"]'
        title_xpath = 'h2//a/text()'
        url_xpath = 'h2//a/@href'
        summary_xpath = 'div[@class="item-lead"]/text()'
        image_url_xpath = 'a[@class="image"]/figure/@data-url'
        news_time_xpath = 'div[@class="item-time"]/text()'

        re_selectors = response.xpath(div_xpath)
        for re_selector in re_selectors:
            title = re_selector.xpath(title_xpath).extract()
            url = re_selector.xpath(url_xpath).extract()
            # url_md5 = get_md5(url)
            summary = re_selector.xpath(summary_xpath).extract()
            img_url = re_selector.xpath(image_url_xpath).extract()
            category = response.meta.get("category", "")
            from_platform = self.from_platform
            news_time = re_selector.xpath(news_time_xpath).extract()
            crawl_time = datetime.datetime.now().date()  # date the item was crawled

            # sanity-check the crawl by dumping the raw response
            # with open('test.html','wb') as fp:
            #     fp.write(response.text.encode('utf-8'))
            #     fp.close()

            # skip special rows (e.g. the "亲爱的会员" members-only notice)
            if not (title and url and news_time and img_url):
                continue

            # set a default score to avoid errors downstream
            if len(news_time) >= 1:
                news_score = get_score(news_time[0])
            else:
                news_score = 1.0

            # news_item = NewsItem()
            # news_item['title'] = title
            # news_item['image_urls'] = img_url
            # news_item['url'] = url
            # news_item['url_md5'] = get_md5(url)
            # news_item['category'] = category
            # news_item['summary'] = summary
            # news_item['from_platform'] = from_platform
            # news_item['news_time'] = news_time
            # news_item['crawl_time'] = crawl_time
            # news_item['news_score'] = news_score

            news_itemloader = ItemLoader(item=NewsItem(), response=response)
            news_itemloader.add_value("title", title)
            news_itemloader.add_value("image_urls", img_url)
            # news_itemloader.add_value("image_path", '/images/full')   #测试scrapyd的时候,指定图片路径
            news_itemloader.add_value("url",
                                      parse.urljoin(response.url, url[0]))
            news_itemloader.add_value("url_md5", get_md5(url[0]))
            news_itemloader.add_value("category", category)
            news_itemloader.add_value("summary", summary)
            news_itemloader.add_value("from_platform", from_platform)
            news_itemloader.add_value("news_time", news_time)
            news_itemloader.add_value("crawl_time", crawl_time)
            news_itemloader.add_value("news_score", news_score)
            news_item = news_itemloader.load_item()

            yield news_item

            # limit the item count when testing the JSON output
            cnt = cnt + 1
            if cnt == 3:
                break

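Every loader above is built from a NewsItem and fed lists straight from extract(), so the item presumably declares output processors that collapse each list to a single value. A plausible minimal definition, with field names taken from the add_value calls (the TakeFirst processors are an assumption):

import scrapy
from itemloaders.processors import TakeFirst  # scrapy.loader.processors in older Scrapy


class NewsItem(scrapy.Item):
    title = scrapy.Field(output_processor=TakeFirst())
    image_urls = scrapy.Field()  # left as a list for an ImagesPipeline
    url = scrapy.Field(output_processor=TakeFirst())
    url_md5 = scrapy.Field(output_processor=TakeFirst())
    category = scrapy.Field(output_processor=TakeFirst())
    summary = scrapy.Field(output_processor=TakeFirst())
    from_platform = scrapy.Field(output_processor=TakeFirst())
    news_time = scrapy.Field(output_processor=TakeFirst())
    crawl_time = scrapy.Field(output_processor=TakeFirst())
    news_score = scrapy.Field(output_processor=TakeFirst())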
Example #3
    def parse_detail(self, response):
        print("detail")
        div_xpath = '//div[@class="newsbox"]/div[@class="news_li xh-highlight" or "news_li"]'
        title_xpath = 'h2/a/text()'
        url_xpath = 'h2/a/@href'
        summary_xpath = 'p/text()'
        image_url_xpath = 'div[@class="news_tu"]/a/img/@src'
        news_time_xpath = 'div[@class="pdtt_trbs"]/span[1]/text()'

        # topCid1_xpath = '//div[@class="pdtt01"]/div[@class="pdtt_lt"]/a[@class="tiptitleImg"]/@data-id'
        # topCid23_xpath = '//div[@class="newsbox"]/div[@class="news_li xh-highlight" or "news_li"]/div/a/@data-id'
        # # test_xpath = '//div[@id="masonryContent"]/div[@id="cont2257855"]/div[@class="news_tu"]/a[@class="tiptitleImg"]'
        # lasttime_xpath = '//div[@class="newsbox"]/div[@class="news_li" and @id="last1"]/@lasttime'
        # topCid1 = response.xpath(topCid1_xpath).extract()[0]
        # topCid2 = response.xpath(topCid23_xpath).extract()[0]
        # topCid3 = response.xpath(topCid23_xpath).extract()[1]
        # last_time = response.xpath(lasttime_xpath).extract()[0]
        # topCids = []
        # topCids.append(topCid1)
        # topCids.append(topCid2)
        # topCids.append(topCid3)
        # topCid = ','.join(topCids)
        #
        # kv = {"category": name, "nodeids": node, "time": last_time, "topCids": topCid}

        cnt = 0
        re_selectors = response.xpath(div_xpath)
        for re_selector in re_selectors:
            title = re_selector.xpath(title_xpath).extract()
            url = re_selector.xpath(url_xpath).extract()
            # url_md5 = get_md5(url)
            temp_summary = re_selector.xpath(summary_xpath).extract()
            if len(temp_summary) >= 1:
                summary = temp_summary
            else:
                summary = ''
            img_urls = re_selector.xpath(image_url_xpath).extract()
            category = response.meta.get("category", "")
            from_platform = self.from_platform
            news_time = re_selector.xpath(news_time_xpath).extract()
            crawl_time = datetime.datetime.now().date()  # date the item was crawled

            # sanity-check the crawl by dumping the raw response
            with open('pengpai_test.html', 'wb') as fp:
                fp.write(response.text.encode('utf-8'))

            # skip special rows (e.g. the "亲爱的会员" members-only notice)
            if not (title and url and news_time and img_urls):
                continue

            # set a default score to avoid errors downstream
            if len(news_time) >= 1:
                news_score = get_score(news_time[0])
            else:
                news_score = 1.0

            base_url = 'https://www.thepaper.cn/'
            img_url = make_str('http:', img_urls)
            news_itemloader = ItemLoader(item=NewsItem(), response=response)
            news_itemloader.add_value("title", title)
            news_itemloader.add_value("image_urls", img_url)
            news_itemloader.add_value("url", parse.urljoin(base_url, url[0]))
            news_itemloader.add_value("url_md5", get_md5(url[0]))
            news_itemloader.add_value("category", category)
            news_itemloader.add_value("summary", summary)
            news_itemloader.add_value("from_platform", from_platform)
            news_itemloader.add_value("news_time", news_time)
            news_itemloader.add_value("crawl_time", crawl_time)
            news_itemloader.add_value("news_score", news_score)
            news_item = news_itemloader.load_item()

            # # limit the item count when testing the JSON output
            # cnt = cnt + 1
            # if cnt == 2:
            #     break

            yield news_item

        node = response.meta.get("nodeids", "")
        category = response.meta.get("category", "")  # fetch here so it exists even when no rows matched
        if node != "":
            for i in range(2):
                page = i + 2
                if category == '精选':  # "精选" = featured
                    json_url = 'https://www.thepaper.cn/load_chosen.jsp?nodeids={0}&pageidx={1}'.format(
                        node, page)
                else:
                    json_url = 'https://www.thepaper.cn/load_index.jsp?nodeids={0}&pageidx={1}'.format(
                        node, page)
                yield Request(url=json_url,
                              callback=self.parse_json,
                              meta={"category": category},
                              dont_filter=True)
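For illustration, with node 25949 (a made-up node id) and page 2, the loop above would request https://www.thepaper.cn/load_index.jsp?nodeids=25949&pageidx=2 (or load_chosen.jsp for the '精选' category), and each response is handled by a parse_json callback like the one in Example #1.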
Example #4
    def parse_detail(self, response):
        print('detail')
        self.open_selenium = False
        cnt = 0
        div_xpath = '//div[@class="focus-mod"]//a[@class="focus-item"]'
        title_xpath = 'div[@class="txt"]/h2/text()'
        url_xpath = '@href'
        # summary_xpath = 'p/text()'
        image_url_xpath = 'div[@class="pic"]/img/@src'
        news_time_xpath = 'div[@class="txt"]/div[@class="info"]/span[2]/text()'
        re_selectors = response.xpath(div_xpath)

        for re_selector in re_selectors:
            title = re_selector.xpath(title_xpath).extract()
            url = re_selector.xpath(url_xpath).extract()
            # url_md5 = get_md5(url)
            summary = ''
            # extract the image URL
            temp_urls = re_selector.xpath(image_url_xpath).extract()
            if len(temp_urls) >= 1:
                img_urls = 'http:' + temp_urls[0]
            else:
                img_urls = 'https://mat1.gtimg.com/pingjs/ext2020/newom/build/static/images/new_logo.png'

            category = response.meta.get("category", "")
            from_platform = self.from_platform
            news_time = re_selector.xpath(news_time_xpath).extract()
            if len(news_time) == 0:
                news_time = ['1天前']  # "1 day ago" default; keep it a list like the xpath result

            crawl_time = datetime.datetime.now().date()

            # skip special rows (e.g. the "亲爱的会员" members-only notice)
            if not (title and url and news_time and img_urls):
                continue

            # set a default score to avoid errors downstream
            if len(news_time) >= 1:
                news_score = get_score(news_time[0])
            else:
                news_score = 1.0

            news_itemloader = ItemLoader(item=NewsItem(), response=response)
            news_itemloader.add_value("title", title)
            news_itemloader.add_value("image_urls", img_urls)
            news_itemloader.add_value("url", url)
            news_itemloader.add_value("url_md5", get_md5(url[0]))
            news_itemloader.add_value("category", category)
            news_itemloader.add_value("summary", summary)
            news_itemloader.add_value("from_platform", from_platform)
            news_itemloader.add_value("news_time", news_time)
            news_itemloader.add_value("crawl_time", crawl_time)
            news_itemloader.add_value("news_score", news_score)
            news_item = news_itemloader.load_item()

            yield news_item

            # limit the item count when testing the JSON output
            cnt = cnt + 1
            if cnt == 2:
                break

        self.open_selenium = True
        yield Request(url=response.url,
                      callback=self.parse_more,
                      meta={"category": category},
                      dont_filter=True)

Example #5
    def parse_more(self, response):
        print('more')
        self.open_selenium = False

        cnt = 0
        div_xpath = '//div[@id="List"]//ul[@class="list"]/li[@class="item cf" or "item-pics cf"]'
        title_pics_xpath = 'h3/a/text()'
        title_xpath = 'div[@class="detail"]/h3/a/text()'
        url_xpath = 'div[@class="detail"]/h3/a/@href'
        url_pic_xpath = 'h3/a/@href'
        # summary_xpath = 'p/text()'
        image_url_xpath = 'div[@class="picture" or "fl picture"]//img[1]/@src'
        news_time_xpath = 'div[@class="detail"]//span[@class="time"]/text()'
        re_selectors = response.xpath(div_xpath)

        for re_selector in re_selectors:
            item_type = re_selector.xpath('@class').extract()[0]
            if item_type == "item cf":
                title = re_selector.xpath(title_xpath).extract()
                url = re_selector.xpath(url_xpath).extract()
            else:
                title = re_selector.xpath(title_pics_xpath).extract()
                url = re_selector.xpath(url_pic_xpath).extract()
            # url_md5 = get_md5(url)
            summary = ''
            # extract the image URL
            temp_urls = re_selector.xpath(image_url_xpath).extract()
            if len(temp_urls) >= 1:
                img_urls = 'http:' + temp_urls[0]
            else:
                img_urls = 'https://mat1.gtimg.com/pingjs/ext2020/newom/build/static/images/new_logo.png'

            category = response.meta.get("category", "")
            from_platform = self.from_platform
            news_time = re_selector.xpath(news_time_xpath).extract()
            if len(news_time) == 0:
                news_time = ['1天前']  # "1 day ago" default; keep it a list like the xpath result

            crawl_time = datetime.datetime.now().date()

            # skip special rows (e.g. the "亲爱的会员" members-only notice)
            if not (title and url and news_time and img_urls):
                continue

            # set a default score to avoid errors downstream
            if len(news_time) >= 1:
                news_score = get_score(news_time[0])
            else:
                news_score = 1.0

            news_itemloader = ItemLoader(item=NewsItem(), response=response)
            news_itemloader.add_value("title", title)
            news_itemloader.add_value("image_urls", img_urls)
            news_itemloader.add_value("url", url)
            news_itemloader.add_value("url_md5", get_md5(url[0]))
            news_itemloader.add_value("category", category)
            news_itemloader.add_value("summary", summary)
            news_itemloader.add_value("from_platform", from_platform)
            news_itemloader.add_value("news_time", news_time)
            news_itemloader.add_value("crawl_time", crawl_time)
            news_itemloader.add_value("news_score", news_score)
            news_item = news_itemloader.load_item()

            yield news_item

            # limit the item count when testing the JSON output
            cnt = cnt + 1
            if cnt == 50:
                break

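parse_detail and parse_more toggle self.open_selenium around the follow-up request, which points at a downloader middleware that renders a page with Selenium only while the flag is set. That middleware is not shown in these examples; a hedged sketch of what it might look like, assuming the spider owns a WebDriver in self.browser:

from scrapy.http import HtmlResponse


class SeleniumMiddleware(object):
    # hypothetical downloader middleware keyed off the spider's open_selenium flag
    def process_request(self, request, spider):
        if not getattr(spider, 'open_selenium', False):
            return None  # let the default downloader fetch the page
        spider.browser.get(request.url)  # assumes spider.browser is a selenium WebDriver
        return HtmlResponse(url=request.url,
                            body=spider.browser.page_source,
                            encoding='utf-8',
                            request=request)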
Example #6
    def parse_detail(self, response):
        print('detail')
        cnt = 0
        div_xpath = '//div[@id="section"]/div[@class="figure flex-block"]'
        title_xpath = 'div/h2/a/text()'
        url_xpath = 'div/h2/a/@href'
        # summary_xpath = 'p/text()'
        image_url_xpath = 'a/@style'
        news_time_xpath = 'div/div/span[2]/text()'

        re_selectors = response.xpath(div_xpath)
        for re_selector in re_selectors:
            title = re_selector.xpath(title_xpath).extract()
            url = re_selector.xpath(url_xpath).extract()[0]
            url = 'http:' + url
            # url_md5 = get_md5(url)
            summary = 'null'
            # extract the image URL from the inline style attribute
            temp_urls = re_selector.xpath(image_url_xpath).extract()
            if len(temp_urls) >= 1:
                temp_url = temp_urls[0]
                temp = temp_url.split(":")[1]
                temp = temp.replace("url(", "http:")
                temp = temp.replace(");", "")
                img_urls = temp
            else:
                img_urls = 'http://zkres.myzaker.com/static/zaker_web2/img/logo.png?v=20170726'

            category = response.meta.get("category", "")
            from_platform = self.from_platform
            news_time = re_selector.xpath(news_time_xpath).extract()

            crawl_time = datetime.datetime.now().date()  # date the item was crawled

            # skip special rows (e.g. the "亲爱的会员" members-only notice)
            if not (title and url and news_time and img_urls):
                continue

            # set a default score to avoid errors downstream
            if len(news_time) >= 1:
                news_score = get_score(news_time[0])
            else:
                news_score = 1.0

            news_itemloader = ItemLoader(item=NewsItem(), response=response)
            news_itemloader.add_value("title", title)
            news_itemloader.add_value("image_urls", img_urls)
            news_itemloader.add_value("url", url)
            news_itemloader.add_value("url_md5", get_md5(url[0]))
            news_itemloader.add_value("category", category)
            news_itemloader.add_value("summary", summary)
            news_itemloader.add_value("from_platform", from_platform)
            news_itemloader.add_value("news_time", news_time)
            news_itemloader.add_value("crawl_time", crawl_time)
            news_itemloader.add_value("news_score", news_score)
            news_item = news_itemloader.load_item()

            # # limit the item count when testing the JSON output
            # cnt = cnt + 1
            # if cnt == 2:
            #     break

            yield news_item

        # fetch the next page's JSON content
        next_page_xpath = '//div[@id="content"]/div[@class="main flex-block"]/a[@class="next_page"]/@href'
        next_url = response.xpath(next_page_xpath).extract()[0]
        deal_url = tranfer_str(next_url)
        kv = get_re_zaker(deal_url)
        appid = kv.get("appid")
        date = kv.get("date")
        article = kv.get("artcile")  # key spelling kept as produced by get_re_zaker
        stamp = kv.get("stamp")
        tab = kv.get("tab")
        version = kv.get("version")

        myversion = '&_version=' + version

        head = 'http://www.myzaker.com/news/next_new.php?f=myzaker_com&url='
        no_article_url = 'http://iphone.myzaker.com/zaker/blog2news.php?app_id={0}&since_date={1}&nt={2}&_appid=iphone&opage={3}&top_tab_id={4}&_version={5}'
        base_url = 'http://iphone.myzaker.com/zaker/blog2news.php?app_id={0}&since_date={1}&nt={2}&next_aticle_id={3}&_appid=iphone&opage={4}&otimestamp={5}&top_tab_id={6}&_version={7}'
        for page in range(1):
            nt = page + 1
            opage = page + 2
            if article is None or stamp is None:
                json_url = head + reverse_tranfer_str(no_article_url.format(appid, date, nt, opage, tab, version)) + myversion
            else:
                json_url = head + reverse_tranfer_str(base_url.format(appid, date, nt, article, opage, stamp, tab, version)) + myversion
            # json_url = 'http://www.myzaker.com/news/next_new.php?f=myzaker_com&url=http%3A%2F%2Fiphone.myzaker.com%2Fzaker%2Fblog2news.php%3Fapp_id%3D10001%26since_date%3D1531383311%26nt%3D1%26_appid%3Diphone%26top_tab_id%3D12183%26_version%3D6.5&_version=6.5'
            yield Request(url=json_url, callback=self.parse_json, meta={"category": category}, dont_filter=True)
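tranfer_str, reverse_tranfer_str and get_re_zaker are project helpers that never appear in these examples. Judging only from how they are used, they decode the percent-encoded next_page link, expose its parameters under the keys read above, and re-encode the rebuilt inner URL. A rough sketch under those assumptions (the parameter names are guesses based on the URL templates):

import re
from urllib import parse


def tranfer_str(url):
    # assumed: decode the percent-encoded next_page link
    return parse.unquote(url)


def reverse_tranfer_str(url):
    # assumed: re-encode the rebuilt inner URL for the url= query parameter
    return parse.quote(url, safe='')


def get_re_zaker(url):
    # assumed: regex out the parameters the caller reads
    # ("artcile" kept misspelled to match the key read above)
    def grab(name):
        m = re.search(name + r'=([^&?]+)', url)
        return m.group(1) if m else None
    return {'appid': grab('app_id'), 'date': grab('since_date'),
            'artcile': grab('next_aticle_id'), 'stamp': grab('otimestamp'),
            'tab': grab('top_tab_id'), 'version': grab('_version')}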
Example #7
    def parse_json(self, response):
        print('json')
        cnt = 0
        pretty_content = decode_zaker(response.text)

        # sanity-check the crawl by dumping the raw response
        # with open('zaker.html', 'wb') as fp:
        #     fp.write(response.text.encode('utf-8'))
        #     fp.close()
        # # print(response.text.decode('unicode_escape'))

        next_url = pretty_content['data']['next_url']
        next_url = 'http:' + next_url
        for article in pretty_content['data']['article']:
            url = article['href']
            title = article['title']
            news_time = article["marks"][1]
            img_urls = article["img"]
            url = 'http:' + url
            summary = 'null'
            # fall back to the site logo when the article has no image
            if len(img_urls) < 1:
                img_urls = 'zkres.myzaker.com/static/zaker_web2/img/logo.png?v=20170726'

            category = response.meta.get("category", "")
            from_platform = self.from_platform

            crawl_time = datetime.datetime.now().date()  # date the item was crawled

            # skip special rows (e.g. the "亲爱的会员" members-only notice)
            if not (title and url and news_time and img_urls):
                continue

            # set a default score to avoid errors downstream
            if len(news_time) >= 1:
                news_score = get_score(news_time)
            else:
                news_score = 1.0

            img_urls = 'http://' + img_urls
            news_itemloader = ItemLoader(item=NewsItem(), response=response)
            news_itemloader.add_value("title", title)
            news_itemloader.add_value("image_urls", img_urls)
            news_itemloader.add_value("url", url)
            news_itemloader.add_value("url_md5", get_md5(url))
            news_itemloader.add_value("category", category)
            news_itemloader.add_value("summary", summary)
            news_itemloader.add_value("from_platform", from_platform)
            news_itemloader.add_value("news_time", news_time)
            news_itemloader.add_value("crawl_time", crawl_time)
            news_itemloader.add_value("news_score", news_score)
            news_item = news_itemloader.load_item()

            # # limit the item count when testing the JSON output
            # cnt = cnt + 1
            # if cnt == 2:
            #     break

            yield news_item


        depth = int(response.meta.get("depth", 0))  # "depth" is set by Scrapy's DepthMiddleware
        if depth <= 3:
            yield Request(url=next_url, callback=self.parse_json, meta={"category": category}, dont_filter=True)
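The depth value read from response.meta is maintained automatically by Scrapy's built-in DepthMiddleware, so the manual cut-off above could equally be enforced from the project settings:

# settings.py -- equivalent depth cap (DepthMiddleware is enabled by default)
DEPTH_LIMIT = 3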