Code Example #1
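    # Note: these excerpts assume module-level imports of json, scrapy, and
    # (in the later examples) time, plus the project helpers get_as_cp()
    # and get_update_url().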
    def parse_article_list(self, response):
        """
        文章列表
        :param response:
        :return:
        """
        body = response.body_as_unicode()
        # Strip the JSONP wrapper, e.g. 'jsonp0({...})' -> '{...}'.
        # str.lstrip() removes a character *set*, not a prefix, so slice instead.
        jsonp_text = 'jsonp%d(' % response.meta.get('jsonp_index', 0)
        if body.startswith(jsonp_text):
            body = body[len(jsonp_text):]
        result = json.loads(body.rstrip(')'))
        # Pagination
        has_more = result.get('has_more')
        if has_more:
            max_behot_time = result['next']['max_behot_time']
            AS, CP = get_as_cp()
            jsonp_index = response.meta.get('jsonp_index', 0) + 1

            url_params_next = {
                'max_behot_time': max_behot_time,
                'as': AS,
                'cp': CP,
                'callback': 'jsonp%d' % jsonp_index,
            }

            url_article_list_next = get_update_url(response.url, url_params_next)

            meta = dict(response.meta, jsonp_index=jsonp_index)
            yield scrapy.Request(url=url_article_list_next, callback=self.parse_article_list, meta=meta)
        # Detail pages
        data_list = result.get('data', [])
        for data_item in data_list:
            detail_url = data_item.get('source_url')
            meta = dict(response.meta, detail_url=detail_url)
            yield scrapy.Request(url=detail_url, callback=self.parse_article_detail, meta=meta)
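The callback above unwraps the JSONP payload by slicing the callback name off the response body. A small regex-based helper (hypothetical, not part of the project) makes the unwrapping independent of the exact callback text:

import json
import re

def load_jsonp(body, callback):
    # Unwrap a JSONP response such as 'jsonp3({...})' and parse the JSON payload.
    # Falls back to plain JSON if the body is not wrapped.
    match = re.match(r'\s*%s\s*\((.*)\)\s*;?\s*$' % re.escape(callback), body, re.S)
    return json.loads(match.group(1) if match else body)

# Usage sketch inside the callback:
#     result = load_jsonp(response.body_as_unicode(),
#                         'jsonp%d' % response.meta.get('jsonp_index', 0))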
Code Example #2
File: toutiao_m.py  Project: JesseYan/news_spider
    def jssdk_signature(self, response):
        AS, CP = get_as_cp()
        jsonp_index = 3

        url = 'https://www.toutiao.com/pgc/ma/'
        url_params = {
            'page_type': 1,
            'max_behot_time': '',
            'uid': response.meta['userid'],
            'media_id': response.meta['mediaid'],
            'output': 'json',
            'is_json': 1,
            'count': 20,
            'from': 'user_profile_app',
            'version': 2,
            'as': AS,
            'cp': CP,
            'callback': 'jsonp%d' % jsonp_index,
        }
        url_article_list = get_update_url(url, url_params)
        url_article_list = "https://www.toutiao.com/pgc/ma/?page_type=1&max_behot_time=&uid=6555293927&media_id=6555293927&output=json&is_json=1&count=20&from=user_profile_app&version=2&as=A1D53BF94259E77&cp=5B92C98E87777E1&callback=jsonp3"

        print("===url_article_list:", url_article_list)

        meta = dict(response.meta, jsonp_index=jsonp_index)
        print("===meta:", meta)

        # print("===headers:", response.headers)

        yield scrapy.Request(url=url_article_list,
                             callback=self.parse_article_list,
                             meta=meta)
Code Example #3
    def jssdk_signature(self, response):
        AS, CP = get_as_cp()
        jsonp_index = 3

        url = 'https://www.toutiao.com/pgc/ma/'
        url_params = {
            'page_type': 1,
            'max_behot_time': '',
            'uid': response.meta['userid'],
            'media_id': response.meta['mediaid'],
            'output': 'json',
            'is_json': 1,
            'count': 20,
            'from': 'user_profile_app',
            'version': 2,
            'as': AS,
            'cp': CP,
            'callback': 'jsonp%d' % jsonp_index,
        }
        url_article_list = get_update_url(url, url_params)

        meta = dict(response.meta, jsonp_index=jsonp_index)

        yield scrapy.Request(url=url_article_list,
                             callback=self.parse_article_list,
                             meta=meta)
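Both jssdk_signature variants rely on get_as_cp() for the time-based 'as'/'cp' query parameters that Toutiao's endpoints checked at the time. The project's implementation is not shown on this page; the sketch below follows the widely circulated version of that algorithm, and is an assumption rather than the project's actual code:

import hashlib
import time

def get_as_cp():
    # Assumed algorithm: interleave the hex-encoded timestamp with its
    # uppercase MD5 digest (the project's real helper may differ).
    now = int(time.time())
    e = hex(now).upper()[2:]                    # epoch seconds as uppercase hex
    a = hashlib.md5(str(now).encode()).hexdigest().upper()
    if len(e) != 8:                             # fixed fallback values
        return 'A1D53BF94259E77', '5B92C98E87777E1'
    s = ''.join(a[i] + e[i] for i in range(5))
    r = ''.join(e[i + 3] + a[-5:][i] for i in range(5))
    return 'A1' + s + e[-3:], e[:3] + r + 'E1'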
Code Example #4
File: wb.py  Project: JesseYan/news_spider
    def jssdk_signature(self, response):
        AS, CP = get_as_cp()
        jsonp_index = 3

        url = 'https://m.toutiao.com/list/'
        url_params = {
            'tag': 'news_hot',
            'max_behot_time': '%10d' % time.time(),  # current epoch seconds (10 digits)
            'format': 'json_raw',
            'output': 'json',
            'is_json': 1,
            'count': 20,
            'version': 2,
            'as': AS,
            'cp': CP,
            'callback': 'jsonp%d' % jsonp_index,
        }
        url_article_list = get_update_url(url, url_params)
        # url_article_list = "https://m.toutiao.com/list/?tag=__all__&ac=wap&count=20&format=json_raw&as=A1755BB952EA81E&cp=5B922AF8A19E2E1&max_behot_time=1536333324"

        print("===url_article_list:", url_article_list)

        meta = dict(response.meta, jsonp_index=jsonp_index)
        # print("===meta:", meta)

        # print("===headers:", response.headers)

        yield scrapy.Request(url=url_article_list,
                             callback=self.parse_article_list,
                             meta=meta)
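All five examples build their request URLs through get_update_url(), another project helper whose source is not shown here. A minimal sketch, assuming it merges the given parameters over the URL's existing query string:

import urllib.parse

def get_update_url(url, new_params):
    # Merge new_params over the URL's current query parameters and rebuild
    # the URL (assumed behavior of the project helper).
    parts = urllib.parse.urlsplit(url)
    params = dict(urllib.parse.parse_qsl(parts.query, keep_blank_values=True))
    params.update({key: str(value) for key, value in new_params.items()})
    return urllib.parse.urlunsplit(parts._replace(query=urllib.parse.urlencode(params)))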
Code Example #5
File: wb.py  Project: JesseYan/news_spider
    def parse_article_list(self, response):
        """
        文章列表
        :param response:
        :return:
        """
        body = response.body_as_unicode()

        # Strip the JSONP wrapper, e.g. 'jsonp3({...})' -> '{...}'.
        # str.lstrip() removes a character *set*, not a prefix, so slice instead.
        jsonp_text = 'jsonp%d(' % response.meta.get('jsonp_index', 0)
        if body.startswith(jsonp_text):
            body = body[len(jsonp_text):]
        result = json.loads(body.rstrip(')'))

        # Detail pages
        data_list = result.get('data', [])
        print("\n====data_list len:", len(data_list))
        for data_item in data_list:
            detail_url = self.web_host_url + data_item.get(
                'source_url') + 'info/'
            print("****detail_url:", detail_url)
            article_url = self.web_host_url + data_item.get('source_url')

            article_id = data_item['item_id']
            article_title = data_item['title']
            pub_time = data_item['behot_time']
            keywords = data_item.get('keywords', '')

            meta = dict(
                response.meta,
                detail_url=detail_url,
                article_url=article_url,
                item_id=article_id,
                article_title=article_title,
                article_pub_time=pub_time,
                keywords=keywords,
            )
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse_article_detail,
                                 meta=meta)

        # Pagination
        has_more = result.get('has_more')
        if has_more:
            max_behot_time = ''
            if 'next' in result and 'max_behot_time' in result['next']:
                max_behot_time = result['next']['max_behot_time']
            AS, CP = get_as_cp()
            jsonp_index = response.meta.get('jsonp_index', 0) + 1

            url_params_next = {
                'max_behot_time': max_behot_time or '%10d' % time.time(),
                'as': AS,
                'cp': CP,
                'callback': 'jsonp%d' % jsonp_index,
            }
            print("max_behot_time:", url_params_next['max_behot_time'])

            url_article_list_next = get_update_url(response.url,
                                                   url_params_next)

            meta = dict(response.meta, jsonp_index=jsonp_index)
            time.sleep(self.FRESH_DELAY)  # NOTE: blocks the Twisted reactor; see the settings sketch below
            yield scrapy.Request(url=url_article_list_next,
                                 callback=self.parse_article_list,
                                 meta=meta)
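One caveat about the pagination branch above: time.sleep(self.FRESH_DELAY) blocks Scrapy's single-threaded Twisted reactor, stalling every in-flight request. Scrapy's built-in throttling settings achieve the same pacing without blocking; a sketch for settings.py:

# settings.py -- let Scrapy pace requests instead of calling time.sleep(),
# which blocks the Twisted reactor.
DOWNLOAD_DELAY = 2                # seconds between requests to the same domain
RANDOMIZE_DOWNLOAD_DELAY = True   # jitter each delay to 0.5x-1.5x DOWNLOAD_DELAY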