Example 1
    def after_login(self, response):
        data = {
            'savestate': '1',
            'callback': 'jsonpcallback%13d' % (time.time() * 1000),
        }

        res = response.body_as_unicode()
        info = json.loads(res)

        crossdomainlist = info['data']['crossdomainlist']
        self.uid = info['data']['uid']

        url_weibo_com = get_update_url(crossdomainlist['weibo.com'], data)
        url_sina_com_cn = get_update_url(crossdomainlist['sina.com.cn'], data)
        url_weibo_cn = get_update_url(crossdomainlist['weibo.cn'], data)

        url_items = {
            'url_weibo_com': url_weibo_com,
            'url_sina_com_cn': url_sina_com_cn,
            'url_weibo_cn': url_weibo_cn,
        }

        meta = dict(response.meta, **url_items)

        # Cross-domain handling for weibo.com
        yield scrapy.Request(url=url_weibo_com,
                             callback=self.crossdomain_weibo_com,
                             meta=meta)
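
Every example on this page builds its request URLs through a get_update_url helper whose definition is not shown. A minimal sketch of what such a helper plausibly does, assuming it merely merges new parameters into the URL's existing query string (the project's real implementation may differ):

from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse

def get_update_url(url, params):
    """Return url with params merged into its query string (hypothetical sketch)."""
    parts = urlparse(url)
    query = dict(parse_qsl(parts.query))
    query.update(params)
    # ParseResult is a namedtuple; _replace() swaps in the rebuilt query string.
    return urlunparse(parts._replace(query=urlencode(query)))
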
Example 2
    def start_requests(self):
        """
        Entry point: prepare initial requests.
        :return:
        """
        url_params = {
            'version_code': '6.4.2',
            'version_name': '',
            'device_platform': 'iphone',
            'tt_from': 'weixin',
            'utm_source': 'weixin',
            'utm_medium': 'toutiao_ios',
            'utm_campaign': 'client_share',
            'wxshare_count': '1',
        }

        task_id = pop_task(self.name)

        if not task_id:
            print('%s task is empty' % self.name)
            return
        print('%s task id: %s' % (self.name, task_id))

        task_item = get_item(FetchTask, task_id)
        fetch_url = 'http://m.toutiao.com/profile/%s/' % task_item.follow_id
        url_profile = get_update_url(fetch_url, url_params)
        meta = {
            'task_id': task_item.id,
            'platform_id': task_item.platform_id,
            'channel_id': task_item.channel_id,
            'follow_id': task_item.follow_id,
            'follow_name': task_item.follow_name,
        }
        yield scrapy.Request(url=url_profile, callback=self.get_profile, meta=meta)
Example 3
    def parse_article_list(self, response):
        """
        Parse the article list.
        :param response:
        :return:
        """
        body = response.body_as_unicode()
        # str.lstrip() strips a character *set*, not a literal prefix, so
        # slice the JSONP wrapper off explicitly instead.
        jsonp_prefix = 'jsonp%d(' % response.meta.get('jsonp_index', 0)
        if body.startswith(jsonp_prefix):
            body = body[len(jsonp_prefix):]
        result = json.loads(body.rstrip(');'))
        # Pagination
        has_more = result.get('has_more')
        if has_more:
            max_behot_time = result['next']['max_behot_time']
            AS, CP = get_as_cp()
            jsonp_index = response.meta.get('jsonp_index', 0) + 1

            url_params_next = {
                'max_behot_time': max_behot_time,
                'as': AS,
                'cp': CP,
                'callback': 'jsonp%d' % jsonp_index,
            }

            url_article_list_next = get_update_url(response.url, url_params_next)

            meta = dict(response.meta, jsonp_index=jsonp_index)
            yield scrapy.Request(url=url_article_list_next, callback=self.parse_article_list, meta=meta)
        # Detail pages
        data_list = result.get('data', [])
        for data_item in data_list:
            detail_url = data_item.get('source_url')
            meta = dict(response.meta, detail_url=detail_url)
            yield scrapy.Request(url=detail_url, callback=self.parse_article_detail, meta=meta)
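
A regex-based unwrapper is a more defensive alternative to the prefix slicing above, since it verifies the callback name before parsing (a hypothetical helper, not part of the original project):

import json
import re

def unwrap_jsonp(body, callback):
    """Extract and parse the JSON inside 'callback( ... )' (hypothetical helper)."""
    match = re.match(r'\s*%s\s*\((.*)\)\s*;?\s*$' % re.escape(callback), body, re.DOTALL)
    if match is None:
        raise ValueError('response is not a %s(...) JSONP payload' % callback)
    return json.loads(match.group(1))

With it, the parse above becomes result = unwrap_jsonp(body, 'jsonp%d' % response.meta.get('jsonp_index', 0)).
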
Example 4
    def start_requests(self):
        """
        Entry point: prepare initial requests.
        :return:
        """
        boot_url = 'http://weixin.sogou.com/weixin'

        task_id = pop_task(self.name)

        if not task_id:
            print('%s task is empty' % self.name)
            return

        task_item = get_item(FetchTask, task_id)

        cookies_id, cookies = get_cookies(self.name)
        url_params = {
            'type': 1,
            # 'query': task_item.follow_id,
            'query': task_item.follow_name.encode('utf-8'),
        }
        url_profile = get_update_url(boot_url, url_params)
        meta = {
            'cookiejar': cookies_id,
            'task_id': task_item.id,
            'platform_id': task_item.platform_id,
            'channel_id': task_item.channel_id,
            'follow_id': task_item.follow_id,
            'follow_name': task_item.follow_name,
        }

        yield scrapy.Request(url=url_profile,
                             cookies=cookies,
                             callback=self.parse_account_search_list,
                             meta=meta)
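
The 'cookiejar' meta key is Scrapy's built-in mechanism (handled by CookiesMiddleware) for keeping separate cookie sessions apart. Any follow-up request that should stay in the same session must carry the key forward, roughly like this (next_url and parse_next_page are placeholders):

# Inside a callback such as parse_account_search_list: stay in the same
# cookie session by propagating the cookiejar id.
yield scrapy.Request(url=next_url,
                     callback=self.parse_next_page,
                     meta={'cookiejar': response.meta['cookiejar']})
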
Example 5
    def jssdk_signature(self, response):
        AS, CP = get_as_cp()
        jsonp_index = 3

        url = 'https://www.toutiao.com/pgc/ma/'
        url_params = {
            'page_type': 1,
            'max_behot_time': '',
            'uid': response.meta['userid'],
            'media_id': response.meta['mediaid'],
            'output': 'json',
            'is_json': 1,
            'count': 20,
            'from': 'user_profile_app',
            'version': 2,
            'as': AS,
            'cp': CP,
            'callback': 'jsonp%d' % jsonp_index,
        }
        url_article_list = get_update_url(url, url_params)

        meta = dict(response.meta, jsonp_index=jsonp_index)

        yield scrapy.Request(url=url_article_list,
                             callback=self.parse_article_list,
                             meta=meta)
Example 6
    def jssdk_signature(self, response):
        AS, CP = get_as_cp()
        jsonp_index = 3

        url = 'https://www.toutiao.com/pgc/ma/'
        url_params = {
            'page_type': 1,
            'max_behot_time': '',
            'uid': response.meta['userid'],
            'media_id': response.meta['mediaid'],
            'output': 'json',
            'is_json': 1,
            'count': 20,
            'from': 'user_profile_app',
            'version': 2,
            'as': AS,
            'cp': CP,
            'callback': 'jsonp%d' % jsonp_index,
        }
        url_article_list = get_update_url(url, url_params)
        url_article_list = "https://www.toutiao.com/pgc/ma/?page_type=1&max_behot_time=&uid=6555293927&media_id=6555293927&output=json&is_json=1&count=20&from=user_profile_app&version=2&as=A1D53BF94259E77&cp=5B92C98E87777E1&callback=jsonp3"

        print("===url_article_list:", url_article_list)

        meta = dict(response.meta, jsonp_index=jsonp_index)
        print("===meta:", meta)

        # print("===headers:", response.headers)

        yield scrapy.Request(url=url_article_list,
                             callback=self.parse_article_list,
                             meta=meta)
Example 7
    def start_requests(self):
        """
        Entry point: prepare initial requests.
        :return:
        """
        url = 'http://open.snssdk.com/jssdk_signature/'
        url_params = {
            'appid': 'wxe8b89be1715734a0',
            'noncestr': 'Wm3WZYTPz0wzccnW',
            'timestamp': '%13d' % (time.time() * 1000),
            'callback': 'jsonp2',
        }
        url_jssdk_signature = get_update_url(url, url_params)
        yield scrapy.Request(
            url=url_jssdk_signature,
            callback=self.jssdk_signature,
            headers=self.custom_settings['DEFAULT_REQUEST_HEADERS'],
            cookies=None)
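
The recurring '%13d' % (time.time() * 1000) expression formats the current Unix time in milliseconds, the 13-digit form that JavaScript's Date.now() produces and that these JSONP endpoints appear to expect:

import time

# time.time() returns seconds since the epoch as a float; multiplying by
# 1000 and formatting as a 13-digit integer yields e.g. '1536333324000'.
timestamp = '%13d' % (time.time() * 1000)
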
Example 8
    def get_profile(self, response):
        userid = response.xpath('//button[@itemid="topsharebtn"]/@data-userid').extract_first(default='')
        mediaid = response.xpath('//button[@itemid="topsharebtn"]/@data-mediaid').extract_first(default='')

        meta = dict(response.meta, userid=userid, mediaid=mediaid)

        url = 'http://open.snssdk.com/jssdk_signature/'
        url_params = {
            'appid': 'wxe8b89be1715734a6',
            'noncestr': 'Wm3WZYTPz0wzccnW',
            'timestamp': '%13d' % (time.time() * 1000),
            'callback': 'jsonp2',
        }
        url_jssdk_signature = get_update_url(url, url_params)
        yield scrapy.Request(url=url_jssdk_signature, callback=self.jssdk_signature, meta=meta)
Example 9
    def login_sina_sso_prelogin(self, response):
        login_data = get_login_data()
        self.login_form_data.update(login_data)
        login_sina_sso_prelogin_url = 'https://login.sina.com.cn/sso/prelogin.php'
        query_payload = {
            'checkpin': '1',
            'entry': 'mweibo',
            'su': get_su(login_data.get('username', '')),
            'callback': 'jsonpcallback%13d' % (time.time() * 1000),
        }
        request_url = get_update_url(login_sina_sso_prelogin_url,
                                     query_payload)

        yield scrapy.Request(url=request_url,
                             callback=self.passport_weibo_sso_login)
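
get_su is not defined in these snippets. In Weibo's browser login flow, 'su' is conventionally the Base64 encoding of the URL-quoted username; a hypothetical sketch under that assumption (the project's real helper may differ):

import base64
from urllib.parse import quote

def get_su(username):
    """Encode a username for the prelogin 'su' parameter (assumed scheme)."""
    # Assumption: Base64 over the percent-encoded username, as observed in
    # the web login flow.
    return base64.b64encode(quote(username).encode('utf-8')).decode('utf-8')
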
Example 10
def fetch_proxy(country='China', scheme='http'):
    """
    Fetch a list of proxies.
    :param country:
    :param scheme:
    :return:
    """
    data = {}
    if country:
        data['country'] = country
    if scheme:
        data['type'] = scheme
    url = 'http://proxy.nghuyong.top/'
    url = get_update_url(url, data)
    res = requests.get(url, timeout=REQUESTS_TIME_OUT).json()
    return [
        '%s://%s' % (i['type'], i['ip_and_port']) for i in res.get('data', [])
    ]
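
One plausible way to consume the pool that fetch_proxy returns, e.g. by handing a random entry to requests (illustrative only; the proxy service above may no longer be live):

import random
import requests

proxy_pool = fetch_proxy(country='China', scheme='http')
if proxy_pool:
    proxy = random.choice(proxy_pool)  # e.g. 'http://1.2.3.4:8080'
    # httpbin echoes the caller's IP, handy for verifying the proxy works.
    res = requests.get('http://httpbin.org/ip',
                       proxies={'http': proxy, 'https': proxy},
                       timeout=10)
    print(res.json())
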
Example 11
    def jssdk_signature(self, response):
        AS, CP = get_as_cp()
        jsonp_index = 3

        url = 'https://m.toutiao.com/list/'
        url_params = {
            'tag': 'news_hot',
            'max_behot_time': '%10d' % time.time(),
            'format': 'json_raw',
            'output': 'json',
            'is_json': 1,
            'count': 20,
            'version': 2,
            'as': AS,
            'cp': CP,
            'callback': 'jsonp%d' % jsonp_index,
        }
        url_article_list = get_update_url(url, url_params)

        print("===url_article_list:", url_article_list)

        meta = dict(response.meta, jsonp_index=jsonp_index)
        # print("===meta:", meta)

        # print("===headers:", response.headers)

        yield scrapy.Request(url=url_article_list,
                             callback=self.parse_article_list,
                             meta=meta)
Example 12
    def parse_article_list(self, response):
        """
        Parse the article list.
        :param response:
        :return:
        """
        body = response.body_as_unicode()

        # str.lstrip() strips a character *set*, not a literal prefix, so
        # slice the JSONP wrapper off explicitly instead.
        jsonp_prefix = 'jsonp%d(' % response.meta.get('jsonp_index', 0)
        if body.startswith(jsonp_prefix):
            body = body[len(jsonp_prefix):]
        result = json.loads(body.rstrip(');'))

        # Detail pages
        data_list = result.get('data', [])
        print("\n====data_list len:", len(data_list))
        for data_item in data_list:
            detail_url = self.web_host_url + data_item.get(
                'source_url') + 'info/'
            print("****detail_url:", detail_url)
            article_url = self.web_host_url + data_item.get('source_url')

            article_id = data_item['item_id']
            article_title = data_item['title']
            pub_time = data_item['behot_time']
            keywords = data_item.get('keywords', '')

            meta = dict(
                response.meta,
                detail_url=detail_url,
                article_url=article_url,
                item_id=article_id,
                article_title=article_title,
                article_pub_time=pub_time,
                keywords=keywords,
            )
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse_article_detail,
                                 meta=meta)

        # Pagination
        has_more = result.get('has_more')
        if has_more:
            max_behot_time = ''
            if 'next' in result and 'max_behot_time' in result['next']:
                max_behot_time = result['next']['max_behot_time']
            AS, CP = get_as_cp()
            jsonp_index = response.meta.get('jsonp_index', 0) + 1

            url_params_next = {
                'max_behot_time': max_behot_time or '%10d' % time.time(),
                'as': AS,
                'cp': CP,
                'callback': 'jsonp%d' % jsonp_index,
            }
            print("max_behot_time:", url_params_next['max_behot_time'])

            url_article_list_next = get_update_url(response.url,
                                                   url_params_next)

            meta = dict(response.meta, jsonp_index=jsonp_index)
            # NOTE: time.sleep() blocks Scrapy's entire event loop here.
            time.sleep(self.FRESH_DELAY)
            yield scrapy.Request(url=url_article_list_next,
                                 callback=self.parse_article_list,
                                 meta=meta)
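
The time.sleep() call above stalls every in-flight request while it waits, since Scrapy runs on a single-threaded reactor. Scrapy's built-in throttling settings achieve the same pacing without blocking:

# In the spider class (or settings.py): delay consecutive requests instead
# of sleeping inside a callback.
custom_settings = {
    'DOWNLOAD_DELAY': 2,               # base delay in seconds
    'RANDOMIZE_DOWNLOAD_DELAY': True,  # jitter between 0.5x and 1.5x the base
}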