コード例 #1
0
    def _get_one_page_comment_info(self, page_num, goods_id) -> tuple:
        """
        Fetch a single page of comments for one goods item from the zhe800 API.

        :param page_num: page number to request (sent as the ``page`` parameter)
        :param goods_id: goods id (sent as ``productId`` and embedded in the referer)
        :return: ``(comments, has_next_page)`` -- the values stored under the
            ``comments`` and ``hasNext`` keys of the parsed response
            (the original author noted ``hasNext`` to be a ``bool``)
        :raises AssertionError: if the parsed response has no ``comments`` or
            no ``hasNext`` value
        """
        url = 'https://th5.m.zhe800.com/app/detail/comment/list'
        headers = get_random_headers(
            connection_status_keep_alive=False,
            upgrade_insecure_requests=False,
            cache_control='', )
        headers.update({
            'referer': 'https://th5.m.zhe800.com/h5/comment/list?zid={0}&dealId=39890410&tagId='.format(str(goods_id))
        })
        params = (
            ('productId', str(goods_id)),
            ('tagId', ''),
            ('page', str(page_num)),
            ('perPage', self.page_size),
        )
        body = Requests.get_url_body(
            url=url,
            headers=headers,
            params=params,
            ip_pool_type=self.ip_pool_type)
        data = json_2_dict(
            json_str=body,
            logger=self.lg,
            default_res={})

        comments = data.get('comments')
        # Tells the caller whether a further page of comments exists.
        has_next_page = data.get('hasNext')
        if comments is None or has_next_page is None:
            # Raised explicitly (rather than via ``assert``) so the check is
            # not stripped when Python runs with -O; the exception type is
            # kept so existing callers that catch AssertionError still work.
            raise AssertionError(
                '获取到的data为None, 出错goods_id: {}'.format(goods_id))

        self.lg.info('[{}] page_num: {}'.format(
            '+' if comments != [] else '-',
            page_num,))

        return comments, has_next_page
コード例 #2
0
def get_public_ip():
    """
    Renew the Tor identity, request https://httpbin.org/get through the local
    proxy and print the response body.

    :return: None (the response text is printed, not returned)
    """
    renew_tor()
    # Original author's note: privoxy listens on port 8118 by default and
    # forwards HTTP requests to Tor's port 9050.
    privoxy_address = '127.0.0.1:8118'
    proxies = {
        'http': privoxy_address,
        # requests selects the proxy by the scheme of the requested URL; the
        # target below is https, so without this entry the request would
        # bypass the proxy entirely and report the direct IP.
        'https': privoxy_address,
    }
    headers = get_random_headers(
        connection_status_keep_alive=False,
        upgrade_insecure_requests=False,
        cache_control='',
    )
    response = requests.get(
        url="https://httpbin.org/get",
        headers=headers,
        proxies=proxies)
    print(response.text)
コード例 #3
0
    def _judge_is_taobao_head_img(self, url):
        """
        Check whether ``url`` resolves to Taobao's default avatar image.

        The url is requested through a proxy and the final response URL is
        compared with the known default-avatar address.

        :param url: avatar image url to check
        :return: True if the final response URL equals the default avatar
            address; False otherwise, or when the request fails
        """
        default_head_img_url = 'https://gw.alicdn.com/tps/i3/TB1yeWeIFXXXXX5XFXXuAZJYXXX-210-210.png_40x40.jpg'
        tmp_proxies = Requests._get_proxies(ip_pool_type=self.ip_pool_type)

        headers = get_random_headers(
            connection_status_keep_alive=False,
            upgrade_insecure_requests=False,
            cache_control='',
        )
        try:
            _res = requests.get(url=url, headers=headers, proxies=tmp_proxies)
            self.lg.info(str(_res.url))
            return _res.url == default_head_img_url
        except Exception:
            # Best-effort check: any request failure is logged and treated as
            # "not the default avatar". ``Exception`` (not a bare ``except``)
            # lets KeyboardInterrupt / SystemExit propagate.
            self.lg.info('检测图片地址时网络错误! 跳过!')
            return False
コード例 #4
0
    def _get_one_page_comment_info(self, goods_id, page_num, member_id,) -> list:
        """
        Fetch a single page of comments for one offer from the 1688 rate API.

        :param goods_id: offer id (sent as ``offerId`` and embedded in the referer)
        :param page_num: page number to request
        :param member_id: seller member id (sent as ``memberId``)
        :return: the value stored at ``data.rates`` in the parsed response,
            or ``[]`` when that path is missing
        :raises AssertionError: if the response body is empty, or if the parsed
            response carries a ``url`` key (the original author's message
            describes this as being redirected to the 404 page)
        """
        # Parameters the original author had tried and disabled:
        # 'callback', 't' (timestamp), 'orderBy'='date', 'showStat'='0'.
        params = (
            ('_input_charset', 'GBK'),
            ('offerId', str(goods_id)),
            ('page', str(page_num)),
            ('pageSize', '15'),
            ('starLevel', '7'),
            ('orderBy', ''),
            ('semanticId', ''),
            ('showStat', '1'),
            ('content', '1'),
            ('memberId', str(member_id)),
            ('isNeedInitRate', 'false'),
        )
        url = 'https://rate.1688.com/remark/offerDetail/rates.json'
        headers = get_random_headers(
            connection_status_keep_alive=False,
            upgrade_insecure_requests=False,
            cache_control='',)
        headers.update({
            'referer': 'https://detail.1688.com/offer/{0}.html'.format(str(goods_id))
        })
        # Original author's note: plain Requests kept returning 404, and
        # switching to phantomjs kept returning 404 as well.
        body = Requests.get_url_body(
            url=url,
            headers=headers,
            params=params,
            ip_pool_type=self.ip_pool_type,
            cookies=self.login_cookies_dict,)
        self.lg.info(str(body))
        if body == '':
            # Explicit raise so the check survives ``python -O``.
            raise AssertionError(
                '该地址的body为空值, 出错goods_id: {0}'.format(goods_id))

        _data = json_2_dict(
            json_str=body,
            logger=self.lg,
            default_res={})
        if _data.get('url') is not None:
            # Back off before reporting the failure. The original asserted the
            # very condition of this ``if``, so it could never fail and the
            # redirect went unreported.
            sleep(self._page_sleep_time)
            raise AssertionError(
                '------>>>| 被重定向到404页面, 休眠{0}s中...'.format(self._page_sleep_time))

        data = _data.get('data', {}).get('rates', [])
        self.lg.info('[{}] goods_id: {}, page_num: {}'.format(
            '+' if data != [] else '-',
            goods_id,
            page_num,))

        return data
コード例 #5
0
    def _get_one_page_comment_info(self, page_num, goods_id) -> list:
        """
        Fetch a single page of comments for one item from the Taobao rate API.

        The response is expected to wrap its JSON payload in parentheses
        (JSONP style); the payload is extracted with a regex and parsed.

        :param page_num: page number to request (sent as ``currentPageNum``)
        :param goods_id: item id (sent as ``auctionNumId`` and embedded in the referer)
        :return: the value stored under ``comments`` in the parsed payload
        :raises IndexError: if no parenthesised payload is found in the body
        :raises AssertionError: if the parsed payload has no ``comments`` value
        """
        headers = get_random_headers(
            connection_status_keep_alive=False,
            upgrade_insecure_requests=False,
            cache_control='',
        )
        headers.update({
            'authority': 'rate.taobao.com',
            'referer': 'https://item.taobao.com/item.htm?id={}'.format(goods_id),
        })
        url = 'https://rate.taobao.com/feedRateList.htm'
        # Parameters the original author had tried and disabled:
        # 'userNumId', 'ua', '_ksTS', 'callback'.
        _params = (
            ('auctionNumId', goods_id),
            ('currentPageNum', str(page_num)),
            ('pageSize', '20'),
            ('rateType', '1'),
            ('orderType', 'sort_weight'),
            ('attribute', ''),
            ('sku', ''),
            ('hasSku', 'false'),
            # Original author's note: changing the default 0 to 1 yields the
            # required data.
            ('folded', '0'),
        )
        body = Requests.get_url_body(
            url=url,
            headers=headers,
            params=_params,
            encoding='gbk',
            ip_pool_type=self.ip_pool_type,
            cookies=self.login_cookies_dict,
        )

        try:
            # Raw string: '\(' is not a valid str escape and triggers a
            # warning in a normal literal.
            body = re.compile(r'\((.*)\)').findall(body)[0]
        except IndexError:
            sleep(.5)
            # ``format`` instead of ``+`` so a non-str goods_id cannot turn
            # this error into a TypeError.
            raise IndexError(
                're得到需求body时出错! 出错goods_id: {}'.format(goods_id))

        data = json_2_dict(json_str=body, logger=self.lg).get('comments')
        if data is None:
            # Explicit raise so the check survives ``python -O``.
            raise AssertionError('出错goods_id: {}'.format(goods_id))

        self.lg.info('[{}] page_num: {}'.format(
            '+' if data != [] else '-',
            page_num,
        ))

        return data
コード例 #6
0
ファイル: tmall_comment_parse.py プロジェクト: devyru/python
    def _get_one_page_comment_info(self, goods_id, seller_id, page_num,
                                   _type) -> list:
        """
        Fetch a single page of comments for one item from the Tmall rate API.

        The response is expected to wrap its JSON payload in parentheses
        (JSONP style); the payload is extracted with a regex and parsed.

        :param goods_id: item id (sent as ``itemId`` and embedded in the referer)
        :param seller_id: seller id (sent as ``sellerId``)
        :param page_num: page number to request (sent as ``currentPage``)
        :param _type: only used in error messages in this method
        :return: the value stored at ``rateDetail.rateList`` in the parsed
            payload, or ``[]`` when that path is missing
        :raises AssertionError: if the response body is empty
        :raises IndexError: if no parenthesised payload is found in the body
        """
        _url = 'https://rate.tmall.com/list_detail_rate.htm'
        headers = get_random_headers(
            connection_status_keep_alive=False,
            upgrade_insecure_requests=False,
            cache_control='',
        )
        headers.update({
            'referer':
            'https://detail.m.tmall.com/item.htm?id={}'.format(goods_id),
        })
        page_size = self.page_size
        params = (
            ('itemId', goods_id),
            ('sellerId', seller_id),
            ('order', '3'),
            ('currentPage', str(page_num)),
            ('pageSize', page_size),
            ('callback', '_DLP_2519_der_3_currentPage_{0}_pageSize_{1}_'.format(
                page_num, page_size)),
        )
        # Original author's note: cookies are required, otherwise the request
        # returns no data.
        body = Requests.get_url_body(
            url=_url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            ip_pool_type=self.ip_pool_type,
        )
        if body == '':
            # Explicit raise so the check survives ``python -O``.
            raise AssertionError(
                '获取到的body为空str! 出错type:{0}, goods_id:{1}'.format(
                    _type, goods_id))

        try:
            # Raw string: '\(' is not a valid str escape and triggers a
            # warning in a normal literal.
            payload = re.compile(r'\((.*)\)').findall(body)[0]
        except IndexError:
            # Same exception type as before, now with context for the log.
            raise IndexError(
                '获取到的body中未匹配到jsonp数据! 出错type:{0}, goods_id:{1}'.format(
                    _type, goods_id))

        data = json_2_dict(
            json_str=payload,
            default_res={},
            logger=self.lg,
        )
        # When the payload carries a ``url`` value, log it normalised to an
        # ``https:`` prefix.
        raw_url = data.get('url', '')
        if raw_url != '':
            self.lg.info('https:' + raw_url.replace('https:', ''))

        data = data.get('rateDetail', {}).get('rateList', [])
        self.lg.info('[{}] page_num: {}'.format(
            '+' if data != [] else '-',
            page_num,
        ))

        return data