def _get_one_page_comment_info(self, page_num, goods_id) -> tuple:
    """
    Fetch a single page of comment info from the zhe800 mobile api.

    :param page_num: page index to request
    :param goods_id: target goods id
    :return: (list of comment dicts, whether a next page exists)
    """
    def _build_params(goods_id, page_num, page_size):
        # Query string expected by the comment-list endpoint.
        return (
            ('productId', str(goods_id)),
            ('tagId', ''),
            ('page', str(page_num)),
            ('perPage', page_size),
        )

    api_url = 'https://th5.m.zhe800.com/app/detail/comment/list'
    headers = get_random_headers(
        connection_status_keep_alive=False,
        upgrade_insecure_requests=False,
        cache_control='',
    )
    headers.update({
        'referer': 'https://th5.m.zhe800.com/h5/comment/list?zid={0}&dealId=39890410&tagId='.format(str(goods_id))
    })
    body = Requests.get_url_body(
        url=api_url,
        headers=headers,
        params=_build_params(
            goods_id=goods_id,
            page_num=page_num,
            page_size=self.page_size,
        ),
        ip_pool_type=self.ip_pool_type)
    # self.lg.info(str(body))
    parsed = json_2_dict(
        json_str=body,
        logger=self.lg,
        default_res={})
    # pprint(parsed)
    assert parsed.get('comments') is not None \
        and parsed.get('hasNext') is not None, '获取到的data为None, 出错goods_id: {}'.format(goods_id)

    # Whether the server reports another page of comments (bool).
    has_next_page = parsed.get('hasNext', False)
    comments = parsed.get('comments', [])
    self.lg.info('[{}] page_num: {}'.format(
        '+' if comments != [] else '-',
        page_num,))

    return comments, has_next_page
def get_public_ip():
    """
    Request a fresh Tor circuit, then print the public IP info reported
    by httpbin as seen through the privoxy -> Tor proxy chain.

    BUGFIX: the proxies dict previously only mapped the 'http' scheme,
    but the request below targets an https:// URL — requests therefore
    bypassed the proxy entirely and leaked the real public IP. Both
    schemes are now routed through privoxy.
    """
    renew_tor()
    # privoxy listens on port 8118 by default and forwards requests to Tor's 9050 port.
    proxies = {
        'http': '127.0.0.1:8118',
        'https': '127.0.0.1:8118',
    }
    headers = get_random_headers(
        connection_status_keep_alive=False,
        upgrade_insecure_requests=False,
        cache_control='',
    )
    body = requests.get(
        url="https://httpbin.org/get",
        headers=headers,
        proxies=proxies).text
    print(body)
def _judge_is_taobao_head_img(self, url):
    '''
    Check whether url is taobao's default avatar image address.

    :param url: avatar image url to probe
    :return: True if the url resolves to the fixed default-avatar cdn
        address, False otherwise (including on any request failure)
    '''
    tmp_proxies = Requests._get_proxies(ip_pool_type=self.ip_pool_type)
    headers = get_random_headers(
        connection_status_keep_alive=False,
        upgrade_insecure_requests=False,
        cache_control='',
    )
    try:
        _res = requests.get(url=url, headers=headers, proxies=tmp_proxies)
    except Exception:
        # BUGFIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. Network failures are still treated as "not default".
        self.lg.info('检测图片地址时网络错误! 跳过!')
        return False

    self.lg.info(str(_res.url))
    # Requests for the default avatar end up at this fixed cdn address.
    return _res.url == 'https://gw.alicdn.com/tps/i3/TB1yeWeIFXXXXX5XFXXuAZJYXXX-210-210.png_40x40.jpg'
def _get_one_page_comment_info(self, goods_id, page_num, member_id,) -> list:
    """
    Fetch a single page of comment info from the 1688 rates api.

    :param goods_id: target offer id
    :param page_num: page index to request
    :param member_id: shop member id required by the api
    :return: list of rate dicts (may be empty)
    :raises AssertionError: on empty body or when redirected to a 404 page
    """
    def _get_params(goods_id, page_num, member_id):
        # Query string expected by rates.json; commented alternatives kept
        # as a record of values that were tried during development.
        # t = str(int(time.time())) + str(randint(100, 999))
        params = (
            # ('callback', 'jQuery17205914468174705312_1531451658317'),
            ('_input_charset', 'GBK'),
            ('offerId', str(goods_id)),
            ('page', str(page_num)),
            ('pageSize', '15'),
            ('starLevel', '7'),
            # ('orderBy', 'date'),
            ('orderBy', ''),
            ('semanticId', ''),
            # ('showStat', '0'),
            ('showStat', '1'),
            ('content', '1'),
            # ('t', t),
            ('memberId', str(member_id)),
            ('isNeedInitRate', 'false'),
        )

        return params

    params = _get_params(
        goods_id=goods_id,
        page_num=page_num,
        member_id=member_id)
    url = 'https://rate.1688.com/remark/offerDetail/rates.json'
    headers = get_random_headers(
        connection_status_keep_alive=False,
        upgrade_insecure_requests=False,
        cache_control='',)
    headers.update({
        'referer': 'https://detail.1688.com/offer/{0}.html'.format(str(goods_id))
    })
    # Plain Requests kept returning 404 here, and so did phantomjs --
    # the login cookies below are required for this endpoint.
    body = Requests.get_url_body(
        url=url,
        headers=headers,
        params=params,
        ip_pool_type=self.ip_pool_type,
        cookies=self.login_cookies_dict,)
    self.lg.info(str(body))
    assert body != '', '该地址的body为空值, 出错goods_id: {0}'.format(goods_id)

    _data = json_2_dict(
        json_str=body,
        logger=self.lg,
        default_res={})
    if _data.get('url') is not None:
        # A top-level 'url' key means the request was redirected (404/login
        # page); back off before raising so the next attempt is less likely
        # to be blocked as well.
        sleep(self._page_sleep_time)
    # BUGFIX: the original asserted `is not None`, which raised on every
    # NORMAL (non-redirected) response and passed on the redirected one.
    # The redirect is the failure case, so the assertion must require
    # the 'url' key to be absent.
    assert _data.get('url') is None, '------>>>| 被重定向到404页面, 休眠{0}s中...'.format(self._page_sleep_time)

    data = _data.get('data', {}).get('rates', [])
    self.lg.info('[{}] goods_id: {}, page_num: {}'.format(
        '+' if data != [] else '-',
        goods_id,
        page_num,))
    # assert data != [], '获取到的data为空list!'

    return data
def _get_one_page_comment_info(self, page_num, goods_id) -> list:
    """
    Fetch a single page of comment info from taobao's feedRateList api.

    :param page_num: page index to request
    :param goods_id: target item id
    :return: list of comment dicts (may be empty)
    :raises IndexError: when the jsonp wrapper cannot be stripped from the body
    :raises AssertionError: when the parsed body has no 'comments' key
    """
    def _get_params(goods_id, page_num) -> tuple:
        # Query string expected by feedRateList.htm.
        return (
            ('auctionNumId', goods_id),
            # ('userNumId', '1681172037'),
            ('currentPageNum', str(page_num)),
            ('pageSize', '20'),
            ('rateType', '1'),
            ('orderType', 'sort_weight'),
            ('attribute', ''),
            ('sku', ''),
            ('hasSku', 'false'),
            # Changing the default '0' to '1' also yields usable data.
            ('folded', '0'),
            # Optional params that proved unnecessary: 'ua', '_ksTS',
            # and the 'callback' jsonp name.
        )

    headers = get_random_headers(
        connection_status_keep_alive=False,
        upgrade_insecure_requests=False,
        cache_control='',
    )
    headers.update({
        'authority': 'rate.taobao.com',
        'referer': 'https://item.taobao.com/item.htm?id={}'.format(goods_id)
    })
    url = 'https://rate.taobao.com/feedRateList.htm'
    _params = _get_params(goods_id=goods_id, page_num=page_num)
    body = Requests.get_url_body(
        url=url,
        headers=headers,
        params=_params,
        encoding='gbk',
        ip_pool_type=self.ip_pool_type,
        cookies=self.login_cookies_dict,
    )
    # self.lg.info(str(body))
    try:
        # BUGFIX: raw string for the regex — '\(' in a normal string is an
        # invalid escape sequence (DeprecationWarning on modern Python).
        # Strips the jsonp wrapper: callback( ...json... ).
        body = re.compile(r'\((.*)\)').findall(body)[0]
    except IndexError:
        sleep(.5)
        raise IndexError('re得到需求body时出错! 出错goods_id: ' + goods_id)

    data = json_2_dict(json_str=body, logger=self.lg).get('comments')
    # pprint(data)
    # Simplified: `if data is None: assert data is not None` was redundant —
    # the assertion alone is equivalent.
    assert data is not None, '出错goods_id: ' + goods_id

    self.lg.info('[{}] page_num: {}'.format(
        '+' if data != [] else '-',
        page_num,
    ))
    # assert data != [], '该页的"comments"=[], 跳出本次循环!'

    return data
def _get_one_page_comment_info(self, goods_id, seller_id, page_num, _type) -> list:
    """
    Fetch a single page of comment info from tmall's list_detail_rate api.

    :param goods_id: target item id
    :param seller_id: shop seller id required by the api
    :param page_num: page index to request
    :param _type: caller-supplied type tag, used only in error messages
    :return: list of rate dicts (may be empty)
    :raises AssertionError: when the response body is empty
    :raises IndexError: when the jsonp wrapper cannot be stripped from the body
    """
    def _get_params(goods_id, seller_id, page_num, page_size):
        # The jsonp callback name must embed the paging info.
        callback = '_DLP_2519_der_3_currentPage_{0}_pageSize_{1}_'.format(
            page_num,
            page_size)
        params = (
            ('itemId', goods_id),
            ('sellerId', seller_id),
            ('order', '3'),
            ('currentPage', str(page_num)),
            ('pageSize', page_size),
            ('callback', callback),
        )

        return params

    _url = 'https://rate.tmall.com/list_detail_rate.htm'
    headers = get_random_headers(
        connection_status_keep_alive=False,
        upgrade_insecure_requests=False,
        cache_control='',
    )
    headers.update({
        'referer': 'https://detail.m.tmall.com/item.htm?id={}'.format(goods_id),
    })
    params = _get_params(
        goods_id=goods_id,
        seller_id=seller_id,
        page_num=page_num,
        page_size=self.page_size)
    # Cookies are required — a bare requests call returns no data!
    # (phantomjs fallback was tried here too: build the url with
    # _get_url_contain_params and fetch via self.driver.get_url_body.)
    body = Requests.get_url_body(
        url=_url,
        headers=headers,
        params=params,
        cookies=self.login_cookies_dict,
        ip_pool_type=self.ip_pool_type,
    )
    # self.lg.info(str(body))
    assert body != '', '获取到的body为空str! 出错type:{0}, goods_id:{1}'.format(
        _type,
        goods_id)

    # BUGFIX: raw string for the regex ('\(' in a normal string is an invalid
    # escape sequence), and the old `except IndexError: raise IndexError`
    # wrapper is dropped — it re-raised a NEW bare IndexError, discarding the
    # original message and traceback; letting the exception propagate
    # unchanged preserves both and keeps the same exception type for callers.
    data = json_2_dict(
        json_str=re.compile(r'\((.*)\)').findall(body)[0],
        default_res={},
        logger=self.lg,
    )
    # A non-empty 'url' key means the request was redirected; log it for debugging.
    redict_url = 'https:' + data.get('url', '').replace(
        'https:', '') if data.get('url', '') != '' else ''
    if redict_url != '':
        self.lg.info(redict_url)

    data = data.get('rateDetail', {}).get('rateList', [])
    self.lg.info('[{}] page_num: {}'.format(
        '+' if data != [] else '-',
        page_num,
    ))

    return data