def get_all_img_url(self, goods_id, is_hk):
    '''
    Collect the image URLs found on a goods detail page.

    :param goods_id: goods identifier, interpolated into the item page URL
    :param is_hk: when exactly ``True`` the miyabaobei.hk (global purchase)
        page is requested, otherwise the mia.com page
    :return: '' when the page body comes back empty, otherwise a list of
        ``{'img_url': <src or None>}`` dicts, one per ``div.small img`` node
    '''
    # Global-purchase goods are served from a different host.
    if is_hk is True:
        url_prefix = 'https://www.miyabaobei.hk/item-'
    else:
        url_prefix = 'https://www.mia.com/item-'
    page_url = url_prefix + str(goods_id) + '.html'

    page_body = Requests.get_url_body(
        url=page_url,
        headers=self.headers,
        had_referer=True,
        ip_pool_type=self.ip_pool_type)
    if page_body == '':
        print('请求tmp_body_2为空值, 此处先跳过!')
        return ''

    # Every <img> fragment is re-parsed on its own, so an image without a
    # src attribute still produces an entry (with img_url set to None).
    img_fragments = Selector(text=page_body).css('div.small img').extract()
    return [
        {'img_url': Selector(text=fragment).css('img::attr("src")').extract_first()}
        for fragment in img_fragments
    ]
def get_p_info_list(self, goods_id):
    '''
    Fetch the property (detail introduction) list of a goods.

    :param goods_id: product id appended to the properties API URL
    :return: list of ``{'p_name': ..., 'p_value': ...}`` dicts; an empty
        list when nothing was obtained, in which case ``self.result_data``
        is also reset to ``{}``
    '''
    api_url = 'https://pina.m.zhe800.com/cns/products/get_product_properties_list.json?productId=' + str(goods_id)
    raw_body = Requests.get_url_body(
        url=api_url,
        headers=self.headers,
        high_conceal=True,
        ip_pool_type=self.ip_pool_type)
    if raw_body == '':
        print('获取到的p_info_body为空值, 此处跳过!')
        raw_body = '{}'

    properties = json_2_dict(json_str=raw_body).get('perportieslist', [])
    if properties == []:
        # Reset so a stale result does not leak into later crawling steps.
        self.result_data = {}
        return properties

    return [{
        'p_name': entry.get('name', ''),
        'p_value': entry.get('value'),
    } for entry in properties]
def share_2_wx() -> bool:
    '''
    Report a "shared to WeChat" callback to ios.riyiwk.com.

    A fixed, pre-encoded payload is POSTed with a fixed session cookie and
    without a proxy. One line is printed, tagged '+' or '-' by the outcome.

    :return: True when the response's 'message' field equals '成功',
        otherwise False
    '''
    callback_url = 'https://ios.riyiwk.com//user/shareCallback'
    session_cookies = {
        'wk_': '8llgqrevckd0bmllcdgrtqjv88elq3fl',
    }
    request_headers = {
        'Host': 'ios.riyiwk.com',
        'accept': '*/*',
        'content-type': 'application/x-www-form-urlencoded',
        'user-agent': 'ExtraIncome/2.6.0 (iPhone; iOS 11.0; Scale/3.00)',
        'accept-language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
    }
    payload = 'data=6FutSNjTIN512XBvPZXgztwPxRaLLFygqXFrzxnaSHhKJ0RMskgPCJ1veAFe71DmE/Weqi3qbl9Jp%2BWfhSSCtlPnKIheoydBjmxWvUtEh9qV4RXkSil0AWr5P5f8V4jL/OnQQxXgTeOBhhsJK7140Iuc/kdtw0qP'

    response_body = Requests.get_url_body(
        method='post',
        use_proxy=False,
        url=callback_url,
        headers=request_headers,
        cookies=session_cookies,
        data=payload)
    message = json_2_dict(response_body).get('message', '')

    if message == '成功':
        label, res = '+', True
    else:
        label, res = '-', False
    print('[{}] 分享微信成功!'.format(label))

    return res
def get_jump_to_url_and_is_hk(self, body):
    '''
    Resolve a redirect embedded in a page body and detect global-purchase goods.

    :param body: page body to inspect
    :return: tuple ``(body, sign_direct_url, is_hk)`` of str, str, bool.
        Without a ``_sign_direct_url = `` marker the input body is returned
        unchanged with ``''`` and ``False``. Otherwise body is whatever the
        request to the redirect target returned.
    '''
    # No redirect marker (redirects usually show up for group-buy goods).
    if re.search(r'_sign_direct_url = ', body) is None:
        return (body, '', False)

    target = re.search(r"_sign_direct_url = '(.*?)';", body)
    if target is not None:
        sign_direct_url = target.group(1)
        print('*** 获取到跳转地址为: ', sign_direct_url)
    else:
        # Marker present but not in the quoted form; the request below is
        # still issued, with an empty url.
        sign_direct_url = ''
        print('获取跳转的地址时出错!')

    body = Requests.get_url_body(
        url=sign_direct_url,
        headers=self.headers,
        had_referer=True,
        ip_pool_type=self.ip_pool_type)

    # A redirect to the miyabaobei.hk mobile host marks a global-purchase goods.
    is_hk = re.search(r'://m.miyabaobei.hk/', sign_direct_url) is not None
    if is_hk:
        print('*** 此商品为全球购商品!')

    return (body, sign_direct_url, is_hk)
def turn_one_time() -> dict:
    '''
    Trigger one lottery spin on m.riyiwk.com.

    POSTs to the lottery start endpoint with fixed cookies (apart from the
    current timestamp) and fixed headers, without a proxy.

    :return: the response decoded by ``json_2_dict``
    '''
    lottery_url = 'https://m.riyiwk.com/lottery/start.html'
    now_stamp = str(datetime_to_timestamp(get_shanghai_time()))
    session_cookies = {
        'Hm_lpvt_fa0ddec29ac177a2d127cebe209832e3': now_stamp,
        # fixed value
        'Hm_lvt_fa0ddec29ac177a2d127cebe209832e3': '1537161510,1537228200,1537353114,1537411854',
        # fixed value
        'wk_': '9umq63s8g6leobk2p285frmp583nhm9t',
    }
    request_headers = {
        'Host': 'm.riyiwk.com',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'origin': 'https://m.riyiwk.com',
        'referer': 'https://m.riyiwk.com/lottery.html?check_login=1',
        'accept-language': 'zh-cn',
        'x-requested-with': 'XMLHttpRequest',
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Mobile/15A5341f/RIYIWK 2.6.0/USER_ID 203793/TOKEN 3a3988e07be98db064a70fc635c0b590',
    }

    response_body = Requests.get_url_body(
        method='post',
        use_proxy=False,
        url=lottery_url,
        headers=request_headers,
        cookies=session_cookies)

    return json_2_dict(response_body)
def _get_this_goods_member_id(self, goods_id):
    '''
    Look up the memberId embedded in a 1688 offer-remark page.

    :param goods_id: offer id, sent as the ``offerId`` query parameter
    :return: the member id string, or '' when the page body is empty or no
        memberId field is found in it (both cases are logged as errors)
    '''
    page_body = Requests.get_url_body(
        url='https://m.1688.com/page/offerRemark.htm',
        headers={
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': get_random_pc_ua(),
        },
        params=(
            ('offerId', str(goods_id)),
        ),
        ip_pool_type=self.ip_pool_type)
    if page_body == '':
        self.lg.error('获取到的body为空值!此处跳过!')
        return ''

    found = re.search(r'"memberId":"(.*?)",', page_body)
    if found is None:
        self.lg.error('获取member_id时索引异常!请检查!')
        return ''

    return found.group(1)
def _get_tmall_goods_keywords_goods_id_list(self, keyword):
    '''
    Search tmall's mobile listing for a keyword, sorted by the 'd' sort key.

    :param keyword: indexable whose element [1] is the search word
    :return: list of item URL strings (not goods ids), e.g.
        ['//detail.tmall.com/item.htm?id=566978017832&skuId=3606684772412', ...];
        [] on any failure
    '''
    # Approach: tmall mobile-site search. Occasionally unstable but usable.
    # No referer or cookie header is sent.
    search_word = keyword[1]
    request_headers = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': get_random_pc_ua(),
        'accept': '*/*',
        'authority': 'list.tmall.com',
    }
    query = {
        'page_size': '20',
        'page_no': '1',
        'q': str(search_word),
        'type': 'p',
        'spm': 'a220m.6910245.a2227oh.d100',
        'from': 'mallfp..m_1_suggest',
        'sort': 'd',
    }
    body = Requests.get_url_body(
        url='https://list.tmall.com/m/search_items.htm',
        headers=request_headers,
        params=query,
        ip_pool_type=self.ip_pool_type)
    if body == '':
        return []

    data = json_2_dict(json_str=body, logger=self.lg)
    if data == {}:
        self.lg.error('获取到的天猫搜索data为空dict! 出错关键字为{0}'.format(search_word))
        return []

    found_items = data.get('item', [])
    if found_items is None or found_items == []:
        self.lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(search_word))
        return []

    try:
        return [str(entry.get('url', '')) for entry in found_items]
    except Exception as e:
        self.lg.exception(e)
        self.lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(search_word))
        return []
def fetch(pid):
    '''
    Fetch the current time from json-time.appspot.com and print it.

    :param pid: label printed in front of the time (a process id, judging
        by the message text)
    :return: the 'datetime' field of the decoded JSON response
    '''
    raw = Requests.get_url_body(
        url='http://json-time.appspot.com/time.json',
        use_proxy=False)
    payload = json.loads(raw)
    current = payload['datetime']
    print('Process %s: %s' % (pid, current))

    return current
def bg_login(bg_username, bg_pwd):
    '''
    Log in to the admin backend, solving the point-select captcha first.

    :param bg_username: backend user name, posted as ``txtUserName``
    :param bg_pwd: backend password, posted as ``txtPwd``
    :return: body of the login response (also printed)
    '''
    # NOTE(review): username, pwd, id and referer are not defined inside this
    # function; presumably they are module-level settings for the captcha
    # service. If ``id`` is not rebound at module level it resolves to the
    # builtin ``id`` function. Confirm against the rest of the file.
    validate = crack_wy_point_select_captcha(
        username=username,
        pwd=pwd,
        id=id,
        referer=referer)
    if validate != '':
        validate = unquote_plus(validate)
    print('获取到的validate:{}'.format(validate))

    login_url = 'http://admin.k85u.com/index.aspx'
    session_cookies = {
        '_9755xjdesxxd_': '32',
        'gdxidpyhxdE': 'gjKCnDWASVwyJpOSGKLIaqHXYt0Qjq7Ycs7JzzLNWoZV2S%5CTam6fybIabIljeoL4JpfrI%2Bl6Xp9wLy5bHanMUDVPQdC3%2B3ihW%2BrP1cH6ktTTEvKfaPLQSHkkL5Wn7BpLALiek4J2Bq9nan1om%2B8dA%2FYyoxxDwX7vLusi5dLf%2Bni%2Fyrot%3A1536833525662',
    }
    request_headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Origin': 'http://120.26.119.135',
        'Upgrade-Insecure-Requests': '1',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': get_random_pc_ua(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'http://120.26.119.135/Login.aspx',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    form = {
        '__VIEWSTATE': '/wEPDwUKLTUwOTQ0NDQ3MWRk/ffecNvOMZIyPoiGxLPop3/5ERoE5/VlszxMMNbpijg=',
        '__VIEWSTATEGENERATOR': 'C2EE9ABB',
        'txtUserName': bg_username,
        'txtPwd': bg_pwd,
        # captcha verification string returned by the cracker above
        'NECaptchaValidate': validate,
        'btnLogin': '******',
    }

    body = Requests.get_url_body(
        method='post',
        url=login_url,
        headers=request_headers,
        cookies=session_cookies,
        data=form,
        use_proxy=False)
    print(body)

    return body
def _get_div_desc(self, data): ''' 得到div_desc :param data: :return: ''' def _get_right_body(body): '''得到main_body''' # 处理data-lazy-src body = re.compile(r'<img src=').sub('<img data-lazy-src=', body) body = re.compile(r'data-lazy-src=').sub('src=', body) body = re.compile(r'<img data-src=').sub('<img src=', body) body = re.compile(r';opacity:0').sub('', body) # 不替换否则不显示图片 # print(body) try: main_body = re.compile(r'<main .*?>(.*)</main>').findall(body)[0] except IndexError: main_body = re.compile(r'<body>(.*?)</body>').findall(body)[0] main_body = re.compile(r'<script.*?>.*?</script>').sub('', main_body) return main_body try: intros = data.get('good', {}).get('intros', [])[0] except IndexError: raise IndexError('获取intros获取异常!') tabs = intros.get('tabs', []) # pprint(tabs) div_desc_url = '' title_list = [ '功能详情', '产品介绍', '概述', '商品详情', ] for item in tabs: if item.get('title', '') in title_list: div_desc_url = item.get('url', '') break if div_desc_url == '': raise ValueError('获取div_desc_url为空值!') body = Requests.get_url_body(url=div_desc_url, headers=self.headers, ip_pool_type=self.ip_pool_type) # self.lg.info(str(body)) if body == '': raise ValueError('获取到的div_desc为空值!') div_desc = '<div>' + _get_right_body(body) + '</div>' # self.lg.info(str(div_desc)) return div_desc
def _get_pintuan_goods_info(self):
    '''
    Crawl the juanpi limited-time group-buy listing page by page.

    Pages 0..99 are requested in order; crawling stops at the first page
    whose body is empty or whose goods list is empty.

    :return: list of dicts with keys goods_id, begin_time, end_time,
        all_sell_count and page; each goods_id appears once, keeping the
        first occurrence
    '''
    pintuan_goods_id_list = []
    # Ids already collected: O(1) duplicate check instead of rebuilding a
    # list of all collected ids for every item.
    seen_goods_ids = set()
    for page in range(0, 100):
        tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(
            str(page))
        print('正在抓取的页面地址为: ', tmp_url)

        body = Requests.get_url_body(
            url=tmp_url,
            headers=self.headers,
            high_conceal=True,
            ip_pool_type=self.ip_pool_type)
        # Explicit checks rather than assert: assert statements are removed
        # under ``python -O``, which would silently disable both stop
        # conditions and keep requesting all 100 pages.
        if body == '':
            print('body为空值!')
            sleep(.5)
            break

        tmp_data = json_2_dict(json_str=body, default_res={}).get(
            'data', {}).get('goods', [])
        sleep(.5)
        if tmp_data == []:
            print('该tmp_url得到的goods为空list, 此处跳过!')
            break

        tmp_pintuan_goods_id_list = [{
            'goods_id': item.get('goods_id', ''),
            'begin_time': timestamp_to_regulartime(int(item.get('start_time', ''))),
            'end_time': timestamp_to_regulartime(int(item.get('end_time', ''))),
            'all_sell_count': str(item.get('join_number_int', '')),
            'page': page,
        } for item in tmp_data]

        for item in tmp_pintuan_goods_id_list:
            goods_id = item.get('goods_id', '')
            if goods_id not in seen_goods_ids:
                seen_goods_ids.add(goods_id)
                pintuan_goods_id_list.append(item)

    print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
    print(pintuan_goods_id_list)

    return pintuan_goods_id_list
def _get_p_info(self, goods_id):
    '''
    Download the mogujie detail-info payload of a goods and parse it.

    :param goods_id: item id appended to the detail-info API URL
    :return: tuple ``(p_info, raw_body)`` where p_info is the result of
        ``self.get_goods_p_info`` on the raw response body
    :raises AssertionError: when the response body is empty
    '''
    api_url = 'https://shop.mogujie.com/ajax/mgj.pc.detailinfo/v1?_ajax=1&itemId={}'.format(
        str(goods_id))
    raw_body = Requests.get_url_body(
        url=api_url,
        headers=self.headers,
        had_referer=True,
        ip_pool_type=self.ip_pool_type)
    assert raw_body != '', '获取到的tmp_p_info_body为空值, 请检查!'

    return self.get_goods_p_info(tmp_p_info_body=raw_body), raw_body
def _get_origin_comment_list(self, **kwargs) -> list:
    '''
    Download the raw comment records of a 1688 offer, page by page.

    Pages 1 .. ``self.max_page - 1`` are requested with a 0.25s pause after
    each; every page's records are pretty-printed and appended.

    :param kwargs: ``csrf`` (token sent as ``_csrf``), ``goods_id`` (offer
        id) and ``cookies`` (value of the cookie header); each defaults to ''
    :return: the concatenated ``data.model`` lists of all pages
    '''
    csrf = kwargs.get('csrf', '')
    goods_id = kwargs.get('goods_id', '')
    cookies = kwargs.get('cookies', '')

    remark_url = 'https://m.1688.com/page/offerRemark.htm'
    request_headers = {
        'cookie': cookies,
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': get_random_pc_ua(),
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'referer': 'https://m.1688.com/page/offerRemark.htm?offerId={}'.format(goods_id),
        'authority': 'm.1688.com',
        'x-requested-with': 'XMLHttpRequest',
    }

    origin_comment_list = []
    for page_no in range(1, self.max_page):
        navigate_options = {
            'data': {
                'bizType': 'trade',
                'itemId': int(goods_id),
                'offerId': str(goods_id),
                'page': page_no,
                'pageSize': 5,
                'starLevel': 7
            }
        }
        # Millisecond-looking nonce: timestamp followed by three random digits.
        nonce = str(datetime_to_timestamp(get_shanghai_time())) \
            + str(get_random_int_number(start_num=100, end_num=999))
        query = (
            ('_csrf', csrf),
            ('__wing_navigate_type', 'view'),
            ('__wing_navigate_url', 'detail:modules/offerRemarkList/view'),
            ('__wing_navigate_options', dumps(navigate_options)),
            ('_', nonce),
        )
        body = Requests.get_url_body(
            url=remark_url,
            headers=request_headers,
            params=query,
            ip_pool_type=self.ip_pool_type)
        page_records = json_2_dict(body, encoding='ascii').get('data', {}).get('model', [])
        pprint(page_records)
        origin_comment_list.extend(page_records)
        sleep(.25)

    return origin_comment_list
def _get_one_page_comment_info(self, page_num, goods_id) -> tuple:
    """
    Fetch one page of zhe800 comments for a goods.

    :param page_num: page number to request
    :param goods_id: product id (sent as productId and in the referer)
    :return: tuple ``(comments, has_next_page)``
    :raises AssertionError: when the response lacks 'comments' or 'hasNext'
    """
    list_url = 'https://th5.m.zhe800.com/app/detail/comment/list'
    request_headers = get_random_headers(
        connection_status_keep_alive=False,
        upgrade_insecure_requests=False,
        cache_control='',)
    request_headers.update({
        'referer': 'https://th5.m.zhe800.com/h5/comment/list?zid={0}&dealId=39890410&tagId='.format(str(goods_id))
    })
    query = (
        ('productId', str(goods_id)),
        ('tagId', ''),
        ('page', str(page_num)),
        ('perPage', self.page_size),
    )

    body = Requests.get_url_body(
        url=list_url,
        headers=request_headers,
        params=query,
        ip_pool_type=self.ip_pool_type)
    data = json_2_dict(
        json_str=body,
        logger=self.lg,
        default_res={})
    assert data.get('comments') is not None\
        and data.get('hasNext') is not None, '获取到的data为None, 出错goods_id: {}'.format(goods_id)

    # hasNext tells whether a further page of comments exists.
    has_next_page = data.get('hasNext', False)
    comments = data.get('comments', [])
    status = '+' if comments != [] else '-'
    self.lg.info('[{}] page_num: {}'.format(status, page_num,))

    return comments, has_next_page
def get_true_sku_info(self, sku_info):
    '''
    Resolve price, spec label and stock for every in-stock sku.

    :param sku_info: list of dicts carrying goods_id, color_name and img_url
    :return: the result of ``self._data_error_init()`` when the listing API
        yields no data, otherwise ``(true_sku_info, i_s)`` where i_s is the
        stock mapping of the last matched listing entry
    '''
    joined_ids = '-'.join(entry.get('goods_id') for entry in sku_info)
    listing_body = Requests.get_url_body(
        url='https://p.mia.com/item/list/' + joined_ids,
        headers=self.headers,
        had_referer=True,
        ip_pool_type=self.ip_pool_type)
    listing = json_2_dict(json_str=listing_body).get('data', [])
    if listing == []:
        return self._data_error_init()

    true_sku_info = []
    i_s = {}
    for sku in sku_info:
        for remote in listing:
            if sku.get('goods_id') != str(remote.get('id', '')):
                continue

            i_s = remote.get('i_s', {})
            for size_key, rest_number in i_s.items():
                # 'SINGLE' means there is no size dimension to append.
                if size_key == 'SINGLE':
                    spec_value = sku.get('color_name')
                else:
                    spec_value = sku.get('color_name') + '|' + size_key

                if rest_number == 0:
                    # Sold-out specs are left out.
                    continue

                true_sku_info.append({
                    'spec_value': spec_value,
                    'normal_price': str(remote.get('mp')),
                    'detail_price': str(remote.get('sp')),
                    'img_url': sku.get('img_url'),
                    'rest_number': rest_number,
                })

    return (true_sku_info, i_s)
def _get_aweme_api_videos_info(self, user_id):
    '''
    Request a user's posted videos from the douyin aweme API.

    Stores ``user_id`` on the instance, asks for 20 posts starting at
    cursor 0 and hands the raw body to ``self.deal_with_data``.

    :param user_id: douyin user id
    :return: None
    '''
    self.user_id = user_id
    query = (
        ('user_id', self.user_id),
        ('max_cursor', '0'),
        ('count', '20'),
    )
    post_body = Requests.get_url_body(
        url='https://www.douyin.com/aweme/v1/aweme/post/',
        headers=self.headers,
        params=query)
    self.deal_with_data(body=post_body)
def get_pintuan_goods_info(self):
    '''
    Crawl mia's group-buy listing and pass all goods to ``deal_with_data``.

    List indexes 1..999 are requested in order (0 and 1 return the same
    page, hence the start at 1). An empty body skips that index; the first
    page without a ``data_list`` ends the crawl.

    :return: None
    '''
    goods_list = []
    for index in range(1, 1000):
        tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(index) + '/0/'
        print('正在抓取: ', tmp_url)

        body = Requests.get_url_body(
            url=tmp_url,
            headers=self.headers,
            had_referer=True,
            high_conceal=True,
            ip_pool_type=self.ip_pool_type)
        if body == '':
            print('获取到的body为空值! 此处跳过')
            continue

        tmp_data = json_2_dict(json_str=body)
        if tmp_data == {}:
            print('json.loads转换body时出错, 此处跳过!')

        raw_items = tmp_data.get('data_list', [])
        if raw_items == []:
            print('得到的data_list为[], 此处跳过!')
            break

        goods_list.extend({
            'goods_id': entry.get('sku', ''),
            'sub_title': entry.get('intro', ''),
            'pid': index,
        } for entry in raw_items)
        sleep(.5)

    pprint(goods_list)
    self.deal_with_data(goods_list=goods_list)
    sleep(8)

    return None
def _get_66_ip_list():
    '''
    Download the proxy list published by www.66ip.cn.

    The page is decoded as gbk, cleaned with ``Requests._wash_html`` and
    split on ``<br />``. The module-level ``a_66_ip`` is updated as well.

    :return: list of the non-empty entries found on the page
    '''
    global a_66_ip

    request_headers = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'http://www.66ip.cn/nm.html',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    query = (
        ('getnum', ''),
        ('isp', '0'),
        ('anonymoustype', '3'),
        ('start', ''),
        ('ports', ''),
        ('export', ''),
        ('ipaddress', ''),
        ('area', '0'),
        ('proxytype', '2'),
        ('api', '66ip'),
    )
    response = requests.get(
        'http://www.66ip.cn/nmtq.php',
        headers=request_headers,
        params=query,
        cookies=None)
    page = Requests._wash_html(response.content.decode('gbk'))

    # The entries sit between the last </script> and the closing </div>.
    sections = re.findall(r'</script>(.*)</div>', page)
    listing = sections[0] if sections != [] else ''
    listing = re.sub('<script>.*?</script>|</div>.*</div>', '', listing)

    ip_list = delete_list_null_str(listing.split('<br />'))
    if ip_list != []:
        a_66_ip = ip_list
    else:
        a_66_ip = []

    return ip_list
async def _search(self, search_key) -> list:
    '''
    Search tianyancha's mobile site and parse the first result.

    :param search_key: keyword to search for
    :return: [] when the request fails or no company name / url can be
        parsed; otherwise a one-element list holding a dict with the keys
        company_name, company_url, legal_person and legal_person_url

    NOTE(review): the original fell off the end on success (returning None
    despite the ``-> list`` annotation) and discarded every parsed field.
    The record layout returned here is a proposal; confirm with callers.
    '''
    headers = {
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': get_random_phone_ua(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'https://m.tianyancha.com/',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    params = (('key', str(search_key)), )
    search_url = 'https://m.tianyancha.com/search'

    # NOTE(review): this looks like a synchronous call inside a coroutine;
    # if so it blocks the event loop while the request runs.
    body = Requests.get_url_body(url=search_url, headers=headers, params=params, cookies=None)
    if body == '':
        return []

    # Take the container element's HTML. The original selected `::text`,
    # which yields a bare text node; re-parsing that can never match the
    # element selectors below, so every search ended in [].
    container_html = Selector(text=body).css(
        'div.search_result_container').extract_first() or ''
    container = Selector(text=container_html)

    company_name = container.css(
        'div.new-border-bottom a span text ::text').extract_first() or ''
    # Explicit checks instead of assert, which is removed under ``python -O``.
    if company_name == '':
        print('company_name为空值!')
        return []

    company_url = container.css(
        'div.new-border-bottom a ::attr("href")').extract_first() or ''
    if company_url == '':
        print('url为空值!')
        return []

    legal_person = container.css(
        'a.legalPersonName ::text').extract_first() or ''
    legal_person_url = container.css(
        'a.legalPersonName ::attr("href")').extract_first() or ''
    if legal_person_url != '':
        # The href is site-relative; make it absolute.
        legal_person_url = 'https://m.tianyancha.com' + legal_person_url

    search_list = [{
        'company_name': company_name,
        'company_url': company_url,
        'legal_person': legal_person,
        'legal_person_url': legal_person_url,
    }]

    return search_list
def _get_comment_data(self, goods_id): if goods_id == '': self.result_data = {} return {} self.lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id)) self.goods_id = goods_id self.headers.update({ 'referer': 'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id), }) # 根据京东手机版商品评价获取 _tmp_comment_list = [] for current_page in range(1, 3): _url = 'https://item.m.jd.com/newComments/newCommentsDetail.json' params = self._set_params(goods_id=goods_id, current_page=current_page) body = Requests.get_url_body(url=_url, headers=self.headers, params=params, ip_pool_type=self.ip_pool_type) # self.lg.info(str(body)) _data = json_2_dict(json_str=body, logger=self.lg).get('wareDetailComment', {}).get('commentInfoList', []) if _data == []: self.lg.error('出错goods_id:{0}'.format(self.goods_id)) _tmp_comment_list += _data sleep(self.comment_page_switch_sleep_time) # pprint(_tmp_comment_list) try: _comment_list = self._get_comment_list(_tmp_comment_list=_tmp_comment_list) except Exception as e: self.lg.error('出错goods_id:{0}'.format(goods_id)) self.lg.exception(e) self.result_data = {} return {} _t = datetime.datetime.now() _r = CommentItem() _r['goods_id'] = str(goods_id) _r['create_time'] = _t _r['modify_time'] = _t _r['_comment_list'] = _comment_list self.result_data = _r # pprint(self.result_data) return self.result_data
def get_stock_info_dict(self, goods_id):
    '''
    Fetch the realtime stock information of a zhe800 product.

    :param goods_id: product id placed in the realtime-info URL
    :return: the 'data' dict of the response; {} when nothing was obtained,
        in which case ``self.result_data`` is also reset to {}
    '''
    api_url = 'https://pina.m.zhe800.com/cns/products/' + str(goods_id) + '/realtime_info.json'
    raw_body = Requests.get_url_body(
        url=api_url,
        headers=self.headers,
        high_conceal=True,
        ip_pool_type=self.ip_pool_type)
    if raw_body == '':
        print('获取到的stock_info_body为空值!')
        raw_body = '{}'

    stock_info = json_2_dict(json_str=raw_body).get('data', {})
    if stock_info == {}:
        # Reset so a stale result does not leak into later crawling steps.
        self.result_data = {}

    return stock_info
def _get_1688_goods_keywords_goods_id_list(self, keyword):
    '''
    Search m.1688.com for a keyword and collect the offer ids on page one.

    Results are requested with sortType 'booked' in descending order.

    :param keyword: indexable whose element [1] is the search word
    :return: list of offer id strings, e.g. ['11111', ...]; [] when the
        page body is empty or extraction fails
    '''
    # Approach 1: scrape the m.1688.com search page, first page only.
    # No cookie header is sent.
    request_headers = {
        'authority': 'm.1688.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': get_random_pc_ua(),
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
    }
    query = (
        ('sortType', 'booked'),
        ('filtId', ''),
        ('keywords', keyword[1]),
        ('descendOrder', 'true'),
    )
    body = Requests.get_url_body(
        url='https://m.1688.com/offer_search/-6161.html',
        headers=request_headers,
        params=query,
        ip_pool_type=self.ip_pool_type)
    if body == '':
        return []

    try:
        return Selector(text=body).css(
            'div.list_group-item::attr("data-offer-id")').extract()
    except Exception as e:
        self.lg.exception(e)
        self.lg.error('获取1688搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
        return []
def _get_pintuan_goods_info(self):
    '''
    Crawl the juanpi limited-time group-buy listing page by page.

    Pages 0..99 are requested in order with a 0.5s pause after each;
    crawling stops at the first page that yields no goods (empty body,
    undecodable JSON or an empty goods list).

    :return: list of dicts with keys goods_id, begin_time, end_time,
        all_sell_count and page; each goods_id appears once, keeping the
        first occurrence
    '''
    pintuan_goods_id_list = []
    # Ids already collected: O(1) duplicate check instead of rebuilding a
    # list of all collected ids for every item.
    seen_goods_ids = set()
    for page in range(0, 100):
        tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(
            str(page))
        print('正在抓取的页面地址为: ', tmp_url)

        body = Requests.get_url_body(
            url=tmp_url,
            headers=self.headers,
            high_conceal=True,
            ip_pool_type=self.ip_pool_type)
        if body == '':
            body = '{}'

        try:
            tmp_data = json.loads(body).get('data', {}).get('goods', [])
        except (ValueError, TypeError, AttributeError):
            # Covers malformed JSON and unexpected payload shapes. A bare
            # ``except:`` would also swallow KeyboardInterrupt/SystemExit.
            print('json.loads转换tmp_data时出错!')
            tmp_data = []

        sleep(.5)
        if tmp_data == []:
            print('该tmp_url得到的goods为空list, 此处跳过!')
            break

        tmp_pintuan_goods_id_list = [{
            'goods_id': item.get('goods_id', ''),
            'begin_time': timestamp_to_regulartime(int(item.get('start_time', ''))),
            'end_time': timestamp_to_regulartime(int(item.get('end_time', ''))),
            'all_sell_count': str(item.get('join_number_int', '')),
            'page': page,
        } for item in tmp_data]

        for item in tmp_pintuan_goods_id_list:
            goods_id = item.get('goods_id', '')
            if goods_id not in seen_goods_ids:
                seen_goods_ids.add(goods_id)
                pintuan_goods_id_list.append(item)

    print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
    print(pintuan_goods_id_list)

    return pintuan_goods_id_list
async def _get_all_ame_from_office(self) -> list:
    '''
    Load the area-code list published at http://xzqh.mca.gov.cn/map.

    The list is read from the ``value`` attribute of the ``input#pyArr``
    element and decoded as JSON. It is stored in ``self.ame_list`` and its
    length is printed.

    :return: the decoded list ([] is the decoder's default result)
    '''
    request_headers = await self._get_headers()
    body = Requests.get_url_body(
        url='http://xzqh.mca.gov.cn/map',
        headers=request_headers,
        ip_pool_type=self.ip_pool_type)

    encoded_list = Selector(text=body).css(
        'table.select_table td input#pyArr ::attr("value")').extract_first()
    data = json_2_dict(json_str=encoded_list, default_res=[])
    print('总计邮编个数: {}'.format(len(data)))
    self.ame_list = data

    return data
def _get_one_page_comment_info(self, goods_id, page_num) -> list:
    """
    Fetch one page of jd comments (score filter '3', i.e. positive ones).

    :param goods_id: jd sku id
    :param page_num: page number to request
    :return: list of comment records; [] when the response carries no
        parenthesised JSON payload or no comments
    :raises AssertionError: when the response body is empty
    """
    headers = {
        'Referer': 'https://item.m.jd.com/product/{}.html'.format(goods_id),
        'User-Agent': get_random_phone_ua(),
    }
    params = (
        ('sorttype', '5'),
        ('pagesize', '10'),
        ('sceneval', '2'),
        ('score', '3'),     # positive comments only
        ('sku', str(goods_id)),
        ('page', str(page_num)),
    )
    url = 'https://wq.jd.com/commodity/comment/getcommentlist'
    body = Requests.get_url_body(
        url=url,
        headers=headers,
        params=params,
        ip_pool_type=self.ip_pool_type,)
    assert body != '', 'body不为空值!'

    data = []
    try:
        # The payload is wrapped in parentheses; take what is inside them.
        # Raw string: '\(' in a plain literal is an invalid escape sequence
        # (DeprecationWarning, SyntaxWarning on newer interpreters).
        wrapped = re.compile(r'\((.*)\)').findall(body)[0]
        data = json_2_dict(
            json_str=wrapped,
            default_res={}).get('result', {}).get('comments', [])
    except IndexError:
        # No parenthesised payload in the body: treated as "no comments".
        pass

    self.lg.info('[{}] page_num: {}'.format(
        '+' if data != [] else '-',
        page_num,))

    return data
def get_one_page_goods_info(self, *params):
    '''
    Download one page of the chuchujie goods listing.

    :param params: exactly two positional values, ``gender`` ('0' female,
        '1' male) and ``page``
    :return: the raw response body, or '{}' when it came back empty
    '''
    gender, page = params

    client = {
        "ageGroup": "AG_0to24",
        "channel": "QD_web_webkit",
        "deviceId": "0",
        "gender": gender,
        "imei": "0",
        "packageName": "com.culiu.purchase",
        "platform": "wap",
        "sessionId": "0",
        "shopToken": "0",
        "userId": "0",
        "version": "1.0",
        "xingeToken": ""
    }
    query = {
        "group": 4,
        "module": "99",
        "page": page,
        "tab": "all"
    }
    # These travel as query-string parameters, JSON-encoded as they are;
    # data that must be POSTed would need the post method instead.
    query_string = {
        'client': json.dumps(client),
        'query': json.dumps(query),
        'page': page
    }

    body = Requests.get_url_body(
        url='https://api.chuchujie.com/api/',
        headers=self.headers,
        params=query_string,
        ip_pool_type=self.ip_pool_type)

    return '{}' if body == '' else body
def _get_seller_id(self, _type, goods_id):
    '''
    Extract the seller's userId from a tmall mobile detail page.

    :param _type: goods type; 2 selects the tmall.hk (tmall global) host
    :param goods_id: item id, sent as the ``id`` query parameter
    :return: the seller id as a string of digits
    :raises AssertionError: when no userId can be found in the page
    '''
    # TODO: an earlier approach via TmallParse conflicted with the update
    # script's interface; the detail page is scraped directly instead.
    headers = self._get_phone_headers()
    headers.update({
        'authority': 'detail.m.tmall.com',
    })
    # Testing showed the cookies _tb_token_, cookie2 and t are required.
    params = (('id', goods_id), )
    if _type != 2:
        url = 'https://detail.m.tmall.com/item.htm'
    else:
        # tmall global
        url = 'https://detail.m.tmall.hk/item.htm'

    body = Requests.get_url_body(
        url=url,
        headers=headers,
        params=params,
        ip_pool_type=self.ip_pool_type,
        cookies=self.login_cookies_dict)

    seller_id = '0'
    try:
        # Raw string: '\d' in a plain literal is an invalid escape sequence
        # (DeprecationWarning, SyntaxWarning on newer interpreters).
        seller_id = str(re.compile(r'"userId":(\d+),').findall(body)[0])
    except Exception:
        # Deliberately best-effort: any failure leaves the '0' sentinel,
        # which the assertion below turns into an error.
        pass

    assert seller_id != '0', '获取到的seller_id为0!'

    return seller_id
def traversal_hour_timestamp(self, item):
    '''
    Fetch the mogujie rush-buy list of one on-the-hour timestamp.

    The JSONP response is unwrapped and decoded. When it decodes to a
    non-empty dict, its ``data.list`` goes to ``self.deal_with_data``
    together with the timestamp, followed by a MOGUJIE_SLEEP_TIME pause.

    :param item: event timestamp, interpolated into the request URL
    :return: None
    '''
    tmp_url = 'https://qiang.mogujie.com//jsonp/fastBuyListActionLet/1?eventTime={0}&bizKey=rush_main'.format(
        str(item))
    body = Requests.get_url_body(
        url=tmp_url,
        headers=self.headers,
        had_referer=True,
        ip_pool_type=self.ip_pool_type)
    if body == '':
        print('item为: ', item)
        print('获取到的body为空值! 此处跳过')
        return

    try:
        # Unwrap the JSONP envelope ``null(...)``. Raw string: '\(' in a
        # plain literal is an invalid escape sequence.
        body = re.compile(r'null\((.*)\)').findall(body)[0]
    except Exception:
        print('re匹配body中的数据时出错!')
        body = '{}'

    try:
        tmp_data = json.loads(body)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError. A bare ``except:`` would
        # also swallow KeyboardInterrupt/SystemExit.
        print('json.loads转换body时出错, 此处跳过!')
        tmp_data = {}

    if tmp_data == {}:
        print('tmp_data为空{}!')
        return

    event_time = item
    item_list = tmp_data.get('data', {}).get('list', [])
    self.deal_with_data(event_time, item_list)
    sleep(MOGUJIE_SLEEP_TIME)
def _get_pintuan_goods_info(self):
    '''
    Crawl the zhe800 group-buy deals listing page by page.

    Pages 0..99 are requested in order (500 deals per page); crawling stops
    at the first page that yields no objects (empty body, undecodable JSON
    or an empty list).

    :return: list of unique ``(zid, page)`` tuples in first-seen order;
        deals without a zid are left out
    '''
    zid_list = []
    for page in range(0, 100):
        tmp_url = 'https://pina.m.zhe800.com/nnc/list/deals.json?page={0}&size=500'.format(
            str(page))
        print('正在抓取的页面地址为: ', tmp_url)

        tmp_body = Requests.get_url_body(
            url=tmp_url,
            headers=self.headers,
            high_conceal=True,
            ip_pool_type=self.ip_pool_type)
        if tmp_body == '':
            tmp_body = '{}'

        try:
            tmp_data = json.loads(tmp_body).get('objects', [])
        except (ValueError, TypeError, AttributeError):
            # Covers malformed JSON and unexpected payload shapes. A bare
            # ``except:`` would also swallow KeyboardInterrupt/SystemExit.
            print('json.loads转换tmp_data时出错!')
            tmp_data = []

        if tmp_data == []:
            print('该tmp_url得到的object为空list, 此处跳过!')
            break

        for item in tmp_data:
            zid = item.get('product', {}).get('zid', '')
            # Test the zid itself. The original compared the whole
            # (zid, page) tuple with '', which is always unequal, so empty
            # zids were never filtered out.
            if zid != '':
                zid_list.append((zid, page))

    # Drop duplicates while keeping first-seen order, so the result is
    # deterministic (a plain set() round-trip gives arbitrary order).
    zid_list = list(dict.fromkeys(zid_list))
    print('该zid_list的总个数为: ', len(zid_list))
    print(zid_list)

    return zid_list
def get_spike_hour_goods_info(self):
    '''
    Walk mia's flash-sale promotion ids and hand each one to ``deal_with_data``.

    Promotion numbers MIA_BASE_NUMBER up to (not including) MIA_MAX_NUMBER
    are requested one by one. Empty promotions and those starting at 22
    o'clock are skipped; every processed promotion is followed by a 5s pause.

    :return: None
    :raises AssertionError: when ``self._get_db_goods_id_list()`` returns None
    '''
    self.db_goods_id_list = self._get_db_goods_id_list()
    assert self.db_goods_id_list is not None, 'self.db_goods_id_list为空值!'

    mia_base_number = MIA_BASE_NUMBER
    while mia_base_number < MIA_MAX_NUMBER:
        tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(mia_base_number)
        body = Requests.get_url_body(
            url=tmp_url,
            headers=self.headers,
            had_referer=True,
            ip_pool_type=self.ip_pool_type)
        if body in ('', '[]'):
            print('mia_base_number为: ', mia_base_number)
            print('获取到的body为空值! 此处跳过')
            mia_base_number += 1
            continue

        tmp_data = json_2_dict(body, default_res={})
        promotion = tmp_data.get('p_info', {})
        # Characters 11..12 of start_time are compared against the hour '22'.
        start_hour = promotion.get('start_time', '')[11:13]
        if start_hour == '22':
            # Sales starting at 22 o'clock are not crawled.
            print('--- 销售时间为22点,不抓取!')
        else:
            print(tmp_data)
            print('mia_base_number为: ', mia_base_number)
            self.deal_with_data(
                mia_base_number,
                promotion.get('start_time', ''),
                promotion.get('end_time', ''),
                tmp_data.get('item_list', []))
            sleep(5)

        mia_base_number += 1