def _get_this_goods_member_id(self, goods_id):
    '''
    Fetch the seller member_id from the 1688 offer-remark page.

    :param goods_id:
    :return: '' or str
    '''
    headers = {
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': get_random_pc_ua(),
        # 'X-DevTools-Emulate-Network-Conditions-Client-Id': '5C1ED6AF76F4F84D961F136EAA06C40F',
    }
    params = (
        ('offerId', str(goods_id)),
    )
    url = 'https://m.1688.com/page/offerRemark.htm'
    body = MyRequests.get_url_body(url=url, headers=headers, params=params)
    # self.my_lg.info(str(body))
    if body == '':
        self.my_lg.error('获取到的body为空值!此处跳过!')
        return ''
    matches = re.compile(r'"memberId":"(.*?)",').findall(body)
    if not matches:
        self.my_lg.error('获取member_id时索引异常!请检查!')
        return ''
    return matches[0]
def needIdenCode(self):
    '''
    First-login probe: POST the login form and detect whether the server
    demands a captcha.

    :return: decoded page content (str) when a captcha is required,
             False when it is not, None when the HTTP request failed.
    '''
    # Old urllib Request/opener flow replaced by a direct requests.post
    # through the project proxy pool.
    response = requests.post(url=self.loginURL,
                             headers=self.loginHeaders,
                             data=json.dumps(self.postData),
                             proxies=MyRequests._get_proxies())
    content = response.content.decode('gbk')
    status = response.status_code
    if status == 200:
        print("获取请求成功")
        # BUGFIX: the six chars "请输入验证码" ("please enter the captcha").
        # The previous pattern u'u8bf7u8f93u5165u9a8cu8bc1u7801' had lost
        # its backslashes, so it searched for literal ASCII text and could
        # never match the actual Chinese prompt in the page.
        pattern = re.compile(u'\u8bf7\u8f93\u5165\u9a8c\u8bc1\u7801', re.S)
        result = re.search(pattern, content)
        if result:
            # Prompt found: a captcha is required this time.
            print("此次安全验证异常,您需要输入验证码")
            return content
        else:
            print("此次安全验证通过,您这次不需要输入验证码")
            return False
    else:
        # Request failed; made the implicit None return explicit.
        print("获取请求失败")
        return None
def get_all_img_url(self, goods_id, is_hk):
    '''
    Collect every gallery image url from the goods detail page.

    :param goods_id:
    :param is_hk: True -> global-shopping site (miyabaobei.hk)
    :return: '' on request failure, else a list of {'img_url': ...}
    '''
    if is_hk is True:
        # Global-shopping goods live on the .hk domain.
        detail_url = 'https://www.miyabaobei.hk/item-' + str(goods_id) + '.html'
    else:
        detail_url = 'https://www.mia.com/item-' + str(goods_id) + '.html'
    page_body = MyRequests.get_url_body(url=detail_url, headers=self.headers, had_referer=True)
    # print(Selector(text=page_body).css('div.small').extract())
    if page_body == '':
        print('请求tmp_body_2为空值, 此处先跳过!')
        return ''
    all_img_url = []
    for img_fragment in Selector(text=page_body).css('div.small img').extract():
        src = Selector(text=img_fragment).css('img::attr("src")').extract_first()
        all_img_url.append({'img_url': src})
    return all_img_url
def get_jump_to_url_and_is_hk(self, body):
    '''
    Extract the redirect address (group-buy goods) and the is_hk flag.

    :param body: html body of the url to inspect
    :return: (body, sign_direct_url, is_hk) as (str, str, bool)
    '''
    is_hk = False
    sign_direct_url = ''
    # A '_sign_direct_url' assignment marks a redirecting page, which is
    # typically a group-buy goods.
    if re.compile(r'_sign_direct_url = ').findall(body) != []:
        found = re.compile(r"_sign_direct_url = '(.*?)';").findall(body)
        if found:
            sign_direct_url = found[0]
            print('*** 获取到跳转地址为: ', sign_direct_url)
        else:
            print('获取跳转的地址时出错!')
        # Re-fetch using the redirect target.
        body = MyRequests.get_url_body(url=sign_direct_url, headers=self.headers, had_referer=True)
        # An m.miyabaobei.hk host means a global-shopping goods.
        if re.compile(r'://m.miyabaobei.hk/').findall(sign_direct_url) != []:
            print('*** 此商品为全球购商品!')
            is_hk = True
    return (body, sign_direct_url, is_hk)
def _get_comment_data(self, goods_id):
    # Fetch up to 3 pages of taobao PC-side comments for goods_id and pack
    # them into a CommentItem stored on self.result_data.
    # Returns {} (and resets self.result_data) on any failure.
    if goods_id == '':
        self.result_data = {}
        return {}
    _tmp_comment_list = []
    self.my_lg.info('------>>>| 待抓取的goods_id: %s' % goods_id)
    ''' 下面抓取的是pc端的数据地址 '''
    # Grab comment pages 1..3 from the PC-side endpoint.
    for current_page_num in range(1, 4):
        self.my_lg.info('------>>>| 正在抓取第%s页评论...' % str(current_page_num))
        tmp_url = 'https://rate.taobao.com/feedRateList.htm'
        _params = self._set_params(current_page_num=current_page_num, goods_id=goods_id)
        self.headers.update({'referer': 'https://item.taobao.com/item.htm?id='+goods_id})
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, params=_params, encoding='gbk')
        # self.my_lg.info(str(body))
        try:
            # The endpoint answers JSONP; strip the wrapping parentheses.
            body = re.compile('\((.*)\)').findall(body)[0]
        except IndexError:
            self.my_lg.error('re得到需求body时出错! 出错goods_id: ' + goods_id)
            sleep(.5)
            self.result_data = {}
            return {}
        data = json_2_dict(json_str=body, logger=self.my_lg).get('comments')
        # pprint(data)
        if data is None:
            # Parse failure is fatal for this goods.
            self.my_lg.error('出错goods_id: ' + goods_id)
            self.result_data = {}
            return {}
        if data == []:
            # "comments" is [] on this page: skip to the next page.
            continue
        _tmp_comment_list += data
        sleep(self.comment_page_switch_sleep_time)
    # self.my_lg.info(str(len(_tmp_comment_list)))
    try:
        _comment_list = self._get_comment_list(_tmp_comment_list=_tmp_comment_list)
    except Exception as e:
        self.my_lg.error('出错goods_id: ' + goods_id)
        self.my_lg.exception(e)
        self.result_data = {}
        return {}
    _t = datetime.datetime.now()
    _r = CommentItem()
    _r['goods_id'] = str(goods_id)
    _r['create_time'] = _t
    _r['modify_time'] = _t
    _r['_comment_list'] = _comment_list
    self.result_data = _r
    # pprint(self.result_data)
    return self.result_data
def _get_shop_name(self, **kwargs):
    '''
    Fetch the shop name of the seller behind the current goods.

    :param kwargs: expects 'data' (dict) holding the parsed detail api result
    :return: shop name str ('' when absent), or {} when the seller api
             fails (kept for backward compatibility with existing callers)
    '''
    data = kwargs.get('data', {})
    seller_id = data.get('/app/detail/product/base', {}).get('sellerId', 0)
    tmp_seller_id_url = 'https://th5.m.zhe800.com/api/getsellerandswitch?sellerId=' + str(seller_id)
    seller_info_body = MyRequests.get_url_body(url=tmp_seller_id_url, headers=self.headers, high_conceal=True)
    if seller_info_body == '':
        print('seller_info为空!')
        return {}
    # The previous code wrapped the body in a list, re-joined it and
    # re-wrapped it -- a no-op round trip whose final `if seller_info != []`
    # check was always true. Parse the body directly instead.
    seller_info = json_2_dict(json_str=seller_info_body)
    if seller_info == {}:
        print('卖家信息在转换时出现错误, 此处跳过')
        return {}
    # pprint(seller_info)
    shop_name = seller_info.get('sellerInfo', {}).get('nickName', '')
    # print(shop_name)
    return shop_name
def get_p_info_list(self, goods_id):
    '''
    Fetch the detail-property list of a goods.

    :param goods_id:
    :return: list of {'p_name': ..., 'p_value': ...} ([] when empty)
    '''
    p_info_url = 'https://pina.m.zhe800.com/cns/products/get_product_properties_list.json?productId=' + str(goods_id)
    p_info_body = MyRequests.get_url_body(url=p_info_url, headers=self.headers, high_conceal=True)
    if p_info_body == '':
        print('获取到的p_info_body为空值, 此处跳过!')
        p_info_body = '{}'
    raw_props = json_2_dict(json_str=p_info_body).get('perportieslist', [])
    if raw_props == []:
        # Reset so a failed fetch cannot leak into later assignments.
        self.result_data = {}
        return raw_props
    return [{
        'p_name': prop.get('name', ''),
        'p_value': prop.get('value'),
    } for prop in raw_props]
def get_div_desc_body(self, div_desc_url):
    '''
    Fetch the html body of the div_desc description page.

    :param div_desc_url:
    :return: washed description html wrapped in a <div>, or the empty
             result when nothing usable was fetched
    '''
    raw_body = MyRequests.get_url_body(url=div_desc_url, headers=self.headers)
    if raw_body == '':
        raw_body = '{}'
    desc = json_2_dict(json_str=raw_body).get('data', '')
    if desc == '':
        # Reset so a failed fetch cannot leak into later assignments.
        self.result_data = {}
    desc = self._wash_div_desc(tmp_body=desc)
    if desc != '':
        desc = '<div>' + desc + '</div>'
    return desc
def get_p_info_list(self, p_info_url):
    '''
    Fetch the detail-property list from a prepared api url.

    :param p_info_url:
    :return: list of {'p_name': ..., 'p_value': ...} ([] when empty)
    '''
    body = MyRequests.get_url_body(url=p_info_url, headers=self.headers)
    if body == '':
        print('获取到的p_info_body为空值, 此处跳过!')
        body = '{}'
    raw_props = json_2_dict(json_str=body).get('perportieslist', [])
    if raw_props == []:
        # Reset so a failed fetch cannot leak into later assignments.
        self.result_data = {}
        return raw_props
    return [{
        'p_name': prop.get('name', ''),
        'p_value': prop.get('value'),
    } for prop in raw_props]
def _get_one_fund_info(self, fund_code):
    '''
    Fetch one fund's eastmoney pingzhongdata js blob and hand it to
    self._get_this_fund_info for parsing.

    :param fund_code: fund code string, e.g. '001092'
    :return: True (failures are handled downstream)
    '''
    # Pre-baked browsing cookies; the EMFUNDx entries are recently viewed
    # funds (url-encoded originals kept commented for reference).
    cookies = {
        'st_pvi': '11586003301354',
        'st_si': '46806950936799',
        'ASP.NET_SessionId': 'fhllwae2zicg00o0x4ub1fxs',
        'EMFUND1': 'null',
        'EMFUND0': 'null',
        # 'EMFUND2': '07-10%2018%3A01%3A38@%23%24%u534E%u6DA6%u5143%u5927%u73B0%u91D1%u901A%u8D27%u5E01B@%23%24002884',
        'EMFUND2': '07-10 18:01:38@#$华润元大现金通货币B@#$002884',
        # 'EMFUND3': '07-10%2018%3A01%3A48@%23%24%u5929%u5F18%u73B0%u91D1%u7BA1%u5BB6%u8D27%u5E01B@%23%24420106',
        'EMFUND3': '07-10 18:01:48@#$天弘现金管家货币B@#$420106',
        # 'EMFUND4': '07-10%2018%3A11%3A53@%23%24%u65B9%u6B63%u5BCC%u90A6%u4FDD%u9669%u4E3B%u9898%u6307%u6570%u5206%u7EA7@%23%24167301',
        'EMFUND4': '07-10 18:11:53@#$方正富邦保险主题指数分级@#$167301',
        # 'EMFUND5': '07-10%2018%3A04%3A32@%23%24%u62DB%u5546%u4E2D%u8BC1%u94F6%u884C%u6307%u6570%u5206%u7EA7@%23%24161723',
        'EMFUND5': '07-10 18:04:32@#$招商中证银行指数分级@#$161723',
        # 'EMFUND6': '07-10%2018%3A05%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570C@%23%24001595',
        'EMFUND6': '07-10 18:05:13@#$天弘中证银行指数C@#$001595',
        # 'EMFUND7': '07-10%2018%3A06%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570A@%23%24001594',
        'EMFUND7': '07-10 18:06:13@#$天弘中证银行指数A@#$001594',
        # 'EMFUND8': '07-10%2018%3A11%3A22@%23%24%u7533%u4E07%u83F1%u4FE1%u591A%u7B56%u7565%u7075%u6D3B%u914D%u7F6E%u6DF7%u5408A@%23%24001148',
        'EMFUND8': '07-10 18:11:22@#$申万菱信多策略灵活配置混合A@#$001148',
        # 'EMFUND9': '07-10 18:12:26@#$%u5E7F%u53D1%u751F%u7269%u79D1%u6280%u6307%u6570%28QDII%29@%23%24001092',
        'EMFUND9': '07-10 18:12:26@#$广发生物科技指数(QDII)@#$001092',
    }
    cookies = unquote_cookies(cookies)
    # pprint(cookies)
    headers = {
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Accept': '*/*',
        # 'Referer': 'http://fund.eastmoney.com/001092.html',
        'Proxy-Connection': 'keep-alive',
    }
    # Cache-buster: current Shanghai time squeezed to digits,
    # e.g. 2018-07-10 18:30:46 -> 20180710183046
    v = re.compile(r'-| |:').sub('', 
str(get_shanghai_time()))
    # print(v)
    params = (
        # ('v', '20180710175951'),   # timestamp
        ('v', v),  # timestamp
    )
    fund_url = 'http://fund.eastmoney.com/pingzhongdata/{0}.js'.format(fund_code)
    # response = requests.get(fund_url, headers=headers, params=params, cookies=None)
    # body = response.text
    # print(body)
    body = MyRequests.get_url_body(url=fund_url, headers=headers, params=params, cookies=None)
    # print(body)
    self._get_this_fund_info(body=body)
    return True
def _get_tmall_goods_keywords_goods_id_list(self, keyword):
    '''
    Fetch tmall's best-selling goods urls for a keyword.

    :param keyword: sequence whose [1] element is the search term
    :return: list eg: ['//detail.tmall.com/item.htm?id=566978017832&skuId=3606684772412', ...]
             (item urls, NOT bare goods ids)
    '''
    # Approach: the tmall m-site search api; occasionally flaky but usable.
    headers = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': get_random_pc_ua(),
        'accept': '*/*',
        # 'referer': (captured from a live search session, omitted)
        'authority': 'list.tmall.com',
        # 'cookie': (long captured browser cookie omitted)
    }
    params = {
        'page_size': '20',
        'page_no': '1',
        'q': str(keyword[1]),
        'type': 'p',
        'spm': 'a220m.6910245.a2227oh.d100',
        'from': 'mallfp..m_1_suggest',
        'sort': 'd',
    }
    s_url = 'https://list.tmall.com/m/search_items.htm'
    body = MyRequests.get_url_body(url=s_url, headers=headers, params=params)
    # self.my_lg.info(str(body))
    if body == '':
        return []
    else:
        data = 
json_2_dict(json_str=body, logger=self.my_lg)
        if data == {}:
            self.my_lg.error('获取到的天猫搜索data为空dict! 出错关键字为{0}'.format(
                keyword[1]))
            return []
        else:
            _ = data.get('item', [])
            if _ is None or _ == []:
                self.my_lg.error(
                    '获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(
                        keyword[1]))
                return []
            try:
                # Each entry carries its item url; keep them all as str.
                goods_id_list = [str(item.get('url', '')) for item in _]
            except Exception as e:
                self.my_lg.exception(e)
                self.my_lg.error(
                    '获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(
                        keyword[1]))
                return []
    return goods_id_list
def get_true_sku_info(self, sku_info):
    '''
    Fetch price / spec / stock for every sku variant.

    :param sku_info: list of dicts each holding at least 'goods_id',
                     'color_name' and 'img_url'
    :return: {} on error | (true_sku_info, i_s)
    '''
    # Batch endpoint: all goods ids joined with '-'.
    goods_id_str = '-'.join([item.get('goods_id') for item in sku_info])
    # print(goods_id_str)
    tmp_url = 'https://p.mia.com/item/list/' + goods_id_str
    # print(tmp_url)
    tmp_body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
    # print(tmp_body)
    try:
        tmp_data = json.loads(tmp_body).get('data', [])
        # pprint(tmp_data)
    except Exception as e:
        print('json.loads转换tmp_body时出错!')
        tmp_data = []
        self.result_data = {}
        return {}
    true_sku_info = []
    # NOTE(review): after the loops, i_s holds the stock map of the *last*
    # matched goods only -- confirm callers expect that.
    i_s = {}
    for item_1 in sku_info:
        for item_2 in tmp_data:
            if item_1.get('goods_id') == str(item_2.get('id', '')):
                i_s = item_2.get('i_s', {})  # stock count per spec key
                # print(i_s)
                for item_3 in i_s.keys():
                    tmp = {}
                    if item_3 == 'SINGLE':
                        # Single-spec goods: the spec is just the color.
                        spec_value = item_1.get('color_name')
                    else:
                        spec_value = item_1.get(
                            'color_name') + '|' + item_3
                    normal_price = str(item_2.get('mp'))
                    detail_price = str(item_2.get('sp'))
                    img_url = item_1.get('img_url')
                    rest_number = i_s.get(item_3)
                    if rest_number == 0:
                        # Out of stock: drop this variant.
                        pass
                    else:
                        tmp['spec_value'] = spec_value
                        tmp['normal_price'] = normal_price
                        tmp['detail_price'] = detail_price
                        tmp['img_url'] = img_url
                        tmp['rest_number'] = rest_number
                        true_sku_info.append(tmp)
    return (true_sku_info, i_s)
def _get_p_info(self, goods_id):
    '''
    Fetch the mogujie pc detail-info api and parse the property info.

    :param goods_id:
    :return: (p_info, tmp_p_info_body)
    '''
    api_url = 'https://shop.mogujie.com/ajax/mgj.pc.detailinfo/v1?_ajax=1&itemId=' + str(
        goods_id)
    tmp_p_info_body = MyRequests.get_url_body(url=api_url, headers=self.headers, had_referer=True)
    # print(tmp_p_info_body)
    assert tmp_p_info_body != '', '获取到的tmp_p_info_body为空值, 请检查!'
    parsed = self.get_goods_p_info(tmp_p_info_body=tmp_p_info_body)
    return parsed, tmp_p_info_body
def _get_div_desc(self, data):
    '''
    Extract the goods description html (div_desc).

    :param data: parsed detail dict holding good.intros
    :return: '<div>...</div>' description html
    :raises IndexError: when intros is missing or empty
    :raises ValueError: when no description url/body could be obtained
    '''
    def _get_right_body(body):
        '''Cut the usable main body out of the raw description page.'''
        # Normalise lazy-load image attributes so every image ends up
        # with a real src= attribute.
        body = re.compile(r'<img src=').sub('<img data-lazy-src=', body)
        body = re.compile(r'data-lazy-src=').sub('src=', body)
        body = re.compile(r'<img data-src=').sub('<img src=', body)
        body = re.compile(r';opacity:0').sub('', body)  # keep, or images stay hidden
        # print(body)
        try:
            main_body = re.compile(r'<main .*?>(.*)</main>').findall(body)[0]
        except IndexError:
            # No <main> element: fall back to the <body> content.
            main_body = re.compile(r'<body>(.*?)</body>').findall(body)[0]
        # Strip embedded scripts.
        main_body = re.compile(r'<script.*?>.*?</script>').sub('', main_body)
        return main_body

    try:
        intros = data.get('good', {}).get('intros', [])[0]
    except IndexError:
        raise IndexError('获取intros获取异常!')
    tabs = intros.get('tabs', [])
    # pprint(tabs)
    div_desc_url = ''
    # Pick the first tab whose title looks like a description page.
    title_list = [
        '功能详情',
        '产品介绍',
        '概述',
        '商品详情',
    ]
    for item in tabs:
        if item.get('title', '') in title_list:
            div_desc_url = item.get('url', '')
            break
    if div_desc_url == '':
        raise ValueError('获取div_desc_url为空值!')
    body = MyRequests.get_url_body(url=div_desc_url, headers=self.headers)
    # self.my_lg.info(str(body))
    if body == '':
        raise ValueError('获取到的div_desc为空值!')
    div_desc = '<div>' + _get_right_body(body) + '</div>'
    # self.my_lg.info(str(div_desc))
    return div_desc
def _get_comment_data(self, goods_id):
    # Fetch 2 pages of JD mobile comments for goods_id and pack them into
    # a CommentItem stored on self.result_data. Returns {} on failure.
    if goods_id == '':
        self.result_data = {}
        return {}
    self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))
    self.goods_id = goods_id
    self.headers.update({
        'referer': 'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id),
    })
    # Pull pages 1..2 from the JD mobile comment api.
    _tmp_comment_list = []
    for current_page in range(1, 3):
        _url = 'https://item.m.jd.com/newComments/newCommentsDetail.json'
        params = self._set_params(goods_id=goods_id, current_page=current_page)
        body = MyRequests.get_url_body(url=_url, headers=self.headers, params=params)
        # self.my_lg.info(str(body))
        _data = json_2_dict(json_str=body, logger=self.my_lg).get(
            'wareDetailComment', {}).get('commentInfoList', [])
        if _data == []:
            # Empty page: logged but not fatal -- the loop keeps going.
            self.my_lg.error('出错goods_id:{0}'.format(self.goods_id))
        _tmp_comment_list += _data
        sleep(self.comment_page_switch_sleep_time)
    # pprint(_tmp_comment_list)
    try:
        _comment_list = self._get_comment_list(
            _tmp_comment_list=_tmp_comment_list)
    except Exception as e:
        self.my_lg.error('出错goods_id:{0}'.format(goods_id))
        self.my_lg.exception(e)
        self.result_data = {}
        return {}
    _t = datetime.datetime.now()
    _r = CommentItem()
    _r['goods_id'] = str(goods_id)
    _r['create_time'] = _t
    _r['modify_time'] = _t
    _r['_comment_list'] = _comment_list
    self.result_data = _r
    # pprint(self.result_data)
    return self.result_data
async def _get_target_url_and_content_id_and_csid(self, taobao_short_url):
    '''
    Resolve a taobao share short url into its target url plus ids.

    :param taobao_short_url:
    :return: (target_url, content_id, csid, tag_name, tag) -- all str;
             empty strings for whatever could not be extracted
    '''
    if re.compile(r'contentId').findall(taobao_short_url) != []:
        # Already the target address -- no redirect hop needed.
        target_url = taobao_short_url
    else:
        body = MyRequests.get_url_body(url=taobao_short_url, headers=self.headers)
        # self.my_lg.info(str(body))
        if body == '':
            self.my_lg.error('获取到的body为空值, 出错短链接地址: {0}'.format(str(taobao_short_url)))
            # BUGFIX: this path used to return only 3 values while every
            # other path returns 5, crashing callers that unpack 5.
            return '', '', '', '', ''
        try:
            # The redirect target is embedded as `var url = '...';`.
            target_url = re.compile('var url = \'(.*?)\';').findall(body)[0]
            # self.my_lg.info(str(target_url))
        except IndexError:
            self.my_lg.error('获取target_url的时候IndexError! 出错短链接地址: {0}'.format(str(taobao_short_url)))
            target_url = ''
    try:
        content_id = re.compile('contentId=(\d+)').findall(target_url)[0]
        # self.my_lg.info(content_id)
    except IndexError:
        self.my_lg.error('获取content_id时IndexError! 出错短链接地址: {0}'.format(str(taobao_short_url)))
        content_id = ''
    try:
        # csid is percent-encoded json inside the url.
        csid = re.compile('csid%22%3A%22(.*?)%22%7D').findall(target_url)[0]
        # self.my_lg.info(csid)
    except IndexError:
        self.my_lg.info('此链接为无csid情况的链接...')
        csid = ''
    try:
        tag_name = re.compile('tagName=(.*?)&').findall(target_url)[0]
    except IndexError:
        tag_name = ''
    try:
        tag = re.compile('tag=(.*?)&').findall(target_url)[0]
    except IndexError:
        tag = ''
    return target_url, content_id, csid, tag_name, tag
def _get_pintuan_goods_info(self):
    '''
    Walk the juanpi group-buy list api page by page and collect every
    recent limited-time group-buy goods.

    :return: list of {'goods_id', 'begin_time', 'end_time',
                      'all_sell_count', 'page'} dicts, deduped by goods_id
    '''
    pintuan_goods_id_list = []
    for page in range(0, 100):
        tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(
            str(page))
        print('正在抓取的页面地址为: ', tmp_url)
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        if body == '':
            body = '{}'
        try:
            tmp_data = json.loads(body)
            tmp_data = tmp_data.get('data', {}).get('goods', [])
        except:
            print('json.loads转换tmp_data时出错!')
            tmp_data = []
        # print(tmp_data)
        sleep(.5)
        if tmp_data == []:
            # An empty page marks the end of the listing.
            print('该tmp_url得到的goods为空list, 此处跳过!')
            break
        tmp_pintuan_goods_id_list = [{
            'goods_id': item.get('goods_id', ''),
            # NOTE(review): int('') raises ValueError if start/end_time is
            # absent -- confirm the api always sends both fields.
            'begin_time': timestamp_to_regulartime(int(item.get('start_time', ''))),
            'end_time': timestamp_to_regulartime(int(item.get('end_time', ''))),
            'all_sell_count': str(item.get('join_number_int', '')),
            'page': page,
        } for item in tmp_data]
        # print(tmp_pintuan_goods_id_list)
        # Dedup across pages by goods_id.
        for item in tmp_pintuan_goods_id_list:
            if item.get('goods_id', '') not in [
                item2.get('goods_id', '') for item2 in pintuan_goods_id_list
            ]:
                pintuan_goods_id_list.append(item)
    print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
    print(pintuan_goods_id_list)
    return pintuan_goods_id_list
def _z8_get_parent_dir(goods_id) -> str:
    '''
    zhe800: fetch the parent category path of a goods
    (works for normal, group-buy and seckill goods alike).

    :param goods_id:
    :return: '' on failure | 'xxx/xxx'
    '''
    headers = {
        'authority': 'shop.zhe800.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': get_random_pc_ua(),
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        # 'referer': (captured from a live brand page session, omitted)
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        # 'cookie': (long captured browser cookie omitted)
    }
    # NOTE(review): params is built here but the request below passes
    # params=None -- confirm whether these were meant to be sent.
    params = (
        ('jump_source', '1'),
        ('qd_key', 'qyOwt6Jn'),
    )
    url = 'https://shop.zhe800.com/products/{0}'.format(goods_id)
    body = MyRequests.get_url_body(url=url, headers=headers, params=None, high_conceal=True)
    # print(body)
    parent_dir = []
    try:
        # The breadcrumb lives in <aside class="pos area">.
        aside = Selector(text=body).css('aside.pos.area').extract_first()
        # print(aside)
        assert aside is not None, '获取到的aside为None!获取parent_dir失败!'
        _1 = Selector(text=aside).css('em::text').extract_first()
        # print(_1)
        parent_dir.append(_1)
        # Second breadcrumb segment sits between </i> and <i>.
        _2 = re.compile('</i>(.*?)<i>').findall(aside)[1].replace(' ', '')
        # print(_2)
    except Exception as e:
        print('获取parent_dir时遇到错误(默认为""):', e)
        return ''
    parent_dir.append(_2)
    # Join into the parent path 'xxx/xxx'.
    parent_dir = '/'.join(parent_dir)
    # print(parent_dir)
    return parent_dir
def getRandomExternalLink(startingPage):
    '''
    Pick a random external link found on startingPage; when the page has
    none, follow a random internal link and retry from there.
    '''
    html = MyRequests.get_url_body(url=startingPage, headers=headers)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if externalLinks:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]
    # No external links here: recurse into a random internal page.
    print("没有外部链接,准备遍历整个网站")
    parsed = urlparse(startingPage)
    domain = parsed.scheme + "://" + parsed.netloc
    internalLinks = getInternalLinks(bsObj, domain)
    return getRandomExternalLink(
        internalLinks[random.randint(0, len(internalLinks) - 1)])
def _get_aweme_api_videos_info(self, user_id):
    '''
    Fetch the first 20 posted videos of a douyin user and dispatch the
    raw api body to self.deal_with_data.

    :param user_id:
    '''
    self.user_id = user_id
    query = (
        ('user_id', self.user_id),
        ('max_cursor', '0'),
        ('count', '20'),
    )
    api_url = 'https://www.douyin.com/aweme/v1/aweme/post/'
    raw_body = MyRequests.get_url_body(url=api_url, headers=self.headers, params=query)
    # print(raw_body)
    self.deal_with_data(body=raw_body)
def get_pintuan_goods_info(self):
    '''
    Walk the mia group-buy list api index by index and hand the collected
    goods to self.deal_with_data.

    :return: None
    '''
    goods_list = []
    # Index 0 and 1 return the same page, so iteration starts at 1.
    for index in range(1, 1000):
        tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(
            index) + '/0/'
        print('正在抓取: ', tmp_url)
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
        # print(body)
        if body == '':
            print('获取到的body为空值! 此处跳过')
        else:
            try:
                tmp_data = json.loads(body)
            except:
                tmp_data = {}
                print('json.loads转换body时出错, 此处跳过!')
            if tmp_data.get('data_list', []) == []:
                # An empty data_list marks the end of the listing.
                print('得到的data_list为[], 此处跳过!')
                break
            else:
                # print(tmp_data)
                data_list = [{
                    'goods_id': item.get('sku', ''),
                    'sub_title': item.get('intro', ''),
                    'pid': index,
                } for item in tmp_data.get('data_list', [])]
                # pprint(data_list)
                for item in data_list:
                    goods_list.append(item)
        sleep(.5)
    pprint(goods_list)
    self.deal_with_data(goods_list=goods_list)
    sleep(8)
    return None
def get_stock_info_dict(self, goods_id):
    '''
    Fetch the realtime stock info of a goods.

    :param goods_id:
    :return: dict ({} on failure)
    '''
    stock_info_url = 'https://pina.m.zhe800.com/cns/products/' + str(goods_id) + '/realtime_info.json'
    raw_body = MyRequests.get_url_body(url=stock_info_url, headers=self.headers, high_conceal=True)
    if raw_body == '':
        print('获取到的stock_info_body为空值!')
        raw_body = '{}'
    stock_info = json_2_dict(json_str=raw_body).get('data', {})
    if stock_info == {}:
        # Reset so a failed fetch cannot leak into later assignments.
        self.result_data = {}
    return stock_info
def get_stock_info_dict(self, stock_info_url):
    '''
    Fetch the realtime stock info from a prepared api url.

    :param stock_info_url:
    :return: dict ({} on failure)
    '''
    raw_body = MyRequests.get_url_body(url=stock_info_url, headers=self.headers)
    if raw_body == '':
        print('获取到的stock_info_body为空值!')
        raw_body = '{}'
    stock_info = json_2_dict(json_str=raw_body).get('data', {})
    if stock_info == {}:
        # Reset so a failed fetch cannot leak into later assignments.
        self.result_data = {}
    return stock_info
def _judge_is_taobao_head_img(self, url):
    '''
    Tell whether url resolves to taobao's default avatar image.

    :param url:
    :return: bool (False on any network error)
    '''
    tmp_proxies = MyRequests._get_proxies()
    try:
        _res = requests.get(url=url, headers=self.headers, proxies=tmp_proxies)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        self.my_lg.info('检测图片地址时网络错误! 跳过!')
        return False
    self.my_lg.info(str(_res.url))
    # The default avatar redirects to this fixed cdn address.
    return _res.url == 'https://gw.alicdn.com/tps/i3/TB1yeWeIFXXXXX5XFXXuAZJYXXX-210-210.png_40x40.jpg'
def get_one_page_goods_info(self, *params):
    '''
    Fetch one page of goods data from the chuchujie api.

    :param params: (gender, page)
    :return: '{}' or the response body str
    '''
    gender, page = params
    api_url = 'https://api.chuchujie.com/api/'
    client = {
        "ageGroup": "AG_0to24",
        "channel": "QD_web_webkit",
        "deviceId": "0",
        "gender": gender,  # '0' -> female | '1' -> male
        "imei": "0",
        "packageName": "com.culiu.purchase",
        "platform": "wap",
        "sessionId": "0",
        "shopToken": "0",
        "userId": "0",
        "version": "1.0",
        "xingeToken": ""
    }
    query = {
        "group": 4,
        "module": "99",
        "page": page,
        "tab": "all"
    }
    # NOTE: these go out as plain query-string parameters; data to be
    # POSTed would use the post method instead.
    payload = {
        'client': json.dumps(client),
        'query': json.dumps(query),
        'page': page,
    }
    body = MyRequests.get_url_body(url=api_url, headers=self.headers, params=payload)
    return body if body != '' else '{}'
def _get_1688_goods_keywords_goods_id_list(self, keyword):
    '''
    Fetch the best-selling 1688 goods ids for a keyword.

    :param keyword: sequence whose [1] element is the search term
    :return: a list eg: ['11111', ...]
    '''
    # Approach 1: scrape the first page of m.1688.com search results,
    # sorted by booked (sales) volume.
    headers = {
        'authority': 'm.1688.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': get_random_pc_ua(),
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        # 'cookie': (long captured browser cookie omitted)
    }
    params = (
        ('sortType', 'booked'),  # sort by sales
        ('filtId', ''),
        ('keywords', keyword[1]),
        ('descendOrder', 'true'),
    )
    url = 'https://m.1688.com/offer_search/-6161.html'
    body = MyRequests.get_url_body(url=url, headers=headers, params=params)
    # self.my_lg.info(str(body))
    if body == '':
        return []
    else:
        try:
            # Goods ids sit in the data-offer-id attribute of each item.
            goods_id_list = Selector(text=body).css(
                'div.list_group-item::attr("data-offer-id")').extract()
            # pprint(goods_id_list)
        except Exception as e:
            self.my_lg.exception(e)
            self.my_lg.error(
                '获取1688搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
            goods_id_list = []
    return goods_id_list
def get_spike_hour_goods_info(self):
    '''
    Walk the mia seckill promotion api over the pid range
    [MIA_BASE_NUMBER, MIA_MAX_NUMBER) and hand each round's goods to
    self.deal_with_data.

    :return:
    '''
    mia_base_number = MIA_BASE_NUMBER
    while mia_base_number < MIA_MAX_NUMBER:
        tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(
            mia_base_number)
        body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
        # print(body)
        if body == '' or body == '[]':
            print('mia_base_number为: ', mia_base_number)
            print('获取到的body为空值! 此处跳过')
        else:
            try:
                tmp_data = json.loads(body)
            except:
                tmp_data = {}
                print('json.loads转换body时出错, 此处跳过!')
            # Slice [11:13] of 'YYYY-MM-DD HH:MM:SS' is the start hour.
            tmp_hour = tmp_data.get('p_info', {}).get('start_time', '')[11:13]
            if tmp_hour == '22':
                # Skip rounds that start at 22:00.
                print('--- 销售时间为22点,不抓取!')
                pass
            else:
                print(tmp_data)
                print('mia_base_number为: ', mia_base_number)
                pid = mia_base_number
                begin_time = tmp_data.get('p_info', {}).get('start_time', '')
                end_time = tmp_data.get('p_info', {}).get('end_time', '')
                item_list = tmp_data.get('item_list', [])
                self.deal_with_data(pid, begin_time, end_time, item_list)
        sleep(.35)
        mia_base_number += 1
def traversal_hour_timestamp(self, item): ''' 遍历每个需求的整点时间戳 :param item: :return: ''' # 先遍历today的需求的整点时间戳 tmp_url = 'https://qiang.mogujie.com//jsonp/fastBuyListActionLet/1?eventTime={0}&bizKey=rush_main'.format( str(item)) body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True) # print(body) if body == '': print('item为: ', item) print('获取到的body为空值! 此处跳过') else: try: body = re.compile('null\((.*)\)').findall(body)[0] except Exception: print('re匹配body中的数据时出错!') body = '{}' try: tmp_data = json.loads(body) except: print('json.loads转换body时出错, 此处跳过!') tmp_data = {} if tmp_data == {}: print('tmp_data为空{}!') pass else: # pprint(tmp_data) # print(tmp_data) event_time = item item_list = tmp_data.get('data', {}).get('list', []) self.deal_with_data(event_time, item_list) sleep(MOGUJIE_SLEEP_TIME)
def get_item_list(self, event_time):
    '''
    Fetch every goods entry of one rush-buy round (event_time).

    :param event_time: integer-like hour timestamp
    :return: list of entries; '' when the request failed, [] on parse failure
    '''
    round_url = 'https://qiang.mogujie.com//jsonp/fastBuyListActionLet/1?eventTime={0}&bizKey=rush_main'.format(
        str(event_time))
    body = MyRequests.get_url_body(url=round_url, headers=self.headers, had_referer=True)
    # print(body)
    if body == '':
        print('获取到的body为空值! 此处跳过')
        item_list = ''
    else:
        # The endpoint answers JSONP wrapped in null(...); unwrap it.
        jsonp_payload = re.compile('null\((.*)\)').findall(body)
        if jsonp_payload:
            body = jsonp_payload[0]
        else:
            print('re匹配body中的数据时出错!')
            body = '{}'
        try:
            parsed = json.loads(body)
        except:
            parsed = {}
            print('json.loads转换body时出错, 此处跳过!')
        if parsed == {}:
            print('tmp_data为空{}!')
            item_list = []
        else:
            item_list = parsed.get('data', {}).get('list', [])
    sleep(.5)
    return item_list
def get_one_page_all_goods_list(self, *params):
    '''
    Fetch one page of the jumei deal list.

    :param params: (page,)
    :return: '网络错误!' on bad json, [] when the page is empty,
             else a list of {'goods_id', 'type', 'page'} dicts
    '''
    page = params[0]
    list_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format(
        str(page))
    # print('正在抓取的page为:', page, ', 接口地址为: ', list_url)
    body = MyRequests.get_url_body(url=list_url, headers=self.headers)
    # print(body)
    try:
        json_body = json.loads(body)
        # print(json_body)
    except:
        print('json.loads转换body时出错!请检查')
        return '网络错误!'
    this_page_item_list = json_body.get('item_list', [])
    if this_page_item_list == []:
        return []
    # Dedup by item_id while tagging each kept entry with its page.
    seen_ids = []
    collected = []
    for entry in this_page_item_list:
        entry_id = entry.get('item_id', '')
        if entry_id not in seen_ids:
            entry['page'] = page
            collected.append(entry)
            seen_ids.append(entry_id)
    return [{
        'goods_id': str(entry.get('item_id', '')),
        'type': entry.get('type', ''),
        'page': entry.get('page')
    } for entry in collected if entry.get('item_id') is not None]