def get_goods_data(self, goods_id):
    '''
    Fetch the product, graphic-detail and status endpoints for one goods id
    and merge the decoded sections into a single dict.

    On success the dict is also stored on ``self.result_data``.

    :param goods_id: goods id; an empty string is treated as an error
    :return: the assembled data dict on success, otherwise whatever
        ``self._data_error()`` returns
    '''
    if goods_id == '':
        return self._data_error()

    tmp_url = 'https://th5.m.zhe800.com/gateway/app/detail/product?productId=' + str(goods_id)
    body = Requests.get_url_body(
        url=tmp_url,
        headers=self.headers,
        ip_pool_type=self.ip_pool_type)
    # Check the raw body before decoding so an empty response is never
    # handed to the JSON helper (same order as the later requests below).
    if body == '':
        return self._data_error()

    data = json_2_dict(json_str=body, default_res={})
    if data == {}:
        return self._data_error()

    # Each section of the product payload is decoded separately.
    base = json_2_dict(
        json_str=data.get('/app/detail/product/base', ''),
        default_res={})

    profiles = json_2_dict(json_str=data.get('/app/detail/product/profiles', ''))
    if profiles == {}:
        print("json.loads转换出错,得到profiles值可能为空,此处跳过")
        profiles = ''

    score = json_2_dict(
        json_str=data.get('/app/detail/product/score', ''),
        default_res={})
    # Drop the 'contents' entry when present; anything that is not a dict
    # is left untouched (the original swallowed every error here).
    if isinstance(score, dict):
        score.pop('contents', None)

    sku = json_2_dict(
        json_str=data.get('/app/detail/product/sku', ''),
        default_res={})

    data['/app/detail/product/base'] = base
    data['/app/detail/product/profiles'] = profiles
    data['/app/detail/product/score'] = score
    data['/app/detail/product/sku'] = sku

    try:
        # Mobile page address of the goods.
        phone_url = 'http://th5.m.zhe800.com/h5/shopdeal?id=' + str(base.get('dealId', ''))
    except AttributeError:
        # base decoded to something without .get()
        print('获取手机版地址失败,此处跳过')
        return self._data_error()
    print('------>>>| 得到商品手机版地址为: ', phone_url)

    # Graphic detail (the illustrated description of the goods).
    tmp_detail_url = 'https://th5.m.zhe800.com/gateway/app/detail/graph?productId=' + str(goods_id)
    detail_data_body = Requests.get_url_body(
        url=tmp_detail_url,
        headers=self.headers,
        ip_pool_type=self.ip_pool_type)
    if detail_data_body == '':
        print('detail_data为[]!')
        return self._data_error()

    detail_data = json_2_dict(json_str=detail_data_body, default_res={})
    if detail_data == {}:
        print('json.loads(detail_data)时报错, 此处跳过')
        return self._data_error()

    detail = json_2_dict(
        json_str=detail_data.get('/app/detail/graph/detail', ''),
        default_res={})
    # Drop the 'small' entry when present.
    if isinstance(detail, dict):
        detail.pop('small', None)

    tmp_div_desc = self._get_div_desc(detail=detail, goods_id=goods_id)
    if tmp_div_desc == '':
        return self._data_error()
    data['/app/detail/graph/detail'] = tmp_div_desc

    shop_name = self._get_shop_name(data=data)
    # Only an empty *dict* counts as failure; other values are stored as-is.
    if isinstance(shop_name, dict) and shop_name == {}:
        return self._data_error()
    data['shop_name'] = shop_name

    # Flash-sale schedule (start/end time) and stock.
    schedule_and_stock_url = 'https://th5.m.zhe800.com/gateway/app/detail/status?productId=' + str(goods_id)
    schedule_and_stock_info_body = Requests.get_url_body(
        url=schedule_and_stock_url,
        headers=self.headers,
        high_conceal=True,
        ip_pool_type=self.ip_pool_type)
    if schedule_and_stock_info_body == '':
        print('schedule_and_stock_info为空!')
        return self._data_error()

    schedule_and_stock_info = json_2_dict(json_str=schedule_and_stock_info_body)
    if schedule_and_stock_info == {}:
        print('得到秒杀开始时间和结束时间时错误, 此处跳过')
        return self._data_error()

    schedule = json_2_dict(
        json_str=schedule_and_stock_info.get('/app/detail/status/schedule', None),
        default_res={})
    stock = json_2_dict(
        json_str=schedule_and_stock_info.get('/app/detail/status/stock', None),
        default_res={})

    data['schedule'] = schedule
    data['stock'] = stock
    data['parent_dir'] = _z8_get_parent_dir(goods_id)
    data['goods_id'] = goods_id

    self.result_data = data
    return data
def get_goods_data(self, goods_id):
    '''
    Fetch the group-buy detail page for one goods id, extract the embedded
    ``window.prod_info`` JSON and enrich it with description, parameter and
    stock information.

    On success the dict is also stored on ``self.result_data``.

    :param goods_id: goods id; an empty string is treated as an error
    :return: the assembled data dict on success; ``str(goods_id)`` when the
        "page is gone" notice is detected; ``{}`` when the description,
        parameter list or stock lookup comes back empty; otherwise whatever
        ``self._data_error_init()`` returns
    '''
    if goods_id == '':
        return self._data_error_init()

    tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(goods_id)
    print('------>>>| 得到的商品手机版地址为: ', tmp_url)

    # NOTE: plain HTTP requests were used from the start; per the original
    # author, under heavy request volume the page may stop returning data
    # even with sleeps in between.
    body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, high_conceal=True)
    if body == '':
        print('获取到的tmp_url的body为空值, 此处跳过!')
        return self._data_error_init()

    # Special-case the "page no longer exists" notice. The length window
    # keeps ordinary goods pages that merely contain the phrase from matching.
    # (The original author noted this check can affect normal goods.)
    try:
        page_gone = (
            re.compile(r'很抱歉,您查看的页面木有了~').findall(body) != []
            and 640 < len(body) < 660
        )
    except TypeError:
        # body is not text; fall through to the regular extraction below
        page_gone = False
    if page_gone:
        print('很抱歉,您查看的页面木有了~')
        self.result_data = {}
        return str(goods_id)

    try:
        matches = re.compile(r'window.prod_info = (.*?);seajs.use\(.*?\);</script>').findall(body)
    except TypeError:
        matches = []

    # A PhantomJS-based fetch (waiting on css 'div.title') was tried here
    # previously and is disabled.

    if matches == []:
        print('data为空!')
        return self._data_error_init()

    data = json_2_dict(json_str=matches[0])
    if data == {}:
        return self._data_error_init()

    # The three lookups below return {} on failure (unchanged for callers),
    # but now also clear self.result_data so a failed fetch cannot leave a
    # previous item's data behind, as the "page gone" branch already does.
    div_desc_body = self.get_div_desc_body(goods_id=goods_id)
    if div_desc_body == '':
        print('获取到的div_desc_body为空!')
        self.result_data = {}
        return {}

    p_info = self.get_p_info_list(goods_id=goods_id)
    if p_info == []:
        self.result_data = {}
        return {}

    # Real-time stock information of the goods.
    stock_info = self.get_stock_info_dict(goods_id=goods_id)
    if stock_info == {}:
        print('获取到的库存信息为{}!')
        self.result_data = {}
        return {}

    data['div_desc'] = div_desc_body
    data['p_info'] = p_info
    data['stock_info'] = stock_info

    # pin_status == 3 is reported as "sold out" and flags the item as deleted.
    if stock_info.get('pin_status', 2) == 3:
        print('##### 该拼团商品已经被抢光 ...')
        is_delete = 1
    else:
        is_delete = 0
    data['is_delete'] = is_delete
    data['parent_dir'] = _z8_get_parent_dir(goods_id)

    self.result_data = data
    return data
def get_goods_data(self, goods_id):
    '''
    Fetch the group-buy detail page for one goods id, extract the embedded
    ``window.prod_info`` JSON and enrich it with description, parameter and
    stock information.

    On success the dict is also stored on ``self.result_data``.

    :param goods_id: goods id; an empty string is treated as an error
    :return: the assembled data dict on success; ``str(goods_id)`` when the
        "page is gone" notice is detected; otherwise whatever
        ``self._data_error_init()`` returns
    '''
    def _require(condition, message=''):
        # Explicit raise instead of an `assert` statement: asserts are
        # removed under `python -O`, which would let empty responses through.
        if not condition:
            raise AssertionError(message)

    if goods_id == '':
        return self._data_error_init()

    tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(goods_id)
    print('------>>>| 得到的商品手机版地址为: ', tmp_url)

    try:
        # NOTE: plain HTTP requests were used from the start; per the
        # original author, under heavy request volume the page may stop
        # returning data even with sleeps in between.
        body = Requests.get_url_body(
            url=tmp_url,
            headers=self.headers,
            proxy_type=self.proxy_type,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.req_num_retries,)
        _require(body != '')

        # A PhantomJS-based fetch (waiting on css 'div.title') was tried
        # here previously and is disabled.

        # Special-case the "page no longer exists" notice. The length window
        # keeps ordinary goods pages that merely contain the phrase from
        # matching. (The original author noted this check can affect normal
        # goods.)
        try:
            page_gone = (
                re.compile(r'很抱歉,您查看的页面木有了~').findall(body) != []
                and 640 < len(body) < 660
            )
        except TypeError:
            # body is not text; the extraction below reports the failure
            page_gone = False
        if page_gone:
            print('很抱歉,您查看的页面木有了~')
            self.result_data = {}
            return str(goods_id)

        matches = re.compile(r'window.prod_info = (.*?);seajs.use\(.*?\);</script>').findall(body)
        _require(matches != [], 'data为空!')

        data = json_2_dict(
            json_str=matches[0],
            default_res={},)
        _require(data != {})

        div_desc_body = self.get_div_desc_body(goods_id=goods_id)
        _require(div_desc_body != '', '获取到的div_desc_body为空!')

        p_info = self.get_p_info_list(goods_id=goods_id)
        _require(p_info != [])

        # Real-time stock information of the goods.
        stock_info = self.get_stock_info_dict(goods_id=goods_id)
        _require(stock_info != {}, '获取到的库存信息为{}!')
    except Exception as e:
        # Exception already covers IndexError and AssertionError.
        print(e)
        return self._data_error_init()

    data['div_desc'] = div_desc_body
    data['p_info'] = p_info
    data['stock_info'] = stock_info

    # pin_status == 3 is reported as "sold out" and flags the item as deleted.
    if stock_info.get('pin_status', 2) == 3:
        print('##### 该拼团商品已经被抢光 ...')
        is_delete = 1
    else:
        is_delete = 0
    data['is_delete'] = is_delete
    data['parent_dir'] = _z8_get_parent_dir(goods_id)

    self.result_data = data
    return data
def get_goods_data(self, goods_id):
    '''
    Fetch the product, graphic-detail and status endpoints for one goods id
    and merge the decoded sections into a single dict.

    On success the dict is stored on ``self.result_data``; on every failure
    path ``self.result_data`` is reset to ``{}``.

    :param goods_id: goods id; an empty string is treated as an error
    :return: the assembled data dict on success, ``{}`` on any failure
    '''
    def _fail():
        # Reset the cached result so a failed fetch cannot affect what is
        # stored for the items crawled afterwards.
        self.result_data = {}
        return {}

    if goods_id == '':
        return _fail()

    tmp_url = 'https://th5.m.zhe800.com/gateway/app/detail/product?productId=' + str(goods_id)
    body = Requests.get_url_body(
        url=tmp_url,
        headers=self.headers,
        high_conceal=True,
        ip_pool_type=self.ip_pool_type)
    if body == '':
        return _fail()

    data = json_2_dict(json_str=body)
    if data == {}:
        return _fail()

    # Each section of the product payload is decoded separately; a section
    # that decodes to {} is reported and stored as ''.
    base = json_2_dict(json_str=data.get('/app/detail/product/base', ''))
    if base == {}:
        print("json.loads转换出错,得到base值可能为空,此处跳过")
        base = ''

    profiles = json_2_dict(json_str=data.get('/app/detail/product/profiles', ''))
    if profiles == {}:
        print("json.loads转换出错,得到profiles值可能为空,此处跳过")
        profiles = ''

    score = json_2_dict(json_str=data.get('/app/detail/product/score', ''))
    # Drop the 'contents' entry first; a score holding nothing else then
    # counts as empty below, as in the original.
    if isinstance(score, dict):
        score.pop('contents', None)
    if score == {}:
        print("json.loads转换出错,得到score值可能为空,此处跳过")
        score = ''

    sku = json_2_dict(json_str=data.get('/app/detail/product/sku', ''))
    if sku == {}:
        print("json.loads转换出错,得到sku值可能为空,此处跳过")
        sku = ''

    data['/app/detail/product/base'] = base
    data['/app/detail/product/profiles'] = profiles
    data['/app/detail/product/score'] = score
    data['/app/detail/product/sku'] = sku

    # Mobile page address. An empty base was replaced by '' above, so .get
    # raises AttributeError and the item is abandoned here.
    try:
        phone_url = 'http://th5.m.zhe800.com/h5/shopdeal?id=' + str(base.get('dealId', ''))
    except AttributeError:
        print('获取手机版地址失败,此处跳过')
        return _fail()
    print('------>>>| 得到商品手机版地址为: ', phone_url)

    # Graphic detail (the illustrated description of the goods).
    tmp_detail_url = 'https://th5.m.zhe800.com/gateway/app/detail/graph?productId=' + str(goods_id)
    detail_data_body = Requests.get_url_body(
        url=tmp_detail_url,
        headers=self.headers,
        high_conceal=True,
        ip_pool_type=self.ip_pool_type)
    if detail_data_body == '':
        print('detail_data为[]!')
        return _fail()

    detail_data = json_2_dict(json_str=detail_data_body)
    if detail_data == {}:
        print('json.loads(detail_data)时报错, 此处跳过')
        return _fail()

    detail = json_2_dict(json_str=detail_data.get('/app/detail/graph/detail', ''))
    # Drop the 'small' entry when present.
    if isinstance(detail, dict):
        detail.pop('small', None)
    if detail == {}:
        print("json.loads转换出错,得到detail值可能为空,此处跳过")
        detail = ''

    tmp_div_desc = self._get_div_desc(detail=detail, goods_id=goods_id)
    if tmp_div_desc == '':
        return _fail()
    data['/app/detail/graph/detail'] = tmp_div_desc

    shop_name = self._get_shop_name(data=data)
    # Only an empty *dict* counts as failure; other values are stored as-is.
    if isinstance(shop_name, dict) and shop_name == {}:
        return _fail()
    data['shop_name'] = shop_name

    # Flash-sale schedule (start/end time) and stock.
    schedule_and_stock_url = 'https://th5.m.zhe800.com/gateway/app/detail/status?productId=' + str(goods_id)
    schedule_and_stock_info_body = Requests.get_url_body(
        url=schedule_and_stock_url,
        headers=self.headers,
        high_conceal=True,
        ip_pool_type=self.ip_pool_type)
    if schedule_and_stock_info_body == '':
        print('schedule_and_stock_info为空!')
        return _fail()

    schedule_and_stock_info = json_2_dict(json_str=schedule_and_stock_info_body)
    if schedule_and_stock_info == {}:
        print('得到秒杀开始时间和结束时间时错误, 此处跳过')
        return _fail()

    # A missing section becomes {} without being passed to the decoder.
    schedule = schedule_and_stock_info.get('/app/detail/status/schedule')
    schedule = {} if schedule is None else json_2_dict(json_str=schedule)

    stock = schedule_and_stock_info.get('/app/detail/status/stock')
    stock = {} if stock is None else json_2_dict(json_str=stock)

    data['schedule'] = schedule
    data['stock'] = stock
    data['parent_dir'] = _z8_get_parent_dir(goods_id)

    self.result_data = data
    return data