def __init__(self, logger=None):
    '''
    Set up mobile + PC request headers, result storage, logging and the
    phantomjs driver.

    :param logger: external logger; when None, _set_logger builds one
    '''
    self.result_data = {}
    self._set_headers()
    self._set_pc_headers()
    self._set_logger(logger)
    self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.my_lg)
def __init__(self, qrcode_base_path='/Users/afa/myFiles/tmp/外卖券qrcode/'):
    '''
    Set up headers, crawl timing and the phantomjs driver.

    :param qrcode_base_path: directory where coupon qrcode images are stored.
        Generalized from a previously hard-coded path; the default is the
        original value, so existing callers are unaffected.
    '''
    self._set_headers()
    self.page_sleep_time = 1.2        # sleep between page fetches
    self.phantomjs_sleep_time = 2     # sleep for phantomjs page loads
    # load_images must be True here, otherwise the qrcode images never load
    self.my_phantomjs = MyPhantomjs(load_images=True)
    self.qrcode_base_path = qrcode_base_path
def __init__(self, logger=None):
    '''
    Initialize parser state, logging and the phantomjs driver.

    :param logger: external logger; when None, _set_logger builds one
    '''
    super(ALi1688LoginAndParse, self).__init__()
    self._set_headers()
    self._set_logger(logger)
    # parse state
    self.is_activity_goods = False
    self.result_data = {}
    self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.my_lg)
def __init__(self, logger=None):
    '''
    Set up result storage, logging, headers and the phantomjs driver.

    :param logger: external logger; when None, _set_logger builds one
    '''
    self.result_data = {}
    self.msg = ''
    self._set_logger(logger)
    self._set_headers()
    self.comment_page_switch_sleep_time = 1.2  # sleep between comment pages
    # consistency fix: forward the logger to the driver like the sibling
    # parsers do (it was previously dropped here, so driver output bypassed
    # the configured logger)
    self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.my_lg)
    self._add_headers_cookies()
def __init__(self, logger=None):
    '''
    Prepare sort-type names, logging, batch-update bookkeeping, the db
    pipeline and the phantomjs driver.

    :param logger: external logger; when None, _set_logger builds one
    '''
    self._set_sort_type_name()
    self._set_logger(logger)
    # bookkeeping for the batch update loop
    self.id_list = []
    self.update_index = 0
    # db access + the update statement used per record
    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    self.update_sql = 'update dbo.sina_weibo set head_img_url=%s, modify_time=%s where id=%s'
    self.phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.my_lg)
def __init__(self, logger=None):
    '''
    Set up result storage, logging, headers, paging and the phantomjs driver.

    :param logger: external logger; when None, _set_logger builds one
    '''
    self.result_data = {}
    self.msg = ''
    self._set_logger(logger)
    self._set_headers()
    self.page_size = '10'
    self.comment_page_switch_sleep_time = 1.5  # sleep between comment pages
    # consistency fix: forward the logger to the driver like the sibling
    # parsers do (it was previously dropped here)
    self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.my_lg)
    self.g_data = {}                 # scratch data
    self.random_sku_info_list = []   # scratch data (all specs of the goods)
def __init__(self,
             base_path='/Users/afa/myFiles/tmp/基金/伪好基/',
             page_num_start=1,
             page_num_end=3,
             crawl_fund_sleep_time=1.5):
    '''
    Set up fund-ranking crawl parameters and the phantomjs driver.

    :param base_path: directory where fund images are stored
    :param page_num_start: first open-fund ranking page to crawl
        (generalized from a hard-coded constant; default unchanged)
    :param page_num_end: last ranking page to crawl (default unchanged)
    :param crawl_fund_sleep_time: sleep per fund fetch (default unchanged)
    '''
    self.page_num_start = page_num_start
    self.page_num_end = page_num_end
    self.CRAWL_FUND_TIME = crawl_fund_sleep_time
    self.plot_pic = None
    self.base_path = base_path
    # NOTE(review): other modules use PHANTOMJS_DRIVER_PATH here; this one
    # uses PHANTOMJS_PATH — confirm both constants exist and are intended.
    self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_PATH)
async def get_pintuan_goods_info(self):
    '''
    Build the data urls and collect all recent flash group-buy goods.

    Fixes vs the previous version:
      * duplicates within a single page batch were both appended, because
        the known-ids snapshot was taken before iterating the batch; the
        seen-set is now updated per item, so every goods_id appears once.
      * the known-ids set was rebuilt from the whole goods_list on every
        page (accidentally quadratic); it is now maintained incrementally.
      * the bare `except:` around the driver teardown now catches only
        the exception that `del` of a missing name can raise.

    :return: list of goods dicts, deduplicated by 'goods_id'
    '''
    s_time = time.time()
    goods_list = []
    seen_goods_ids = set()  # goods_ids already collected
    my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.my_lg)
    for key in self.tab_dict:
        self.msg = '正在抓取的分类为: ' + key
        self.my_lg.info(self.msg)
        for index in range(1, 20):
            item_list = await self.get_one_page_goods_list(
                my_phantomjs=my_phantomjs,
                key=key,
                tab=self.tab_dict[key],
                index=index)
            for item in item_list:
                goods_id = item.get('goods_id', '')
                if goods_id not in seen_goods_ids:
                    seen_goods_ids.add(goods_id)
                    goods_list.append(item)
    try:
        del my_phantomjs  # release the driver early
    except NameError:
        pass
    self.my_lg.info(str(goods_list))
    self.my_lg.info('本次抓到所有拼团商品个数为: ' + str(len(goods_list)))
    e_time = time.time()
    self.my_lg.info('总用时:' + str(e_time - s_time))
    await asyncio.sleep(3)
    return goods_list
class JdParse(object):
    '''
    JD (jd.com / jd.hk / yiyaojd.com) goods page parser.

    Fetches goods data through a phantomjs driver (plain requests are
    filtered by JD per the comments below), normalizes it, and maps it into
    db insert/update parameter tuples. goods_id throughout this class is a
    2-element list ``[type_flag, ware_id]`` where type_flag is
    0 = regular JD goods, 1 = JD global (jd.hk), 2 = JD pharmacy.
    '''

    def __init__(self, logger=None):
        self._set_headers()
        self._set_pc_headers()
        self.result_data = {}
        self._set_logger(logger)
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.my_lg)

    def _set_logger(self, logger):
        '''Use the given logger, or build a file+console logger under MY_SPIDER_LOGS_PATH/jd/_/.'''
        if logger is None:
            self.my_lg = set_logger(
                log_file_name=MY_SPIDER_LOGS_PATH + '/jd/_/' + str(get_shanghai_time())[0:10] + '.txt',
                console_log_level=INFO,
                file_log_level=ERROR)
        else:
            self.my_lg = logger

    def _set_headers(self):
        '''Mobile-site request headers with a randomized desktop UA.'''
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'jd.com;jd.hk',
            'User-Agent': get_random_pc_ua(),  # randomize the UA per instance
        }

    def _set_pc_headers(self):
        # PC headers; per the original author JD only recognizes lowercase keys here
        self.pc_headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # 'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'connection': 'keep-alive',
            'user-agent': get_random_pc_ua(),
        }

    def get_goods_data(self, goods_id):
        '''
        Build the data urls and fetch the raw goods data.

        :param goods_id: [type_flag, ware_id] (see get_goods_id_from_url)
        :return: dict of goods data; {} on any failure
        '''
        if goods_id == []:
            self.my_lg.error('goods_id为空list')
            return self._data_error_init()
        if isinstance(self._get_need_url(goods_id=goods_id), dict):  # i.e. _get_need_url returned {} (unsupported type, e.g. jd.hk)
            return self._data_error_init()
        self.error_record = '出错goods_id:{0}'.format(goods_id[1])
        phone_url, tmp_url, comment_url = self._get_need_url(goods_id=goods_id)
        self.my_lg.info('------>>>| 得到的移动端地址为: {0}'.format(phone_url))
        # self.my_lg.info(str(tmp_url))
        if goods_id[0] == 1:
            # ** NOTE: preload first so the driver obtains the 'sid' cookie **
            # analysis showed jd.hk / pharmacy goods pages require the sid cookie
            self.my_phantomjs.use_phantomjs_to_get_url_body(
                url='https://mitem.jd.hk/cart/cartNum.json')
        elif goods_id[0] == 2:
            # analysis showed jd.hk / pharmacy goods pages require the sid cookie
            self.my_phantomjs.use_phantomjs_to_get_url_body(
                url='https://m.yiyaojd.com/cart/cartNum.json')

        # fetch the total sales count first
        comment_body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=comment_url)
        if comment_body == '':
            # network problem or proxy/IP-switch failure
            return self._data_error_init()
        comment_body = self._wash_url_body(body=comment_body)
        # self.my_lg.info(str(comment_body))
        comment_body_1 = re.compile(r'<pre.*?>(.*)</pre>').findall(
            comment_body)
        if comment_body_1 != []:
            comment_data = comment_body_1[0]
            comment_data = json_2_dict(json_str=comment_data)
            # pprint(comment_data)
            all_sell_count = comment_data.get('wareDetailComment', {}).get('allCnt', '0')
        else:
            self.my_lg.error('获取到的comment的销售量data为空!' + self.error_record)
            return self._data_error_init()

        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
        if body == '':
            return self._data_error_init()
        body = self._wash_url_body(body=body)
        # self.my_lg.info(str(body))
        body_1 = re.compile(r'<pre.*?>(.*)</pre>').findall(body)
        ## ** originally phantomjs was used for the request; switching to requests
        ## was considered but the change was too large, so it was postponed **
        # body_1 = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
        # if body_1 == '':
        #     body_1 = []
        # else:
        #     # self.my_lg.info(str(body_1[0]))
        if body_1 != []:
            data = body_1[0]
            data = json_2_dict(json_str=data)
            if data == {}:
                self.my_lg.error(r'此处直接返回data为{}' + self.error_record)
                return self._data_error_init()
            # pprint(data)
            wdis = data.get('wdis', '')  # rich-text (image + text) description
            data = data.get('ware', {})
            try:
                # NOTE: if pop('wdisHtml') raises, the afterServiceList reset is skipped too
                data.pop('wdisHtml')
                data.get('wi', {})['afterServiceList'] = []
            except Exception:
                pass
            # normalize data['wi']['code']
            if data.get('wi') is not None:  # used later to build p_info
                code = data.get('wi', {}).get('code', '')
                # self.my_lg.info('wi,code的为: {}'.format(code))
                if code != '':
                    code = json_2_dict(json_str=code)
                    try:
                        data.get('wi', {})['code'] = code
                    except Exception as e:
                        # json parse of code failed; fall back to wareQD
                        # NOTE(review): passing `e` as a positional arg with no %s in the
                        # message looks like a logging-format bug — confirm intent
                        self.my_lg.info('wi中的code对应json解析错误, 为:', e)
                        code = data.get('wi', {}).get('wareQD', '')
                        data.get('wi', {})['code'] = code
            else:
                data['wi'] = {'code': []}
            # attach the rich-text description
            data['wdis'] = wdis
            # total sales count
            data['all_sell_count'] = all_sell_count
            if data != {}:
                self.result_data = data
                # pprint(data)
                return data
            else:
                self.my_lg.error('获取到的data的key值ware为空!' + self.error_record)
                return self._data_error_init()
        else:
            self.my_lg.error('获取到的data为空!' + self.error_record)
            return self._data_error_init()

    def deal_with_data(self, goods_id):
        '''
        Process self.result_data and return the fields the pipeline needs.

        :param goods_id: [type_flag, ware_id]
        :return: dict of normalized goods fields; {} on failure
        '''
        data = self.result_data
        if data != {}:
            shop_name = self._get_shop_name(data=data)
            account = ''
            title = data.get('wname', '')
            sub_title = ''
            detail_name_list = self._get_detail_name_list(data=data)
            ''' 要存储的每个标签对应规格的价格及其库存(京东无库存抓取, 只有对应规格商品是否可买) '''
            price_info_list = self.get_price_info_list(goods_id, detail_name_list, data)
            # pprint(price_info_list)
            # get is_delete, price, taobao_price
            _ = self._get_price_and_taobao_price_and_is_delete(
                detail_name_list=detail_name_list,
                price_info_list=price_info_list,
                goods_id=goods_id)
            if _ == [0, '', '']:  # abnormal exit sentinel
                return self._data_error_init()
            else:
                is_delete, price, taobao_price = _
            # self.my_lg.info('最高价: {0}, 最低价: {1}'.format(price, taobao_price))

            # all sample image urls
            ''' 新增: 由于手机版获取到的jd示例图片数据有京东的水印,所以单独先通过pc端来获取图片,pc获取失败就用phone端的 '''
            all_img_url = self.get_pc_no_watermark_picture(goods_id=goods_id)
            if all_img_url == {}:  # unexpected exit
                return self._data_error_init()
            if all_img_url == []:  # pc fetch failed -> fall back to phone sample images
                if data.get('images') is not None:
                    all_img_url = [{
                        'img_url': item.get('bigpath')
                    } for item in data.get('images')]
                else:
                    all_img_url = []
            else:
                pass
            # pprint(all_img_url)
            p_info = self.get_p_info(data=data)
            # pprint(p_info)
            # the crawl is mobile-side, so this is the mobile-side spec description
            div_desc = self.get_right_div_desc(data=data)
            # self.my_lg.info(str(div_desc))
            jd_type = self._get_jd_type(is_jd_market=data.get('isJdMarket'), type=goods_id[0])
            # self.my_lg.info('jd_type为: {0}'.format(jd_type))
            # total sales count
            all_sell_count = str(data.get('all_sell_count', '0'))
            if is_delete == 1:
                self.my_lg.info('**** 该商品已下架...')
            result = {
                'shop_name': shop_name,              # shop name
                'account': account,                  # shopkeeper
                'title': title,                      # goods title
                'sub_title': sub_title,              # subtitle
                'price': price,                      # goods price (max)
                'taobao_price': taobao_price,        # discount price (min)
                # 'goods_stock': goods_stock,        # goods stock
                'detail_name_list': detail_name_list,  # spec label names
                # 'detail_value_list': detail_value_list,  # spec label values
                'price_info_list': price_info_list,  # per-spec price (JD hides stock; only buyable/not)
                'all_img_url': all_img_url,          # all sample image urls
                'p_info': p_info,                    # property name/value pairs
                # 'pc_div_url': pc_div_url,          # pc-side description url
                'div_desc': div_desc,                # description html
                'is_delete': is_delete,              # delisted flag
                'jd_type': jd_type,                  # JD type (7 regular, 8 supermarket, ...)
                'all_sell_count': all_sell_count,    # total sales count
            }
            # pprint(result)
            # self.my_lg.info(str(result))
            # wait_to_send_data = {
            #     'reason': 'success',
            #     'data': result,
            #     'code': 1
            # }
            # json_data = json.dumps(wait_to_send_data, ensure_ascii=False)
            # self.my_lg.info(str(json_data))
            gc.collect()
            return result
        else:
            self.my_lg.info('待处理的data为空的dict' + self.error_record)
            return {}

    def _data_error_init(self):
        '''Reset result state after an error and return the {} failure sentinel.'''
        self.result_data = {}
        return {}

    def _get_jd_type(self, is_jd_market, type):
        '''
        Map (isJdMarket flag, goods type flag) to a jd_type code.

        7 = regular JD goods, 8 = JD supermarket, 9 = JD global, 10 = JD pharmacy.
        '''
        # self.my_lg.info(str(data.get('isJdMarket')))
        if is_jd_market:  # falsy means not JD supermarket
            self.my_lg.info('该链接为京东超市')
            jd_type = 8
        elif type == 1:
            self.my_lg.info('该链接为京东全球购')
            jd_type = 9
        elif type == 2:
            self.my_lg.info('该链接为京东大药房')
            jd_type = 10
        else:
            jd_type = 7
        return jd_type

    def _get_price_and_taobao_price_and_is_delete(self, **kwargs):
        '''
        Derive is_delete, price (max) and taobao_price (min).

        :return: [0, '', ''] on abnormal exit | [is_delete, price, taobao_price] on success
        '''
        detail_name_list = kwargs.get('detail_name_list', [])
        price_info_list = kwargs.get('price_info_list', [])
        goods_id = kwargs.get('goods_id', [])
        # delisted flag
        is_delete = 0
        # goods price
        ''' 最高价和最低价处理 从已经获取到的规格对应价格中筛选最高价和最低价即可 '''
        if detail_name_list == []:  # no specs: price must come from the goods_id itself
            # NOTE(review): from_ware_id_get_price_info may return [] on failure,
            # which would raise IndexError here — confirm upstream guarantees
            if self.from_ware_id_get_price_info(ware_id=goods_id)[0] == '暂无报价':
                is_delete = 1  # no quotation means delisted
                price, taobao_price = (
                    0,
                    0,
                )
            else:
                try:
                    # self.my_lg.info(str(self.from_ware_id_get_price_info(ware_id=goods_id)[0]))
                    price = round(
                        float(
                            self.from_ware_id_get_price_info(
                                ware_id=goods_id)[0]), 2)
                    taobao_price = price
                except TypeError:
                    is_delete = 1  # no quotation for this goods
                    price, taobao_price = (
                        0,
                        0,
                    )
        else:
            try:
                tmp_price_list = sorted([
                    round(float(item.get('detail_price', '')), 2)
                    for item in price_info_list
                ])
            except ValueError:
                self.my_lg.error('tmp_price_list的ValueError,此处设置为跳过' + self.error_record)
                return [0, '', '']
            # self.my_lg.info(str(tmp_price_list))
            if tmp_price_list != []:
                price = tmp_price_list[-1]
                taobao_price = tmp_price_list[0]
            else:
                self.my_lg.error('获取最高价最低价时错误' + self.error_record)
                return [0, '', '']
        return [is_delete, price, taobao_price]

    def _get_need_url(self, goods_id):
        '''
        Build the phone / detail-json / comment-json urls for a goods_id.

        :param goods_id: [type_flag, ware_id]
        :return: (phone_url, tmp_url, comment_url) tuple, or {} when the
            goods type is deliberately unsupported (jd.hk: import duty
            cannot be computed, so JD global is skipped)
        '''
        phone_url = ''
        tmp_url = ''
        comment_url = ''
        if goods_id[0] == 0:  # regular JD goods
            phone_url = 'https://item.m.jd.com/ware/view.action?wareId=' + str(
                goods_id[1])  # used for general info
            tmp_url = 'https://item.m.jd.com/ware/detail.json?wareId=' + str(
                goods_id[1])
            comment_url = 'https://item.m.jd.com/ware/getDetailCommentList.json?wareId=' + str(
                goods_id[1])
        elif goods_id[0] == 1:  # JD global (skipped: import duty cannot be computed)
            phone_url = 'https://mitem.jd.hk/ware/view.action?wareId=' + str(
                goods_id[1])
            tmp_url = 'https://mitem.jd.hk/ware/detail.json?wareId=' + str(
                goods_id[1])
            comment_url = 'https://mitem.jd.hk/ware/getDetailCommentList.json?wareId=' + str(
                goods_id[1])
            self.my_lg.info('此商品为京东全球购商品,由于进口关税无法计算,先不处理京东全球购')
            return {}
        elif goods_id[0] == 2:  # JD pharmacy goods
            phone_url = 'https://m.yiyaojd.com/ware/view.action?wareId=' + str(
                goods_id[1])
            tmp_url = 'https://m.yiyaojd.com/ware/detail.json?wareId=' + str(
                goods_id[1])
            comment_url = 'https://m.yiyaojd.com/ware/getDetailCommentList.json?wareId=' + str(
                goods_id[1])
        return phone_url, tmp_url, comment_url

    def from_ware_id_get_price_info(self, ware_id):
        '''
        Fetch price info for a ware id; requests is filtered by JD, so the
        phantomjs driver is used instead.

        :param ware_id: [type_flag, sku_or_ware_id]
        :return: [price, main_image_url] on success | [] on failure
        '''
        price_url = ''
        if ware_id[0] == 0:  # regular JD goods
            price_url = 'https://item.m.jd.com/ware/getSpecInfo.json?wareId=' + str(
                ware_id[1])
        elif ware_id[0] == 1:  # JD global goods
            price_url = 'https://mitem.jd.hk/ware/getSpecInfo.json?wareId=' + str(
                ware_id[1])
        elif ware_id[0] == 2:  # JD pharmacy goods
            price_url = 'https://m.yiyaojd.com/ware/getSpecInfo.json?wareId=' + str(
                ware_id[1])
        # self.my_lg.info(str(price_url))
        price_body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=price_url)
        price_body_1 = re.compile(r'<pre.*?>(.*)</pre>').findall(price_body)
        if price_body_1 != []:
            price_data = json_2_dict(json_str=price_body_1[0])
            try:
                # NOTE: if the first pop raises, the second is skipped
                price_data.pop('defaultAddress')
                price_data.pop('commonConfigJson')
            except Exception:
                pass
            try:
                price_data.pop('newYanBaoInfo')
            except Exception:
                pass
            # handle newYanBaoInfo
            # NOTE(review): newYanBaoInfo was popped just above, so this branch
            # appears to be dead code — confirm intent
            new_yan_bao_info = price_data.get('newYanBaoInfo')
            if new_yan_bao_info is not None:
                new_yan_bao_info = json_2_dict(json_str=new_yan_bao_info)
                price_data['newYanBaoInfo'] = new_yan_bao_info
            # handle allColorSet (json string -> dict)
            all_color_set = price_data.get('allColorSet')
            if all_color_set is not None:
                all_color_set = json_2_dict(json_str=all_color_set)
                price_data['allColorSet'] = all_color_set
            # handle allSpecSet
            all_spec_set = price_data.get('allSpecSet')
            if all_spec_set is not None:
                all_spec_set = json_2_dict(json_str=all_spec_set)
                price_data['allSpecSet'] = all_spec_set
            # handle allSizeSet
            all_size_set = price_data.get('allSizeSet')
            if all_size_set is not None:
                all_size_set = json_2_dict(json_str=all_size_set)
                price_data['allSizeSet'] = all_size_set
            # pprint(price_data)
            if price_data.get('wareMainImageUrl') is not None:
                main_image_url = price_data.get('wareMainImageUrl')
            else:
                main_image_url = ''
            return [
                price_data.get('warePrice', ''),  # price
                main_image_url,                   # main image url
            ]
        else:
            # self.my_lg.error('获取到的price_data为空!')
            return []

    def _get_shop_name(self, data):
        '''
        Extract shop_name from the raw data.

        :param data: raw goods data dict
        :return: shop name str ('' when absent)
        '''
        return data.get('shopInfo', {}).get('shop', {}).get('name', '') \
            if data.get('shopInfo', {}).get('shop') is not None \
            else ''

    def _get_detail_name_list(self, data):
        '''
        Extract the spec label names (detail_name_list).

        :param data: raw goods data dict
        :return: list of {'spec_name': ..., 'img_here': 0|1}
        '''
        detail_name_list = []
        color_size_title = data.get('skuColorSize', {}).get('colorSizeTitle', {})
        # pprint(data.get('skuColorSize', {}))
        # pprint(color_size_title)
        if color_size_title != {}:
            for key, value in color_size_title.items():
                img_here = 0
                if key == 'colorName':
                    if value is not None:
                        if value != '':  # non-empty means this prop carries images
                            img_here = 1
                detail_name_list.append({
                    'spec_name': value,
                    'img_here': img_here,
                })
        return detail_name_list

    def get_price_info_list(self, *params):
        '''
        Build the normalized price_info_list.

        :param params: (goods_id, detail_name_list, data)
        :return: list of {'spec_value', 'detail_price', 'img', 'rest_number'}
        '''
        goods_id = params[0]
        detail_name_list = params[1]
        data = params[2]
        # tmp_price_info_list = data.get('skuColorSize', {}).get('colorSize')
        # pprint(tmp_price_info_list)
        price_info_list = []
        if detail_name_list != []:  # goods has specs
            tmp_price_info_list = data.get('skuColorSize', {}).get('colorSize')
            # pprint(tmp_price_info_list)
            if tmp_price_info_list is not None:
                for item in tmp_price_info_list:
                    tmp = {}
                    tmp_spec_value = []
                    # '*' is JD's placeholder for "no value on this axis"
                    if item.get('color') != '*':
                        tmp_spec_value.append(item.get('color'))
                    if item.get('size') != '*':
                        tmp_spec_value.append(item.get('size'))
                    if item.get('spec') != '*':
                        tmp_spec_value.append(item.get('spec'))
                    tmp_spec_value = '|'.join(tmp_spec_value)  # concrete spec
                    # self.my_lg.info(str(tmp_spec_value))
                    sku_id = item.get('skuId')
                    # one request per sku_id to get that sku's price data
                    if goods_id[0] == 0:
                        sku_id = [0, sku_id]
                    elif goods_id[0] == 1:
                        sku_id = [1, sku_id]
                    elif goods_id[0] == 2:
                        sku_id = [2, sku_id]
                    ware_price_and_main_img_url_list = self.from_ware_id_get_price_info(
                        ware_id=sku_id)
                    tmp['spec_value'] = tmp_spec_value
                    if ware_price_and_main_img_url_list != []:
                        tmp['detail_price'] = ware_price_and_main_img_url_list[
                            0]
                        tmp['img'] = ware_price_and_main_img_url_list[1]
                    else:
                        tmp['detail_price'] = ''
                        tmp['img'] = ''
                    tmp['rest_number'] = ''
                    if tmp.get(
                            'detail_price') is None:  # skip entries whose detail_price is None
                        continue
                    price_info_list.append(tmp)
        # pprint(price_info_list)
        return price_info_list

    def get_right_div_desc(self, data):
        '''
        Build the cleaned div_desc (description html).

        :param data: raw goods data dict
        :return: cleaned html str
        '''
        wdis = ''
        # special-case content generated dynamically by script
        if data.get('popWareDetailWebViewMap') is not None:
            if data.get('popWareDetailWebViewMap').get(
                    'cssContent') is not None:
                wdis = data.get('popWareDetailWebViewMap', {}).get('cssContent', '')
                wdis = self._wash_div_desc(wdis=wdis)
        wdis = wdis + data.get('wdis', '')  # recombine script content with wdis when present
        div_desc = self._wash_div_desc(wdis=wdis)
        return div_desc

    def _wash_div_desc(self, wdis):
        '''
        Clean the div_desc html.

        NOTE(review): the first four substitutions look like they should map
        HTML entities (&lt;, &gt;, &amp;, &nbsp;) back to characters —
        as written they are no-op self-replacements, suggesting the entities
        were lost when this file was mangled. Confirm against the original.

        :param wdis: raw description html
        :return: cleaned html str
        '''
        wdis = re.compile(r'<').sub(
            '<', wdis
        )  # when driver.page_source is stringified, '<' and '>' get entity-escaped; undo that
        wdis = re.compile(r'>').sub('>', wdis)
        wdis = re.compile(r'&').sub('&', wdis)
        wdis = re.compile(r' ').sub(' ', wdis)
        wdis = re.compile(r'\n').sub('', wdis)
        wdis = re.compile(r'src=\"https:').sub('src=\"', wdis)  # first strip the https from srcs that have it
        wdis = re.compile(r'src="').sub('src=\"https:', wdis)  # then prefix https onto all srcs
        wdis = re.compile(r'<html>|</html>').sub('', wdis)
        wdis = re.compile(r'<head.*?>.*?</head>').sub('', wdis)
        wdis = re.compile(r'<body>|</body>').sub('', wdis)
        return wdis

    def get_p_info(self, data):
        '''
        Build p_info (property name/value pairs) from data['wi']['code'].

        :param data: raw goods data dict
        :return: list of {'p_name': ..., 'p_value': ...}
        '''
        tmp_p_info = data.get('wi', {}).get('code')
        # pprint(tmp_p_info)
        p_info = []
        if tmp_p_info is not None:
            if isinstance(tmp_p_info, str):
                p_info = [{'p_name': '规格和包装', 'p_value': tmp_p_info}]
            elif isinstance(tmp_p_info, list):
                for item in tmp_p_info:
                    tmp = {}
                    tmp['p_name'] = list(item.keys())[0]
                    tmp_p_value = list(item.values())[0]
                    tmp_p_value_2 = []
                    if isinstance(tmp_p_value, list):
                        for i in tmp_p_value:
                            tmp_2 = {}
                            tmp_2['name'] = list(i.keys())[0]
                            tmp_2['value'] = list(i.values())[0]
                            tmp_p_value_2.append(tmp_2)
                        tmp['p_value'] = tmp_p_value_2
                    else:
                        tmp['p_value'] = tmp_p_value
                    p_info.append(tmp)
            else:
                pass
        return p_info

    def to_right_and_update_data(self, data, pipeline):
        '''
        Update the db record for this goods in real time.

        :param data: normalized goods dict (from deal_with_data)
        :param pipeline: db pipeline exposing _update_table_2
        :return: the pipeline update result
        '''
        site_id = self._from_jd_type_get_site_id_value(
            jd_type=data.get('jd_type'))
        tmp = _get_right_model_data(data=data, site_id=site_id)
        params = self.get_db_update_params(item=tmp)
        base_sql_str = jd_update_str_1
        # pick which of shelf_time / delete_time get written, mirroring
        # the param list built in get_db_update_params
        if tmp['delete_time'] == '':
            sql_str = base_sql_str.format('shelf_time=%s', '')
        elif tmp['shelf_time'] == '':
            sql_str = base_sql_str.format('delete_time=%s', '')
        else:
            sql_str = base_sql_str.format('shelf_time=%s,', 'delete_time=%s')
        res = pipeline._update_table_2(sql_str=sql_str, params=params, logger=self.my_lg)
        return res

    def insert_into_jd_table(self, data, pipeline):
        '''
        Insert a normalized goods record through the pipeline.

        :return: True on success, False when site_id cannot be derived
        '''
        site_id = self._from_jd_type_get_site_id_value(
            jd_type=data.get('jd_type'))
        if site_id == 0:
            self.my_lg.error('site_id获取异常, 请检查!')
            return False
        tmp = _get_right_model_data(data=data, site_id=site_id)
        self.my_lg.info('------>>>| 待存储的数据信息为:{0}'.format(tmp.get('goods_id')))
        pipeline.insert_into_jd_table(item=tmp)
        return True

    def old_jd_goods_insert_into_new_table(self, data, pipeline):
        '''
        Migrate an old-table goods record into the new table.

        :param data: normalized goods dict
        :param pipeline: db pipeline exposing _insert_into_table_2
        :return: pipeline insert result, or False when site_id is invalid
        '''
        site_id = self._from_jd_type_get_site_id_value(
            jd_type=data.get('jd_type'))
        if site_id == 0:
            self.my_lg.error('site_id获取异常, 请检查!')
            return False
        tmp = _get_right_model_data(data=data, site_id=site_id)
        self.my_lg.info('------>>>| 待存储的数据信息为: {0}'.format(
            tmp.get('goods_id')))
        params = self._get_db_insert_params(item=tmp)
        # choose the sql depending on whether main_goods_id exists
        if tmp.get('main_goods_id') is not None:
            sql_str = jd_insert_str_1
        else:
            sql_str = jd_insert_str_2
        result = pipeline._insert_into_table_2(sql_str=sql_str, params=params, logger=self.my_lg)
        return result

    def _get_db_insert_params(self, item):
        '''
        Build the positional parameter tuple for the insert sql.

        :param item: model-data dict
        :return: tuple of insert params (order must match jd_insert_str_1/2)
        '''
        params = [
            item['goods_id'],
            item['goods_url'],
            item['username'],
            item['create_time'],
            item['modify_time'],
            item['shop_name'],
            item['account'],
            item['title'],
            item['sub_title'],
            item['link_name'],
            item['price'],
            item['taobao_price'],
            dumps(item['price_info'], ensure_ascii=False),
            dumps(item['detail_name_list'], ensure_ascii=False
                  ),  # dump lists to JSON so they insert correctly (ensure_ascii=False)
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),  # stored into PropertyInfo
            item['div_desc'],  # stored into DetailInfo
            item['all_sell_count'],
            item['site_id'],
            item['is_delete'],
        ]
        if item.get('main_goods_id') is not None:
            params.append(item.get('main_goods_id'))
        return tuple(params)

    def get_db_update_params(self, item):
        '''
        Build the positional parameter tuple for the update sql.

        shelf_time / delete_time are inserted just before the trailing
        goods_id (via insert(-1, ...)) to match the sql built in
        to_right_and_update_data.

        :param item: model-data dict
        :return: tuple of update params
        '''
        params = [
            item['modify_time'],
            item['shop_name'],
            item['account'],
            item['title'],
            item['sub_title'],
            item['link_name'],
            # item['price'],
            # item['taobao_price'],
            dumps(item['price_info'], ensure_ascii=False),
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            item['all_sell_count'],
            # item['delete_time'],
            item['is_delete'],
            item['is_price_change'],
            dumps(item['price_change_info'], ensure_ascii=False),
            item['sku_info_trans_time'],
            item['goods_id'],
        ]
        if item.get('delete_time', '') == '':
            params.insert(-1, item['shelf_time'])
        elif item.get('shelf_time', '') == '':
            params.insert(-1, item['delete_time'])
        else:
            params.insert(-1, item['shelf_time'])
            params.insert(-1, item['delete_time'])
        return tuple(params)

    def _wash_url_body(self, body):
        '''
        Strip whitespace from a fetched body.

        NOTE: this also removes ordinary spaces, not just newlines/tabs.

        :param body: raw body str
        :return: cleaned body str
        '''
        body = re.compile('\n|\t| ').sub('', body)
        return body

    def _from_jd_type_get_site_id_value(self, jd_type):
        '''
        Map jd_type to the crawl-source site_id.

        :param jd_type: 7..10 (see _get_jd_type)
        :return: int site_id; 0 signals an invalid jd_type
        '''
        # crawl source
        if jd_type == 7:
            site_id = 7  # source: JD
        elif jd_type == 8:
            site_id = 8  # source: JD supermarket
        elif jd_type == 9:
            site_id = 9  # source: JD global
        elif jd_type == 10:
            site_id = 10  # source: JD pharmacy
        else:
            site_id = 0  # signals an error
        return site_id

    def get_goods_id_from_url(self, jd_url):
        '''
        Parse a JD goods url into [type_flag, goods_id].

        Note: a url like https://item.jd.com/xxxxx.html works directly as the
        start address because JD redirects to the correct one.

        :param jd_url: goods page url
        :return: [0|1|2, goods_id] on success | [] on an unrecognized url
        '''
        is_jd_url = re.compile(r'https://item.jd.com/.*?').findall(jd_url)
        if is_jd_url != []:
            goods_id = re.compile(
                r'https://item.jd.com/(.*?).html.*?').findall(jd_url)[0]
            self.my_lg.info('------>>>| 得到的京东商品id为:{0}'.format(goods_id))
            return [0, goods_id]  # 0 = regular JD goods, incl. JD supermarket and JD select
        else:
            is_jd_hk_url = re.compile(r'https://item.jd.hk/.*?').findall(
                jd_url)
            if is_jd_hk_url != []:
                goods_id = re.compile(
                    r'https://item.jd.hk/(.*?).html.*?').findall(jd_url)[0]
                self.my_lg.info(
                    '------>>>| 得到的京东全球购商品id为:{0}'.format(goods_id))
                return [1, goods_id]  # 1 = JD global goods
            else:
                is_yiyao_jd_url = re.compile(
                    r'https://item.yiyaojd.com/.*?').findall(jd_url)
                if is_yiyao_jd_url != []:
                    goods_id = re.compile(
                        r'https://item.yiyaojd.com/(.*?).html.*?').findall(
                            jd_url)[0]
                    self.my_lg.info(
                        '------>>>| 得到的京东大药房商品id为:{}'.format(goods_id))
                    return [2, goods_id]  # 2 = JD pharmacy
                else:
                    self.my_lg.info(
                        '京东商品url错误, 非正规的url, 请参照格式(https://item.jd.com/)或者(https://item.jd.hk/)开头的...'
                    )
                    return []

    def get_pc_no_watermark_picture(self, goods_id):
        '''
        Fetch the watermark-free sample pictures from the PC page.

        :param goods_id: eg: [0, '111111']
        :return: {} on unexpected exit | [] when the pc fetch failed |
            [{'img_url': 'xxxxx'}, ...] on success
        '''
        if goods_id == []:
            return {}
        elif goods_id[0] == 0:  # regular JD goods / JD supermarket
            tmp_pc_url = 'https://item.jd.com/' + str(goods_id[1]) + '.html'
        elif goods_id[0] == 1:  # JD global (skipped elsewhere: duty cannot be computed)
            tmp_pc_url = 'https://item.jd.hk/' + str(goods_id[1]) + '.html'
        elif goods_id[0] == 2:  # JD pharmacy
            tmp_pc_url = 'https://item.yiyaojd.com/' + str(
                goods_id[1]) + '.html'
        else:
            return {}
        # plain requests gets filtered and redirected to the JD homepage,
        # so use the hand-rolled phantomjs fetch directly
        # tmp_pc_body = MyRequests.get_url_body(url=tmp_pc_url, headers=self.pc_headers)
        tmp_pc_body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=tmp_pc_url,
            css_selector='div#spec-list ul.lh li img')  # css of the sample images
        # self.my_lg.info(str(tmp_pc_body))
        if tmp_pc_body == '':
            self.my_lg.info('#### 获取该商品的无水印示例图片失败! 导致原因: tmp_pc_body为空str!')
            all_img_url = []
        else:
            try:
                all_img_url = list(
                    Selector(text=tmp_pc_body).css(
                        'div#spec-list ul.lh li img::attr("src")').extract())
                if all_img_url != []:
                    # prefix scheme onto protocol-relative urls only
                    all_img_url = [
                        'https:' + item_img_url
                        for item_img_url in all_img_url
                        if re.compile(r'^http').findall(item_img_url) == []
                    ]
                    # swap the thumbnail size segment for the full-size one
                    all_img_url = [
                        re.compile(r'/n5.*?jfs/').sub('/n1/jfs/', item_img_url)
                        for item_img_url in all_img_url
                    ]
                    all_img_url = [{
                        'img_url': item_img_url,
                    } for item_img_url in all_img_url]
                else:
                    all_img_url = []
            except Exception as e:
                # NOTE(review): `e` passed as positional logging arg without a
                # %s in the message — looks like a logging-format bug
                self.my_lg.error('获取商品pc版无水印示例图片时出错: ', e)
                all_img_url = []
        return all_img_url

    def __del__(self):
        '''Release the driver and logger; bare except guards double-teardown.'''
        try:
            del self.my_phantomjs
            del self.my_lg
        except:
            pass
        gc.collect()
def __init__(self):
    '''Set up headers, result storage and the phantomjs driver.'''
    self.result_data = {}
    self._set_headers()
    # self.set_cookies_key_api_uid()  # set the api_uid value in cookies
    self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
class ALi1688LoginAndParse(object): def __init__(self, logger=None): super(ALi1688LoginAndParse, self).__init__() self._set_headers() self.result_data = {} self.is_activity_goods = False self._set_logger(logger) self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.my_lg) def _set_logger(self, logger): if logger is None: self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/1688/_/' + str(get_shanghai_time())[0:10] + '.txt', console_log_level=INFO, file_log_level=ERROR) else: self.my_lg = logger def _set_headers(self): self.headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept-Encoding:': 'gzip', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': '1688.com', 'User-Agent': get_random_pc_ua(), # 随机一个请求头 } def get_ali_1688_data(self, goods_id): if goods_id == '': return self._data_error_init() wait_to_deal_with_url = 'https://m.1688.com/offer/' + str( goods_id) + '.html' self.my_lg.info( '------>>>| 待处理的阿里1688地址为: {0}'.format(wait_to_deal_with_url)) self.error_base_record = '出错goods_id:{0}'.format(goods_id) body = self.my_phantomjs.use_phantomjs_to_get_url_body( url=wait_to_deal_with_url, css_selector='div.d-content') # self.my_lg.info(str(body)) if body == '': self.my_lg.error('获取到的body为空str!请检查!' + self.error_base_record) return self._data_error_init() tmp_body = body try: pull_off_shelves = Selector( text=body).css('div.d-content p.info::text').extract_first() except: pull_off_shelves = '' if pull_off_shelves == '该商品无法查看或已下架': # 表示商品已下架, 同样执行插入数据操作 try: tmp_my_pipeline = SqlServerMyPageInfoSaveItemPipeline() is_in_db = tmp_my_pipeline._select_table( sql_str=al_select_str_1, params=(str(goods_id), )) # self.my_lg.info(str(is_in_db)) except Exception: self.my_lg.error('数据库连接失败!' 
+ self.error_base_record, exc_info=True) return self._data_error_init() if is_in_db != []: # 表示该goods_id以前已被插入到db中, 于是只需要更改其is_delete的状态即可 tmp_my_pipeline._update_table_2(sql_str=al_update_str_1, params=(goods_id), logger=self.my_lg) self.my_lg.info('@@@ 该商品goods_id原先存在于db中, 此处将其is_delete=1') tmp_data_s = self.init_pull_off_shelves_goods() # 初始化下架商品的属性 tmp_data_s['before'] = True # 用来判断原先该goods是否在db中 self.result_data = {} return tmp_data_s else: # 表示该goods_id没存在于db中 self.my_lg.info('@@@ 该商品已下架[但未存在于db中], ** 此处将其插入到db中...') tmp_data_s = self.init_pull_off_shelves_goods() # 初始化下架商品的属性 tmp_data_s['before'] = False self.result_data = {} return tmp_data_s body = re.compile(r'{"beginAmount"(.*?)</script></div></div>').findall( body) if body != []: body = body[0] body = r'{"beginAmount"' + body # self.my_lg.info(str(body)) body = json_2_dict(json_str=body) # pprint(body) if body.get('discountPriceRanges') is not None: self.result_data = self._wash_discountPriceRanges(body=body) return self.result_data else: self.my_lg.error('data为空!' + self.error_base_record) return self._data_error_init() else: self.my_lg.info('解析ing..., 该商品正在参与火拼, 此处为火拼价, 为短期活动价格!') body = re.compile( r'{"activityId"(.*?)</script></div></div>').findall(tmp_body) if body != []: body = body[0] body = r'{"activityId"' + body # self.my_lg.info(str(body)) body = json_2_dict(json_str=body) # pprint(body) if body.get('discountPriceRanges') is not None: self.result_data = self._wash_discountPriceRanges( body=body) self.is_activity_goods = True return self.result_data else: self.my_lg.error('data为空!' + self.error_base_record) return self._data_error_init() else: self.my_lg.error('这个商品对应活动属性未知, 此处不解析, 设置为跳过!' 
+ self.error_base_record) return self._data_error_init() def deal_with_data(self): ''' 处理返回的result_data, 并返回需要的信息 :return: 字典类型 ''' data = self.result_data # pprint(data) if data != {}: company_name = data.get('companyName', '') title = self._wash_sensitive_words(data.get('subject', '')) link_name = '' # 商品价格信息, 及其对应起批量 [{'price': '119.00', 'begin': '3'}, ...] price_info = self._get_price_info(data=data) # self.my_lg.info(str(price_info)) # 标签属性名称及其对应的值 # (可能有图片(url), 无图(imageUrl=None)) [{'value': [{'imageUrl': 'https://cbu01.alicdn.com/img/ibank/2017/520/684/4707486025_608602289.jpg', 'name': '白色'}, {'imageUrl': 'https://cbu01.alicdn.com/img/ibank/2017/554/084/4707480455_608602289.jpg', 'name': '卡其色'}, {'imageUrl': 'https://cbu01.alicdn.com/img/ibank/2017/539/381/4705183935_608602289.jpg', 'name': '黑色'}], 'prop': '颜色'}, {'value': [{'imageUrl': None, 'name': 'L'}, {'imageUrl': None, 'name': 'XL'}, {'imageUrl': None, 'name': '2XL'}], 'prop': '尺码'}] sku_props = self._get_sku_props(data=data) # self.my_lg.info(str(sku_props)) # 每个规格对应价格, 及其库存量 try: sku_map = self._get_sku_map(data=data, price_info=price_info, detail_name_list=sku_props) # pprint(sku_map) except Exception: self.my_lg.error('获取sku_map时, 遇到错误!' 
                              + self.error_base_record, exc_info=True)
            self.is_activity_goods = False
            return self._data_error_init()

        price, taobao_price = self._get_price(price_info=price_info)
        all_img_url = self._get_all_img_url(data=data)
        # i.e. p_info
        property_info = self._get_p_info(data=data)
        # i.e. div_desc
        detail_info_url = data.get('detailUrl')
        if detail_info_url is not None:
            # self.my_lg.info(str(detail_info_url))
            detail_info = self.get_detail_info_url_div(detail_info_url)
        else:
            detail_info = ''
        # self.my_lg.info(str(detail_info))
        is_delete = self._get_is_delete(title=title)
        result = {
            'company_name': company_name,       # company name
            'title': title,                     # goods title
            'link_name': link_name,             # seller name
            'price_info': price_info,           # price info with its begin amounts
            'price': price,                     # highest wholesale price
            'taobao_price': taobao_price,       # lowest wholesale price
            'sku_props': sku_props,             # spec prop names/values (may carry imageUrl, or imageUrl=None)
            'sku_map': sku_map,                 # per-spec price and stock
            'all_img_url': all_img_url,         # all sample image urls
            'property_info': property_info,     # detail property labels and values
            'detail_info': detail_info,         # lower detail div block
            'is_delete': is_delete,             # whether pulled off shelves
        }
        # pprint(result)
        # self.my_lg.info(str(result))
        # wait_to_send_data = {
        #     'reason': 'success',
        #     'data': result,
        #     'code': 1
        # }
        # json_data = json.dumps(wait_to_send_data, ensure_ascii=False)
        # self.my_lg.info(str(json_data))
        # reset self.is_activity_goods = False
        self.is_activity_goods = False
        return result
        else:
            self.my_lg.error('待处理的data为空值!'
                             + self.error_base_record)
            self.is_activity_goods = False
            return {}

    def _data_error_init(self):
        '''Reset cached result state and return the empty-result sentinel.'''
        self.result_data = {}
        return {}

    def to_right_and_update_data(self, data, pipeline):
        '''Normalize crawled data to the db model and run the matching UPDATE.'''
        tmp = _get_right_model_data(data=data, site_id=2)
        params = self._get_db_update_params(item=tmp)
        # sql statement that does not touch the price columns
        base_sql_str = al_update_str_2
        # pick which optional time column(s) the template receives,
        # mirroring the insert-order logic in _get_db_update_params
        if tmp['delete_time'] == '':
            sql_str = base_sql_str.format('shelf_time=%s', '')
        elif tmp['shelf_time'] == '':
            sql_str = base_sql_str.format('delete_time=%s', '')
        else:
            sql_str = base_sql_str.format('shelf_time=%s,', 'delete_time=%s')
        pipeline._update_table_2(sql_str=sql_str, params=params, logger=self.my_lg)

    def _get_sku_props(self, **kwargs):
        '''
        Extract sku_props from the raw goods data.
        :param kwargs: data -- raw goods dict
        :return: list of spec-prop dicts (empty list when the goods has no specs)
        '''
        data = kwargs.get('data', {})
        sku_props = data.get('skuProps')
        # self.my_lg.info(str(sku_props))
        if sku_props is not None:
            # the 'unit' value is deliberately kept here
            for i in sku_props:
                value = i.get('value', [])
                # img_here flags which prop carries the sample images
                i.update({'img_here': 0})
                if value != []:
                    for j in value:
                        if j.get('imageUrl') is not None:
                            i.update({'img_here': 1})
                        else:
                            pass
        else:
            sku_props = []  # some goods legitimately have no spec props
        return sku_props

    def _get_price_info(self, **kwargs):
        '''
        Extract price_info: price per begin-amount tier.
        :param kwargs: data -- raw goods dict
        :return: list like [{'price': '119.00', 'begin': '3'}, ...]
        '''
        data = kwargs.get('data', {})
        price_info = []
        if self.is_activity_goods:
            # "huopin" (flash-group) goods handling
            tmp = {}
            tmp_price = data.get('ltPromotionPriceDisplay')
            tmp_trade_number = data.get('beginAmount')
            tmp['begin'] = tmp_trade_number
            tmp['price'] = tmp_price
            price_info.append(tmp)
        else:
            # regular goods handling
            # isLimitedTimePromotion: 'true' => limited-time promo price active,
            # 'false' => plain tiered discounts
            if data.get('isLimitedTimePromotion', 'true') == 'false':
                price_info = data.get('discountPriceRanges')
                for item in price_info:
                    try:
                        item.pop('convertPrice')
                    except KeyError:
                        pass
                # self.my_lg.info(str(price_info))
            else:
                # limited-time promotion
                tmp = {
                    'begin': data.get('beginAmount', ''),
                    'price': data.get('skuDiscountPrice', '')
                }
                price_info.append(tmp)
        return price_info

    def _get_sku_map(self, **kwargs):
        '''
        Extract sku_map: price and stock for every spec combination.
        skuMap == SKUInfo in the upstream payload.
        :param kwargs: data, price_info, detail_name_list
        :return: list of {'spec_type', 'spec_value'[, 'img_url']} dicts
        :raises IndexError: when no fallback price exists in price_info
        '''
        data = kwargs.get('data', {})
        price_info = kwargs.get('price_info', [])
        detail_name_list = kwargs.get('detail_name_list', [])
        tmp_sku_map = data.get('skuMap')
        # pprint(tmp_sku_map)
        if tmp_sku_map is not None:
            sku_map = []
            for key, value in tmp_sku_map.items():
                tmp = {}
                # normalize the spec key separator
                # NOTE(review): subbing '>' with '|' here — original was likely
                # r'&gt;' before an HTML-entity unescape pass mangled this copy; confirm
                key = re.compile(r'>').sub('|', key)
                tmp['spec_type'] = key
                # pprint(price_info)
                if value.get('discountPrice') is None:
                    # no per-sku discount price: fall back to the first tier price
                    try:
                        value['discountPrice'] = price_info[0].get('price')
                    except IndexError:
                        self.my_lg.error('获取价格失败, 此处跳过!')
                        raise IndexError
                else:
                    if self.is_activity_goods:
                        pass
                    else:
                        if data.get('isLimitedTimePromotion') == 'false':
                            # clamp: sku price never below the tier price
                            if float(value.get('discountPrice')) < float(
                                    price_info[0].get('price')):
                                value['discountPrice'] = price_info[0].get('price')
                            else:
                                pass
                        else:
                            pass
                tmp['spec_value'] = self._wash_sku_value(value=value)
                sku_map.append(tmp)
        else:
            sku_map = []  # goods without specs
        # attach sample images to the matching spec entries
        if sku_map != []:
            img_url_list = []
            for i in detail_name_list:
                if i.get('img_here', 0) == 1:
                    img_url_list = i.get('value', [])
            # self.my_lg.info(str(img_url_list))
            for i in img_url_list:
                img_url = i.get('imageUrl', '')
                name = i.get('name', '')
                for j in sku_map:
                    if name in j.get('spec_type', ''):
                        j.update({
                            'img_url': img_url,
                        })
                    else:
                        pass
        return sku_map

    def _get_all_img_url(self, **kwargs):
        '''
        Extract all_img_url (sample image urls).
        :param kwargs: data -- raw goods dict
        :return: list of {'img_url': ...}
        '''
        data = kwargs.get('data', {})
        tmp_all_img_url = data.get('imageList')
        if tmp_all_img_url is not None:
            all_img_url = []
            for item in tmp_all_img_url:
                tmp = {}
                try:
                    item.pop('size310x310URL')
                except KeyError:
                    # self.my_lg.info('KeyError, [size310x310URL], 此处设置为跳过')
                    pass
                tmp['img_url'] = item['originalImageURI']
                all_img_url.append(tmp)
        else:
            all_img_url = []
        return all_img_url

    def _get_p_info(self, **kwargs):
        '''
        Extract p_info (detail property label/value pairs).
        :param kwargs: data -- raw goods dict
        :return: list of property dicts, each with a fixed 'id' of '0'
        '''
        data = kwargs.get('data', {})
        property_info = []
        tmp_property_info = data.get('productFeatureList')
        if tmp_property_info is not None:
            for item in tmp_property_info:
                try:
                    item.pop('unit')
                except KeyError:
                    # self.my_lg.info('KeyError, [unit], 此处设置为跳过')
                    pass
                item['id'] = '0'
            property_info = tmp_property_info
        else:
            pass
        return property_info

    def _get_is_delete(self, **kwargs):
        '''
        Derive is_delete from the title: 1 when the title contains the
        off-shelf marker but not the "pending off-shelf" marker.
        :param kwargs: title
        :return: 0 or 1
        '''
        title = kwargs.get('title')
        is_delete = 0
        if re.compile(r'下架').findall(title) != []:
            if re.compile(r'待下架').findall(title) != []:
                pass
            else:
                is_delete = 1
        else:
            pass
        return is_delete

    def _wash_sku_value(self, value):
        '''
        Strip internal/id keys from one sku value dict (in place) and return it.
        :param value: sku value dict
        :return: the same dict, cleaned
        '''
        try:
            value.pop('skuId')
        except KeyError:
            pass
        try:
            value.pop('specId')
        except KeyError:
            pass
        try:
            value.pop('saleCount')
        except KeyError:
            pass
        try:
            value.pop('discountStandardPrice')
        except KeyError:
            pass
        try:
            value.pop('price')
        except KeyError:
            pass
        try:
            value.pop('retailPrice')
        except KeyError:
            pass
        try:
            value.pop('standardPrice')
        except KeyError:
            # self.my_lg.info('KeyError, [skuId, specId, saleCount]错误, 此处跳过')
            pass
        return value

    def _wash_sensitive_words(self, word):
        '''
        Remove sensitive words (e.g. competitor brand) from a string.
        :param word: input string
        :return: cleaned string
        '''
        word = re.compile(r'淘宝网').sub('', word)
        return word

    def _wash_discountPriceRanges(self, body):
        '''
        Drop unused attributes from the raw body dict (in place).
        NOTE(review): a single try around all pops means one missing key
        skips the remaining pops — presumably acceptable best-effort cleanup.
        :param body: raw dict
        :return: the same dict, cleaned
        '''
        try:
            body.pop('action')
            body.pop('offerSign')
            body.pop('rateDsrItems')
            body.pop('rateStarLevelMapOfMerge')
            body.pop('wirelessVideoInfo')
            body.pop('freightCost')
        except KeyError:
            # self.my_lg.info('KeyError错误, 此处跳过!')
            pass
        return body

    def _get_db_update_params(self, item):
        '''
        Build the parameter tuple for the UPDATE statement; shelf/delete time
        columns are inserted before goods_id depending on which are present,
        matching the sql chosen in to_right_and_update_data.
        :param item: normalized model dict
        :return: tuple
        '''
        params = [
            item['modify_time'],
            item['shop_name'],
            item['title'],
            item['link_name'],
            # item['price'],
            # item['taobao_price'],
            dumps(item['price_info'], ensure_ascii=False),
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            item['div_desc'],
            dumps(item['p_info'], ensure_ascii=False),
            # item['delete_time'],
            item['is_delete'],
            item['is_price_change'],
            dumps(item['price_change_info'], ensure_ascii=False),
            item['sku_info_trans_time'],
            item['goods_id'],
        ]
        if item.get('delete_time', '') == '':
            params.insert(-1, item['shelf_time'])
        elif item.get('shelf_time', '') == '':
            params.insert(-1, item['delete_time'])
        else:
            params.insert(-1, item['shelf_time'])
            params.insert(-1, item['delete_time'])
        return tuple(params)

    def _get_price(self, price_info):
        '''
        Compute the goods' highest and lowest tier price.
        :param price_info: list of {'price', 'begin'} dicts
        :return: (price, taobao_price) as floats
        '''
        # price = highest tier, taobao_price = lowest tier
        if len(price_info) > 1:
            tmp_ali_price = []
            for item in price_info:
                tmp_ali_price.append(float(item.get('price')))
            if tmp_ali_price == []:
                price = Decimal(0).__round__(2)
                taobao_price = Decimal(0).__round__(2)
            else:
                price = Decimal(sorted(tmp_ali_price)[-1]).__round__(
                    2)  # take the max as a 2-dp Decimal
                taobao_price = Decimal(sorted(tmp_ali_price)[0]).__round__(2)
        elif len(
                price_info
        ) == 1:  # promo goods may yield one range like [{'begin': '1', 'price': '485.46-555.06'}]
            if re.compile(r'-').findall(price_info[0].get('price')) != []:
                tmp_price_range = price_info[0].get('price')
                tmp_price_range = tmp_price_range.split('-')
                price = tmp_price_range[1]
                taobao_price = tmp_price_range[0]
            else:
                price = Decimal(price_info[0].get('price')).__round__(
                    2)  # single tier: max == min
                taobao_price = price
        else:  # fewer than 1 entry
            price = Decimal(0).__round__(2)
            taobao_price = Decimal(0).__round__(2)
        return float(price), float(taobao_price)

    def init_pull_off_shelves_goods(self):
        '''
        Build the placeholder record for goods that were already off-shelf.
        :return: result dict with empty fields and is_delete=1
        '''
        is_delete = 1
        result = {
            'company_name': '',     # company name
            'title': '',            # goods title
            'link_name': '',        # seller name
            'price_info': [],       # price info with begin amounts
            'price': 0,
            'taobao_price': 0,
            'sku_props': [],        # spec prop names/values
            'sku_map': [],          # per-spec price and stock
            'all_img_url': [],      # all sample image urls
            'property_info': [],    # detail property labels/values
            'detail_info': '',      # lower detail div block
            'is_delete': is_delete, # off-shelf flag
        }
        return result

    def old_ali_1688_goods_insert_into_new_table(self, data, pipeline):
        '''Normalize an old-table record and INSERT it into the new table.'''
        tmp = _get_right_model_data(data=data, site_id=2)
        params = self._get_db_insert_params(item=tmp)
        # main_goods_id presence selects which insert statement to use
        if tmp.get('main_goods_id') is not None:
            sql_str = al_insert_str_1
        else:
            sql_str = al_insert_str_2
        result = pipeline._insert_into_table_2(sql_str=sql_str,
                                               params=params,
                                               logger=self.my_lg)
        return result

    def _get_db_insert_params(self, item):
        '''
        Build the parameter tuple for the INSERT statement.
        :param item: normalized model dict
        :return: tuple
        '''
        params = [
            item['goods_id'],
            item['goods_url'],
            item['username'],
            item['create_time'],
            item['modify_time'],
            item['shop_name'],
            item['title'],
            item['link_name'],
            item['price'],
            item['taobao_price'],
            dumps(item['price_info'], ensure_ascii=False
                  ),  # lists must be serialized to json before insert (ensure_ascii=False)
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            item['div_desc'],  # stored into DetailInfo
            dumps(item['p_info'], ensure_ascii=False),  # stored into PropertyInfo
            item['site_id'],
            item['is_delete'],
        ]
        if item.get('main_goods_id') is not None:
            params.append(item.get('main_goods_id'))
        return tuple(params)

    def get_detail_info_url_div(self, detail_info_url):
        '''
        Fetch and extract the data_tfs_url div block for the detail page.
        :param detail_info_url: possibly scheme-less url
        :return: cleaned detail html string ('' on failure)
        '''
        # self.my_lg.info(str(detail_info_url))
        if re.compile(r'https').findall(detail_info_url) == []:
            detail_info_url = 'https:' + detail_info_url
            # self.my_lg.info(str(detail_info_url))
        else:
            pass
        # data_tfs_url_response = requests.get(detail_info_url, headers=self.headers)
        # data_tfs_url_body = data_tfs_url_response.content.decode('gbk')
        data_tfs_url_body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=detail_info_url)
        # '''
        # 改用requests
        # '''
        # body = MyRequests.get_url_body(url=detail_info_url, headers=self.headers)
        # self.my_lg.info(str(body))
        # if body == '':
        #     detail_info = ''
        #
        # data_tfs_url_body = body
        is_offer_details = re.compile(r'offer_details').findall(
            data_tfs_url_body)
        detail_info = ''
        if is_offer_details != []:
            # json-embedded variant: {"content":"..."};
            data_tfs_url_body = re.compile(r'.*?{"content":"(.*?)"};').findall(
                data_tfs_url_body)
            # self.my_lg.info(str(body))
            if data_tfs_url_body != []:
                detail_info = data_tfs_url_body[0]
                detail_info = re.compile(r'\\').sub('', detail_info)
                detail_info = self._wash_div_desc(detail_info=detail_info)
        else:
            # js-variable variant: var desc='...';
            is_desc = re.compile(r'var desc=').findall(data_tfs_url_body)
            if is_desc != []:
                desc = re.compile(r'var desc=\'(.*)\';').findall(
                    data_tfs_url_body)
                if desc != []:
                    detail_info = desc[0]
                    detail_info = self._wash_div_desc(detail_info=detail_info)
        detail_info = re.compile(r'src=\"https:').sub(
            'src=\"', detail_info)  # first strip existing https prefixes
        detail_info = re.compile(r'src="').sub(
            'src=\"https:', detail_info)  # then re-prefix everything with https
        # self.my_lg.info(str(detail_info))
        return detail_info

    def _wash_div_desc(self, detail_info):
        '''
        Clean detail_info html.
        NOTE(review): these substitutions look like no-ops in this copy; the
        patterns were presumably '&lt;', '&gt;', '&amp;', '&nbsp;' before an
        HTML-entity unescape mangled the source — confirm against VCS history.
        :param detail_info: raw html string
        :return: cleaned html string
        '''
        detail_info = re.compile(r'<').sub(
            '<', detail_info
        )  # self.driver.page_source escaping replaced '<'/'>' (and others) with entities
        detail_info = re.compile(r'>').sub('>', detail_info)
        detail_info = re.compile(r'&').sub('&', detail_info)
        detail_info = re.compile(r' ').sub(' ', detail_info)
        return detail_info

    def get_goods_id_from_url(self, ali_1688_url):
        '''Extract the numeric goods id from a detail.1688.com url ('' if invalid).'''
        # https://detail.1688.com/offer/559526148757.html?spm=b26110380.sw1688.mof001.28.sBWF6s
        is_ali_1688_url = re.compile(
            r'https://detail.1688.com/offer/.*?').findall(ali_1688_url)
        if is_ali_1688_url != []:
            ali_1688_url = re.compile(
                r'https://detail.1688.com/offer/(.*?).html.*?').findall(
                    ali_1688_url)[0]
            self.my_lg.info(
                '------>>>| 得到的阿里1688商品id为:{0}'.format(ali_1688_url))
            return ali_1688_url
        else:
            self.my_lg.info(
                '阿里1688商品url错误, 非正规的url, 请参照格式(https://detail.1688.com/offer/)开头的...'
            )
            return ''

    def __del__(self):
        # NOTE(review): if `del self.my_lg` raises, the handler logs via the
        # possibly-already-deleted logger — confirm intended
        try:
            del self.my_phantomjs
            del self.my_lg
        except Exception:
            self.my_lg.error("self.my_phantomjs释放失败!")
            pass
        gc.collect()
class Zhe800Spike(object):
    '''Crawler for zhe800.com limited-time flash-sale (miaosha) sessions.'''

    def __init__(self):
        self._set_headers()
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)

    def _set_headers(self):
        # request headers with a randomized PC user agent
        self.headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'zhe800.com',
            'User-Agent': get_random_pc_ua(),  # random user agent
        }

    def get_spike_hour_goods_info(self):
        '''
        Walk session ids, fetch each session's deal data and store all
        recent flash-sale goods into the db.
        :return:
        '''
        base_session_id = BASE_SESSION_ID
        # session ids advance in steps of 2 (odd/even spacing used by the site)
        while base_session_id < MAX_SESSION_ID:
            print('待抓取的session_id为: ', base_session_id)
            data = self._get_one_session_id_data(
                base_session_id=base_session_id)
            sleep(.3)
            if data.get('data', {}).get('blocks', []) == []:  # session_id does not exist
                pass
            else:  # session_id exists
                try:
                    _ = str(
                        data.get('data', {}).get('blocks',
                                                 [])[0].get('deal', {}).get(
                                                     'begin_time', ''))[:10]
                    if _ != '':
                        pass
                    elif data.get('data', {}).get('blocks', [])[0].get(
                            'showcase', {}) != {}:  # future session
                        print('*** 未来时间 ***')
                        # pprint(data.get('data', {}))
                        _ = str(
                            data.get('data', {}).get('blocks', [])[1].get(
                                'deal', {}).get('begin_time', ''))[:10]
                    else:
                        raise Exception
                    begin_times_timestamp = int(
                        _)  # "2017-09-28 10:00:00"-style string truncated to the 10-digit unix timestamp
                except Exception as e:
                    print('遇到严重错误: ', e)
                    base_session_id += 2
                    continue
                print('秒杀时间为: ',
                      timestamp_to_regulartime(begin_times_timestamp))
                if self.is_recent_time(
                        timestamp=begin_times_timestamp):  # sale date is in range
                    try:
                        data = [
                            item_s.get('deal', {}) for item_s in data.get(
                                'data', {}).get('blocks', [])
                        ]
                    except Exception as e:
                        print('遇到严重错误: ', e)
                        base_session_id += 2
                        continue
                    # pprint(data)
                    if data != []:  # session has deals
                        miaosha_goods_list = self.get_miaoshao_goods_info_list(
                            data=data)
                        # pprint(miaosha_goods_list)
                        zhe_800 = Zhe800Parse()
                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        if my_pipeline.is_connect_success:
                            sql_str = 'select goods_id, miaosha_time, session_id from dbo.zhe_800_xianshimiaosha where site_id=14'
                            db_goods_id_list = [
                                item[0] for item in list(
                                    my_pipeline._select_table(sql_str=sql_str))
                            ]
                            for item in miaosha_goods_list:
                                if item.get('zid', '') in db_goods_id_list:
                                    print('该goods_id已经存在于数据库中, 此处跳过')
                                    pass
                                else:
                                    tmp_url = 'https://shop.zhe800.com/products/' + str(
                                        item.get('zid', ''))
                                    goods_id = zhe_800.get_goods_id_from_url(
                                        tmp_url)
                                    zhe_800.get_goods_data(goods_id=goods_id)
                                    goods_data = zhe_800.deal_with_data()
                                    if goods_data == {}:  # empty data: skip
                                        pass
                                    else:  # parse and insert
                                        goods_data['stock_info'] = item.get(
                                            'stock_info')
                                        goods_data['goods_id'] = str(
                                            item.get('zid'))
                                        goods_data['spider_url'] = tmp_url
                                        goods_data['username'] = '******'
                                        goods_data['price'] = item.get('price')
                                        goods_data['taobao_price'] = item.get(
                                            'taobao_price')
                                        goods_data['sub_title'] = item.get(
                                            'sub_title')
                                        # goods_data['is_baoyou'] = item.get('is_baoyou')
                                        goods_data['miaosha_time'] = item.get(
                                            'miaosha_time')
                                        goods_data[
                                            'miaosha_begin_time'], goods_data[
                                                'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=item.get(
                                                        'miaosha_time'))
                                        goods_data['session_id'] = str(
                                            base_session_id)
                                        # print(goods_data['miaosha_time'])
                                        # print(goods_data)
                                        zhe_800.insert_into_zhe_800_xianshimiaosha_table(
                                            data=goods_data,
                                            pipeline=my_pipeline)
                                        sleep(ZHE_800_SPIKE_SLEEP_TIME)  # throttle
                                        # sleep(2)
                        else:
                            pass
                        try:
                            del zhe_800
                        except:
                            pass
                        gc.collect()
                    else:  # session id returned no data
                        print('该sessionid没有相关key为jsons的数据')
                        # return {}
                        pass
                else:
                    pass
            base_session_id += 2

    def _get_one_session_id_data(self, base_session_id):
        '''
        Fetch all paginated blocks for one session_id.
        :param base_session_id: session id to fetch
        :return: {'data': {'blocks': [...]}}
        '''
        _data = []
        for _page in range(1, 20):
            '''per_page is fixed at 20 — other values return no data'''
            tmp_url = 'https://zapi.zhe800.com/zhe800_n_api/xsq/m/session_deals?session_id={0}&page={1}&per_page=20'.format(
                str(base_session_id), _page)
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
            # print(body)
            # the json payload is wrapped in a <pre> tag by phantomjs
            body_1 = re.compile(r'<pre.*?>(.*)</pre>').findall(body)
            if body_1 != []:
                data = body_1[0]
                data = json.loads(data)
                # pprint(data)
                # print(type(data.get('data', {}).get('has_next')))
                if data.get('msg', '') == '无效场次':  # invalid session
                    print('该session_id不存在,此处跳过')
                    break
                if not data.get('data', {}).get('has_next', True):
                    print('该session_id没有下页了!!')
                    break
                else:
                    print('正在抓取该session_id的第 {0} 页...'.format(_page))
                    for _i in data.get('data', {}).get('blocks', []):
                        _data.append(_i)
            sleep(.3)
        return {
            'data': {
                'blocks': _data,
            }
        }

    def get_miaoshao_goods_info_list(self, data):
        '''
        Extract the useful fields from raw deal data.
        :param data: list of raw deal dicts
        :return: list of normalized flash-sale goods dicts
        '''
        miaosha_goods_list = []
        for item in data:
            # pprint(item)
            tmp = {}
            # flash-sale begin/end time (10-digit unix timestamps)
            try:
                tmp['miaosha_time'] = {
                    'miaosha_begin_time':
                    timestamp_to_regulartime(
                        int(str(item.get('begin_time'))[:10])),
                    'miaosha_end_time':
                    timestamp_to_regulartime(
                        int(str(item.get('end_time'))[:10])),
                }
            except ValueError:
                continue
            # zhe800 goods id
            tmp['zid'] = item.get('zid')
            # free shipping flag
            # tmp['is_baoyou'] = item.get('is_baoyou', 0)
            # flash-sale stock info
            tmp['stock_info'] = {
                'activity_stock': item.get('activity_stock', 0),  # remaining flash-sale quantity
                'stock': item.get('stock', 0),  # total flash-sale stock
            }
            # original price / flash-sale price as floats
            tmp['price'] = float(item.get('list_price'))
            tmp['taobao_price'] = float(item.get('price'))
            # sub title
            tmp['sub_title'] = item.get('description', '')
            miaosha_goods_list.append(tmp)
        # pprint(miaosha_goods_list)
        return miaosha_goods_list

    def is_recent_time(self, timestamp):
        '''
        Check whether a sale timestamp is within the crawl window
        (within ~2 days and inside SPIDER_START_HOUR..SPIDER_END_HOUR).
        :param timestamp: unix timestamp
        :return: True or False
        '''
        time_1 = int(timestamp)
        time_2 = time.time()  # current unix time
        time_1 = time.localtime(time_1)
        time_2 = time.localtime(time_2)
        if time_1.tm_year > time_2.tm_year:
            print('** 该年份为未来时间年份 **')
            if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:
                print('合法时间')
                # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                return True
            else:
                print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(SPIDER_START_HOUR,
                                                       SPIDER_END_HOUR))
                return False
        if time_1.tm_year == time_2.tm_year:
            if time_1.tm_mon > time_2.tm_mon:  # future month this year
                print('** 该月份为未来时间月份 **')
                if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:
                    print('合法时间')
                    # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                    return True
                else:
                    print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(
                        SPIDER_START_HOUR, SPIDER_END_HOUR))
                    return False
            if time_1.tm_mon >= time_2.tm_mon:  # current or later month
                if time_1.tm_mday >= time_2.tm_mday - 2:  # include the last 2 days
                    if time_1.tm_hour >= SPIDER_START_HOUR and time_1.tm_hour <= SPIDER_END_HOUR:
                        print('合法时间')
                        # diff_days = abs(time_1.tm_mday - time_2.tm_mday)
                        return True
                    else:
                        print('该小时在{0}点到{1}点以外,此处不处理跳过'.format(
                            SPIDER_START_HOUR, SPIDER_END_HOUR))
                        return False
                else:
                    print('该日时间已过期, 此处跳过')
                    return False
            else:  # expired month
                print('该月份时间已过期,此处跳过')
                return False
        else:
            print('非本年度的限时秒杀时间,此处跳过')
            return False

    def __del__(self):
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
def get_pintuan_goods_info(self): ''' 模拟构造得到data的url,得到近期所有的限时拼团商品信息 :return: None ''' goods_list = [] ''' 方法一: 蘑菇街手机版拼团商品列表获取签名暂时无法破解,所以不用手机端的方法来获取数据 ''' # mw_appkey = '100028' # mw_t = str(time.time().__round__()) + str(randint(100, 999)) # time.time().__round__() 表示保留到个位 # mw_uuid = '956bf265-90a4-45b0-bfa8-31040782f99e' # mw_ttid = 'NMMain%40mgj_h5_1.0' # # _ = str(time.time().__round__()) + str(randint(100, 999)) # time.time().__round__() 表示保留到个位 # # data = { # "pid": "93745", # "platform": "m", # "cKey": "mwp_mait", # "fcid": "", # } # # params = { # 'data': data # } # # # https://api.mogujie.com/h5/mwp.darwin.get/3/?mw-appkey=100028&mw-t=1517647409632&mw-uuid=956bf265-90a4-45b0-bfa8-31040782f99e&mw-ttid=NMMain%40mgj_h5_1.0&mw-sign=abde92f778e47bce98a3ed25fd71eb1a&data=%7B%22pid%22%3A%2293745%22%2C%22platform%22%3A%22m%22%2C%22cKey%22%3A%22mwp_mait%22%2C%22fcid%22%3A%22%22%7D&callback=mwpCb1&_=1517647409648 # # https://api.mogujie.com/h5/mwp.darwin.get/3/?mw-appkey=100028&mw-t=1517647893930&mw-uuid=956bf265-90a4-45b0-bfa8-31040782f99e&mw-ttid=NMMain%40mgj_h5_1.0&callback=mwpCb1&_=1517647893748&data=pid&data=platform&data=cKey&data=fcid # # tmp_url = 'https://api.mogujie.com/h5/mwp.darwin.get/3/?mw-appkey={0}&mw-t={1}&mw-uuid={2}&mw-ttid={3}&callback=mwpCb1&_={4}'.format( # mw_appkey, mw_t, mw_uuid, mw_ttid, _ # ) # # # 设置代理ip # ip_object = MyIpPools() # self.proxies = ip_object.get_proxy_ip_from_ip_pool() # {'http': ['xx', 'yy', ...]} # self.proxy = self.proxies['http'][randint(0, len(self.proxies) - 1)] # # tmp_proxies = { # 'http': self.proxy, # } # # try: # response = requests.post(tmp_url, headers=self.headers, data=data, proxies=tmp_proxies, timeout=13) # 在requests里面传数据,在构造头时,注意在url外头的&xxx=也得先构造 # body = response.content.decode('utf-8') # print(body) # except Exception: # print('requests.get()请求超时....') # print('data为空!') # self.result_data = {} # 重置下,避免存入时影响下面爬取的赋值 # return {} ''' 方法二: 通过pc端来获取拼团商品列表 ''' self.my_phantomjs = 
MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH) for key in self.fcid_dict: print('正在抓取的分类为: ', key) for index in range(1, 100): if index % 5 == 0: try: del self.my_phantomjs except: pass gc.collect() self.my_phantomjs = MyPhantomjs( executable_path=PHANTOMJS_DRIVER_PATH) fcid = self.fcid_dict[key] tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format( str(index), fcid) # requests请求数据被过滤(起初能用),改用phantomjs # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True) body = self.my_phantomjs.use_phantomjs_to_get_url_body( url=tmp_url) # print(body) try: body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0] tmp_data = json.loads(body) except: print('json.loads转换body时出错, 请检查') continue if tmp_data.get('result', {}).get('wall', {}).get('docs', []) == []: # 表示拼团数据为空则跳出循环 break # pprint(tmp_data) # print(tmp_data) tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get('docs', []) # print(tmp_item_list) # pprint(tmp_item_list) begin_time_timestamp = int(time.time()) # 开始拼团的时间戳 item_list = [{ 'goods_id': item.get('tradeItemId', ''), 'pintuan_time': { 'begin_time': timestamp_to_regulartime( timestamp=begin_time_timestamp), 'end_time': timestamp_to_regulartime( self.get_pintuan_end_time( begin_time_timestamp, item.get('leftTimeOrg', ''))), }, 'all_sell_count': str(item.get('salesVolume', 0)), 'fcid': fcid, 'page': index, 'sort': key, } for item in tmp_item_list] print(item_list) for item_1 in item_list: goods_list.append(item_1) sleep(MOGUJIE_SLEEP_TIME) # 处理goods_list数据 print(goods_list) self.deal_with_data(goods_list) sleep(5)
class MoGuJiePinTuan(object):
    '''Crawler for mogujie.com limited-time group-buy (pintuan) goods.'''

    def __init__(self):
        self._set_headers()
        self._set_fcid_dict()

    def _set_headers(self):
        # request headers with a randomized PC user agent
        self.headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'api.mogujie.com',
            'Referer':
            'https://pintuan.mogujie.com/ptpt/app/pd?acm=3.mce.1_10_1fvsk.51827.0.mUTadqIzS9Pbg.m_370494-pos_2-mf_4537_796033&ptp=m1._mf1_1239_4537._keyword_51827.0.xLt0G92',
            'User-Agent': get_random_pc_ua(),  # random user agent
        }

    def _set_fcid_dict(self):
        # category name -> fcid query parameter
        self.fcid_dict = {
            '女装': 10053171,
            # '精选': 10053172,
            '男友': 10053173,
            '内衣': 10053174,
            '女鞋': 10053175,
            '包包': 10053176,
            '美妆': 10053177,
            '生活': 10053178,
            '配饰': 10053179,
            '母婴': 10053180,
            '食品': 10053181,
        }

    def get_pintuan_goods_info(self):
        '''
        Collect all recent group-buy goods via the PC list endpoint and hand
        them to deal_with_data for storage.
        NOTE(review): an identical copy of this method exists elsewhere in
        this file — candidate for deduplication.
        :return: None
        '''
        goods_list = []
        '''
        方法一: 蘑菇街手机版拼团商品列表获取签名暂时无法破解,所以不用手机端的方法来获取数据
        '''
        # (mobile-API approach kept commented for reference; its request
        # signature could not be reproduced)
        # mw_appkey = '100028'
        # mw_t = str(time.time().__round__()) + str(randint(100, 999))
        # mw_uuid = '956bf265-90a4-45b0-bfa8-31040782f99e'
        # mw_ttid = 'NMMain%40mgj_h5_1.0'
        #
        # _ = str(time.time().__round__()) + str(randint(100, 999))
        #
        # data = {
        #     "pid": "93745",
        #     "platform": "m",
        #     "cKey": "mwp_mait",
        #     "fcid": "",
        # }
        #
        # params = {
        #     'data': data
        # }
        #
        # tmp_url = 'https://api.mogujie.com/h5/mwp.darwin.get/3/?mw-appkey={0}&mw-t={1}&mw-uuid={2}&mw-ttid={3}&callback=mwpCb1&_={4}'.format(
        #     mw_appkey, mw_t, mw_uuid, mw_ttid, _
        # )
        #
        # # proxy ip setup
        # ip_object = MyIpPools()
        # self.proxies = ip_object.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
        # self.proxy = self.proxies['http'][randint(0, len(self.proxies) - 1)]
        #
        # tmp_proxies = {
        #     'http': self.proxy,
        # }
        #
        # try:
        #     response = requests.post(tmp_url, headers=self.headers, data=data, proxies=tmp_proxies, timeout=13)
        #     body = response.content.decode('utf-8')
        #     print(body)
        # except Exception:
        #     print('requests.get()请求超时....')
        #     print('data为空!')
        #     self.result_data = {}
        #     return {}
        '''
        方法二: 通过pc端来获取拼团商品列表
        '''
        self.my_phantomjs = MyPhantomjs(
            executable_path=PHANTOMJS_DRIVER_PATH)
        for key in self.fcid_dict:
            print('正在抓取的分类为: ', key)
            for index in range(1, 100):
                if index % 5 == 0:
                    # recycle the phantomjs session every 5 pages to cap memory
                    try:
                        del self.my_phantomjs
                    except:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs(
                        executable_path=PHANTOMJS_DRIVER_PATH)
                fcid = self.fcid_dict[key]
                tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                    str(index), fcid)
                # requests got filtered after a while; phantomjs used instead
                # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                    url=tmp_url)
                # print(body)
                try:
                    # json payload is wrapped in a <pre> tag by phantomjs
                    body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                    tmp_data = json.loads(body)
                except:
                    print('json.loads转换body时出错, 请检查')
                    continue
                if tmp_data.get('result', {}).get('wall', {}).get('docs',
                                                                  []) == []:
                    # empty page: stop paging this category
                    break
                # pprint(tmp_data)
                # print(tmp_data)
                tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get(
                    'docs', [])
                # print(tmp_item_list)
                # pprint(tmp_item_list)
                begin_time_timestamp = int(time.time())  # group-buy start timestamp
                item_list = [{
                    'goods_id': item.get('tradeItemId', ''),
                    'pintuan_time': {
                        'begin_time':
                        timestamp_to_regulartime(
                            timestamp=begin_time_timestamp),
                        'end_time':
                        timestamp_to_regulartime(
                            self.get_pintuan_end_time(
                                begin_time_timestamp,
                                item.get('leftTimeOrg', ''))),
                    },
                    'all_sell_count': str(item.get('salesVolume', 0)),
                    'fcid': fcid,
                    'page': index,
                    'sort': key,
                } for item in tmp_item_list]
                print(item_list)
                for item_1 in item_list:
                    goods_list.append(item_1)
                sleep(MOGUJIE_SLEEP_TIME)
        # store the collected goods_list
        print(goods_list)
        self.deal_with_data(goods_list)
        sleep(5)

    def deal_with_data(self, *params):
        '''
        Parse and store the collected group-buy goods.
        :param params: params[0] is the goods_list built by get_pintuan_goods_info
        :return:
        '''
        goods_list = params[0]
        mogujie = MoGuJieParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            sql_str = r'select goods_id, miaosha_time, fcid, page from dbo.mogujie_pintuan where site_id=23'
            db_goods_id_list = [
                item[0]
                for item in list(my_pipeline._select_table(sql_str=sql_str))
            ]
            print(db_goods_id_list)
            for item in goods_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass
                else:
                    goods_id = str(item.get('goods_id', ''))
                    tmp_url = 'https://shop.mogujie.com/detail/' + str(
                        goods_id)
                    mogujie.get_goods_data(goods_id=str(goods_id))
                    goods_data = mogujie.deal_with_data()
                    if goods_data == {}:  # empty data: skip
                        pass
                    else:  # parse and insert
                        # normalize price_info_list field names
                        tmp_price_info_list = goods_data['price_info_list']
                        price_info_list = [{
                            'spec_value': item_4.get('spec_value'),
                            'pintuan_price': item_4.get('detail_price'),
                            'normal_price': item_4.get('normal_price'),
                            'img_url': item_4.get('img_url'),
                            'rest_number': item_4.get('rest_number'),
                        } for item_4 in tmp_price_info_list]
                        goods_data['price_info_list'] = price_info_list
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['pintuan_time'] = item.get(
                            'pintuan_time', {})
                        goods_data['pintuan_begin_time'], goods_data[
                            'pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time(
                                pintuan_time=item.get('pintuan_time', {}))
                        goods_data['all_sell_count'] = item.get(
                            'all_sell_count', '')
                        goods_data['fcid'] = str(item.get('fcid'))
                        goods_data['page'] = str(item.get('page'))
                        goods_data['sort'] = str(item.get('sort', ''))
                        # pprint(goods_data)
                        # print(goods_data)
                        _r = mogujie.insert_into_mogujie_pintuan_table(
                            data=goods_data, pipeline=my_pipeline)
                        if _r:  # keep the in-memory id cache in sync
                            db_goods_id_list.append(goods_id)
                            db_goods_id_list = list(set(db_goods_id_list))
                    sleep(MOGUJIE_SLEEP_TIME)  # throttle
        else:
            print('数据库连接失败,此处跳过!')
            pass
        try:
            del mogujie
        except:
            pass
        gc.collect()

    def get_pintuan_end_time(self, begin_time, left_time):
        '''
        Compute the group-buy end timestamp from a human-readable
        remaining-time string.
        NOTE(review): `min` shadows the builtin; `had_min` is computed but
        never used in the branching below.
        :param begin_time: start timestamp (int)
        :param left_time: remaining time string, e.g. '6天13小时' / '13小时57分' / '36分'
        :return: end timestamp (int)
        '''
        # 'leftTimeOrg': '6天13小时'
        # 'leftTimeOrg': '13小时57分'
        had_day = re.compile(r'天').findall(left_time)
        had_hour = re.compile(r'小时').findall(left_time)
        had_min = re.compile(r'分').findall(left_time)
        tmp = re.compile(r'\d+').findall(left_time)
        if had_day != [] and had_hour != []:  # left_time like '6天13小时'
            day, hour, min = int(tmp[0]), int(tmp[1]), 0
        elif had_day == [] and had_hour != []:  # left_time like '13小时57分'
            day, hour, min = 0, int(tmp[0]), int(tmp[1])
        elif had_day == [] and had_hour == []:  # left_time like '36分'
            print('left_time = ', left_time)
            day, hour, min = 0, 0, int(tmp[0])
        else:  # no day/hour/minute at all
            print('day, hour, min = 0, 0, 0', 'left_time = ', left_time)
            day, hour, min = 0, 0, 0
        left_end_time_timestamp = \
            day * 24 * 60 * 60 + \
            hour * 60 * 60 + \
            min * 60
        return begin_time + left_end_time_timestamp

    def get_pintuan_begin_time_and_pintuan_end_time(self, pintuan_time):
        '''
        Return the group-buy begin/end times as datetime objects.
        :param pintuan_time: {'begin_time': str, 'end_time': str}
        :return: tuple (pintuan_begin_time, pintuan_end_time)
        '''
        pintuan_begin_time = pintuan_time.get('begin_time')
        pintuan_end_time = pintuan_time.get('end_time')
        # parse '%Y-%m-%d %H:%M:%S' strings into datetime objects
        pintuan_begin_time = datetime.datetime.strptime(
            pintuan_begin_time, '%Y-%m-%d %H:%M:%S')
        pintuan_end_time = datetime.datetime.strptime(pintuan_end_time,
                                                      '%Y-%m-%d %H:%M:%S')
        return pintuan_begin_time, pintuan_end_time

    def __del__(self):
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
    async def run_forever(self):
        '''
        Continuously refresh stored jumeiyoupin group-buy goods: delete
        expired ids, skip out-of-window ones, and re-crawl the rest.
        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = await tmp_sql_server.select_jumeiyoupin_pintuan_all_goods_id(
                logger=self.my_lg)
        except TypeError:
            self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            self.my_lg.info(result)
            self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                # item layout (from the select): [goods_id, pintuan_time_json, tab, index, url]
                pintuan_end_time = json.loads(item[1]).get('end_time')
                # '%Y-%m-%d %H:%M:%S' string -> 10-digit unix timestamp
                pintuan_end_time = int(
                    str(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)
                data = {}
                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    time_number = await self.is_recent_time(pintuan_end_time)
                    if time_number == 0:  # expired: delete
                        await tmp_sql_server.delete_jumeiyoupin_pintuan_expired_goods_id(
                            goods_id=item[0], logger=self.my_lg)
                        # NOTE(review): message labels this 结束时间 but formats
                        # begin_time — presumably a copy-paste slip; confirm
                        self.msg = '过期的goods_id为(%s)' % item[
                            0] + ', 拼团结束时间为(%s), 删除成功!' % str(
                                json.loads(item[1]).get('begin_time'))
                        self.my_lg.info(self.msg)
                    elif time_number == 2:
                        pass  # must be pass, not break: returned ids are not ordered
                    else:  # time_number == 1: within the update window
                        self.msg = '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                            item[0], str(index))
                        self.my_lg.info(self.msg)
                        data['goods_id'] = item[0]
                        jumeiyoupin_2 = JuMeiYouPinPinTuan(logger=self.my_lg)
                        _ = item[2] + '-' + str(
                            item[3])  # cache key, e.g. 'coutuan_baby-1'
                        item_list = self.api_all_goods_id.get(
                            _, [])  # reuse page list if tab/index already fetched
                        if item_list == []:
                            my_phantomjs = MyPhantomjs(
                                executable_path=PHANTOMJS_DRIVER_PATH)
                            item_list = await jumeiyoupin_2.get_one_page_goods_list(
                                my_phantomjs=my_phantomjs,
                                tab=item[2],
                                index=item[3])
                            try:
                                del my_phantomjs
                            except:
                                pass
                        if item_list == []:
                            self.my_lg.info('获取到的body为空str, 网络原因, 此处先跳过!')
                            pass
                        else:
                            if self.api_all_goods_id.get(_) is None:
                                self.api_all_goods_id[_] = item_list
                            pintuan_goods_all_goods_id = [
                                item_1.get('goods_id', '')
                                for item_1 in item_list
                            ]
                            jumeiyoupin_pintuan = JuMeiYouPinPinTuanParse(
                                logger=self.my_lg)
                            # internally delisted goods (in practice the site does
                            # not delist activity goods early)
                            if item[0] not in pintuan_goods_all_goods_id:
                                await self.update_data_2(
                                    jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                    jumei_pintuan_url=item[4],
                                    goods_id=item[0],
                                    pipeline=tmp_sql_server)
                            else:  # still listed
                                await self.update_data_1(
                                    jumeiyoupin_pintuan=jumeiyoupin_pintuan,
                                    jumeiyoupin_2=jumeiyoupin_2,
                                    jumei_pintuan_url=item[4],
                                    goods_id=item[0],
                                    item_list=item_list,
                                    pipeline=tmp_sql_server)
                else:
                    self.my_lg.error('数据库连接失败,此处跳过!')
                    pass
                index += 1
                gc.collect()
            self.my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updating after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        return None
def run_forever(self):
    '''
    Real-time refresh of the stored jumeiyoupin flash-sale (miaosha) goods.
    :return: None normally; False when session cookies cannot be obtained.
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=jm_delete_str_2)
        result = list(tmp_sql_server._select_table(sql_str=jm_select_str_1))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        # grab session cookies once via a throwaway phantomjs, then drop it
        my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://h5.jumei.com/')
        try:
            del my_phantomjs
        except:
            pass
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False
        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)
        for item in result:
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(
                str(time.mktime(
                    time.strptime(miaosha_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            data = {}
            # created fresh per iteration (and released by gc below) to keep memory low
            jumeiyoupin_miaosha = JuMeiYouPinParse()
            if index % 50 == 0:  # periodic reconnect guards against a stale connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:  # expired -> delete
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0]))
                    print('过期的goods_id为(%s)' % item[0],
                          ', 限时秒杀结束时间为(%s), 删除成功!'
                          % json.loads(item[1]).get('miaosha_end_time'))
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # pass, not break: ids from the DB are not ordered by time
                    pass
                else:  # within the update window
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                          % (item[0], index))
                    data['goods_id'] = item[0]
                    this_page_all_goods_list = self.get_one_page_all_goods_list(item[2])
                    if this_page_all_goods_list == '网络错误!':
                        print('网络错误!先跳过')
                        continue  # NOTE(review): this also skips index += 1 below
                    elif this_page_all_goods_list == []:
                        print('#### 该page对应得到的this_page_all_goods_list为空[]!')
                        print('** 该商品已被下架限时秒杀活动, 此处将其删除')
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0]))
                        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        pass
                    else:
                        # goods are never pulled early inside the sale window,
                        # so everything still listed simply gets its data updated
                        tmp_r = jumeiyoupin_miaosha.get_goods_id_from_url(item[3])
                        jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                        goods_data = jumeiyoupin_miaosha.deal_with_data()
                        if goods_data == {}:  # empty parse -> skip
                            pass
                        else:
                            goods_data['goods_id'] = str(item[0])
                            goods_data['miaosha_time'] = {
                                'miaosha_begin_time':
                                    goods_data['schedule'].get('begin_time', ''),
                                'miaosha_end_time':
                                    goods_data['schedule'].get('end_time', ''),
                            }
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                                get_miaosha_begin_time_and_miaosha_end_time(
                                    miaosha_time=goods_data['miaosha_time'])
                            jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                                data=goods_data, pipeline=tmp_sql_server)
                            sleep(JUMEIYOUPIN_SLEEP_TIME)
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    if get_shanghai_time().hour == 0:  # no updates after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
class JdCommentParse(object):
    '''Crawler/parser for goods comments on JD's mobile site (item.m.jd.com).'''

    def __init__(self, logger=None):
        self.result_data = {}
        self.msg = ''
        self._set_logger(logger)
        self._set_headers()
        self.comment_page_switch_sleep_time = 1.2  # pause between comment pages
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        self._add_headers_cookies()

    def _get_comment_data(self, goods_id):
        '''Fetch and normalize the first two comment pages of one goods_id.'''
        if goods_id == '':
            self.result_data = {}
            return {}
        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))
        self.goods_id = goods_id
        self.headers.update({
            'referer': 'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id),
        })
        # pull raw records from the mobile-site comment endpoint
        _tmp_comment_list = []
        for current_page in range(1, 3):
            _url = 'https://item.m.jd.com/newComments/newCommentsDetail.json'
            params = self._set_params(goods_id=goods_id, current_page=current_page)
            body = MyRequests.get_url_body(
                url=_url, headers=self.headers, params=params)
            _data = self._json_2_dict(body).get(
                'wareDetailComment', {}).get('commentInfoList', [])
            _tmp_comment_list += _data
            sleep(self.comment_page_switch_sleep_time)
        try:
            _comment_list = self._get_comment_list(
                _tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.my_lg.error('出错goods_id:{0}'.format(goods_id))
            self.my_lg.exception(e)
            self.result_data = {}
            return {}
        _t = datetime.datetime.now()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r
        return self.result_data

    def _get_comment_list(self, _tmp_comment_list):
        '''Map raw endpoint records into the required result-set shape.'''
        _comment_list = []
        for item in _tmp_comment_list:
            _comment_date = item.get('commentDate', '')
            assert _comment_date != '', '得到的_comment_date为空str!请检查!'
            # some comments carry no sku spec, so an empty sku_info is acceptable
            ware_attributes = item.get('wareAttributes', [])
            sku_info = ' '.join([
                i.get('key', '') + ':' + i.get('value', '')
                for i in ware_attributes
            ])
            _comment_content = item.get('commentData', '')
            assert _comment_content != '', '得到的评论内容为空str!请检查!'
            _comment_content = self._wash_comment(comment=_comment_content)
            buyer_name = item.get('userNickName', '')
            assert buyer_name != '', '得到的用户昵称为空值!请检查!'
            quantify = 1  # JD exposes no purchase quantity; default to 1
            head_img = item.get('userImgURL', '')
            assert head_img != '', '得到的用户头像为空值!请检查!'
            head_img = 'https://' + head_img
            # first-round comment pictures
            _comment_img_list = item.get('pictureInfoList', [])
            if _comment_img_list != []:
                _comment_img_list = [{
                    'img_url': img.get('largePicURL', '')
                } for img in _comment_img_list]
            append_comment = {}  # follow-up comment (none available from this endpoint)
            star_level = int(item.get('commentScore', '5'))
            if not filter_invalid_comment_content(_comment_content):
                continue
            comment = [{
                'comment': _comment_content,
                'comment_date': _comment_date,
                'sku_info': sku_info,
                'img_url_list': _comment_img_list,
                'star_level': star_level,
                'video': '',
            }]
            _comment_list.append({
                'buyer_name': buyer_name,        # buyer nickname
                'comment': comment,              # comment payload
                'quantify': quantify,            # purchase quantity
                'head_img': head_img,            # avatar url
                'append_comment': append_comment,  # follow-up comment
            })
        return _comment_list

    def _add_headers_cookies(self):
        '''Requests fail without cookies (the sid value in particular), so seed them.'''
        _cookies = self.my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://item.m.jd.com/')
        self.headers.update({
            'cookie': _cookies,
        })
        return None

    def _set_logger(self, logger):
        '''Use the supplied logger, or create a dated file logger.'''
        if logger is None:
            self.my_lg = set_logger(
                log_file_name=MY_SPIDER_LOGS_PATH + '/京东/comment/'
                              + str(get_shanghai_time())[0:10] + '.txt',
                console_log_level=INFO,
                file_log_level=ERROR)
        else:
            self.my_lg = logger

    def _set_headers(self):
        self.headers = {
            'origin': 'https://item.m.jd.com',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'content-type': 'application/x-www-form-urlencoded',
            'accept': 'application/json',
            'referer': 'https://item.m.jd.com/ware/view.action?wareId=5025518',
            'x-requested-with': 'XMLHttpRequest',
        }

    def _wash_comment(self, comment):
        '''Scrub brand mentions out of a comment string.'''
        comment = re.compile(r'jd|\n|Jd|JD').sub('', comment)
        comment = re.compile('京东').sub('优秀网', comment)
        return comment

    def _json_2_dict(self, json_str):
        '''json string -> dict; {} on parse failure.'''
        try:
            _ = json.loads(json_str)
        except:
            self.my_lg.error('json.loads转换json_str时出错! 出错goods_id: ' + self.goods_id)
            return {}
        return _

    def _set_params(self, goods_id, current_page):
        '''Build query params for the newCommentsDetail.json endpoint.'''
        _params = [
            ('wareId', goods_id),
            ('offset', str(current_page)),
            ('num', '10'),
            ('checkParam', 'LUIPPTP'),
            ('category', '670_671_1105'),
            ('isUseMobile', 'true'),
            ('evokeType', ''),
            ('type', '3'),  # '0' all comments | '3' positive only
            ('isCurrentSku', 'false'),
        ]
        return _params

    def __del__(self):
        try:
            del self.my_lg
            del self.my_phantomjs
            del self.headers
        except:
            pass
        gc.collect()
class MoGuJiePinTuanRealTimesUpdate(object):
    '''Real-time updater for mogujie pintuan goods stored in the DB.'''

    def __init__(self):
        self._set_headers()
        self.delete_sql_str = mg_delete_str_1

    def _set_headers(self):
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'list.mogujie.com',
            'User-Agent': get_random_pc_ua(),  # randomized UA per instance
        }

    def run_forever(self):
        '''
        Real-time refresh of all stored pintuan goods: delete the expired,
        skip the future, update everything inside the update window.
        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            tmp_sql_server._delete_table(sql_str=mg_delete_str_2)
            result = list(tmp_sql_server._select_table(sql_str=mg_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
            for item in result:
                pintuan_end_time = json.loads(item[1]).get('end_time')
                pintuan_end_time = int(
                    str(time.mktime(
                        time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
                data = {}
                mogujie_pintuan = MoGuJieParse()
                if index % 8 == 0:  # recycle phantomjs regularly to avoid bloat/hangs
                    try:
                        del self.my_phantomjs
                    except:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs(
                        executable_path=PHANTOMJS_DRIVER_PATH)
                if index % 50 == 0:  # periodic reconnect guards against a stale connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(pintuan_end_time) == 0:  # expired -> delete
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0]))
                        print('过期的goods_id为(%s)' % item[0],
                              ', 拼团开始时间为(%s), 删除成功!'
                              % json.loads(item[1]).get('begin_time'))
                    elif self.is_recent_time(pintuan_end_time) == 2:
                        # pass, not break: ids from the DB are not ordered by time
                        pass
                    else:  # within the update window
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                              % (item[0], index))
                        data['goods_id'] = item[0]
                        tmp_url = ('http://list.mogujie.com/search?page={0}&fcid={1}'
                                   '&algoKey=pc_tuan_book_pop&cKey=pc-tuan').format(
                            item[3], item[2])
                        # plain requests fail here (certificate handling), so phantomjs it is
                        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
                        if body == '':
                            print('获取到的body为空值! 此处跳过')
                        else:
                            try:
                                body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                                tmp_data = json.loads(body)
                            except:
                                print('json.loads转换body时出错, 请检查')
                                tmp_data = {}
                            if tmp_data.get('result', {}).get('wall', {}).get('docs', []) == []:
                                print('得到的docs为[]!')
                                print('该商品已被下架限时秒杀活动,此处将其删除')
                                tmp_sql_server._delete_table(
                                    sql_str=self.delete_sql_str, params=(item[0]))
                                print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                pass
                            else:
                                tmp_item_list = tmp_data.get('result', {}).get(
                                    'wall', {}).get('docs', [])
                                begin_time_timestamp = int(time.time())  # pintuan start
                                item_list = [{
                                    'goods_id': doc.get('tradeItemId', ''),
                                    'pintuan_time': {
                                        'begin_time': timestamp_to_regulartime(
                                            timestamp=begin_time_timestamp),
                                        'end_time': timestamp_to_regulartime(
                                            self.get_pintuan_end_time(
                                                begin_time_timestamp,
                                                doc.get('leftTimeOrg', ''))),
                                    },
                                    'all_sell_count': str(doc.get('salesVolume', 0)),
                                } for doc in tmp_item_list]
                                pintuan_goods_all_goods_id = [
                                    item_1.get('goods_id', '') for item_1 in item_list]
                                # internally delisted goods are actually still on sale,
                                # so update their data without touching the schedule
                                if item[0] not in pintuan_goods_all_goods_id:
                                    mogujie_pintuan.get_goods_data(goods_id=item[0])
                                    goods_data = mogujie_pintuan.deal_with_data()
                                    if goods_data == {}:
                                        pass
                                    else:
                                        print('+++ 内部下架,其实还在售卖的商品更新')
                                        goods_data['goods_id'] = item[0]
                                        goods_data['price_info_list'] = \
                                            _get_mogujie_pintuan_price_info_list(
                                                goods_data['price_info_list'])
                                        mogujie_pintuan.update_mogujie_pintuan_table_2(
                                            data=goods_data, pipeline=tmp_sql_server)
                                        sleep(MOGUJIE_SLEEP_TIME)  # throttle
                                else:  # still listed
                                    for item_2 in item_list:
                                        if item_2.get('goods_id', '') == item[0]:
                                            mogujie_pintuan.get_goods_data(goods_id=item[0])
                                            goods_data = mogujie_pintuan.deal_with_data()
                                            if goods_data == {}:
                                                pass
                                            else:
                                                goods_data['goods_id'] = item[0]
                                                goods_data['price_info_list'] = \
                                                    _get_mogujie_pintuan_price_info_list(
                                                        goods_data['price_info_list'])
                                                goods_data['pintuan_time'] = item_2.get(
                                                    'pintuan_time', {})
                                                goods_data['pintuan_begin_time'], \
                                                    goods_data['pintuan_end_time'] = \
                                                    get_miaosha_begin_time_and_miaosha_end_time(
                                                        miaosha_time=goods_data['pintuan_time'])
                                                goods_data['all_sell_count'] = item_2.get(
                                                    'all_sell_count', '')
                                                mogujie_pintuan.update_mogujie_pintuan_table(
                                                    data=goods_data, pipeline=tmp_sql_server)
                                                sleep(MOGUJIE_SLEEP_TIME)  # throttle
                                        else:
                                            pass
                else:
                    print('数据库连接失败,此处跳过!')
                    pass
                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

    def get_pintuan_end_time(self, begin_time, left_time):
        '''
        Derive the pintuan end timestamp from a begin timestamp plus the
        human-readable remaining time, e.g. '6天13小时', '13小时57分', '36分'.
        :param begin_time: start timestamp (int)
        :param left_time: remaining-time string
        :return: end timestamp (int)
        '''
        had_day = re.compile(r'天').findall(left_time)
        had_hour = re.compile(r'小时').findall(left_time)
        had_min = re.compile(r'分').findall(left_time)
        tmp = re.compile(r'\d+').findall(left_time)
        if had_day != [] and had_hour != []:      # '6天13小时'
            days, hours, minutes = int(tmp[0]), int(tmp[1]), 0
        elif had_day == [] and had_hour != []:    # '13小时57分'
            days, hours, minutes = 0, int(tmp[0]), int(tmp[1])
        elif had_day == [] and had_hour == []:    # '36分'
            print('left_time = ', left_time)
            days, hours, minutes = 0, 0, int(tmp[0])
        else:                                     # nothing parsable
            print('day, hour, min = 0, 0, 0', 'left_time = ', left_time)
            days, hours, minutes = 0, 0, 0
        left_end_time_timestamp = days * 24 * 60 * 60 \
            + hours * 60 * 60 \
            + minutes * 60
        return begin_time + left_end_time_timestamp

    def is_recent_time(self, timestamp):
        '''
        Classify a timestamp relative to now.
        :param timestamp: unix timestamp
        :return: 0 expired (restore price), 1 inside the update window, 2 future/waiting
        '''
        time_1 = int(timestamp)
        time_2 = int(datetime_to_timestamp(get_shanghai_time()))  # current ts
        diff_time = time_1 - time_2
        # 24h grace period so the backend can sync the takedown first
        if diff_time < -86400:
            return 0
        elif diff_time > 0:
            return 1
        else:
            # expired but still inside the grace window: leave it alone for now
            return 2

    def __del__(self):
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
def run_forever(self):
    '''
    Real-time refresh of all stored mogujie pintuan goods: delete expired
    entries, skip future ones, update everything inside the update window.
    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=mg_delete_str_2)
        result = list(tmp_sql_server._select_table(sql_str=mg_select_str_2))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        for item in result:
            pintuan_end_time = json.loads(item[1]).get('end_time')
            pintuan_end_time = int(
                str(time.mktime(
                    time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            data = {}
            mogujie_pintuan = MoGuJieParse()
            if index % 8 == 0:  # recycle phantomjs regularly to avoid bloat/hangs
                try:
                    del self.my_phantomjs
                except:
                    pass
                gc.collect()
                self.my_phantomjs = MyPhantomjs(
                    executable_path=PHANTOMJS_DRIVER_PATH)
            if index % 50 == 0:  # periodic reconnect guards against a stale connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(pintuan_end_time) == 0:  # expired -> delete
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0]))
                    print('过期的goods_id为(%s)' % item[0],
                          ', 拼团开始时间为(%s), 删除成功!'
                          % json.loads(item[1]).get('begin_time'))
                elif self.is_recent_time(pintuan_end_time) == 2:
                    # pass, not break: ids from the DB are not ordered by time
                    pass
                else:  # within the update window
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                          % (item[0], index))
                    data['goods_id'] = item[0]
                    tmp_url = ('http://list.mogujie.com/search?page={0}&fcid={1}'
                               '&algoKey=pc_tuan_book_pop&cKey=pc-tuan').format(
                        item[3], item[2])
                    # plain requests fail here (certificate handling), so phantomjs it is
                    body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
                    if body == '':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                            tmp_data = json.loads(body)
                        except:
                            print('json.loads转换body时出错, 请检查')
                            tmp_data = {}
                        if tmp_data.get('result', {}).get('wall', {}).get('docs', []) == []:
                            print('得到的docs为[]!')
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            tmp_sql_server._delete_table(
                                sql_str=self.delete_sql_str, params=(item[0]))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass
                        else:
                            tmp_item_list = tmp_data.get('result', {}).get(
                                'wall', {}).get('docs', [])
                            begin_time_timestamp = int(time.time())  # pintuan start
                            item_list = [{
                                'goods_id': doc.get('tradeItemId', ''),
                                'pintuan_time': {
                                    'begin_time': timestamp_to_regulartime(
                                        timestamp=begin_time_timestamp),
                                    'end_time': timestamp_to_regulartime(
                                        self.get_pintuan_end_time(
                                            begin_time_timestamp,
                                            doc.get('leftTimeOrg', ''))),
                                },
                                'all_sell_count': str(doc.get('salesVolume', 0)),
                            } for doc in tmp_item_list]
                            pintuan_goods_all_goods_id = [
                                item_1.get('goods_id', '') for item_1 in item_list]
                            # internally delisted goods are actually still on sale,
                            # so update their data without touching the schedule
                            if item[0] not in pintuan_goods_all_goods_id:
                                mogujie_pintuan.get_goods_data(goods_id=item[0])
                                goods_data = mogujie_pintuan.deal_with_data()
                                if goods_data == {}:
                                    pass
                                else:
                                    print('+++ 内部下架,其实还在售卖的商品更新')
                                    goods_data['goods_id'] = item[0]
                                    goods_data['price_info_list'] = \
                                        _get_mogujie_pintuan_price_info_list(
                                            goods_data['price_info_list'])
                                    mogujie_pintuan.update_mogujie_pintuan_table_2(
                                        data=goods_data, pipeline=tmp_sql_server)
                                    sleep(MOGUJIE_SLEEP_TIME)  # throttle
                            else:  # still listed
                                for item_2 in item_list:
                                    if item_2.get('goods_id', '') == item[0]:
                                        mogujie_pintuan.get_goods_data(goods_id=item[0])
                                        goods_data = mogujie_pintuan.deal_with_data()
                                        if goods_data == {}:
                                            pass
                                        else:
                                            goods_data['goods_id'] = item[0]
                                            goods_data['price_info_list'] = \
                                                _get_mogujie_pintuan_price_info_list(
                                                    goods_data['price_info_list'])
                                            goods_data['pintuan_time'] = item_2.get(
                                                'pintuan_time', {})
                                            goods_data['pintuan_begin_time'], \
                                                goods_data['pintuan_end_time'] = \
                                                get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=goods_data['pintuan_time'])
                                            goods_data['all_sell_count'] = item_2.get(
                                                'all_sell_count', '')
                                            mogujie_pintuan.update_mogujie_pintuan_table(
                                                data=goods_data, pipeline=tmp_sql_server)
                                            sleep(MOGUJIE_SLEEP_TIME)  # throttle
                                    else:
                                        pass
            else:
                print('数据库连接失败,此处跳过!')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    if get_shanghai_time().hour == 0:  # no updates after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
class JuMeiYouPinSpike(object):
    '''Crawler for jumei youpin (h5.jumei.com) time-limited flash-sale goods.'''

    def __init__(self):
        self._set_headers()

    def _set_headers(self):
        self.headers = {
            'Accept': 'application/json,text/javascript,text/plain,*/*;q=0.01',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'h5.jumei.com',
            'Referer': 'https://h5.jumei.com/',
            'Cache-Control': 'max-age=0',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': get_random_pc_ua(),  # randomized UA per instance
        }

    def _crawl_deal_list(self, sale_type, page_key, all_goods_list):
        '''
        Page through the ajaxDealactList endpoint (max 49 pages) and append
        every not-yet-seen item, tagged with the page it was found on.
        :param sale_type: 'formal' (on sale) or 'pre' (presale)
        :param page_key: endpoint page_key for that sale type
        :param all_goods_list: accumulator list, mutated in place
        :return: None
        '''
        for page in range(1, 50):
            tmp_url = ('https://h5.jumei.com/index/ajaxDealactList?card_id=4057'
                       '&page={0}&platform=wap&type={1}&page_key={2}').format(
                str(page), sale_type, page_key)
            print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
            try:
                json_body = json.loads(body)
            except:
                print('json.loads转换body时出错!请检查')
                json_body = {}
                pass
            this_page_item_list = json_body.get('item_list', [])
            if this_page_item_list == []:  # empty page -> done
                print('@@@@@@ 所有接口数据抓取完毕 !')
                break
            for item in this_page_item_list:
                if item.get('item_id', '') not in [
                        item_1.get('item_id', '') for item_1 in all_goods_list]:
                    item['page'] = page
                    all_goods_list.append(item)
            sleep(.5)

    def get_spike_hour_goods_info(self):
        '''
        Build the data urls and collect all recent flash-sale goods.
        :return: True on success, False when cookies cannot be obtained
        '''
        all_goods_list = []
        # grab session cookies via a throwaway phantomjs, then drop it
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        cookies = self.my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://h5.jumei.com/')
        try:
            del self.my_phantomjs
        except:
            pass
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False
        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)
        print('开始抓取在售商品...')
        self._crawl_deal_list('formal', '1521336720', all_goods_list)
        print('开始抓取预售商品...')
        self._crawl_deal_list('pre', '1521858480', all_goods_list)
        all_goods_list = [{
            'goods_id': str(item.get('item_id', '')),
            'type': item.get('type', ''),
            'page': item.get('page')
        } for item in all_goods_list if item.get('item_id') is not None]
        print(all_goods_list)
        print('本次抓取到共有限时商品个数为: ', all_goods_list.__len__())
        self.deal_with_data(all_goods_list)
        return True

    def deal_with_data(self, *params):
        '''
        Parse and store the collected flash-sale goods.
        :param params: params[0] is the goods list from get_spike_hour_goods_info
        :return: None
        '''
        item_list = params[0]
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            db_goods_id_list = [
                item[0] for item in list(
                    my_pipeline._select_table(sql_str=jm_select_str_2))
            ]
            for item in item_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass
                else:
                    jumei = JuMeiYouPinParse()
                    goods_id = item.get('goods_id', '')
                    goods_type = item.get('type', '')
                    tmp_url = 'https://h5.jumei.com/product/detail?item_id={0}&type={1}'.format(
                        goods_id, goods_type)
                    jumei.get_goods_data(goods_id=[goods_id, goods_type])
                    goods_data = jumei.deal_with_data()
                    if goods_data == {}:
                        pass
                    elif goods_data.get('is_delete', 0) == 1:  # sold out
                        print('------>>>| 该商品库存为0,已被抢光!')
                        pass
                    else:  # parse succeeded -> insert
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time':
                                goods_data['schedule'].get('begin_time', ''),
                            'miaosha_end_time':
                                goods_data['schedule'].get('end_time', ''),
                        }
                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                            get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['miaosha_time'])
                        goods_data['page'] = item.get('page')
                        jumei.insert_into_jumeiyoupin_xianshimiaosha_table(
                            data=goods_data, pipeline=my_pipeline)
                        sleep(JUMEIYOUPIN_SLEEP_TIME)  # throttle
                    try:
                        del jumei
                    except:
                        pass
        else:
            print('数据库连接失败,此处跳过!')
            pass
        gc.collect()

    def __del__(self):
        gc.collect()
def __init__(self):
    # Prepare the request headers first, then start the PhantomJS
    # session this spider uses for all page fetches.
    self._set_headers()
    self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
class WMYHQSpider(object):
    '''Spider for quanmama takeaway coupons: fetch the list, then save each coupon QR code.'''

    def __init__(self):
        self._set_headers()
        self.page_sleep_time = 1.2
        self.phantomjs_sleep_time = 2
        # load_images must be True, otherwise the QR-code image never renders
        self.my_phantomjs = MyPhantomjs(load_images=True)
        self.qrcode_base_path = '/Users/afa/myFiles/tmp/外卖券qrcode/'

    def _set_headers(self):
        self.headers = {
            'Accept': '*/*',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept-Encoding': 'br, gzip, deflate',
            'Host': 'app.quanmama.com',
            'User-Agent': get_random_phone_ua(),
            'Content-Length': '885',
            'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
        }

    def _get_wm_page_info(self):
        '''
        Fetch the takeaway-coupon recommendation feed (json) page by page.
        :return: None
        '''
        all_rows = []
        print('开始采集券妈妈外卖券!')
        for page_index in range(1, 5):
            print('正在抓取第{0}页...'.format(page_index))
            data = self._set_data(page_index=page_index)
            url = 'https://app.quanmama.com/apios/v5/appZdmList.ashx'
            body = MyRequests.get_url_body(
                method='post', url=url, headers=self.headers,
                cookies=None, data=data)
            if body == '':
                print('获取到的body为空值!此处跳过!')
                continue
            rows = json_2_dict(json_str=body).get('data', {}).get('rows', [])
            if rows == []:
                print('得到的rows为空值!此处跳过!')
                continue
            all_rows += rows
            sleep(self.page_sleep_time)
        print('\n@@@@@@ 抓取完毕!')
        wm_list = self._parse_wm_page(all_rows)
        self._deal_with_wm_info(wm_list)

    def _deal_with_wm_info(self, wm_list):
        '''
        For every coupon article: open the page in phantomjs, click through to
        the coupon, and save its base64 QR code as a local png.
        :param wm_list: list of dicts from _parse_wm_page
        :return: None
        '''
        # wipe yesterday's QR codes first
        os.system('cd {0} && rm -rf *'.format(self.qrcode_base_path))
        for item in wm_list:
            print('正在处理文章id: {0}'.format(item.get('article_id')))
            exec_code = '''
self.driver.find_element_by_css_selector('div.go-action a').send_keys(Keys.ENTER)
sleep({0})
'''.format(self.phantomjs_sleep_time)
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                url=item.get('article_link', ''), exec_code=exec_code)
            # target element: div.appcoupon-qrcode img
            qrcode_str = Selector(text=body).css(
                'div.appcoupon-qrcode img::attr("src")').extract_first()
            img_file_name = '[代码{0}]'.format(item.get('article_id', '')) + \
                            item.get('article_title', '') + '@' + \
                            item.get('article_vicetitle', '') + '.png'
            save_path = self.qrcode_base_path + img_file_name
            result = save_base64_img_2_local(
                save_path=save_path, base64_img_str=qrcode_str)
            if result:
                print('[+] {0}'.format(img_file_name))
            else:
                print('[-] {0}'.format(img_file_name))
            sleep(self.page_sleep_time)
        print('@@@ 抓取二维码操作完成!')
        return None

    def _parse_wm_page(self, rows):
        '''
        Validate raw feed rows and keep only non-expired coupons.
        :param rows: raw row dicts from the feed
        :return: list of cleaned coupon dicts
        '''
        _ = []
        for item in rows:
            try:
                article_is_timeout = item.get('article_is_timeout')
                assert article_is_timeout is not None, 'article_is_timeout为None!'
                if article_is_timeout == 1:  # 0 alive; 1 expired
                    continue
                article_id = item.get('article_id')
                assert article_id is not None, 'article_id为空!'
                article_mall = item.get('article_mall', '')
                assert article_mall != '', 'article_mall为空值!'
                article_pic = item.get('article_pic', '')
                assert article_pic != '', 'article_pic为空值!'
                article_vicetitle = self.replace_chinese_str(
                    item.get('article_vicetitle', ''))
                assert article_vicetitle != '', 'article_vicetitle为空值!'
                article_title = self.replace_chinese_str(
                    item.get('article_title', ''))
                assert article_title != '', 'article_title为空值!'
                article_link = item.get('article_link', '')
                assert article_link != '', 'article_link为空值!'
                article_begin_time = item.get('article_begintime', '')
                assert article_begin_time != '', 'article_begin_time为空值!'
                article_end_time = item.get('article_endtime', '')
                assert article_end_time != '', 'article_end_time为空值!'
            except Exception as e:
                print('遇到错误:', e)
                continue
            _.append({
                'article_id': article_id,                # article id
                'article_mall': article_mall,            # source mall / age info
                'article_pic': article_pic,              # thumbnail url
                'article_vicetitle': article_vicetitle,  # subtitle
                'article_title': article_title,          # title
                'article_link': article_link,            # article link
                'article_begin_time': article_begin_time,  # activity start
                'article_end_time': article_end_time,      # activity end
            })
        return _

    def replace_chinese_str(self, data):
        '''
        Replace Chinese punctuation that is unsafe in file names.
        :param data: input string
        :return: cleaned string
        '''
        return data.replace(':', ':').replace('、', ',') \
                   .replace(',', ',').replace('/', '|')

    def _set_data(self, page_index):
        '''
        Build the POST payload for the appZdmList endpoint.
        NOTE(review): device-identifying fields (imei/usertoken/userphonename)
        are intentionally omitted from the payload.
        :param page_index: 1-based page number
        :return: list of (key, value) tuples
        '''
        data = [
            ('AgeType', '2'),
            ('ProfessionType', '2'),
            ('SexType', '1'),
            ('appname', '券妈妈'),
            ('category', '5391'),
            ('code', '532'),
            ('devicename', 'iOS'),
            ('f', 'ios'),
            ('identifiernumber', 'F037B84D-A211-44B3-BA56-D5033A1328D4'),
            ('isiosmajia', '0'),
            ('localScheme', 'qmm'),
            ('logintype', '4'),
            ('mac', '02:00:00:00:00:00'),
            ('net', '2'),
            ('pageindex', str(page_index)),
            ('phonemodel', 'iPhone'),
            ('phoneversion', '11.0'),
            ('platform', 'App Store'),
            ('rtime', '0_'),
            ('sort', '1'),
            ('test', '0'),
            ('v', '5.3.2'),
        ]
        return data

    def __del__(self):
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
def get_spike_hour_goods_info(self): ''' 模拟构造得到data的url,得到近期所有的限时秒杀商品信息 :return: ''' all_goods_list = [] self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH) cookies = self.my_phantomjs.get_url_cookies_from_phantomjs_session( url='https://h5.jumei.com/') try: del self.my_phantomjs except: pass if cookies == '': print('!!! 获取cookies失败 !!!') return False print('获取cookies成功!') self.headers.update(Cookie=cookies) print('开始抓取在售商品...') for page in range(1, 50): # 1, 开始 tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=formal&page_key=1521336720'.format( str(page)) print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url) body = MyRequests.get_url_body(url=tmp_url, headers=self.headers) # print(body) try: json_body = json.loads(body) # print(json_body) except: print('json.loads转换body时出错!请检查') json_body = {} pass this_page_item_list = json_body.get('item_list', []) if this_page_item_list == []: print('@@@@@@ 所有接口数据抓取完毕 !') break for item in this_page_item_list: if item.get('item_id', '') not in [ item_1.get('item_id', '') for item_1 in all_goods_list ]: item['page'] = page all_goods_list.append(item) sleep(.5) print('开始抓取预售商品...') for page in range(1, 50): # 1, 开始 tmp_url = 'https://h5.jumei.com/index/ajaxDealactList?card_id=4057&page={0}&platform=wap&type=pre&page_key=1521858480'.format( str(page)) print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url) body = MyRequests.get_url_body(url=tmp_url, headers=self.headers) # print(body) try: json_body = json.loads(body) # print(json_body) except: print('json.loads转换body时出错!请检查') json_body = {} pass this_page_item_list = json_body.get('item_list', []) if this_page_item_list == []: print('@@@@@@ 所有接口数据抓取完毕 !') break for item in this_page_item_list: if item.get('item_id', '') not in [ item_1.get('item_id', '') for item_1 in all_goods_list ]: item['page'] = page all_goods_list.append(item) sleep(.5) all_goods_list = [{ 'goods_id': str(item.get('item_id', '')), 'type': item.get('type', 
''), 'page': item.get('page') } for item in all_goods_list if item.get('item_id') is not None] print(all_goods_list) print('本次抓取到共有限时商品个数为: ', all_goods_list.__len__()) self.deal_with_data(all_goods_list) return True
def __init__(self): super().__init__() self._set_headers() self.result_data = {} self.is_activity_goods = False self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
class BaseFund(object):
    '''
    天天基金 (fund.eastmoney.com) crawler: fetches the open-fund ranking pages,
    pulls each fund's detail js blob, and plots its unit-net-worth curve to a
    local png under base_path.
    '''
    def __init__(self, base_path='/Users/afa/myFiles/tmp/基金/伪好基/'):
        '''
        :param base_path: directory where the fund plot images are stored
        '''
        self.page_num_start = 1     # first page of the open-fund ranking
        self.page_num_end = 3       # stop page (exclusive, see range() below)
        self.CRAWL_FUND_TIME = 1.5  # sleep time between crawling each fund
        self.plot_pic = None
        self.base_path = base_path
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_PATH)

    def _get_rank_fund_info(self):
        '''
        Fetch the fund ranking pages from 天天基金 (rankhandler.aspx).
        :return: a list of per-fund dicts; [] if a page yields no data
        '''
        rank_fund_list = []
        for page_num in range(self.page_num_start, self.page_num_end):
            print('正在抓取第{0}页的基金信息...'.format(page_num))
            # Session cookies captured from a browser visit (required by the api).
            cookies = {
                'st_pvi': '11586003301354',
                'EMFUND1': 'null',
                'EMFUND0': 'null',
                'EMFUND2': '07-10%2018%3A01%3A38@%23%24%u534E%u6DA6%u5143%u5927%u73B0%u91D1%u901A%u8D27%u5E01B@%23%24002884',
                'EMFUND3': '07-10%2018%3A01%3A48@%23%24%u5929%u5F18%u73B0%u91D1%u7BA1%u5BB6%u8D27%u5E01B@%23%24420106',
                'EMFUND4': '07-10%2018%3A11%3A53@%23%24%u65B9%u6B63%u5BCC%u90A6%u4FDD%u9669%u4E3B%u9898%u6307%u6570%u5206%u7EA7@%23%24167301',
                'EMFUND5': '07-10%2018%3A04%3A32@%23%24%u62DB%u5546%u4E2D%u8BC1%u94F6%u884C%u6307%u6570%u5206%u7EA7@%23%24161723',
                'EMFUND6': '07-10%2018%3A05%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570C@%23%24001595',
                'EMFUND7': '07-10%2018%3A06%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570A@%23%24001594',
                'st_si': '38764934559714',
                'ASP.NET_SessionId': 'hqeo1xk5oqgwb0cqzxicytda',
                'EMFUND8': '07-11 11:28:55@#$%u7533%u4E07%u83F1%u4FE1%u591A%u7B56%u7565%u7075%u6D3B%u914D%u7F6E%u6DF7%u5408A@%23%24001148',
                'EMFUND9': '07-11 11:28:55@#$%u5E7F%u53D1%u751F%u7269%u79D1%u6280%u6307%u6570%28QDII%29@%23%24001092',
            }
            headers = {
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
                'Accept': '*/*',
                # 'Referer': 'http://fund.eastmoney.com/data/fundranking.html',
                'Proxy-Connection': 'keep-alive',
            }
            # Ranking window: one year back from today (Shanghai time).
            end_date = str(get_shanghai_time())[:10]
            start_date = str(
                datetime.datetime(year=get_shanghai_time().year - 1,
                                  month=get_shanghai_time().month,
                                  day=get_shanghai_time().day))[:10]
            print('开始时间: {0}, 结束时间: {1}'.format(start_date, end_date))
            params = (
                ('op', 'ph'),
                ('dt', 'kf'),
                ('ft', 'all'),
                ('rs', ''),
                ('gs', '0'),
                ('sc', 'zzf'),
                ('st', 'desc'),
                ('sd', start_date),  # e.g. '2017-07-10'
                ('ed', end_date),    # e.g. '2018-07-10'
                ('qdii', ''),
                ('tabSubtype', ',,,,,'),
                ('pi', str(page_num)),  # page index of the rank data
                ('pn', '50'),
                ('dx', '1'),
                # ('v', '0.5290053467389759'),
            )
            url = 'http://fund.eastmoney.com/data/rankhandler.aspx'
            # TODO plain requests gets a 502 from this endpoint
            # body = MyRequests.get_url_body(url=url, headers=headers, params=params, cookies=None)
            # print(body)

            # Fetch through phantomjs instead.
            body = self.my_phantomjs.get_url_body(
                url=_get_url_contain_params(url, params))
            try:
                body = re.compile('<body>(.*)</body>').findall(body)[0]
                this_page_rank_data = re.compile(r'rankData = (.*);').findall(
                    body)[0]
                # print(this_page_rank_data)
            except IndexError:
                print('在获取this_page_rank_data时索引异常!请检查!')
                continue

            # json.loads raises: Expecting property name enclosed in double
            # quotes: line 1 column 2 (char 1)
            # Workaround: decode the non-standard js object with demjson.
            this_page_rank_data = demjson.decode(this_page_rank_data).get(
                'datas', {})
            # pprint(this_page_rank_data)
            if this_page_rank_data == {}:
                return []

            # Each item is a comma-separated record; split positionally.
            for item in this_page_rank_data:
                _i = item.split(',')
                rank_fund_list.append({
                    '基金代码': _i[0],
                    '基金简称': _i[1],
                    '当天日期': _i[3],
                    '单位净值': _i[4],
                    '累计净值': _i[5],
                    '日增长率': _i[6],
                    '近1周': _i[7],
                    '近1月': _i[8],
                    '近3月': _i[9],
                    '近6月': _i[10],
                    '近1年': _i[11],
                    '近2年': _i[12],
                    '近3年': _i[13],
                    '今年来': _i[14],
                    '成立来': _i[15],
                    '手续费': _i[20],
                })
            sleep(2.5)

        print('\n抓取完毕!\n')
        # pprint(rank_fund_list)

        return rank_fund_list

    def _deal_with_rank_fund_info(self):
        '''
        Iterate the ranked funds and process each one (fetch detail + plot).
        :return: True
        '''
        rank_fund_list = self._get_rank_fund_info()
        for item in rank_fund_list:
            fund_code = item.get('基金代码', '')
            print('正在处理基金代码: {0}...'.format(fund_code))
            self._get_one_fund_info(fund_code=fund_code)
            sleep(self.CRAWL_FUND_TIME)

        print('\n@@@ 所有操作完成!\n')

        return True

    def _get_one_fund_info(self, fund_code):
        '''
        Fetch one fund's detail js blob (pingzhongdata) and process it.
        :param fund_code: the fund's code string
        :return: True
        '''
        # Cookie pairs below keep the original url-encoded values as comments
        # above their decoded equivalents.
        cookies = {
            'st_pvi': '11586003301354',
            'st_si': '46806950936799',
            'ASP.NET_SessionId': 'fhllwae2zicg00o0x4ub1fxs',
            'EMFUND1': 'null',
            'EMFUND0': 'null',
            # 'EMFUND2': '07-10%2018%3A01%3A38@%23%24%u534E%u6DA6%u5143%u5927%u73B0%u91D1%u901A%u8D27%u5E01B@%23%24002884',
            'EMFUND2': '07-10 18:01:38@#$华润元大现金通货币B@#$002884',
            # 'EMFUND3': '07-10%2018%3A01%3A48@%23%24%u5929%u5F18%u73B0%u91D1%u7BA1%u5BB6%u8D27%u5E01B@%23%24420106',
            'EMFUND3': '07-10 18:01:48@#$天弘现金管家货币B@#$420106',
            # 'EMFUND4': '07-10%2018%3A11%3A53@%23%24%u65B9%u6B63%u5BCC%u90A6%u4FDD%u9669%u4E3B%u9898%u6307%u6570%u5206%u7EA7@%23%24167301',
            'EMFUND4': '07-10 18:11:53@#$方正富邦保险主题指数分级@#$167301',
            # 'EMFUND5': '07-10%2018%3A04%3A32@%23%24%u62DB%u5546%u4E2D%u8BC1%u94F6%u884C%u6307%u6570%u5206%u7EA7@%23%24161723',
            'EMFUND5': '07-10 18:04:32@#$招商中证银行指数分级@#$161723',
            # 'EMFUND6': '07-10%2018%3A05%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570C@%23%24001595',
            'EMFUND6': '07-10 18:05:13@#$天弘中证银行指数C@#$001595',
            # 'EMFUND7': '07-10%2018%3A06%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570A@%23%24001594',
            'EMFUND7': '07-10 18:06:13@#$天弘中证银行指数A@#$001594',
            # 'EMFUND8': '07-10%2018%3A11%3A22@%23%24%u7533%u4E07%u83F1%u4FE1%u591A%u7B56%u7565%u7075%u6D3B%u914D%u7F6E%u6DF7%u5408A@%23%24001148',
            'EMFUND8': '07-10 18:11:22@#$申万菱信多策略灵活配置混合A@#$001148',
            # 'EMFUND9': '07-10 18:12:26@#$%u5E7F%u53D1%u751F%u7269%u79D1%u6280%u6307%u6570%28QDII%29@%23%24001092',
            'EMFUND9': '07-10 18:12:26@#$广发生物科技指数(QDII)@#$001092',
        }
        cookies = unquote_cookies(cookies)
        # pprint(cookies)
        headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': '*/*',
            # 'Referer': 'http://fund.eastmoney.com/001092.html',
            'Proxy-Connection': 'keep-alive',
        }
        # Cache-busting timestamp param:
        # 2018-07-10 18:30:46 -> 20180710183046
        v = re.compile(r'-| |:').sub('', str(get_shanghai_time()))
        # print(v)
        params = (
            # ('v', '20180710175951'),  # time
            ('v', v),  # time
        )
        fund_url = 'http://fund.eastmoney.com/pingzhongdata/{0}.js'.format(
            fund_code)
        # response = requests.get(fund_url, headers=headers, params=params, cookies=None)
        # body = response.text
        # print(body)
        # body = MyRequests.get_url_body(url=fund_url, headers=headers, params=params, cookies=None)
        # print(body)
        body = self.my_phantomjs.get_url_body(
            url=_get_url_contain_params(fund_url, params))
        # print(body)
        self._get_this_fund_info(body=body)

        return True

    def _get_this_fund_info(self, body):
        '''
        Extract one fund's fields from its pingzhongdata js body via regexes
        and plot the unit-net-worth trend.
        :param body: the raw js text of the fund's pingzhongdata file
        :return: None
        '''
        try:
            # fund name
            fund_name = re.compile(r'fS_name = "(.*?)";').findall(body)[0]
            # fund code
            fund_code = re.compile(r'fS_code = "(.*?)";').findall(body)[0]
            print('基金名: {0}, 基金代码: {1}'.format(fund_name, fund_code))

            # purchase fee rate
            fund_source_rate = re.compile(r'fund_sourceRate="(.*?)";').findall(
                body)[0]
            # current fee rate
            fund_rate = re.compile('fund_Rate="(.*?)";').findall(body)[0]
            # minimum purchase amount
            fund_minsg = re.compile(r'fund_minsg="(.*?)";').findall(body)[0]
            print('购买手续费: {0}%, 现费率: {1}%, 最小起购金额: {2}RMB'.format(
                fund_source_rate, fund_rate, fund_minsg))

            '''收益率'''
            # 1-year return rate
            syl_1n = re.compile(r'syl_1n="(.*?)";').findall(body)[0]
            # 6-month return rate
            syl_6y = re.compile(r'syl_6y="(.*?)";').findall(body)[0]
            # 3-month return rate
            syl_3y = re.compile(r'syl_3y="(.*?)";').findall(body)[0]
            # 1-month return rate
            syl_1y = re.compile(r'syl_1y="(.*?)";').findall(body)[0]
            msg = '@@收益率:\n\t近1年: {0}%, 近6月: {1}%, 近3月: {2}%, 近1月: {3}%'.format(
                syl_1n, syl_6y, syl_3y, syl_1y)
            print(msg)

            # Unit net-worth trend (equityReturn = net-worth return,
            # unitMoney = dividend per unit).
            data_net_worth_trend = json_2_dict(
                re.compile(r'Data_netWorthTrend = (.*?);').findall(body)[0])
            # pprint(data_net_worth_trend)
            # print('单位净值走势: {0}'.format(data_net_worth_trend))
            self._deal_with_data_net_worth_trend(
                fund_name=fund_name,
                fund_code=fund_code,
                data_net_worth_trend=data_net_worth_trend)

            # Accumulated net-worth trend.
            data_ac_worth_trend = json_2_dict(
                re.compile(r'Data_ACWorthTrend = (.*?);').findall(body)[0])
            # pprint(data_ac_worth_trend)
            # print('累计净值走势: {0}'.format(data_ac_worth_trend))

            # Accumulated return-rate trend.
            data_grand_total = json_2_dict(
                re.compile(r'Data_grandTotal = (.*?);').findall(body)[0])
            # print('累计收益率走势: {0}'.format(data_grand_total))

            # Rank trend among funds of the same type.
            data_rate_in_similar_type = json_2_dict(
                re.compile(r'Data_rateInSimilarType = (.*?);').findall(body)
                [0])
            # print('同类排名走势: {0}'.format(data_rate_in_similar_type))

            # Rank percentile among funds of the same type.
            data_rate_in_similar_persent = json_2_dict(
                re.compile(r'Data_rateInSimilarPersent=(.*?);').findall(body)
                [0])
            # print('同类排名百分比: {0}'.format(data_rate_in_similar_persent))

            # Gain list of same-type funds (page-bottom banner).
            swith_same_type = json_2_dict(
                re.compile(r'swithSameType = (.*?);').findall(body)[0])
            # print('同类型基金涨幅榜: {0}'.format(swith_same_type))
        except IndexError as e:
            # One of the regexes did not match this fund's body.
            print(e)

        return None

    def _deal_with_data_net_worth_trend(self, **kwargs):
        '''
        Process data_net_worth_trend (unit net-worth trend) and render it.
        :param fund_name:
        :param fund_code:
        :param data_net_worth_trend:
        :return: True
        '''
        fund_name = kwargs.get('fund_name')
        fund_code = kwargs.get('fund_code')
        data_net_worth_trend = kwargs.get('data_net_worth_trend', [])
        # Convert each point's 'x' (ms epoch, first 10 digits used) into a
        # regular datetime string, in place.
        [
            item.update(
                {'x': str(timestamp_to_regulartime(str(item.get('x'))[:10]))})
            for item in data_net_worth_trend
        ]
        print('时间格式转换成功!')
        # pprint(data_net_worth_trend)
        x = [item.get('x') for item in data_net_worth_trend]
        y = [item.get('y') for item in data_net_worth_trend]
        '''绘图'''
        self.plot_pic = self._drawing(fund_name=fund_name,
                                      fund_code=fund_code,
                                      x=x,
                                      y=y)
        try:
            del self.plot_pic
        except:
            pass
        gc.collect()

        return True

    def _drawing(self, **kwargs):
        '''
        Set up matplotlib and draw/save the unit-net-worth curve as a png.
        :param kwargs: fund_name, fund_code, x (date strings), y (values)
        :return: the pyplot line list returned by plt.plot
        '''
        import matplotlib.pyplot as plt
        from random import randint

        figure_num = randint(1, 10000)
        # One Figure can hold several Axes; using a random figure number keeps
        # plots from piling onto a single figure.
        plt.figure(figure_num)
        fund_name = kwargs.get('fund_name')
        fund_code = kwargs.get('fund_code')
        x = kwargs.get('x')
        y = kwargs.get('y')

        # Load a CJK-capable font so the Chinese title renders.
        font = FontProperties(fname='/Library/Fonts/Songti.ttc', size=10)
        # Title and axis labels.
        plt.title('{0}(代码{1})的单位净值走势图'.format(fund_name, fund_code),
                  fontproperties=font,
                  fontsize=15)
        plt.xlabel('日期', fontproperties=font)
        plt.ylabel('单位净值', fontproperties=font)
        # Grid disabled: too dense to be readable.
        # plt.grid()
        # Axis tick steps.
        x_axis_label = self._get_x_axis_label(x)
        # pprint(x_axis_label)
        y_axis_label = self._get_y_axis_label(y)
        # pprint(y_axis_label)
        # String ticks must be passed with explicit positions.
        plt.xticks(arange(len(x_axis_label)),
                   x_axis_label,
                   rotation=30,
                   fontsize=5)
        # plt.yticks(y_axis_label)
        # x-axis value range:
        # plt.xlim(x[0], x[-2])
        # Legend.
        plt.legend(['单位:元'], loc=1, prop=font)
        plt.figure(figure_num)
        # Draw the line plot.
        plot_pic = plt.plot(
            x, y,
            marker='.',
            markerfacecolor='r',
            markersize=1,    # marker point size
            linewidth=.4,    # line width
            color='#7EB6EA'  # line color
        )
        # Numeric point labels (disabled):
        # for a, b in zip(x, y):
        #     plt.text(a, b, '%.3f' % (b,), fontsize=5)
        # Interactive display (disabled):
        # plt.show()
        # Save the picture; remove any previous file with the same name first.
        pic_file_name = '{0}(代码{1}).png'.format(fund_name, fund_code)
        pic_path = self.base_path + pic_file_name
        if os.path.exists(pic_path):
            # print('文件已存在!')
            os.remove(pic_path)
        savefig(fname=pic_path, dpi=400)  # dpi controls the image resolution
        print('[+] {0} 保存完毕!'.format(pic_file_name))
        plt.cla()  # clear the current axes

        return plot_pic

    def _get_x_axis_label(self, x):
        '''
        Build the x-axis tick labels: one 'YYYY-MM' label for dates a multiple
        of 6 months from now (first occurrence per month), '' everywhere else.
        :param x: list of date strings (or None)
        :return: list of tick-label strings
        '''
        now_time = datetime.datetime.now()
        x_axis_label = []
        for _x in x:
            if _x is not None and month_differ(
                    now_time, string_to_datetime(_x)) % 6 == 0:
                if str(_x)[:7] in x_axis_label:
                    # Month already labeled -> append('').
                    x_axis_label.append('')
                else:
                    x_axis_label.append(str(_x)[:7])
            else:
                x_axis_label.append('')

        return x_axis_label

    def _get_y_axis_label(self, y):
        '''
        Build the y-axis tick values: steps of .1 spanning [min-step, max+step].
        :param y: list of numeric values
        :return: list of tick values
        '''
        y_step = .1
        y_axis_label = [
            _y for _y in arange(min(y) - y_step, max(y) + y_step, y_step)
        ]

        return y_axis_label

    def __del__(self):
        # Best-effort release of the phantomjs driver when the object is gc'd.
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
class YanXuanParse(object):
    '''
    Parser for 网易严选 (NetEase yanxuan) goods: fetches the m-site detail
    page through phantomjs, extracts the embedded jsonData blob, and maps it
    into the project's common goods-data shape.
    '''
    def __init__(self, logger=None):
        super(YanXuanParse, self).__init__()
        # Parsed goods data cache for the current crawl.
        self.result_data = {}
        self._set_logger(logger)
        self._set_headers()
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH,
                                        logger=self.my_lg)

    def _set_logger(self, logger):
        # Use the caller's logger if given, otherwise create a dated file logger.
        if logger is None:
            self.my_lg = set_logger(
                log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/_/' + str(get_shanghai_time())[0:10] + '.txt',
                console_log_level=INFO,
                file_log_level=ERROR
            )
        else:
            self.my_lg = logger

    def _set_headers(self):
        # Mobile-site request headers; UA randomized per instance.
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': get_random_phone_ua(),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }

    def _get_goods_data(self, goods_id):
        '''
        Fetch and parse the goods detail for one goods_id.
        :param goods_id: the yanxuan goods id string
        :return: parsed data dict ({} on any failure, via _get_data_error_init)
        '''
        if goods_id == '':
            self.my_lg.error('获取到的goods_id为空值!此处跳过!')
            return self._get_data_error_init()

        # Crawl the yanxuan m-site.
        url = 'http://m.you.163.com/item/detail'
        params = self._get_params(goods_id=goods_id)
        m_url = url + '?id={0}'.format(goods_id)
        self.my_lg.info('------>>>| 正在抓取严选地址为: {0}'.format(m_url))
        write_info = '出错goods_id:{0}, 出错地址: {1}'.format(goods_id, m_url)

        '''requests被无限转发'''
        # body = MyRequests.get_url_body(url=url, headers=self.headers, params=params)
        # self.my_lg.info(str(body))
        '''改用phantomjs'''
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=_get_url_contain_params(url=url, params=params))
        if body == '':
            self.my_lg.error('获取到的body为空值!'+write_info)
            return self._get_data_error_init()

        # Pull the embedded js data object out of the page.
        try:
            body = re.compile('var jsonData=(.*?),policyList=').findall(body)[0]
        except IndexError:
            self.my_lg.error('获取body时索引异常!'+write_info, exc_info=True)
            return self._get_data_error_init()

        # The blob is not strict json; normalize it first.
        body = nonstandard_json_str_handle(json_str=body)
        # self.my_lg.info(str(body))
        _ = json_2_dict(
            json_str=body,
            logger=self.my_lg)
        # pprint(_)
        if _ == {}:
            self.my_lg.error('获取到的data为空dict!'+write_info)
            return self._get_data_error_init()

        _ = self._wash_data(_)

        data = {}
        try:
            data['title'] = self._wash_sensitive_info(self._get_title(data=_))
            data['sub_title'] = self._wash_sensitive_info(self._get_sub_title(data=_))
            data['shop_name'] = ''
            data['all_img_url'] = self._get_all_img_url(data=_)
            data['p_info'] = self._get_p_info(data=_)
            data['div_desc'] = self._get_div_desc(data=_)
            data['sell_time'] = self._get_sell_time(data=_)
            data['detail_name_list'] = self._get_detail_name_list(data=_.get('skuSpecList', []))
            data['price_info_list'] = self._get_price_info_list(data=_.get('skuList', []))
            data['price'], data['taobao_price'] = self._get_price_and_taobao_price(
                price_info_list=data['price_info_list']
            )
            if data['price'] == 0 or data['taobao_price'] == 0:
                # Sold-out goods -> mark as logically deleted.
                data['is_delete'] = 1
            else:
                data['is_delete'] = self._get_is_delete(price_info_list=data['price_info_list'], data=data, other=_)
        except Exception:
            self.my_lg.error('遇到错误:', exc_info=True)
            self.my_lg.error(write_info)
            return self._get_data_error_init()

        if data != {}:
            self.result_data = data
            return data
        else:
            self.my_lg.info('data为空值')
            return self._get_data_error_init()

    def _deal_with_data(self):
        '''
        Map self.result_data into the project's structured result dict.
        :return: the structured result dict, or {} via _get_data_error_init
        '''
        data = self.result_data
        if data != {}:
            # shop name
            shop_name = data['shop_name']
            # shopkeeper
            account = ''
            # goods title
            title = data['title']
            # subtitle
            sub_title = data['sub_title']
            # spec (label) attribute names
            detail_name_list = data['detail_name_list']
            # price and stock per spec combination
            price_info_list = data['price_info_list']
            # all sample image urls
            all_img_url = data['all_img_url']
            # detail attribute name/value pairs
            p_info = data['p_info']
            # pprint(p_info)
            # div_desc
            div_desc = data['div_desc']

            is_delete = data['is_delete']

            # on/off-shelf time window
            if data.get('sell_time', {}) != {}:
                schedule = [{
                    'begin_time': data.get('sell_time', {}).get('begin_time', ''),
                    'end_time': data.get('sell_time', {}).get('end_time', ''),
                }]
            else:
                schedule = []

            # total sales count (not provided by the api)
            all_sell_count = ''

            # goods price and taobao (lowest) price
            price, taobao_price = data['price'], data['taobao_price']

            result = {
                'shop_name': shop_name,                 # shop name
                'account': account,                     # shopkeeper
                'title': title,                         # goods title
                'sub_title': sub_title,                 # subtitle
                'price': price,                         # goods price
                'taobao_price': taobao_price,           # taobao (lowest) price
                # 'goods_stock': goods_stock,           # goods stock
                'detail_name_list': detail_name_list,   # spec attribute names
                # 'detail_value_list': detail_value_list,  # spec attribute values
                'price_info_list': price_info_list,     # price/stock per spec
                'all_img_url': all_img_url,             # all sample image urls
                'p_info': p_info,                       # detail attribute pairs
                'div_desc': div_desc,                   # div_desc
                'schedule': schedule,                   # special-price sale window
                'all_sell_count': all_sell_count,       # total sales count
                'is_delete': is_delete                  # whether off-shelf
            }
            # pprint(result)
            # print(result)

            # wait_to_send_data = {
            #     'reason': 'success',
            #     'data': result,
            #     'code': 1
            # }
            # json_data = json.dumps(wait_to_send_data, ensure_ascii=False)
            # print(json_data)
            self.result_data = {}

            return result
        else:
            self.my_lg.error('待处理的data为空的dict, 该商品可能已经转移或者下架')
            return self._get_data_error_init()

    def to_right_and_update_data(self, data, pipeline):
        '''
        Map crawled data to the db model and update the goods row in real time.
        :param data: structured result dict
        :param pipeline: sql-server pipeline used to run the update
        :return: None
        '''
        tmp = _get_right_model_data(data, site_id=30, logger=self.my_lg)
        params = self._get_db_update_params(item=tmp)
        # {0}/{1} are filled with shelf_time/delete_time fragments depending on
        # which of the two timestamps is present.
        base_sql_str = 'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, SellCount=%s, IsDelete=%s, IsPriceChange=%s, PriceChangeInfo=%s, {0} {1} where GoodsID = %s'
        if tmp['delete_time'] == '':
            sql_str = base_sql_str.format('shelf_time=%s', '')
        elif tmp['shelf_time'] == '':
            sql_str = base_sql_str.format('delete_time=%s', '')
        else:
            sql_str = base_sql_str.format('shelf_time=%s,', 'delete_time=%s')
        pipeline._update_table_2(sql_str=sql_str, params=params, logger=self.my_lg)

    def _get_db_update_params(self, item):
        '''
        Build the update-params tuple matching to_right_and_update_data's sql.
        The shelf_time/delete_time values are inserted just before goods_id to
        line up with the sql fragments chosen there.
        :param item: db-model dict from _get_right_model_data
        :return: tuple of sql parameters
        '''
        params = [
            item['modify_time'],
            item['shop_name'],
            item['account'],
            item['title'],
            item['sub_title'],
            item['link_name'],
            # item['price'],
            # item['taobao_price'],
            dumps(item['price_info'], ensure_ascii=False),
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            item['all_sell_count'],
            # item['delete_time'],
            item['is_delete'],
            item['is_price_change'],
            dumps(item['price_change_info'], ensure_ascii=False),

            item['goods_id'],
        ]
        if item.get('delete_time', '') == '':
            params.insert(-1, item['shelf_time'])
        elif item.get('shelf_time', '') == '':
            params.insert(-1, item['delete_time'])
        else:
            params.insert(-1, item['shelf_time'])
            params.insert(-1, item['delete_time'])

        return tuple(params)

    def _wash_sensitive_info(self, target_str):
        '''
        Strip brand-sensitive words from a string.
        :param target_str: string to clean
        :return: cleaned string
        '''
        add_sensitive_str_list = [
            '网易',
            '严选',
            '云音乐',
        ]
        target_str = wash_sensitive_info(data=target_str, replace_str_list=[], add_sensitive_str_list=add_sensitive_str_list)

        return target_str

    def _get_title(self, data):
        # Goods title; must be non-empty.
        title = data.get('name', '')
        assert title != '', '获取到的name为空值!请检查!'

        return title

    def _get_sub_title(self, data):
        # Subtitle may legitimately be empty.
        sub_title = data.get('simpleDesc', '')

        return sub_title

    def _get_all_img_url(self, data):
        # Collect the list-pic plus every itemDetail field whose key matches
        # 'picUrl'.
        tmp = data.get('itemDetail', {})
        first_img_url = data.get('listPicUrl', '')
        assert tmp != {}, '获取到的all_img_url为空dict!'
        all_img_url = [{
            'img_url': first_img_url
        }] if first_img_url != '' else []
        for key, value in tmp.items():
            if re.compile('picUrl').findall(key) != []:
                all_img_url.append({
                    'img_url': value,
                })

        return all_img_url

    def _get_p_info(self, data):
        # Attribute name/value pairs; values are washed of sensitive words.
        p_info = [{
            'p_name': item.get('attrName', ''),
            'p_value': self._wash_sensitive_info(item.get('attrValue', '')),
        } for item in data.get('attrList', [])]

        return p_info

    def _get_div_desc(self, data):
        # Detail html; must be non-empty, then cleaned.
        div_desc = data.get('itemDetail', {}).get('detailHtml', '')
        assert div_desc != '', '获取到的div_desc为空值!请检查!'
        # self.my_lg.info(str(div_desc))
        div_desc = self._wash_div_desc(div_desc)
        # print(div_desc)

        return div_desc

    def _wash_div_desc(self, div_desc):
        '''
        Clean div_desc: keep only the product images, dropping the leading
        yanxuan-statement images and trailing boilerplate.
        :param div_desc: raw detail html
        :return: rebuilt html string
        '''
        # Approach 1 (insufficient filtering):
        # filter = '''
        # _src=\".*?\"|
        # http://yanxuan.nosdn.127.net/e5f0f6b40368d7e532ff6b3a6481e6ab.jpg|
        # http://yanxuan.nosdn.127.net/c56658fa7b0b8a38bdb9c292a68fb176.jpg
        # '''.replace('\n', '').replace(' ', '')
        #
        # div_desc = re.compile(filter).sub('', div_desc)
        #
        # # The yanxuan statement image urls are hashes that change every time,
        # # so uniformly drop the first 4 images from every div_desc.
        # div_desc = re.compile('<img.*?/>').sub('', div_desc, count=4)

        # Approach 2: rebuild from the deduped src list, skipping the first 3
        # and last 2 images.
        img_list = unique_list_and_keep_original_order(re.compile('src=\"(.*?)\"').findall(div_desc))
        # pprint(img_list)
        _ = ''
        for item in img_list[3:-2:]:
            _ += '<p><img src="{0}" style=""/></p>'.format(item)
        div_desc = _

        return div_desc

    def _get_sell_time(self, data):
        '''
        Derive the on/off-shelf window from gradientPrice.leftTime.
        :param data: parsed goods dict
        :return: dict with begin_time/end_time, or {} when not applicable
        '''
        try:
            left_time = data.get('gradientPrice', {}).get('leftTime', 0)
        except AttributeError:  # gradientPrice may be ''
            return {}

        if left_time == 0:
            return {}

        now_time_timestamp = datetime_to_timestamp(get_shanghai_time())
        sell_time = {
            'begin_time': timestamp_to_regulartime(now_time_timestamp),
            'end_time': timestamp_to_regulartime(now_time_timestamp + left_time),
        }

        return sell_time

    def _get_detail_name_list(self, data):
        # Spec names; any item missing 'name' invalidates the whole list.
        detail_name_list = []
        for item in data:
            if item.get('name') is None:
                return []
            else:
                detail_name_list.append({
                    'spec_name': item.get('name')
                })

        return detail_name_list

    def _get_price_info_list(self, data):
        '''
        Build price_info_list from the skuList.
        :param data: skuList from the parsed goods dict
        :return: list of per-spec price/stock dicts
        '''
        price_info_list = []
        # pprint(data)
        for item in data:
            itemSkuSpecValueList = item.get('itemSkuSpecValueList', [])
            # pprint(itemSkuSpecValueList)
            spec_value_list = [i.get('skuSpecValue', {}).get('value', '') for i in itemSkuSpecValueList]
            spec_value = '|'.join(spec_value_list)
            img_url = item.get('pic', '')  # defaults to empty
            if item.get('promotionDesc', '') == '新人专享价':
                # New-user-only price: use the original (calc) price instead.
                detail_price = str(item.get('calcPrice', ''))
            else:
                detail_price = str(item.get('retailPrice', ''))  # retail price
            normal_price = str(item.get('counterPrice', ''))     # market price
            account_limit_buy_count = 5
            # NOTE(review): original comment said the api lacks per-spec stock
            # and a default of 20 is used, but the code reads sellVolume with
            # default 0 — confirm which is intended.
            rest_number = item.get('sellVolume', 0)
            if rest_number == 0:
                continue
            price_info_list.append({
                'spec_value': spec_value,
                'img_url': img_url,
                'detail_price': detail_price,
                'normal_price': normal_price,
                'account_limit_buy_count': account_limit_buy_count,
                'rest_number': rest_number,
            })

        return price_info_list

    def _get_price_and_taobao_price(self, price_info_list):
        # Highest spec price -> price; lowest -> taobao_price.
        # pprint(price_info_list)
        if price_info_list == []:
            # Sold-out goods.
            return 0, 0

        try:
            tmp_price_list = sorted([round(float(item.get('detail_price', '')), 2) for item in price_info_list])
            price = tmp_price_list[-1]        # goods price
            taobao_price = tmp_price_list[0]  # taobao (lowest) price
        except IndexError:
            raise IndexError('获取price, taobao_price时索引异常!请检查!')

        return price, taobao_price

    def _get_is_delete(self, price_info_list, data, other):
        '''
        Decide whether the goods should be logically deleted (off-shelf):
        zero total stock, an expired sell window, or a soldOut flag.
        :return: 0 (on sale) or 1 (deleted)
        '''
        is_delete = 0
        all_rest_number = 0
        if price_info_list != []:
            for item in price_info_list:
                all_rest_number += item.get('rest_number', 0)
            if all_rest_number == 0:
                is_delete = 1
        else:
            is_delete = 1

        # When the official off-shelf time < current timestamp the goods is
        # off the shelf -> is_delete = 1.
        if data['sell_time'] != {}:
            end_time = datetime_to_timestamp(string_to_datetime(data.get('sell_time', {}).get('end_time', '')))
            if end_time < datetime_to_timestamp(get_shanghai_time()):
                self.my_lg.info('该商品已经过期下架...! 进行逻辑删除 is_delete=1')
                is_delete = 1
        # print(is_delete)

        if other.get('soldOut'):  # True or False
            is_delete = 1

        return is_delete

    def _get_data_error_init(self):
        '''
        Failure path: reset the cached result and return an empty dict.
        :return: {}
        '''
        self.result_data = {}
        return {}

    def _get_params(self, goods_id):
        # Query params for the m-site detail request.
        params = (
            ('id', goods_id),
        )

        return params

    def _wash_data(self, data):
        '''
        Drop unneeded bulky fields from the parsed data.
        :param data: parsed goods dict
        :return: the same dict, with comments/issueList emptied
        '''
        try:
            data['comments'] = []
            data['issueList'] = []
        except:
            pass

        return data

    def get_goods_id_from_url(self, yanxuan_url):
        '''
        Extract the goods_id from a yanxuan goods url.
        e.g. http://you.163.com/item/detail?id=1130056&_stat_area=...
        :param yanxuan_url:
        :return: goods_id string ('' when the url is not a yanxuan detail url)
        '''
        is_yanxuan_url = re.compile(r'you.163.com/item/detail.*?').findall(yanxuan_url)
        if is_yanxuan_url != []:
            if re.compile(r'id=(\d+)').findall(yanxuan_url) != []:
                goods_id = re.compile(r'id=(\d+)').findall(yanxuan_url)[0]
                self.my_lg.info('------>>>| 得到的严选商品的goods_id为: {0}'.format(goods_id))
                return goods_id
        else:
            self.my_lg.info('网易严选商品url错误, 非正规的url, 请参照格式(https://you.163.com/item/detail)开头的...')
            return ''

    def __del__(self):
        # Best-effort release of the phantomjs driver and logger on gc.
        try:
            del self.my_phantomjs
            del self.my_lg
        except:
            pass
        gc.collect()
class PinduoduoParse(object):
    '''
    Parser for pinduoduo goods: fetches the mobile detail page through
    phantomjs, extracts the embedded window.rawData object, and maps it into
    the project's common goods-data shape.
    '''
    def __init__(self):
        self._set_headers()
        # Parsed goods data cache for the current crawl.
        self.result_data = {}
        # self.set_cookies_key_api_uid()    # set the api_uid value in the cookie
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)

    def _set_headers(self):
        # Request headers for mobile.yangkeduo.com; UA randomized per instance.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'mobile.yangkeduo.com',
            'User-Agent': get_random_pc_ua(),  # random user-agent
            # 'Cookie': 'api_uid=rBQh+FoXerAjQWaAEOcpAg==;',  # analysis showed this cookie value is required
        }

    def get_goods_data(self, goods_id):
        '''
        Build the goods url for a goods_id, fetch the page and parse the
        embedded raw data object.
        :param goods_id:
        :return: data dict ({} on any failure)
        '''
        if goods_id == '':
            # Reset so stale data does not leak into the next save.
            self.result_data = {}
            return {}
        else:
            tmp_url = 'http://mobile.yangkeduo.com/goods.html?goods_id=' + str(
                goods_id)
            print('------>>>| 得到的商品手机版地址为: ', tmp_url)

            '''1.采用requests,由于经常返回错误的body(即requests.get返回的为空的html), So pass'''
            # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)

            '''2.采用phantomjs来获取'''
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
            if body == '':
                print('body中re匹配到的data为空!')
                self.result_data = {}
                return {}

            # Greedy match: grab the whole rawData object.
            data = re.compile(r'window.rawData= (.*?);</script>').findall(
                body)
            if data != []:
                data = json_2_dict(json_str=data[0])
                if data == {}:
                    self.result_data = {}
                    return {}
                # pprint(data)
                try:
                    # Drop bulky fields that are never used again.
                    data['goods'].pop('localGroups')
                    data['goods'].pop('mallService')
                    data.pop('reviews')  # review info and related statistics
                except:
                    pass
                # pprint(data)

                '''处理detailGallery转换成能被html显示页面信息'''
                detail_data = data.get('goods', {}).get('detailGallery', [])
                tmp_div_desc = ''
                if detail_data != []:
                    for index in range(0, len(detail_data)):
                        if index == 0:
                            # Skip pinduoduo's own notice image (first entry).
                            pass
                        else:
                            tmp = ''
                            tmp_img_url = detail_data[index].get('url')
                            tmp = r'<img src="{}" style="height:auto;width:100%;"/>'.format(
                                tmp_img_url)
                            tmp_div_desc += tmp
                    detail_data = '<div>' + tmp_div_desc + '</div>'
                else:
                    detail_data = ''
                # print(detail_data)
                try:
                    # Gallery already converted; drop the raw field.
                    data['goods'].pop('detailGallery')
                except:
                    pass
                data['div_desc'] = detail_data
                # pprint(data)
                self.result_data = data
                return data
            else:
                print('data为空!')
                self.result_data = {}
                return {}

    def deal_with_data(self):
        '''
        Process result_data and return the structured info the project needs.
        :return: structured result dict ({} when result_data is empty/invalid)
        '''
        data = self.result_data
        if data != {}:
            # shop name
            if data.get('mall') is not None:
                shop_name = data.get('mall', {}).get('mallName', '')
            else:
                shop_name = ''
            # shopkeeper
            account = ''
            # goods title
            title = data.get('goods', {}).get('goodsName', '')
            # subtitle
            sub_title = ''
            # goods stock
            # spec attribute values
            # spec attribute names
            if data.get('goods', {}).get('skus', []) == []:
                detail_name_list = []
            else:
                if data.get('goods', {}).get('skus', [])[0].get('specs') == []:
                    detail_name_list = []
                else:
                    detail_name_list = [{
                        'spec_name': item.get('spec_key')
                    } for item in data.get('goods', {}).get(
                        'skus', [])[0].get('specs')]
            # print(detail_name_list)

            # price and stock per spec combination
            skus = data.get('goods', {}).get('skus', [])
            # pprint(skus)
            price_info_list = []
            if skus != []:  # ** note: even single-spec pinduoduo goods have a non-empty skus **
                for index in range(0, len(skus)):
                    tmp = {}
                    price = skus[index].get('groupPrice', '')          # group-buy price
                    normal_price = skus[index].get('normalPrice', '')  # solo-purchase price
                    spec_value = [
                        item.get('spec_value') for item in data.get(
                            'goods', {}).get('skus', [])[index].get('specs')
                    ]
                    spec_value = '|'.join(spec_value)
                    img_url = skus[index].get('thumbUrl', '')
                    rest_number = skus[index].get('quantity', 0)  # remaining stock
                    is_on_sale = skus[index].get(
                        'isOnSale', 0)  # 1: special price, 0: original price (normal_price)
                    tmp['spec_value'] = spec_value
                    tmp['detail_price'] = price
                    tmp['normal_price'] = normal_price
                    tmp['img_url'] = img_url
                    if rest_number <= 0:
                        tmp['rest_number'] = 0
                    else:
                        tmp['rest_number'] = rest_number
                    tmp['is_on_sale'] = is_on_sale
                    price_info_list.append(tmp)
            if price_info_list == []:
                print('price_info_list为空值')
                return {}

            # goods price and taobao (lowest) price
            tmp_price_list = sorted([
                round(float(item.get('detail_price', '')), 2) for item in price_info_list
            ])
            price = tmp_price_list[-1]        # highest spec price
            taobao_price = tmp_price_list[0]  # lowest spec price

            if detail_name_list == []:
                print('## detail_name_list为空值 ##')
                price_info_list = []
            # print('最高价为: ', price)
            # print('最低价为: ', taobao_price)
            # print(len(price_info_list))
            # pprint(price_info_list)

            # all sample image urls
            all_img_url = [{
                'img_url': item
            } for item in data.get('goods', {}).get('topGallery', [])]
            # print(all_img_url)

            # detail attribute pairs; strip whitespace from the description
            tmp_p_value = re.compile(r'\n').sub(
                '', data.get('goods', {}).get('goodsDesc', ''))
            tmp_p_value = re.compile(r'\t').sub('', tmp_p_value)
            tmp_p_value = re.compile(r' ').sub('', tmp_p_value)
            p_info = [{'p_name': '商品描述', 'p_value': tmp_p_value}]
            # print(p_info)

            # total sales count
            all_sell_count = data.get('goods', {}).get('sales', 0)

            # div_desc
            div_desc = data.get('div_desc', '')

            # sale time window (taken from the first groupTypes entry)
            schedule = [{
                'begin_time':
                self.timestamp_to_regulartime(
                    data.get('goods', {}).get('groupTypes',
                                              [])[0].get('startTime')),
                'end_time':
                self.timestamp_to_regulartime(
                    data.get('goods', {}).get('groupTypes',
                                              [])[0].get('endTime')),
            }]
            # pprint(schedule)

            # whether the goods has been taken off the shelf
            is_delete = 0

            result = {
                'shop_name': shop_name,                # shop name
                'account': account,                    # shopkeeper
                'title': title,                        # goods title
                'sub_title': sub_title,                # subtitle
                # 'shop_name_url': shop_name_url,      # shop home page url
                'price': price,                        # goods price
                'taobao_price': taobao_price,          # taobao (lowest) price
                # 'goods_stock': goods_stock,          # goods stock
                'detail_name_list': detail_name_list,  # spec attribute names
                # 'detail_value_list': detail_value_list,  # spec attribute values
                'price_info_list': price_info_list,    # price/stock per spec
                'all_img_url': all_img_url,            # all sample image urls
                'p_info': p_info,                      # detail attribute pairs
                'div_desc': div_desc,                  # div_desc
                'schedule': schedule,                  # sale begin/end window
                'all_sell_count': all_sell_count,      # total sales count
                'is_delete': is_delete                 # whether off-shelf
            }
            # pprint(result)
            # print(result)

            # wait_to_send_data = {
            #     'reason': 'success',
            #     'data': result,
            #     'code': 1
            # }
            # json_data = json.dumps(wait_to_send_data, ensure_ascii=False)
            # print(json_data)
            return result
        else:
            print('待处理的data为空的dict, 该商品可能已经转移或者下架')
            return {}
def to_right_and_update_data(self, data, pipeline): tmp = _get_right_model_data(data=data, site_id=13) params = self._get_db_update_params(item=tmp) # 改价格的sql语句 # sql_str = r'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, Price=%s, TaoBaoPrice=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, SellCount=%s, MyShelfAndDownTime=%s, delete_time=%s, IsDelete=%s, Schedule=%s, IsPriceChange=%s, PriceChangeInfo=%s where GoodsID = %s' # 不改价格的sql语句 if tmp['delete_time'] == '': sql_str = 'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, SellCount=%s, IsDelete=%s, Schedule=%s, IsPriceChange=%s, PriceChangeInfo=%s, shelf_time=%s where GoodsID = %s' elif tmp['shelf_time'] == '': sql_str = 'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, SellCount=%s, IsDelete=%s, Schedule=%s, IsPriceChange=%s, PriceChangeInfo=%s, delete_time=%s where GoodsID = %s' else: sql_str = 'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, SellCount=%s, IsDelete=%s, Schedule=%s, IsPriceChange=%s, PriceChangeInfo=%s, shelf_time=%s, delete_time=%s where GoodsID = %s' pipeline._update_table(sql_str=sql_str, params=params) def insert_into_pinduoduo_xianshimiaosha_table(self, data, pipeline): tmp = _get_right_model_data(data=data, site_id=16) # 采集来源地(卷皮秒杀商品) print('------>>>| 待存储的数据信息为: ', tmp.get('goods_id')) params = self._get_db_insert_miaosha_params(item=tmp) sql_str = r'insert into dbo.pinduoduo_xianshimiaosha(goods_id, goods_url, username, create_time, modfiy_time, 
shop_name, goods_name, sub_title, price, taobao_price, sku_name, sku_info, all_image_url, property_info, detail_info, schedule, stock_info, miaosha_time, miaosha_begin_time, miaosha_end_time, site_id, is_delete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)' pipeline._insert_into_table(sql_str=sql_str, params=params) def to_update_pinduoduo_xianshimiaosha_table(self, data, pipeline): tmp = _get_right_model_data(data=data, site_id=16) # print('------>>> | 待存储的数据信息为: |', tmp) print('------>>>| 待存储的数据信息为: |', tmp.get('goods_id')) params = self._get_db_update_miaosha_params(item=tmp) sql_str = 'update dbo.pinduoduo_xianshimiaosha set modfiy_time = %s, shop_name=%s, goods_name=%s, sub_title=%s, price=%s, taobao_price=%s, sku_name=%s, sku_info=%s, all_image_url=%s, property_info=%s, detail_info=%s, is_delete=%s, schedule=%s, stock_info=%s, miaosha_time=%s, miaosha_begin_time=%s, miaosha_end_time=%s where goods_id = %s' pipeline._update_table(sql_str=sql_str, params=params) def _get_db_update_params(self, item): ''' 得到db待存储的数据 :param item: :return: ''' params = [ item['modify_time'], item['shop_name'], item['account'], item['title'], item['sub_title'], item['link_name'], # item['price'], # item['taobao_price'], dumps(item['price_info'], ensure_ascii=False), dumps(item['detail_name_list'], ensure_ascii=False), dumps(item['price_info_list'], ensure_ascii=False), dumps(item['all_img_url'], ensure_ascii=False), dumps(item['p_info'], ensure_ascii=False), item['div_desc'], item['all_sell_count'], # item['delete_time'], item['is_delete'], dumps(item['schedule'], ensure_ascii=False), item['is_price_change'], dumps(item['price_change_info'], ensure_ascii=False), item['goods_id'], ] if item.get('delete_time', '') == '': params.insert(-1, item['shelf_time']) elif item.get('shelf_time', '') == '': params.insert(-1, item['delete_time']) else: params.insert(-1, item['shelf_time']) params.insert(-1, item['delete_time']) return 
tuple(params) def _get_db_insert_miaosha_params(self, item): params = ( item['goods_id'], item['goods_url'], item['username'], item['create_time'], item['modify_time'], item['shop_name'], item['title'], item['sub_title'], item['price'], item['taobao_price'], dumps(item['detail_name_list'], ensure_ascii=False ), # 把list转换为json才能正常插入数据(并设置ensure_ascii=False) dumps(item['price_info_list'], ensure_ascii=False), dumps(item['all_img_url'], ensure_ascii=False), dumps(item['p_info'], ensure_ascii=False), # 存入到PropertyInfo item['div_desc'], # 存入到DetailInfo dumps(item['schedule'], ensure_ascii=False), dumps(item['stock_info'], ensure_ascii=False), dumps(item['miaosha_time'], ensure_ascii=False), item['miaosha_begin_time'], item['miaosha_end_time'], item['site_id'], item['is_delete'], ) return params def _get_db_update_miaosha_params(self, item): params = ( item['modify_time'], item['shop_name'], item['title'], item['sub_title'], item['price'], item['taobao_price'], dumps(item['detail_name_list'], ensure_ascii=False), dumps(item['price_info_list'], ensure_ascii=False), dumps(item['all_img_url'], ensure_ascii=False), dumps(item['p_info'], ensure_ascii=False), item['div_desc'], item['is_delete'], dumps(item['schedule'], ensure_ascii=False), dumps(item['stock_info'], ensure_ascii=False), dumps(item['miaosha_time'], ensure_ascii=False), item['miaosha_begin_time'], item['miaosha_end_time'], item['goods_id'], ) return params def set_cookies_key_api_uid(self): ''' 给headers增加一个cookie, 里面有个key名字为api_uid :return: ''' # 设置代理ip ip_object = MyIpPools() self.proxies = ip_object.get_proxy_ip_from_ip_pool( ) # {'http': ['xx', 'yy', ...]} self.proxy = self.proxies['http'][randint(0, len(self.proxies) - 1)] tmp_proxies = { 'http': self.proxy, } # 得到cookie中的key名为api_uid的值 host_url = 'http://mobile.yangkeduo.com' try: response = requests.get( host_url, headers=self.headers, proxies=tmp_proxies, timeout=10) # 在requests里面传数据,在构造头时,注意在url外头的&xxx=也得先构造 api_uid = response.cookies.get('api_uid') # 
print(response.cookies.items()) # if api_uid is None: # api_uid = 'rBQh+FoXerAjQWaAEOcpAg==' self.headers['Cookie'] = 'api_uid=' + str(api_uid) + ';' # print(api_uid) except Exception: print('requests.get()请求超时....') pass def timestamp_to_regulartime(self, timestamp): ''' 将时间戳转换成时间 ''' # 利用localtime()函数将时间戳转化成localtime的格式 # 利用strftime()函数重新格式化时间 # 转换成localtime time_local = time.localtime(timestamp) # 转换成新的时间格式(2016-05-05 20:28:54) dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local) return dt def get_goods_id_from_url(self, pinduoduo_url): ''' 得到goods_id :param pinduoduo_url: :return: goods_id (类型str) ''' is_pinduoduo_url = re.compile( r'http://mobile.yangkeduo.com/goods.html.*?').findall( pinduoduo_url) if is_pinduoduo_url != []: if re.compile( r'http://mobile.yangkeduo.com/goods.html\?.*?goods_id=(\d+).*?' ).findall(pinduoduo_url) != []: tmp_pinduoduo_url = re.compile( r'http://mobile.yangkeduo.com/goods.html\?.*?goods_id=(\d+).*?' ).findall(pinduoduo_url)[0] if tmp_pinduoduo_url != '': goods_id = tmp_pinduoduo_url else: # 只是为了在pycharm里面测试,可以不加 pinduoduo_url = re.compile(r';').sub('', pinduoduo_url) goods_id = re.compile( r'http://mobile.yangkeduo.com/goods.html\?.*?goods_id=(\d+).*?' ).findall(pinduoduo_url)[0] print('------>>>| 得到的拼多多商品id为:', goods_id) return goods_id else: pass else: print( '拼多多商品url错误, 非正规的url, 请参照格式(http://mobile.yangkeduo.com/goods.html)开头的...' ) return '' def __del__(self): try: del self.my_phantomjs except: pass gc.collect()
class JuanPiParse(object):
    '''
    Parser for juanpi.com (卷皮) goods pages.

    Workflow: get_goods_data(goods_id) fetches and caches the raw data in
    self.result_data, deal_with_data() normalizes it, and the to_*/insert_*
    methods persist it through a db pipeline.
    '''

    def __init__(self):
        super(JuanPiParse, self).__init__()
        self._set_headers()
        self.result_data = {}   # raw parsed data cached between get_goods_data and deal_with_data
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)

    def _set_headers(self):
        # Base headers for web.juanpi.com requests; User-Agent is randomized.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'web.juanpi.com',
            'User-Agent': get_random_pc_ua(),  # random UA per instance
        }

    def get_goods_data(self, goods_id):
        '''
        Fetch the raw goods data for one goods_id.

        Scrapes the mobile page via phantomjs for the __PRELOADED_STATE__
        blob, plus the getMemberAboutInfo api for sku data; the merged dict
        is cached in self.result_data.

        :param goods_id: juanpi goods id str ('' returns {})
        :return: dict (empty on any failure)
        '''
        if goods_id == '':
            self.result_data = {}
            return {}
        tmp_url = 'https://web.juanpi.com/pintuan/shop/' + str(goods_id)
        print('------>>>| 得到的商品手机版的地址为: ', tmp_url)
        # NOTE: a plain-requests implementation worked for a while but started
        # returning Not Found, hence phantomjs is used for the page body.
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=tmp_url, css_selector='div.sc-kgoBCf.bTQvTk')  # css = mobile title block
        if body == '':
            print('获取到的body为空str!请检查!')
            self.result_data = {}
            return {}
        data = re.compile(
            r'__PRELOADED_STATE__ = (.*);</script> <style ').findall(body)
        # sku data endpoint (the older getOtherInfo endpoint was retired)
        skudata_url = 'https://webservice.juanpi.com/api/getMemberAboutInfo?goods_id=' + str(
            goods_id)
        # BUGFIX: original aliased self.headers here, so update() permanently
        # rewrote self.headers['Host'] to webservice.juanpi.com and broke later
        # web.juanpi.com requests; use a copy instead.
        self.skudata_headers = dict(self.headers)
        self.skudata_headers.update({'Host': 'webservice.juanpi.com'})
        skudata_body = MyRequests.get_url_body(
            url=skudata_url, headers=self.skudata_headers)
        if skudata_body == '':
            print('获取到的skudata_body为空str!请检查!')
            self.result_data = {}
            return {}
        skudata = re.compile(r'(.*)').findall(skudata_body)
        if skudata != []:
            skudata = json_2_dict(json_str=skudata[0])
            if skudata == {}:
                self.result_data = {}
                return {}
            skudata = skudata.get('skudata', {})
            try:
                if skudata.get('info') is None:
                    print('skudata中info的key为None, 返回空dict')
                    self.result_data = {}
                    return {}
            except AttributeError as e:
                # skudata may be a non-dict (e.g. str) for delisted goods
                print('遇到错误如下(先跳过!): ', e)
                self.result_data = {}
                return {}
        else:
            print('skudata为空!')
            self.result_data = {}
            return {}
        if data != []:
            main_data = json_2_dict(json_str=data[0])
            if main_data == {}:
                self.result_data = {}
                return {}
            if main_data.get('detail') is not None:
                main_data = self._wash_main_data(main_data.get('detail', {}))
                main_data['skudata'] = skudata
                main_data['goods_id'] = goods_id
                self.result_data = main_data
                return main_data
            else:
                print('data中detail的key为None, 返回空dict')
                self.result_data = {}
                return {}
        else:
            print('data为空!')
            self.result_data = {}
            return {}

    def deal_with_data(self):
        '''
        Normalize self.result_data into the common goods dict.

        :return: dict with keys shop_name/title/price/... ({} when no data)
        '''
        data = self.result_data
        if data == {}:
            print('待处理的data为空的dict')
            return {}
        shop_name = self._get_shop_name(data=data)
        account = ''
        title = data.get('baseInfo', {}).get('title', '')
        sub_title = ''
        detail_name_list = self._get_detail_name_list(data=data)
        if isinstance(detail_name_list, str):
            # delisted goods sentinel: mark IsDelete=1 directly in the db
            if detail_name_list == 'is_delete=1':
                print('该商品已下架...')
                sql_str = 'update dbo.GoodsInfoAutoGet set IsDelete=1 where GoodsID=%s'
                params = (self.result_data.get('goods_id', ''), )
                _ = SqlServerMyPageInfoSaveItemPipeline()
                result = _._update_table(sql_str=sql_str, params=params)
                if result:
                    print('### 该商品已经is_delete=1 ###')
                else:
                    print('is_delete=1标记失败!')
        if detail_name_list == {}:
            self.result_data = {}
            return {}
        price_info_list, price, taobao_price = self._get_price_info_list_and_price_and_taobao_price(
            data=data)
        # ROBUSTNESS: default to [] -- 'goodImages' may be absent
        all_img_url = [{
            'img_url': item
        } for item in data.get('goodImages', [])]
        p_info = self._get_p_info(data=data)
        div_desc = self._get_div_desc(data=data)
        schedule = self._get_goods_schedule(data=data)
        is_delete = self._get_is_delete(data=data, schedule=schedule)
        if price == 0 or taobao_price == 0:
            # no price available => treat as delisted
            is_delete = 1
        result = {
            'shop_name': shop_name,                  # 店铺名称
            'account': account,                      # 掌柜
            'title': title,                          # 商品名称
            'sub_title': sub_title,                  # 子标题
            'price': price,                          # 商品价格
            'taobao_price': taobao_price,            # 淘宝价
            'detail_name_list': detail_name_list,    # 商品标签属性名称
            'price_info_list': price_info_list,      # 每个规格的价格及其库存
            'all_img_url': all_img_url,              # 所有示例图片地址
            'p_info': p_info,                        # 详细信息标签名对应属性
            'div_desc': div_desc,                    # div_desc
            'is_delete': is_delete,                  # 是否下架判断
            'schedule': schedule,                    # 商品销售时间段
        }
        gc.collect()
        return result

    def to_right_and_update_data(self, data, pipeline):
        '''
        Normalize (site_id=12) and update the main goods table;
        prices are intentionally never rewritten.
        '''
        tmp = _get_right_model_data(data=data, site_id=12)
        params = self._get_db_update_params(item=tmp)
        if tmp['delete_time'] == '':
            sql_str = 'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, IsDelete=%s, Schedule=%s, IsPriceChange=%s, PriceChangeInfo=%s, shelf_time=%s where GoodsID = %s'
        elif tmp['shelf_time'] == '':
            sql_str = 'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, IsDelete=%s, Schedule=%s, IsPriceChange=%s, PriceChangeInfo=%s, delete_time=%s where GoodsID = %s'
        else:
            sql_str = 'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, IsDelete=%s, Schedule=%s, IsPriceChange=%s, PriceChangeInfo=%s, shelf_time=%s, delete_time=%s where GoodsID = %s'
        pipeline._update_table(sql_str=sql_str, params=params)

    def insert_into_juanpi_xianshimiaosha_table(self, data, pipeline):
        '''Insert one flash-sale record (site_id=15) into dbo.juanpi_xianshimiaosha.'''
        tmp = _get_right_model_data(data=data, site_id=15)
        print('------>>> | 待存储的数据信息为: |', tmp.get('goods_id'))
        params = self._get_db_insert_miaosha_params(item=tmp)
        sql_str = 'insert into dbo.juanpi_xianshimiaosha(goods_id, goods_url, username, create_time, modfiy_time, shop_name, goods_name, sub_title, price, taobao_price, sku_name, sku_info, all_image_url, property_info, detail_info, schedule, stock_info, miaosha_time, miaosha_begin_time, miaosha_end_time, tab_id, page, site_id, is_delete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        pipeline._insert_into_table(sql_str=sql_str, params=params)

    def to_update_juanpi_xianshimiaosha_table(self, data, pipeline):
        '''Update an existing flash-sale record (site_id=15).'''
        tmp = _get_right_model_data(data=data, site_id=15)
        print('------>>>| 待存储的数据信息为: |', tmp.get('goods_id'))
        params = self._get_db_update_miaosha_params(item=tmp)
        sql_str = 'update dbo.juanpi_xianshimiaosha set modfiy_time = %s, shop_name=%s, goods_name=%s, sub_title=%s, price=%s, taobao_price=%s, sku_name=%s, sku_info=%s, all_image_url=%s, property_info=%s, detail_info=%s, is_delete=%s, schedule=%s, stock_info=%s, miaosha_time=%s, miaosha_begin_time=%s, miaosha_end_time=%s where goods_id = %s'
        pipeline._update_table(sql_str=sql_str, params=params)

    def insert_into_juuanpi_pintuan_table(self, data, pipeline):
        '''
        Insert one group-buy (pintuan) record (site_id=18).
        Returns the pipeline result, or None when the item turned out to be
        a pintuan coupon rather than goods.
        '''
        try:
            tmp = _get_right_model_data(data=data, site_id=18)
        except Exception:
            print('此处抓到的可能是卷皮拼团券所以跳过')
            return None
        print('------>>> | 待存储的数据信息为: |', tmp.get('goods_id'))
        params = self._get_db_insert_pintuan_params(item=tmp)
        sql_str = 'insert into dbo.juanpi_pintuan(goods_id, goods_url, username, create_time, modfiy_time, shop_name, goods_name, sub_title, price, taobao_price, sku_name, sku_info, all_image_url, all_sell_count, property_info, detail_info, schedule, miaosha_begin_time, miaosha_end_time, page, site_id, is_delete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        _r = pipeline._insert_into_table(sql_str=sql_str, params=params)
        return _r

    def to_right_and_update_pintuan_data(self, data, pipeline):
        '''Update an existing group-buy record (site_id=18); None when skipped.'''
        try:
            tmp = _get_right_model_data(data=data, site_id=18)
        except Exception:
            print('此处抓到的可能是卷皮拼团券所以跳过')
            return None
        print('------>>>| 待存储的数据信息为: |', tmp.get('goods_id'))
        params = self._get_db_update_pintuan_params(item=tmp)
        sql_str = r'update dbo.juanpi_pintuan set modfiy_time=%s, shop_name=%s, goods_name=%s, sub_title=%s, price=%s, taobao_price=%s, sku_name=%s, sku_Info=%s, all_image_url=%s, property_info=%s, detail_info=%s, schedule=%s, is_delete=%s where goods_id = %s'
        pipeline._update_table(sql_str=sql_str, params=params)

    def _get_shop_name(self, data):
        '''
        :return: shop name from brand_info.title, falling back to
                 schedule_info.brand_title
        '''
        if data.get('brand_info') is not None:
            shop_name = data.get('brand_info', {}).get('title', '')
        else:
            shop_name = data.get('schedule_info', {}).get('brand_title', '')
        return shop_name

    def _get_detail_name_list(self, data):
        '''
        Collect spec-name labels from skudata.

        :return: list of {'spec_name': ...} on success; {} on error;
                 the str 'is_delete=1' when the goods is delisted
        '''
        sku = data.get('skudata', {}).get('sku', [])
        detail_name_list = []
        if sku != []:
            try:
                if sku[0].get('av_fvalue', '') == '':
                    pass  # no first-level spec
                else:
                    detail_name_list.append({
                        'spec_name':
                        data.get('skudata', {}).get('info', {}).get('fav_name', '')
                    })
            except IndexError:
                print('IndexError错误,此处跳过!')
                # delisted goods come through as sku == '' (a str)
                if isinstance(sku, str):
                    if sku == '':
                        return 'is_delete=1'
                return {}
            if sku[0].get('av_zvalue', '') == '':
                pass  # no second-level spec
            else:
                detail_name_list.append({
                    'spec_name':
                    data.get('skudata', {}).get('info', {}).get('zav_name', '')
                })
        return detail_name_list

    def _get_price_info_list_and_price_and_taobao_price(self, data):
        '''
        Build the per-spec price/stock list and the overall price range.

        juanpi does not expose stock counts, so rest_number is synthesized:
        '0' when sold out, '10' when 'stock_tips' == '库存紧张', else '50'.

        :return: (price_info_list, price, taobao_price) where price is the
                 max and taobao_price the min over all specs
        '''
        sku = data.get('skudata', {}).get('sku', [])
        price_info_list = []
        if len(sku) == 1 and sku[0].get(
                'av_fvalue', '') == '' and sku[0].get('av_zvalue') == '':
            # single default spec: price range collapses to one value
            price = round(float(sku[0].get('cprice')), 2)
            taobao_price = price
        else:
            for item in sku:
                tmp = {}
                spec_parts = []
                if item.get('av_fvalue', '') != '':
                    spec_parts.append(item.get('av_fvalue'))
                if item.get('av_zvalue', '') != '':
                    spec_parts.append(item.get('av_zvalue'))
                spec_value = '|'.join(spec_parts)
                if item.get('av_origin_zpic', '') != '':
                    tmp['img_url'] = item.get('av_origin_zpic', '')
                else:
                    tmp['img_url'] = ''
                if item.get('cprice', '') != '':
                    tmp['pintuan_price'] = item.get('cprice')
                    tmp['detail_price'] = item.get('sprice', '')
                    tmp['normal_price'] = item.get('price')
                else:
                    tmp['pintuan_price'] = item.get('price')
                    if item.get('sprice', '') != '':
                        tmp['detail_price'] = item.get('sprice', '')
                    else:
                        tmp['detail_price'] = item.get('price')
                    tmp['normal_price'] = item.get('price')
                if item.get('stock') == '0':
                    rest_number = '0'
                else:
                    rest_number = '50'
                    # NOTE(review): low-stock override assumed to apply only
                    # when not sold out -- confirm against upstream data
                    if item.get('stock_tips', '') != '' and item.get(
                            'stock_tips', '') == '库存紧张':
                        rest_number = '10'
                tmp['spec_value'] = spec_value
                tmp['rest_number'] = rest_number
                price_info_list.append(tmp)
            tmp_price_list = sorted([
                round(float(item.get('pintuan_price', '')), 2)
                for item in price_info_list
            ])
            if tmp_price_list == []:
                price = 0
                taobao_price = 0
            else:
                price = tmp_price_list[-1]     # highest spec price
                taobao_price = tmp_price_list[0]  # lowest spec price
        return price_info_list, price, taobao_price

    def _get_p_info(self, data):
        '''
        :return: [{'p_name': ..., 'p_value': ...}] from goodsDetail.attr,
                 with the shipping row replaced and '\\xa0' cleaned out
        '''
        p_info = []
        attr = data.get('goodsDetail', {}).get('attr', [])
        if attr != []:
            # non-dict items are skipped
            p_info = [{
                'p_name': item.get('st_key'),
                'p_value': item.get('st_value')
            } for item in attr if isinstance(item, dict)]
            for item in p_info:
                if item.get('p_name') == '运费':
                    # shipping value contains html markup; replace wholesale
                    item['p_value'] = '全国包邮(偏远地区除外)'
                tmp_p_value = item.get('p_value', '')
                item['p_value'] = re.compile(r'\xa0').sub(' ', tmp_p_value)
        return p_info

    def _get_div_desc(self, data):
        '''
        :return: an html <div> wrapping one <img> per goodsDetail.images entry
        '''
        div_images_list = data.get('goodsDetail', {}).get('images', [])
        tmp_div_desc = ''
        for item in div_images_list:
            tmp_div_desc += r'<img src="{}" style="height:auto;width:100%;"/>'.format(
                item)
        return '<div>' + tmp_div_desc + '</div>'

    def _get_goods_schedule(self, data):
        '''
        :return: [{'begin_time', 'end_time'}] from skudata.info (the times in
                 baseInfo are unreliable); [] when either bound is missing
        '''
        begin_time = data.get('skudata', {}).get('info', {}).get('start_time')
        end_time = data.get('skudata', {}).get('info', {}).get('end_time')
        if begin_time is None or end_time is None:
            schedule = []
        else:
            schedule = [{
                'begin_time': timestamp_to_regulartime(begin_time),
                'end_time': timestamp_to_regulartime(end_time),
            }]
        return schedule

    def _get_is_delete(self, data, schedule):
        '''
        :return: 1 when the sale window has passed or gstatus == '2'
                 (not on sale), else 0
        '''
        end_time = data.get('skudata', {}).get('info', {}).get('end_time')
        is_delete = 0
        if schedule != []:
            if float(end_time) < time.time():
                is_delete = 1
        # extra delist check added 2018-5-12: gstatus '1' = on sale
        if data.get('skudata', {}).get('info', {}).get('gstatus', '1') == '2':
            is_delete = 1
        return is_delete

    def _wash_main_data(self, main_data):
        '''Blank out bulky/unused sub-structures before caching main_data.'''
        try:
            main_data['commitments'] = ''
            main_data.get('discount', {})['coupon'] = ''
            main_data.get('discount', {})['coupon_index'] = ''
            main_data.get('discount', {})['vip_info'] = ''
            main_data['topbanner'] = ''
        except Exception:
            pass
        try:
            main_data.get('brand_info')['sub_goods'] = ''
        except Exception:
            pass
        return main_data

    def _get_db_update_params(self, item):
        '''
        Build the params tuple for the main-table update SQL; shelf_time /
        delete_time are spliced in before goods_id to match the SQL chosen
        in to_right_and_update_data.
        '''
        params = [
            item['modify_time'],
            item['shop_name'],
            item['account'],
            item['title'],
            item['sub_title'],
            item['link_name'],
            # price / taobao_price intentionally excluded
            dumps(item['price_info'], ensure_ascii=False),
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            item['is_delete'],
            dumps(item['schedule'], ensure_ascii=False),
            item['is_price_change'],
            dumps(item['price_change_info'], ensure_ascii=False),
            item['goods_id'],
        ]
        if item.get('delete_time', '') == '':
            params.insert(-1, item['shelf_time'])
        elif item.get('shelf_time', '') == '':
            params.insert(-1, item['delete_time'])
        else:
            params.insert(-1, item['shelf_time'])
            params.insert(-1, item['delete_time'])
        return tuple(params)

    def _get_db_insert_miaosha_params(self, item):
        '''Params tuple for the flash-sale insert SQL (json for list/dict fields).'''
        params = (
            item['goods_id'],
            item['goods_url'],
            item['username'],
            item['create_time'],
            item['modify_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),  # -> property_info
            item['div_desc'],                           # -> detail_info
            dumps(item['schedule'], ensure_ascii=False),
            dumps(item['stock_info'], ensure_ascii=False),
            dumps(item['miaosha_time'], ensure_ascii=False),
            item['miaosha_begin_time'],
            item['miaosha_end_time'],
            item['tab_id'],
            item['page'],
            item['site_id'],
            item['is_delete'],
        )
        return params

    def _get_db_update_miaosha_params(self, item):
        '''Params tuple for the flash-sale update SQL.'''
        params = (
            item['modify_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            item['is_delete'],
            dumps(item['schedule'], ensure_ascii=False),
            dumps(item['stock_info'], ensure_ascii=False),
            dumps(item['miaosha_time'], ensure_ascii=False),
            item['miaosha_begin_time'],
            item['miaosha_end_time'],
            item['goods_id'],
        )
        return params

    def _get_db_insert_pintuan_params(self, item):
        '''Params tuple for the group-buy insert SQL.'''
        params = (
            item['goods_id'],
            item['goods_url'],
            item['username'],
            item['create_time'],
            item['modify_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            item['all_sell_count'],
            dumps(item['p_info'], ensure_ascii=False),  # -> property_info
            item['div_desc'],                           # -> detail_info
            dumps(item['schedule'], ensure_ascii=False),
            item['pintuan_begin_time'],
            item['pintuan_end_time'],
            item['page'],
            item['site_id'],
            item['is_delete'],
        )
        return params

    def _get_db_update_pintuan_params(self, item):
        '''Params tuple for the group-buy update SQL.'''
        params = (
            item['modify_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),  # -> property_info
            item['div_desc'],                           # -> detail_info
            dumps(item['schedule'], ensure_ascii=False),
            item['is_delete'],
            item['goods_id'])
        return params

    def get_goods_id_from_url(self, juanpi_url):
        '''
        Extract the numeric goods_id from a juanpi goods url.

        :param juanpi_url: url starting with http://shop.juanpi.com/deal/
        :return: goods_id str, or '' when none can be extracted
        '''
        is_juanpi_url = re.compile(r'http://shop.juanpi.com/deal/.*?').findall(
            juanpi_url)
        if is_juanpi_url != []:
            id_pattern = re.compile(r'http://shop.juanpi.com/deal/(\d+).*?')
            if id_pattern.findall(juanpi_url) != []:
                tmp_juanpi_url = id_pattern.findall(juanpi_url)[0]
                if tmp_juanpi_url != '':
                    goods_id = tmp_juanpi_url
                else:
                    # strip trailing ';' pasted in by some IDEs, then retry
                    juanpi_url = re.compile(r';').sub('', juanpi_url)
                    goods_id = id_pattern.findall(juanpi_url)[0]
                print('------>>>| 得到的卷皮商品的地址为:', goods_id)
                return goods_id
            # BUGFIX: original fell through and implicitly returned None here
            return ''
        else:
            print(
                '卷皮商品url错误, 非正规的url, 请参照格式(http://shop.juanpi.com/deal/)开头的...'
            )
            return ''

    def __del__(self):
        # Best-effort cleanup; never raise from __del__.
        try:
            del self.my_phantomjs
            del self.result_data
        except Exception:
            pass
        gc.collect()
async def get_goods_data(self, jumei_pintuan_url):
    '''
    Asynchronously fetch the raw data for one jumei group-buy goods url.

    The ajaxDetail endpoint and the goods page are both fetched via
    phantomjs (plain requests were filtered; aiohttp was too slow), then the
    dict is enriched in place and cached in self.result_data.

    :param jumei_pintuan_url: goods url; its id/type pair is extracted first
    :return: enriched dict ({} on any failure)
    '''
    # NOTE(review): get_goods_id_from_url appears to return [item_id, type]
    # or [] on failure -- confirm against that coroutine
    goods_id = await self.get_goods_id_from_url(jumei_pintuan_url)
    if goods_id == []:
        self.result_data = {}
        return {}
    goods_url = 'https://s.h5.jumei.com/yiqituan/detail?item_id={0}&type={1}'.format(
        goods_id[0], goods_id[1])
    self.msg = '------>>>| 对应手机端地址为: ' + goods_url
    self.my_lg.info(self.msg)
    # ajaxDetail request carrying the structured goods data
    tmp_url = 'https://s.h5.jumei.com/yiqituan/ajaxDetail?item_id={0}&type={1}'.format(
        str(goods_id[0]), goods_id[1])
    my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
    body = my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
    try:
        # phantomjs wraps the raw json response in a <pre> element
        body = re.compile('<pre .*?>(.*)</pre>').findall(body)[0]
    except IndexError:
        body = ''
    tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(url=goods_url)
    try:
        del my_phantomjs
    except Exception:
        pass
    if body == '' or tmp_body == '':
        self.msg = '获取到的body为空str!' + ' 出错地址: ' + goods_url
        self.my_lg.error(self.msg)
        self.result_data = {}
        return {}
    data = await self.json_2_dict(json_str=body)
    if data == {}:
        self.msg = '出错地址: ' + goods_url
        self.my_lg.error(self.msg)
        self.result_data = {}
        return {}
    data = await self.wash_data(data=data)
    data = data.get('data', {})
    try:
        data['title'] = data.get('share_info', [])[1].get('text', '')
        data['title'] = re.compile(r'聚美').sub('', data['title'])
        if len(data.get('buy_alone', {})) == 1:
            data['sub_title'] = ''
        else:
            data['sub_title'] = data.get('buy_alone', {}).get('name', '')
            data['sub_title'] = re.compile(r'聚美').sub('', data['sub_title'])
        if data['title'] == '':
            self.my_lg.error('获取到的title为空值, 请检查!')
            raise Exception
        # shop_name ('shop_info' may be [] instead of a dict)
        if data.get('shop_info') == []:
            data['shop_name'] = ''
        else:
            data['shop_name'] = data.get('shop_info',
                                         {}).get('store_title', '')
        # all sample image urls
        data['all_img_url'] = await self.get_all_img_url(data=data)
        # detail-page property table
        data['p_info'] = await self.get_p_info(body=tmp_body)
        # description html
        div_desc = await self.get_div_desc(body=tmp_body)
        data['div_desc'] = await MyAiohttp.wash_html(div_desc)
        # on/off-shelf times come from the pintuan list api, not fetched here
        data['detail_name_list'] = await self.get_detail_name_list(
            size_attr=data.get('buy_alone', {}).get('size_attr', []))
        # per-spec price and stock
        true_sku_info = await self.get_true_sku_info(
            buy_alone_size=data.get('buy_alone', {}).get('size', []),
            size=data.get('size', []),
            group_single_price=data.get('group_single_price', ''))
        data['price_info_list'] = true_sku_info
        data['is_delete'] = await self.get_is_delete(
            product_status=data.get('product_status', ''),
            true_sku_info=true_sku_info)
        # total sales, e.g. '1.2万人购买' -> '12000'
        raw_sell_count = data.get('buyer_number_text', '')
        if raw_sell_count != '':
            all_sell_count = re.compile(r'(\d+\.?\d*)').findall(
                raw_sell_count)[0]
            # BUGFIX: original searched '万' in the already-extracted digits,
            # so the x10000 branch could never fire; check the raw text.
            if re.compile(r'万').findall(raw_sell_count) != []:
                all_sell_count = str(int(float(all_sell_count) * 10000))
        else:
            all_sell_count = '0'
        data['all_sell_count'] = all_sell_count
        data['goods_url'] = goods_url
    except Exception as e:
        self.msg = '遇到错误如下: ' + str(e) + ' 出错地址: ' + goods_url
        self.my_lg.error(self.msg)
        self.my_lg.exception(e)
        self.result_data = {}
        return {}
    if data != {}:
        self.result_data = data
        return data
    else:
        self.msg = 'data为空!' + ' 出错地址: ' + goods_url
        self.my_lg.error(self.msg)
        self.result_data = {}
        return {}
def __init__(self):
    '''Set default headers, reset the result cache and start a phantomjs driver.'''
    super(JuanPiParse, self).__init__()
    self._set_headers()
    self.result_data = {}  # raw parsed data cached between fetch and parse steps
    self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)