def deal_with_data(self, *params):
    '''
    Parse and persist the chuchujie flash-sale (miaosha) goods passed in.

    :param params: params[0] is a list of goods dicts (each carrying at
        least 'goods_id', plus 'sub_title'/'gender'/'page' fields).
    :return: None (results are written to the DB via the pipeline)
    '''
    item_list = params[0]
    chuchujie = ChuChuJie_9_9_Parse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        # site_id=24 marks chuchujie flash-sale rows; collect goods_ids
        # already stored so duplicates can be skipped below.
        sql_str = 'select goods_id, miaosha_time, gender, page, goods_url from dbo.chuchujie_xianshimiaosha where site_id=24'
        db_goods_id_list = [
            item[0]
            for item in list(my_pipeline._select_table(sql_str=sql_str))
        ]
        # print(db_goods_id_list)
        # my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        # index = 1
        for item in item_list:
            if item.get('goods_id', '') in db_goods_id_list:
                # Already stored for this site -> skip.
                print('该goods_id已经存在于数据库中, 此处跳过')
                pass
            else:
                goods_id = item.get('goods_id', '')
                # Mobile detail page for this goods_id.
                tmp_url = 'https://m.chuchujie.com/details/detail.html?id=' + str(
                    goods_id)
                chuchujie.get_goods_data(goods_id=goods_id)
                goods_data = chuchujie.deal_with_data()
                if goods_data == {}:
                    # Empty parse result -> skip this item.
                    pass
                elif goods_data.get('is_delete', 0) == 1:
                    # is_delete=1 means stock is 0 -> skip.
                    print('------>>>| 该商品库存为0,已被抢光!')
                    pass
                else:
                    # Otherwise parse the remaining-time fragment and insert.
                    # NOTE(review): a fresh PhantomJS instance is created and
                    # torn down per item — slow, but keeps sessions isolated.
                    my_phantomjs = MyPhantomjs(
                        executable_path=PHANTOMJS_DRIVER_PATH)
                    # Fetch only the countdown fragment (remaining time).
                    tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(
                        url=tmp_url, css_selector='p#activityTime span')
                    # print(tmp_body)
                    try:
                        del my_phantomjs
                    except:
                        pass
                    gc.collect()
                    if tmp_body == '':
                        # Failed to fetch the mobile page html -> skip.
                        sleep(.4)
                        pass
                    else:
                        # Extract countdown text from 'p#activityTime span'.
                        _t = Selector(text=tmp_body).css(
                            'p#activityTime span::text').extract_first()
                        _t = re.compile(r'剩余').sub('', _t)
                        # print(_t)
                        if _t == '' or _t is None:
                            # NOTE(review): only logs — execution still falls
                            # through to get_miaosha_end_time(_t) below.
                            print('获取到的_t为空值, 严重错误! 请检查!')
                        miaosha_end_time = self.get_miaosha_end_time(_t)
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['sub_title'] = item.get('sub_title', '')
                        # Begin time is "now"; end time comes from countdown.
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time':
                            timestamp_to_regulartime(int(time.time())),
                            'miaosha_end_time':
                            timestamp_to_regulartime(
                                int(miaosha_end_time)),
                        }
                        goods_data['miaosha_begin_time'], goods_data[
                            'miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=goods_data['miaosha_time'])
                        goods_data['gender'] = str(item.get('gender', '0'))
                        goods_data['page'] = item.get('page')
                        # pprint(goods_data)
                        # print(goods_data)
                        chuchujie.insert_into_chuchujie_xianshimiaosha_table(
                            data=goods_data, pipeline=my_pipeline)
                        # No sleep here: per-item PhantomJS startup is slow
                        # enough already.
                        # sleep(CHUCHUJIE_SLEEP_TIME)
                        # index += 1
    else:
        # DB connection failed -> nothing can be stored.
        print('数据库连接失败,此处跳过!')
        pass
    try:
        del chuchujie
    except:
        pass
    gc.collect()
async def get_goods_data(self, jumei_pintuan_url):
    '''
    Asynchronously fetch and assemble the raw data dict for one jumei
    group-buy (pintuan) goods page.

    :param jumei_pintuan_url: the pintuan goods url to parse
    :return: the assembled data dict (also stored in self.result_data),
        or {} on any failure
    '''
    goods_id = await self.get_goods_id_from_url(jumei_pintuan_url)
    if goods_id == []:
        self.result_data = {}
        return {}
    '''
    原先采用requests被过滤无返回结果, 于是用aiohttp无奈速度过慢, 换用phantomjs
    '''
    # Mobile group-buy goods page.
    goods_url = 'https://s.h5.jumei.com/yiqituan/detail?item_id={0}&type={1}'.format(
        goods_id[0], goods_id[1])
    self.msg = '------>>>| 对应手机端地址为: ' + goods_url
    self.my_lg.info(self.msg)

    # ajaxDetail endpoint carrying the structured goods data.
    tmp_url = 'https://s.h5.jumei.com/yiqituan/ajaxDetail?item_id={0}&type={1}'.format(
        str(goods_id[0]), [goods_id[1]][0])
    # self.headers['Referer'] = goods_url
    # params = {
    #     'item_id': str(goods_id[0]),
    #     'type': [goods_id[1]][0],
    # }
    # body = await MyAiohttp.aio_get_url_body(url=tmp_url, headers=self.headers, params=params, timeout=JUMEIYOUPIN_PINTUAN_GOODS_TIMEOUT)
    # # tmp_body of the original url
    # tmp_body = await MyAiohttp.aio_get_url_body(url=goods_url, headers=self.headers, timeout=JUMEIYOUPIN_PINTUAN_GOODS_TIMEOUT)
    # # print(tmp_body)
    ''' 换用phantomjs '''
    # requests was filtered and aiohttp too slow -> PhantomJS instead.
    my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH,
                               logger=self.my_lg)
    body = my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
    # print(body)
    try:
        # PhantomJS wraps the raw JSON response in a <pre> element.
        body = re.compile('<pre .*?>(.*)</pre>').findall(body)[0]
        # print(body)
    except IndexError:
        body = ''
    # Full page html (used later for p_info / div_desc extraction).
    tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(url=goods_url)
    # print(tmp_body)
    try:
        del my_phantomjs
    except:
        pass

    if body == '' or tmp_body == '':
        self.msg = '获取到的body为空str!' + ' 出错地址: ' + goods_url
        self.my_lg.error(self.msg)
        self.result_data = {}
        return {}

    data = await self.json_2_dict(json_str=body)
    if data == {}:
        self.msg = '出错地址: ' + goods_url
        self.my_lg.error(self.msg)
        self.result_data = {}
        return {}

    data = await self.wash_data(data=data)
    data = data.get('data', {})
    # pprint(data)
    try:
        # Title comes from the second share_info entry; strip the brand word.
        data['title'] = data.get('share_info', [])[1].get('text', '')
        data['title'] = re.compile(r'聚美').sub('', data['title'])
        if len(data.get('buy_alone', {})) == 1:
            data['sub_title'] = ''
        else:
            data['sub_title'] = data.get('buy_alone', {}).get('name', '')
            data['sub_title'] = re.compile(r'聚美').sub(
                '', data['sub_title'])
        # print(data['title'])
        if data['title'] == '':
            self.my_lg.error('获取到的title为空值, 请检查!')
            raise Exception

        # shop_name: empty list means no shop info present.
        if data.get('shop_info') == []:
            data['shop_name'] = ''
        else:
            data['shop_name'] = data.get('shop_info',
                                         {}).get('store_title', '')
        # print(data['shop_name'])

        # All sample image urls.
        all_img_url = await self.get_all_img_url(data=data)
        data['all_img_url'] = all_img_url

        # p_info (property table) parsed from the full page html.
        p_info = await self.get_p_info(body=tmp_body)
        data['p_info'] = p_info

        # div_desc (detail html), washed of unwanted markup.
        div_desc = await self.get_div_desc(body=tmp_body)
        div_desc = await MyAiohttp.wash_html(div_desc)
        # print(div_desc)
        data['div_desc'] = div_desc

        # Shelf times are available from the pintuan list api; skipped here.

        # detail_name_list from the stand-alone purchase size attributes.
        detail_name_list = await self.get_detail_name_list(
            size_attr=data.get('buy_alone', {}).get('size_attr', []))
        data['detail_name_list'] = detail_name_list

        # Per-sku price and stock info.
        true_sku_info = await self.get_true_sku_info(
            buy_alone_size=data.get('buy_alone', {}).get('size', []),
            size=data.get('size', []),
            group_single_price=data.get('group_single_price', ''))
        data['price_info_list'] = true_sku_info

        # is_delete derived from product status + sku availability.
        product_status = data.get('product_status', '')
        is_delete = await self.get_is_delete(product_status=product_status,
                                             true_sku_info=true_sku_info)
        data['is_delete'] = is_delete

        # all_sell_count: numeric part of e.g. "1.2万人购买".
        all_sell_count = data.get('buyer_number_text', '')
        if all_sell_count != '':
            all_sell_count = re.compile(r'(\d+\.?\d*)').findall(
                all_sell_count)[0]
            # NOTE(review): '万' is searched in the already-extracted numeric
            # substring, so this branch can never match — looks like it was
            # meant to run against the original text. Verify upstream.
            is_W = re.compile(r'万').findall(all_sell_count)
            if is_W != []:
                all_sell_count = str(int(float(all_sell_count) * 10000))
        else:
            all_sell_count = '0'
        data['all_sell_count'] = all_sell_count

        data['goods_url'] = goods_url
    except Exception as e:
        self.msg = '遇到错误如下: ' + str(e) + ' 出错地址: ' + goods_url
        self.my_lg.error(self.msg)
        self.my_lg.exception(e)
        self.result_data = {}  # reset so a stale value never gets stored
        return {}

    if data != {}:
        # pprint(data)
        self.result_data = data
        return data
    else:
        self.msg = 'data为空!' + ' 出错地址: ' + goods_url
        self.my_lg.error(self.msg)
        self.result_data = {}  # reset so a stale value never gets stored
        return {}
class JuanPiParse(object):
    """Parser for juanpi.com goods pages.

    Fetches the mobile goods page via PhantomJS, extracts the embedded
    ``__PRELOADED_STATE__`` JSON plus the separate sku endpoint, and maps
    the result into the row formats expected by the SQL Server pipelines
    (normal goods table, flash-sale table, group-buy table).
    """

    def __init__(self):
        super(JuanPiParse, self).__init__()
        self._set_headers()
        self.result_data = {}  # last successfully parsed goods dict
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)

    def _set_headers(self):
        # Base request headers for web.juanpi.com with a random PC UA.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'web.juanpi.com',
            'User-Agent': get_random_pc_ua(),  # random UA per run
        }

    def get_goods_data(self, goods_id):
        '''
        Fetch the raw goods data for one juanpi goods_id.

        :param goods_id: juanpi goods id (str); '' short-circuits to {}
        :return: dict with the washed detail data + 'skudata' + 'goods_id',
            or {} on any failure (self.result_data mirrors the result)
        '''
        if goods_id == '':
            self.result_data = {}  # reset so stale data is never stored
            return {}
        else:
            tmp_url = 'https://web.juanpi.com/pintuan/shop/' + str(goods_id)
            print('------>>>| 得到的商品手机版的地址为: ', tmp_url)

            # Plain requests worked for about a month, then started getting
            # "Not Found" responses — PhantomJS is used instead (do not use
            # behind a proxy/VPN).
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                url=tmp_url,
                css_selector='div.sc-kgoBCf.bTQvTk')  # mobile title block
            if body == '':
                print('获取到的body为空str!请检查!')
                self.result_data = {}
                return {}

            # Greedy match of the preloaded state JSON embedded in the page.
            data = re.compile(
                r'__PRELOADED_STATE__ = (.*);</script> <style ').findall(
                    body)

            # sku data endpoint (the older getOtherInfo api was retired).
            skudata_url = 'https://webservice.juanpi.com/api/getMemberAboutInfo?goods_id=' + str(
                goods_id)
            # Copy the base headers before overriding Host — assigning the
            # dict directly would mutate self.headers for every later use.
            self.skudata_headers = dict(self.headers)
            self.skudata_headers.update({'Host': 'webservice.juanpi.com'})
            skudata_body = MyRequests.get_url_body(
                url=skudata_url, headers=self.skudata_headers)
            if skudata_body == '':
                print('获取到的skudata_body为空str!请检查!')
                self.result_data = {}
                return {}

            skudata = re.compile(r'(.*)').findall(skudata_body)  # greedy
            if skudata != []:
                skudata = skudata[0]
                try:
                    skudata = json.loads(skudata)
                except Exception:
                    self.result_data = {}
                    return {}
                skudata = skudata.get('skudata', {})
                # pprint(skudata)
                try:
                    if skudata.get('info') is not None:
                        pass  # got valid skudata
                    else:
                        print('skudata中info的key为None, 返回空dict')
                        self.result_data = {}
                        return {}
                except AttributeError as e:
                    print('遇到错误如下(先跳过!): ', e)
                    self.result_data = {}
                    return {}
            else:
                print('skudata为空!')
                self.result_data = {}
                return {}

            if data != []:
                main_data = data[0]
                try:
                    main_data = json.loads(main_data)
                except Exception:
                    self.result_data = {}
                    return {}
                if main_data.get('detail') is not None:
                    main_data = self._wash_main_data(
                        main_data.get('detail', {}))
                    main_data['skudata'] = skudata
                    main_data['goods_id'] = goods_id
                    self.result_data = main_data
                    return main_data
                else:
                    print('data中detail的key为None, 返回空dict')
                    self.result_data = {}
                    return {}
            else:
                print('data为空!')
                self.result_data = {}
                return {}

    def deal_with_data(self):
        '''
        Convert self.result_data into the normalized goods dict used by
        the storage methods.

        :return: normalized dict, or {} when there is nothing to process
        '''
        data = self.result_data
        if data != {}:
            shop_name = self._get_shop_name(data=data)
            account = ''  # shopkeeper — not available on juanpi
            title = data.get('baseInfo', {}).get('title', '')
            sub_title = ''

            # Spec (label) names; a str return means the goods is off-shelf.
            detail_name_list = self._get_detail_name_list(data=data)
            if isinstance(detail_name_list, str):
                if detail_name_list == 'is_delete=1':
                    # Mark the stored row as deleted.
                    print('该商品已下架...')
                    sql_str = 'update dbo.GoodsInfoAutoGet set IsDelete=1 where GoodsID=%s'
                    params = (self.result_data.get('goods_id', ''), )
                    _ = SqlServerMyPageInfoSaveItemPipeline()
                    result = _._update_table(sql_str=sql_str, params=params)
                    if result:
                        print('### 该商品已经is_delete=1 ###')
                    else:
                        print('is_delete=1标记失败!')
                # NOTE(review): a str detail_name_list falls through and is
                # used below as-is — verify whether an early return was
                # intended here.
            if detail_name_list == {}:
                self.result_data = {}
                return {}
            # print(detail_name_list)

            # Per-spec price/stock plus the overall max/min prices.
            price_info_list, price, taobao_price = self._get_price_info_list_and_price_and_taobao_price(
                data=data)
            # pprint(price_info_list)

            # Sample image urls (default to [] — key may be absent).
            all_img_url = [{
                'img_url': item
            } for item in data.get('goodImages', [])]
            # print(all_img_url)

            p_info = self._get_p_info(data=data)  # property table
            div_desc = self._get_div_desc(data=data)  # detail html
            schedule = self._get_goods_schedule(data=data)  # sale window
            is_delete = self._get_is_delete(data=data, schedule=schedule)
            if price == 0 or taobao_price == 0:
                # No price at all means the goods is already off-shelf.
                is_delete = 1
            # print('is_delete = ', is_delete)

            result = {
                'shop_name': shop_name,            # shop name
                'account': account,                # shopkeeper
                'title': title,                    # goods title
                'sub_title': sub_title,            # subtitle
                'price': price,                    # max price
                'taobao_price': taobao_price,      # min price
                'detail_name_list': detail_name_list,  # spec label names
                'price_info_list': price_info_list,    # per-spec price/stock
                'all_img_url': all_img_url,        # sample image urls
                'p_info': p_info,                  # property name/value pairs
                'div_desc': div_desc,              # detail html
                'is_delete': is_delete,            # off-shelf flag
                'schedule': schedule,              # sale time window
            }
            # pprint(result)
            gc.collect()
            return result
        else:
            print('待处理的data为空的dict')
            return {}

    def to_right_and_update_data(self, data, pipeline):
        '''Update an existing dbo.GoodsInfoAutoGet row (prices untouched).'''
        data_list = data
        tmp = GoodsItem()
        tmp['goods_id'] = data_list['goods_id']  # official goods id
        now_time = get_shanghai_time()
        tmp['modify_time'] = now_time  # modification time
        tmp['shop_name'] = data_list['shop_name']
        tmp['title'] = data_list['title']
        tmp['sub_title'] = data_list['sub_title']
        tmp['link_name'] = ''  # seller name (not available)
        tmp['account'] = data_list['account']
        # Max price / min price (kept for the params builder even though
        # the non-price sql below does not update them).
        tmp['price'] = Decimal(data_list['price']).__round__(2)
        tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
        tmp['price_info'] = []
        tmp['detail_name_list'] = data_list['detail_name_list']
        tmp['price_info_list'] = data_list.get(
            'price_info_list')  # per-spec price/stock
        tmp['all_img_url'] = data_list.get('all_img_url')
        tmp['p_info'] = data_list.get('p_info')
        tmp['div_desc'] = data_list.get('div_desc')
        tmp['schedule'] = data_list.get('schedule')
        tmp['is_delete'] = data_list.get('is_delete')  # 0 live, 1 deleted
        tmp['shelf_time'] = data_list.get('shelf_time', '')
        tmp['delete_time'] = data_list.get('delete_time', '')
        tmp['is_price_change'] = data_list.get('_is_price_change')
        tmp['price_change_info'] = data_list.get('_price_change_info')
        params = self._get_db_update_params(item=tmp)
        # Three variants of the (price-preserving) update statement,
        # depending on which of shelf_time / delete_time is present —
        # must stay in sync with the insert order in _get_db_update_params.
        if tmp['delete_time'] == '':
            sql_str = 'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, IsDelete=%s, Schedule=%s, IsPriceChange=%s, PriceChangeInfo=%s, shelf_time=%s where GoodsID = %s'
        elif tmp['shelf_time'] == '':
            sql_str = 'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, IsDelete=%s, Schedule=%s, IsPriceChange=%s, PriceChangeInfo=%s, delete_time=%s where GoodsID = %s'
        else:
            sql_str = 'update dbo.GoodsInfoAutoGet set ModfiyTime = %s, ShopName=%s, Account=%s, GoodsName=%s, SubTitle=%s, LinkName=%s, PriceInfo=%s, SKUName=%s, SKUInfo=%s, ImageUrl=%s, PropertyInfo=%s, DetailInfo=%s, IsDelete=%s, Schedule=%s, IsPriceChange=%s, PriceChangeInfo=%s, shelf_time=%s, delete_time=%s where GoodsID = %s'
        pipeline._update_table(sql_str=sql_str, params=params)

    def insert_into_juanpi_xianshimiaosha_table(self, data, pipeline):
        '''Insert one flash-sale goods row into dbo.juanpi_xianshimiaosha.'''
        data_list = data
        tmp = {}
        tmp['goods_id'] = data_list['goods_id']      # official goods id
        tmp['spider_url'] = data_list['spider_url']  # goods url
        tmp['username'] = data_list['username']      # operator username
        now_time = get_shanghai_time()
        tmp['deal_with_time'] = now_time  # creation time
        tmp['modfiy_time'] = now_time     # modification time
        tmp['shop_name'] = data_list['shop_name']
        tmp['title'] = data_list['title']
        tmp['sub_title'] = data_list['sub_title']
        # Max price / min price.
        tmp['price'] = Decimal(data_list['price']).__round__(2)
        tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
        tmp['detail_name_list'] = data_list['detail_name_list']
        tmp['price_info_list'] = data_list.get(
            'price_info_list')  # per-spec price/stock
        tmp['all_img_url'] = data_list.get('all_img_url')
        tmp['p_info'] = data_list.get('p_info')
        tmp['div_desc'] = data_list.get('div_desc')
        tmp['schedule'] = data_list.get('schedule')
        tmp['stock_info'] = data_list.get('stock_info')
        tmp['miaosha_time'] = data_list.get('miaosha_time')
        tmp['miaosha_begin_time'] = data_list.get('miaosha_begin_time')
        tmp['miaosha_end_time'] = data_list.get('miaosha_end_time')
        tmp['tab_id'] = data_list.get('tab_id')
        tmp['page'] = data_list.get('page')  # source page of the crawl
        tmp['site_id'] = 15  # crawl source id (juanpi flash-sale goods)
        tmp['is_delete'] = data_list.get('is_delete')  # 0 live, 1 deleted
        print('------>>> | 待存储的数据信息为: |', tmp.get('goods_id'))
        params = self._get_db_insert_miaosha_params(item=tmp)
        sql_str = r'insert into dbo.juanpi_xianshimiaosha(goods_id, goods_url, username, create_time, modfiy_time, shop_name, goods_name, sub_title, price, taobao_price, sku_name, sku_info, all_image_url, property_info, detail_info, schedule, stock_info, miaosha_time, miaosha_begin_time, miaosha_end_time, tab_id, page, site_id, is_delete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        pipeline._insert_into_table(sql_str=sql_str, params=params)

    def to_update_juanpi_xianshimiaosha_table(self, data, pipeline):
        '''Update one flash-sale goods row in dbo.juanpi_xianshimiaosha.'''
        data_list = data
        tmp = {}
        tmp['goods_id'] = data_list['goods_id']  # official goods id
        now_time = get_shanghai_time()
        tmp['modfiy_time'] = now_time  # modification time
        tmp['shop_name'] = data_list['shop_name']
        tmp['title'] = data_list['title']
        tmp['sub_title'] = data_list['sub_title']
        # Max price / min price.
        tmp['price'] = Decimal(data_list['price']).__round__(2)
        tmp['taobao_price'] = Decimal(data_list['taobao_price']).__round__(2)
        tmp['detail_name_list'] = data_list['detail_name_list']
        tmp['price_info_list'] = data_list.get(
            'price_info_list')  # per-spec price/stock
        tmp['all_img_url'] = data_list.get('all_img_url')
        tmp['p_info'] = data_list.get('p_info')
        tmp['div_desc'] = data_list.get('div_desc')
        tmp['schedule'] = data_list.get('schedule')
        tmp['stock_info'] = data_list.get('stock_info')
        tmp['miaosha_time'] = data_list.get('miaosha_time')
        tmp['miaosha_begin_time'] = data_list.get('miaosha_begin_time')
        tmp['miaosha_end_time'] = data_list.get('miaosha_end_time')
        tmp['is_delete'] = data_list.get('is_delete')  # 0 live, 1 deleted
        print('------>>>| 待存储的数据信息为: |', tmp.get('goods_id'))
        params = self._get_db_update_miaosha_params(item=tmp)
        sql_str = r'update dbo.juanpi_xianshimiaosha set modfiy_time = %s, shop_name=%s, goods_name=%s, sub_title=%s, price=%s, taobao_price=%s, sku_name=%s, sku_info=%s, all_image_url=%s, property_info=%s, detail_info=%s, is_delete=%s, schedule=%s, stock_info=%s, miaosha_time=%s, miaosha_begin_time=%s, miaosha_end_time=%s where goods_id = %s'
        pipeline._update_table(sql_str=sql_str, params=params)

    def insert_into_juuanpi_pintuan_table(self, data, pipeline):
        '''Insert one group-buy goods row into dbo.juanpi_pintuan.

        (Method name keeps its historical spelling for caller compat.)
        '''
        data_list = data
        tmp = {}
        tmp['goods_id'] = data_list['goods_id']      # official goods id
        tmp['spider_url'] = data_list['spider_url']  # goods url
        tmp['username'] = data_list['username']      # operator username
        now_time = get_shanghai_time()
        tmp['deal_with_time'] = now_time  # creation time
        tmp['modfiy_time'] = now_time     # modification time
        tmp['shop_name'] = data_list['shop_name']
        tmp['title'] = data_list['title']
        tmp['sub_title'] = data_list['sub_title']
        # Max price / min price; a non-numeric price means we grabbed a
        # juanpi group-buy coupon instead of a goods — skip it.
        try:
            tmp['price'] = Decimal(data_list['price']).__round__(2)
            tmp['taobao_price'] = Decimal(
                data_list['taobao_price']).__round__(2)
        except Exception:
            print('此处抓到的可能是卷皮拼团券所以跳过')
            return None
        tmp['detail_name_list'] = data_list['detail_name_list']
        tmp['price_info_list'] = data_list.get(
            'price_info_list')  # per-spec price/stock
        tmp['all_img_url'] = data_list.get('all_img_url')
        tmp['all_sell_count'] = data_list.get('all_sell_count')  # total sales
        tmp['p_info'] = data_list.get('p_info')
        tmp['div_desc'] = data_list.get('div_desc')
        tmp['schedule'] = data_list.get('schedule')
        tmp['pintuan_begin_time'] = data_list.get('pintuan_begin_time')
        tmp['pintuan_end_time'] = data_list.get('pintuan_end_time')
        tmp['page'] = data_list.get('page')  # source page of the crawl
        tmp['site_id'] = 18  # crawl source id (juanpi group-buy goods)
        tmp['is_delete'] = data_list.get('is_delete')  # 0 live, 1 deleted
        print('------>>> | 待存储的数据信息为: |', tmp.get('goods_id'))
        params = self._get_db_insert_pintuan_params(item=tmp)
        sql_str = r'insert into dbo.juanpi_pintuan(goods_id, goods_url, username, create_time, modfiy_time, shop_name, goods_name, sub_title, price, taobao_price, sku_name, sku_info, all_image_url, all_sell_count, property_info, detail_info, schedule, miaosha_begin_time, miaosha_end_time, page, site_id, is_delete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        _r = pipeline._insert_into_table(sql_str=sql_str, params=params)
        return _r

    def to_right_and_update_pintuan_data(self, data, pipeline):
        '''Update one group-buy goods row in dbo.juanpi_pintuan.'''
        data_list = data
        tmp = {}
        tmp['goods_id'] = data_list['goods_id']  # official goods id
        now_time = get_shanghai_time()
        tmp['modfiy_time'] = now_time  # modification time
        tmp['shop_name'] = data_list['shop_name']
        tmp['title'] = data_list['title']
        tmp['sub_title'] = data_list['sub_title']
        # Max price / min price; non-numeric means a coupon — skip.
        try:
            tmp['price'] = Decimal(data_list['price']).__round__(2)
            tmp['taobao_price'] = Decimal(
                data_list['taobao_price']).__round__(2)
        except Exception:
            print('此处抓到的可能是卷皮拼团券所以跳过')
            return None
        tmp['detail_name_list'] = data_list['detail_name_list']
        tmp['price_info_list'] = data_list.get(
            'price_info_list')  # per-spec price/stock
        tmp['all_img_url'] = data_list.get('all_img_url')
        tmp['p_info'] = data_list.get('p_info')
        tmp['div_desc'] = data_list.get('div_desc')
        tmp['schedule'] = data_list.get('schedule')
        tmp['is_delete'] = data_list.get('is_delete')  # 0 live, 1 deleted
        print('------>>>| 待存储的数据信息为: |', tmp.get('goods_id'))
        params = self._get_db_update_pintuan_params(item=tmp)
        sql_str = r'update dbo.juanpi_pintuan set modfiy_time=%s, shop_name=%s, goods_name=%s, sub_title=%s, price=%s, taobao_price=%s, sku_name=%s, sku_Info=%s, all_image_url=%s, property_info=%s, detail_info=%s, schedule=%s, is_delete=%s where goods_id = %s'
        pipeline._update_table(sql_str=sql_str, params=params)

    def _get_shop_name(self, data):
        '''
        Extract the shop name.

        :param data: parsed goods dict
        :return: shop name str ('' when absent)
        '''
        if data.get('brand_info') is not None:
            shop_name = data.get('brand_info', {}).get('title', '')
        else:
            # Fall back to the schedule block's brand title.
            shop_name = data.get('schedule_info', {}).get('brand_title', '')
        return shop_name

    def _get_detail_name_list(self, data):
        '''
        Extract the spec (label) names.

        :param data: parsed goods dict
        :return: list of {'spec_name': ...} on success; {} on error;
            the sentinel str 'is_delete=1' when the goods is off-shelf
        '''
        sku = data.get('skudata', {}).get('sku', [])
        # pprint(sku)
        detail_name_list = []
        if sku != []:
            try:
                if sku[0].get('av_fvalue', '') == '':
                    fav_name = ''
                    pass
                else:
                    tmp = {}
                    fav_name = data.get('skudata',
                                        {}).get('info',
                                                {}).get('fav_name', '')
                    tmp['spec_name'] = fav_name
                    detail_name_list.append(tmp)
            except IndexError:
                # sku may be a str here (''[0] raises IndexError).
                print('IndexError错误,此处跳过!')
                # print(sku)
                if isinstance(sku, str):  # off-shelf goods come back as ''
                    if sku == '':
                        return 'is_delete=1'
                return {}
            if sku[0].get('av_zvalue', '') == '':
                zav_name = ''
            else:
                tmp = {}
                zav_name = data.get('skudata',
                                    {}).get('info', {}).get('zav_name', '')
                tmp['spec_name'] = zav_name
                detail_name_list.append(tmp)
        return detail_name_list

    def _get_price_info_list_and_price_and_taobao_price(self, data):
        '''
        Extract per-spec price/stock plus overall max/min prices.

        :param data: parsed goods dict (analysis shows sku is never [])
        :return: tuple (price_info_list, price, taobao_price)
        '''
        sku = data.get('skudata', {}).get('sku', [])
        # pprint(sku)
        price_info_list = []
        if len(sku) == 1 and sku[0].get(
                'av_fvalue', '') == '' and sku[0].get('av_zvalue') == '':
            # Single default sku without specs.
            price = round(float(sku[0].get('cprice')), 2)  # max price
            taobao_price = price                           # min price
        else:
            # Specs present. juanpi does not expose stock numbers:
            # 'stock'='1' means in stock (assume 50), 'stock'='0' means
            # sold out, and 'stock_tips'='库存紧张' lowers it to 10.
            for item in sku:
                tmp = {}
                tmp_1 = []
                if item.get('av_fvalue', '') == '':
                    pass
                else:
                    tmp_1.append(item.get('av_fvalue'))
                if item.get('av_zvalue', '') == '':
                    pass
                else:
                    tmp_1.append(item.get('av_zvalue'))
                tmp_1 = '|'.join(tmp_1)  # combined spec value
                if item.get('av_origin_zpic', '') != '':
                    tmp['img_url'] = item.get('av_origin_zpic', '')
                else:
                    tmp['img_url'] = ''
                if item.get('cprice', '') != '':
                    tmp['pintuan_price'] = item.get('cprice')
                    tmp['detail_price'] = item.get('sprice', '')
                    tmp['normal_price'] = item.get('price')
                else:
                    tmp['pintuan_price'] = item.get('price')
                    if item.get('sprice', '') != '':
                        tmp['detail_price'] = item.get('sprice', '')
                    else:
                        tmp['detail_price'] = item.get('price')
                    tmp['normal_price'] = item.get('price')
                if item.get('stock') == '0':
                    rest_number = '0'
                else:  # 'stock' == '1'
                    rest_number = '50'
                    if item.get('stock_tips', '') != '' and item.get(
                            'stock_tips', '') == '库存紧张':
                        rest_number = '10'
                tmp['spec_value'] = tmp_1
                tmp['rest_number'] = rest_number
                price_info_list.append(tmp)
            # Derive max/min from the per-spec pintuan prices.
            tmp_price_list = sorted([
                round(float(item.get('pintuan_price', '')), 2)
                for item in price_info_list
            ])
            # print(tmp_price_list)
            if tmp_price_list == []:
                price = 0
                taobao_price = 0
            else:
                price = tmp_price_list[-1]      # max price
                taobao_price = tmp_price_list[0]  # min price
        return price_info_list, price, taobao_price

    def _get_p_info(self, data):
        '''
        Extract the property table (name/value pairs).

        :param data: parsed goods dict
        :return: list of {'p_name': ..., 'p_value': ...}
        '''
        p_info = []
        attr = data.get('goodsDetail', {}).get('attr', [])
        # print(attr)
        if attr != []:
            # Skip entries that are plain strs, keep only dicts.
            p_info = [{
                'p_name': item.get('st_key'),
                'p_value': item.get('st_value')
            } for item in attr if isinstance(item, dict)]
            for item in p_info:
                if item.get('p_name') == '运费':
                    # The shipping value contains html — replace it with a
                    # fixed plain-text label.
                    item['p_value'] = '全国包邮(偏远地区除外)'
                # Replace non-breaking spaces with plain spaces.
                tmp_p_value = item.get('p_value', '')
                tmp_p_value = re.compile(r'\xa0').sub(' ', tmp_p_value)
                item['p_value'] = tmp_p_value
        return p_info

    def _get_div_desc(self, data):
        '''
        Build the detail html from the detail image urls.

        :param data: parsed goods dict
        :return: '<div>…</div>' html str
        '''
        div_images_list = data.get('goodsDetail', {}).get('images', [])
        tmp_div_desc = ''
        for item in div_images_list:
            tmp = r'<img src="{}" style="height:auto;width:100%;"/>'.format(
                item)
            tmp_div_desc += tmp
        return '<div>' + tmp_div_desc + '</div>'

    def _get_goods_schedule(self, data):
        '''
        Extract the sale time window.

        :param data: parsed goods dict
        :return: [] or [{'begin_time': ..., 'end_time': ...}]
        '''
        # skudata.info carries the real sale window — baseInfo's is bogus.
        begin_time = data.get('skudata', {}).get('info', {}).get(
            'start_time')
        end_time = data.get('skudata', {}).get('info', {}).get('end_time')
        if begin_time is None or end_time is None:
            schedule = []
        else:
            schedule = [{
                'begin_time': timestamp_to_regulartime(begin_time),
                'end_time': timestamp_to_regulartime(end_time),
            }]
        return schedule

    def _get_is_delete(self, data, schedule):
        '''
        Derive the on/off-shelf state.

        :param data: parsed goods dict
        :param schedule: result of _get_goods_schedule
        :return: 0 live, 1 off-shelf
        '''
        end_time = data.get('skudata', {}).get('info', {}).get('end_time')
        is_delete = 0
        # An end timestamp in the past means the goods can no longer be
        # bought (sold-out specs are handled separately upstream).
        if schedule != []:
            if data.get('baseInfo', {}).get('end_time') is not None:
                # baseInfo end_time=='0' used to mean off-shelf, but that
                # check proved unreliable and is disabled.
                # base_info_end_time = data.get('baseInfo', {}).get('end_time')
                # if base_info_end_time == '0':
                #     is_delete = 1
                pass
            if float(end_time) < time.time():
                # Sale window already expired.
                is_delete = 1
        # Additional off-shelf check added 2018-5-12: gstatus '1' = on sale.
        if data.get('skudata', {}).get('info', {}).get('gstatus',
                                                       '1') == '2':
            is_delete = 1
        return is_delete

    def _wash_main_data(self, main_data):
        '''
        Blank out bulky/unused sections of the preloaded state.

        :param main_data: raw 'detail' dict
        :return: the washed dict (mutated in place)
        '''
        try:
            main_data['commitments'] = ''
            main_data.get('discount', {})['coupon'] = ''
            main_data.get('discount', {})['coupon_index'] = ''
            main_data.get('discount', {})['vip_info'] = ''
            main_data['topbanner'] = ''
        except Exception:
            pass
        try:
            main_data.get('brand_info')['sub_goods'] = ''
        except Exception:
            pass
        return main_data

    def _get_db_update_params(self, item):
        '''
        Build the params tuple for the GoodsInfoAutoGet update sql.

        shelf_time/delete_time are inserted just before goods_id in the
        same order the three sql variants expect.
        '''
        params = [
            item['modify_time'],
            item['shop_name'],
            item['account'],
            item['title'],
            item['sub_title'],
            item['link_name'],
            dumps(item['price_info'], ensure_ascii=False),
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            item['is_delete'],
            dumps(item['schedule'], ensure_ascii=False),
            item['is_price_change'],
            dumps(item['price_change_info'], ensure_ascii=False),
            item['goods_id'],
        ]
        if item.get('delete_time', '') == '':
            params.insert(-1, item['shelf_time'])
        elif item.get('shelf_time', '') == '':
            params.insert(-1, item['delete_time'])
        else:
            params.insert(-1, item['shelf_time'])
            params.insert(-1, item['delete_time'])
        return tuple(params)

    def _get_db_insert_miaosha_params(self, item):
        '''Build the params tuple for the flash-sale insert sql.'''
        params = (
            item['goods_id'],
            item['spider_url'],
            item['username'],
            item['deal_with_time'],
            item['modfiy_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            # lists must be json-serialized (ensure_ascii=False) to insert.
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),  # -> PropertyInfo
            item['div_desc'],                           # -> DetailInfo
            dumps(item['schedule'], ensure_ascii=False),
            dumps(item['stock_info'], ensure_ascii=False),
            dumps(item['miaosha_time'], ensure_ascii=False),
            item['miaosha_begin_time'],
            item['miaosha_end_time'],
            item['tab_id'],
            item['page'],
            item['site_id'],
            item['is_delete'],
        )
        return params

    def _get_db_update_miaosha_params(self, item):
        '''Build the params tuple for the flash-sale update sql.'''
        params = (
            item['modfiy_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            item['is_delete'],
            dumps(item['schedule'], ensure_ascii=False),
            dumps(item['stock_info'], ensure_ascii=False),
            dumps(item['miaosha_time'], ensure_ascii=False),
            item['miaosha_begin_time'],
            item['miaosha_end_time'],
            item['goods_id'],
        )
        return params

    def _get_db_insert_pintuan_params(self, item):
        '''Build the params tuple for the group-buy insert sql.'''
        params = (
            item['goods_id'],
            item['spider_url'],
            item['username'],
            item['deal_with_time'],
            item['modfiy_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            # lists must be json-serialized (ensure_ascii=False) to insert.
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            item['all_sell_count'],
            dumps(item['p_info'], ensure_ascii=False),  # -> PropertyInfo
            item['div_desc'],                           # -> DetailInfo
            dumps(item['schedule'], ensure_ascii=False),
            item['pintuan_begin_time'],
            item['pintuan_end_time'],
            item['page'],
            item['site_id'],
            item['is_delete'],
        )
        return params

    def _get_db_update_pintuan_params(self, item):
        '''Build the params tuple for the group-buy update sql.'''
        params = (
            item['modfiy_time'],
            item['shop_name'],
            item['title'],
            item['sub_title'],
            item['price'],
            item['taobao_price'],
            # lists must be json-serialized (ensure_ascii=False) to insert.
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),  # -> PropertyInfo
            item['div_desc'],                           # -> DetailInfo
            dumps(item['schedule'], ensure_ascii=False),
            item['is_delete'],
            item['goods_id'])
        return params

    def get_goods_id_from_url(self, juanpi_url):
        '''
        Extract the goods_id from a juanpi goods url.

        :param juanpi_url: url expected to start with
            http://shop.juanpi.com/deal/
        :return: goods_id str ('' for a non-juanpi url)
        '''
        is_juanpi_url = re.compile(r'http://shop.juanpi.com/deal/.*?').findall(
            juanpi_url)
        if is_juanpi_url != []:
            if re.compile(r'http://shop.juanpi.com/deal/(\d+).*?').findall(
                    juanpi_url) != []:
                tmp_juanpi_url = re.compile(
                    r'http://shop.juanpi.com/deal/(\d+).*?').findall(
                        juanpi_url)[0]
                if tmp_juanpi_url != '':
                    goods_id = tmp_juanpi_url
                else:
                    # Unreachable in practice (\d+ can't match '') — kept
                    # only so IDEs don't flag goods_id as possibly unbound.
                    juanpi_url = re.compile(r';').sub('', juanpi_url)
                    goods_id = re.compile(
                        r'http://shop.juanpi.com/deal/(\d+).*?').findall(
                            juanpi_url)[0]
            print('------>>>| 得到的卷皮商品的地址为:', goods_id)
            return goods_id
        else:
            print(
                '卷皮商品url错误, 非正规的url, 请参照格式(http://shop.juanpi.com/deal/)开头的...'
            )
            return ''

    def __del__(self):
        # Best-effort cleanup; attributes may already be gone at shutdown.
        try:
            del self.my_phantomjs
            del self.result_data
        except Exception:
            pass
        gc.collect()
class BaseFund(object):
    '''Crawler for the eastmoney (天天基金) open-fund ranking: fetches the rank
    list, pulls each fund's pingzhongdata JS blob, and renders the unit
    net-value trend of every fund to a PNG under ``base_path``.'''

    def __init__(self, base_path='/Users/afa/myFiles/tmp/基金/伪好基/'):
        '''
        :param base_path: directory where the fund chart images are stored
        '''
        self.page_num_start = 1     # first page of the open-fund ranking to crawl
        self.page_num_end = 3       # stop page (exclusive, used as range() end)
        self.CRAWL_FUND_TIME = 1.5  # sleep time between crawling individual funds
        self.plot_pic = None
        self.base_path = base_path
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_PATH)

    def _get_rank_fund_info(self):
        '''
        Fetch the full fund ranking from eastmoney (天天基金).
        :return: a list
        '''
        rank_fund_list = []
        for page_num in range(self.page_num_start, self.page_num_end):
            print('正在抓取第{0}页的基金信息...'.format(page_num))
            # captured session cookies; presumably only needed to look like a
            # real browser session -- TODO confirm they are still required
            cookies = {
                'st_pvi': '11586003301354',
                'EMFUND1': 'null',
                'EMFUND0': 'null',
                'EMFUND2': '07-10%2018%3A01%3A38@%23%24%u534E%u6DA6%u5143%u5927%u73B0%u91D1%u901A%u8D27%u5E01B@%23%24002884',
                'EMFUND3': '07-10%2018%3A01%3A48@%23%24%u5929%u5F18%u73B0%u91D1%u7BA1%u5BB6%u8D27%u5E01B@%23%24420106',
                'EMFUND4': '07-10%2018%3A11%3A53@%23%24%u65B9%u6B63%u5BCC%u90A6%u4FDD%u9669%u4E3B%u9898%u6307%u6570%u5206%u7EA7@%23%24167301',
                'EMFUND5': '07-10%2018%3A04%3A32@%23%24%u62DB%u5546%u4E2D%u8BC1%u94F6%u884C%u6307%u6570%u5206%u7EA7@%23%24161723',
                'EMFUND6': '07-10%2018%3A05%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570C@%23%24001595',
                'EMFUND7': '07-10%2018%3A06%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570A@%23%24001594',
                'st_si': '38764934559714',
                'ASP.NET_SessionId': 'hqeo1xk5oqgwb0cqzxicytda',
                'EMFUND8': '07-11 11:28:55@#$%u7533%u4E07%u83F1%u4FE1%u591A%u7B56%u7565%u7075%u6D3B%u914D%u7F6E%u6DF7%u5408A@%23%24001148',
                'EMFUND9': '07-11 11:28:55@#$%u5E7F%u53D1%u751F%u7269%u79D1%u6280%u6307%u6570%28QDII%29@%23%24001092',
            }
            headers = {
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
                'Accept': '*/*',
                # 'Referer': 'http://fund.eastmoney.com/data/fundranking.html',
                'Proxy-Connection': 'keep-alive',
            }
            # ranking window: exactly one year back from today (Shanghai time)
            end_date = str(get_shanghai_time())[:10]
            start_date = str(datetime.datetime(year=get_shanghai_time().year-1, month=get_shanghai_time().month, day=get_shanghai_time().day))[:10]
            print('开始时间: {0}, 结束时间: {1}'.format(start_date, end_date))
            params = (
                ('op', 'ph'),
                ('dt', 'kf'),
                ('ft', 'all'),
                ('rs', ''),
                ('gs', '0'),
                ('sc', 'zzf'),
                ('st', 'desc'),
                ('sd', start_date),  # e.g. '2017-07-10'
                ('ed', end_date),    # e.g. '2018-07-10'
                ('qdii', ''),
                ('tabSubtype', ',,,,,'),
                ('pi', str(page_num)),  # page number of the rank_data
                ('pn', '50'),
                ('dx', '1'),
                # ('v', '0.5290053467389759'),
            )
            url = 'http://fund.eastmoney.com/data/rankhandler.aspx'
            # TODO plain requests gets a 502 from this endpoint
            # body = MyRequests.get_url_body(url=url, headers=headers, params=params, cookies=None)
            # print(body)
            # so phantomjs is used instead
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                url=_get_url_contain_params(url, params))
            try:
                # the endpoint returns JS; strip to the rankData assignment
                body = re.compile('<body>(.*)</body>').findall(body)[0]
                this_page_rank_data = re.compile(r'rankData = (.*);').findall(body)[0]
                # print(this_page_rank_data)
            except IndexError:
                print('在获取this_page_rank_data时索引异常!请检查!')
                continue

            # json.loads raised: Expecting property name enclosed in double
            # quotes: line 1 column 2 (char 1) -- the payload is JS-style,
            # so decode it with demjson instead
            this_page_rank_data = demjson.decode(this_page_rank_data).get('datas', {})
            # pprint(this_page_rank_data)
            if this_page_rank_data == {}:
                return []

            # each item is one comma-separated record; field positions below
            # were mapped empirically from the eastmoney rank payload
            for item in this_page_rank_data:
                _i = item.split(',')
                rank_fund_list.append({
                    '基金代码': _i[0],
                    '基金简称': _i[1],
                    '当天日期': _i[3],
                    '单位净值': _i[4],
                    '累计净值': _i[5],
                    '日增长率': _i[6],
                    '近1周': _i[7],
                    '近1月': _i[8],
                    '近3月': _i[9],
                    '近6月': _i[10],
                    '近1年': _i[11],
                    '近2年': _i[12],
                    '近3年': _i[13],
                    '今年来': _i[14],
                    '成立来': _i[15],
                    '手续费': _i[20],
                })
            sleep(2.5)

        print('\n抓取完毕!\n')
        # pprint(rank_fund_list)
        return rank_fund_list

    def _deal_with_rank_fund_info(self):
        '''
        Process the rank_fund info: crawl and chart every fund in the ranking.
        :return:
        '''
        rank_fund_list = self._get_rank_fund_info()
        for item in rank_fund_list:
            fund_code = item.get('基金代码', '')
            print('正在处理基金代码: {0}...'.format(fund_code))
            self._get_one_fund_info(fund_code=fund_code)
            sleep(self.CRAWL_FUND_TIME)

        print('\n@@@ 所有操作完成!\n')
        return True

    def _get_one_fund_info(self, fund_code):
        '''
        Fetch one fund's info (pingzhongdata JS) and process it.
        :return:
        '''
        # the %uXXXX-encoded variants are kept commented for reference; the
        # active entries are the already-unquoted values
        cookies = {
            'st_pvi': '11586003301354',
            'st_si': '46806950936799',
            'ASP.NET_SessionId': 'fhllwae2zicg00o0x4ub1fxs',
            'EMFUND1': 'null',
            'EMFUND0': 'null',
            # 'EMFUND2': '07-10%2018%3A01%3A38@%23%24%u534E%u6DA6%u5143%u5927%u73B0%u91D1%u901A%u8D27%u5E01B@%23%24002884',
            'EMFUND2': '07-10 18:01:38@#$华润元大现金通货币B@#$002884',
            # 'EMFUND3': '07-10%2018%3A01%3A48@%23%24%u5929%u5F18%u73B0%u91D1%u7BA1%u5BB6%u8D27%u5E01B@%23%24420106',
            'EMFUND3': '07-10 18:01:48@#$天弘现金管家货币B@#$420106',
            # 'EMFUND4': '07-10%2018%3A11%3A53@%23%24%u65B9%u6B63%u5BCC%u90A6%u4FDD%u9669%u4E3B%u9898%u6307%u6570%u5206%u7EA7@%23%24167301',
            'EMFUND4': '07-10 18:11:53@#$方正富邦保险主题指数分级@#$167301',
            # 'EMFUND5': '07-10%2018%3A04%3A32@%23%24%u62DB%u5546%u4E2D%u8BC1%u94F6%u884C%u6307%u6570%u5206%u7EA7@%23%24161723',
            'EMFUND5': '07-10 18:04:32@#$招商中证银行指数分级@#$161723',
            # 'EMFUND6': '07-10%2018%3A05%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570C@%23%24001595',
            'EMFUND6': '07-10 18:05:13@#$天弘中证银行指数C@#$001595',
            # 'EMFUND7': '07-10%2018%3A06%3A13@%23%24%u5929%u5F18%u4E2D%u8BC1%u94F6%u884C%u6307%u6570A@%23%24001594',
            'EMFUND7': '07-10 18:06:13@#$天弘中证银行指数A@#$001594',
            # 'EMFUND8': '07-10%2018%3A11%3A22@%23%24%u7533%u4E07%u83F1%u4FE1%u591A%u7B56%u7565%u7075%u6D3B%u914D%u7F6E%u6DF7%u5408A@%23%24001148',
            'EMFUND8': '07-10 18:11:22@#$申万菱信多策略灵活配置混合A@#$001148',
            # 'EMFUND9': '07-10 18:12:26@#$%u5E7F%u53D1%u751F%u7269%u79D1%u6280%u6307%u6570%28QDII%29@%23%24001092',
            'EMFUND9': '07-10 18:12:26@#$广发生物科技指数(QDII)@#$001092',
        }
        cookies = unquote_cookies(cookies)
        # pprint(cookies)
        headers = {
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Accept': '*/*',
            # 'Referer': 'http://fund.eastmoney.com/001092.html',
            'Proxy-Connection': 'keep-alive',
        }
        # compress current time into a version token:
        # 2018-07-10 18:30:46 -> 20180710183046
        v = re.compile(r'-| |:').sub('', str(get_shanghai_time()))
        # print(v)
        params = (
            # ('v', '20180710175951'),  # time token
            ('v', v),  # time token
        )
        fund_url = 'http://fund.eastmoney.com/pingzhongdata/{0}.js'.format(fund_code)
        # response = requests.get(fund_url, headers=headers, params=params, cookies=None)
        # body = response.text
        # print(body)
        # body = MyRequests.get_url_body(url=fund_url, headers=headers, params=params, cookies=None)
        # print(body)
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=_get_url_contain_params(fund_url, params)
        )
        # print(body)
        self._get_this_fund_info(body=body)
        return True

    def _get_this_fund_info(self, body):
        '''Extract one fund's fields from the pingzhongdata JS body and chart
        its unit net-value trend; returns None on a failed extraction.'''
        try:
            # fund name
            fund_name = re.compile(r'fS_name = "(.*?)";').findall(body)[0]
            # fund code
            fund_code = re.compile(r'fS_code = "(.*?)";').findall(body)[0]
            print('基金名: {0}, 基金代码: {1}'.format(fund_name, fund_code))

            # purchase fee rate
            fund_source_rate = re.compile(r'fund_sourceRate="(.*?)";').findall(body)[0]
            # current (discounted) fee rate
            fund_rate = re.compile('fund_Rate="(.*?)";').findall(body)[0]
            # minimum purchase amount
            fund_minsg = re.compile(r'fund_minsg="(.*?)";').findall(body)[0]
            print('购买手续费: {0}%, 现费率: {1}%, 最小起购金额: {2}RMB'.format(fund_source_rate, fund_rate, fund_minsg))

            '''收益率'''
            # yield over the last 1 year
            syl_1n = re.compile(r'syl_1n="(.*?)";').findall(body)[0]
            # yield over the last 6 months
            syl_6y = re.compile(r'syl_6y="(.*?)";').findall(body)[0]
            # yield over the last 3 months
            syl_3y = re.compile(r'syl_3y="(.*?)";').findall(body)[0]
            # yield over the last 1 month
            syl_1y = re.compile(r'syl_1y="(.*?)";').findall(body)[0]
            msg = '@@收益率:\n\t近1年: {0}%, 近6月: {1}%, 近3月: {2}%, 近1月: {3}%'.format(syl_1n, syl_6y, syl_3y, syl_1y)
            print(msg)

            # unit net-value trend; per payload: equityReturn = net-value
            # return, unitMoney = dividend per share
            data_net_worth_trend = json_2_dict(re.compile(r'Data_netWorthTrend = (.*?);').findall(body)[0])
            # pprint(data_net_worth_trend)
            # print('单位净值走势: {0}'.format(data_net_worth_trend))
            self._deal_with_data_net_worth_trend(
                fund_name=fund_name,
                fund_code=fund_code,
                data_net_worth_trend=data_net_worth_trend)

            # cumulative net-value trend (extracted but currently unused)
            data_ac_worth_trend = json_2_dict(re.compile(r'Data_ACWorthTrend = (.*?);').findall(body)[0])
            # pprint(data_ac_worth_trend)
            # print('累计净值走势: {0}'.format(data_ac_worth_trend))
            # cumulative yield trend (extracted but currently unused)
            data_grand_total = json_2_dict(re.compile(r'Data_grandTotal = (.*?);').findall(body)[0])
            # print('累计收益率走势: {0}'.format(data_grand_total))
            # rank trend among funds of the same type (unused)
            data_rate_in_similar_type = json_2_dict(re.compile(r'Data_rateInSimilarType = (.*?);').findall(body)[0])
            # print('同类排名走势: {0}'.format(data_rate_in_similar_type))
            # rank percentile among funds of the same type (unused)
            data_rate_in_similar_persent = json_2_dict(re.compile(r'Data_rateInSimilarPersent=(.*?);').findall(body)[0])
            # print('同类排名百分比: {0}'.format(data_rate_in_similar_persent))
            # gain ranking of same-type funds (page-bottom banner, unused)
            swith_same_type = json_2_dict(re.compile(r'swithSameType = (.*?);').findall(body)[0])
            # print('同类型基金涨幅榜: {0}'.format(swith_same_type))
        except IndexError as e:
            print(e)
            return None

    def _deal_with_data_net_worth_trend(self, **kwargs):
        '''
        Process data_net_worth_trend (unit net-value trend) and render it as an image.
        :param fund_name:
        :param fund_code:
        :param data_net_worth_trend:
        :return:
        '''
        fund_name = kwargs.get('fund_name')
        fund_code = kwargs.get('fund_code')
        data_net_worth_trend = kwargs.get('data_net_worth_trend', [])
        # rewrite each 'x' in place: millisecond timestamp -> time string
        # (first 10 chars keep the seconds part; list-comp used for side effect)
        [item.update({'x': str(timestamp_to_regulartime(str(item.get('x'))[:10]))}) for item in data_net_worth_trend]
        print('时间格式转换成功!')
        # pprint(data_net_worth_trend)
        x = [item.get('x') for item in data_net_worth_trend]
        y = [item.get('y') for item in data_net_worth_trend]
        '''绘图'''
        self.plot_pic = self._drawing(fund_name=fund_name, fund_code=fund_code, x=x, y=y)
        # drop the plot handle right away; it is only kept transiently
        try:
            del self.plot_pic
        except:
            pass
        gc.collect()
        return True

    def _drawing(self, **kwargs):
        '''
        Set up the plot and save the fund's net-value curve as a PNG under base_path.
        :param kwargs:
        :return:
        '''
        import matplotlib.pyplot as plt
        from random import randint

        figure_num = randint(1, 10000)
        # create a dedicated figure: one Figure can hold several Axes, and a
        # random figure number avoids drawing every fund on the same chart
        plt.figure(figure_num)
        fund_name = kwargs.get('fund_name')
        fund_code = kwargs.get('fund_code')
        x = kwargs.get('x')
        y = kwargs.get('y')

        # load a font that can render Chinese labels
        font = FontProperties(fname='/Library/Fonts/Songti.ttc', size=10)
        # title
        plt.title('{0}(代码{1})的单位净值走势图'.format(fund_name, fund_code), fontproperties=font, fontsize=15)
        plt.xlabel('日期', fontproperties=font)
        plt.ylabel('单位净值', fontproperties=font)
        # grid
        # plt.grid()  # too dense, left off
        # axis tick labels with a fixed step
        x_axis_label = self._get_x_axis_label(x)
        # pprint(x_axis_label)
        y_axis_label = self._get_y_axis_label(y)
        # pprint(y_axis_label)
        # str tick labels must be passed as (positions, labels)
        plt.xticks(arange(len(x_axis_label)), x_axis_label, rotation=30, fontsize=5)
        # plt.yticks(y_axis_label)
        # x value range
        # plt.xlim(x[0], x[-2])
        # legend
        plt.legend(['单位:元'], loc=1, prop=font)
        plt.figure(figure_num)
        # draw the line chart
        plot_pic = plt.plot(
            x, y,
            marker='.',
            markerfacecolor='r',
            markersize=1,   # size of the point markers
            linewidth=.4,   # line width
            color='#7EB6EA' # line color
        )
        # numeric label for each point
        # for a, b in zip(x, y):
        #     plt.text(a, b, '%.3f' % (b,), fontsize=5)
        # plt.show()
        # save the pic
        pic_file_name = '{0}(代码{1}).png'.format(fund_name, fund_code)
        pic_path = self.base_path + pic_file_name
        if os.path.exists(pic_path):  # delete a pre-existing file first
            # print('文件已存在!')
            os.remove(pic_path)
        savefig(fname=pic_path, dpi=400)  # dpi controls the image resolution
        print('[+] {0} 保存完毕!'.format(pic_file_name))
        plt.cla()  # clear the current axes
        return plot_pic

    def _get_x_axis_label(self, x):
        '''
        Build the x-axis tick label list: one 'YYYY-MM' label every 6 months,
        empty strings everywhere else (and for duplicates).
        :param x:
        :return: list
        '''
        now_time = datetime.datetime.now()
        x_axis_label = []
        for _x in x:
            if _x is not None and month_differ(now_time, string_to_datetime(_x)) % 6 == 0:
                if str(_x)[:7] in x_axis_label:  # duplicate month -> append('')
                    x_axis_label.append('')
                else:
                    x_axis_label.append(str(_x)[:7])
            else:
                x_axis_label.append('')

        return x_axis_label

    def _get_y_axis_label(self, y):
        '''
        Build the y-axis tick list: values from min-step to max+step in .1 steps.
        :param y:
        :return:
        '''
        y_step = .1
        y_axis_label = [_y for _y in arange(min(y) - y_step, max(y) + y_step, y_step)]
        return y_axis_label

    def __del__(self):
        # best-effort cleanup of the phantomjs driver
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
class TmallCommentParse(object):
    '''Crawl and normalize tmall goods comments via the rate.tmall.com api
    (fetched through phantomjs, since the plain/proxy request lacks cookies).'''

    def __init__(self, logger=None):
        self.result_data = {}
        self.msg = ''
        self._set_logger(logger)
        self._set_headers()
        self.page_size = '10'
        self.comment_page_switch_sleep_time = 1.5  # sleep time between comment pages
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        self.g_data = {}                # temp: raw goods data of the current item
        self.random_sku_info_list = []  # temp: all specs of the current goods

    def _get_comment_data(self, type:int, goods_id):
        '''Fetch up to 3 pages of comments for goods_id and store/return a
        CommentItem; returns {} (and resets state) on any failure.'''
        if goods_id == '' or type == '':
            self.result_data = {}
            return {}
        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))
        '''先获取到sellerId'''
        try:
            seller_id = self._get_seller_id(type=type, goods_id=goods_id)
        # FIX: was `except AssertionError or IndexError as e` which evaluates
        # to `except AssertionError` only -- an IndexError escaped uncaught.
        except (AssertionError, IndexError) as e:
            self.my_lg.error('出错goods_id: %s' % goods_id)
            self.my_lg.error(e.args[0])
            self.result_data = {}
            self.random_sku_info_list = []
            return {}
        """再获取price_info_list"""
        try:
            self.random_sku_info_list = self._get_random_sku_info_list()
            # self.my_lg.info(self.random_sku_info_list)
        except Exception as e:
            self.my_lg.error('出错goods_id: %s' % str(goods_id))
            self.my_lg.exception(e)
            self.result_data = {}
            self.random_sku_info_list = []
            return {}

        _tmp_comment_list = []
        for current_page in range(1, 4):
            self.my_lg.info('------>>>| 正在抓取第 {0} 页的评论...'.format(str(current_page)))
            _url = 'https://rate.tmall.com/list_detail_rate.htm'
            params = self._set_params(goods_id=goods_id, seller_id=seller_id, current_page=current_page)
            self.headers.update({'referer': 'https://detail.m.tmall.com/item.htm?id='+goods_id})
            # the earlier proxy request returned no data because cookies were
            # missing, so phantomjs is used to hit the api directly
            # body = MyRequests.get_url_body(url=_url, headers=self.headers, params=params, encoding='gbk')
            _url = _get_url_contain_params(url=_url, params=params)  # build full url from params
            # self.my_lg.info(_url)
            body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=_url)
            # self.my_lg.info(str(body))
            if body == '':
                self.my_lg.error('获取到的body为空str! 出错type:{0}, goods_id:{1}'.format(str(type), goods_id))
                self.result_data = {}
                return {}
            # the api answers jsonp: strip the callback wrapper
            try:
                _ = re.compile('\((.*)\)').findall(body)[0]
            except IndexError:
                _ = {}
                self.my_lg.error('索引异常! 出错type:{0}, goods_id:{1}'.format(str(type), goods_id))
            try:
                data = json.loads(_).get('rateDetail', {}).get('rateList', [])
                # pprint(data)
            except:
                data = []
                self.my_lg.error('json.loads转换_出错! 出错type:{0}, goods_id:{1}'.format(str(type), goods_id))
            _tmp_comment_list += data
            sleep(self.comment_page_switch_sleep_time)

        try:
            _comment_list = self._get_comment_list(_tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.my_lg.error('出错type:{0}, goods_id:{1}'.format(str(type), goods_id))
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r
        # pprint(self.result_data)
        return self.result_data

    def _get_comment_list(self, _tmp_comment_list):
        '''
        Convert the raw tmall rate items into the required result set.
        :param _tmp_comment_list:
        :return:
        '''
        _comment_list = []
        for item in _tmp_comment_list:
            _comment_date = item.get('rateDate', '')
            assert _comment_date != '', '得到的_comment_date为空str!请检查!'

            # the tmall api returns no sku_info by default
            # sku_info = ''
            # so pick one at random from all known specs of this goods
            if self.random_sku_info_list == []:
                self.random_sku_info_list = ['']
            sku_info = str(choice(self.random_sku_info_list))

            _comment_content = item.get('rateContent', '')
            assert _comment_content != '', '得到的评论内容为空str!请检查!'
            _comment_content = self._wash_comment(comment=_comment_content)

            buyer_name = item.get('displayUserNick', '')
            assert buyer_name != '', '得到的用户昵称为空值!请检查!'

            # tmall: default the purchase quantity to 1
            quantify = 1
            # tmall returns no head_img, so use an empty default
            head_img = ''
            # pics of the first comment
            _comment_img_list = item.get('pics', []) if item.get('pics', '') != '' else []
            if _comment_img_list != []:
                _comment_img_list = [{'img_url': 'https:' + img} for img in _comment_img_list]

            '''追评'''
            _tmp_append_comment = item.get('appendComment', {}) if item.get('appendComment', '') != '' else {}
            # pics of the follow-up comment
            _append_comment_img_list = _tmp_append_comment.get('pics', []) if _tmp_append_comment.get('pics', '') != '' else []
            if _append_comment_img_list != []:
                _append_comment_img_list = [{'img_url': 'https:' + img} for img in _append_comment_img_list]
            if _tmp_append_comment != {}:
                append_comment = {
                    'comment_date': _tmp_append_comment.get('commentTime', ''),
                    'comment': self._wash_comment(_tmp_append_comment.get('content', '')),
                    'img_url_list': _append_comment_img_list,
                }
            else:
                append_comment = {}

            # skip comments whose washed content is considered invalid
            if not filter_invalid_comment_content(_comment_content):
                continue

            comment = [{
                'comment': _comment_content,
                'comment_date': _comment_date,
                'sku_info': sku_info,
                'img_url_list': _comment_img_list,
                'star_level': randint(4, 5),
                'video': '',
            }]
            _ = {
                'buyer_name': buyer_name,          # buyer nickname
                'comment': comment,                # comment content
                'quantify': quantify,              # purchase quantity
                'head_img': head_img,              # avatar
                'append_comment': append_comment,  # follow-up comment
            }
            _comment_list.append(_)

        return _comment_list

    def _get_seller_id(self, type, goods_id):
        '''
        Get the seller_id of the goods (also caches the raw goods data in g_data).
        :param type:
        :param goods_id:
        :return:
        :raises AssertionError: when no seller userId could be extracted
        '''
        _ = TmallParse(logger=self.my_lg)
        _g = [type, goods_id]
        self.g_data = _.get_goods_data(goods_id=_g)
        seller_id = str(self.g_data.get('seller', {}).get('userId', 0))
        # self.my_lg.info('获取到的seller_id: ' + seller_id)
        try:
            del _
        except:
            pass
        # FIX: seller_id is a str, so the old `seller_id != 0` (str vs int)
        # was always True; compare against the string '0' instead.
        assert seller_id != '0', '获取到的seller_id为0!'
        return seller_id

    def _get_random_sku_info_list(self):
        '''
        Get the full sku_info list, used to randomly assign a spec per comment.
        :return:
        '''
        assert self.g_data != {}, 'g_data为空dict'
        _t = TaoBaoLoginAndParse(logger=self.my_lg)
        # price and stock for every spec label value
        price_info_list = _t._get_price_info_list(
            data=self.g_data,
            detail_value_list=_t._get_detail_name_and_value_list(data=self.g_data)[1]
        )
        try:
            del _t
        except:
            pass
        return list(set([_i.get('spec_value', '') for _i in price_info_list]))

    def _set_logger(self, logger):
        '''Use the given logger, or create a dated file logger for tmall comments.'''
        if logger is None:
            self.my_lg = set_logger(
                log_file_name=MY_SPIDER_LOGS_PATH + '/天猫/comment/' + str(get_shanghai_time())[0:10] + '.txt',
                console_log_level=INFO,
                file_log_level=ERROR
            )
        else:
            self.my_lg = logger

    def _set_headers(self):
        '''Default request headers with a random PC user-agent.'''
        self.headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            'referer': 'https://detail.m.tmall.com/item.htm?id=524718632348',
        }

    def _wash_comment(self, comment):
        '''
        Clean the comment text (strip platform brand references).
        :param comment:
        :return:
        '''
        comment = re.compile('天猫超市|天猫国际|天猫全球购|天猫大药房|某淘|某宝').sub('', comment)
        comment = comment.replace('天猫', '').replace('淘宝', '')
        comment = re.compile('tmall|Tmall|TMALL|TAOBAO|taobao').sub('', comment)
        return comment

    def _set_params(self, **kwargs):
        '''
        Build the request params for the rate api.
        :param kwargs:
        :return:
        '''
        goods_id = kwargs.get('goods_id')
        seller_id = kwargs.get('seller_id')
        current_page = kwargs.get('current_page')
        callback = '_DLP_2519_der_3_currentPage_{0}_pageSize_{1}_'.format(str(current_page), self.page_size)
        _params = (
            ('itemId', goods_id),
            ('sellerId', seller_id),
            ('order', '3'),
            ('currentPage', str(current_page)),
            ('pageSize', self.page_size),
            ('callback', callback),
        )
        return _params

    def __del__(self):
        # best-effort cleanup; attributes may be missing if __init__ failed
        try:
            del self.my_lg
            del self.my_phantomjs
            del self.g_data
        except:
            pass
        gc.collect()
class MoGuJiePinTuanRealTimesUpdate(object):
    '''Real-time updater for dbo.mogujie_pintuan: deletes expired group-buy
    goods and refreshes price/stock info for the ones still running.'''

    def __init__(self):
        self._set_headers()
        self.delete_sql_str = 'delete from dbo.mogujie_pintuan where goods_id=%s'

    def _set_headers(self):
        '''Default request headers with a random PC user-agent.'''
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'list.mogujie.com',
            # 'Referer': 'https://pintuan.mogujie.com/ptpt/app/pd?acm=3.mce.1_10_1fvsk.51827.0.mUTadqIzS9Pbg.m_370494-pos_2-mf_4537_796033&ptp=m1._mf1_1239_4537._keyword_51827.0.xLt0G92',
            'User-Agent': get_random_pc_ua(),  # random user-agent
        }

    def run_forever(self):
        '''
        Update the data in real time (one pass over all site_id=23 rows).
        :return:
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select goods_id, miaosha_time, fcid, page from dbo.mogujie_pintuan where site_id=23'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
            for item in result:  # update each row in real time
                # item: (goods_id, miaosha_time(json), fcid, page)
                pintuan_end_time = json.loads(item[1]).get('end_time')
                pintuan_end_time = int(str(time.mktime(time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
                # print(miaosha_end_time)
                data = {}
                mogujie_pintuan = MoGuJieParse()
                if index % 8 == 0:  # recycle phantomjs periodically to free memory
                    try:
                        del self.my_phantomjs
                    except:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    if self.is_recent_time(pintuan_end_time) == 0:
                        # FIX: `(item[0])` is not a tuple -- pass an explicit
                        # one-tuple for the single `%s` placeholder
                        tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                        print('过期的goods_id为(%s)' % item[0], ', 拼团开始时间为(%s), 删除成功!' % json.loads(item[1]).get('begin_time'))
                    elif self.is_recent_time(pintuan_end_time) == 2:
                        # break
                        # must be pass, not break: the goods_ids returned by
                        # the db are not ordered by time
                        pass
                    else:  # returned 1: inside the to-update window
                        print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                        data['goods_id'] = item[0]
                        tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                            item[3], item[2])
                        # print(tmp_url)
                        # requests can't fetch this (certificate validation), so phantomjs
                        # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                        body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
                        # print(body)
                        if body == '':
                            print('获取到的body为空值! 此处跳过')
                        else:
                            try:
                                body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                                tmp_data = json.loads(body)
                                # pprint(tmp_data)
                            except:
                                print('json.loads转换body时出错, 请检查')
                                tmp_data = {}
                            if tmp_data.get('result', {}).get('wall', {}).get('docs', []) == []:
                                print('得到的docs为[]!')
                                print('该商品已被下架限时秒杀活动,此处将其删除')
                                # FIX: explicit one-tuple (see above)
                                tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                                print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                pass
                            else:
                                tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
                                # print(tmp_item_list)
                                # pprint(tmp_item_list)
                                begin_time_timestamp = int(time.time())  # timestamp when the pintuan starts
                                item_list = [{
                                    'goods_id': item.get('tradeItemId', ''),
                                    'pintuan_time': {
                                        'begin_time': timestamp_to_regulartime(timestamp=begin_time_timestamp),
                                        'end_time': timestamp_to_regulartime(
                                            self.get_pintuan_end_time(begin_time_timestamp, item.get('leftTimeOrg', ''))),
                                    },
                                    'all_sell_count': str(item.get('salesVolume', 0)),
                                } for item in tmp_item_list]
                                # print(item_list)
                                pintuan_goods_all_goods_id = [item_1.get('goods_id', '') for item_1 in item_list]
                                # print(pintuan_goods_all_goods_id)
                                '''
                                内部已经下架的(内部下架的其实并未真实下架,还在卖的,所以我就更新其商品信息数据,不更新上下架时间)
                                '''
                                if item[0] not in pintuan_goods_all_goods_id:
                                    print('该商品已被下架限时秒杀活动,此处将其删除')
                                    # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                                    # print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                    # pass
                                    mogujie_pintuan.get_goods_data(goods_id=item[0])
                                    goods_data = mogujie_pintuan.deal_with_data()
                                    if goods_data == {}:
                                        pass
                                    else:
                                        # normalize the price info
                                        print('+++ 内部下架,其实还在售卖的商品更新')
                                        tmp_price_info_list = goods_data['price_info_list']
                                        price_info_list = [{
                                            'spec_value': item_4.get('spec_value'),
                                            'pintuan_price': item_4.get('detail_price'),
                                            'detail_price': '',
                                            'normal_price': item_4.get('normal_price'),
                                            'img_url': item_4.get('img_url'),
                                            'rest_number': item_4.get('rest_number'),
                                        } for item_4 in tmp_price_info_list]
                                        goods_data['goods_id'] = item[0]
                                        goods_data['price_info_list'] = price_info_list
                                        # pprint(goods_data)
                                        # print(goods_data)
                                        mogujie_pintuan.update_mogujie_pintuan_table_2(
                                            data=goods_data, pipeline=tmp_sql_server)
                                        sleep(MOGUJIE_SLEEP_TIME)  # slow down
                                else:  # still on sale
                                    for item_2 in item_list:
                                        if item_2.get('goods_id', '') == item[0]:
                                            mogujie_pintuan.get_goods_data(goods_id=item[0])
                                            goods_data = mogujie_pintuan.deal_with_data()
                                            if goods_data == {}:
                                                pass
                                            else:
                                                # normalize the price info
                                                tmp_price_info_list = goods_data['price_info_list']
                                                price_info_list = [{
                                                    'spec_value': item_4.get('spec_value'),
                                                    'pintuan_price': item_4.get('detail_price'),
                                                    'detail_price': '',
                                                    'normal_price': item_4.get('normal_price'),
                                                    'img_url': item_4.get('img_url'),
                                                    'rest_number': item_4.get('rest_number'),
                                                } for item_4 in tmp_price_info_list]
                                                goods_data['goods_id'] = item[0]
                                                goods_data['price_info_list'] = price_info_list
                                                goods_data['pintuan_time'] = item_2.get('pintuan_time', {})
                                                goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time(
                                                    pintuan_time=goods_data['pintuan_time'])
                                                goods_data['all_sell_count'] = item_2.get('all_sell_count', '')
                                                # pprint(goods_data)
                                                # print(goods_data)
                                                mogujie_pintuan.update_mogujie_pintuan_table(
                                                    data=goods_data, pipeline=tmp_sql_server)
                                                sleep(MOGUJIE_SLEEP_TIME)  # slow down
                                        else:
                                            pass
                else:
                    print('数据库连接失败,此处跳过!')
                    pass
                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

    def get_pintuan_begin_time_and_pintuan_end_time(self, pintuan_time):
        '''
        Return the pintuan begin and end times.
        :param pintuan_time:
        :return: tuple pintuan_begin_time, pintuan_end_time
        '''
        pintuan_begin_time = pintuan_time.get('begin_time', '')
        pintuan_end_time = pintuan_time.get('end_time', '')
        # parse the strings into datetime objects
        pintuan_begin_time = datetime.datetime.strptime(pintuan_begin_time, '%Y-%m-%d %H:%M:%S')
        pintuan_end_time = datetime.datetime.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')
        return pintuan_begin_time, pintuan_end_time

    def get_pintuan_end_time(self, begin_time, left_time):
        '''
        Compute the pintuan end time from the remaining-time string.
        :param begin_time: start timestamp (seconds)
        :param left_time: remaining time, e.g. '6天13小时', '13小时57分', '36分'
        :return: end_time timestamp (int)
        '''
        had_day = re.compile(r'天').findall(left_time)
        had_hour = re.compile(r'小时').findall(left_time)
        tmp = re.compile(r'\d+').findall(left_time)
        # FIX: renamed local `min` -> `minute` (shadowed the builtin) and
        # removed the unused `had_min` lookup
        if had_day != [] and had_hour != []:    # e.g. '6天13小时'
            day, hour, minute = int(tmp[0]), int(tmp[1]), 0
        elif had_day == [] and had_hour != []:  # e.g. '13小时57分'
            day, hour, minute = 0, int(tmp[0]), int(tmp[1])
        elif had_day == [] and had_hour == []:  # e.g. '36分'
            print('left_time = ', left_time)
            day, hour, minute = 0, 0, int(tmp[0])
        else:  # no day, hour or minute at all
            print('day, hour, min = 0, 0, 0', 'left_time = ', left_time)
            day, hour, minute = 0, 0, 0
        left_end_time_timestamp = \
            day * 24 * 60 * 60 + \
            hour * 60 * 60 + \
            minute * 60
        return begin_time + left_end_time_timestamp

    def is_recent_time(self, timestamp):
        '''
        Classify a timestamp against the configured update window.
        :param timestamp: unix timestamp (seconds)
        :return: 0: expired (restore original price) 1: inside the update window 2: future / waiting
        '''
        time_1 = int(timestamp)
        time_2 = int(time.time())  # current timestamp
        diff_time = time_1 - time_2
        # 24h grace period so the backend can sync the take-down
        if diff_time < -86400:
        # if diff_time < 0:  # (original rule) end time already passed
            return 0  # expired, restore original price
        elif diff_time > 0:
            return 1  # yesterday/today, i.e. pending update
        else:
            # expired but still inside the 24h grace period: do not delete yet
            return 2

    def __del__(self):
        # my_phantomjs only exists once run_forever() has run; ignore otherwise
        try:
            del self.my_phantomjs
        except:
            pass
        gc.collect()
    def get_goods_data(self, goods_id):
        '''
        Build the url and fetch/parse the chuchujie goods data.
        :param goods_id:
        :return: data dict ({} on any failure; also mirrored into self.result_data)
        '''
        if goods_id == '':
            self.result_data = {}
            return {}
        print('------>>>| 对应的手机端地址为: ', 'https://m.chuchujie.com/details/detail.html?id=' + goods_id)

        '''
        1.原先直接去手机端页面api post请求数据但是死活就返回请求参数错误,反复研究无果, 就改为解析pc端的
        '''
        # (historical) approach 1: POST the mobile-site api -- kept for reference
        # tmp_url = 'https://api-product.chuchujie.com/api.php?method=product_detail'
        # self.headers['Referer'] = 'https://m.chuchujie.com/details/detail.html?id=' + str(goods_id)
        #
        # # set a proxy ip
        # ip_object = MyIpPools()
        # self.proxies = ip_object.get_proxy_ip_from_ip_pool()     # {'http': ['xx', 'yy', ...]}
        # self.proxy = self.proxies['http'][randint(0, len(self.proxies) - 1)]
        #
        # tmp_proxies = {
        #     'http': self.proxy,
        # }
        # # print('------>>>| 正在使用代理ip: {} 进行爬取... |<<<------'.format(self.proxy))
        #
        # params_2 = {
        #     "channel": "QD_appstore",
        #     "package_name": "com.culiukeji.huanletao",
        #     "client_version": "3.9.101",
        #     "ageGroup": "AG_0to24",
        #     "client_type": "h5",
        #     "api_version": "v5",
        #     "imei": "",
        #     "method": "product_detail",
        #     "gender": "1",  # gender 0 = female, 1 = male
        #     "token": "",
        #     "userId": "",
        #     "product_id": int(goods_id),
        # }
        #
        # params = {
        #     'data': json.dumps(params_2),
        # }
        #
        # try:
        #     # response = requests.post(
        #     #     url=tmp_url,
        #     #     headers=self.headers,
        #     #     data=json.dumps(params),
        #     #     proxies=tmp_proxies,
        #     #     timeout=13
        #     # )
        #     response = requests.get(
        #         url=tmp_url,
        #         headers=self.headers,
        #         params=params,
        #         proxies=tmp_proxies,
        #         timeout=13,
        #     )
        #     last_url = re.compile(r'\+').sub('', response.url)  # strip '+' to get the correct request url
        #     print(last_url)
        #     print(tmp_url + '&data=%7B%22channel%22%3A%22QD_appstore%22%2C%22package_name%22%3A%22com.culiukeji.huanletao%22%2C%22client_version%22%3A%223.9.101%22%2C%22ageGroup%22%3A%22AG_0to24%22%2C%22client_type%22%3A%22h5%22%2C%22api_version%22%3A%22v5%22%2C%22imei%22%3A%22%22%2C%22method%22%3A%22product_detail%22%2C%22gender%22%3A%221%22%2C%22token%22%3A%22%22%2C%22userId%22%3A%22%22%2C%22product_id%22%3A10016793335%7D')
        #     response = requests.get(last_url, headers=self.headers, proxies=tmp_proxies, timeout=13)  # when passing data through requests, the &xxx= outside the url must be pre-built too
        #
        #     data = response.content.decode('utf-8')
        #     print(data)
        #
        # except Exception:
        #     print('requests.post()请求超时....')
        #     print('data为空!')
        #     self.result_data = {}  # reset so a failure doesn't pollute later crawls
        #     return {}

        '''
        2. 改为解析pc端的商品页面数据
        '''
        tmp_url = 'http://wx.chuchujie.com/index.php?s=/WebProduct/product_detail/product_id/' + str(goods_id)
        # plain requests worked at first, then stopped returning data; switched to phantomjs
        # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
        my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        body = my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
        try:
            del my_phantomjs
        except:
            pass
        # print(body)
        if body == '':
            print('获取到的body为空str!')
            self.result_data = {}
            return {}

        data = {}
        try:
            data['title'] = Selector(text=body).css('div.zy_info_rt h3::text').extract_first()
            # NOTE(review): extract_first() returns None (not '') when the
            # selector misses, so this check may not catch a missing title --
            # confirm against the Selector implementation in use.
            if data['title'] == '':
                print('title为空!')
                raise Exception
            data['sub_title'] = ''
            data['shop_name'] = Selector(text=body).css('div.other.ft14.clearfix label b::text').extract_first()
            data['all_img_url'] = self._get_all_img_url(body=body)
            data['p_info'] = []  # the pc page carries no p_info
            div_desc = Selector(text=body).css('div.s_two').extract_first()
            if div_desc == '':
                print('div_desc为空!请检查!')
                raise Exception
            data['div_desc'] = div_desc
            detail_name_list = self._get_detail_name_list(body=body)
            data['detail_name_list'] = detail_name_list

            # original price and discounted (taobao) price
            taobao_price = Selector(text=body).css('dl.detail p.price b::text').extract_first()
            price = Selector(text=body).css('dl.detail dd em.yjprice::text').extract_first()
            # print(taobao_price)
            # print(price)
            try:
                # the trailing '*' lets integer (non-float) prices match too
                taobao_price = re.compile(r'(\d+\.{0,1}\d*)').findall(taobao_price)[0]
                price = re.compile(r'(\d+\.{0,1}\d*)').findall(price)[0]
            except IndexError:
                print('获取price失败,请检查!')
                raise IndexError
            if taobao_price == '' or price == '':
                print('获取到的taobao_price或者price为空值出错, 请检查!')
                raise Exception
            # round both prices to 2 decimal places
            taobao_price = Decimal(taobao_price).__round__(2)
            price = Decimal(price).__round__(2)
            # print('商品促销价为: ', taobao_price, ' 商品原价为: ', price)
            data['price'] = price
            data['taobao_price'] = taobao_price

            '''
            获取每个规格对应价格跟规格以及其库存
            '''
            price_info_list = self.get_price_info_list(
                detail_name_list,
                body,
                price,
                taobao_price
            )
            # pprint(price_info_list)
            if price_info_list == '':
                raise Exception
            else:
                data['price_info_list'] = price_info_list

            '''
            是否卖光
            '''
            # is_delete = 1 means total stock is 0 (sold out)
            all_stock = int(Selector(text=body).css('dl.detail dd label em::text').extract_first())
            if all_stock == 0:
                is_delete = 1
            else:
                is_delete = 0
            data['is_delete'] = is_delete
        except Exception as e:
            print('遇到错误: ', e)
            self.result_data = {}
            return {}

        if data != {}:
            # pprint(data)
            self.result_data = data
            return data
        else:
            print('data为空!')
            self.result_data = {}  # reset so a failure doesn't pollute later crawls
            return {}
class GX8899Spider(object):
    '''
    Crawler for avatar images on m.gx8899.com.

    For every configured category it walks the listing pages, collects the
    article links, extracts the image urls inside each article, and uses them
    to fill the still-empty head_img_url column of dbo.sina_weibo rows.
    '''

    def __init__(self, logger=None):
        '''
        :param logger: external logger to reuse; when None a dated file logger is created
        '''
        self._set_sort_type_name()
        self._set_logger(logger)
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        # update template: params are (head_img_url, modify_time, id)
        self.update_sql = 'update dbo.sina_weibo set head_img_url=%s, modify_time=%s where id=%s'
        self.phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.my_lg)
        self.id_list = []        # cached db ids still waiting for a head_img_url
        self.update_index = 0    # number of rows updated so far

    def _set_logger(self, logger):
        '''Bind the passed-in logger, or create a per-day file logger.'''
        if logger is None:
            self.my_lg = set_logger(
                log_file_name=MY_SPIDER_LOGS_PATH + '/gx8899/_/' + str(get_shanghai_time())[0:10] + '.txt',
                console_log_level=INFO,
                file_log_level=ERROR)
        else:
            self.my_lg = logger

    def _set_sort_type_name(self):
        '''
        Set the category slugs to crawl.
        :return:
        '''
        self.sort_type_name_list = [
            # 'weixin',
            # 'nansheng',
            # 'nvsheng',
            'fengjing',
            'jingxuan',
            'wupin',
            'oumei',
            'weimei',
            'heibai',
            'baqi',
            'xiaoqingxin',
            'yijing',
            'beiying',
            'chouyan',
            'sumiao',
            'gexing',
            'xiaohai',
            'qiche',
            'zhiwu',
            'shouhui',
            'weshen',
            'mingxing',
            'jianzhu',
            'renwu',
        ]

    def _get_gx8899_all_img_url(self):
        '''Crawl every configured category and return all harvested image urls.'''
        self.my_lg.info('即将开始采集gx8899...')
        fz = []
        for sort_type_name in self.sort_type_name_list:
            tmp = self._get_one_sort_type_name_page_info(sort_type_name)
            if tmp != []:
                fz += tmp
        self.my_lg.info('@@@ 全部头像抓取完毕!')
        self.fz = fz
        return fz

    def _get_new_wait_2_handle_id_list(self):
        '''
        Fetch (and cache) the next batch of ids that still need a head_img_url.
        :return: list of ids ([] on db failure)
        '''
        sql_str = '''
        select top 1000 id
        from dbo.sina_weibo
        where sina_type = 'bilibili' and modify_time is null
        '''
        if self.id_list == []:
            self.my_lg.info('@@@ 重新获取id_list...')
            self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            try:
                wait = self.my_pipeline._select_table(sql_str=sql_str)
                self.id_list = [i[0] for i in wait]
            # bug fix: `except TypeError or IndexError` only caught TypeError
            # (`or` evaluates to its first truthy operand) -- catch both.
            except (TypeError, IndexError):
                sleep(8)
                return []
        else:
            pass
        return self.id_list

    @fz_set_timeout(6)
    def oo(self, id, img_url):
        '''Update one row with img_url; return True on success, False otherwise.'''
        try:
            self.my_pipeline._update_table_2(
                sql_str=self.update_sql,
                params=(img_url, get_shanghai_time(), id),
                logger=self.my_lg)
        except Exception:
            return False
        return True

    def _get_one_sort_type_name_page_info(self, sort_type_name):
        '''
        Crawl every listing page of one category.
        :return: list of harvested image urls
        '''
        base_url = 'http://m.gx8899.com/{0}/'.format(sort_type_name)
        headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': get_random_pc_ua(),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # 'Referer': 'http://m.gx8899.com/weixin/',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        index = 0
        res = []
        while True:
            if index == 0:
                url = base_url
                index += 1      # pagination resumes at index_2 for the second page
            else:
                url = base_url + 'index_{0}.html'.format(index)
            self.my_lg.info('正在抓取{0}'.format(url))
            # plain requests was too slow -> use phantomjs instead
            # body = self._get_loop_run_result(url=url, headers=headers)
            if index % 15 == 0:
                # recycle phantomjs periodically to avoid leaks/hangs
                try:
                    del self.phantomjs
                except Exception:
                    pass
                gc.collect()
                self.phantomjs = MyPhantomjs(
                    executable_path=PHANTOMJS_DRIVER_PATH, logger=self.my_lg)
                self.my_lg.info('[+] phantomjs已重置!')
            body = self.phantomjs.use_phantomjs_to_get_url_body(url=url)
            # a 404 title means we ran past the last listing page
            if re.compile(r'<title>404 - 找不到文件或目录。</title>').findall(body) != []:
                break
            need = Selector(text=body).css(
                'div#con_tabone_1 li.last a:last-child ::attr(href)').extract()
            pprint(need)
            if need == []:
                self.my_lg.error('获取到的need为空list!出错地址:{0}'.format(url))
                # NOTE(review): this retries the same page forever if it stays
                # empty (index is not advanced) -- confirm that is intended
                continue
            for article_url in need:
                _ = self._get_one_article_page_info(article_url)
                if _ != []:
                    res += _
            self.my_lg.info('#### 已更新{0}个id !'.format(self.update_index))
            index += 1
        return res

    def _get_one_article_page_info(self, url):
        '''
        Extract every image url inside one article page and use each of them
        to update a randomly chosen pending db row.
        :param url:
        :return: list of image urls found on the page
        '''
        headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        # body = self._get_loop_run_result(url=url, headers=headers)
        body = self.phantomjs.use_phantomjs_to_get_url_body(url=url)
        if body == '':
            self.my_lg.info('获取到img list为空list!出错地址:{}'.format(url))
            return []
        need = Selector(text=body).css('div.content p img ::attr(src)').extract()
        if need != []:
            self.my_lg.info('[+] crawl子地址success')
        else:
            self.my_lg.info('[-] crawl子地址fail')
        # push each harvested img_url into a randomly chosen pending row
        for img_url in need:
            try:
                # randint raises ValueError when the pending id list is empty
                random_id_index = randint(
                    0, len(self._get_new_wait_2_handle_id_list()) - 1)
            except Exception:
                sleep(5)
                continue
            res = self.oo(
                id=self.id_list[random_id_index],
                img_url=img_url,
            )
            if res:
                self.id_list.pop(random_id_index)
                self.update_index += 1
        return need

    async def _get_one_page_body(self, url, headers):
        '''
        Fetch one page body asynchronously.
        :param url:
        :param headers:
        :return: body str
        '''
        body = await MyAiohttp.aio_get_url_body(url=url, headers=headers)
        return body

    def _get_loop_run_result(self, **kwargs):
        '''Run the async fetch to completion on the current event loop.'''
        loop = get_event_loop()
        result = loop.run_until_complete(
            self._get_one_page_body(
                url=kwargs.get('url', ''),
                headers=kwargs.get('headers', {})))
        return result

    def __del__(self):
        try:
            del self.phantomjs
            del self.my_lg
        except Exception:
            pass
        gc.collect()
class MoGuJiePinTuan(object):
    '''
    Crawler for mogujie.com limited-time group-buy (拼团) goods: collects the
    item lists per category via phantomjs, then parses and stores each item.
    '''

    def __init__(self):
        self._set_headers()
        self._set_fcid_dict()

    def _set_headers(self):
        '''Build the default request headers (random pc ua per instance).'''
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'api.mogujie.com',
            'Referer': 'https://pintuan.mogujie.com/ptpt/app/pd?acm=3.mce.1_10_1fvsk.51827.0.mUTadqIzS9Pbg.m_370494-pos_2-mf_4537_796033&ptp=m1._mf1_1239_4537._keyword_51827.0.xLt0G92',
            'User-Agent': get_random_pc_ua(),
        }

    def _set_fcid_dict(self):
        '''Category display name -> fcid used by the pc list endpoint.'''
        self.fcid_dict = {
            '女装': 10053171,
            # '精选': 10053172,
            '男友': 10053173,
            '内衣': 10053174,
            '女鞋': 10053175,
            '包包': 10053176,
            '美妆': 10053177,
            '生活': 10053178,
            '配饰': 10053179,
            '母婴': 10053180,
            '食品': 10053181,
        }

    def get_pintuan_goods_info(self):
        '''
        Build the list urls, collect every recent group-buy item of every
        category, and hand the accumulated list to deal_with_data().
        :return: None
        '''
        goods_list = []
        # Method 1 (the m-site api) is abandoned: its request signature
        # (mw-sign) could not be reproduced.
        # Method 2: crawl the pc list endpoint via phantomjs instead.
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        for key in self.fcid_dict:
            print('正在抓取的分类为: ', key)
            for index in range(1, 100):
                if index % 5 == 0:
                    # recycle phantomjs periodically to avoid leaks/hangs
                    try:
                        del self.my_phantomjs
                    except Exception:
                        pass
                    gc.collect()
                    self.my_phantomjs = MyPhantomjs(
                        executable_path=PHANTOMJS_DRIVER_PATH)
                fcid = self.fcid_dict[key]
                tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(
                    str(index), fcid)
                # plain requests started getting filtered -> phantomjs
                # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                body = self.my_phantomjs.use_phantomjs_to_get_url_body(
                    url=tmp_url)
                try:
                    body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                    tmp_data = json.loads(body)
                except Exception:
                    print('json.loads转换body时出错, 请检查')
                    continue
                if tmp_data.get('result', {}).get('wall', {}).get('docs', []) == []:
                    # empty docs -> no more pages for this category
                    break
                tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
                begin_time_timestamp = int(time.time())  # group-buy start timestamp
                item_list = [{
                    'goods_id': item.get('tradeItemId', ''),
                    'pintuan_time': {
                        'begin_time': timestamp_to_regulartime(
                            timestamp=begin_time_timestamp),
                        'end_time': timestamp_to_regulartime(
                            self.get_pintuan_end_time(
                                begin_time_timestamp, item.get('leftTimeOrg', ''))),
                    },
                    'all_sell_count': str(item.get('salesVolume', 0)),
                    'fcid': fcid,
                    'page': index,
                    'sort': key,
                } for item in tmp_item_list]
                print(item_list)
                for item_1 in item_list:
                    goods_list.append(item_1)
                sleep(MOGUJIE_SLEEP_TIME)
        # parse and store everything collected above
        print(goods_list)
        self.deal_with_data(goods_list)
        sleep(5)

    def deal_with_data(self, *params):
        '''
        Parse and store the collected group-buy goods.
        :param params: (goods_list,)
        :return:
        '''
        goods_list = params[0]
        mogujie = MoGuJieParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            db_goods_id_list = [
                item[0] for item in list(
                    my_pipeline._select_table(sql_str=mg_select_str_1))
            ]
            print(db_goods_id_list)
            for item in goods_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass
                else:
                    goods_id = str(item.get('goods_id', ''))
                    tmp_url = 'https://shop.mogujie.com/detail/' + str(goods_id)
                    mogujie.get_goods_data(goods_id=str(goods_id))
                    goods_data = mogujie.deal_with_data()
                    if goods_data == {}:
                        # empty parse result -> skip this goods_id
                        pass
                    else:
                        # normalize and enrich before insert
                        goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(
                            goods_data['price_info_list'])
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['pintuan_time'] = item.get('pintuan_time', {})
                        goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                            get_miaosha_begin_time_and_miaosha_end_time(
                                miaosha_time=item.get('pintuan_time', {}))
                        goods_data['all_sell_count'] = item.get('all_sell_count', '')
                        goods_data['fcid'] = str(item.get('fcid'))
                        goods_data['page'] = str(item.get('page'))
                        goods_data['sort'] = str(item.get('sort', ''))
                        _r = mogujie.insert_into_mogujie_pintuan_table(
                            data=goods_data, pipeline=my_pipeline)
                        if _r:
                            # keep the in-memory duplicate filter up to date
                            db_goods_id_list.append(goods_id)
                            db_goods_id_list = list(set(db_goods_id_list))
                    sleep(MOGUJIE_SLEEP_TIME)  # slow down
        else:
            print('数据库连接失败,此处跳过!')
            pass
        try:
            del mogujie
        except Exception:
            pass
        gc.collect()

    def get_pintuan_end_time(self, begin_time, left_time):
        '''
        Compute the group-buy end timestamp from a remaining-time string.
        :param begin_time: start timestamp (int)
        :param left_time: remaining-time string, e.g. '6天13小时', '13小时57分', '36分'
        :return: end timestamp (int)
        '''
        had_day = re.compile(r'天').findall(left_time)
        had_hour = re.compile(r'小时').findall(left_time)
        had_min = re.compile(r'分').findall(left_time)
        tmp = re.compile(r'\d+').findall(left_time)
        if tmp == []:
            # robustness fix: a digit-free string (e.g. empty leftTimeOrg)
            # previously raised IndexError below
            day, hour, minute = 0, 0, 0
        elif had_day != [] and had_hour != []:
            # left_time like '6天13小时'
            day, hour, minute = int(tmp[0]), int(tmp[1]), 0
        elif had_day == [] and had_hour != []:
            # left_time like '13小时57分'; robustness fix: a bare '13小时'
            # (no minutes) previously raised IndexError on tmp[1]
            day, hour, minute = 0, int(tmp[0]), int(tmp[1]) if had_min != [] else 0
        elif had_day == [] and had_hour == []:
            # left_time like '36分'
            print('left_time = ', left_time)
            day, hour, minute = 0, 0, int(tmp[0])
        else:
            # unrecognized format (e.g. days only) -> no offset
            print('day, hour, min = 0, 0, 0', 'left_time = ', left_time)
            day, hour, minute = 0, 0, 0
        # `minute` replaces the original local `min`, which shadowed the builtin
        left_end_time_timestamp = \
            day * 24 * 60 * 60 + \
            hour * 60 * 60 + \
            minute * 60
        return begin_time + left_end_time_timestamp

    def __del__(self):
        try:
            del self.my_phantomjs
        except Exception:
            pass
        gc.collect()
class YanXuanParse(object):
    '''
    Parser for NetEase YanXuan (网易严选) goods detail pages: fetches the m-site
    page via phantomjs, extracts the embedded jsonData blob, and structures it
    for storage/update through the sql pipeline.
    '''

    def __init__(self, logger=None):
        super(YanXuanParse, self).__init__()
        self.result_data = {}      # last successfully parsed goods data
        self._set_logger(logger)
        self._set_headers()
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH, logger=self.my_lg)

    def _set_logger(self, logger):
        '''Bind the passed-in logger, or create a per-day file logger.'''
        if logger is None:
            self.my_lg = set_logger(
                log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/_/' + str(get_shanghai_time())[0:10] + '.txt',
                console_log_level=INFO,
                file_log_level=ERROR)
        else:
            self.my_lg = logger

    def _set_headers(self):
        '''Build the default request headers (random phone ua per instance).'''
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': get_random_phone_ua(),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }

    def _get_goods_data(self, goods_id):
        '''
        Fetch and parse one goods page.
        :param goods_id:
        :return: parsed data dict ({} on any failure)
        '''
        if goods_id == '':
            self.my_lg.error('获取到的goods_id为空值!此处跳过!')
            return self._get_data_error_init()

        # crawl the yanxuan m-site detail page
        url = 'http://m.you.163.com/item/detail'
        params = self._get_params(goods_id=goods_id)
        m_url = url + '?id={0}'.format(goods_id)
        self.my_lg.info('------>>>| 正在抓取严选地址为: {0}'.format(m_url))
        write_info = '出错goods_id:{0}, 出错地址: {1}'.format(goods_id, m_url)

        # plain requests got redirected endlessly -> phantomjs
        # body = MyRequests.get_url_body(url=url, headers=self.headers, params=params)
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=_get_url_contain_params(url=url, params=params))
        if body == '':
            self.my_lg.error('获取到的body为空值!' + write_info)
            return self._get_data_error_init()

        try:
            # the goods data sits in an inline `var jsonData=...` script blob
            body = re.compile('var jsonData=(.*?),policyList=').findall(body)[0]
        except IndexError:
            self.my_lg.error('获取body时索引异常!' + write_info, exc_info=True)
            return self._get_data_error_init()

        body = nonstandard_json_str_handle(json_str=body)
        _ = json_2_dict(json_str=body, logger=self.my_lg)
        if _ == {}:
            self.my_lg.error('获取到的data为空dict!' + write_info)
            return self._get_data_error_init()

        _ = self._wash_data(_)
        data = {}
        try:
            data['title'] = self._wash_sensitive_info(self._get_title(data=_))
            data['sub_title'] = self._wash_sensitive_info(
                self._get_sub_title(data=_))
            data['shop_name'] = ''
            data['all_img_url'] = self._get_all_img_url(data=_)
            data['p_info'] = self._get_p_info(data=_)
            data['div_desc'] = self._get_div_desc(data=_)
            data['sell_time'] = self._get_sell_time(data=_)
            data['detail_name_list'] = self._get_detail_name_list(
                data=_.get('skuSpecList', []))
            data['price_info_list'] = self._get_price_info_list(
                data=_.get('skuList', []))
            data['price'], data['taobao_price'] = self._get_price_and_taobao_price(
                price_info_list=data['price_info_list'])
            if data['price'] == 0 or data['taobao_price'] == 0:
                # sold-out goods: mark logically deleted
                data['is_delete'] = 1
            else:
                data['is_delete'] = self._get_is_delete(
                    price_info_list=data['price_info_list'],
                    data=data,
                    other=_)
        except Exception:
            self.my_lg.error('遇到错误:', exc_info=True)
            self.my_lg.error(write_info)
            return self._get_data_error_init()

        if data != {}:
            self.result_data = data
            return data
        else:
            self.my_lg.info('data为空值')
            return self._get_data_error_init()

    def _deal_with_data(self):
        '''
        Structure the previously fetched data into the storage schema.
        :return: result dict ({} on failure)
        '''
        data = self.result_data
        if data != {}:
            shop_name = data['shop_name']
            account = ''
            title = data['title']
            sub_title = data['sub_title']
            detail_name_list = data['detail_name_list']
            price_info_list = data['price_info_list']
            all_img_url = data['all_img_url']
            p_info = data['p_info']
            div_desc = data['div_desc']
            is_delete = data['is_delete']
            # shelf/off-shelf schedule
            if data.get('sell_time', {}) != {}:
                schedule = [{
                    'begin_time': data.get('sell_time', {}).get('begin_time', ''),
                    'end_time': data.get('sell_time', {}).get('end_time', ''),
                }]
            else:
                schedule = []
            # total sales volume is not exposed here
            all_sell_count = ''
            price, taobao_price = data['price'], data['taobao_price']
            result = {
                'shop_name': shop_name,                # shop name
                'account': account,                    # shopkeeper account
                'title': title,                        # goods title
                'sub_title': sub_title,                # sub title
                'price': price,                        # original price
                'taobao_price': taobao_price,          # selling price
                'detail_name_list': detail_name_list,  # spec label names
                'price_info_list': price_info_list,    # price & stock per spec
                'all_img_url': all_img_url,            # sample image urls
                'p_info': p_info,                      # detail attribute pairs
                'div_desc': div_desc,                  # description html
                'schedule': schedule,                  # promo time window
                'all_sell_count': all_sell_count,      # total sales
                'is_delete': is_delete                 # off-shelf flag
            }
            self.result_data = {}
            return result
        else:
            self.my_lg.error('待处理的data为空的dict, 该商品可能已经转移或者下架')
            return self._get_data_error_init()

    def to_right_and_update_data(self, data, pipeline):
        '''
        Normalize data and update the corresponding db row in real time.
        :param data:
        :param pipeline:
        :return: result of the db update
        '''
        tmp = _get_right_model_data(data, site_id=30, logger=self.my_lg)
        params = self._get_db_update_params(item=tmp)
        # pick the sql variant matching which of shelf/delete time is present
        base_sql_str = yx_update_str_1
        if tmp['delete_time'] == '':
            sql_str = base_sql_str.format('shelf_time=%s', '')
        elif tmp['shelf_time'] == '':
            sql_str = base_sql_str.format('delete_time=%s', '')
        else:
            sql_str = base_sql_str.format('shelf_time=%s,', 'delete_time=%s')
        result = pipeline._update_table_2(
            sql_str=sql_str, params=params, logger=self.my_lg)
        return result

    def _get_db_update_params(self, item):
        '''Build the ordered params tuple matching the chosen update sql.'''
        params = [
            item['modify_time'],
            item['shop_name'],
            item['account'],
            item['title'],
            item['sub_title'],
            item['link_name'],
            dumps(item['price_info'], ensure_ascii=False),
            dumps(item['detail_name_list'], ensure_ascii=False),
            dumps(item['price_info_list'], ensure_ascii=False),
            dumps(item['all_img_url'], ensure_ascii=False),
            dumps(item['p_info'], ensure_ascii=False),
            item['div_desc'],
            item['all_sell_count'],
            item['is_delete'],
            item['is_price_change'],
            dumps(item['price_change_info'], ensure_ascii=False),
            item['sku_info_trans_time'],
            item['goods_id'],
        ]
        # shelf_time / delete_time slot in just before goods_id,
        # mirroring the placeholder order of the formatted sql
        if item.get('delete_time', '') == '':
            params.insert(-1, item['shelf_time'])
        elif item.get('shelf_time', '') == '':
            params.insert(-1, item['delete_time'])
        else:
            params.insert(-1, item['shelf_time'])
            params.insert(-1, item['delete_time'])
        return tuple(params)

    def _wash_sensitive_info(self, target_str):
        '''
        Strip brand-sensitive words from target_str.
        :param target_str:
        :return:
        '''
        add_sensitive_str_list = [
            '网易',
            '严选',
            '云音乐',
        ]
        target_str = wash_sensitive_info(
            data=target_str,
            replace_str_list=[],
            add_sensitive_str_list=add_sensitive_str_list)
        return target_str

    def _get_title(self, data):
        '''Extract the goods title; raise AssertionError when missing.'''
        title = data.get('name', '')
        assert title != '', '获取到的name为空值!请检查!'
        return title

    def _get_sub_title(self, data):
        '''Extract the sub title (may legitimately be empty).'''
        sub_title = data.get('simpleDesc', '')
        return sub_title

    def _get_all_img_url(self, data):
        '''Collect sample image urls: the list pic first, then itemDetail picUrl* keys.'''
        tmp = data.get('itemDetail', {})
        first_img_url = data.get('listPicUrl', '')
        assert tmp != {}, '获取到的all_img_url为空dict!'
        all_img_url = [{
            'img_url': first_img_url
        }] if first_img_url != '' else []
        for key, value in tmp.items():
            if re.compile('picUrl').findall(key) != []:
                all_img_url.append({
                    'img_url': value,
                })
        return all_img_url

    def _get_p_info(self, data):
        '''Extract the detail attribute name/value pairs (values washed).'''
        p_info = [{
            'p_name': item.get('attrName', ''),
            'p_value': self._wash_sensitive_info(item.get('attrValue', '')),
        } for item in data.get('attrList', [])]
        return p_info

    def _get_div_desc(self, data):
        '''Extract and clean the description html; raise AssertionError when missing.'''
        div_desc = data.get('itemDetail', {}).get('detailHtml', '')
        assert div_desc != '', '获取到的div_desc为空值!请检查!'
        div_desc = self._wash_div_desc(div_desc)
        return div_desc

    def _wash_div_desc(self, div_desc):
        '''
        Clean div_desc by rebuilding it from its image srcs only.
        (Filtering the original html by regex proved unreliable: the leading
        disclaimer images use hashed, ever-changing urls, so we rebuild from
        the deduplicated src list and drop the first 3 and last 2 images.)
        :param div_desc:
        :return:
        '''
        img_list = unique_list_and_keep_original_order(
            re.compile('src=\"(.*?)\"').findall(div_desc))
        _ = ''
        for item in img_list[3:-2:]:
            _ += '<p><img src="{0}" style=""/></p>'.format(item)
        div_desc = _
        return div_desc

    def _get_sell_time(self, data):
        '''
        Derive the on/off-shelf window from gradientPrice.leftTime.
        :param data:
        :return: dict with begin_time/end_time, or {} when unavailable
        '''
        try:
            left_time = data.get('gradientPrice', {}).get('leftTime', 0)
        except AttributeError:
            # gradientPrice may be '' rather than a dict
            return {}
        if left_time == 0:
            return {}
        now_time_timestamp = datetime_to_timestamp(get_shanghai_time())
        sell_time = {
            'begin_time': timestamp_to_regulartime(now_time_timestamp),
            'end_time': timestamp_to_regulartime(now_time_timestamp + left_time),
        }
        return sell_time

    def _get_detail_name_list(self, data):
        '''Build the spec label list; img_here marks specs whose first value has a pic.'''
        detail_name_list = []
        for item in data:
            if item.get('name') is None:
                return []
            else:
                try:
                    img_here = 1 if item.get('skuSpecValueList', [])[0].get(
                        'picUrl', '') != '' else 0
                except IndexError:
                    img_here = 0
                detail_name_list.append({
                    'spec_name': item.get('name'),
                    'img_here': img_here,
                })
        return detail_name_list

    def _get_price_info_list(self, data):
        '''
        Build price_info_list from skuList.
        :param data:
        :return:
        '''
        price_info_list = []
        for item in data:
            itemSkuSpecValueList = item.get('itemSkuSpecValueList', [])
            spec_value_list = [
                i.get('skuSpecValue', {}).get('value', '')
                for i in itemSkuSpecValueList
            ]
            spec_value = '|'.join(spec_value_list)
            img_url = item.get('pic', '')  # may be empty
            if item.get('promotionDesc', '') == '新人专享价':
                # new-user promo price: fall back to the original price
                detail_price = str(item.get('calcPrice', ''))
            else:
                detail_price = str(item.get('retailPrice', ''))  # retail price
            normal_price = str(item.get('counterPrice', ''))     # market price
            account_limit_buy_count = 5
            rest_number = item.get('sellVolume', 0)
            if rest_number == 0:
                continue
            price_info_list.append({
                'spec_value': spec_value,
                'img_url': img_url,
                'detail_price': detail_price,
                'normal_price': normal_price,
                'account_limit_buy_count': account_limit_buy_count,
                'rest_number': rest_number,
            })
        return price_info_list

    def _get_price_and_taobao_price(self, price_info_list):
        '''
        Derive (price, taobao_price) as (max, min) over all spec prices.
        :return: (0, 0) for a sold-out goods (empty price_info_list)
        '''
        if price_info_list == []:
            return 0, 0
        try:
            tmp_price_list = sorted([
                round(float(item.get('detail_price', '')), 2)
                for item in price_info_list
            ])
            price = tmp_price_list[-1]        # highest spec price
            taobao_price = tmp_price_list[0]  # lowest spec price
        except IndexError:
            raise IndexError('获取price, taobao_price时索引异常!请检查!')
        return price, taobao_price

    def _get_is_delete(self, price_info_list, data, other):
        '''
        Decide the logical-delete flag: 1 when all specs are out of stock,
        the official off-shelf time has passed, or the api says soldOut.
        '''
        is_delete = 0
        all_rest_number = 0
        if price_info_list != []:
            for item in price_info_list:
                all_rest_number += item.get('rest_number', 0)
            if all_rest_number == 0:
                is_delete = 1
        else:
            is_delete = 1

        # official off-shelf time before now -> logically deleted
        if data['sell_time'] != {}:
            end_time = datetime_to_timestamp(
                string_to_datetime(
                    data.get('sell_time', {}).get('end_time', '')))
            if end_time < datetime_to_timestamp(get_shanghai_time()):
                self.my_lg.info('该商品已经过期下架...! 进行逻辑删除 is_delete=1')
                is_delete = 1

        if other.get('soldOut'):  # True or False
            is_delete = 1

        return is_delete

    def _get_data_error_init(self):
        '''Reset cached data and return the canonical failure value.'''
        self.result_data = {}
        return {}

    def _get_params(self, goods_id):
        '''Build the query params for the m-site detail endpoint.'''
        params = (('id', goods_id), )
        return params

    def _wash_data(self, data):
        '''
        Drop payload sections we never use (comments, issue list).
        :param data:
        :return:
        '''
        try:
            data['comments'] = []
            data['issueList'] = []
        except:
            pass
        return data

    def get_goods_id_from_url(self, yanxuan_url):
        '''
        Extract the goods_id from a yanxuan detail url, e.g.
        http://you.163.com/item/detail?id=1130056&_stat_area=...
        :param yanxuan_url:
        :return: goods_id str ('' for non-yanxuan or malformed urls)
        '''
        is_yanxuan_url = re.compile(r'you.163.com/item/detail.*?').findall(
            yanxuan_url)
        if is_yanxuan_url != []:
            if re.compile(r'id=(\d+)').findall(yanxuan_url) != []:
                goods_id = re.compile(r'id=(\d+)').findall(yanxuan_url)[0]
                self.my_lg.info(
                    '------>>>| 得到的严选商品的goods_id为: {0}'.format(goods_id))
                return goods_id
        # bug fix: the original fell through and implicitly returned None for
        # non-yanxuan urls; return '' consistently with the malformed-url path
        self.my_lg.info(
            '网易严选商品url错误, 非正规的url, 请参照格式(https://you.163.com/item/detail)开头的...'
        )
        return ''

    def __del__(self):
        try:
            del self.my_phantomjs
            del self.my_lg
        except Exception:
            pass
        gc.collect()