def get_pintuan_goods_data(self, juanpi_pintuan, goods_id, all_sell_count, page):
    '''
    Fetch and enrich the pintuan (group-buy) data for one goods id.

    :param juanpi_pintuan: juanpi pintuan parser object (provides
        get_goods_id_from_url / get_goods_data / deal_with_data)
    :param goods_id: goods id used to build the shop url
    :param all_sell_count: total sold count to record on the result
    :param page: page index to record on the result
    :return: enriched goods_data dict, or {} when parsing failed or no
        schedule was returned
    '''
    tmp_url = 'http://shop.juanpi.com/deal/' + str(goods_id)
    goods_id = juanpi_pintuan.get_goods_id_from_url(tmp_url)
    juanpi_pintuan.get_goods_data(goods_id=goods_id)
    goods_data = juanpi_pintuan.deal_with_data()
    if goods_data != {}:
        goods_data['goods_id'] = str(goods_id)
        goods_data['spider_url'] = 'https://web.juanpi.com/pintuan/shop/' + str(goods_id)
        goods_data['username'] = '******'
        goods_data['all_sell_count'] = all_sell_count
        goods_data['page'] = page
        schedule = goods_data.get('schedule', [])
        if schedule:
            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=schedule[0])
        else:
            # BUGFIX: the original indexed goods_data.get('schedule', [])[0]
            # unconditionally, raising IndexError when 'schedule' is absent or
            # empty. Treat that case as a failed parse and return {}.
            goods_data = {}
    gc.collect()
    return goods_data
def _deal_with_data(self):
    '''
    Parse and persist the crawled zhe800 pintuan (group-buy) goods data.

    Skips ids already present in the db; inserts newly parsed goods and
    records their ids locally so the same run never re-inserts them.
    :return: None
    '''
    zid_list = self._get_pintuan_goods_info()
    zhe_800_pintuan = Zhe800PintuanParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        # ids already stored in the db (first column of each row)
        db_goods_id_list = [
            item[0] for item in list(
                my_pipeline._select_table(sql_str=z8_select_str_1))
        ]
        for item in zid_list:
            if item[0] in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
                pass
            else:
                tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(
                    item[0])
                goods_id = zhe_800_pintuan.get_goods_id_from_url(tmp_url)
                zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                goods_data = zhe_800_pintuan.deal_with_data()
                if goods_data == {}:  # empty parse result -> skip this goods
                    pass
                else:  # otherwise enrich and insert
                    goods_data['goods_id'] = str(item[0])
                    goods_data['spider_url'] = tmp_url
                    goods_data['username'] = '******'
                    goods_data['page'] = str(item[1])
                    # NOTE(review): assumes 'schedule' is a non-empty list —
                    # an empty/missing schedule would raise IndexError here.
                    goods_data['pintuan_begin_time'], goods_data[
                        'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=goods_data.get('schedule', [])[0])
                    # print(goods_data)
                    _r = zhe_800_pintuan.insert_into_zhe_800_pintuan_table(
                        data=goods_data, pipeline=my_pipeline)
                    if _r:  # on successful insert, remember the id (deduped)
                        db_goods_id_list.append(item[0])
                        db_goods_id_list = list(set(db_goods_id_list))
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)
        gc.collect()
    else:
        pass
    try:
        del zhe_800_pintuan
    except:
        pass
    gc.collect()
    return None
async def insert_into_table(self, tmp_item, category, current_page, my_pipeline, index):
    '''
    Insert one taobao tiantiantejia goods record.

    :param tmp_item: raw item dict (goods_id / start_time / end_time)
    :param category: block id of the goods
    :param current_page: page the item was found on (stored as tag_id)
    :param my_pipeline: db pipeline used for the insert
    :param index: running counter
    :return: index incremented by 1
    '''
    item_url = 'https://item.taobao.com/item.htm?id=' + str(
        tmp_item.get('goods_id', ''))
    parser = TaoBaoLoginAndParse(logger=self.my_lg)
    parsed_id = parser.get_goods_id_from_url(item_url)
    parser.get_goods_data(goods_id=parsed_id)
    goods_data = parser.deal_with_data(goods_id=parsed_id)

    # Nothing parsed: back off briefly and move on.
    if goods_data == {}:
        await asyncio.sleep(4)
        return index + 1

    schedule = [{
        'begin_time': tmp_item.get('start_time', ''),
        'end_time': tmp_item.get('end_time', ''),
    }]
    goods_data['goods_id'] = tmp_item.get('goods_id', '')
    goods_data['goods_url'] = item_url
    goods_data['schedule'] = schedule
    begin_end = get_miaosha_begin_time_and_miaosha_end_time(
        miaosha_time=goods_data.get('schedule', [])[0])
    goods_data['tejia_begin_time'], goods_data['tejia_end_time'] = begin_end
    goods_data['block_id'] = str(category)
    goods_data['tag_id'] = str(current_page)
    goods_data['father_sort'] = self.main_sort[category][0]
    goods_data['child_sort'] = ''
    # pprint(goods_data)
    await parser.insert_into_taobao_tiantiantejia_table(
        data=goods_data, pipeline=my_pipeline)
    return index + 1
def _get_mia_pt_one_goods_info(self, mia_pt_obj, goods_id, sub_title='') -> dict:
    """
    Fetch the info dict of a single mia pintuan goods.

    :param mia_pt_obj: mia pintuan parser object
    :param goods_id: goods id to fetch
    :param sub_title: optional sub title to attach
    :return: enriched goods info dict (asserts the parse was non-empty)
    """
    mia_pt_obj.get_goods_data(goods_id=goods_id)
    info = mia_pt_obj.deal_with_data()
    assert info != {}, 'goods_data不为空dict'
    info['goods_id'] = str(goods_id)
    info['sub_title'] = sub_title
    pintuan_time = info['pintuan_time']
    if pintuan_time == {}:
        # No pintuan window left -> the group-buy is delisted; collapse both
        # boundaries to "now".
        now = get_shanghai_time()
        time_pair = (now, now)
    else:
        time_pair = get_miaosha_begin_time_and_miaosha_end_time(
            miaosha_time=pintuan_time)
    info['pintuan_begin_time'], info['pintuan_end_time'] = time_pair
    return info
def get_spike_hour_goods_info(self):
    '''
    Build the data urls and collect every recent pinduoduo flash-sale
    goods record, inserting new ones into the db.
    :return:
    '''
    all_miaosha_goods_list = self.get_all_miaosha_goods_list()
    try:
        del self.driver
    except:
        pass
    gc.collect()
    pinduoduo = PinduoduoParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        self.db_goods_id_list = self._get_db_goods_id_list()
        for item in all_miaosha_goods_list:
            '''
            注意: 明日8点半抓取到的是页面加载中返回的是空值
            '''
            if item.get('goods_id') != 'None':  # skip items whose goods_id is the literal string 'None'
                if item.get('goods_id', '') in self.db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    pass
                else:
                    tmp_url = 'http://mobile.yangkeduo.com/goods.html?goods_id=' + item.get('goods_id')
                    pinduoduo.get_goods_data(goods_id=item.get('goods_id'))
                    goods_data = pinduoduo.deal_with_data()
                    # print(goods_data)
                    if goods_data == {}:  # empty parse result -> skip, retried next sweep
                        print('得到的goods_data为空值,此处先跳过,下次遍历再进行处理')
                        # sleep(3)
                        pass
                    else:  # otherwise enrich and insert
                        goods_data['stock_info'] = item.get('stock_info')
                        goods_data['goods_id'] = item.get('goods_id')
                        goods_data['spider_url'] = tmp_url
                        goods_data['username'] = '******'
                        goods_data['price'] = item.get('price')  # original special price before the flash sale
                        goods_data['taobao_price'] = item.get('taobao_price')  # flash-sale price
                        goods_data['sub_title'] = item.get('sub_title', '')
                        goods_data['miaosha_time'] = item.get('miaosha_time')
                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item.get('miaosha_time'))
                        if item.get('stock_info', {}).get('activity_stock', 0) <= 2:
                            # live flash-sale stock <= 2 -> mark the goods as sold out
                            print('该秒杀商品已售罄...')
                            goods_data['is_delete'] = 1
                        pinduoduo.insert_into_pinduoduo_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                        sleep(PINDUODUO_SLEEP_TIME)
            else:
                print('该goods_id为"None", 此处跳过')
                pass
            sleep(5)
    else:
        pass
    try:
        del pinduoduo
    except:
        pass
    gc.collect()
def deal_with_data(self, *params):
    '''
    Parse and persist jumei flash-sale goods data.

    :param params: params[0] is the item_list to process (dicts with at
        least 'goods_id', 'type', 'page')
    :return: None
    '''
    item_list = params[0]
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        _ = list(my_pipeline._select_table(sql_str=jm_select_str_2))
        db_goods_id_list = [item[0] for item in _]
        # print(db_goods_id_list)
        for item in item_list:
            if item.get('goods_id', '') in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue
            jumei = JuMeiYouPinParse()
            goods_id = item.get('goods_id', '')
            # renamed from `type`, which shadowed the builtin
            goods_type = item.get('type', '')
            tmp_url = 'https://h5.jumei.com/product/detail?item_id={0}&type={1}'.format(
                goods_id, goods_type)
            jumei.get_goods_data(goods_id=[goods_id, goods_type])
            goods_data = jumei.deal_with_data()
            if goods_data == {}:
                pass  # empty parse result -> skip
            elif goods_data.get('is_delete', 0) == 1:
                print('------>>>| 该商品库存为0,已被抢光!')
                pass
            else:  # otherwise enrich and insert
                goods_data['goods_url'] = tmp_url
                goods_data['goods_id'] = str(goods_id)
                goods_data['miaosha_time'] = {
                    'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
                    'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
                }
                goods_data['miaosha_begin_time'], goods_data[
                    'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=goods_data['miaosha_time'])
                goods_data['page'] = item.get('page')
                # pprint(goods_data)
                res = jumei.insert_into_jumeiyoupin_xianshimiaosha_table(
                    data=goods_data, pipeline=my_pipeline)
                if res and goods_id not in db_goods_id_list:
                    db_goods_id_list.append(goods_id)
            sleep(JUMEIYOUPIN_SLEEP_TIME)  # slow down (phantomjs init is slow already)
            try:
                del jumei
            except:
                pass
    else:
        print('数据库连接失败,此处跳过!')
        pass
    gc.collect()
def deal_with_data(self, *params):
    '''
    Parse and persist chuchujie flash-sale goods data.

    :param params: params[0] is the item_list to process (dicts with at
        least 'goods_id', 'sub_title', 'gender', 'page')
    :return: None
    '''
    item_list = params[0]
    chuchujie = ChuChuJie_9_9_Parse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        _ = list(my_pipeline._select_table(sql_str=cc_select_str_2))
        db_goods_id_list = [item[0] for item in _]
        # print(db_goods_id_list)
        for item in item_list:
            if item.get('goods_id', '') in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue
            goods_id = item.get('goods_id', '')
            tmp_url = 'https://m.chuchujie.com/details/detail.html?id=' + str(
                goods_id)
            chuchujie.get_goods_data(goods_id=goods_id)
            goods_data = chuchujie.deal_with_data()
            if goods_data == {}:  # empty parse result -> skip
                sleep(.5)
            elif goods_data.get('is_delete', 0) == 1:  # stock is 0 -> skip
                print('------>>>| 该商品库存为0,已被抢光!')
                sleep(.5)
            else:  # otherwise fetch the countdown and insert
                my_phantomjs = BaseDriver(
                    executable_path=PHANTOMJS_DRIVER_PATH,
                    ip_pool_type=self.ip_pool_type)
                # scrape the remaining-time fragment from the mobile page
                tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(
                    url=tmp_url, css_selector='p#activityTime span')
                # print(tmp_body)
                try:
                    del my_phantomjs
                except:
                    pass
                gc.collect()
                if tmp_body == '':  # failed to fetch the mobile page html
                    sleep(.5)
                    continue
                # p#activityTime span
                _t = Selector(text=tmp_body).css(
                    'p#activityTime span::text').extract_first()
                # BUGFIX: extract_first() may return None; the original ran
                # re.sub on _t BEFORE the None check (TypeError), and on the
                # empty case it printed an error but still fell through into
                # get_miaosha_end_time(''). Check first, then strip, then skip
                # the item when nothing usable was extracted.
                if _t is None or _t == '':
                    print('获取到的_t为空值, 严重错误! 请检查!')
                    continue
                _t = re.compile(r'剩余').sub('', _t)
                # print(_t)
                miaosha_end_time = self.get_miaosha_end_time(_t)
                goods_data['goods_url'] = tmp_url
                goods_data['goods_id'] = str(goods_id)
                goods_data['sub_title'] = item.get('sub_title', '')
                goods_data['miaosha_time'] = {
                    'miaosha_begin_time': timestamp_to_regulartime(int(time.time())),
                    'miaosha_end_time': timestamp_to_regulartime(int(miaosha_end_time)),
                }
                goods_data['miaosha_begin_time'], goods_data[
                    'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=goods_data['miaosha_time'])
                goods_data['gender'] = str(item.get('gender', '0'))
                goods_data['page'] = item.get('page')
                res = chuchujie.insert_into_chuchujie_xianshimiaosha_table(
                    data=goods_data, pipeline=my_pipeline)
                if res and goods_id not in db_goods_id_list:
                    db_goods_id_list.append(goods_id)
                # sleep(CHUCHUJIE_SLEEP_TIME)  # slow down (phantomjs init is slow already)
                # index += 1
    else:
        print('数据库连接失败,此处跳过!')
        pass
    try:
        del chuchujie
    except:
        pass
    gc.collect()
def get_spike_hour_goods_info(self):
    '''
    Build the data urls and collect every recent juanpi flash-sale goods
    record, inserting new ones into the db.
    :return:
    '''
    tab_id_list = [11, 12, 13, 21, 22, 23, 31, 32, 33]  # notice
    for tab_id in tab_id_list:
        for index in range(0, 50):
            tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                str(tab_id), str(index))
            print('待抓取的限时秒杀地址为: ', tmp_url)
            data = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
            if data == '':
                break  # empty body -> no more pages for this tab
            try:
                data = json.loads(data)
                data = data.get('data', {})
                # print(data)
            except:
                break  # unparsable body -> give up on this tab
            if data.get('goodslist') == []:
                print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                    tab_id, index))
                break
            else:
                data = data.get('goodslist', [])
                # print(data)
                if data == []:
                    print('goodslist为[], 此处跳过')
                    pass
                else:
                    miaosha_goods_list = self.get_miaoshao_goods_info_list(
                        data=data)
                    print(miaosha_goods_list)
                    juanpi = JuanPiParse()
                    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    if my_pipeline.is_connect_success:
                        # known goods ids already stored in the db
                        if my_pipeline._select_table(
                                sql_str=jp_select_str_5) is None:
                            db_goods_id_list = []
                        else:
                            db_goods_id_list = [
                                item[0] for item in list(
                                    my_pipeline._select_table(
                                        sql_str=jp_select_str_5))
                            ]
                        for item in miaosha_goods_list:
                            if item.get('goods_id', '') in db_goods_id_list:
                                print('该goods_id已经存在于数据库中, 此处跳过')
                                pass
                            else:
                                tmp_url = 'http://shop.juanpi.com/deal/' + item.get(
                                    'goods_id')
                                juanpi.get_goods_data(
                                    goods_id=item.get('goods_id'))
                                goods_data = juanpi.deal_with_data()
                                if goods_data == {}:  # empty parse result -> skip
                                    pass
                                else:  # otherwise enrich and insert
                                    goods_data['stock_info'] = item.get(
                                        'stock_info')
                                    goods_data['goods_id'] = item.get(
                                        'goods_id')
                                    goods_data['spider_url'] = tmp_url
                                    goods_data['username'] = '******'
                                    goods_data['price'] = item.get(
                                        'price')  # original special price before the flash sale
                                    goods_data['taobao_price'] = item.get(
                                        'taobao_price')  # flash-sale price
                                    goods_data['sub_title'] = item.get(
                                        'sub_title', '')
                                    goods_data['miaosha_time'] = item.get(
                                        'miaosha_time')
                                    goods_data[
                                        'miaosha_begin_time'], goods_data[
                                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                miaosha_time=item.get(
                                                    'miaosha_time'))
                                    goods_data['tab_id'] = tab_id
                                    goods_data['page'] = index
                                    # print(goods_data)
                                    juanpi.insert_into_juanpi_xianshimiaosha_table(
                                        data=goods_data, pipeline=my_pipeline)
                                    sleep(.4)  # brief sleep to avoid tripping the site
                        sleep(.65)
                    else:
                        pass
                    try:
                        del juanpi
                    except:
                        pass
                    gc.collect()
def get_spike_hour_goods_info(self):
    '''
    Build the data urls and collect every recent zhe800 flash-sale goods
    record, inserting new ones into the db. Walks session ids two at a
    time from BASE_SESSION_ID up to MAX_SESSION_ID.
    :return:
    '''
    base_session_id = BASE_SESSION_ID
    while base_session_id < MAX_SESSION_ID:
        print('待抓取的session_id为: ', base_session_id)
        data = self._get_one_session_id_data(base_session_id=base_session_id)
        sleep(.5)
        if data.get('data', {}).get('blocks', []) == []:  # this session_id does not exist
            base_session_id += 2
            continue
        try:
            begin_times_timestamp = self._get_begin_times_timestamp(data)
        except Exception as e:
            print('遇到严重错误: ', e)
            base_session_id += 2
            continue
        print('秒杀时间为: ', timestamp_to_regulartime(begin_times_timestamp))
        is_recent_time = self.is_recent_time(timestamp=begin_times_timestamp)
        if not is_recent_time:  # flash-sale date NOT in the wanted window -> skip
            base_session_id += 2
            continue
        try:
            data = [item_s.get('deal', {}) for item_s in data.get('data', {}).get('blocks', [])]
        except Exception as e:
            print('遇到严重错误: ', e)
            base_session_id += 2
            continue
        # pprint(data)
        if data != []:  # the session contains deals
            miaosha_goods_list = self.get_miaoshao_goods_info_list(data=data)
            # pprint(miaosha_goods_list)
            zhe_800 = Zhe800Parse()
            my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            if my_pipeline.is_connect_success:
                db_goods_id_list = self._get_db_goods_id_list(my_pipeline)
                for item in miaosha_goods_list:
                    if item.get('zid', '') in db_goods_id_list:
                        print('该goods_id已经存在于数据库中, 此处跳过')
                        pass
                    else:
                        tmp_url = 'https://shop.zhe800.com/products/' + str(item.get('zid', ''))
                        goods_id = zhe_800.get_goods_id_from_url(tmp_url)
                        zhe_800.get_goods_data(goods_id=goods_id)
                        goods_data = zhe_800.deal_with_data()
                        if goods_data == {}:  # empty parse result -> skip
                            pass
                        else:  # otherwise enrich and insert
                            goods_data['stock_info'] = item.get('stock_info')
                            goods_data['goods_id'] = str(item.get('zid'))
                            goods_data['spider_url'] = tmp_url
                            goods_data['username'] = '******'
                            goods_data['price'] = item.get('price')
                            goods_data['taobao_price'] = item.get('taobao_price')
                            goods_data['sub_title'] = item.get('sub_title')
                            # goods_data['is_baoyou'] = item.get('is_baoyou')
                            goods_data['miaosha_time'] = item.get('miaosha_time')
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item.get('miaosha_time'))
                            goods_data['session_id'] = str(base_session_id)
                            # print(goods_data)
                            res = zhe_800.insert_into_zhe_800_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                            if res:
                                if goods_id not in db_goods_id_list:
                                    db_goods_id_list.append(goods_id)
                            sleep(ZHE_800_SPIKE_SLEEP_TIME)  # slow down
                sleep(4)
            else:
                pass
            try:
                del zhe_800
            except:
                pass
            gc.collect()
        else:  # this session id carried no data
            print('该sessionid没有相关key为jsons的数据')
            pass
        base_session_id += 2
def deal_with_data(self, goods_list):
    '''
    Parse and persist mia pintuan (group-buy) goods data.

    :param goods_list: list of dicts with at least 'goods_id',
        'sub_title' and 'pid'
    :return: None
    '''
    mia = MiaPintuanParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        db_goods_id_list = [
            item[0] for item in list(
                my_pipeline._select_table(sql_str=mia_select_str_1))
        ]
        # print(db_goods_id_list)
        for item in goods_list:
            if item.get('goods_id', '') in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue
            goods_id = str(item.get('goods_id', ''))
            tmp_url = 'https://www.mia.com/item-' + str(goods_id) + '.html'
            mia.get_goods_data(goods_id=str(goods_id))
            goods_data = mia.deal_with_data()
            if goods_data == {}:  # empty parse result -> skip
                continue
            goods_url = goods_data['goods_url']
            # BUGFIX: re.findall() returns a list, which never equals '', so
            # the original `findall(...) != ''` was always True and EVERY
            # goods_url was rewritten to the miyabaobei.hk domain. Use the
            # match list's truthiness instead.
            if re.compile(r'://m.miyabaobei.hk/').findall(goods_url):
                goods_url = 'https://www.miyabaobei.hk/item-' + str(
                    goods_id) + '.html'
            else:
                goods_url = 'https://www.mia.com/item-' + str(
                    goods_id) + '.html'
            goods_data['goods_url'] = goods_url
            goods_data['goods_id'] = str(goods_id)
            goods_data['sub_title'] = item.get('sub_title', '')
            goods_data['pintuan_begin_time'], goods_data[
                'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data['pintuan_time'])
            goods_data['pid'] = item.get('pid')
            # pprint(goods_data)
            _r = mia.insert_into_mia_pintuan_table(
                data=goods_data, pipeline=my_pipeline)
            if _r:  # on successful insert, remember the id (deduped)
                db_goods_id_list.append(goods_id)
                db_goods_id_list = list(set(db_goods_id_list))
            sleep(MIA_SPIKE_SLEEP_TIME)  # slow down
    else:
        print('数据库连接失败,此处跳过!')
        pass
    try:
        del mia
    except:
        pass
    gc.collect()
def run_forever(self):
    '''
    Real-time update pass: only goods listed within the next 2 hours of
    today are refreshed; farther-future entries (all still at original
    price) are left alone for now.
    :return:
    '''
    #### real-time data update
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = list(
            tmp_sql_server._select_table(sql_str=pd_select_str_2))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        # memory optimisation: construct the parser once out here and delete
        # it after the pass instead of per-iteration
        pinduoduo_miaosha = PinduoduoParse()
        all_miaosha_goods_list = self.get_all_miaosha_goods_list()
        # every goods_id currently listed on the site
        miaosha_goods_all_goods_id = [
            i.get('goods_id') for i in all_miaosha_goods_list
        ]
        # print(miaosha_goods_all_goods_id)
        for item in result:  # refresh each db row
            # for pinduoduo, first derive this goods' flash-sale end timestamp
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(
                str(
                    time.mktime(
                        time.strptime(miaosha_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            if index % 50 == 0:  # reconnect every 50 rows to avoid a stale long-lived connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:  # expired -> delete
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0]))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀结束时间为(%s), 删除成功!'
                        % json.loads(item[1]).get('miaosha_end_time'))
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # must be pass, not break: goods ids come back unordered
                    pass
                else:  # == 1: inside the update window
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    if item[0] not in miaosha_goods_all_goods_id:  # delisted upstream
                        '''
                        表示其中没有了该goods_id
                        '''
                        tmp_sql_server._delete_table(
                            sql_str=self.delete_sql_str, params=(item[0]))
                        print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' % item[0])
                        pass
                    else:  # still listed -> refresh from the live item
                        for item_1 in all_miaosha_goods_list:
                            if item_1.get('goods_id', '') == item[0]:
                                # # memory optimisation (kept from original)
                                # pinduoduo_miaosha = PinduoduoParse()
                                pinduoduo_miaosha.get_goods_data(
                                    goods_id=item[0])
                                goods_data = pinduoduo_miaosha.deal_with_data(
                                )
                                if goods_data == {}:  # empty parse result -> skip
                                    # sleep(3)
                                    pass
                                else:  # otherwise enrich and update
                                    goods_data['stock_info'] = item_1.get(
                                        'stock_info')
                                    goods_data['goods_id'] = item_1.get(
                                        'goods_id')
                                    # only refresh prices while stock remains
                                    if item_1.get('stock_info').get(
                                            'activity_stock') > 0:
                                        goods_data['price'] = item_1.get(
                                            'price')  # original special price before the flash sale
                                        goods_data[
                                            'taobao_price'] = item_1.get(
                                                'taobao_price')  # flash-sale price
                                    else:
                                        pass
                                    goods_data['sub_title'] = item_1.get(
                                        'sub_title', '')
                                    goods_data[
                                        'miaosha_time'] = item_1.get(
                                            'miaosha_time')
                                    goods_data[
                                        'miaosha_begin_time'], goods_data[
                                            'miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                miaosha_time=item_1.get(
                                                    'miaosha_time'))
                                    if item_1.get('stock_info').get(
                                            'activity_stock') <= 1:
                                        # live stock <= 1 -> mark as sold out
                                        print('该秒杀商品已售罄...')
                                        goods_data['is_delete'] = 1
                                    # print(goods_data)
                                    pinduoduo_miaosha.to_update_pinduoduo_xianshimiaosha_table(
                                        data=goods_data,
                                        pipeline=tmp_sql_server)
                                sleep(PINDUODUO_SLEEP_TIME)
                            else:
                                pass
                index += 1
                gc.collect()
            else:  # pipeline returned empty / connection lost
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updating after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(3)
    # del ali_1688
    gc.collect()
def run_forever(self):
    '''
    Real-time update pass over the mia pintuan (group-buy) table.
    :return:
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=mia_delete_str_2)
        result = list(
            tmp_sql_server._select_table(sql_str=mia_select_str_2))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        for item in result:  # refresh each db row
            pintuan_end_time = json_2_dict(item[1]).get('end_time')
            pintuan_end_time = int(
                str(
                    time.mktime(
                        time.strptime(pintuan_end_time,
                                      '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            data = {}
            mia_pintuan = MiaPintuanParse()
            if index % 50 == 0:  # reconnect every 50 rows to avoid a stale long-lived connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(pintuan_end_time) == 0:  # expired -> delete
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0]))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 拼团开始时间为(%s), 删除成功!'
                        % json.loads(item[1]).get('begin_time'))
                elif self.is_recent_time(pintuan_end_time) == 2:
                    # break  # exit the loop
                    # must be pass, not break: goods ids come back unordered
                    pass
                else:  # == 1: inside the update window
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    data['goods_id'] = item[0]
                    # print('------>>>| 爬取到的数据为: ', data)
                    tmp_url = 'https://m.mia.com/instant/groupon/common_list/' + str(
                        item[2]) + '/0/'
                    # print(tmp_url)
                    body = Requests.get_url_body(
                        url=tmp_url,
                        headers=self.headers,
                        had_referer=True,
                        ip_pool_type=self.ip_pool_type)
                    if body == '':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        tmp_data = json_2_dict(json_str=body)
                        # NOTE(review): on a parse failure this only prints and
                        # then continues with tmp_data == {} — confirm intended.
                        if tmp_data == {}:
                            print('json.loads转换body时出错, 此处跳过!')
                        if tmp_data.get('data_list', []) == []:  # page no longer lists anything
                            print('得到的data_list为[]!')
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            tmp_sql_server._delete_table(
                                sql_str=self.delete_sql_str, params=(item[0]))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass
                        else:
                            data_list = [{
                                'goods_id': item_2.get('sku', ''),
                                'sub_title': item_2.get('intro', ''),
                            } for item_2 in tmp_data.get('data_list', [])]
                            # pprint(data_list)
                            pintuan_goods_all_goods_id = [
                                item_1.get('goods_id', '')
                                for item_1 in data_list
                            ]
                            # print(pintuan_goods_all_goods_id)
                            '''
                            蜜芽拼团不对内部下架的进行操作,一律都更新未过期商品
                            (根据pid来进行更新多次研究发现出现商品还在拼团,误删的情况很普遍)
                            '''
                            if item[0] not in pintuan_goods_all_goods_id:  # no longer on the pid page
                                # print('该商品已被下架限时秒杀活动,此处将其删除')
                                # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                                # print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                # pass
                                # update unconditionally (see note above)
                                mia_pintuan.get_goods_data(
                                    goods_id=item[0])
                                goods_data = mia_pintuan.deal_with_data()
                                if goods_data == {}:  # empty parse result -> skip
                                    pass
                                else:
                                    goods_data['goods_id'] = str(item[0])
                                    if goods_data[
                                            'pintuan_time'] == {}:
                                        # no pintuan window -> delisted; set
                                        # both boundaries to "now" so it is
                                        # not written as a live deal
                                        now_time = get_shanghai_time()
                                        goods_data[
                                            'pintuan_begin_time'], goods_data[
                                                'pintuan_end_time'] = (
                                                    now_time, now_time)
                                    else:
                                        goods_data[
                                            'pintuan_begin_time'], goods_data[
                                                'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                    miaosha_time=goods_data[
                                                        'pintuan_time'])
                                    # pprint(goods_data)
                                    # print(goods_data)
                                    mia_pintuan.update_mia_pintuan_table(
                                        data=goods_data,
                                        pipeline=tmp_sql_server)
                                sleep(MIA_SPIKE_SLEEP_TIME)  # slow down
                            else:  # still listed -> refresh from the live item
                                for item_2 in data_list:
                                    if item_2.get('goods_id',
                                                  '') == item[0]:
                                        mia_pintuan.get_goods_data(
                                            goods_id=item[0])
                                        goods_data = mia_pintuan.deal_with_data(
                                        )
                                        if goods_data == {}:  # empty parse result -> skip
                                            pass
                                        else:
                                            goods_data['goods_id'] = str(
                                                item[0])
                                            goods_data[
                                                'sub_title'] = item_2.get(
                                                    'sub_title', '')
                                            if goods_data[
                                                    'pintuan_time'] == {}:
                                                # no pintuan window -> delisted
                                                now_time = get_shanghai_time(
                                                )
                                                goods_data[
                                                    'pintuan_begin_time'], goods_data[
                                                        'pintuan_end_time'] = (
                                                            now_time,
                                                            now_time)
                                            else:
                                                goods_data[
                                                    'pintuan_begin_time'], goods_data[
                                                        'pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                                            miaosha_time=
                                                            goods_data[
                                                                'pintuan_time']
                                                        )
                                            # pprint(goods_data)
                                            # print(goods_data)
                                            mia_pintuan.update_mia_pintuan_table(
                                                data=goods_data,
                                                pipeline=tmp_sql_server)
                                        sleep(MIA_SPIKE_SLEEP_TIME
                                              )  # slow down
                                    else:
                                        pass
            else:  # pipeline returned empty / connection lost
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updating after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def run_forever(self):
    '''
    Real-time update pass over the mia flash-sale table.
    :return:
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=mia_delete_str_4)
        result = list(tmp_sql_server._select_table(sql_str=mia_select_str_3))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        for item in result:  # refresh each db row
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(str(time.mktime(time.strptime(miaosha_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            data = {}
            # memory optimisation: construct per row and delete after use
            mia_miaosha = MiaParse()
            if index % 50 == 0:  # reconnect every 50 rows to avoid a stale long-lived connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:  # expired -> delete
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                    print('过期的goods_id为(%s)' % item[0], ', 限时秒杀开始时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_begin_time'))
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # break  # exit the loop
                    # must be pass, not break: goods ids come back unordered
                    pass
                else:  # == 1: inside the update window
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    data['goods_id'] = item[0]
                    # print('------>>>| 爬取到的数据为: ', data)
                    tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(item[2])
                    body = Requests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True, ip_pool_type=self.ip_pool_type)
                    # print(body)
                    if body == '' or body == '[]':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            tmp_data = json.loads(body)
                        except:
                            tmp_data = {}
                            print('json.loads转换body时出错, 此处跳过!')
                        # NOTE(review): after a parse failure tmp_data == {},
                        # so start_time/end_time default to '' and the
                        # strptime calls below raise ValueError — confirm
                        # whether a caller catches this.
                        begin_time = tmp_data.get('p_info', {}).get('start_time', '')
                        end_time = tmp_data.get('p_info', {}).get('end_time', '')
                        # convert the str form to unix timestamps
                        begin_time = int(time.mktime(time.strptime(begin_time, '%Y/%m/%d %H:%M:%S')))
                        end_time = int(time.mktime(time.strptime(end_time, '%Y/%m/%d %H:%M:%S')))
                        item_list = tmp_data.get('item_list', [])
                        # every goods_id currently under this pid
                        miaosha_goods_all_goods_id = [item_1.get('item_id', '') for item_1 in item_list]
                        if item[0] not in miaosha_goods_all_goods_id:  # delisted upstream
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass
                        else:  # still listed -> refresh from the live item
                            for item_2 in item_list:
                                if item_2.get('item_id', '') == item[0]:
                                    mia_miaosha.get_goods_data(goods_id=item[0])
                                    goods_data = mia_miaosha.deal_with_data()
                                    if goods_data == {}:  # empty parse result -> skip
                                        pass
                                    else:
                                        goods_data['goods_id'] = str(item[0])
                                        goods_data['price'] = item_2.get('active_price')
                                        goods_data['taobao_price'] = item_2.get('active_price')
                                        goods_data['sub_title'] = item_2.get('short_info', '')
                                        goods_data['miaosha_time'] = {
                                            'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                                            'miaosha_end_time': timestamp_to_regulartime(end_time),
                                        }
                                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
                                        # pprint(goods_data)
                                        # print(goods_data)
                                        mia_miaosha.update_mia_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                                    sleep(MIA_SPIKE_SLEEP_TIME)  # slow down
                                else:
                                    pass
            else:  # pipeline returned empty / connection lost
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updating after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
async def _deal_with_all_goods_id(self):
    '''
    Fetch and store the goods info of every detailed category.
    :return: None
    '''
    _data = await self._get_all_goods_list()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    index = 1
    if my_pipeline.is_connect_success:
        self.my_lg.info('正在获取淘抢购db原有goods_id, 请耐心等待...')
        sql_str = r'select goods_id from dbo.tao_qianggou_xianshimiaosha where site_id=28'
        db_ = list(my_pipeline._select_table(sql_str=sql_str))
        db_all_goods_id = [item[0] for item in db_]
        self.my_lg.info('获取完毕!!!')
        # self.my_lg.info(str(db_all_goods_id))
        for item in _data:
            miaosha_goods_list = await self._get_taoqianggou_goods_list(data=item.get('data', []))
            # self.my_lg.info(str(miaosha_goods_list))
            # pprint(miaosha_goods_list)
            for tmp_item in miaosha_goods_list:
                if tmp_item.get('goods_id', '') in db_all_goods_id:
                    # already stored in the db -> skip
                    self.my_lg.info('该goods_id[%s]已存在db中' % tmp_item.get('goods_id', ''))
                    continue
                # NOTE(review): `index` is never incremented in this visible
                # code, so this reconnect branch can never trigger — confirm
                # whether an `index += 1` was lost.
                if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    # my_pipeline = SqlPools()
                    self.my_lg.info('与数据库的新连接成功建立...')
                if my_pipeline.is_connect_success:
                    tmall = TmallParse(logger=self.my_lg)
                    tmp_url = 'https://detail.tmall.com/item.htm?id={0}'.format(tmp_item.get('goods_id'))
                    goods_id = tmall.get_goods_id_from_url(tmp_url)
                    tmall.get_goods_data(goods_id=goods_id)
                    goods_data = tmall.deal_with_data()
                    if goods_data != {}:  # enrich and insert
                        # self.my_lg.info(str(tmp_item))
                        goods_data['goods_id'] = tmp_item.get('goods_id')
                        goods_data['spider_url'] = tmp_url
                        goods_data['miaosha_time'] = tmp_item.get('miaosha_time')
                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=tmp_item.get('miaosha_time'))
                        goods_data['page'] = tmp_item.get('page')
                        goods_data['spider_time'] = tmp_item.get('spider_time')
                        tmall.insert_into_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                        await asyncio.sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                    else:  # empty parse result -> back off
                        await asyncio.sleep(5)
                    try:
                        del tmall
                    except:
                        pass
                    gc.collect()
def run_forever(self):
    '''
    Real-time update pass over the jumei flash-sale table.
    :return:
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=jm_delete_str_2)
        result = list(tmp_sql_server._select_table(sql_str=jm_select_str_1))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None
    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        # acquire session cookies via phantomjs before any request
        my_phantomjs = BaseDriver(executable_path=PHANTOMJS_DRIVER_PATH, ip_pool_type=self.ip_pool_type)
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(url='https://h5.jumei.com/')
        try:
            del my_phantomjs
        except:
            pass
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False
        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)
        for item in result:  # refresh each db row
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            miaosha_end_time = int(str(time.mktime(time.strptime(miaosha_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            data = {}
            # memory optimisation: construct per row and delete after use
            jumeiyoupin_miaosha = JuMeiYouPinParse()
            if index % 50 == 0:  # reconnect every 50 rows to avoid a stale long-lived connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')
            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:  # expired -> delete
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                    print('过期的goods_id为(%s)' % item[0], ', 限时秒杀结束时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_end_time'))
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # break  # exit the loop
                    # must be pass, not break: goods ids come back unordered
                    pass
                else:  # == 1: inside the update window
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    data['goods_id'] = item[0]
                    this_page_all_goods_list = self.get_one_page_all_goods_list(item[2])
                    # NOTE(review): `continue` here also skips the index += 1
                    # below, so network errors do not advance the counter.
                    if this_page_all_goods_list == '网络错误!':
                        print('网络错误!先跳过')
                        continue
                    elif this_page_all_goods_list == []:  # page no longer lists anything
                        print('#### 该page对应得到的this_page_all_goods_list为空[]!')
                        print('** 该商品已被下架限时秒杀活动, 此处将其删除')
                        tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        pass
                    else:
                        """
                        由于不会内部提前下架,所以在售卖时间内的全部进行相关更新
                        """
                        # miaosha_goods_all_goods_id = [item_1.get('goods_id', '') for item_1 in this_page_all_goods_list]
                        #
                        # if item[0] not in miaosha_goods_all_goods_id:  # 内部已经下架的
                        #     print('该商品已被下架限时秒杀活动,此处将其删除')
                        #     tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                        #     print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        #     pass
                        #
                        # else:  # 未下架的
                        tmp_r = jumeiyoupin_miaosha.get_goods_id_from_url(item[3])
                        jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                        goods_data = jumeiyoupin_miaosha.deal_with_data()
                        if goods_data == {}:  # empty parse result -> skip
                            pass
                        else:
                            goods_data['goods_id'] = str(item[0])
                            goods_data['miaosha_time'] = {
                                'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
                                'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
                            }
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
                            # print(goods_data)
                            jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                        sleep(JUMEIYOUPIN_SLEEP_TIME)
            else:  # pipeline returned empty / connection lost
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updating after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def run_forever(self):
    '''
    Real-time updater for juanpi flash-sale goods.

    Only goods whose sale window falls within "today / yesterday / next 14
    hours" are refreshed; future-dated goods (still at original price) are
    left untouched.

    :return: None
    '''
    #### real-time update of the data
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=jp_delete_str_4, params=None)
        result = list(tmp_sql_server._select_table(sql_str=jp_select_str_4))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        # Declared once outside the loop (and periodically re-created below)
        # to keep memory usage down.
        juanpi_miaosha = JuanPiParse()
        for item in result:  # refresh each stored goods record
            miaosha_begin_time = json.loads(item[1]).get('miaosha_begin_time')
            # '%Y-%m-%d %H:%M:%S' string -> 10-digit unix timestamp
            miaosha_begin_time = int(str(time.mktime(time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_begin_time)
            if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_begin_time) == 0:
                    # Expired: hard-delete the row.
                    # NOTE: params must be a 1-tuple, not a bare value
                    # (consistent with the (item[0],) usage elsewhere).
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],), lock_timeout=2000)
                    print('过期的goods_id为(%s)' % item[0], ', 限时秒杀开始时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_begin_time'))
                elif self.is_recent_time(miaosha_begin_time) == 2:
                    # Future-dated: skip.  Must be `pass`, not `break`,
                    # because rows come back in no particular order.
                    pass
                else:  # is_recent_time == 1: inside the update window
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                        str(item[2]),
                        str(item[3]),
                    )
                    # print('待爬取的tab_id, page地址为: ', tmp_url)
                    data = Requests.get_url_body(url=tmp_url, headers=self.headers, ip_pool_type=self.ip_pool_type)
                    if data == '':
                        break
                    try:
                        data = json.loads(data)
                        data = data.get('data', {})
                        # print(data)
                    except Exception:
                        break

                    if data.get('goodslist') == []:
                        print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(item[2], item[3]))
                        pass
                    else:
                        data = data.get('goodslist', [])
                        # print(data)
                        if data == []:
                            print('goodslist为[], 此处跳过')
                            pass
                        else:
                            miaosha_goods_list = self.get_miaoshao_goods_info_list(data=data)
                            # print(miaosha_goods_list)
                            # all goods_id currently on this tab_id/page
                            miaosha_goods_all_goods_id = [i.get('goods_id') for i in miaosha_goods_list]
                            # print(miaosha_goods_all_goods_id)
                            if item[0] not in miaosha_goods_all_goods_id:
                                # Delisted internally: the goods_id is gone
                                # from this tab_id/page, so delete the row.
                                tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                                print('该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' % item[0])
                                pass
                            else:  # still listed: refresh its record
                                for item_1 in miaosha_goods_list:
                                    if item_1.get('goods_id', '') == item[0]:
                                        juanpi_miaosha.get_goods_data(goods_id=item[0])
                                        goods_data = juanpi_miaosha.deal_with_data()
                                        if goods_data == {}:  # empty parse result -> skip
                                            pass
                                        else:  # parse ok -> update the table
                                            goods_data['stock_info'] = item_1.get('stock_info')
                                            goods_data['goods_id'] = item_1.get('goods_id')
                                            # goods_data['username'] = '******'
                                            if item_1.get('stock_info').get('activity_stock') > 0:
                                                goods_data['price'] = item_1.get('price')                # original special price before the sale
                                                goods_data['taobao_price'] = item_1.get('taobao_price')  # flash-sale price
                                            else:
                                                pass
                                            goods_data['sub_title'] = item_1.get('sub_title', '')
                                            goods_data['miaosha_time'] = item_1.get('miaosha_time')
                                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item_1.get('miaosha_time'))
                                            juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                                            sleep(.3)  # throttle
                                    else:
                                        pass

                if index % 10 == 0:
                    # Re-create the parser periodically: keeps memory flat
                    # without paying the construction cost on every item.
                    juanpi_miaosha = JuanPiParse()
                    gc.collect()
                index += 1
                gc.collect()
            else:  # DB connection unavailable
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
        print('全部数据更新完毕'.center(100, '#'))

    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updates after midnight
        sleep(60 * 60 * 5.5)
    else:
        # sleep(5)
        pass
    gc.collect()
def run_forever(self):
    '''
    Real-time updater for mogujie group-buy (pintuan) goods.

    Reads candidate rows from the DB, re-scrapes each goods page via
    PhantomJS, and either refreshes or deletes the stored record.

    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=mg_delete_str_2)
        result = list(tmp_sql_server._select_table(sql_str=mg_select_str_2))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)
        for item in result:  # refresh each stored goods record
            pintuan_end_time = json.loads(item[1]).get('end_time')
            # '%Y-%m-%d %H:%M:%S' string -> 10-digit unix timestamp
            pintuan_end_time = int(str(time.mktime(time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            data = {}
            mogujie_pintuan = MoGuJieParse()
            if index % 8 == 0:
                # PhantomJS leaks memory over time: tear it down and
                # restart it every 8 items.
                try:
                    del self.my_phantomjs
                except Exception:
                    pass
                gc.collect()
                self.my_phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH)

            if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(pintuan_end_time) == 0:
                    # Expired: hard-delete the row.
                    # NOTE: params must be a 1-tuple, not a bare value
                    # (consistent with the (item[0],) usage elsewhere).
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                    print('过期的goods_id为(%s)' % item[0], ', 拼团开始时间为(%s), 删除成功!' % json.loads(item[1]).get('begin_time'))
                elif self.is_recent_time(pintuan_end_time) == 2:
                    # Future-dated: skip.  Must be `pass`, not `break`,
                    # because rows come back in no particular order.
                    pass
                else:  # is_recent_time == 1: inside the update window
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    data['goods_id'] = item[0]
                    tmp_url = 'http://list.mogujie.com/search?page={0}&fcid={1}&algoKey=pc_tuan_book_pop&cKey=pc-tuan'.format(item[3], item[2])
                    # print(tmp_url)
                    # Plain requests fails here (certificate pinning), so
                    # fetch the listing page through PhantomJS instead.
                    # body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
                    body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
                    # print(body)
                    if body == '':
                        print('获取到的body为空值! 此处跳过')
                    else:
                        try:
                            # the JSON payload is embedded in a <pre> element
                            body = re.compile(r'<pre.*?>(.*?)</pre>').findall(body)[0]
                            tmp_data = json.loads(body)
                            # pprint(tmp_data)
                        except Exception:
                            print('json.loads转换body时出错, 请检查')
                            tmp_data = {}

                        if tmp_data.get('result', {}).get('wall', {}).get('docs', []) == []:
                            # listing no longer contains anything -> delisted
                            print('得到的docs为[]!')
                            print('该商品已被下架限时秒杀活动,此处将其删除')
                            tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass
                        else:
                            tmp_item_list = tmp_data.get('result', {}).get('wall', {}).get('docs', [])
                            # pprint(tmp_item_list)
                            begin_time_timestamp = int(time.time())  # timestamp at which the group-buy starts
                            item_list = [{
                                'goods_id': item.get('tradeItemId', ''),
                                'pintuan_time': {
                                    'begin_time': timestamp_to_regulartime(timestamp=begin_time_timestamp),
                                    'end_time': timestamp_to_regulartime(self.get_pintuan_end_time(begin_time_timestamp, item.get('leftTimeOrg', ''))),
                                },
                                'all_sell_count': str(item.get('salesVolume', 0)),
                            } for item in tmp_item_list]
                            # pprint(item_list)
                            pintuan_goods_all_goods_id = [item_1.get('goods_id', '') for item_1 in item_list]
                            # print(pintuan_goods_all_goods_id)
                            # "Internally delisted" goods are in fact still on
                            # sale, so refresh their goods data but leave the
                            # stored on/off-shelf times untouched.
                            if item[0] not in pintuan_goods_all_goods_id:
                                # print('该商品已被下架限时秒杀活动,此处将其删除')
                                # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                                # print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                                # pass
                                mogujie_pintuan.get_goods_data(goods_id=item[0])
                                goods_data = mogujie_pintuan.deal_with_data()
                                if goods_data == {}:
                                    pass
                                else:
                                    # normalize before writing back
                                    print('+++ 内部下架,其实还在售卖的商品更新')
                                    goods_data['goods_id'] = item[0]
                                    goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(goods_data['price_info_list'])
                                    # pprint(goods_data)
                                    mogujie_pintuan.update_mogujie_pintuan_table_2(data=goods_data, pipeline=tmp_sql_server)
                                    sleep(MOGUJIE_SLEEP_TIME)  # throttle
                            else:  # still listed: full refresh including times
                                for item_2 in item_list:
                                    if item_2.get('goods_id', '') == item[0]:
                                        mogujie_pintuan.get_goods_data(goods_id=item[0])
                                        goods_data = mogujie_pintuan.deal_with_data()
                                        if goods_data == {}:
                                            pass
                                        else:
                                            # normalize before writing back
                                            goods_data['goods_id'] = item[0]
                                            goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(goods_data['price_info_list'])
                                            goods_data['pintuan_time'] = item_2.get('pintuan_time', {})
                                            goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['pintuan_time'])
                                            goods_data['all_sell_count'] = item_2.get('all_sell_count', '')
                                            # pprint(goods_data)
                                            mogujie_pintuan.update_mogujie_pintuan_table(data=goods_data, pipeline=tmp_sql_server)
                                            sleep(MOGUJIE_SLEEP_TIME)  # throttle
                                    else:
                                        pass
            else:
                print('数据库连接失败,此处跳过!')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))

    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updates after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def _update_old_goods_info(self, tmp_sql_server, result):
    '''
    Refresh the zhe800 flash-sale goods already stored in the DB.

    :param tmp_sql_server: DB pipeline; re-created every 50 items
    :param result: rows of (goods_id, miaosha_time_json, session_id)
    :return: None
    '''
    index = 1
    for item in result:  # refresh each stored goods record
        miaosha_begin_time = json.loads(item[1]).get('miaosha_begin_time')
        # '%Y-%m-%d %H:%M:%S' string -> 10-digit unix timestamp
        miaosha_begin_time = int(str(time.mktime(time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))[0:10])
        # self.my_lg.info(str(miaosha_begin_time))
        data = {}
        # Parser re-created per item to keep memory usage down.
        zhe_800_miaosha = Zhe800Parse()
        if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
            print('正在重置,并与数据库建立新连接中...')
            tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
            print('与数据库的新连接成功建立...')

        if tmp_sql_server.is_connect_success:
            if self.is_recent_time(miaosha_begin_time) == 0:
                # expired -> hard-delete the row
                tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                self.my_lg.info('过期的goods_id为({0}), 限时秒杀开始时间为({1}), 删除成功!'.format(item[0], json.loads(item[1]).get('miaosha_begin_time')))
            elif self.is_recent_time(miaosha_begin_time) == 2:  # may also include expired ones
                self.my_lg.info('未来时间暂时不更新! {}'.format(timestamp_to_regulartime(miaosha_begin_time)))
                # Must be `pass`, not `break`: rows come back unordered.
                pass
            else:  # is_recent_time == 1: inside the update window
                print('------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(item[0], index))
                data['goods_id'] = item[0]
                try:
                    tmp_data = self.zhe_800_spike._get_one_session_id_data(base_session_id=str(item[2]))
                except Exception:
                    self.my_lg.error(msg='', exc_info=True)
                    continue

                if tmp_data.get('data', {}).get('blocks', []) == []:  # session_id no longer exists
                    self.my_lg.info('该session_id不存在,此处跳过')
                    pass
                else:
                    tmp_data = [item_s.get('deal', {}) for item_s in tmp_data.get('data', {}).get('blocks', [])]
                    # pprint(tmp_data)
                    if tmp_data != []:  # session still carries deals
                        try:
                            miaosha_goods_list = self.get_miaoshao_goods_info_list(data=tmp_data)
                        except ValueError:
                            sleep(2)
                            continue
                        # pprint(miaosha_goods_list)
                        # all zids currently listed under this session_id
                        miaosha_goods_all_goods_id = [i.get('zid') for i in miaosha_goods_list]
                        if item[0] not in miaosha_goods_all_goods_id:
                            # Delisted internally -> soft delete (is_delete flag)
                            # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0],))
                            self._update_is_delete(tmp_sql_server=tmp_sql_server, goods_id=item[0])
                            self.my_lg.info('该商品已被官方下架限秒活动! 下架的goods_id为({0}), 逻辑删除成功!'.format(item[0]))
                            pass
                        else:  # still listed: refresh the record
                            for item_1 in miaosha_goods_list:
                                if item_1.get('zid', '') == item[0]:
                                    zhe_800_miaosha.get_goods_data(goods_id=item[0])
                                    goods_data = zhe_800_miaosha.deal_with_data()
                                    if goods_data == {}:  # empty parse result -> skip
                                        pass
                                    else:  # parse ok -> update the table
                                        goods_data['stock_info'] = item_1.get('stock_info')
                                        goods_data['goods_id'] = str(item_1.get('zid'))
                                        # goods_data['username'] = '******'
                                        if item_1.get('stock_info').get('activity_stock') > 0:
                                            goods_data['price'] = item_1.get('price')
                                            goods_data['taobao_price'] = item_1.get('taobao_price')
                                        else:
                                            pass
                                        goods_data['sub_title'] = item_1.get('sub_title')
                                        goods_data['miaosha_time'] = item_1.get('miaosha_time')
                                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                            miaosha_time=item_1.get('miaosha_time'))
                                        if goods_data.get('is_delete', 0) == 1:
                                            self.my_lg.info('该商品[{0}]已售罄...'.format(item[0]))
                                        # self.my_lg.info(str(goods_data['stock_info']))
                                        # self.my_lg.info(str(goods_data['miaosha_time']))
                                        zhe_800_miaosha.to_update_zhe_800_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                                else:
                                    pass
                    else:
                        # Session carries no 'jsons' payload at all -> soft
                        # delete every goods tied to this session_id.
                        self._update_is_delete(tmp_sql_server=tmp_sql_server, goods_id=item[0])
                        self.my_lg.info('该sessionid没有相关key为jsons的数据! 过期的goods_id为({0}), 限时秒杀开始时间为({1}), 删除成功!'.format(item[0], json.loads(item[1]).get('miaosha_begin_time')))
                        pass
        else:  # DB connection unavailable
            self.my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
            pass
        index += 1
        # try:
        #     del tmall
        # except:
        #     pass
        sleep(1.2)  # throttle
        gc.collect()
    self.my_lg.info('全部数据更新完毕'.center(100, '#'))
    # sleep(60*60)
    gc.collect()
    return
def get_spike_hour_goods_info(self):
    '''
    Walk session_ids (stepping by 2) and insert every recent zhe800
    flash-sale goods found, skipping goods_ids already in the DB.

    :return: None
    '''
    base_session_id = BASE_SESSION_ID
    while base_session_id < MAX_SESSION_ID:
        print('待抓取的session_id为: ', base_session_id)
        data = self._get_one_session_id_data(base_session_id=base_session_id)
        sleep(.3)  # throttle
        if data.get('data', {}).get('blocks', []) == []:  # session_id does not exist
            pass
        else:  # session_id exists
            try:
                # begin_time of the first deal, truncated to a 10-digit
                # timestamp string
                _ = str(data.get('data', {}).get('blocks', [])[0].get('deal', {}).get('begin_time', ''))[:10]
                if _ != '':
                    pass
                elif data.get('data', {}).get('blocks', [])[0].get('showcase', {}) != {}:
                    # first block is a showcase (future session): fall back
                    # to the second block's deal
                    print('*** 未来时间 ***')
                    # pprint(data.get('data', {}))
                    _ = str(data.get('data', {}).get('blocks', [])[1].get('deal', {}).get('begin_time', ''))[:10]
                else:
                    raise Exception
                # "2017-09-28 10:00:00"-style string reduced to an int timestamp
                begin_times_timestamp = int(_)
            except Exception as e:
                print('遇到严重错误: ', e)
                base_session_id += 2
                continue

            print('秒杀时间为: ', timestamp_to_regulartime(begin_times_timestamp))
            if self.is_recent_time(timestamp=begin_times_timestamp):  # sale date is in range
                try:
                    data = [item_s.get('deal', {}) for item_s in data.get('data', {}).get('blocks', [])]
                except Exception as e:
                    print('遇到严重错误: ', e)
                    base_session_id += 2
                    continue
                # pprint(data)

                if data != []:  # session carries deals
                    miaosha_goods_list = self.get_miaoshao_goods_info_list(data=data)
                    # pprint(miaosha_goods_list)
                    zhe_800 = Zhe800Parse()
                    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    if my_pipeline.is_connect_success:
                        sql_str = 'select goods_id, miaosha_time, session_id from dbo.zhe_800_xianshimiaosha where site_id=14'
                        db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=sql_str))]
                        for item in miaosha_goods_list:
                            if item.get('zid', '') in db_goods_id_list:  # dedupe against DB
                                print('该goods_id已经存在于数据库中, 此处跳过')
                                pass
                            else:
                                tmp_url = 'https://shop.zhe800.com/products/' + str(item.get('zid', ''))
                                goods_id = zhe_800.get_goods_id_from_url(tmp_url)
                                zhe_800.get_goods_data(goods_id=goods_id)
                                goods_data = zhe_800.deal_with_data()
                                if goods_data == {}:  # empty parse result -> skip
                                    pass
                                else:  # parse ok -> insert
                                    goods_data['stock_info'] = item.get('stock_info')
                                    goods_data['goods_id'] = str(item.get('zid'))
                                    goods_data['spider_url'] = tmp_url
                                    goods_data['username'] = '******'
                                    goods_data['price'] = item.get('price')
                                    goods_data['taobao_price'] = item.get('taobao_price')
                                    goods_data['sub_title'] = item.get('sub_title')
                                    # goods_data['is_baoyou'] = item.get('is_baoyou')
                                    goods_data['miaosha_time'] = item.get('miaosha_time')
                                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                        miaosha_time=item.get('miaosha_time'))
                                    goods_data['session_id'] = str(base_session_id)
                                    # print(goods_data['miaosha_time'])
                                    # print(goods_data)
                                    zhe_800.insert_into_zhe_800_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                                    sleep(ZHE_800_SPIKE_SLEEP_TIME)  # throttle
                                    # sleep(2)
                    else:
                        pass
                    try:
                        del zhe_800
                    except:
                        pass
                    gc.collect()
                else:  # session carries no 'jsons' payload
                    print('该sessionid没有相关key为jsons的数据')
                    # return {}
                    pass
            else:
                pass
        base_session_id += 2
def run_forever(self):
    '''
    Real-time updater for mogujie flash-sale goods already in the DB.

    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        tmp_sql_server._delete_table(sql_str=mg_delete_str_4)
        result = list(tmp_sql_server._select_table(sql_str=mg_select_str_3))
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is None:
        pass
    else:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        for item in result:  # refresh each stored goods record
            miaosha_end_time = json.loads(item[1]).get('miaosha_end_time')
            # '%Y-%m-%d %H:%M:%S' string -> 10-digit unix timestamp
            miaosha_end_time = int(str(time.mktime(time.strptime(miaosha_end_time, '%Y-%m-%d %H:%M:%S')))[0:10])
            # print(miaosha_end_time)
            data = {}
            # Parser re-created per item to keep memory usage down.
            mogujie_miaosha = MoGuJieMiaoShaParse()
            if index % 50 == 0:  # reconnect every 50 items to avoid a stale long-lived connection
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                if self.is_recent_time(miaosha_end_time) == 0:
                    # expired -> hard-delete the row
                    tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0], ))
                    print('过期的goods_id为(%s)' % item[0], ', 限时秒杀开始时间为(%s), 删除成功!' % json.loads(item[1]).get('miaosha_begin_time'))
                elif self.is_recent_time(miaosha_end_time) == 2:
                    # Future-dated: skip.  Must be `pass`, not `break`,
                    # because rows come back in no particular order.
                    pass
                else:  # is_recent_time == 1: inside the update window
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    data['goods_id'] = item[0]
                    item_list = self.get_item_list(event_time=str(item[2]))
                    if item_list == '':  # probably a network hiccup: skip for now
                        pass
                    elif item_list == []:
                        # event no longer lists anything -> soft delete
                        print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                        # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                        tmp_sql_server._update_table(sql_str=mg_update_str_1, params=(item[0], ))
                        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                        pass
                    else:
                        # all goods_ids currently listed for this event_time
                        miaosha_goods_all_goods_id = [item_1.get('iid', '') for item_1 in item_list]
                        if item[0] not in miaosha_goods_all_goods_id:
                            # delisted internally -> soft delete
                            print('该商品已被下架限时秒杀活动,此处将其逻辑删除')
                            # tmp_sql_server._delete_table(sql_str=self.delete_sql_str, params=(item[0]))
                            tmp_sql_server._update_table(sql_str=mg_update_str_1, params=(item[0], ))
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
                            pass
                        else:  # still listed: refresh the record
                            for item_2 in item_list:
                                if item_2.get('iid', '') == item[0]:
                                    spider_url = item[3]
                                    mogujie_miaosha.get_goods_data(goods_id=spider_url)
                                    goods_data = mogujie_miaosha.deal_with_data()
                                    if goods_data == {}:  # empty parse result -> skip
                                        pass
                                    else:
                                        goods_data['goods_id'] = str(item[0])
                                        # price holds the original (highest
                                        # normal) price, rounded to 2 dp
                                        try:
                                            tmp_price_list = sorted([
                                                round(float(item_4.get('normal_price', '')), 2)
                                                for item_4 in goods_data['price_info_list']
                                            ])
                                            price = Decimal(tmp_price_list[-1]).__round__(2)  # original goods price
                                            goods_data['price'] = price
                                        except:
                                            print('设置price为原价时出错!请检查')
                                            continue
                                        goods_data['miaosha_time'] = {
                                            'miaosha_begin_time': timestamp_to_regulartime(int(item_2.get('startTime', 0))),
                                            'miaosha_end_time': timestamp_to_regulartime(int(item_2.get('endTime', 0))),
                                        }
                                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                                            miaosha_time=goods_data['miaosha_time'])
                                        # print(goods_data['title'])
                                        # pprint(goods_data)
                                        # print(goods_data)
                                        mogujie_miaosha.update_mogujie_xianshimiaosha_table(data=goods_data, pipeline=tmp_sql_server)
                                        sleep(MOGUJIE_SLEEP_TIME)  # throttle
                                else:
                                    pass
            else:  # DB connection unavailable
                print('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))

    # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updates after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()
def deal_with_data(self, *params):
    '''
    Parse and store mogujie group-buy (pintuan) goods.

    :param params: params[0] is the goods_list of candidate dicts
    :return: None
    '''
    goods_list = params[0]
    mogujie = MoGuJieParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        _ = list(my_pipeline._select_table(sql_str=mg_select_str_1))
        db_goods_id_list = [item[0] for item in _]
        print(db_goods_id_list)
        for item in goods_list:
            if item.get('goods_id', '') in db_goods_id_list:  # dedupe against DB
                print('该goods_id已经存在于数据库中, 此处跳过')
                pass
            else:
                goods_id = str(item.get('goods_id', ''))
                tmp_url = 'https://shop.mogujie.com/detail/' + str(goods_id)
                mogujie.get_goods_data(goods_id=str(goods_id))
                goods_data = mogujie.deal_with_data()
                if goods_data == {}:  # empty parse result -> skip
                    pass
                else:  # parse ok -> insert
                    # normalize before writing back
                    goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(goods_data['price_info_list'])
                    goods_data['goods_url'] = tmp_url
                    goods_data['goods_id'] = str(goods_id)
                    goods_data['pintuan_time'] = item.get('pintuan_time', {})
                    goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=item.get('pintuan_time', {}))
                    goods_data['all_sell_count'] = item.get('all_sell_count', '')
                    goods_data['fcid'] = str(item.get('fcid'))
                    goods_data['page'] = str(item.get('page'))
                    goods_data['sort'] = str(item.get('sort', ''))
                    # pprint(goods_data)
                    # print(goods_data)
                    _r = mogujie.insert_into_mogujie_pintuan_table(data=goods_data, pipeline=my_pipeline)
                    if _r:  # inserted -> remember the id to avoid re-inserting
                        if goods_id not in db_goods_id_list:
                            db_goods_id_list.append(goods_id)
                    sleep(MOGUJIE_SLEEP_TIME)  # throttle
    else:
        print('数据库连接失败,此处跳过!')
        pass
    try:
        del mogujie
    except:
        pass
    gc.collect()
def deal_with_data(self, *param):
    '''
    Parse and store mia flash-sale goods.

    :param param: (pid, begin_time_str, end_time_str, item_list) where the
                  time strings use '%Y/%m/%d %H:%M:%S'
    :return: None
    '''
    pid = param[0]
    # '%Y/%m/%d %H:%M:%S' strings -> unix timestamps
    begin_time = int(time.mktime(time.strptime(param[1], '%Y/%m/%d %H:%M:%S')))
    end_time = int(time.mktime(time.strptime(param[2], '%Y/%m/%d %H:%M:%S')))
    item_list = param[3]
    mia = MiaParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=mia_select_str_4))]
        # print(db_goods_id_list)
        for item in item_list:
            if item.get('item_id', '') in db_goods_id_list:  # dedupe against DB
                print('该goods_id已经存在于数据库中, 此处跳过')
                pass
            else:
                goods_id = str(item.get('item_id', ''))
                tmp_url = 'https://www.mia.com/item-' + str(goods_id) + '.html'
                mia.get_goods_data(goods_id=str(goods_id))
                goods_data = mia.deal_with_data()
                if goods_data == {}:  # empty parse result -> skip
                    pass
                else:  # parse ok -> insert
                    goods_url = goods_data['goods_url']
                    # BUGFIX: findall() returns a list, so the original
                    # `!= ''` comparison was always True and every goods got
                    # the .hk URL. Test list truthiness instead (and escape
                    # the dots in the pattern).
                    if re.compile(r'://m\.miyabaobei\.hk/').findall(goods_url) != []:
                        goods_url = 'https://www.miyabaobei.hk/item-' + str(goods_id) + '.html'
                    else:
                        goods_url = 'https://www.mia.com/item-' + str(goods_id) + '.html'
                    goods_data['goods_url'] = goods_url
                    goods_data['goods_id'] = str(goods_id)
                    goods_data['price'] = item.get('active_price')
                    goods_data['taobao_price'] = item.get('active_price')  # lowest flash-sale price
                    goods_data['sub_title'] = item.get('short_info', '')
                    goods_data['miaosha_time'] = {
                        'miaosha_begin_time': timestamp_to_regulartime(begin_time),
                        'miaosha_end_time': timestamp_to_regulartime(end_time),
                    }
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
                    goods_data['pid'] = str(pid)
                    # pprint(goods_data)
                    # print(goods_data)
                    mia.insert_into_mia_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                    sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
    else:
        print('数据库连接失败,此处跳过!')
        pass
    try:
        del mia
    except Exception:
        pass
    gc.collect()
def deal_with_data(self, *param):
    '''
    Parse and store mogujie flash-sale goods for one event.

    :param param: (event_time, item_list) — event_time is a unix timestamp
    :return: None
    '''
    print(60 * '*')
    event_time = param[0]
    item_list = param[1]
    print('秒杀开始时间:', timestamp_to_regulartime(event_time), '\t', '对应时间戳为: ', event_time)
    print(60 * '*')

    mogujie = MoGuJieMiaoShaParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        _ = list(my_pipeline._select_table(sql_str=mg_select_str_4))
        db_goods_id_list = [item[0] for item in _]
        for item in item_list:
            goods_id = str(item.get('iid', ''))
            if goods_id in db_goods_id_list:  # dedupe against DB
                print('该goods_id已经存在于数据库中, 此处跳过')
                pass
            else:
                tmp_url = item.get('link', '')
                # print(tmp_url)
                try:
                    # objectId is only present on flash-sale links
                    object_id = re.compile('objectId=(\w+)').findall(tmp_url)[0]
                except IndexError:  # link is not a flash-sale URL
                    print('+++++++ 这个url不是秒杀的url: ', tmp_url)
                    continue
                tmp_url = 'https://shop.mogujie.com/rushdetail/{0}?objectId={1}&type=rush'.format(goods_id, object_id)
                tmp_ = mogujie.get_goods_id_from_url(tmp_url)
                mogujie.get_goods_data(goods_id=tmp_)
                goods_data = mogujie.deal_with_data()
                if goods_data == {}:  # empty parse result -> skip
                    pass
                else:  # parse ok -> insert
                    goods_data['goods_url'] = tmp_url
                    goods_data['goods_id'] = str(goods_id)
                    # price holds the original (highest normal) price,
                    # rounded to 2 dp
                    try:
                        tmp_price_list = sorted([
                            round(float(item_4.get('normal_price', '')), 2)
                            for item_4 in goods_data['price_info_list']
                        ])
                        price = Decimal(tmp_price_list[-1]).__round__(2)  # original goods price
                        goods_data['price'] = price
                    except:
                        print('设置price为原价时出错!请检查')
                        sleep(MOGUJIE_SLEEP_TIME)  # throttle
                        continue
                    goods_data['miaosha_time'] = {
                        'miaosha_begin_time': timestamp_to_regulartime(int(item.get('startTime', 0))),
                        'miaosha_end_time': timestamp_to_regulartime(int(item.get('endTime', 0))),
                    }
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=goods_data['miaosha_time'])
                    goods_data['event_time'] = str(event_time)
                    # pprint(goods_data)
                    # print(goods_data)
                    res = mogujie.insert_into_mogujie_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                    if res:  # inserted -> remember the id to avoid re-inserting
                        if goods_id not in db_goods_id_list:
                            db_goods_id_list.append(goods_id)
                    sleep(MOGUJIE_SLEEP_TIME)  # throttle
    else:
        print('数据库连接失败,此处跳过!')
        pass
    try:
        del mogujie
    except:
        pass
    gc.collect()