def deal_with_data(self, *param):
    '''
    Parse and store the flash-sale goods that belong to one promotion.

    For every entry of the item list whose ``item_id`` is not already in the
    flash-sale table, the goods page is fetched through ``MiaParse``, the
    promotion price/time fields are filled in and the record is inserted.

    :param param: positional values, read by index:
        param[0] - promotion id, stored as ``str(pid)``;
        param[1] - sale start time, a ``'%Y/%m/%d %H:%M:%S'`` string;
        param[2] - sale end time, same format;
        param[3] - iterable of dict-like items; the keys read here are
                   ``item_id``, ``active_price`` and ``short_info``.
    :return: None
    :raises ValueError: from ``time.strptime`` when a time string does not
        match the expected format.
    '''
    pid = param[0]
    time_format = '%Y/%m/%d %H:%M:%S'
    # Convert the time strings into integer timestamps.
    begin_time = int(time.mktime(time.strptime(param[1], time_format)))
    end_time = int(time.mktime(time.strptime(param[2], time_format)))
    item_list = param[3]

    mia = MiaParse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        # Built once as a set so the per-item membership test is O(1).
        db_goods_ids = {
            row[0]
            for row in my_pipeline.select_mia_xianshimiaosha_all_goods_id()
        }
        for item in item_list:
            if item.get('item_id', '') in db_goods_ids:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue

            goods_id = str(item.get('item_id', ''))
            mia.get_goods_data(goods_id=goods_id)
            goods_data = mia.deal_with_data()
            if goods_data == {}:
                # Nothing could be parsed for this goods_id: skip it.
                continue

            # The original test compared the list returned by findall() with
            # '' and was therefore always true, so every item received the
            # miyabaobei.hk URL. Test for the host marker itself instead.
            if '://m.miyabaobei.hk/' in goods_data['goods_url']:
                goods_url = 'https://www.miyabaobei.hk/item-' + goods_id + '.html'
            else:
                goods_url = 'https://www.mia.com/item-' + goods_id + '.html'

            goods_data['goods_url'] = goods_url
            goods_data['goods_id'] = goods_id
            goods_data['price'] = item.get('active_price')
            # Lowest (flash-sale) price.
            goods_data['taobao_price'] = item.get('active_price')
            goods_data['sub_title'] = item.get('short_info', '')
            goods_data['miaosha_time'] = {
                'miaosha_begin_time': self.timestamp_to_regulartime(begin_time),
                'miaosha_end_time': self.timestamp_to_regulartime(end_time),
            }
            (goods_data['miaosha_begin_time'],
             goods_data['miaosha_end_time']) = \
                self.get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data['miaosha_time'])
            goods_data['pid'] = str(pid)

            mia.insert_into_mia_xianshimiaosha_table(
                data=goods_data, pipeline=my_pipeline)
            sleep(MIA_SPIKE_SLEEP_TIME)  # throttle between inserts
    else:
        print('数据库连接失败,此处跳过!')

    # Drop the parser reference before forcing a collection.
    del mia
    gc.collect()
def run_forever(self):
    '''
    Run one refresh pass over the stored flash-sale goods, then pause.

    All rows returned by ``select_mia_xianshimiaosha_all_goods_id()`` are
    processed one by one (see ``_update_one_miaosha_goods``). The database
    pipeline is re-created on every 50th row. Afterwards the method sleeps
    5.5 hours when the Shanghai hour is 0, otherwise 5 seconds.

    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = list(tmp_sql_server.select_mia_xianshimiaosha_all_goods_id())
    except TypeError:
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is not None:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')

        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        for index, item in enumerate(result, start=1):
            if index % 50 == 0:
                # Reconnect every 50 rows so a single long-lived connection
                # does not go unresponsive.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                self._update_one_miaosha_goods(item, index, tmp_sql_server)
            else:
                print('数据库连接失败,数据库可能关闭或者维护中')
            gc.collect()
        print('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # No updates right after midnight.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()

def _update_one_miaosha_goods(self, item, index, pipeline):
    '''
    Delete, skip or refresh a single stored flash-sale row.

    :param item: database row; item[0] is the goods_id, item[1] a JSON string
        holding ``miaosha_begin_time`` / ``miaosha_end_time``, item[2] the
        value appended to the promotion URL.
    :param index: 1-based position of the row, used only in the log line.
    :param pipeline: connected database pipeline used for delete/update.
    :return: None
    '''
    miaosha_time = json.loads(item[1])
    # int() replaces the original int(str(ts)[0:10]) slice: same value for
    # 10-digit timestamps, and it does not break on shorter ones.
    miaosha_end_time = int(time.mktime(time.strptime(
        miaosha_time.get('miaosha_end_time'), '%Y-%m-%d %H:%M:%S')))

    # Evaluate once so both comparisons see the same answer.
    recent_state = self.is_recent_time(miaosha_end_time)
    if recent_state == 0:
        pipeline.delete_mia_miaosha_expired_goods_id(goods_id=item[0])
        print('过期的goods_id为(%s)' % item[0], ', 限时秒杀开始时间为(%s), 删除成功!' % miaosha_time.get('miaosha_begin_time'))
        return
    if recent_state == 2:
        # Only skip this row (the caller keeps iterating): the rows returned
        # by the database are not ordered.
        return

    # Any other value (1) means the row is inside the update window.
    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
    tmp_url = 'https://m.mia.com/instant/seckill/seckillPromotionItem/' + str(item[2])
    body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, had_referer=True)
    if body == '' or body == '[]':
        print('获取到的body为空值! 此处跳过')
        return

    try:
        tmp_data = json.loads(body)
    except (TypeError, ValueError):
        # The original printed this message but carried on with an empty
        # dict, which then crashed in time.strptime(''). Actually skip.
        print('json.loads转换body时出错, 此处跳过!')
        return

    time_format = '%Y/%m/%d %H:%M:%S'
    try:
        p_info = tmp_data.get('p_info', {})
        # Convert the time strings into integer timestamps.
        begin_time = int(time.mktime(time.strptime(p_info.get('start_time', ''), time_format)))
        end_time = int(time.mktime(time.strptime(p_info.get('end_time', ''), time_format)))
    except (AttributeError, TypeError, ValueError):
        # Missing or malformed promotion times must not abort the whole pass.
        print('获取到的秒杀起止时间无效, 此处跳过!')
        return

    item_list = tmp_data.get('item_list', [])
    # Every goods_id currently listed under this promotion.
    miaosha_goods_all_goods_id = [entry.get('item_id', '') for entry in item_list]
    if item[0] not in miaosha_goods_all_goods_id:
        # The goods was removed from the promotion: drop the stored row.
        print('该商品已被下架限时秒杀活动,此处将其删除')
        pipeline.delete_mia_miaosha_expired_goods_id(goods_id=item[0])
        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
        return

    mia_miaosha = MiaParse()
    for item_2 in item_list:
        if item_2.get('item_id', '') != item[0]:
            continue

        mia_miaosha.get_goods_data(goods_id=item[0])
        goods_data = mia_miaosha.deal_with_data()
        if goods_data == {}:
            # Nothing could be parsed for this goods_id: skip it.
            continue

        goods_data['goods_id'] = str(item[0])
        goods_data['price'] = item_2.get('active_price')
        goods_data['taobao_price'] = item_2.get('active_price')
        goods_data['sub_title'] = item_2.get('short_info', '')
        goods_data['miaosha_time'] = {
            'miaosha_begin_time': self.timestamp_to_regulartime(begin_time),
            'miaosha_end_time': self.timestamp_to_regulartime(end_time),
        }
        (goods_data['miaosha_begin_time'],
         goods_data['miaosha_end_time']) = \
            self.get_miaosha_begin_time_and_miaosha_end_time(
                miaosha_time=goods_data['miaosha_time'])

        mia_miaosha.update_mia_xianshimiaosha_table(data=goods_data, pipeline=pipeline)
        sleep(MIA_SPIKE_SLEEP_TIME)  # throttle between updates