def get_spike_hour_goods_info(self):
    '''
    Crawl the candidate flash-sale sessions and store their goods.

    Walks the session ids from BASE_SESSION_ID up to (excluding)
    MAX_SESSION_ID in steps of 2 and hands each one to
    ``_crawl_spike_session``.

    :return: None
    '''
    base_session_id = BASE_SESSION_ID
    while base_session_id < MAX_SESSION_ID:
        print('待抓取的session_id为: ', base_session_id)
        # Every handled failure inside the helper ends in a plain ``return``,
        # so the increment below is always reached.  (Previously a
        # ``continue`` in the error paths skipped it and the same session id
        # was requested again and again.)
        self._crawl_spike_session(base_session_id)
        base_session_id += 2


def _crawl_spike_session(self, session_id):
    '''
    Fetch the deals of one session and insert the goods missing from the DB.

    :param session_id: session id placed into the session_deals API URL
    :return: None
    '''
    tmp_url = 'https://zapi.zhe800.com/zhe800_n_api/xsq/m/session_deals?session_id={0}&page=1&per_page=1000'.format(
        str(session_id)
    )
    body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)

    # The JSON payload is read from inside the <pre> element of the page body.
    body_1 = re.compile(r'<pre.*?>(.*)</pre>').findall(body)
    if body_1 == []:
        print('获取到的data为空!')
        return

    try:
        data = json.loads(body_1[0])
    except ValueError as e:
        # A malformed payload only costs this session, not the whole crawl.
        print('遇到严重错误: ', e)
        return

    blocks = data.get('data', {}).get('blocks', [])
    if blocks == []:
        # No blocks: the session id does not exist.
        print('该session_id不存在,此处跳过')
        return

    try:
        # Keep only the first 10 characters of the first deal's begin_time
        # and use them as an integer timestamp.
        begin_times_timestamp = int(
            str(blocks[0].get('deal', {}).get('begin_time', ''))[:10]
        )
    except Exception as e:
        # Deliberately broad: the crawl is best-effort, skip this session.
        print('遇到严重错误: ', e)
        return

    print('秒杀时间为: ', self.timestamp_to_regulartime(begin_times_timestamp))

    if not self.is_recent_time(timestamp=begin_times_timestamp):
        # The sale date is outside the accepted window.
        return

    try:
        deals = [item_s.get('deal', {}) for item_s in blocks]
    except Exception as e:
        print('遇到严重错误: ', e)
        return

    if deals == []:
        print('该sessionid没有相关key为jsons的数据')
        return

    miaosha_goods_list = self.get_miaoshao_goods_info_list(data=deals)

    zhe_800 = Zhe800Parse()
    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    if my_pipeline.is_connect_success:
        # First column of every returned row is used as the stored goods_id.
        db_goods_id_list = [
            row[0]
            for row in my_pipeline.select_zhe_800_xianshimiaosha_all_goods_id()
        ]

        for item in miaosha_goods_list:
            if item.get('zid', '') in db_goods_id_list:
                print('该goods_id已经存在于数据库中, 此处跳过')
                continue

            tmp_url = 'https://shop.zhe800.com/products/' + str(item.get('zid', ''))
            goods_id = zhe_800.get_goods_id_from_url(tmp_url)
            zhe_800.get_goods_data(goods_id=goods_id)
            goods_data = zhe_800.deal_with_data()
            if goods_data == {}:
                # Nothing was parsed for this goods page; skip it.
                continue

            # Merge the flash-sale fields into the parsed goods data.
            goods_data['stock_info'] = item.get('stock_info')
            goods_data['goods_id'] = str(item.get('zid'))
            goods_data['spider_url'] = tmp_url
            goods_data['username'] = '******'
            goods_data['price'] = item.get('price')
            goods_data['taobao_price'] = item.get('taobao_price')
            goods_data['sub_title'] = item.get('sub_title')
            goods_data['miaosha_time'] = item.get('miaosha_time')
            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                self.get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=item.get('miaosha_time')
                )
            goods_data['session_id'] = str(session_id)

            zhe_800.insert_into_zhe_800_xianshimiaosha_table(
                data=goods_data, pipeline=my_pipeline
            )
            # Slow the crawl down after each insert.
            # NOTE(review): the source formatting was collapsed; the sleep is
            # taken to belong to the insert branch -- confirm.
            sleep(ZHE_800_SPIKE_SLEEP_TIME)

    # The parser goes out of scope on return; collect what it leaves behind.
    gc.collect()
def run_forever(self):
    '''
    Run one refresh pass over the flash-sale goods stored in the database.

    Original author's note (translated): the idea of this real-time update
    is to refresh only goods on sale today / the previous day / within the
    next two hours; goods further in the future (still listed at the
    original price) are not updated for now.

    The method performs a single pass and then sleeps; any repetition is
    up to the caller, which is not visible here.

    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = list(tmp_sql_server.select_zhe_800_xianshimiaosha_all_goods_id())
    except TypeError:
        # NOTE(review): presumably the select returns a non-iterable value
        # when the database is unreachable -- confirm against the pipeline.
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is not None:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

        # Each row: item[0] is the goods_id, item[1] a JSON string holding
        # 'miaosha_begin_time', item[2] the session id used in the API URL.
        for index, item in enumerate(result, start=1):
            begin_time_text = json.loads(item[1]).get('miaosha_begin_time')
            begin_struct = time.strptime(begin_time_text, '%Y-%m-%d %H:%M:%S')
            # First 10 characters of the float timestamp, as an int.
            miaosha_begin_time = int(str(time.mktime(begin_struct))[0:10])

            # A fresh parser per row; the original note says declaring it
            # outside the loop used a lot of memory.
            zhe_800_miaosha = Zhe800Parse()

            if index % 50 == 0:
                # Reconnect every 50 rows to avoid a long-lived connection
                # that stops responding.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if not tmp_sql_server.is_connect_success:
                print('数据库连接失败,数据库可能关闭或者维护中')
            elif self.is_recent_time(miaosha_begin_time) == 0:
                # 0: the sale is expired, remove the row.
                tmp_sql_server.delete_zhe_800_expired_goods_id(goods_id=item[0])
                print(
                    '过期的goods_id为(%s)' % item[0],
                    ', 限时秒杀开始时间为(%s), 删除成功!' % begin_time_text)
            elif self.is_recent_time(miaosha_begin_time) == 2:
                # Skip without leaving the loop: the rows returned by the
                # database are not guaranteed to be ordered.
                pass
            else:
                # Per the original comment: 1, i.e. inside the update window.
                self._refresh_goods_from_session(
                    item, index, zhe_800_miaosha, tmp_sql_server)

            gc.collect()

        print('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # No updates after midnight: pause for five and a half hours.
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)
    gc.collect()


def _refresh_goods_from_session(self, item, index, zhe_800_miaosha, tmp_sql_server):
    '''
    Re-read the session of one stored goods row and update or delete it.

    :param item: database row (goods_id, JSON text, session id)
    :param index: 1-based position of the row, only used in the log line
    :param zhe_800_miaosha: parser used to fetch and update the goods data
    :param tmp_sql_server: database pipeline used for deletes and updates
    :return: None
    '''
    print(
        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
        % (item[0], index))

    tmp_url = 'https://zapi.zhe800.com/zhe800_n_api/xsq/m/session_deals?session_id={0}&page=1&per_page=1000'.format(
        str(item[2]))
    body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url)
    body_1 = re.compile(r'<pre.*?>(.*)</pre>').findall(body)
    if body_1 == []:
        print('获取到的data为空!')
        return

    payload = json.loads(body_1[0])
    blocks = payload.get('data', {}).get('blocks', [])
    if blocks == []:
        # The session id does not exist.
        print('该session_id不存在,此处跳过')
        return

    deals = [item_s.get('deal', {}) for item_s in blocks]
    if deals == []:
        # The session holds no data: delete the goods tied to it.
        print('该sessionid没有相关key为jsons的数据')
        tmp_sql_server.delete_zhe_800_expired_goods_id(goods_id=item[0])
        print(
            '过期的goods_id为(%s)' % item[0],
            ', 限时秒杀开始时间为(%s), 删除成功!'
            % json.loads(item[1]).get('miaosha_begin_time'))
        return

    miaosha_goods_list = self.get_miaoshao_goods_info_list(data=deals)

    # Every zid currently present in this session.
    miaosha_goods_all_goods_id = [i.get('zid') for i in miaosha_goods_list]
    if item[0] not in miaosha_goods_all_goods_id:
        # The goods was taken off the flash sale.
        print('该商品已被下架限时秒杀活动,此处将其删除')
        tmp_sql_server.delete_zhe_800_expired_goods_id(goods_id=item[0])
        print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
        return

    for item_1 in miaosha_goods_list:
        if item_1.get('zid', '') != item[0]:
            continue

        zhe_800_miaosha.get_goods_data(goods_id=item[0])
        goods_data = zhe_800_miaosha.deal_with_data()
        if goods_data == {}:
            # Nothing was parsed; skip.
            continue

        stock_info = item_1.get('stock_info')
        miaosha_time = item_1.get('miaosha_time')

        goods_data['stock_info'] = stock_info
        goods_data['goods_id'] = str(item_1.get('zid'))
        if stock_info.get('activity_stock') > 0:
            # Prices are only overwritten while activity stock remains.
            goods_data['price'] = item_1.get('price')
            goods_data['taobao_price'] = item_1.get('taobao_price')
        goods_data['sub_title'] = item_1.get('sub_title')
        goods_data['miaosha_time'] = miaosha_time
        begin_time, end_time = self.get_miaosha_begin_time_and_miaosha_end_time(
            miaosha_time=miaosha_time)
        goods_data['miaosha_begin_time'] = begin_time
        goods_data['miaosha_end_time'] = end_time

        zhe_800_miaosha.to_update_zhe_800_xianshimiaosha_table(
            data=goods_data, pipeline=tmp_sql_server)