def get_spike_hour_goods_info(self):
    '''
    Build the flash-sale listing URLs, crawl them and store the goods that are new.

    For every tab_id and for pages 0..49 the listing is fetched through
    MyRequests.get_url_body, its "goodslist" is converted with
    self.get_miaoshao_goods_info_list, and every goods_id not yet returned by
    select_juanpi_xianshimiaosha_all_goods_id() is parsed with JuanPiParse and
    inserted through insert_into_juanpi_xianshimiaosha_table.

    NOTE(review): the nesting below was reconstructed from whitespace-collapsed
    source; the placement of the per-page sleep and cleanup should be confirmed.
    :return: None
    '''
    tab_id_list = [11, 12, 13, 21, 22, 23, 31, 32, 33]  # notice

    for tab_id in tab_id_list:
        for index in range(0, 50):
            tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                str(tab_id), str(index)
            )
            print('待抓取的限时秒杀地址为: ', tmp_url)

            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
            if body == '':
                # An empty body ends the crawl of this tab.
                break

            try:
                payload = json.loads(body).get('data', {})
            except (ValueError, TypeError, AttributeError):
                # Invalid JSON, a non-text body, or a top-level value that is not an object.
                break
            if not isinstance(payload, dict):
                # e.g. "data": null -- nothing usable on this tab.
                break

            goods_list = payload.get('goodslist')
            if goods_list == []:
                # An explicitly empty list means there are no further pages for this tab.
                print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(tab_id, index))
                break
            if not goods_list:
                # Key missing (or null): skip this page only and try the next one.
                print('goodslist为[], 此处跳过')
                continue

            miaosha_goods_list = self.get_miaoshao_goods_info_list(data=goods_list)
            print(miaosha_goods_list)

            juanpi = JuanPiParse()
            my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            if my_pipeline.is_connect_success:
                # Query the stored ids once per page and keep them in a set for O(1) lookups.
                db_rows = my_pipeline.select_juanpi_xianshimiaosha_all_goods_id()
                db_goods_ids = set() if db_rows is None else {row[0] for row in db_rows}

                for item in miaosha_goods_list:
                    goods_id = item.get('goods_id', '')
                    if goods_id in db_goods_ids:
                        print('该goods_id已经存在于数据库中, 此处跳过')
                        continue
                    if not goods_id:
                        # Without an id neither the detail URL nor the parser call can be built.
                        continue

                    tmp_url = 'http://shop.juanpi.com/deal/' + goods_id
                    juanpi.get_goods_data(goods_id=goods_id)
                    goods_data = juanpi.deal_with_data()
                    if goods_data == {}:
                        # Nothing was parsed for this goods_id: skip it.
                        continue

                    # Otherwise merge in the listing information and insert the record.
                    goods_data['stock_info'] = item.get('stock_info')
                    goods_data['goods_id'] = item.get('goods_id')
                    goods_data['spider_url'] = tmp_url
                    goods_data['username'] = '******'
                    goods_data['price'] = item.get('price')  # original special price before the flash sale
                    goods_data['taobao_price'] = item.get('taobao_price')  # flash-sale price
                    goods_data['sub_title'] = item.get('sub_title', '')
                    goods_data['miaosha_time'] = item.get('miaosha_time')
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                        self.get_miaosha_begin_time_and_miaosha_end_time(
                            miaosha_time=item.get('miaosha_time'))
                    goods_data['tab_id'] = tab_id
                    goods_data['page'] = index

                    juanpi.insert_into_juanpi_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                    sleep(.4)  # short pause between inserts to avoid errors

                sleep(.65)

            # Drop the parser before the next page and collect garbage.
            del juanpi
            gc.collect()
def get_spike_hour_goods_info(self):
    '''
    Build the flash-sale listing URLs, crawl them through a proxy and store new goods.

    For every tab_id and for pages 0..49 the listing is fetched with requests.get
    using a proxy drawn from self.get_proxy_ip_from_ip_pool(), its "goodslist" is
    converted with self.get_miaoshao_goods_info_list, and every goods_id not yet
    returned by select_juanpi_xianshimiaosha_all_goods_id() is parsed with
    JuanPiParse and inserted through insert_into_juanpi_xianshimiaosha_table.

    NOTE(review): the nesting below was reconstructed from whitespace-collapsed
    source; the placement of the per-page sleep and cleanup should be confirmed.
    :return: None
    '''
    tab_id_list = [11, 12, 13, 21, 22, 23, 31, 32, 33]  # notice

    for tab_id in tab_id_list:
        for index in range(0, 50):
            tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                str(tab_id), str(index))
            print('待抓取的限时秒杀地址为: ', tmp_url)

            # Pick a proxy ip for this request.
            self.proxies = self.get_proxy_ip_from_ip_pool()  # {'http': ['xx', 'yy', ...]}
            http_proxies = self.proxies['http']
            # Index over the proxy list itself; len(self.proxies) is the number of
            # dict keys, which would restrict the choice to the first entries.
            # Assumes the pool returned at least one proxy.
            self.proxy = http_proxies[randint(0, len(http_proxies) - 1)]
            # NOTE(review): only an 'http' entry is supplied while tmp_url is https;
            # requests selects proxies by URL scheme, so this proxy may not be
            # applied to the request -- confirm.
            tmp_proxies = {
                'http': self.proxy,
            }

            try:
                response = requests.get(
                    tmp_url,
                    headers=self.headers,
                    proxies=tmp_proxies,
                    timeout=10)
                body = response.content.decode('utf-8')
            except Exception:
                # Deliberate best-effort: any request/decoding failure ends this tab.
                print('requests.get()请求超时....')
                print('data为空!')
                break

            try:
                payload = json.loads(body).get('data', {})
            except (ValueError, TypeError, AttributeError):
                # Invalid JSON or a top-level value that is not an object.
                break
            if not isinstance(payload, dict):
                # e.g. "data": null -- nothing usable on this tab.
                break

            goods_list = payload.get('goodslist')
            if goods_list == []:
                # An explicitly empty list means there are no further pages for this tab.
                print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                    tab_id, index))
                break
            if not goods_list:
                # Key missing (or null): skip this page only and try the next one.
                print('goodslist为[], 此处跳过')
                continue

            miaosha_goods_list = self.get_miaoshao_goods_info_list(data=goods_list)
            print(miaosha_goods_list)

            juanpi = JuanPiParse()
            my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            if my_pipeline.is_connect_success:
                # Query the stored ids once per page and keep them in a set for O(1) lookups.
                db_rows = my_pipeline.select_juanpi_xianshimiaosha_all_goods_id()
                db_goods_ids = set() if db_rows is None else {row[0] for row in db_rows}

                for item in miaosha_goods_list:
                    goods_id = item.get('goods_id', '')
                    if goods_id in db_goods_ids:
                        print('该goods_id已经存在于数据库中, 此处跳过')
                        continue
                    if not goods_id:
                        # Without an id neither the detail URL nor the parser call can be built.
                        continue

                    tmp_url = 'http://shop.juanpi.com/deal/' + goods_id
                    juanpi.get_goods_data(goods_id=goods_id)
                    goods_data = juanpi.deal_with_data()
                    if goods_data == {}:
                        # Nothing was parsed for this goods_id: skip it.
                        continue

                    # Otherwise merge in the listing information and insert the record.
                    goods_data['stock_info'] = item.get('stock_info')
                    goods_data['goods_id'] = item.get('goods_id')
                    goods_data['spider_url'] = tmp_url
                    goods_data['username'] = '******'
                    goods_data['price'] = item.get('price')  # original special price before the flash sale
                    goods_data['taobao_price'] = item.get('taobao_price')  # flash-sale price
                    goods_data['sub_title'] = item.get('sub_title', '')
                    goods_data['miaosha_time'] = item.get('miaosha_time')
                    goods_data['tab_id'] = tab_id
                    goods_data['page'] = index

                    juanpi.insert_into_juanpi_xianshimiaosha_table(
                        data=goods_data, pipeline=my_pipeline)
                    sleep(.3)  # short pause between inserts to avoid errors

                sleep(.65)

            # Drop the parser before the next page and collect garbage.
            del juanpi
            gc.collect()
def run_forever(self):
    '''
    Run one real-time refresh pass over the stored flash-sale goods.

    Every row returned by select_juanpi_xianshimiaosha_all_goods_id() is
    classified with self.is_recent_time(begin timestamp):
      * 0 -> the goods is deleted as expired;
      * 2 -> the goods is left untouched;
      * anything else (1 per the original comment: inside the update window) ->
        its listing page is re-fetched and the goods is either deleted (no longer
        listed) or re-parsed and updated.
    Per the original note, the intent is to refresh only goods in the near-term
    window and to leave goods further in the future (still at original price) alone.

    Each row is read as item[0]=goods_id, item[1]=JSON text holding
    'miaosha_begin_time', item[2]=tab_id, item[3]=page.

    NOTE(review): the nesting below was reconstructed from whitespace-collapsed
    source; in particular the placement of the index bookkeeping and of the
    trailing midnight sleep should be confirmed.
    :return: None
    '''
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    try:
        result = list(tmp_sql_server.select_juanpi_xianshimiaosha_all_goods_id())
    except TypeError:
        # list(None): the query returned nothing usable.
        print('TypeError错误, 原因数据库连接失败...(可能维护中)')
        result = None

    if result is not None:
        print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        print(result)
        print('--------------------------------------------------------')

        print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1
        # The parser is created here and periodically replaced below to limit memory growth.
        juanpi_miaosha = JuanPiParse()

        for item in result:
            # Parse the stored JSON once and reuse the begin-time text for logging.
            begin_time_text = json.loads(item[1]).get('miaosha_begin_time')
            miaosha_begin_time = int(
                time.mktime(time.strptime(begin_time_text, '%Y-%m-%d %H:%M:%S')))

            if index % 50 == 0:
                # Reconnect every 50 items so a long-lived connection does not stall.
                print('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('与数据库的新连接成功建立...')

            if not tmp_sql_server.is_connect_success:
                print('数据库连接失败,数据库可能关闭或者维护中')
                continue

            # Classify once so all branches agree even if the clock moves between calls.
            time_state = self.is_recent_time(miaosha_begin_time)
            if time_state == 0:
                tmp_sql_server.delete_juanpi_expired_goods_id(goods_id=item[0])
                print(
                    '过期的goods_id为(%s)' % item[0],
                    ', 限时秒杀开始时间为(%s), 删除成功!' % begin_time_text)

            elif time_state == 2:
                # Deliberately not a break: rows are not returned in time order.
                pass

            else:
                print(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                    % (item[0], index))

                tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                    str(item[2]),
                    str(item[3]),
                )

                body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
                if body == '':
                    # An empty body aborts the rest of this pass.
                    break

                try:
                    payload = json.loads(body).get('data', {})
                except (ValueError, TypeError, AttributeError):
                    # Invalid JSON or a top-level value that is not an object.
                    break
                if not isinstance(payload, dict):
                    # e.g. "data": null -- treat like an unparsable response.
                    break

                goods_list = payload.get('goodslist')
                if goods_list == []:
                    print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.
                          format(item[2], item[3]))
                elif not goods_list:
                    print('goodslist为[], 此处跳过')
                else:
                    miaosha_goods_list = self.get_miaoshao_goods_info_list(
                        data=goods_list)

                    # All goods_id values currently listed on this tab_id/page.
                    miaosha_goods_all_goods_id = [
                        i.get('goods_id') for i in miaosha_goods_list
                    ]

                    if item[0] not in miaosha_goods_all_goods_id:
                        # The goods is no longer on its page: it was taken off the sale.
                        tmp_sql_server.delete_juanpi_expired_goods_id(
                            goods_id=item[0])
                        print(
                            '该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除'
                            % item[0])
                    else:
                        for item_1 in miaosha_goods_list:
                            if item_1.get('goods_id', '') != item[0]:
                                continue

                            juanpi_miaosha.get_goods_data(goods_id=item[0])
                            goods_data = juanpi_miaosha.deal_with_data()
                            if goods_data == {}:
                                # Nothing was parsed for this goods_id: skip it.
                                continue

                            goods_data['stock_info'] = item_1.get('stock_info')
                            goods_data['goods_id'] = item_1.get('goods_id')

                            # Only overwrite prices while activity stock remains;
                            # tolerate a missing stock_info / activity_stock.
                            stock_info = item_1.get('stock_info') or {}
                            if (stock_info.get('activity_stock') or 0) > 0:
                                goods_data['price'] = item_1.get('price')  # original special price before the flash sale
                                goods_data['taobao_price'] = item_1.get('taobao_price')  # flash-sale price

                            goods_data['sub_title'] = item_1.get('sub_title', '')
                            goods_data['miaosha_time'] = item_1.get('miaosha_time')
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = \
                                self.get_miaosha_begin_time_and_miaosha_end_time(
                                    miaosha_time=item_1.get('miaosha_time'))

                            juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                                data=goods_data, pipeline=tmp_sql_server)
                            sleep(.2)  # avoid going too fast

            if index % 10 == 0:
                # Replace the parser every few items to release what it holds.
                juanpi_miaosha = JuanPiParse()
                gc.collect()

            index += 1
            gc.collect()

        print('全部数据更新完毕'.center(100, '#'))

    if get_shanghai_time().hour == 0:
        # No updates after midnight: pause for 5.5 hours.
        sleep(60 * 60 * 5.5)
    gc.collect()