def run_forever():
    """Real-time updater (v1): loop forever over every Taobao goods_id in the
    DB, re-crawl each one and push refreshed data back through the pipeline.

    Each ``item`` from the query is indexed positionally:
    item[0] = goods_id, item[1] = stored is_delete flag,
    item[2] = stored shelf/down-time JSON string (or None / placeholder).
    """
    while True:
        tmp_sql_server = SqlPools()  # sqlalchemy-managed DB connection pool
        try:
            result = tmp_sql_server.select_taobao_all_goods_id()
        except TypeError:  # raised when the DB connection could not be established
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                taobao = TaoBaoLoginAndParse()
                if index % 50 == 0:
                    # Rebuild the pool every 50 items so one long-lived
                    # connection cannot go stale and start failing silently.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlPools()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    taobao.get_goods_data(item[0])
                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        # Maintain the goods' shelf/down timestamps:
                        #   is_delete 0 -> 1 : record down_time
                        #   is_delete 1 -> 0 : record shelf_time
                        my_shelf_and_down_time = {
                            'shelf_time': '',
                            'down_time': '',
                        }
                        if data['is_delete'] != item[1]:
                            if data['is_delete'] == 0 and item[1] == 1:
                                # NOTE(review): this condition is old==1, new==0
                                # (a 1 -> 0 transition) yet it records down_time,
                                # which contradicts the mapping above — confirm
                                # the intended semantics of is_delete before
                                # relying on these timestamps.
                                my_shelf_and_down_time['down_time'] = str(get_shanghai_time())
                            else:
                                my_shelf_and_down_time['shelf_time'] = str(get_shanghai_time())
                        else:
                            # Status unchanged: only stamp a time if the stored
                            # value is still empty / the initial placeholder.
                            if item[2] is None \
                                    or item[2] == '{"shelf_time": "", "down_time": ""}' \
                                    or len(item[2]) == 35:  # 35 == length of the initial placeholder string
                                if data['is_delete'] == 0:
                                    my_shelf_and_down_time['shelf_time'] = str(get_shanghai_time())
                                else:
                                    my_shelf_and_down_time['down_time'] = str(get_shanghai_time())
                            else:
                                # Otherwise keep the previously stored value.
                                my_shelf_and_down_time = json.loads(item[2])
                        data['my_shelf_and_down_time'] = my_shelf_and_down_time
                        taobao.to_right_and_update_data(data, pipeline=tmp_sql_server)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                index += 1
                gc.collect()
                # Stay polite: spread requests out so they don't collide with
                # real user traffic (can be shortened to 0 on overseas servers).
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)  # after midnight: pause updates for 5.5 h
        else:
            sleep(5)
        gc.collect()
def run_forever():
    """Real-time updater (v2): loop forever over every Taobao goods_id in the
    DB, re-crawl each one, recompute shelf/delete-time and price-change
    bookkeeping, and push the result back through the pipeline.

    Positional layout of each ``item``:
    item[0] = goods_id, item[1] = stored is_delete flag,
    item[2] = stored shelf/down-time value, item[3] = old price,
    item[4] = old taobao_price.
    """
    while True:
        # Must be created inside the loop (not a global): otherwise every day
        # would keep appending to the same dated log file.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)
        tmp_sql_server = SqlPools()  # sqlalchemy-managed DB connection pool
        try:
            result = tmp_sql_server.select_taobao_all_goods_id()
        except TypeError:  # raised when the DB connection could not be established
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:
                    # Rebuild the pool every 50 items so one long-lived
                    # connection cannot go stale and start failing silently.
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlPools()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (item[0], str(index)))
                    data = taobao.get_goods_data(item[0])
                    if data.get('is_delete') == 1:
                        # Goods that were already stored as delisted: update the
                        # shelf/delete bookkeeping directly and skip full parsing.
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data['delete_time'] = \
                            get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        taobao.to_right_and_update_data(data, pipeline=tmp_sql_server)
                        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # throttle server updates
                        index += 1
                        gc.collect()
                        continue

                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data['delete_time'] = \
                            get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        data['_is_price_change'], data['_price_change_info'] = \
                            _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])
                        taobao.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:
                        my_lg.info('------>>>| 休眠5s中...')
                        sleep(5)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(10)
                index += 1
                gc.collect()
                # Stay polite: spread requests out so they don't collide with
                # real user traffic (can be shortened to 0 on overseas servers).
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        my_lg.info('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)  # after midnight: pause updates for 5.5 h
        else:
            sleep(5)
        gc.collect()
        restart_program()  # re-exec the whole process after a full pass
def run_forever():
    """Real-time updater (v3): loop forever over the goods returned by
    ``tb_select_str_3``, re-crawl each one, recompute shelf/delete-time,
    price-change and sku-transition bookkeeping, and push the result back
    through the pipeline.

    Positional layout of each ``item``:
    item[0] = goods_id, item[1] = stored is_delete flag,
    item[2] = old price, item[3] = old taobao_price,
    item[4] = stored shelf_time, item[5] = stored delete_time,
    item[6] = stored price_info_list, item[7] = stored _is_price_change.
    """
    while True:
        # Must be created inside the loop (not a global): otherwise every day
        # would keep appending to the same dated log file.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)
        tmp_sql_server = SqlPools()  # sqlalchemy-managed DB connection pool
        try:
            result = tmp_sql_server._select_table(sql_str=tb_select_str_3,)
        except TypeError:  # raised when the DB connection could not be established
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:
                    # Rebuild the pool every 50 items so one long-lived
                    # connection cannot go stale and start failing silently.
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlPools()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (item[0], str(index)))
                    goods_head = taobao.get_goods_data(item[0])
                    # Remember whether the goods is already delisted so the
                    # empty-data branch below can skip its back-off sleep.
                    head_is_delete = goods_head.get('is_delete', 0)
                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data['delete_time'] = \
                            get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])
                        data['_is_price_change'], data['_price_change_info'] = \
                            _get_price_change_info(
                                old_price=item[2],
                                old_taobao_price=item[3],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])
                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[6]),
                                site_id=1)
                        except AttributeError:
                            # Stored value was already formatted — use it as-is.
                            old_sku_info = item[6]
                        # NOTE(review): this overwrites the '_is_price_change'
                        # value computed just above from _get_price_change_info —
                        # confirm that the sku-transition result is meant to win.
                        data['_is_price_change'], data['sku_info_trans_time'] = \
                            get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(data['price_info_list'], site_id=1),
                                is_price_change=item[7] if item[7] is not None else 0)
                        taobao.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:
                        # Already-delisted goods legitimately parse to {}; only
                        # back off when the empty result is unexpected.
                        if head_is_delete != 1:
                            # NOTE(review): message says 5s but the actual sleep
                            # is 4s — confirm which one is intended.
                            my_lg.info('------>>>| 休眠5s中...')
                            sleep(4)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(10)
                index += 1
                gc.collect()
                # Stay polite: spread requests out so they don't collide with
                # real user traffic (can be shortened to 0 on overseas servers).
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        my_lg.info('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)  # after midnight: pause updates for 5.5 h
        else:
            sleep(5)
        gc.collect()
        restart_program()  # re-exec the whole process after a full pass