def set_sql_cli(self):
    """
    Set the db connection type.
    :return:
    """
    if self.db_conn_type == 1:
        # recommended
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    elif self.db_conn_type == 2:
        # manage the db connection pool with sqlalchemy
        self.sql_cli = SqlPools()
    else:
        raise ValueError('db_conn_type 值异常!')
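Both pipeline classes are used interchangeably through `self.sql_cli`, so they are assumed to share the small surface the updater actually touches (`is_connect_success`, `_select_table`, plus the insert/update calls made by the `to_right_and_update_*` helpers). A minimal sketch of that assumed duck-typed interface, using `typing.Protocol` purely for illustration (the real classes do not declare it):

from typing import Optional, Protocol


class SqlCliLike(Protocol):
    """Assumed minimal interface shared by SqlServerMyPageInfoSaveItemPipeline and SqlPools."""

    # True when the underlying connection (or pool) was opened successfully.
    is_connect_success: bool

    def _select_table(self, sql_str: str, **kwargs) -> Optional[list]:
        """Run a SELECT and return the rows, or None on failure."""
        ...


def fetch_rows(sql_cli: SqlCliLike, sql_str: str) -> list:
    # Guard on the connection flag the same way the updaters below do.
    if not sql_cli.is_connect_success:
        return []

    return sql_cli._select_table(sql_str=sql_str) or []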
async def _get_db_old_data(self) -> (list, None):
    '''
    Fetch the db records that need updating.
    :return:
    '''
    if self.db_conn_type == 1:
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    elif self.db_conn_type == 2:
        # manage the db connection pool with sqlalchemy
        self.sql_cli = SqlPools()
    else:
        raise ValueError('db_conn_type 值异常!')

    result = None
    try:
        if self.db_res_from == 0:
            result = self.sql_cli._select_table(sql_str=tb_select_str_3,)
        elif self.db_res_from == 1:
            result = await get_waited_2_update_db_data_from_server(
                server_ip=self.server_ip,
                _type='tb',
                child_type=0,)
        elif self.db_res_from == 2:
            # take a large slice by default, so later goods can still be updated
            # even if the first 100 or so fail at a high rate
            result = get_waited_2_update_db_data_from_redis_server(
                spider_name='tb0',
                logger=self.lg,
                slice_num=800,)
        else:
            raise ValueError('self.db_res_from value异常!')
    except TypeError:
        self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
    except Exception:
        self.lg.error('遇到错误:', exc_info=True)

    await _print_db_old_data(logger=self.lg, result=result)

    return result
async def _get_db_old_data(self) -> (list, None):
    '''
    Fetch the db records that need updating.
    :return:
    '''
    # self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    # manage the db connection pool with sqlalchemy
    self.sql_cli = SqlPools()
    result = None
    try:
        # result = self.sql_cli._select_table(sql_str=tb_select_str_3,)
        result = await get_waited_2_update_db_data_from_server(
            server_ip=self.server_ip,
            _type='tb',
            child_type=0,
        )
    except TypeError:
        self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
    except Exception:
        self.lg.error('遇到错误:', exc_info=True)

    await _print_db_old_data(logger=self.lg, result=result)

    return result
def run_forever():
    #### real-time data update
    while True:
        # ** must not be a global created outside the loop, otherwise every run keeps logging to the same file
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)

        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # manage the db connection pool with sqlalchemy
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server.select_taobao_all_goods_id()
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            index = 1
            for item in result:  # update data in real time
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:  # reconnect every 50 items to avoid a single long-lived connection going unresponsive
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (item[0], str(index)))
                    data = taobao.get_goods_data(item[0])
                    if data.get('is_delete') == 1:
                        # handle separately goods that were already off the shelf when first inserted
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            MyShelfAndDownTime=item[2])
                        # my_lg.info('------>>>| 爬取到的数据为: ' + str(data))
                        taobao.to_right_and_update_data(data, pipeline=tmp_sql_server)
                        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # avoid hitting the server too frequently
                        index += 1
                        gc.collect()
                        continue

                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            MyShelfAndDownTime=item[2])
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[3],
                            old_taobao_price=item[4],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'])
                        # my_lg.info('------>>>| 爬取到的数据为: ' + str(data))
                        taobao.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:
                        my_lg.info('------>>>| 休眠5s中...')
                        sleep(5)
                else:  # db connection was not established
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(10)
                    pass
                index += 1
                # try:
                #     del taobao
                # except:
                #     pass
                gc.collect()
                # on an overseas server this can be shortened, even set to 0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # not too frequent; try to stagger with user requests
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()
def run_forever():
    #### real-time data update
    while True:
        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # manage the db connection pool with sqlalchemy
        tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server.select_taobao_all_goods_id()
            result_2 = list(tmp_sql_server_2.select_old_table_all_goods_id())
            # print(result_2)
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result_2)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            index = 1
            new_table_ali_1688_all_goods_id_list = [item[0] for item in result]
            for item in result_2:  # update data in real time
                data = {}
                taobao = TaoBaoLoginAndParse()
                if index % 50 == 0:  # reconnect every 50 items to avoid a single long-lived connection going unresponsive
                    print('正在重置,并与数据库建立新连接中...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item[0])
                    if goods_id == '':
                        print('@@@ 原商品的地址为: ', item[0])
                        continue
                    else:
                        if goods_id in new_table_ali_1688_all_goods_id_list:
                            print('该goods_id已经存在于数据库中, 此处跳过!')
                            continue
                        else:
                            print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id, index))
                            tt = taobao.get_goods_data(goods_id)
                            if tt.get('is_delete') == 1:
                                # goods that are already off the shelf but still need to be inserted
                                tt['goods_id'] = goods_id
                                tt['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                                tt['username'] = '******'
                                tt['main_goods_id'] = item[1]
                                # print('------>>>| 爬取到的数据为: ', data)
                                taobao.old_taobao_goods_insert_into_new_table(data=tt, pipeline=tmp_sql_server_2)
                                index += 1
                                gc.collect()
                                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                                continue
                            else:
                                pass

                            data = taobao.deal_with_data(goods_id=goods_id)
                            if data != {}:
                                data['goods_id'] = goods_id
                                data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                                data['username'] = '******'
                                data['main_goods_id'] = item[1]
                                # print('------>>>| 爬取到的数据为: ', data)
                                taobao.old_taobao_goods_insert_into_new_table(data, pipeline=tmp_sql_server_2)
                            else:
                                pass
                else:  # db connection was not established
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                # try:
                #     del taobao
                # except:
                #     pass
                gc.collect()
                # on an overseas server this can be shortened, even set to 0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # not too frequent; try to stagger with user requests
            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
class TBUpdater(AsyncCrawler):
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/')
        self.sql_cli = None
        # 1 SqlServerMyPageInfoSaveItemPipeline | 2 SqlPools
        self.db_conn_type = 1
        self.goods_index = 1
        # concurrency level
        self.concurrency = 100
        self.concurrent_type = CONCURRENT_TYPE
        # 0 sqlserver | 1 new_my_server | 2 redis
        self.db_res_from = 2
        if 'armv7l-with-debian' in platform.platform():
            self.server_ip = 'http://0.0.0.0:80'
        else:
            self.server_ip = 'http://118.31.39.97'
            # self.server_ip = 'http://0.0.0.0:5000'

    async def _update_db(self):
        '''
        Update data in real time.
        :return:
        '''
        while True:
            # long runs raise OSError: [Errno 24] Too many open files, so a per-day log file is not used
            # self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
                self.taobao = TaoBaoLoginAndParse(logger=self.lg, is_real_times_update_call=True)
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        break

                    one_res, index = await self._get_one_res(
                        slice_params_list=slice_params_list,
                        index=index)
                    await self._except_sleep(res=one_res)

                self.lg.info('全部数据更新完毕'.center(100, '#'))
            # sleep(60*60)
            if get_shanghai_time().hour == 0:
                # no updates after midnight
                await async_sleep(60 * 60 * .5)
            else:
                await async_sleep(5.)
            try:
                # del self.lg
                del result
            except:
                pass
            collect()

    async def _get_db_old_data(self) -> (list, None):
        '''
        Fetch the db records that need updating.
        :return:
        '''
        if self.db_conn_type == 1:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        elif self.db_conn_type == 2:
            # manage the db connection pool with sqlalchemy
            self.sql_cli = SqlPools()
        else:
            raise ValueError('db_conn_type 值异常!')

        result = None
        try:
            if self.db_res_from == 0:
                result = self.sql_cli._select_table(sql_str=tb_select_str_3,)
            elif self.db_res_from == 1:
                result = await get_waited_2_update_db_data_from_server(
                    server_ip=self.server_ip,
                    _type='tb',
                    child_type=0,)
            elif self.db_res_from == 2:
                # take a large slice by default, so later goods can still be updated
                # even if the first 100 or so fail at a high rate
                result = get_waited_2_update_db_data_from_redis_server(
                    spider_name='tb0',
                    logger=self.lg,
                    slice_num=800,)
            else:
                raise ValueError('self.db_res_from value异常!')
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_one_res(self, slice_params_list: list, index) -> tuple:
        """
        Get the one_res corresponding to slice_params_list.
        :param slice_params_list:
        :param index:
        :return: (list, int)
        """
        def get_tasks_params_list(slice_params_list: list, index: int) -> list:
            tasks_params_list = []
            for item in slice_params_list:
                db_goods_info_obj = TBDbGoodsInfoObj(item=item, logger=self.lg)
                tasks_params_list.append({
                    'db_goods_info_obj': db_goods_info_obj,
                    'index': index,
                })
                index += 1

            return tasks_params_list

        def get_create_task_msg(k) -> str:
            return 'create task[where is goods_id: {}, index: {}] ...'.format(
                k['db_goods_info_obj'].goods_id,
                k['index'],)

        def get_now_args(k) -> list:
            return [
                'tb',
                k['db_goods_info_obj'].goods_id,
                k['index'],
                self.lg,
            ]

        async def handle_one_res(one_res: list):
            """
            Post-process one_res.
            :param one_res:
            :return:
            """
            nonlocal slice_params_list

            # build the new new_slice_params_list
            new_slice_params_list = []
            for item in slice_params_list:
                goods_id = item[1]
                for i in one_res:
                    # self.lg.info(str(i))
                    try:
                        goods_id2 = i[1]
                        index = i[2]
                        if goods_id == goods_id2:
                            new_slice_params_list.append({
                                'index': index,
                                'before_goods_data': i[3],
                                'end_goods_data': i[4],
                                'item': item,
                            })
                            break
                        else:
                            continue
                    except IndexError:
                        continue

            # store in a blocking way to avoid heavy db concurrency causing lots of deadlocks
            tasks = []
            for k in new_slice_params_list:
                item = k['item']
                index = k['index']
                db_goods_info_obj = TBDbGoodsInfoObj(item=item, logger=self.lg)
                self.lg.info('create task[where is goods_id: {}, index: {}]...'.format(
                    db_goods_info_obj.goods_id, index))
                tasks.append(self.loop.create_task(self._update_one_goods_info_in_db(
                    db_goods_info_obj=db_goods_info_obj,
                    index=index,
                    before_goods_data=k['before_goods_data'],
                    end_goods_data=k['end_goods_data'],)))

            # self.lg.error(str(one_res))
            # self.lg.error(str(tasks))
            one_res = await _get_async_task_result(tasks=tasks, logger=self.lg)
            # pprint(one_res)
            try:
                del new_slice_params_list
            except:
                pass

            return one_res

        # tasks = []
        # # method 1
        # for item in slice_params_list:
        #     db_goods_info_obj = TBDbGoodsInfoObj(item=item, logger=self.lg)
        #     self.lg.info('创建 task goods_id: {}'.format(db_goods_info_obj.goods_id))
        #     tasks.append(self.loop.create_task(self._update_one_goods_info(
        #         db_goods_info_obj=db_goods_info_obj,
        #         index=index)))
        #     index += 1
        #
        # res = await _get_async_task_result(tasks=tasks, logger=self.lg)

        # method 2
        one_res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=get_tasks_params_list(
                slice_params_list=slice_params_list,
                index=index,),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=block_get_one_goods_info_task_by_external_type,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res2,
            one_default_res=(),
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,
            concurrent_type=self.concurrent_type,
        )
        # pprint(one_res)
        res = await handle_one_res(one_res=one_res)

        return (res, index)

    async def _update_one_goods_info_in_db(self, db_goods_info_obj, index, before_goods_data, end_goods_data):
        """
        Update a single goods record.
        :param db_goods_info_obj:
        :param index:
        :param before_goods_data:
        :param end_goods_data:
        :return:
        """
        res = False
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            db_conn_type=self.db_conn_type,
            remainder=25,)
        if self.sql_cli.is_connect_success:
            self.lg.info('*' * 20 + ' updating goods_id: {}, index: {} ...'.format(
                db_goods_info_obj.goods_id,
                index,
            ))
            # read is_delete first so the empty-data branch below does not sleep for already-delisted goods
            before_goods_data_is_delete = before_goods_data.get('is_delete', 0)
            if end_goods_data != {}:
                data = get_goods_info_change_data(
                    target_short_name='tb',
                    logger=self.lg,
                    data=end_goods_data,
                    db_goods_info_obj=db_goods_info_obj,
                )
                res = to_right_and_update_tb_data(
                    data=data,
                    pipeline=self.sql_cli,
                    logger=self.lg,)
            else:
                # the returned data is empty
                if before_goods_data_is_delete == 1:
                    # goods found to be off the shelf: also mark res as True
                    res = True
                else:
                    self.lg.info('goods_id: {}, 阻塞休眠7s中...'.format(
                        db_goods_info_obj.goods_id,))
                    await async_sleep(delay=7., loop=self.loop)
                    # switching to a blocking sleep would hang the machine
                    # sleep(7.)
        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=5, loop=self.loop)

        await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        collect()

        return [db_goods_info_obj.goods_id, res]

    async def _get_new_tb_obj(self, index) -> None:
        if index % 10 == 0:
            try:
                del self.taobao
            except:
                pass
            collect()
            self.taobao = TaoBaoLoginAndParse(logger=self.lg, is_real_times_update_call=True)

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        '''
        Update a single goods record.
        :return:
        '''
        res = False
        await self._get_new_tb_obj(index=index)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            db_conn_type=self.db_conn_type,
            remainder=25,)

        if self.sql_cli.is_connect_success:
            self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                db_goods_info_obj.goods_id, str(index)))
            oo = self.taobao.get_goods_data(goods_id=db_goods_info_obj.goods_id)
            oo_is_delete = oo.get('is_delete', 0)  # read first so the empty-data branch below does not sleep for already-delisted goods
            data = self.taobao.deal_with_data(goods_id=db_goods_info_obj.goods_id)
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='tb',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,)
                res = to_right_and_update_tb_data(
                    data=data,
                    pipeline=self.sql_cli,
                    logger=self.lg)
            else:
                if oo_is_delete == 1:
                    # goods found to be off the shelf: also mark res as True
                    res = True
                else:
                    self.lg.info('------>>>| 休眠8s中...')
                    await async_sleep(delay=8, loop=self.loop)
        else:
            # db connection was not established
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=10, loop=self.loop)

        index += 1
        self.goods_index = index
        collect()
        # on an overseas server this can be shortened, even set to 0s
        await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # not too frequent; try to stagger with user requests

        return [db_goods_info_obj.goods_id, res]

    async def _except_sleep(self, res):
        '''
        Sleep on abnormal failure rates.
        :param res:
        :return:
        '''
        count = 0
        all_count_fail_sleep_time = 100.
        # used to sleep 40.; now no sleep
        sleep_time = 0.
        for item in res:
            try:
                if not item[1]:
                    count += 1
            except IndexError:
                pass

        self.lg.info('Fail count: {}个, 并发量: {}个'.format(count, self.concurrency))
        if count / self.concurrency >= .96:
            # sleep strategy when (almost) everything failed
            self.lg.info('抓取异常!! 休眠{}s中...'.format(all_count_fail_sleep_time))
            await async_sleep(all_count_fail_sleep_time)
        else:
            if count >= int(self.concurrency / 5):
                self.lg.info('抓取异常!! 休眠{}s中...'.format(sleep_time))
                await async_sleep(sleep_time)

        return None

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.sql_cli
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
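`TasksParamsListObj` is a project helper whose implementation is not shown in this section. Judging only from how `_update_db` consumes it (construct with the full result list and a `step`, call `__next__()` for each slice, stop on `AssertionError`), a minimal stand-in could look like the sketch below; the name and behavior are assumptions for illustration only.

class TasksParamsListObjSketch:
    """Hypothetical stand-in: yields fixed-size slices of a params list and
    raises AssertionError when exhausted, mirroring the
    `except AssertionError: break` loop used in _update_db above."""

    def __init__(self, tasks_params_list: list, step: int):
        self.tasks_params_list = tasks_params_list
        self.step = step
        self._pos = 0

    def __next__(self) -> list:
        assert self._pos < len(self.tasks_params_list), 'no more slices'
        chunk = self.tasks_params_list[self._pos:self._pos + self.step]
        self._pos += self.step
        return chunk


# usage mirroring _update_db:
# obj = TasksParamsListObjSketch(tasks_params_list=result, step=100)
# while True:
#     try:
#         slice_params_list = obj.__next__()
#     except AssertionError:
#         break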
class CommonGoodsRealTimeUpdater(AsyncCrawler):
    """Real-time updater for regular goods."""
    def __init__(self):
        self.goods_spider_type = GOODS_SPIDER_NAME
        assert self.goods_spider_type is not None
        assert self.goods_spider_type in ('tb', 'tm'), \
            'self.goods_spider_type value异常!'
        AsyncCrawler.__init__(
            self,
            log_print=True,
            log_save_path=self.get_log_save_path(),
        )
        self.set_concurrency()
        self.crawl_type = CRAWL_TYPE_ASYNCIO
        self.concurrent_type = CONCURRENT_TYPE
        self.db_res_from = DB_RES_FROM
        self.db_conn_type = DB_CONN_TYPE
        self.sql_cli = None
        self.set_sql_cli()
        assert self.db_res_from in (0, 1, 2,), \
            'self.db_res_from value异常!'
        self.db_data_slice_num = 800
        self.is_real_times_update_call = True
        if 'armv7l-with-debian' in platform.platform():
            self.server_ip = 'http://0.0.0.0:80'
        else:
            self.server_ip = 'http://118.31.39.97'
            # self.server_ip = 'http://0.0.0.0:5000'

    def get_log_save_path(self) -> str:
        if self.goods_spider_type == 'tm':
            return MY_SPIDER_LOGS_PATH + '/天猫/实时更新/'
        elif self.goods_spider_type == 'tb':
            return MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/'
        else:
            raise NotImplementedError

    def set_concurrency(self) -> None:
        """
        Set the concurrency level and log_save_path.
        :return:
        """
        if self.goods_spider_type == 'tm':
            self.concurrency = 100
        elif self.goods_spider_type == 'tb':
            self.concurrency = 100
        else:
            raise NotImplementedError

    def set_sql_cli(self):
        """
        Set the db connection type.
        :return:
        """
        if self.db_conn_type == 1:
            # recommended
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        elif self.db_conn_type == 2:
            # manage the db connection pool with sqlalchemy
            self.sql_cli = SqlPools()
        else:
            raise ValueError('db_conn_type 值异常!')

    async def _update_db(self):
        while True:
            # long runs raise OSError: [Errno 24] Too many open files, so a per-day log file is not used
            # self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result,
                    step=self.concurrency)
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        break

                    one_res, index = await self._get_one_res(
                        slice_params_list=slice_params_list,
                        index=index)
                    await self._except_sleep(res=one_res)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # no updates after midnight
                await async_sleep(60 * 60 * .5)
            else:
                await async_sleep(5.)
            try:
                # del self.lg
                del result
            except Exception:
                pass
            collect()

    async def _get_db_old_data(self) -> (list, None):
        """
        Fetch the db records that need updating.
        :return:
        """
        result = None
        try:
            if self.db_res_from == 0:
                if self.goods_spider_type == 'tm':
                    sql_str = tm_select_str_3
                elif self.goods_spider_type == 'tb':
                    sql_str = tb_select_str_3
                else:
                    raise NotImplementedError
                result = list(
                    self.sql_cli._select_table(
                        sql_str=sql_str,
                        logger=self.lg,
                    ))
            elif self.db_res_from == 1:
                result = await get_waited_2_update_db_data_from_server(
                    server_ip=self.server_ip,
                    _type=self.goods_spider_type,
                    child_type=0,
                )
            else:
                # take a large slice by default, so later goods can still be updated
                # even if the first 100 or so fail at a high rate
                result = get_waited_2_update_db_data_from_redis_server(
                    # eg: 'tm0'
                    spider_name=self.goods_spider_type + '0',
                    logger=self.lg,
                    slice_num=self.db_data_slice_num,
                )
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_one_res(self, slice_params_list, index) -> tuple:
        """
        Get the one_res corresponding to slice_params_list.
        :param slice_params_list:
        :param index:
        :return: (list, int)
        """
        if self.crawl_type == CRAWL_TYPE_ASYNCIO:
            """asyncio"""
            if self.goods_spider_type == 'tm':
                tasks_params_list = self.get_tm_tasks_params_list(
                    slice_params_list=slice_params_list,
                    index=index,
                )
                func_name_where_get_create_task_msg = self.get_tm_create_task_msg
                func_name_where_get_now_args = self.get_tm_now_args
            elif self.goods_spider_type == 'tb':
                tasks_params_list = self.get_tb_tasks_params_list(
                    slice_params_list=slice_params_list,
                    index=index,
                )
                func_name_where_get_create_task_msg = self.get_tb_create_task_msg
                func_name_where_get_now_args = self.get_tb_now_args
            else:
                raise NotImplementedError

            # pprint(tasks_params_list)
            one_res = await get_or_handle_target_data_by_task_params_list(
                loop=self.loop,
                tasks_params_list=tasks_params_list,
                func_name_where_get_create_task_msg=func_name_where_get_create_task_msg,
                func_name=block_get_one_goods_info_task_by_external_type,
                func_name_where_get_now_args=func_name_where_get_now_args,
                func_name_where_handle_one_res=None,
                func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res2,
                one_default_res=(),
                step=self.concurrency,
                logger=self.lg,
                get_all_res=True,
                concurrent_type=self.concurrent_type,
            )
            # pprint(one_res)
        elif self.crawl_type == CRAWL_TYPE_CELERY:
            """celery"""
            tasks = []
            if self.goods_spider_type == 'tm':
                for item in slice_params_list:
                    index += 1
                    db_goods_info_obj = TMDbGoodsInfoObj(item=item, logger=self.lg)
                    self.lg.info('创建 task goods_id: {}'.format(db_goods_info_obj.goods_id))
                    tmp_item = self.get_tm_tmp_item(
                        site_id=db_goods_info_obj.site_id,
                        goods_id=db_goods_info_obj.goods_id,
                    )
                    try:
                        async_obj = await self.create_tm_celery_obj(
                            goods_id=tmp_item,
                            index=index,
                        )
                        tasks.append(async_obj)
                    except Exception:
                        continue

                one_res = await _get_celery_async_results(tasks=tasks)
            else:
                raise NotImplementedError
        else:
            raise NotImplementedError

        res = await handle_real_times_goods_one_res(
            # eg: 'tm', 'tb'
            goods_type=self.goods_spider_type,
            loop=self.loop,
            func_name_where_update_one_goods_info_in_db=self._update_one_goods_info_in_db,
            slice_params_list=slice_params_list,
            one_res=one_res,
            logger=self.lg,
        )
        try:
            del slice_params_list
        except:
            pass

        return (res, index)

    def get_tm_tasks_params_list(self, slice_params_list: list, index: int) -> list:
        tasks_params_list = []
        for item in slice_params_list:
            try:
                db_goods_info_obj = TMDbGoodsInfoObj(item=item, logger=self.lg)
                tmp_item = self.get_tm_tmp_item(
                    site_id=db_goods_info_obj.site_id,
                    goods_id=db_goods_info_obj.goods_id,
                )
                tasks_params_list.append({
                    'db_goods_info_obj': db_goods_info_obj,
                    'index': index,
                    'tmp_item': tmp_item,
                })
                index += 1
            except Exception:
                self.lg.error('遇到错误[goods_id: {}]:', exc_info=True)
                continue

        return tasks_params_list

    def get_tb_tasks_params_list(self, slice_params_list: list, index: int) -> list:
        tasks_params_list = []
        for item in slice_params_list:
            try:
                db_goods_info_obj = TBDbGoodsInfoObj(item=item, logger=self.lg)
                tasks_params_list.append({
                    'db_goods_info_obj': db_goods_info_obj,
                    'index': index,
                })
                index += 1
            except Exception:
                self.lg.error('遇到错误[goods_id: {}]:', exc_info=True)
                continue

        return tasks_params_list

    @staticmethod
    def get_tm_create_task_msg(k) -> str:
        return 'create task[where is goods_id: {}, index: {}] ...'.format(
            k['db_goods_info_obj'].goods_id,
            k['index'],
        )

    @staticmethod
    def get_tb_create_task_msg(k) -> str:
        return 'create task[where is goods_id: {}, index: {}] ...'.format(
            k['db_goods_info_obj'].goods_id,
            k['index'],
        )

    def get_tm_now_args(self, k) -> list:
        return [
            'tm',
            k['tmp_item'],
            k['index'],
            self.lg,
        ]

    def get_tb_now_args(self, k) -> list:
        return [
            'tb',
            k['db_goods_info_obj'].goods_id,
            k['index'],
            self.lg,
        ]

    async def _update_one_goods_info_in_db(self,
                                           db_goods_info_obj,
                                           index,
                                           before_goods_data,
                                           end_goods_data) -> (list, tuple):
        """
        Update a single goods record.
        :param db_goods_info_obj:
        :param index:
        :param before_goods_data:
        :param end_goods_data:
        :return:
        """
        res = False
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            db_conn_type=self.db_conn_type,
            remainder=25)
        if self.sql_cli.is_connect_success:
            self.lg.info('*' * 20 + ' updating goods_id: {}, index: {} ...'.format(
                db_goods_info_obj.goods_id,
                index,
            ))
            # read is_delete first so the empty-data branch below does not sleep for already-delisted goods
            before_goods_data_is_delete = before_goods_data.get('is_delete', 0)
            if end_goods_data != {}:
                data = get_goods_info_change_data(
                    # eg: 'tm', 'tb'
                    target_short_name=self.goods_spider_type,
                    logger=self.lg,
                    data=end_goods_data,
                    db_goods_info_obj=db_goods_info_obj,
                    sql_cli=self.sql_cli,
                )
                res = to_right_and_update_data_by_goods_type(
                    goods_type=self.goods_spider_type,
                    data=data,
                    pipeline=self.sql_cli,
                    logger=self.lg,
                )
            else:
                # the returned data is empty
                if before_goods_data_is_delete == 1:
                    # goods found to be off the shelf: also mark res as True
                    res = True
                else:
                    self.lg.info('goods_id: {}, 阻塞休眠7s中...'.format(
                        db_goods_info_obj.goods_id,
                    ))
                    await async_sleep(delay=7., loop=self.loop)
                    # switching to a blocking sleep would hang the machine
                    # sleep(7.)
        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=8, loop=self.loop)

        collect()

        return [db_goods_info_obj.goods_id, res]

    async def create_tm_celery_obj(self, **kwargs):
        """
        Create a celery async obj.
        :param kwargs:
        :return:
        """
        goods_id = kwargs.get('goods_id', [])
        index = kwargs['index']
        async_obj = _get_tm_one_goods_info_task.apply_async(
            args=[
                goods_id,
                index,
            ],
            expires=5 * 60,
            retry=False,
        )

        return async_obj

    @staticmethod
    def get_tm_tmp_item(site_id, goods_id):
        tmp_item = []
        # when reading from the db, convert to the corresponding type first
        if site_id == 3:
            tmp_item.append(0)
        elif site_id == 4:
            tmp_item.append(1)
        elif site_id == 6:
            tmp_item.append(2)

        tmp_item.append(goods_id)

        return tmp_item

    @staticmethod
    def get_jd_tmp_item(site_id, goods_id):
        tmp_item = []
        # when reading from the db, convert to the corresponding type first
        if site_id == 7 \
                or site_id == 8:
            tmp_item.append(0)
        elif site_id == 9:
            tmp_item.append(1)
        elif site_id == 10:
            tmp_item.append(2)

        tmp_item.append(goods_id)

        return tmp_item

    async def _except_sleep(self, res):
        """
        Sleep on abnormal failure rates.
        :param res:
        :return:
        """
        count = 0
        all_count_fail_sleep_time = 100.
        # originally 40.; no sleep here
        sleep_time = 0.
        # pprint(res)
        for item in res:
            try:
                if not item[1]:
                    count += 1
            except IndexError:
                pass

        self.lg.info('Fail count: {}个, 并发量: {}个'.format(count, self.concurrency))
        if count / self.concurrency >= .96:
            # sleep strategy when (almost) everything failed
            self.lg.info('抓取异常!! 休眠{}s中...'.format(all_count_fail_sleep_time))
            await async_sleep(all_count_fail_sleep_time)
        else:
            if count >= int(self.concurrency / 5):
                self.lg.info('抓取异常!! 休眠{}s中...'.format(sleep_time))
                await async_sleep(sleep_time)

        return None

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.sql_cli
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
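The `site_id` branches in `get_tm_tmp_item` / `get_jd_tmp_item` encode a fixed mapping (3/4/6 map to 0/1/2 for Tmall, 7-8/9/10 map to 0/1/2 for JD). A table-driven variant of the same mapping, shown only as an illustrative alternative rather than the project's code:

# Same site_id -> platform-index mapping as the if/elif chains above,
# expressed as lookup tables (illustrative refactor).
TM_SITE_ID_MAP = {3: 0, 4: 1, 6: 2}
JD_SITE_ID_MAP = {7: 0, 8: 0, 9: 1, 10: 2}


def get_tmp_item(site_id_map: dict, site_id: int, goods_id: str) -> list:
    # Unknown site_ids add no prefix, matching the original behavior of
    # appending nothing when no branch matches.
    tmp_item = []
    if site_id in site_id_map:
        tmp_item.append(site_id_map[site_id])
    tmp_item.append(goods_id)
    return tmp_item


# eg: get_tmp_item(TM_SITE_ID_MAP, site_id=3, goods_id='523456789') -> [0, '523456789']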
def run_forever():
    #### real-time data update
    while True:
        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # manage the db connection pool with sqlalchemy
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server.select_taobao_all_goods_id()
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            index = 1
            for item in result:  # update data in real time
                data = {}
                taobao = TaoBaoLoginAndParse()
                if index % 50 == 0:  # reconnect every 50 items to avoid a single long-lived connection going unresponsive
                    print('正在重置,并与数据库建立新连接中...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    taobao.get_goods_data(item[0])
                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        # print('------>>>| 爬取到的数据为: ', data)

                        '''
                        set the last-refreshed shelf/off-shelf status times for the goods
                        '''
                        # 1. is_delete 0 -> 1 records the off-shelf time down_time
                        # 2. is_delete 1 -> 0 records the on-shelf time shelf_time
                        my_shelf_and_down_time = {
                            'shelf_time': '',
                            'down_time': '',
                        }
                        if data['is_delete'] != item[1]:
                            if data['is_delete'] == 0 and item[1] == 1:
                                # is_delete goes 0 -> 1: the goods went from on-shelf to off-shelf
                                my_shelf_and_down_time['down_time'] = str(get_shanghai_time())
                            else:
                                # is_delete goes 1 -> 0: the goods went from off-shelf back to on-shelf
                                my_shelf_and_down_time['shelf_time'] = str(get_shanghai_time())
                        else:
                            if item[2] is None \
                                    or item[2] == '{"shelf_time": "", "down_time": ""}' \
                                    or len(item[2]) == 35:  # 35 is the length of that initial placeholder str
                                if data['is_delete'] == 0:
                                    # on-shelf state
                                    my_shelf_and_down_time['shelf_time'] = str(get_shanghai_time())
                                else:
                                    # off-shelf state
                                    my_shelf_and_down_time['down_time'] = str(get_shanghai_time())
                            else:
                                # otherwise keep the stored value unchanged
                                tmp_shelf_and_down_time = item[2]
                                my_shelf_and_down_time = json.loads(tmp_shelf_and_down_time)  # convert to dict first

                        data['my_shelf_and_down_time'] = my_shelf_and_down_time
                        # print(my_shelf_and_down_time)

                        taobao.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:
                        pass
                else:  # db connection was not established
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                # try:
                #     del taobao
                # except:
                #     pass
                gc.collect()
                # on an overseas server this can be shortened, even set to 0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # not too frequent; try to stagger with user requests
            print('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
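The inline shelf/down-time bookkeeping above is the same state-transition logic that later versions wrap in helpers such as `get_my_shelf_and_down_time_and_delete_time`. Extracted into a standalone function it reads like the sketch below; the branching and the 35-character sentinel check are taken directly from the loop above, but the function itself is an illustrative rewrite, not the project's helper.

import json


def build_shelf_and_down_time(new_is_delete: int,
                              old_is_delete: int,
                              old_json_str,
                              now_str: str) -> dict:
    """Return {'shelf_time': ..., 'down_time': ...} following the rules in run_forever above:
    a changed is_delete records one of the two timestamps, an unchanged state keeps
    the stored JSON unless it is still the empty/initial placeholder."""
    times = {'shelf_time': '', 'down_time': ''}
    if new_is_delete != old_is_delete:
        if new_is_delete == 0 and old_is_delete == 1:
            # treated in run_forever as the on-shelf -> off-shelf transition
            times['down_time'] = now_str
        else:
            # treated as the off-shelf -> on-shelf transition
            times['shelf_time'] = now_str
    else:
        is_placeholder = (
            old_json_str is None
            or old_json_str == '{"shelf_time": "", "down_time": ""}'
            or len(old_json_str) == 35)  # 35 == length of the initial placeholder string
        if is_placeholder:
            if new_is_delete == 0:
                times['shelf_time'] = now_str
            else:
                times['down_time'] = now_str
        else:
            # keep the stored value unchanged
            times = json.loads(old_json_str)
    return times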
def run_forever():
    #### real-time data update
    while True:
        # ** must not be a global created outside the loop, otherwise every run keeps logging to the same file
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)

        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # manage the db connection pool with sqlalchemy
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server._select_table(sql_str=tb_select_str_3,)
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            index = 1
            for item in result:  # update data in real time
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:  # reconnect every 50 items to avoid a single long-lived connection going unresponsive
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlPools()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (item[0], str(index)))
                    oo = taobao.get_goods_data(item[0])
                    oo_is_delete = oo.get('is_delete', 0)  # read first so the empty-data branch below does not sleep for already-delisted goods
                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            shelf_time=item[4],
                            delete_time=item[5])
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[2],
                            old_taobao_price=item[3],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'])

                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[6]),
                                site_id=1)
                        except AttributeError:
                            # handle values that have already been formatted
                            old_sku_info = item[6]
                        data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=format_price_info_list(data['price_info_list'], site_id=1),
                            is_price_change=item[7] if item[7] is not None else 0)

                        taobao.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:
                        if oo_is_delete == 1:
                            pass
                        else:
                            my_lg.info('------>>>| 休眠5s中...')
                            sleep(4)
                else:  # db connection was not established
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(10)
                    pass
                index += 1
                gc.collect()
                # on an overseas server this can be shortened, even set to 0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # not too frequent; try to stagger with user requests
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()
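`_get_price_change_info` and `get_sku_info_trans_record` are project helpers whose bodies are not shown in this section; from the call sites above they return `(flag, info)` pairs. Purely as a hypothetical illustration of that contract (the field names below are invented, not the project's):

def price_change_info_sketch(old_price, old_taobao_price, new_price, new_taobao_price):
    # Hypothetical stand-in mirroring the (flag, info) return shape used above;
    # the real _get_price_change_info may record different fields.
    changed = (old_price != new_price) or (old_taobao_price != new_taobao_price)
    info = {
        'old_price': old_price,
        'new_price': new_price,
        'old_taobao_price': old_taobao_price,
        'new_taobao_price': new_taobao_price,
    } if changed else {}
    return int(changed), info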