async def _update_one_goods_info_in_db(self, db_goods_info_obj, index, before_goods_data, end_goods_data) -> list:
    """
    Update a single goods record in the db.

    Bug fix: the original return annotation was ``-> (list, tuple)`` — a tuple
    of types is not a meaningful annotation; the function always returns a list.

    :param db_goods_info_obj: db row wrapper for the goods being refreshed
    :param index: loop index, drives db-connection rotation
    :param before_goods_data: goods data currently stored in the db
    :param end_goods_data: freshly crawled goods data ({} means the crawl came back empty)
    :return: [goods_id, bool success]
    """
    res = False
    # Rotate the db connection every `remainder` iterations to avoid stale connections.
    self.sql_cli = await _get_new_db_conn(
        db_obj=self.sql_cli,
        index=index,
        logger=self.lg,
        db_conn_type=self.db_conn_type,
        remainder=25)
    if self.sql_cli.is_connect_success:
        self.lg.info('*' * 20 + ' updating goods_id: {}, index: {} ...'.format(
            db_goods_info_obj.goods_id,
            index,))
        # Read the old off-shelf flag up-front so an empty crawl below can
        # still be classified correctly.
        before_goods_data_is_delete = before_goods_data.get('is_delete', 0)
        if end_goods_data != {}:
            data = get_goods_info_change_data(
                # eg: 'tm', 'tb'
                target_short_name=self.goods_spider_type,
                logger=self.lg,
                data=end_goods_data,
                db_goods_info_obj=db_goods_info_obj,
                sql_cli=self.sql_cli,)
            res = to_right_and_update_data_by_goods_type(
                goods_type=self.goods_spider_type,
                data=data,
                pipeline=self.sql_cli,
                logger=self.lg,)
        else:
            # Crawl returned no data.
            if before_goods_data_is_delete == 1:
                # Goods already off-shelf: treat the update as successful.
                res = True
            else:
                self.lg.info('goods_id: {}, 阻塞休眠7s中...'.format(
                    db_goods_info_obj.goods_id,))
                # Must be an async sleep — a blocking sleep here would stall
                # the whole event loop.
                await async_sleep(delay=7., loop=self.loop)
    else:
        self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
        await async_sleep(delay=8, loop=self.loop)

    collect()

    return [db_goods_info_obj.goods_id, res]
async def _update_one_goods_info(self, db_goods_info_obj, index):
    """
    Update a single jd goods record.

    Fixes: removed the dead local ``res`` (assigned but never used or
    returned), the empty ``else: pass`` branches, and a copy-pasted
    comment that mislabeled the db-failure branch as the empty-data case.

    :param db_goods_info_obj: db row wrapper for the goods being refreshed
    :param index: loop index, drives crawler/db-connection rotation
    :return: (goods_id, next index)
    """
    await self._get_new_jd_obj(index=index)
    self.sql_cli = await _get_new_db_conn(db_obj=self.sql_cli, index=index, logger=self.lg)
    if self.sql_cli.is_connect_success:
        self.lg.info(
            '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
            format(db_goods_info_obj.goods_id, index))
        tmp_item = await self._get_tmp_item(
            site_id=db_goods_info_obj.site_id,
            goods_id=db_goods_info_obj.goods_id,)
        data = self.jd.get_goods_data(goods_id=tmp_item)
        if data.get('is_delete', 1) == 1:
            # Goods is off-shelf: record the delete time and bail out early.
            self.lg.info('该商品已下架...')
            self.sql_cli._update_table_2(
                sql_str=jd_update_str_2,
                params=(str(get_shanghai_time()), tmp_item[1],),
                logger=self.lg)
            await async_sleep(1.2)
            index += 1
            self.goods_index = index
            return db_goods_info_obj.goods_id, index

        data = self.jd.deal_with_data(goods_id=tmp_item)
        if data != {}:
            data = get_goods_info_change_data(
                target_short_name='jd',
                logger=self.lg,
                data=data,
                db_goods_info_obj=db_goods_info_obj,)
            self.jd.to_right_and_update_data(data, pipeline=self.sql_cli)
        # else: parse returned no data — nothing to persist this round
    else:
        self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

    index += 1
    self.goods_index = index
    collect()
    # Pace the requests to avoid proxy detection.
    await async_sleep(1.2)

    return db_goods_info_obj.goods_id, index
async def _update_one_goods_info_by_celery(self, db_goods_info_obj, index, before_goods_data, end_goods_data):
    """
    Update a single goods record from celery-crawled tmall data.

    :param db_goods_info_obj: db row wrapper for the goods being refreshed
    :param index: loop index, drives db-connection rotation
    :param before_goods_data: goods data currently stored in the db
    :param end_goods_data: freshly crawled goods data ({} when the crawl failed)
    :return: [goods_id, bool success]
    """
    update_ok = False
    self.sql_cli = await _get_new_db_conn(
        db_obj=self.sql_cli,
        index=index,
        logger=self.lg,
        remainder=50)
    if not self.sql_cli.is_connect_success:
        self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
        await async_sleep(delay=5, loop=self.loop)
    else:
        self.lg.info('### updating goods_id: {}, index: {} ...'.format(
            db_goods_info_obj.goods_id,
            index,))
        # Capture the stored off-shelf flag first, so an empty crawl
        # below can still be classified correctly.
        was_deleted = before_goods_data.get('is_delete', 0)
        if end_goods_data != {}:
            changed = get_goods_info_change_data(
                target_short_name='tm',
                logger=self.lg,
                data=end_goods_data,
                db_goods_info_obj=db_goods_info_obj,)
            update_ok = to_right_and_update_tm_data(
                data=changed,
                pipeline=self.sql_cli,
                logger=self.lg)
        elif was_deleted == 1:
            # Goods already off-shelf: count the update as successful.
            update_ok = True
        else:
            self.lg.info('goods_id: {}, 阻塞休眠7s中...'.format(
                db_goods_info_obj.goods_id,))
            # Async sleep — a blocking sleep would stall the event loop.
            await async_sleep(delay=7., loop=self.loop)

    await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)
    collect()

    return [db_goods_info_obj.goods_id, update_ok]
async def _update_one_goods_info(self, db_goods_info_obj, index):
    """
    Refresh a single taobao goods record.

    :param db_goods_info_obj: db row wrapper for the goods being refreshed
    :param index: loop index, drives crawler/db-connection rotation
    :return: [goods_id, bool success]
    """
    succeeded = False
    await self._get_new_tb_obj(index=index)
    self.sql_cli = await _get_new_db_conn(
        db_obj=self.sql_cli,
        index=index,
        logger=self.lg,
        db_conn_type=2,
        remainder=50)
    if not self.sql_cli.is_connect_success:
        self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
        await async_sleep(delay=10, loop=self.loop)
    else:
        self.lg.info(
            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                db_goods_info_obj.goods_id, str(index)))
        raw = self.taobao.get_goods_data(goods_id=db_goods_info_obj.goods_id)
        # Grab the off-shelf flag before parsing so an empty parse result
        # below can still be classified correctly.
        raw_is_delete = raw.get('is_delete', 0)
        parsed = self.taobao.deal_with_data(goods_id=db_goods_info_obj.goods_id)
        if parsed == {}:
            if raw_is_delete == 1:
                # Goods already off-shelf: count as a successful update.
                succeeded = True
            else:
                self.lg.info('------>>>| 休眠8s中...')
                await async_sleep(delay=8, loop=self.loop)
        else:
            parsed = get_goods_info_change_data(
                target_short_name='tb',
                logger=self.lg,
                data=parsed,
                db_goods_info_obj=db_goods_info_obj,)
            succeeded = to_right_and_update_tb_data(
                data=parsed,
                pipeline=self.sql_cli,
                logger=self.lg)

    index += 1
    self.goods_index = index
    collect()
    # Keep requests spaced out so they interleave with normal user traffic
    # (can be shortened / set to 0 on overseas servers).
    await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

    return [db_goods_info_obj.goods_id, succeeded]
async def _update_one_goods_info(self, db_goods_info_obj, index):
    """
    Refresh a single zhe800 goods record.

    :param db_goods_info_obj: db row wrapper for the goods being refreshed
    :param index: loop index, drives crawler/db-connection rotation
    :return: ['goods_id', bool success]
    """
    succeeded = False
    await self._get_new_ali_obj(index=index)
    self.sql_cli = await _get_new_db_conn(
        db_obj=self.sql_cli,
        index=index,
        logger=self.lg)
    if not self.sql_cli.is_connect_success:
        self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
    else:
        self.lg.info('------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(
            db_goods_info_obj.goods_id, index))
        self.zhe_800.get_goods_data(goods_id=db_goods_info_obj.goods_id)
        parsed = self.zhe_800.deal_with_data()
        if parsed != {}:
            parsed = get_goods_info_change_data(
                target_short_name='z8',
                logger=self.lg,
                data=parsed,
                db_goods_info_obj=db_goods_info_obj,)
            succeeded = self.zhe_800.to_right_and_update_data(
                data=parsed,
                pipeline=self.sql_cli)
        # else: parse returned no data — leave succeeded as False

    index += 1
    self.goods_index = index
    collect()
    await async_sleep(2.)

    return [db_goods_info_obj.goods_id, succeeded]
def run_forever():
    """
    Endlessly refresh mia goods data, re-creating the logger on each pass.

    Fixes: bare ``except:`` clauses narrowed to ``except NameError``; the
    ``if result is None: pass / else:`` anti-pattern inverted into a guard;
    dangling ``pass`` statements removed.
    """
    while True:
        # Must be re-created inside the loop — a module-level logger would
        # keep writing into the same dated file forever.
        my_lg = set_logger(
            logger_name=get_uuid1(),
            log_file_name=MY_SPIDER_LOGS_PATH + '/蜜芽/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,)

        # Real-time data update.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=mia_select_str_5))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            mia = MiaParse()
            for item in result:
                goods_id = item[1]
                if index % 5 == 0:
                    # Periodically recreate the parser to release memory.
                    try:
                        del mia
                    except NameError:
                        pass
                    mia = MiaParse()
                    collect()
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    logger=my_lg,
                    remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                            str(goods_id), str(index)))
                    mia.get_goods_data(goods_id=goods_id)
                    data = mia.deal_with_data()
                    db_goods_info_obj = MIADbGoodsInfoObj(item=item, logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # Off-shelf goods handled on a separate path.
                            my_lg.info('@@@ 该商品已下架...')
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,)
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            # NOTE(review): index is not incremented on this
                            # path — confirm this is intentional.
                            continue
                        else:
                            data = get_goods_info_change_data(
                                target_short_name='mia',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,)
                            mia._to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # Crawl/parse returned no data.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        try:
            del my_lg
        except NameError:
            pass
        collect()
def run_forever():
    """
    Endlessly refresh kaola goods data, re-creating the logger on each pass.

    Bug fix: ``my_lg.error(exc_info=True)`` raised ``TypeError`` at runtime
    because ``Logger.error`` requires a positional message argument — a
    message is now supplied. Also: bare ``except:`` narrowed to
    ``except NameError``; ``if result is None: pass / else:`` inverted into
    a guard; dangling ``pass`` removed.
    """
    while True:
        # Must be re-created inside the loop — a module-level logger would
        # keep writing into the same dated file forever.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易考拉/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)

        # Real-time data update.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=kl_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            # Declared here (not at module scope) and periodically recreated
            # to keep memory usage down.
            kaola = KaoLaParse(logger=my_lg, is_real_times_update_call=True)
            for item in result:
                goods_id = item[1]
                if index % 5 == 0:
                    try:
                        del kaola
                    except NameError:
                        pass
                    kaola = KaoLaParse(logger=my_lg, is_real_times_update_call=True)
                    collect()
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    logger=my_lg,
                    remainder=10,)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                            str(goods_id), str(index)))
                    db_goods_info_obj = KLDbGoodsInfoObj(item=item, logger=my_lg)
                    data = kaola._get_goods_data(goods_id=goods_id)
                    if data.get('is_delete', 0) == 1:
                        # Goods already off-shelf: persist shelf/delete times
                        # and move on.
                        data['goods_id'] = goods_id
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=db_goods_info_obj.is_delete,
                            shelf_time=db_goods_info_obj.shelf_time,
                            delete_time=db_goods_info_obj.delete_time,)
                        try:
                            kaola.to_right_and_update_data(data, pipeline=sql_cli)
                        except Exception:
                            # Bug fix: Logger.error needs a msg argument;
                            # the bare exc_info=True call raised TypeError.
                            my_lg.error('更新数据失败!', exc_info=True)
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        index += 1
                        collect()
                        continue

                    data = kaola._deal_with_data()
                    if data != {}:
                        if data.get('is_delete', 0) == 1:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,)
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            # NOTE(review): index is not incremented on this
                            # path — confirm this is intentional.
                            continue
                        else:
                            data = get_goods_info_change_data(
                                target_short_name='kl',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,)
                            kaola.to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # Crawl/parse returned no data.
                        my_lg.info('------>>>| 休眠3s中...')
                        sleep(3.)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight.
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
async def _update_one_goods_info(self, db_goods_info_obj, index):
    """
    Update a single tmall goods record.

    Fixes: removed a large commented-out dead-code block (the non-blocking
    ``unblock_func`` variant) and the pointless ``try/except: pass`` around
    ``del tmall`` — ``tmall`` is always bound at that point.

    :param db_goods_info_obj: db row wrapper for the goods being refreshed
    :param index: loop index, drives db-connection rotation
    :return: [goods_id, bool success]
    """
    res = False
    tmall = TmallParse(logger=self.lg)
    self.sql_cli = await _get_new_db_conn(
        db_obj=self.sql_cli,
        index=index,
        logger=self.lg,
        remainder=50,)
    if self.sql_cli.is_connect_success:
        self.lg.info(
            '------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'.
            format(db_goods_info_obj.goods_id, index))
        tmp_item = self._get_tmp_item(
            site_id=db_goods_info_obj.site_id,
            goods_id=db_goods_info_obj.goods_id)
        # Blocking crawl call.
        oo = tmall.get_goods_data(goods_id=tmp_item)
        # Read the off-shelf flag before parsing so an empty parse result
        # below can still be classified correctly.
        before_goods_data_is_delete = oo.get('is_delete', 0)
        data = tmall.deal_with_data()
        if data != {}:
            data = get_goods_info_change_data(
                target_short_name='tm',
                logger=self.lg,
                data=data,
                db_goods_info_obj=db_goods_info_obj,)
            res = to_right_and_update_tm_data(
                data=data,
                pipeline=self.sql_cli,
                logger=self.lg)
        else:
            if before_goods_data_is_delete == 1:
                # Goods already off-shelf: count as a successful update.
                res = True
            else:
                self.lg.info('------>>>| 阻塞休眠7s中...')
                # Must be an async sleep — a blocking sleep would stall
                # the whole event loop.
                await async_sleep(delay=7., loop=self.loop)
    else:
        self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
        await async_sleep(delay=5, loop=self.loop)

    # `tmall` is always bound here, so no try/except guard is needed.
    del tmall
    collect()
    await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)

    return [
        db_goods_info_obj.goods_id,
        res,
    ]
def run_forever():
    """
    Endlessly refresh yanxuan goods data, re-creating the logger on each pass.

    Fixes: bare ``except:`` narrowed to ``except NameError``; the
    ``if result is None: pass / else:`` anti-pattern inverted into a guard;
    dangling ``pass`` removed.
    """
    while True:
        # Must be re-created inside the loop — a module-level logger would
        # keep writing into the same dated file forever.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,)

        # Real-time data update.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=yx_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            # Declared here (not at module scope) and periodically recreated
            # to keep memory usage down.
            yanxuan = YanXuanParse(logger=my_lg)
            for item in result:
                if index % 5 == 0:
                    try:
                        del yanxuan
                    except NameError:
                        pass
                    yanxuan = YanXuanParse(logger=my_lg)
                    collect()
                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    logger=my_lg,
                    remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                            str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])
                    data = yanxuan._deal_with_data()
                    db_goods_info_obj = YXDbGoodsInfoObj(item=item, logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # Off-shelf goods handled on a separate path.
                            my_lg.info('@@@ 该商品已下架...')
                            sql_cli._update_table_2(
                                sql_str=yx_update_str_2,
                                params=(db_goods_info_obj.goods_id, ),
                                logger=my_lg,)
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            # NOTE(review): index is not incremented on this
                            # path — confirm this is intentional.
                            continue
                        else:
                            data = get_goods_info_change_data(
                                target_short_name='yx',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,)
                            yanxuan.to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # Crawl/parse returned no data.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight.
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
async def _update_one_goods_info(self, db_goods_info_obj, index) -> list:
    """
    Update a single 1688 goods record.

    Bug fix: ``self.lg.error(exc_info=True)`` raised ``TypeError`` at runtime
    because ``Logger.error`` requires a positional message argument — a
    message is now supplied.

    :param db_goods_info_obj: db row wrapper for the goods being refreshed
    :param index: loop index
    :return: ['goods_id', bool success]
    """
    res = False
    await self._get_new_ali_obj(index=index)
    self.sql_cli = await _get_new_db_conn(
        db_obj=self.sql_cli,
        index=index,
        logger=self.lg)
    if self.sql_cli.is_connect_success:
        self.lg.info(
            '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
            format(db_goods_info_obj.goods_id, index))
        data = self.ali_1688.get_ali_1688_data(
            goods_id=db_goods_info_obj.goods_id)
        if isinstance(data, int):
            # Special case: the crawler returned a 4041 status code.
            self.goods_index += 1
            return [db_goods_info_obj.goods_id, res]

        if data.get('is_delete') == 1:
            # Goods that were ALREADY off-shelf when first inserted:
            # persist shelf/delete times and report success.
            data['goods_id'] = db_goods_info_obj.goods_id
            data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                tmp_data=data,
                is_delete=db_goods_info_obj.is_delete,
                shelf_time=db_goods_info_obj.shelf_time,
                delete_time=db_goods_info_obj.delete_time,)
            try:
                self.ali_1688.to_right_and_update_data(
                    data, pipeline=self.sql_cli)
            except Exception:
                # Bug fix: Logger.error needs a msg argument; the bare
                # exc_info=True call raised TypeError.
                self.lg.error('更新数据失败!', exc_info=True)
            await async_sleep(1.5)
            self.goods_index += 1
            res = True
            return [db_goods_info_obj.goods_id, res]

        data = self.ali_1688.deal_with_data()
        if data != {}:
            data = get_goods_info_change_data(
                target_short_name='al',
                logger=self.lg,
                data=data,
                db_goods_info_obj=db_goods_info_obj,)
            res = self.ali_1688.to_right_and_update_data(
                data, pipeline=self.sql_cli)
            await async_sleep(.3)
        # else: parse returned no data — leave res False
    else:
        self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

    index += 1
    self.goods_index = index
    collect()
    # Pace the requests to avoid proxy detection.
    await async_sleep(2.)

    return [db_goods_info_obj.goods_id, res]