async def _get_new_tb_obj(self, index) -> None:
    """Rebuild self.taobao every 10th call to avoid accumulating parser state.

    :param index: running task index; the parser is recreated when index % 10 == 0.
    :return: None (mutates self.taobao in place)
    """
    if index % 10 == 0:
        try:
            del self.taobao
        except AttributeError:
            # self.taobao may not exist yet (first pass) — only swallow that case,
            # not every exception as the old bare `except:` did.
            pass
        collect()
        self.taobao = TaoBaoLoginAndParse(
            logger=self.lg,
            is_real_times_update_call=True)
async def _get_new_tb_obj(self, index) -> None:
    """Rebuild self.taobao every 10th call to avoid accumulating parser state.

    :param index: running task index; the parser is recreated when index % 10 == 0.
    :return: None (mutates self.taobao in place)
    """
    if index % 10 == 0:
        try:
            del self.taobao
        except AttributeError:
            # self.taobao may not exist yet (first pass) — only swallow that case,
            # not every exception as the old bare `except:` did.
            pass
        collect()
        self.taobao = TaoBaoLoginAndParse(logger=self.lg)
async def update_expired_goods_to_normal_goods(goods_id, index, tmp_sql_server, logger):
    """Demote an expired special-offer (tiantiantejia) goods to a regular promo goods
    instead of deleting it.

    :param goods_id: taobao goods id to refresh
    :param index: running loop index; incremented once per processed goods
    :param tmp_sql_server: db pipeline passed through to the update calls
    :param logger: logger instance
    :return: the incremented index
    """
    logger.info('++++++>>>| 此为过期商品, 正在更新! |<<<++++++')
    logger.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(index)))
    taobao = TaoBaoLoginAndParse(logger=logger)
    try:
        data_before = taobao.get_goods_data(goods_id)
        if data_before.get('is_delete') == 1:
            # Goods already off the shelves: write the raw data back with an
            # empty schedule; deliberately do NOT touch the tejia time window.
            data_before['goods_id'] = goods_id
            data_before['schedule'] = []
            await taobao.update_taobao_tiantiantejia_table(data_before, pipeline=tmp_sql_server)
        else:
            goods_data = taobao.deal_with_data(goods_id=goods_id)
            if goods_data != {}:
                goods_data['goods_id'] = goods_id
                await taobao.update_expired_goods_id_taobao_tiantiantejia_table(
                    data=goods_data, pipeline=tmp_sql_server)
            else:
                await asyncio.sleep(4)  # 否则休息4秒
        await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # 避免服务器更新太频繁
        index += 1
        return index
    finally:
        # Single cleanup path replaces the old duplicated `del`/bare-except tails.
        del taobao
        gc.collect()
async def _update_db(self):
    """Continuously refresh goods data in real time.

    Each pass: grab a fresh logger, pull stale rows from the db, fan them out
    into async tasks in slices of self.concurrency, then sleep before the next
    pass (longer after midnight).
    """
    while True:
        self.lg = await self._get_new_logger()
        result = await self._get_db_old_data()
        if result is None:
            pass
        else:
            self.goods_index = 1
            # Slice the result set so at most self.concurrency tasks run per batch.
            tasks_params_list = TasksParamsListObj(
                tasks_params_list=result,
                step=self.concurrency)
            self.taobao = TaoBaoLoginAndParse(logger=self.lg)
            index = 1
            while True:
                try:
                    slice_params_list = tasks_params_list.__next__()
                except AssertionError:
                    # TasksParamsListObj signals exhaustion via AssertionError.
                    break
                tasks = []
                for item in slice_params_list:
                    db_goods_info_obj = TBDbGoodsInfoObj(item=item, logger=self.lg)
                    self.lg.info('创建 task goods_id: {}'.format(
                        db_goods_info_obj.goods_id))
                    tasks.append(
                        self.loop.create_task(
                            self._update_one_goods_info(
                                db_goods_info_obj=db_goods_info_obj,
                                index=index)))
                    index += 1
                res = await _get_async_task_result(tasks=tasks, logger=self.lg)
                # Back off according to the batch's failure rate.
                await self._except_sleep(res=res)
            self.lg.info('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            # No updates after midnight — sleep 5.5 hours.
            await async_sleep(60 * 60 * 5.5)
        else:
            await async_sleep(5.)
        try:
            del self.lg
            del result
        except:
            pass
        collect()
def _get_random_sku_info_list(self):
    """Return the deduplicated list of sku 'spec_value' strings from self.g_data,
    used by callers to pick one sku attribute at random.

    :return: list of spec_value strings (order unspecified, as before — derived from a set)
    :raises AssertionError: if self.g_data is an empty dict
    """
    assert self.g_data != {}, 'g_data为空dict'
    parser = TaoBaoLoginAndParse(logger=self.my_lg)
    # Price & stock for every sku label combination; index [1] of the helper's
    # return is the detail_value_list it expects back.
    price_info_list = parser._get_price_info_list(
        data=self.g_data,
        detail_value_list=parser._get_detail_name_and_value_list(data=self.g_data)[1])
    # `parser` is always bound here, so the old bare try/except around `del` was dead weight.
    del parser
    return list({_i.get('spec_value', '') for _i in price_info_list})
def test_tb():
    """Smoke-test TaoBaoLoginAndParse against one known goods id and pprint the parsed data."""
    goods_id = '533127076450'
    pc_url = 'https://item.taobao.com/item.htm?id={}'.format(goods_id)
    phone_url = 'https://h5.m.taobao.com/awp/core/detail.htm?id={}'.format(goods_id)
    print('pc_url: {}, phone_url: {}'.format(pc_url, phone_url))
    tb = TaoBaoLoginAndParse(is_real_times_update_call=True)
    goods_id = tb.get_goods_id_from_url(pc_url)
    # Fetch the raw data first (side effect); deal_with_data then parses it.
    # The previous `ori_data` binding was unused, so the return value is dropped.
    tb.get_goods_data(goods_id=goods_id)
    data = tb.deal_with_data(goods_id=goods_id)
    pprint(data)
    # `tb` is always bound here — no try/bare-except needed around the del.
    del tb
def get_one_tb_data(**kwargs):
    """Scrape one taobao item url and return its processed data.

    :keyword username: account recorded with the item (default '18698570079')
    :keyword tb_url: the taobao item url to scrape
    :keyword my_lg: logger instance
    :return: dict — processed item data on success;
             {'goods_id': ''} when no goods_id could be parsed;
             {'goods_id': ..., 'msg': 'data为空!'} when scraping returned nothing.
    """
    username = kwargs.get('username', '18698570079')
    tb_url = kwargs.get('tb_url', '')
    my_lg = kwargs.get('my_lg')

    login_taobao = TaoBaoLoginAndParse(logger=my_lg)
    try:
        goods_id = login_taobao.get_goods_id_from_url(tb_url)  # 获取goods_id
        if goods_id == '':
            my_lg.info('获取到的goods_id为空!')
            return {'goods_id': ''}  # 错误1: goods_id为空!
        # Canonical clean taobao item url.
        wait_to_deal_with_url = 'https://item.taobao.com/item.htm?id={0}'.format(goods_id)
        tmp_result = login_taobao.get_goods_data(goods_id=goods_id)
        data = login_taobao.deal_with_data(goods_id=goods_id)
        sleep(TAOBAO_SLEEP_TIME)  # 这个在服务器里面可以注释掉为.5s
        if data == {} or tmp_result == {}:
            my_lg.info('获取到的data为空!')
            return {'goods_id': goods_id, 'msg': 'data为空!'}  # 错误2: 抓取data为空!
        return add_base_info_2_processed_data(
            data=data,
            spider_url=wait_to_deal_with_url,
            username=username,
            goods_id=goods_id)
    finally:
        # One cleanup path instead of three duplicated `del` + bare-except tails.
        del login_taobao
        gc.collect()
async def _update_db(self):
    """Continuously refresh goods data in real time.

    Each pass pulls stale rows, processes them slice-by-slice via
    self._get_one_res, then sleeps before the next pass (longer after midnight).
    """
    while True:
        # Long-running process hit `OSError: [Errno 24] Too many open files`
        # when a new log file was opened per day, so self.lg is reused as-is.
        result = await self._get_db_old_data()
        if result is None:
            pass
        else:
            self.goods_index = 1
            tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
            self.taobao = TaoBaoLoginAndParse(logger=self.lg, is_real_times_update_call=True)
            index = 1
            while True:
                try:
                    slice_params_list = tasks_params_list.__next__()
                except AssertionError:
                    # TasksParamsListObj signals exhaustion via AssertionError.
                    break
                one_res, index = await self._get_one_res(
                    slice_params_list=slice_params_list,
                    index=index)
                # Back off according to the batch's failure rate.
                await self._except_sleep(res=one_res)
            self.lg.info('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            # No updates after midnight — sleep half an hour.
            await async_sleep(60 * 60 * .5)
        else:
            await async_sleep(5.)
        try:
            del result
        except:
            pass
        collect()
async def insert_into_table(self, tmp_item, category, current_page, sql_cli, index):
    """Insert one goods into the taobao tiantiantejia table.

    :param tmp_item: dict with 'goods_id', 'start_time', 'end_time'
    :param category: block/category id, also indexes self.main_sort
    :param current_page: page number, stored as tag_id
    :param sql_cli: db pipeline for the insert
    :param index: running index
    :return: index (incremented on normal paths)
    """
    tmp_url = 'https://item.taobao.com/item.htm?id=' + str(
        tmp_item.get('goods_id', ''))
    taobao = TaoBaoLoginAndParse(
        logger=self.lg,
        is_real_times_update_call=self.is_real_times_update_call)
    goods_id = taobao.get_goods_id_from_url(tmp_url)
    try:
        taobao.get_goods_data(goods_id=goods_id)
        goods_data = taobao.deal_with_data(goods_id=goods_id)
    except Exception:
        self.lg.error('遇到错误:', exc_info=True)
        index += 1
        return index
    if goods_data != {}:
        goods_data['goods_id'] = tmp_item.get('goods_id', '')
        goods_data['goods_url'] = tmp_url
        # Single promo window, taken from the listing item rather than the page.
        goods_data['schedule'] = [{
            'begin_time': tmp_item.get('start_time', ''),
            'end_time': tmp_item.get('end_time', ''),
        }]
        goods_data['tejia_begin_time'], goods_data[
            'tejia_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                miaosha_time=goods_data.get('schedule', [])[0])
        goods_data['block_id'] = str(category)
        goods_data['tag_id'] = str(current_page)
        goods_data['father_sort'] = self.main_sort[category][0]
        goods_data['child_sort'] = ''
        if len(goods_data['all_img_url']) <= 1:
            # NOTE(review): this branch returns WITHOUT incrementing index,
            # unlike every other exit path — confirm whether that is intended.
            self.lg.info('[goods_id: {}]主图个数<=1, pass'.format(goods_id))
            return index
        await taobao.insert_into_taobao_tiantiantejia_table(
            data=goods_data, pipeline=sql_cli)
    else:
        await async_sleep(4)  # 否则休息4秒
    index += 1
    return index
async def insert_into_table(self, tmp_item, category, current_page, my_pipeline, index):
    """Insert one goods into the taobao tiantiantejia table.

    :param tmp_item: dict with 'goods_id', 'start_time', 'end_time'
    :param category: block/category id, also indexes self.main_sort
    :param current_page: page number, stored as tag_id
    :param my_pipeline: db pipeline for the insert
    :param index: running index
    :return: index + 1
    """
    tmp_url = 'https://item.taobao.com/item.htm?id=' + str(
        tmp_item.get('goods_id', ''))
    taobao = TaoBaoLoginAndParse(logger=self.my_lg)
    goods_id = taobao.get_goods_id_from_url(tmp_url)
    taobao.get_goods_data(goods_id=goods_id)
    goods_data = taobao.deal_with_data(goods_id=goods_id)
    if goods_data != {}:
        goods_data['goods_id'] = tmp_item.get('goods_id', '')
        goods_data['goods_url'] = tmp_url
        # Single promo window, taken from the listing item rather than the page.
        goods_data['schedule'] = [{
            'begin_time': tmp_item.get('start_time', ''),
            'end_time': tmp_item.get('end_time', ''),
        }]
        goods_data['tejia_begin_time'], goods_data[
            'tejia_end_time'] = await self.get_tejia_begin_time_and_tejia_end_time(
                schedule=goods_data.get('schedule', [])[0])
        goods_data['block_id'] = str(category)
        goods_data['tag_id'] = str(current_page)
        goods_data['father_sort'] = self.main_sort[category][0]
        goods_data['child_sort'] = ''
        await taobao.insert_into_taobao_tiantiantejia_table(
            data=goods_data, pipeline=my_pipeline)
    else:
        await asyncio.sleep(4)  # 否则休息4秒
        pass
    index += 1
    return index
def run_forever():
    """Real-time updater: loop forever over all goods ids in the db,
    re-scrape each one and write back data plus shelf/down timestamps."""
    while True:
        tmp_sql_server = SqlPools()  # sqlalchemy-managed connection pool
        try:
            result = tmp_sql_server.select_taobao_all_goods_id()
        except TypeError as e:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                data = {}
                taobao = TaoBaoLoginAndParse()
                if index % 50 == 0:
                    # Reconnect every 50 items to avoid a stale long-lived connection.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlPools()
                    print('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    taobao.get_goods_data(item[0])
                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        # Shelf/down timestamps for the last refresh:
                        # is_delete 0->1 records a down_time, 1->0 a shelf_time.
                        my_shelf_and_down_time = {
                            'shelf_time': '',
                            'down_time': '',
                        }
                        if data['is_delete'] != item[1]:
                            if data['is_delete'] == 0 and item[1] == 1:
                                # Went from off-shelf back to on-shelf? No:
                                # 0->1 here means on-shelf became off-shelf.
                                my_shelf_and_down_time['down_time'] = str(
                                    get_shanghai_time())
                            else:
                                # Off-shelf became on-shelf.
                                my_shelf_and_down_time['shelf_time'] = str(
                                    get_shanghai_time())
                        else:
                            if item[2] is None or item[
                                    2] == '{"shelf_time": "", "down_time": ""}' or len(
                                        item[2]) == 35:  # 35 is the length of that initial placeholder str
                                if data['is_delete'] == 0:
                                    # Currently on shelf.
                                    my_shelf_and_down_time['shelf_time'] = str(
                                        get_shanghai_time())
                                else:
                                    # Currently off shelf.
                                    my_shelf_and_down_time['down_time'] = str(
                                        get_shanghai_time())
                            else:
                                # Keep the previously stored value unchanged.
                                tmp_shelf_and_down_time = item[2]
                                my_shelf_and_down_time = json.loads(
                                    tmp_shelf_and_down_time)
                        data['my_shelf_and_down_time'] = my_shelf_and_down_time
                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                    else:
                        pass
                else:
                    # Empty/failed connection — skip this item.
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                # Throttle to stay out of the way of user-facing traffic;
                # can be shortened (even 0s) on an overseas server.
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            # No updates after midnight — sleep 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
def deal_with_data(self):
    """Extract the required fields from self.result_data (raw tmall/taobao api payload).

    :return: dict of normalized goods fields, or {} when the payload is empty
             or the pc description could not be fetched.
    """
    data = self.result_data
    if data != {}:
        taobao = TaoBaoLoginAndParse(logger=self.my_lg)
        goods_id = data['goods_id']
        # Tmall type; 33 marks "could not be determined".
        tmall_type = data.get('type', 33)
        # Shop name — the 'shopName' field may be absent.
        shop_name = data['seller'].get('shopName', '')
        # Shopkeeper.
        account = data['seller'].get('sellerNick', '')
        # Goods title.
        title = data['item']['title']
        # Subtitle, with embedded newlines stripped.
        sub_title = data['item'].get('subtitle', '')
        sub_title = re.compile(r'\n').sub('', sub_title)
        # Price text, e.g. "12.00" or "12.00-15.00" for a range.
        tmp_taobao_price = data['apiStack'][0].get(
            'value', '').get('price').get('price').get('priceText', '')
        tmp_taobao_price = tmp_taobao_price.split('-')
        if len(tmp_taobao_price) == 1:
            # Single price: highest and lowest are the same.
            # (Kept as str; Decimal is applied at storage time since json can't encode it.)
            price = tmp_taobao_price[0]
            taobao_price = price
        else:
            # Range: [low, high] — price is the high end, taobao_price the low end.
            price = tmp_taobao_price[1]
            taobao_price = tmp_taobao_price[0]
        # Goods stock.
        goods_stock = data['apiStack'][0]['value'].get('skuCore', {}).get(
            'sku2info', {}).get('0', {}).get('quantity', '')
        # Sku label names and their corresponding value ids.
        detail_name_list, detail_value_list = taobao._get_detail_name_and_value_list(
            data=data)
        # Price and stock for every label combination.
        price_info_list = taobao._get_price_info_list(
            data=data, detail_value_list=detail_value_list)
        # All sample image urls.
        all_img_url = taobao._get_all_img_url(
            tmp_all_img_url=data['item']['images'])
        # Detailed properties, e.g. [{'内存容量': '32GB'}, ...].
        p_info = taobao._get_p_info(tmp_p_info=data.get('props').get(
            'groupProps'))
        if p_info != []:
            p_info = [{
                'id': 0,
                'name': _i.get('p_name', ''),
                'value': _i.get('p_value', ''),
            } for _i in p_info]
        # div_desc: mobile description url (currently unused downstream here).
        if data.get('item', {}).get('taobaoDescUrl') is not None:
            phone_div_url = 'https:' + data['item']['taobaoDescUrl']
        else:
            phone_div_url = ''
        # pc description url — the div_desc body is fetched from it.
        if data.get('item', {}).get('taobaoPcDescUrl') is not None:
            pc_div_url = 'https:' + data['item']['taobaoPcDescUrl']
            div_desc = taobao.get_div_from_pc_div_url(pc_div_url, goods_id)
            if div_desc == '':
                # Treat a missing description as a failed parse.
                self.my_lg.error('该商品的div_desc为空! 出错goods_id: %s' % str(goods_id))
                self.result_data = {}
                return {}
                # self.driver.quit() gc.collect()
        else:
            pc_div_url = ''
            div_desc = ''
        # Post-processing: reshape detail_name_list.
        detail_name_list = [{'spec_name': i[0]} for i in detail_name_list]
        # Sku label values (names only), rebuilt from skuBase when present.
        if data.get('skuBase').get('props') is None:
            pass
        else:
            tmp_detail_value_list = [
                item['values']
                for item in data.get('skuBase', '').get('props', '')
            ]
            detail_value_list = []
            for item in tmp_detail_value_list:
                tmp = [i['name'] for i in item]
                detail_value_list.append(tmp)
        is_delete = self._get_is_delete(data=data, title=title)
        if is_delete == 1:
            self.my_lg.info('@@@ 该商品已下架...')
        # Monthly sales; defaults to '0' when apiStack is empty/malformed.
        try:
            sell_count = str(
                data.get('apiStack', [])[0].get('value', {}).get('item', {}).get('sellCount', ''))
        except:
            sell_count = '0'
        try:
            del taobao
        except:
            pass
        result = {
            'shop_name': shop_name,              # shop name
            'account': account,                  # shopkeeper
            'title': title,                      # goods title
            'sub_title': sub_title,              # subtitle
            'price': price,                      # high price
            'taobao_price': taobao_price,        # low (taobao) price
            'goods_stock': goods_stock,          # stock
            'detail_name_list': detail_name_list,    # sku label names
            'detail_value_list': detail_value_list,  # sku label values
            'price_info_list': price_info_list,  # price/stock per label combo
            'all_img_url': all_img_url,          # sample image urls
            'p_info': p_info,                    # detailed properties
            'pc_div_url': pc_div_url,            # pc description url
            'div_desc': div_desc,                # description html
            'sell_count': sell_count,            # monthly sales
            'is_delete': is_delete,              # off-shelf flag
            'type': tmall_type,                  # tmall type
        }
        gc.collect()
        return result
    else:
        self.my_lg.info('待处理的data为空的dict, 该商品可能已经转移或者下架')
        return {}
async def run_forever():
    """Real-time tiantiantejia updater: expire-and-delist past-deadline goods,
    re-scrape the rest, then sleep until the next pass.

    :return: True on a completed pass, None when the db select failed.
    """
    # Must NOT be a global reused across loops, otherwise every day logs to
    # the same file and per-day log rotation breaks.
    lg = set_logger(
        logger_name=get_uuid1(),
        log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/天天特价/' + str(get_shanghai_time())[0:10] + '.txt',
        console_log_level=INFO,
        file_log_level=ERROR)
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    # Off-shelf goods are not processed, hence the select targets is_delete=0.
    try:
        # todo: expired rows are not purged here because the backend does not
        # sync the delisting, which would make them unqueryable.
        result = list(tmp_sql_server._select_table(sql_str=tb_select_str_7))
    except TypeError:
        lg.error('TypeError错误, 导致原因: 数据库连接失败...(可能维护中)')
        return None
    await _print_db_old_data(
        result=result,
        logger=lg,
    )
    index = 1
    for item in result:
        goods_id = item[0]
        tejia_end_time = item[2]
        tmp_sql_server = await _get_new_db_conn(
            db_obj=tmp_sql_server,
            index=index,
            logger=lg,
            db_conn_type=1,
        )
        if tmp_sql_server.is_connect_success:
            if tejia_end_time < get_shanghai_time():
                # Past the promo deadline: delist directly (demoting to a
                # normal promo goods via update_expired_goods_to_normal_goods
                # was the earlier behavior and is intentionally disabled).
                lg.info('@@ 过期下架[goods_id: {}]'.format(goods_id))
                _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=lg,
                    update_sql_str=tb_update_str_5,
                )
                index += 1
            else:
                # Regular tiantiantejia refresh. The promo time window is NOT
                # updated: tiantiantejia never delists goods early, so the
                # stored window stays valid.
                lg.info(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                    % (goods_id, str(index)))
                taobao = TaoBaoLoginAndParse(
                    logger=lg,
                    is_real_times_update_call=is_real_times_update_call)
                taobao.get_goods_data(goods_id)
                goods_data = taobao.deal_with_data(goods_id=goods_id)
                if goods_data != {}:
                    goods_data['goods_id'] = goods_id
                    if goods_data.get('is_delete', 0) == 1:
                        lg.info('@该商品已下架...')
                    await taobao.update_taobao_tiantiantejia_table(
                        data=goods_data, pipeline=tmp_sql_server)
                else:
                    await async_sleep(4)
                await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                index += 1
                collect()
        else:
            lg.error('数据库连接失败,数据库可能关闭或者维护中')
            pass
        collect()
    lg.info('全部数据更新完毕'.center(100, '#'))
    if get_shanghai_time().hour == 0:
        # No updates after midnight.
        await async_sleep(5 * 60)
    else:
        await async_sleep(60 * 1)
    collect()
    return True
async def run_forever():
    """Real-time tiantiantejia updater (older variant): demote expired goods
    to normal promo goods, refresh the rest.

    :return: True on a completed pass, None when the db select failed.
    """
    # Must NOT be a global reused across loops, otherwise every day logs to
    # the same file and per-day log rotation breaks.
    my_lg = set_logger(
        log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/天天特价/' + str(get_shanghai_time())[0:10] + '.txt',
        console_log_level=INFO,
        file_log_level=ERROR)
    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    # Off-shelf goods are not processed, hence is_delete=0 in the select.
    sql_str = '''
    select goods_id, is_delete, tejia_end_time, block_id, tag_id
    from dbo.taobao_tiantiantejia
    where site_id=19 and is_delete=0 and GETDATE()-modfiy_time>2 and MainGoodsID is not null
    '''
    try:
        result = list(tmp_sql_server._select_table(sql_str=sql_str))
    except TypeError:
        my_lg.error('TypeError错误, 导致原因: 数据库连接失败...(可能维护中)')
        return None
    my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
    my_lg.info(str(result))
    my_lg.info('--------------------------------------------------------')
    my_lg.info('待更新的goods_id个数: {0}'.format(len(result)))
    my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
    index = 1
    for item in result:
        if index % 50 == 0:
            # Reconnect every 50 items to avoid a stale long-lived connection.
            my_lg.info('正在重置,并与数据库建立新连接中...')
            tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
            my_lg.info('与数据库的新连接成功建立...')
        if tmp_sql_server.is_connect_success:
            tejia_end_time = item[2]
            if item[1] == 1:
                # Row already flagged is_delete=1: skip, no physical delete.
                my_lg.info(
                    '&&&&&& 该商品({0})原先状态为is_delete=1, 不进行实际删除操作! 索引为({1})'.
                    format(item[0], str(index)))
                index += 1
                pass
            elif tejia_end_time < datetime.datetime.now():
                # Expired: demote to a normal promo goods rather than delete.
                index = await update_expired_goods_to_normal_goods(
                    goods_id=item[0],
                    index=index,
                    tmp_sql_server=tmp_sql_server,
                    logger=my_lg)
                pass
            else:
                # Regular tiantiantejia refresh. The promo time window is NOT
                # updated: tiantiantejia never delists goods early, so the
                # stored window stays valid.
                my_lg.info(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                    % (item[0], str(index)))
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                taobao.get_goods_data(item[0])
                goods_data = taobao.deal_with_data(goods_id=item[0])
                if goods_data != {}:
                    goods_data['goods_id'] = item[0]
                    # The shelf time window is deliberately left untouched.
                    if goods_data.get('is_delete', 0) == 1:
                        my_lg.info('@该商品已下架...')
                    await taobao.update_taobao_tiantiantejia_table(
                        data=goods_data, pipeline=tmp_sql_server)
                else:
                    await asyncio.sleep(4)  # 否则休息4秒
                    pass
                await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                index += 1
                gc.collect()
        else:
            # Empty/failed connection — skip this item.
            my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
            pass
        gc.collect()
    my_lg.info('全部数据更新完毕'.center(100, '#'))
    if get_shanghai_time().hour == 0:
        # No updates after midnight.
        pass
    else:
        sleep(5)
    gc.collect()
    return True
def _taobao_keywords_spider(self, **kwargs):
    """Scrape and store the goods for one keyword's goods_id_list.

    :keyword goods_id_list: list of goods id strings for the keyword
    :keyword keyword_id: id of the keyword row, recorded in the middle table
    :return: True when the list is exhausted; False when a goods fails the
             legality check (aborts the remaining ids for this keyword).
    """
    goods_id_list = kwargs.get('goods_id_list')
    keyword_id = kwargs.get('keyword_id')
    goods_url_list = ['https://item.taobao.com/item.htm?id=' + item for item in goods_id_list]

    self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
    for item in goods_url_list:     # item is a goods_url
        # Whether this goods ended up inserted (or already present) in the db.
        result = False
        try:
            goods_id = re.compile(r'id=(\d+)').findall(item)[0]
        except IndexError:
            self.lg.error('re获取goods_id时出错, 请检查!')
            continue
        if goods_id in self.db_existed_goods_id_list:
            self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
            result = True       # already present counts as success
            pass
        else:
            taobao = TaoBaoLoginAndParse(logger=self.lg, is_real_times_update_call=True)
            # Rotate the db connection every `remainder` inserts.
            self.sql_cli = _block_get_new_db_conn(
                db_obj=self.sql_cli,
                index=self.add_goods_index,
                logger=self.lg,
                remainder=20,)
            if self.sql_cli.is_connect_success:
                goods_id = taobao.get_goods_id_from_url(item)
                if goods_id == '':
                    self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                    continue
                else:
                    self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(self.add_goods_index)))
                    tt = taobao.get_goods_data(goods_id)
                    data = taobao.deal_with_data(goods_id=goods_id)
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        if not self.check_target_data_is_legal(target_data=data):
                            # NOTE(review): returning False here abandons ALL
                            # remaining ids of this keyword — confirm that a
                            # `continue` was not intended instead.
                            return False
                        result = taobao.old_taobao_goods_insert_into_new_table(data, pipeline=self.sql_cli)
                    else:
                        pass
            else:
                self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                pass
            self.add_goods_index += 1
            collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        if result:      # only link goods that were inserted or already in db
            self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)
        else:
            pass

    self.lg.info('该关键字的商品已经抓取完毕!')
    return True
def _taobao_keywords_spider(self, **kwargs):
    """Scrape and store the goods for one keyword's goods_id_list (older variant).

    :keyword goods_id_list: list of goods id strings for the keyword
    :keyword keyword_id: id of the keyword row, recorded in the middle table
    :return: True when the list is exhausted
    """
    goods_id_list = kwargs.get('goods_id_list')
    keyword_id = kwargs.get('keyword_id')
    goods_url_list = [
        'https://item.taobao.com/item.htm?id=' + item
        for item in goods_id_list
    ]

    self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
    for item in goods_url_list:     # item is a goods_url
        # Whether this goods ended up inserted (or already present) in the db.
        result = False
        try:
            goods_id = re.compile(r'id=(\d+)').findall(item)[0]
        except IndexError:
            self.my_lg.error('re获取goods_id时出错, 请检查!')
            continue
        if goods_id in self.db_existed_goods_id_list:
            self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
            result = True       # already present counts as success
            pass
        else:
            taobao = TaoBaoLoginAndParse(logger=self.my_lg)
            if self.add_goods_index % 20 == 0:
                # Reconnect every 20 inserts to avoid a stale long connection.
                self.my_lg.info('正在重置,并与数据库建立新连接中...')
                self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('与数据库的新连接成功建立...')
            if self.my_pipeline.is_connect_success:
                goods_id = taobao.get_goods_id_from_url(item)
                if goods_id == '':
                    self.my_lg.error('@@@ 原商品的地址为: {0}'.format(item))
                    continue
                else:
                    self.my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (goods_id, str(self.add_goods_index)))
                    tt = taobao.get_goods_data(goods_id)
                    data = taobao.deal_with_data(goods_id=goods_id)
                    if data != {}:
                        data['goods_id'] = goods_id
                        data[
                            'goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(
                                goods_id)
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        result = taobao.old_taobao_goods_insert_into_new_table(
                            data, pipeline=self.my_pipeline)
                    else:
                        pass
            else:
                self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                pass
            self.add_goods_index += 1
            gc.collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        if result:      # only link goods that were inserted or already in db
            self._insert_into_goods_id_and_keyword_middle_table(
                goods_id=goods_id, keyword_id=keyword_id)
        else:
            pass

    self.my_lg.info('该关键字的商品已经抓取完毕!')
    return True
def run_forever():
    """Real-time updater: re-scrape every goods id, record shelf/down/delete
    times and price-change info, then restart the process."""
    while True:
        # Must NOT be a global reused across loops, otherwise every day logs
        # to the same file.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)
        tmp_sql_server = SqlPools()  # sqlalchemy-managed connection pool
        try:
            result = tmp_sql_server.select_taobao_all_goods_id()
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:
                    # Reconnect every 50 items to avoid a stale long connection.
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlPools()
                    my_lg.info('与数据库的新连接成功建立...')
                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (item[0], str(index)))
                    data = taobao.get_goods_data(item[0])
                    if data.get('is_delete') == 1:
                        # Goods whose ORIGINAL insert was already off-shelf:
                        # write back directly without deal_with_data.
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # 避免服务器更新太频繁
                        index += 1
                        gc.collect()
                        continue
                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        # Compare old vs new prices to flag price changes.
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])
                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                    else:
                        my_lg.info('------>>>| 休眠5s中...')
                        sleep(5)
                else:
                    # Empty/failed connection — wait and skip this item.
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(10)
                    pass
                index += 1
                gc.collect()
                # Throttle to stay out of the way of user-facing traffic;
                # can be shortened (even 0s) on an overseas server.
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            # No updates after midnight — sleep 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()
def run_forever():
    """Realtime-update daemon loop (v2: shelf-time + sku-info transition tracking).

    Like the v1 loop but selects rows via `tb_select_str_3`, tracks sku_info
    transitions (`get_sku_info_trans_record`) and uses the refactored
    `get_shelf_time_and_delete_time` helper.

    Returns: never (loops forever; `restart_program()` replaces the process).
    """
    while True:
        # Must NOT be a module-level global created outside the loop, otherwise
        # every pass keeps logging into the same day's file forever.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)
        # sqlalchemy-managed connection pool.
        tmp_sql_server = SqlPools()
        try:
            result = tmp_sql_server._select_table(sql_str=tb_select_str_3, )
        except TypeError:
            # A failed/closed DB connection surfaces here as TypeError.
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                # item layout (inferred from usage): [0]=goods_id, [1]=is_delete,
                # [2]=old price, [3]=old taobao_price, [4]=shelf_time,
                # [5]=delete_time, [6]=stored sku_info (JSON or pre-formatted),
                # [7]=previous _is_price_change — TODO confirm vs tb_select_str_3.
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:
                    # Re-create the pool every 50 goods to avoid a stale
                    # long-lived connection erroring out.
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlPools()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (item[0], str(index)))
                    # Fetch raw data first only to learn the delete status, so a
                    # later empty parse of an off-shelf item doesn't trigger the
                    # error-sleep branch below.
                    oo = taobao.get_goods_data(item[0])
                    oo_is_delete = oo.get('is_delete', 0)
                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            shelf_time=item[4],
                            delete_time=item[5])
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[2],
                            old_taobao_price=item[3],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'])
                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[6]),
                                site_id=1)
                        except AttributeError:
                            # Row was already formatted on a previous run.
                            old_sku_info = item[6]
                        # NOTE(review): this overwrites the _is_price_change set
                        # three statements above — looks intentional (the sku
                        # transition record is the final verdict) but confirm.
                        data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=format_price_info_list(data['price_info_list'], site_id=1),
                            is_price_change=item[7] if item[7] is not None else 0)
                        taobao.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:
                        if oo_is_delete == 1:
                            # Empty parse is expected for off-shelf goods: no sleep.
                            pass
                        else:
                            # NOTE(review): message says 5s but the sleep is 4s.
                            my_lg.info('------>>>| 休眠5s中...')
                            sleep(4)
                else:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(10)
                    pass

                index += 1
                gc.collect()
                # Can be shortened (even 0s) on overseas servers; keep some gap
                # so crawler traffic doesn't collide with user requests.
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight: sleep 5.5h.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()
class TBUpdater(AsyncCrawler):
    """Async realtime updater for Taobao goods (concurrent variant).

    Pulls batches of stale goods rows (from sqlserver / an HTTP server / redis,
    selected by `db_res_from`), re-crawls each goods concurrently, then writes
    the changes back to the DB in a second, task-per-goods phase.
    """

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/')
        self.sql_cli = None
        # 1 -> SqlServerMyPageInfoSaveItemPipeline | 2 -> SqlPools
        self.db_conn_type = 1
        self.goods_index = 1
        # Concurrency (batch size per slice).
        self.concurrency = 100
        self.concurrent_type = CONCURRENT_TYPE
        # Where the to-update rows come from: 0 sqlserver | 1 new_my_server | 2 redis
        self.db_res_from = 2
        if 'armv7l-with-debian' in platform.platform():
            # ARM/debian box talks to a local server.
            self.server_ip = 'http://0.0.0.0:80'
        else:
            self.server_ip = 'http://118.31.39.97'
        # self.server_ip = 'http://0.0.0.0:5000'

    async def _update_db(self):
        """Main loop: fetch stale rows, update them slice by slice, forever."""
        while True:
            # Long-running processes hit "OSError: [Errno 24] Too many open
            # files" with one log file per day, so no per-day logger here.
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result,
                    step=self.concurrency)
                self.taobao = TaoBaoLoginAndParse(logger=self.lg, is_real_times_update_call=True)
                index = 1
                while True:
                    try:
                        # TasksParamsListObj signals exhaustion via AssertionError.
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        break
                    one_res, index = await self._get_one_res(
                        slice_params_list=slice_params_list,
                        index=index)
                    # Back off when a slice had too many failures.
                    await self._except_sleep(res=one_res)
                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # No updates after midnight: sleep 0.5h.
                await async_sleep(60 * 60 * .5)
            else:
                await async_sleep(5.)
            try:
                # del self.lg
                del result
            except:
                pass
            collect()

    async def _get_db_old_data(self) -> (list, None):
        """Fetch the rows needing update, per `db_conn_type`/`db_res_from`.

        Returns the row list, or None on failure.
        """
        if self.db_conn_type == 1:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        elif self.db_conn_type == 2:
            # sqlalchemy-managed connection pool.
            self.sql_cli = SqlPools()
        else:
            raise ValueError('db_conn_type 值异常!')

        result = None
        try:
            if self.db_res_from == 0:
                result = self.sql_cli._select_table(sql_str=tb_select_str_3,)
            elif self.db_res_from == 1:
                result = await get_waited_2_update_db_data_from_server(
                    server_ip=self.server_ip,
                    _type='tb',
                    child_type=0,)
            elif self.db_res_from == 2:
                # Pull a larger slice (800) so a high failure rate in the first
                # items still leaves work for the rest of the pass.
                result = get_waited_2_update_db_data_from_redis_server(
                    spider_name='tb0',
                    logger=self.lg,
                    slice_num=800,)
            else:
                raise ValueError('self.db_res_from value异常!')
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)

        await _print_db_old_data(logger=self.lg, result=result)
        return result

    async def _get_one_res(self, slice_params_list: list, index) -> tuple:
        """Crawl one slice concurrently, then persist results.

        :param slice_params_list: one batch of DB rows.
        :param index: running goods counter (advanced by len(slice)).
        :return: (list of [goods_id, bool-success], new index)
        """
        def get_tasks_params_list(slice_params_list: list, index: int) -> list:
            # Build one param dict per goods for the generic task runner.
            tasks_params_list = []
            for item in slice_params_list:
                db_goods_info_obj = TBDbGoodsInfoObj(item=item, logger=self.lg)
                tasks_params_list.append({
                    'db_goods_info_obj': db_goods_info_obj,
                    'index': index,
                })
                index += 1
            return tasks_params_list

        def get_create_task_msg(k) -> str:
            return 'create task[where is goods_id: {}, index: {}] ...'.format(
                k['db_goods_info_obj'].goods_id,
                k['index'],)

        def get_now_args(k) -> list:
            # Positional args for block_get_one_goods_info_task_by_external_type.
            return [
                'tb',
                k['db_goods_info_obj'].goods_id,
                k['index'],
                self.lg,
            ]

        async def handle_one_res(one_res: list):
            """Persist crawl results: match each row to its crawl output by
            goods_id, then run the DB writes as tasks.

            NOTE(review): one_res entries are assumed to look like
            [?, goods_id, index, before_goods_data, end_goods_data] —
            confirm against block_get_one_goods_info_task_by_external_type.
            """
            nonlocal slice_params_list
            # Join the crawl output back onto the original DB rows.
            new_slice_params_list = []
            for item in slice_params_list:
                goods_id = item[1]
                for i in one_res:
                    try:
                        goods_id2 = i[1]
                        # NOTE(review): this `index` shadows the enclosing
                        # parameter inside the loop; harmless here only because
                        # the outer value was already consumed above.
                        index = i[2]
                        if goods_id == goods_id2:
                            new_slice_params_list.append({
                                'index': index,
                                'before_goods_data': i[3],
                                'end_goods_data': i[4],
                                'item': item,
                            })
                            break
                        else:
                            continue
                    except IndexError:
                        continue

            # Persist sequentially-awaited tasks to avoid mass deadlocks from
            # high-concurrency DB writes.
            tasks = []
            for k in new_slice_params_list:
                item = k['item']
                index = k['index']
                db_goods_info_obj = TBDbGoodsInfoObj(item=item, logger=self.lg)
                self.lg.info('create task[where is goods_id: {}, index: {}]...'.format(
                    db_goods_info_obj.goods_id, index))
                tasks.append(self.loop.create_task(self._update_one_goods_info_in_db(
                    db_goods_info_obj=db_goods_info_obj,
                    index=index,
                    before_goods_data=k['before_goods_data'],
                    end_goods_data=k['end_goods_data'],)))
            one_res = await _get_async_task_result(tasks=tasks, logger=self.lg)
            try:
                del new_slice_params_list
            except:
                pass
            return one_res

        # Crawl phase: delegate fan-out to the generic runner.
        one_res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=get_tasks_params_list(
                slice_params_list=slice_params_list,
                index=index,),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=block_get_one_goods_info_task_by_external_type,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res2,
            one_default_res=(),
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,
            concurrent_type=self.concurrent_type,
        )
        # Persist phase.
        res = await handle_one_res(one_res=one_res)
        return (res, index)

    async def _update_one_goods_info_in_db(self, db_goods_info_obj, index, before_goods_data, end_goods_data):
        """Write one goods' crawl result to the DB.

        :param db_goods_info_obj: original DB row wrapper.
        :param index: running counter (used for periodic reconnects).
        :param before_goods_data: raw fetch result (for is_delete check).
        :param end_goods_data: parsed result ({} means parse failed).
        :return: [goods_id, success_bool]
        """
        res = False
        # Refresh the DB connection every `remainder` goods.
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            db_conn_type=self.db_conn_type,
            remainder=25,)
        if self.sql_cli.is_connect_success:
            self.lg.info('*' * 20 + ' updating goods_id: {}, index: {} ...'.format(
                db_goods_info_obj.goods_id,
                index,
            ))
            # Knowing delete status up-front avoids the failure-sleep below for
            # legitimately off-shelf goods whose parse is empty.
            before_goods_data_is_delete = before_goods_data.get('is_delete', 0)
            if end_goods_data != {}:
                data = get_goods_info_change_data(
                    target_short_name='tb',
                    logger=self.lg,
                    data=end_goods_data,
                    db_goods_info_obj=db_goods_info_obj,
                )
                res = to_right_and_update_tb_data(
                    data=data,
                    pipeline=self.sql_cli,
                    logger=self.lg,)
            else:
                if before_goods_data_is_delete == 1:
                    # Off-shelf goods: empty parse is expected, count as success.
                    res = True
                else:
                    self.lg.info('goods_id: {}, 阻塞休眠7s中...'.format(
                        db_goods_info_obj.goods_id,))
                    # Blocking sleep() here would stall the whole process.
                    await async_sleep(delay=7., loop=self.loop)
        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=5, loop=self.loop)

        await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        collect()
        return [db_goods_info_obj.goods_id, res]

    async def _get_new_tb_obj(self, index) -> None:
        """Recreate self.taobao every 10 goods to shed accumulated state."""
        if index % 10 == 0:
            try:
                del self.taobao
            except:
                pass
            collect()
            self.taobao = TaoBaoLoginAndParse(logger=self.lg, is_real_times_update_call=True)

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        """Crawl and persist one goods in a single step (non-pipelined path).

        :return: [goods_id, success_bool]
        """
        res = False
        await self._get_new_tb_obj(index=index)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            db_conn_type=self.db_conn_type,
            remainder=25,)
        if self.sql_cli.is_connect_success:
            self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                db_goods_info_obj.goods_id, str(index)))
            # Fetch raw data first only to learn delete status, so an empty
            # parse of an off-shelf item doesn't trigger the failure-sleep.
            oo = self.taobao.get_goods_data(goods_id=db_goods_info_obj.goods_id)
            oo_is_delete = oo.get('is_delete', 0)
            data = self.taobao.deal_with_data(goods_id=db_goods_info_obj.goods_id)
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='tb',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,)
                res = to_right_and_update_tb_data(
                    data=data,
                    pipeline=self.sql_cli,
                    logger=self.lg)
            else:
                if oo_is_delete == 1:
                    # Off-shelf goods: count as success.
                    res = True
                else:
                    self.lg.info('------>>>| 休眠8s中...')
                    await async_sleep(delay=8, loop=self.loop)
        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=10, loop=self.loop)

        index += 1
        self.goods_index = index
        collect()
        # Can be 0s on overseas servers; keep a gap so crawler traffic doesn't
        # collide with user requests.
        await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        return [db_goods_info_obj.goods_id, res]

    async def _except_sleep(self, res):
        """Sleep proportionally to the failure rate of one slice.

        :param res: list of [goods_id, success_bool] results.
        """
        count = 0
        all_count_fail_sleep_time = 100.
        # Was 40.; partial-failure sleep is currently disabled (0s).
        sleep_time = 0.
        for item in res:
            try:
                if not item[1]:
                    count += 1
            except IndexError:
                pass
        self.lg.info('Fail count: {}个, 并发量: {}个'.format(count, self.concurrency))

        if count/self.concurrency >= .96:
            # (Nearly) everything failed: long back-off.
            self.lg.info('抓取异常!! 休眠{}s中...'.format(all_count_fail_sleep_time))
            await async_sleep(all_count_fail_sleep_time)
        else:
            if count >= int(self.concurrency/5):
                self.lg.info('抓取异常!! 休眠{}s中...'.format(sleep_time))
                await async_sleep(sleep_time)

        return None

    def __del__(self):
        # NOTE(review): bare except is deliberate best-effort teardown here,
        # but `except Exception:` would be safer practice.
        try:
            del self.lg
        except:
            pass
        try:
            del self.sql_cli
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
class TBUpdater(AsyncCrawler):
    """Async realtime updater for Taobao goods (server-fetch variant).

    Earlier/simpler version: rows to update come from an HTTP server endpoint;
    each goods is crawled and persisted by one asyncio task per item.
    """

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/')
        self.sql_cli = None
        self.goods_index = 1
        # Concurrency (batch size per slice).
        self.concurrency = 50
        # self.server_ip = 'http://0.0.0.0:5000'
        self.server_ip = 'http://118.31.39.97'

    async def _update_db(self):
        """Main loop: fetch stale rows, update them slice by slice, forever."""
        while True:
            self.lg = await self._get_new_logger()
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result,
                    step=self.concurrency)
                self.taobao = TaoBaoLoginAndParse(logger=self.lg)
                index = 1
                while True:
                    try:
                        # TasksParamsListObj signals exhaustion via AssertionError.
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        break
                    tasks = []
                    for item in slice_params_list:
                        db_goods_info_obj = TBDbGoodsInfoObj(item=item, logger=self.lg)
                        self.lg.info('创建 task goods_id: {}'.format(
                            db_goods_info_obj.goods_id))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(
                                    db_goods_info_obj=db_goods_info_obj,
                                    index=index)))
                        index += 1
                    res = await _get_async_task_result(tasks=tasks, logger=self.lg)
                    # Back off when a slice had too many failures.
                    await self._except_sleep(res=res)
                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # No updates after midnight: sleep 5.5h.
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(5.)
            try:
                del self.lg
                del result
            except:
                pass
            collect()

    async def _get_db_old_data(self) -> (list, None):
        """Fetch the rows needing update from the HTTP server.

        Returns the row list, or None on failure.
        """
        # sqlalchemy-managed connection pool (plain pipeline variant commented out upstream).
        self.sql_cli = SqlPools()
        result = None
        try:
            result = await get_waited_2_update_db_data_from_server(
                server_ip=self.server_ip,
                _type='tb',
                child_type=0,
            )
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)

        await _print_db_old_data(logger=self.lg, result=result)
        return result

    async def _get_new_tb_obj(self, index) -> None:
        """Recreate self.taobao every 10 goods to shed accumulated state."""
        if index % 10 == 0:
            try:
                del self.taobao
            except:
                pass
            collect()
            self.taobao = TaoBaoLoginAndParse(logger=self.lg)

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        """Crawl and persist one goods.

        :return: [goods_id, success_bool]
        """
        res = False
        await self._get_new_tb_obj(index=index)
        # Refresh the DB connection every `remainder` goods (db_conn_type=2 -> SqlPools).
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            db_conn_type=2,
            remainder=50)
        if self.sql_cli.is_connect_success:
            self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                db_goods_info_obj.goods_id, str(index)))
            # Fetch raw data first only to learn delete status, so an empty
            # parse of an off-shelf item doesn't trigger the failure-sleep.
            oo = self.taobao.get_goods_data(goods_id=db_goods_info_obj.goods_id)
            oo_is_delete = oo.get('is_delete', 0)
            data = self.taobao.deal_with_data(goods_id=db_goods_info_obj.goods_id)
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='tb',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,
                )
                res = to_right_and_update_tb_data(
                    data=data,
                    pipeline=self.sql_cli,
                    logger=self.lg)
            else:
                if oo_is_delete == 1:
                    # Off-shelf goods: count as success.
                    res = True
                else:
                    self.lg.info('------>>>| 休眠8s中...')
                    await async_sleep(delay=8, loop=self.loop)
        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=10, loop=self.loop)

        index += 1
        self.goods_index = index
        collect()
        # Can be 0s on overseas servers; keep a gap so crawler traffic doesn't
        # collide with user requests.
        await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        return [db_goods_info_obj.goods_id, res]

    async def _except_sleep(self, res):
        """Sleep proportionally to the failure rate of one slice.

        :param res: list of [goods_id, success_bool] results.
        """
        count = 0
        all_count_fail_sleep_time = 100.
        sleep_time = 50.
        for item in res:
            try:
                if not item[1]:
                    count += 1
            except IndexError:
                pass
        self.lg.info('Fail count: {}个, 并发量: {}个'.format(
            count, self.concurrency))

        if count / self.concurrency >= .9:
            # (Nearly) everything failed: long back-off.
            self.lg.info('抓取异常!! 休眠{}s中...'.format(all_count_fail_sleep_time))
            await async_sleep(all_count_fail_sleep_time)
        else:
            if count >= int(self.concurrency / 5):
                self.lg.info('抓取异常!! 休眠{}s中...'.format(sleep_time))
                await async_sleep(sleep_time)

        return None

    def __del__(self):
        # NOTE(review): bare except is deliberate best-effort teardown here,
        # but `except Exception:` would be safer practice.
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
def deal_with_data(self):
    """Parse `self.result_data` (raw Taobao item API payload) into the flat
    goods dict this project persists.

    Returns the result dict on success, or `self._data_error_init()` when the
    payload is empty, sku parsing fails, or div_desc is missing.

    NOTE(review): payload schema (data['item'], data['seller'],
    data['apiStack'][0]['value'], data['skuBase']) is assumed from usage —
    confirm against the Taobao detail API response.
    """
    data = self.result_data
    if data == {}:
        return self._data_error_init()

    # Helper instance only used for its parsing methods; deleted before return.
    taobao = TaoBaoLoginAndParse(
        logger=self.lg,
        is_real_times_update_call=self.is_real_times_update_call)
    goods_id = data['goods_id']
    # Tmall type; 33 marks "could not be determined".
    tmall_type = data.get('type', 33)
    # shopName may be absent from the seller block.
    shop_name = data['seller'].get('shopName', '')
    account = data['seller'].get('sellerNick', '')
    title = data['item']['title']
    sub_title = data['item'].get('subtitle', '')
    if sub_title is None:
        # Tmall v2 payloads carry an explicit None here.
        sub_title = ''
    else:
        pass
    sub_title = re.compile(r'\n').sub('', sub_title)

    price, taobao_price = taobao._get_price_and_taobao_price(data=data)
    # Stock of the default ('0') sku.
    goods_stock = data['apiStack'][0]['value']\
        .get('skuCore', {})\
        .get('sku2info', {})\
        .get('0', {})\
        .get('quantity', '')
    # Sku attribute names and their value ids.
    detail_name_list, detail_value_list = taobao._get_detail_name_and_value_list(
        data=data)

    # Determine on/off-shelf BEFORE the price parsing below, so off-shelf goods
    # don't fail the empty-taobao_price assertions inside it.
    is_delete = self._get_is_delete(data=data, title=title)

    try:
        # Per-sku price and stock.
        price_info_list = taobao._get_price_info_list(
            data=data,
            detail_value_list=detail_value_list,
            is_delete=is_delete,
        )
        # Multi-sku goods: recompute top-level prices from the sku list.
        price, taobao_price = taobao._get_new_price_and_taobao_price_when_price_info_list_not_null_list(
            price_info_list=price_info_list,
            price=price,
            taobao_price=taobao_price,
            is_delete=is_delete)
    except Exception:
        self.lg.error('遇到错误[goods_id: {}]:'.format(goods_id), exc_info=True)
        return self._data_error_init()

    # All sample image urls.
    all_img_url = taobao._get_all_img_url(
        tmp_all_img_url=data['item']['images'])

    # Property list, e.g. [{'内存容量': '32GB'}, ...], reshaped to id/name/value.
    p_info = taobao._get_p_info(
        tmp_p_info=data.get('props').get('groupProps'))
    if p_info != []:
        p_info = [{
            'id': 0,
            'name': _i.get('p_name', ''),
            'value': _i.get('p_value', ''),
        } for _i in p_info]

    # Description HTML; required — bail out if empty.
    div_desc = taobao.get_div_from_pc_div_url(goods_id=goods_id)
    if div_desc == '':
        self.lg.error('该商品的div_desc为空! 出错goods_id: {}'.format(goods_id))
        return self._data_error_init()
    else:
        pass

    # Post-processing: reshape detail_name_list for storage.
    detail_name_list = [{
        'spec_name': i[0],
        'img_here': i[2],
    } for i in detail_name_list]
    # Rebuild detail_value_list from skuBase.props when present.
    if data.get('skuBase').get('props') is None:
        pass
    else:
        tmp_detail_value_list = [
            item['values']
            for item in data.get('skuBase', '').get('props', '')
        ]
        detail_value_list = []
        for item in tmp_detail_value_list:
            tmp = [i['name'] for i in item]
            detail_value_list.append(tmp)

    # Monthly sales; best-effort (payload field may be absent or malformed).
    sell_count = '0'
    try:
        sell_count = str(
            data.get('apiStack', [])[0].get('value', {}).get('item', {}).get('sellCount', ''))
    except:
        pass

    # Force off-shelf for contraband keywords in the title.
    if target_str_contain_some_char_check(
            target_str=title,
            check_char_obj=CONTRABAND_GOODS_KEY_TUPLE):
        self.lg.info('违禁物品下架...')
        is_delete = 1
    else:
        pass

    try:
        del taobao
    except:
        pass

    result = {
        'shop_name': shop_name,                  # shop name
        'account': account,                      # seller account
        'title': title,                          # goods title
        'sub_title': sub_title,                  # subtitle
        'price': price,                          # list price
        'taobao_price': taobao_price,            # taobao (discount) price
        'goods_stock': goods_stock,              # stock of default sku
        'detail_name_list': detail_name_list,    # sku attribute names
        'detail_value_list': detail_value_list,  # sku attribute values
        'price_info_list': price_info_list,      # per-sku price/stock to persist
        'all_img_url': all_img_url,              # sample image urls
        'p_info': p_info,                        # property name/value pairs
        'div_desc': div_desc,                    # description HTML
        'sell_count': sell_count,                # monthly sales
        'is_delete': is_delete,                  # off-shelf flag
        'type': tmall_type,                      # tmall type
    }
    return result
def run_forever():
    """Migration loop (v3): copy goods from the old table into the new table.

    Reads all rows from the old table, skips goods_ids already present in the
    new table, crawls the rest and inserts them via
    `old_taobao_goods_insert_into_new_table`. Prints instead of logging.

    Returns: never (loops forever).
    """
    while True:
        # sqlalchemy-managed pool for the new table; plain pipeline for the old.
        tmp_sql_server = SqlPools()
        tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = tmp_sql_server.select_taobao_all_goods_id()
            result_2 = list(tmp_sql_server_2.select_old_table_all_goods_id())
        except TypeError as e:
            # NOTE(review): only `result` is reset here; `result_2` stays
            # unbound — safe only because the else-branch is then skipped.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result_2)
            print('--------------------------------------------------------')
            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # goods_ids already present in the NEW table (name is a copy-paste
            # leftover from the ali_1688 variant of this script).
            new_table_ali_1688_all_goods_id_list = [item[0] for item in result]
            for item in result_2:
                # item layout (inferred from usage): [0]=old goods url,
                # [1]=main_goods_id — TODO confirm vs select_old_table_all_goods_id.
                data = {}
                taobao = TaoBaoLoginAndParse()
                if index % 50 == 0:
                    # Re-create both connections every 50 goods to avoid a
                    # stale long-lived connection erroring out.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item[0])
                    if goods_id == '':
                        print('@@@ 原商品的地址为: ', item[0])
                        continue
                    else:
                        if goods_id in new_table_ali_1688_all_goods_id_list:
                            # Already migrated.
                            print('该goods_id已经存在于数据库中, 此处跳过!')
                            continue
                        else:
                            print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id, index))
                            tt = taobao.get_goods_data(goods_id)
                            if tt.get('is_delete') == 1:
                                # Off-shelf goods still get inserted, with
                                # minimal fields filled in.
                                tt['goods_id'] = goods_id
                                tt['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                                tt['username'] = '******'
                                tt['main_goods_id'] = item[1]
                                taobao.old_taobao_goods_insert_into_new_table(
                                    data=tt, pipeline=tmp_sql_server_2)
                                index += 1
                                gc.collect()
                                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                                continue
                            else:
                                pass

                            data = taobao.deal_with_data(goods_id=goods_id)
                            if data != {}:
                                data['goods_id'] = goods_id
                                data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                                data['username'] = '******'
                                data['main_goods_id'] = item[1]
                                taobao.old_taobao_goods_insert_into_new_table(
                                    data, pipeline=tmp_sql_server_2)
                            else:
                                pass
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass

                index += 1
                gc.collect()
                # Can be 0s on overseas servers; keep a gap so crawler traffic
                # doesn't collide with user requests.
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight: sleep 5.5h.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
async def _crawl_and_save_these_goods(self, goods_url_list):
    """Crawl the goods recommended by one article and insert the new ones.

    :param goods_url_list: list of dicts each carrying a 'goods_url' key.
    :return: True when the list has been fully processed.
    """
    # All goods_ids already stored for Taobao-family sites (SiteID 1/3/4/6).
    sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=1 or SiteID=3 or SiteID=4 or SiteID=6'
    try:
        result = self.my_pipeline._select_table(sql_str=sql_str)
    except TypeError:
        # DB unreachable: treat as "nothing stored yet".
        result = []

    self.my_lg.info('即将开始抓取该文章的goods, 请耐心等待...')
    index = 1

    db_all_goods_id_list = [item[0] for item in result]
    for item in goods_url_list:
        # Quick pre-filter by id extracted from the url, before any crawling.
        try:
            goods_id = re.compile(r'id=(\d+)').findall(item.get('goods_url', ''))[0]
        except IndexError:
            self.my_lg.error('re获取goods_id时出错, 请检查!')
            continue

        if goods_id in db_all_goods_id_list:
            self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
            continue
        else:
            taobao = TaoBaoLoginAndParse(logger=self.my_lg)
            if index % 50 == 0:
                # Re-create the pipeline every 50 goods to avoid a stale
                # long-lived connection erroring out.
                self.my_lg.info('正在重置,并与数据库建立新连接中...')
                self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('与数据库的新连接成功建立...')

            if self.my_pipeline.is_connect_success:
                # Authoritative goods_id extraction (may differ from the regex above).
                goods_id = taobao.get_goods_id_from_url(item.get('goods_url', ''))
                if goods_id == '':
                    self.my_lg.info('@@@ 原商品的地址为: {0}'.format(item.get('goods_url', '')))
                    continue
                else:
                    self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(index)))
                    # NOTE(review): `tt` is unused; presumably get_goods_data
                    # primes internal state that deal_with_data reads (same
                    # pattern as the other updaters) — confirm before removing.
                    tt = taobao.get_goods_data(goods_id)
                    data = taobao.deal_with_data(goods_id=goods_id)
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        taobao.old_taobao_goods_insert_into_new_table(data, pipeline=self.my_pipeline)
                    else:
                        pass
            else:
                self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                pass

            index += 1
            gc.collect()
            # Throttle so crawler traffic doesn't collide with user requests.
            await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

    self.my_lg.info('该文章的商品已经抓取完毕!')
    return True