def _taobao_keywords_spider(self, **kwargs):
    '''
    Crawl and store the goods in goods_id_list.
    :param kwargs:
    :return:
    '''
    goods_id_list = kwargs.get('goods_id_list')
    keyword_id = kwargs.get('keyword_id')
    goods_url_list = ['https://item.taobao.com/item.htm?id=' + item for item in goods_id_list]

    self.my_lg.info('About to crawl the goods for this keyword, please wait...')
    for item in goods_url_list:     # item is a goods_url
        result = False              # flag: whether this goods was inserted (or already existed)
        try:
            goods_id = re.compile(r'id=(\d+)').findall(item)[0]
        except IndexError:
            self.my_lg.error('Failed to extract goods_id via re, please check!')
            continue

        if goods_id in self.db_existed_goods_id_list:
            self.my_lg.info('goods_id[{0}] already exists in the db!'.format(goods_id))
            result = True           # already-existing case
        else:
            taobao = TaoBaoLoginAndParse(logger=self.my_lg)
            if self.add_goods_index % 20 == 0:
                # Reconnect every 20 iterations to avoid a stale long-lived connection timing out
                self.my_lg.info('Resetting and establishing a new db connection...')
                self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('New db connection established...')

            if self.my_pipeline.is_connect_success:
                goods_id = taobao.get_goods_id_from_url(item)
                if goods_id == '':
                    self.my_lg.error('@@@ Original goods url: {0}'.format(item))
                    continue
                else:
                    self.my_lg.info('------>>>| updating goods_id (%s) | --------->>>@ index (%s)' % (goods_id, str(self.add_goods_index)))
                    taobao.get_goods_data(goods_id)
                    data = taobao.deal_with_data(goods_id=goods_id)
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        # print('------>>>| crawled data: ', data)
                        result = taobao.old_taobao_goods_insert_into_new_table(data, pipeline=self.my_pipeline)
            else:   # db connection failed
                self.my_lg.info('Db connection failed; the db may be down or under maintenance')

            self.add_goods_index += 1
            gc.collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        if result:  # only when the goods_id was inserted or already existed in the db
            self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)

    self.my_lg.info('Goods for this keyword have all been crawled!')

    return True
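# A minimal, self-contained sketch of the goods_id extraction used above. It
# assumes urls of the form 'https://item.taobao.com/item.htm?id=<digits>', the
# same shape the spider builds; the helper name is illustrative, not from the repo.
import re

def get_goods_id_sketch(goods_url: str) -> str:
    '''Return the numeric id= query parameter, or '' when it is absent.'''
    ids = re.compile(r'id=(\d+)').findall(goods_url)
    return ids[0] if ids else ''

assert get_goods_id_sketch('https://item.taobao.com/item.htm?id=12345') == '12345'
assert get_goods_id_sketch('https://item.taobao.com/item.htm') == ''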
async def _crawl_and_save_these_goods(self, goods_url_list):
    '''
    Crawl the goods recommended by this article.
    :param goods_url_list:
    :return:
    '''
    sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=1 or SiteID=3 or SiteID=4 or SiteID=6'
    try:
        result = self.my_pipeline._select_table(sql_str=sql_str)
    except TypeError:
        result = []

    self.my_lg.info('About to crawl the goods for this article, please wait...')
    index = 1

    db_all_goods_id_list = [item[0] for item in result]
    for item in goods_url_list:
        try:
            goods_id = re.compile(r'id=(\d+)').findall(item.get('goods_url', ''))[0]
        except IndexError:
            self.my_lg.error('Failed to extract goods_id via re, please check!')
            continue

        if goods_id in db_all_goods_id_list:
            self.my_lg.info('goods_id[{0}] already exists in the db!'.format(goods_id))
            continue
        else:
            taobao = TaoBaoLoginAndParse(logger=self.my_lg)
            if index % 50 == 0:
                # Reconnect every 50 iterations to avoid a stale long-lived connection timing out
                self.my_lg.info('Resetting and establishing a new db connection...')
                self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('New db connection established...')

            if self.my_pipeline.is_connect_success:
                goods_id = taobao.get_goods_id_from_url(item.get('goods_url', ''))
                if goods_id == '':
                    self.my_lg.info('@@@ Original goods url: {0}'.format(item.get('goods_url', '')))
                    continue
                else:
                    self.my_lg.info('------>>>| updating goods_id (%s) | --------->>>@ index (%s)' % (goods_id, str(index)))
                    taobao.get_goods_data(goods_id)
                    data = taobao.deal_with_data(goods_id=goods_id)
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        # print('------>>>| crawled data: ', data)
                        taobao.old_taobao_goods_insert_into_new_table(data, pipeline=self.my_pipeline)
            else:   # db connection failed
                self.my_lg.info('Db connection failed; the db may be down or under maintenance')

            index += 1
            gc.collect()
            await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

    self.my_lg.info('Goods for this article have all been crawled!')

    return True
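# A self-contained sketch of the throttle pattern used above: handle one goods
# url at a time and yield to the event loop between requests via asyncio.sleep.
# The names and the 2.5s default stand in for TAOBAO_REAL_TIMES_SLEEP_TIME and
# the real fetch/store calls; both are assumptions, not the project's values.
import asyncio

async def crawl_throttled_sketch(goods_url_list, delay=2.5):
    for goods_url in goods_url_list:
        print('crawling:', goods_url)  # the real spider fetches and stores the goods here
        await asyncio.sleep(delay)     # keep the request frequency low, as the spider does

asyncio.run(crawl_throttled_sketch(
    ['https://item.taobao.com/item.htm?id=12345'], delay=.1))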
def run_forever():
    '''Update the data in (near) real time.'''
    while True:
        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()     # manage the db connection pool via sqlalchemy
        tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = tmp_sql_server.select_taobao_all_goods_id()
            result_2 = list(tmp_sql_server_2.select_old_table_all_goods_id())
        except TypeError:
            print('TypeError: db connection failed... (maybe under maintenance)')
            result = None

        if result is None:
            pass
        else:
            print('------>>> all matching goods_id returned by the db <<<------')
            print(result_2)
            print('--------------------------------------------------------')
            print('About to start the real-time update, please wait...'.center(100, '#'))

            index = 1
            # name kept from the 1688 spider; it actually holds the taobao goods_ids of the new table
            new_table_ali_1688_all_goods_id_list = [item[0] for item in result]
            for item in result_2:   # real-time update
                taobao = TaoBaoLoginAndParse()
                if index % 50 == 0:
                    # Reconnect every 50 iterations to avoid a stale long-lived connection timing out
                    print('Resetting and establishing a new db connection...')
                    tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()
                    print('New db connection established...')

                if tmp_sql_server.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item[0])
                    if goods_id == '':
                        print('@@@ Original goods url: ', item[0])
                        continue
                    else:
                        if goods_id in new_table_ali_1688_all_goods_id_list:
                            print('goods_id already exists in the db, skipping!')
                            continue
                        else:
                            print('------>>>| updating goods_id (%s) | --------->>>@ index (%d)' % (goods_id, index))
                            tt = taobao.get_goods_data(goods_id)
                            if tt.get('is_delete') == 1:
                                # the goods was taken down, but it must still be inserted
                                tt['goods_id'] = goods_id
                                tt['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                                tt['username'] = '******'
                                tt['main_goods_id'] = item[1]
                                # print('------>>>| crawled data: ', tt)
                                taobao.old_taobao_goods_insert_into_new_table(data=tt, pipeline=tmp_sql_server_2)

                                index += 1
                                gc.collect()
                                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                                continue

                            data = taobao.deal_with_data(goods_id=goods_id)
                            if data != {}:
                                data['goods_id'] = goods_id
                                data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                                data['username'] = '******'
                                data['main_goods_id'] = item[1]
                                # print('------>>>| crawled data: ', data)
                                taobao.old_taobao_goods_insert_into_new_table(data, pipeline=tmp_sql_server_2)
                else:   # db connection failed
                    print('Db connection failed; the db may be down or under maintenance')

                index += 1
                gc.collect()
                # can be shortened (even to 0s) on an overseas server; keep it infrequent to stay clear of user requests
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            print('All data updated'.center(100, '#'))

        if get_shanghai_time().hour == 0:   # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
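# A self-contained sketch of the end-of-pass scheduling in run_forever: once the
# Shanghai-local hour hits 0, pause for 5.5 hours, otherwise retry after 5s.
# datetime/timezone stand in for the project's get_shanghai_time helper (an
# assumption here; only the hour check itself is taken from the code above).
from datetime import datetime, timedelta, timezone
from time import sleep

def get_shanghai_time_sketch():
    return datetime.now(timezone(timedelta(hours=8)))   # Shanghai is UTC+8

def sleep_until_next_pass_sketch():
    if get_shanghai_time_sketch().hour == 0:    # no updates after midnight
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)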
def _taobao_keywords_spider(self, **kwargs):
    '''
    Crawl and store the goods in goods_id_list.
    :param kwargs:
    :return:
    '''
    goods_id_list = kwargs.get('goods_id_list')
    keyword_id = kwargs.get('keyword_id')
    goods_url_list = ['https://item.taobao.com/item.htm?id=' + item for item in goods_id_list]

    self.lg.info('About to crawl the goods for this keyword, please wait...')
    for item in goods_url_list:     # item is a goods_url
        # flag: whether this goods was inserted (or already existed)
        result = False
        try:
            goods_id = re.compile(r'id=(\d+)').findall(item)[0]
        except IndexError:
            self.lg.error('Failed to extract goods_id via re, please check!')
            continue

        if goods_id in self.db_existed_goods_id_list:
            self.lg.info('goods_id[{0}] already exists in the db!'.format(goods_id))
            result = True           # already-existing case
        else:
            taobao = TaoBaoLoginAndParse(logger=self.lg, is_real_times_update_call=True)
            self.sql_cli = _block_get_new_db_conn(
                db_obj=self.sql_cli,
                index=self.add_goods_index,
                logger=self.lg,
                remainder=20,)
            if self.sql_cli.is_connect_success:
                goods_id = taobao.get_goods_id_from_url(item)
                if goods_id == '':
                    self.lg.error('@@@ Original goods url: {0}'.format(item))
                    continue
                else:
                    self.lg.info('------>>>| updating goods_id (%s) | --------->>>@ index (%s)' % (goods_id, str(self.add_goods_index)))
                    taobao.get_goods_data(goods_id)
                    data = taobao.deal_with_data(goods_id=goods_id)
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        if not self.check_target_data_is_legal(target_data=data):
                            return False

                        result = taobao.old_taobao_goods_insert_into_new_table(data, pipeline=self.sql_cli)
            else:   # db connection failed
                self.lg.info('Db connection failed; the db may be down or under maintenance')

            self.add_goods_index += 1
            collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        if result:  # only when the goods_id was inserted or already existed in the db
            self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)

    self.lg.info('Goods for this keyword have all been crawled!')

    return True
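# _block_get_new_db_conn is not defined in this file. A plausible sketch of its
# contract, inferred from the inline reconnect logic in the older version above:
# every `remainder` iterations, drop the stale connection and build a fresh one.
# SqlServerMyPageInfoSaveItemPipeline is the project's pipeline class; the body
# below is an assumption, not the project's actual implementation.
def _block_get_new_db_conn_sketch(db_obj, index, logger, remainder=20):
    if index % remainder == 0:
        logger.info('Resetting and establishing a new db connection...')
        db_obj = SqlServerMyPageInfoSaveItemPipeline()  # fresh connection
        logger.info('New db connection established...')

    return db_obj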