def get_one_jd_data(**kwargs):
    '''
    Crawl the data of a jd url
    :param kwargs:
    :return:
    '''
    username = kwargs.get('username', '18698570079')
    wait_to_deal_with_url = kwargs.get('wait_to_deal_with_url', '')
    my_lg = kwargs.get('my_lg')
    jd = JdParse(logger=my_lg)
    # Get the goods_id; a list is returned here
    goods_id = jd.get_goods_id_from_url(wait_to_deal_with_url)
    if goods_id == []:      # if no goods_id can be obtained, return an error
        my_lg.info('the obtained goods_id is empty!')
        try:
            del jd          # reclaim the object every time
        except:
            pass
        gc.collect()

        return {'goods_id': ''}

    # Improved check: based on the incoming data, decide whether this is jd
    # (jd supermarket counts as jd), jd global, or jd pharmacy
    if goods_id[0] == 0:        # [0, '1111']
        # Build a standard, clean jd goods url
        wait_to_deal_with_url = 'https://item.jd.com/' + goods_id[1] + '.html'
    elif goods_id[0] == 1:      # [1, '1111']
        wait_to_deal_with_url = 'https://item.jd.hk/' + goods_id[1] + '.html'
    elif goods_id[0] == 2:      # [2, '1111', 'https://xxxxx']
        wait_to_deal_with_url = 'https://item.yiyaojd.com/' + goods_id[1] + '.html'

    tmp_result = jd.get_goods_data(goods_id=goods_id)
    # On success, this returns a data dict object
    data = jd.deal_with_data(goods_id=goods_id)
    if data == {} or tmp_result == {}:
        my_lg.info('the obtained data is empty!')
        try:
            del jd
        except:
            pass
        gc.collect()

        return {'goods_id': goods_id[1], 'msg': 'data is empty!'}

    wait_to_save_data = add_base_info_2_processed_data(
        data=data,
        spider_url=wait_to_deal_with_url,
        username=username,
        goods_id=goods_id[1])
    try:
        del jd
    except:
        pass

    return wait_to_save_data
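# --- Usage sketch (assumptions labeled, not from the source) ----------------
# A minimal, hypothetical call site for get_one_jd_data. A stdlib logger
# stands in for the project's set_logger() helper (it supports the same
# .info()/.error() calls used above); the username and item URL are placeholders.
import logging

def _demo_get_one_jd_data():
    logging.basicConfig(level=logging.INFO)
    my_lg = logging.getLogger('jd_demo')
    res = get_one_jd_data(
        username='******',                                      # placeholder account
        wait_to_deal_with_url='https://item.jd.com/1111.html',  # placeholder goods url
        my_lg=my_lg)
    if res.get('goods_id', '') != '' and res.get('msg') is None:
        # res is the processed dict produced by add_base_info_2_processed_data
        print(res['goods_id'])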
async def _get_new_jd_obj(self, index):
    if index % 10 == 0:
        # The object can't be shared any longer, otherwise driver access raises!
        try:
            del self.jd
        except:
            pass
        collect()
        self.jd = JdParse(
            logger=self.lg,
            is_real_times_update_call=True,)
async def _update_db(self):
    while True:
        self.lg = await self._get_new_logger(logger_name=get_uuid1())
        result = await self._get_db_old_data()
        if result is None:
            pass
        else:
            self.goods_index = 1
            tasks_params_list = TasksParamsListObj(
                tasks_params_list=result,
                step=self.concurrency)
            self.jd = JdParse(
                logger=self.lg,
                is_real_times_update_call=True,)
            index = 1
            while True:
                try:
                    slice_params_list = tasks_params_list.__next__()
                    # self.lg.info(str(slice_params_list))
                except AssertionError:
                    # Everything has been consumed; exit normally
                    break

                tasks = []
                for item in slice_params_list:
                    db_goods_info_obj = JDDbGoodsInfoObj(item=item, logger=self.lg)
                    self.lg.info('created task goods_id: {}'.format(db_goods_info_obj.goods_id))
                    tasks.append(self.loop.create_task(self._update_one_goods_info(
                        db_goods_info_obj=db_goods_info_obj,
                        index=index)))
                    index += 1

                await _get_async_task_result(tasks=tasks, logger=self.lg)

            self.lg.info('all data updated'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # Don't update after midnight
            await async_sleep(60 * 60 * 5.5)
        else:
            await async_sleep(5.5)

        try:
            del self.jd
        except:
            pass
        collect()
class JDUpdater(AsyncCrawler):
    """Regular jd goods updater"""
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/jd/实时更新/')
        self.sql_cli = None
        self.goods_index = 1
        # Concurrency level
        self.concurrency = 10

    async def _get_db_old_data(self):
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            result = list(self.sql_cli._select_table(sql_str=jd_select_str_1))
        except TypeError:
            self.lg.error('TypeError: db connection failed... (maybe under maintenance)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_new_jd_obj(self, index):
        if index % 10 == 0:
            # The object can't be shared any longer, otherwise driver access raises!
            try:
                del self.jd
            except:
                pass
            collect()
            self.jd = JdParse(logger=self.lg)

    async def _get_tmp_item(self, site_id, goods_id):
        tmp_item = []
        # Convert to the corresponding type when reading from the db
        if site_id == 7 or site_id == 8:
            tmp_item.append(0)
        elif site_id == 9:
            tmp_item.append(1)
        elif site_id == 10:
            tmp_item.append(2)

        tmp_item.append(goods_id)

        return tmp_item

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        '''
        Update a single jd goods' info
        :param db_goods_info_obj:
        :param index:
        :return:
        '''
        res = False
        await self._get_new_jd_obj(index=index)
        self.sql_cli = await _get_new_db_conn(db_obj=self.sql_cli, index=index, logger=self.lg)
        if self.sql_cli.is_connect_success:
            self.lg.info('------>>>| updating goods_id ({0}) | --------->>>@ index ({1})'.format(
                db_goods_info_obj.goods_id,
                index))
            tmp_item = await self._get_tmp_item(
                site_id=db_goods_info_obj.site_id,
                goods_id=db_goods_info_obj.goods_id,)
            data = self.jd.get_goods_data(goods_id=tmp_item)
            if data.get('is_delete', 1) == 1:
                self.lg.info('this goods has been taken off the shelf...')
                self.sql_cli._update_table_2(
                    sql_str=jd_update_str_2,
                    params=(str(get_shanghai_time()), tmp_item[1],),
                    logger=self.lg)
                await async_sleep(1.2)
                index += 1
                self.goods_index = index

                return db_goods_info_obj.goods_id, index

            data = self.jd.deal_with_data(goods_id=tmp_item)
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='jd',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,)
                self.jd.to_right_and_update_data(data, pipeline=self.sql_cli)
            else:
                # The returned data is empty
                pass
        else:
            self.lg.error('db connection failed; the db may be down or under maintenance')
            pass

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(1.2)      # sleep to avoid proxy detection

        return db_goods_info_obj.goods_id, index

    async def _update_db(self):
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result,
                    step=self.concurrency)
                self.jd = JdParse(logger=self.lg)
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:
                        # Everything has been consumed; exit normally
                        break

                    tasks = []
                    for item in slice_params_list:
                        db_goods_info_obj = JDDbGoodsInfoObj(item=item, logger=self.lg)
                        self.lg.info('created task goods_id: {}'.format(db_goods_info_obj.goods_id))
                        tasks.append(self.loop.create_task(self._update_one_goods_info(
                            db_goods_info_obj=db_goods_info_obj,
                            index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('all data updated'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # Don't update after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(5.5)

            try:
                del self.jd
            except:
                pass
            collect()

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
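# --- Launch sketch (assumptions labeled, not from the source) ----------------
# How this updater might be started. The event-loop wiring is inferred from the
# self.loop attribute that _update_db schedules tasks on; AsyncCrawler is
# assumed to create that loop. Adjust to the project's real bootstrap.
def main():
    updater = JDUpdater()
    # _update_db() loops forever, so run_until_complete only returns on
    # error or cancellation.
    updater.loop.run_until_complete(updater._update_db())

if __name__ == '__main__':
    main()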
def run_forever():
    while True:
        #### Real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = '''
        select SiteID, GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice
        from dbo.GoodsInfoAutoGet
        where (SiteID=7 or SiteID=8 or SiteID=9 or SiteID=10) and GETDATE()-ModfiyTime>1 and IsDelete=0 and MainGoodsID is not null
        '''
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError as e:
            print('TypeError: db connection failed... (maybe under maintenance)')
            continue

        print('------>>> below are all matching goods_id returned by the db <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('total to update:', len(result))

        print('about to start the real-time update, please wait...'.center(100, '#'))
        index = 1
        # Free memory: declared once outside the loop this would keep growing,
        # so the optimization is to declare it, then periodically delete and
        # re-create it
        jd = JdParse()
        for item in result:     # real-time update
            if index % 10 == 0:
                try:
                    del jd
                except:
                    pass
                gc.collect()
                jd = JdParse()

            if index % 50 == 0:
                # Reconnect every 50 iterations to avoid an unresponsive long-lived connection
                print('resetting and establishing a new db connection...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('new db connection established...')

            if tmp_sql_server.is_connect_success:
                print('------>>>| updating goods_id (%s) | --------->>>@ index (%d)' % (item[1], index))
                tmp_item = []
                # Convert to the corresponding type when reading from the db
                if item[0] == 7 or item[0] == 8:
                    tmp_item.append(0)
                elif item[0] == 9:
                    tmp_item.append(1)
                elif item[0] == 10:
                    tmp_item.append(2)
                tmp_item.append(item[1])

                jd.get_goods_data(goods_id=tmp_item)
                data = jd.deal_with_data(goods_id=tmp_item)

                if data != {}:
                    data['goods_id'] = item[1]
                    data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                        tmp_data=data,
                        is_delete=item[2],
                        MyShelfAndDownTime=item[3])
                    data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                        old_price=item[4],
                        old_taobao_price=item[5],
                        new_price=data['price'],
                        new_taobao_price=data['taobao_price'])
                    # print('------>>>| crawled data: ', data)
                    jd.to_right_and_update_data(data, pipeline=tmp_sql_server)
                else:
                    # The returned data is empty
                    pass
            else:
                print('db connection failed; the db may be down or under maintenance')
                pass
            index += 1
            gc.collect()
            sleep(1.2)

        print('all data updated'.center(100, '#'))
        try:
            del jd
        except:
            pass
        if get_shanghai_time().hour == 0:
            # Don't update after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
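# --- Mapping note (sketch, grounded in the branches above) -------------------
# The if/elif branch above rewrites the db SiteID into the index JdParse
# expects: SiteID 7/8 (both treated as regular jd, item.jd.com) -> 0,
# 9 (jd global, item.jd.hk) -> 1, 10 (jd pharmacy, item.yiyaojd.com) -> 2.
# A table-driven equivalent; JD_SITE_ID_MAP and to_tmp_item are hypothetical
# helpers, not part of the source.
JD_SITE_ID_MAP = {7: 0, 8: 0, 9: 1, 10: 2}

def to_tmp_item(site_id, goods_id):
    # Same shape as tmp_item in the loops above: [jd_type_index, goods_id]
    return [JD_SITE_ID_MAP[site_id], goods_id]

assert to_tmp_item(9, '1111') == [1, '1111']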
def get_all_user_and_their_recommend_goods_list(self):
    for index in range(1, 100):
        # time.time().__round__() rounds to the ones place
        t = str(time.time().__round__()) + str(randint(100, 999))
        # The daren (influencer) recommendations address (ajax request)
        tmp_url = 'https://wq.jd.com/shopgroup_feed/GetDarenFeeds?pageno={}&pagesize=5&darenType=0&perDarenFeedNum=3&totalpage=1&_={}&callback=jsonpCBKC&g_ty=ls'.format(
            str(index),
            t)

        self.from_ip_pool_set_proxy_ip_to_phantomjs()
        self.driver.set_page_load_timeout(15)   # set to 15s to avoid bad data
        try:
            self.driver.get(tmp_url)
            self.driver.implicitly_wait(15)
        except Exception as e:
            # On timeout, stop the load and continue with the rest
            print('-->>time out after 15 seconds when loading page')
            # When the page load exceeds the limit, stop it via JavaScript so
            # the following actions can still run
            self.driver.execute_script('window.stop()')

        body = self.driver.page_source
        body = re.compile(r'\n').sub('', body)
        body = re.compile(r'\t').sub('', body)
        body = re.compile(r' ').sub('', body)
        body = re.compile(r'square\((.*)\)').findall(body)
        if body != []:
            body = body[0]
            try:
                data = json.loads(body)
            except:
                print('json.loads failed to convert body to data!')
                return []

            if data.get('user_list') is None:
                # No more data; the response is square({"errmsg":"","iRet":0,"totalnum":347})
                print('user_list obtained from body is None!')
                pass
            else:
                user_list = data.get('user_list', [])
                for item in user_list:
                    # Daren nickname
                    nick_name = item.get('nickname', '')
                    # Daren avatar
                    head_url = item.get('headurl', '')
                    head_url = re.compile(r'http:').sub('', head_url)
                    if re.compile(r'^http').findall(head_url) != []:
                        pass
                    else:
                        head_url = 'http:' + head_url
                    # Signature
                    profile = item.get('profile', '')

                    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    sql_str = r'select SiteID, GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=7 or SiteID=8 or SiteID=9 or SiteID=10'
                    _ = my_pipeline._select_table(sql_str=sql_str)
                    db_goods_id = [j[1] for j in list(_)] if _ is not None else []
                    sql_str = r'select share_id from dbo.jd_youxuan_daren_recommend'
                    db_share_id = [j[0] for j in list(my_pipeline._select_table(sql_str=sql_str))]

                    jd = JdParse()
                    # The goods info recommended by the daren
                    feed_list = item.get('feed_list', [])
                    for feed_list_item in feed_list:
                        if feed_list_item.get('shareid', '') in db_share_id:
                            print('share_id({}) already in db, skipping!'.format(feed_list_item.get('shareid', '')))
                            pass
                        else:
                            # share_id
                            share_id = feed_list_item.get('shareid', '')
                            article_url = 'https://wqs.jd.com/shoppingv2/detail.html?shareid=' + share_id
                            print('------>>>| crawling the jd youxuan daren article at: ',
                                  'https://wqs.jd.com/shoppingv2/detail.html?shareid=' + share_id)

                            # Image info
                            tmp_share_img_url_list = []
                            for item2 in feed_list_item.get('sharepicurl', '').split(','):
                                if re.compile(r'^//').findall(item2) == []:
                                    tmp_share_img_url = 'https://img14.360buyimg.com/evalpic/s800x800_' + item2
                                else:
                                    tmp_share_img_url = 'http:' + item2
                                tmp_share_img_url_list.append(tmp_share_img_url)
                            share_img_url_list = [{'img_url': item5} for item5 in tmp_share_img_url_list]

                            # Build the div holding the daren's own photos
                            tmp_img_div_desc = ''
                            for item4 in tmp_share_img_url_list:
                                tmp_img_div = r'<img src="{}" style="height:auto;width:100%;"/>'.format(item4)
                                tmp_img_div_desc += tmp_img_div
                            my_img_div = '<div>' + tmp_img_div_desc + '</div>'

                            # Get the goods_id and first_text
                            share_url = 'https://wq.jd.com/shopgroup_feed/FeedDetail?shareid=' + feed_list_item.get('shareid', '') + '&g_tk=1975813451'
                            try:
                                self.from_ip_pool_set_proxy_ip_to_phantomjs()
                                self.driver.get(share_url)
                                self.driver.implicitly_wait(15)
                            except Exception as e:
                                # On timeout, stop the load and continue with the rest
                                print('-->>time out after 15 seconds when loading page')
                                self.driver.execute_script('window.stop()')

                            feed_detail_body = self.driver.page_source
                            feed_detail_body = re.compile(r'\n').sub('', feed_detail_body)
                            feed_detail_body = re.compile(r'\t').sub('', feed_detail_body)
                            feed_detail_body = re.compile(r' ').sub('', feed_detail_body)
                            feed_data = re.compile(r'square\((.*)\)').findall(feed_detail_body)
                            if feed_data != []:
                                feed_data = feed_data[0]
                                try:
                                    feed_data = json.loads(feed_data)
                                except:
                                    print('json.loads failed on feed_data, skipping!')
                                    break   # after breaking out, the outer else below runs

                                # Article title
                                title = feed_data.get('feeddata', {}).get('title', '')
                                title = re.compile(r'12.12').sub('', title)
                                # The daren's comment content
                                tmp_comment_content = feed_data.get('feeddata', {}).get('commentcontent', '')
                                tmp_comment_content = re.compile(r'&').sub('', tmp_comment_content)
                                tmp_comment_content = re.compile(r'\n').sub('', tmp_comment_content)
                                tmp_comment_content = re.compile(r'12.12').sub('', tmp_comment_content)
                                tmp_comment_content = re.compile(r'11.11').sub('', tmp_comment_content)
                                comment_content = tmp_comment_content
                                if title == '':
                                    # title came back empty, so set title = comment_content
                                    # and comment_content = ''
                                    title = comment_content
                                    comment_content = ''

                                # first_text (the first paragraph of the article)
                                first_text = feed_data.get('feeddata', {}).get('firsttext', '')
                                first_text = re.compile(r'12.12').sub('', first_text)
                                first_text = re.compile(r'11.11').sub('', first_text)

                                sku_id = feed_data.get('feeddata', {}).get('skuid')
                                if sku_id == '0':
                                    # sku_id == '0' means there is no sku_id
                                    sku_id = ''

                                share_id = feed_list_item.get('shareid', '')
                                tmp_div_body_dict = self.get_div_body(share_id=share_id)
                                if tmp_div_body_dict['sku_info'] == [] and sku_id != '':
                                    # sku_info == [] means the second part has no goods_id,
                                    # so assign the first sku_id to sku_info
                                    goods_id_list = [{'goods_id': sku_id}]
                                else:
                                    # The goods_id list recommended by this article
                                    # (the first one is the goods_id used when there is no div_body)
                                    goods_id_list = [{'goods_id': item6} for item6 in tmp_div_body_dict['sku_info']]
                                tmp_div_body = '<div>' + '<h3>{}</h3>'.format(title) + '<p>{}</p>'.format(comment_content) + my_img_div + tmp_div_body_dict['div_body']
                            else:
                                print('failed to get feed_data!')
                                return []

                            # Post-processing
                            if comment_content == '':
                                comment_content = first_text

                            '''
                            Timezone handling: normalize to Shanghai time
                            '''
                            tz = pytz.timezone('Asia/Shanghai')     # build the timezone object
                            now_time = datetime.datetime.now(tz)
                            # Truncate to seconds and drop the timezone info
                            now_time = re.compile(r'\..*').sub('', str(now_time))
                            # Convert the str back to a datetime
                            now_time = datetime.datetime.strptime(now_time, '%Y-%m-%d %H:%M:%S')
                            create_time = now_time      # creation time

                            result = {
                                'nick_name': nick_name,                     # daren nickname
                                'head_url': head_url,                       # daren avatar
                                'profile': profile,                         # signature
                                'share_id': share_id,                       # the shared share_id
                                'article_url': article_url,                 # original article url
                                'title': title,                             # article title
                                'comment_content': comment_content,         # the daren's comment content
                                'share_img_url_list': share_img_url_list,   # list of the daren's photos
                                # 'first_text': first_text,                 # first paragraph of the article
                                'goods_id_list': goods_id_list,             # all goods_id recommended in the article
                                'div_body': tmp_div_body,                   # article body div
                                'create_time': create_time,                 # article creation time
                            }
                            print(result)

                            params = self._get_db_insert_params(item=result)
                            sql_str = r'insert into dbo.jd_youxuan_daren_recommend(nick_name, head_url, profile, share_id, gather_url, title, comment_content, share_img_url_list, goods_id_list, div_body, create_time) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
                            my_pipeline._insert_into_table(sql_str=sql_str, params=params)

                            print('about to crawl all goods recommended in this article'.center(30, '-'))
                            for i in goods_id_list:
                                if i.get('goods_id', '') in db_goods_id:
                                    print('goods_id({}) already in db, skipping!'.format(i.get('goods_id', '')))
                                    pass
                                else:
                                    tmp_goods_id_url = 'https://item.jd.com/' + i.get('goods_id', '') + '.html'
                                    goods_id = jd.get_goods_id_from_url(jd_url=tmp_goods_id_url)
                                    jd.get_goods_data(goods_id=goods_id)
                                    tmp_jd_data = jd.deal_with_data(goods_id=goods_id)
                                    tmp_jd_data['spider_url'] = tmp_goods_id_url
                                    tmp_jd_data['username'] = '******'
                                    tmp_jd_data['goods_id'] = goods_id[1]
                                    jd.insert_into_jd_table(data=tmp_jd_data, pipeline=my_pipeline)
                            print('all goods recommended in this article crawled'.center(30, '-'))
        else:
            print('body is an empty list!')
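# --- JSONP unwrapping sketch (assumptions labeled) ---------------------------
# The feeds endpoint above returns its JSON wrapped in a callback, e.g.
# square({...}), which the regex strips before json.loads. This mirrors that
# extraction on a fabricated response body of the shape quoted in the comments.
import json
import re

body = 'square({"iRet": 0, "errmsg": "", "user_list": []})'     # made-up sample
payload = re.compile(r'square\((.*)\)').findall(body)
if payload != []:
    data = json.loads(payload[0])
    print(data.get('user_list'))    # -> []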
def _jd_keywords_spider(self, **kwargs):
    '''
    jd crawling for the given keyword
    :param kwargs:
    :return:
    '''
    goods_id_list = kwargs.get('goods_id_list')
    keyword_id = kwargs.get('keyword_id')
    # The initial address can simply be [https://item.jd.com/xxxxx.html], since
    # jd redirects to the correct address; this address can also be stored as-is.
    # So jd goods are not stored by category here; everything is stored as a
    # regular goods with site_id = 7
    goods_url_list = ['https://item.jd.com/{0}.html'.format(str(item)) for item in goods_id_list]

    self.my_lg.info('about to crawl the goods for this keyword, please wait...')

    for item in goods_url_list:     # item is a goods_url
        result = False      # whether this goods got inserted into the db
        try:
            goods_id = re.compile(r'\/(\d+)\.html').findall(item)[0]
        except IndexError:
            self.my_lg.error('error while extracting goods_id via re, please check!')
            continue

        if goods_id in self.db_existed_goods_id_list:
            self.my_lg.info('goods_id[{0}] already in db!'.format(goods_id))
            result = True   # the already-existing case
            pass
        else:
            jd = JdParse(logger=self.my_lg)
            if self.add_goods_index % 20 == 0:
                # Reconnect every 20 iterations to avoid an unresponsive long-lived connection
                self.my_lg.info('resetting and establishing a new db connection...')
                self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                self.my_lg.info('new db connection established...')

            if self.my_pipeline.is_connect_success:
                goods_id = jd.get_goods_id_from_url(item)
                if goods_id == []:
                    self.my_lg.error('@@@ original goods url: {0}'.format(item))
                    continue
                else:
                    self.my_lg.info('------>>>| updating goods_id (%s) | --------->>>@ index (%s)' % (
                        goods_id[1], str(self.add_goods_index)))
                    tt = jd.get_goods_data(goods_id)
                    data = jd.deal_with_data(goods_id)
                    goods_id = goods_id[1]
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['username'] = '******'
                        data['main_goods_id'] = None
                        data['goods_url'] = item

                        result = jd.old_jd_goods_insert_into_new_table(data, self.my_pipeline)
                    else:
                        pass
            else:
                self.my_lg.info('db connection failed; the db may be down or under maintenance')
                pass
            self.add_goods_index += 1
            sleep(1)
            try:
                del jd
            except:
                pass
            gc.collect()

        if result:      # only when the goods_id was inserted or already existed in db
            self._insert_into_goods_id_and_keyword_middle_table(
                goods_id=goods_id,
                keyword_id=keyword_id)
        else:
            pass

    self.my_lg.info('goods for this keyword all crawled!')

    return True
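# --- Call sketch (assumptions labeled, not from the source) ------------------
# How _jd_keywords_spider might be invoked. The owning spider class, its
# db_existed_goods_id_list / add_goods_index / my_pipeline attributes, and the
# id values below are assumptions inferred from the method body.
spider = SomeKeywordsSpider()           # hypothetical owning class
spider._jd_keywords_spider(
    goods_id_list=['1111', '2222'],     # placeholder jd goods ids
    keyword_id=1)                       # placeholder keyword primary key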
def run_forever():
    while True:
        #### Real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_jd_all_goods_id_url())
        except TypeError as e:
            print('TypeError: db connection failed... (maybe under maintenance)')
            result = None

        if result is None:
            pass
        else:
            print('------>>> below are all matching goods_id returned by the db <<<------')
            print(result)
            print('--------------------------------------------------------')
            print('about to start the real-time update, please wait...'.center(100, '#'))

            index = 1
            for item in result:     # real-time update
                data = {}
                # Free memory: declared once outside the loop this would keep
                # growing, so declare it here, then delete and release
                jd = JdParse()
                if index % 50 == 0:
                    # Reconnect every 50 iterations to avoid an unresponsive long-lived connection
                    print('resetting and establishing a new db connection...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('new db connection established...')

                if tmp_sql_server.is_connect_success:
                    print('------>>>| updating goods_id (%s) | --------->>>@ index (%d)' % (item[1], index))
                    tmp_item = []
                    # Convert to the corresponding type when reading from the db
                    if item[0] == 7 or item[0] == 8:
                        tmp_item.append(0)
                    elif item[0] == 9:
                        tmp_item.append(1)
                    elif item[0] == 10:
                        tmp_item.append(2)
                    tmp_item.append(item[1])

                    jd.get_goods_data(goods_id=tmp_item)
                    data = jd.deal_with_data(goods_id=tmp_item)

                    if data != {}:
                        data['goods_id'] = item[1]

                        '''
                        Record the goods' latest on/off-shelf state change time
                        '''
                        # 1. is_delete 0 -> 1 is the off-shelf time down_time
                        # 2. is_delete 1 -> 0 is the on-shelf time shelf_time
                        my_shelf_and_down_time = {
                            'shelf_time': '',
                            'down_time': '',
                        }
                        if data['is_delete'] != item[2]:
                            if data['is_delete'] == 0 and item[2] == 1:
                                # is_delete 0 -> 1: the goods went from on-shelf to off-shelf
                                my_shelf_and_down_time['down_time'] = str(get_shanghai_time())
                            else:
                                # is_delete 1 -> 0: the goods went from off-shelf to on-shelf
                                my_shelf_and_down_time['shelf_time'] = str(get_shanghai_time())
                        else:
                            if item[3] is None \
                                    or item[3] == '{"shelf_time": "", "down_time": ""}' \
                                    or len(item[3]) == 35:  # 35 is the length of the initial str
                                if data['is_delete'] == 0:  # the on-shelf state
                                    my_shelf_and_down_time['shelf_time'] = str(get_shanghai_time())
                                else:                       # the off-shelf state
                                    my_shelf_and_down_time['down_time'] = str(get_shanghai_time())
                            else:
                                # Otherwise keep the original value unchanged
                                tmp_shelf_and_down_time = item[3]
                                my_shelf_and_down_time = json.loads(tmp_shelf_and_down_time)    # convert to dict first

                        data['my_shelf_and_down_time'] = my_shelf_and_down_time
                        # print('------>>>| crawled data: ', data)
                        jd.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:
                        # The returned data is empty
                        pass
                else:
                    print('db connection failed; the db may be down or under maintenance')
                    pass
                index += 1
                gc.collect()

        print('all data updated'.center(100, '#'))
        if get_shanghai_time().hour == 0:
            # Don't update after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
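# --- Worked check (grounded in the branch above) -----------------------------
# Why `len(item[3]) == 35`: the empty initial shelf/down record serializes to
# exactly 35 characters with json.dumps defaults (insertion-ordered keys,
# Python 3.7+), which is the "initial str" the code tests for.
import json

initial = {'shelf_time': '', 'down_time': ''}
s = json.dumps(initial)
assert s == '{"shelf_time": "", "down_time": ""}'
assert len(s) == 35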
def run_forever():
    while True:
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/jd/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)

        #### Real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        # and GETDATE()-ModfiyTime>1 and IsDelete=0
        try:
            result = list(tmp_sql_server._select_table(sql_str=jd_select_str_1))
        except TypeError:
            my_lg.error('TypeError: db connection failed... (maybe under maintenance)')
            continue

        my_lg.info('------>>> below are all matching goods_id returned by the db <<<------')
        my_lg.info(str(result))
        my_lg.info('--------------------------------------------------------')
        my_lg.info('total to update: {}'.format(len(result)))

        my_lg.info('about to start the real-time update, please wait...'.center(100, '#'))
        index = 1
        # Free memory: declared once outside the loop this would keep growing,
        # so the optimization is to declare it, then periodically delete and
        # re-create it
        jd = JdParse(logger=my_lg)
        for item in result:     # real-time update
            if index % 10 == 0:
                try:
                    del jd
                except:
                    pass
                gc.collect()
                jd = JdParse(logger=my_lg)

            if index % 50 == 0:
                # Reconnect every 50 iterations to avoid an unresponsive long-lived connection
                my_lg.info('resetting and establishing a new db connection...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                my_lg.info('new db connection established...')

            if tmp_sql_server.is_connect_success:
                my_lg.info('------>>>| updating goods_id ({}) | --------->>>@ index ({})'.format(item[1], index))
                tmp_item = []
                # Convert to the corresponding type when reading from the db
                if item[0] == 7 or item[0] == 8:
                    tmp_item.append(0)
                elif item[0] == 9:
                    tmp_item.append(1)
                elif item[0] == 10:
                    tmp_item.append(2)
                tmp_item.append(item[1])

                jd.get_goods_data(goods_id=tmp_item)
                data = jd.deal_with_data(goods_id=tmp_item)

                if data != {}:
                    data['goods_id'] = item[1]
                    data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                        tmp_data=data,
                        is_delete=item[2],
                        shelf_time=item[5],
                        delete_time=item[6])
                    my_lg.info('shelf_time: {0}, delete_time: {1}'.format(data['shelf_time'], data['delete_time']))
                    data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                        old_price=item[3],
                        old_taobao_price=item[4],
                        new_price=data['price'],
                        new_taobao_price=data['taobao_price'])

                    site_id = jd._from_jd_type_get_site_id_value(jd_type=data['jd_type'])
                    try:
                        old_sku_info = format_price_info_list(
                            price_info_list=json_2_dict(item[7]),
                            site_id=site_id)
                    except AttributeError:      # handle values that were already formatted
                        old_sku_info = item[7]
                    data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                        old_sku_info=old_sku_info,
                        new_sku_info=format_price_info_list(data['price_info_list'], site_id=site_id),
                        is_price_change=item[8] if item[8] is not None else 0)

                    jd.to_right_and_update_data(data, pipeline=tmp_sql_server)
                else:
                    # The returned data is empty
                    pass
            else:
                my_lg.error('db connection failed; the db may be down or under maintenance')
                pass
            index += 1
            gc.collect()
            sleep(1.2)

        my_lg.info('all data updated'.center(100, '#'))
        try:
            del jd
        except:
            pass
        if get_shanghai_time().hour == 0:
            # Don't update after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()