def weitao_spider():
    global loop
    if not my_queue.empty():
        taobao_short_url = my_queue.get()
        taobao_short_url_uuid = str(
            uuid.uuid5(uuid.NAMESPACE_DNS, taobao_short_url))
        print(taobao_short_url_uuid)
        print(old_message_url_uuid_list)
        if taobao_short_url_uuid in old_message_url_uuid_list:
            return False

        print('Got a pending url:', taobao_short_url)
        weitao = TaoBaoWeiTaoShareParse()
        try:
            loop.run_until_complete(
                weitao._deal_with_api_info(taobao_short_url))
        except RuntimeError:
            pass
        try:
            del weitao
            # loop.close()  # reuse the loop
        except Exception:
            pass
        gc.collect()
        restart_program()  # restart the process to avoid duplicate log output

        return True
    else:
        print('Queue is empty!')
        return False
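# restart_program() is called throughout these snippets to restart the interpreter so that
# the re-created loggers do not accumulate duplicate handlers. Its definition is not part of
# this excerpt; a minimal sketch, assuming it simply re-executes the current script:
import os
import sys


def restart_program_sketch():
    # hypothetical stand-in for the repo's restart_program(): replace the current process
    # with a fresh interpreter running the same script and arguments
    python = sys.executable
    os.execl(python, python, *sys.argv)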
def just_fuck_run():
    '''Running this as a daemon process does not work, so it runs inside tmux instead;
    crawl hours are restricted to avoid collection conflicts.'''
    _spider_run_time = ['00', '01', '02', '03', '04', '05']
    while True:
        if str(get_shanghai_time())[11:13] in _spider_run_time:
            while True:
                if str(get_shanghai_time())[11:13] not in _spider_run_time:
                    print('Conflicting time slot, not crawling..., Shanghai time %s' % str(get_shanghai_time()))
                    sleep(60 * 5)
                    break

                print('A full crawl is about to start'.center(30, '-'))
                taobao_qianggou = TaoBaoQiangGou()
                loop = asyncio.get_event_loop()
                loop.run_until_complete(
                    taobao_qianggou._deal_with_all_goods_id())
                try:
                    del taobao_qianggou
                    loop.close()
                except Exception:
                    pass
                gc.collect()
                print('Full crawl finished, about to restart'.center(30, '-'))
                restart_program()  # restart the process to avoid duplicate log output
                sleep(60 * 30)
        else:
            print('Outside the scheduled run window... sleeping, Shanghai time %s' % str(get_shanghai_time()))
            sleep(60 * 2)
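# get_shanghai_time() drives every time-window check above via str(get_shanghai_time())[11:13],
# i.e. the hour field of the "YYYY-MM-DD HH:MM:SS..." string. Its definition is not included in
# this excerpt; a minimal sketch, assuming it returns the current datetime in Asia/Shanghai:
from datetime import datetime, timedelta, timezone


def get_shanghai_time_sketch():
    # hypothetical stand-in for the repo's get_shanghai_time(); fixed UTC+8 offset, no DST handling
    return datetime.now(timezone(timedelta(hours=8)))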
def main_2():
    while True:
        loop = asyncio.get_event_loop()
        loop.run_until_complete(run_forever())
        try:
            loop.close()
        except Exception:
            pass
        gc.collect()
        restart_program()
def just_fuck_run():
    while True:
        print('A full crawl is about to start'.center(30, '-'))
        jumeiyoupin_pintuan = JuMeiYouPinPinTuan()
        loop = asyncio.get_event_loop()
        loop.run_until_complete(jumeiyoupin_pintuan.deal_with_data())
        try:
            del jumeiyoupin_pintuan
            loop.close()
        except Exception:
            pass
        gc.collect()
        print('Full crawl finished, about to restart'.center(30, '-'))
        restart_program()  # restart the process to avoid duplicate log output
def just_fuck_run():
    while True:
        print('A full crawl is about to start'.center(30, '-'))
        taobao_qianggou = TaoBaoQiangGouRealTimesUpdate()
        loop = asyncio.get_event_loop()
        loop.run_until_complete(taobao_qianggou._run_forever())
        try:
            del taobao_qianggou
            loop.close()
        except Exception:
            pass
        gc.collect()
        print('Full crawl finished, about to restart'.center(30, '-'))
        restart_program()  # restart the process to avoid duplicate log output
        sleep(60 * 10)
def just_fuck_run():
    while True:
        print('A full crawl is about to start'.center(30, '-'))
        taobao_tiantaintejia = TaoBaoTianTianTeJia()
        loop = asyncio.get_event_loop()
        loop.run_until_complete(taobao_tiantaintejia.deal_with_all_goods_id())
        try:
            del taobao_tiantaintejia
            loop.close()
        except Exception:
            pass
        gc.collect()
        print('Full crawl finished, about to restart'.center(30, '-'))
        restart_program()  # restart the process to avoid duplicate log output
        sleep(60 * 5)
def just_fuck_run():
    while True:
        print('A full update is about to start'.center(30, '-'))
        tmp = JuMeiYouPinRealTimesUpdate()
        loop = asyncio.get_event_loop()
        # NOTE: make sure the coroutine passed to run_until_complete() returns a value,
        # otherwise it is treated as unfinished and the results get printed repeatedly
        loop.run_until_complete(tmp.run_forever())
        print('Damn, finally finished running')
        try:
            del tmp
            loop.close()
        except Exception:
            pass
        gc.collect()
        print('Full update finished'.center(30, '-'))
        restart_program()  # restart the process to avoid duplicate log output
def run_forever():
    """Real-time data update."""
    while True:
        # NOTE: do not make this a global defined outside the loop,
        # otherwise everything keeps being written to the same (dated) log file
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)

        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # manage the database connection pool via sqlalchemy
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server.select_taobao_all_goods_id()
        except TypeError:
            my_lg.error('TypeError: database connection failed... (possibly under maintenance)')
            result = None

        if result is None:
            pass
        else:
            my_lg.info('------>>> all matching goods_id returned by the database <<<------')
            my_lg.info(str(result))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('About to start the real-time update, please wait...'.center(100, '#'))

            index = 1
            for item in result:  # real-time update
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:  # reconnect every 50 items to avoid errors from a single long-idle connection
                    my_lg.info('Resetting and establishing a new database connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()
                    my_lg.info('New database connection established...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| updating goods_id (%s) | --------->>>@ index (%s)'
                        % (item[0], str(index)))
                    data = taobao.get_goods_data(item[0])

                    if data.get('is_delete') == 1:  # handle goods that were already off-shelf when originally inserted
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            MyShelfAndDownTime=item[2])
                        # my_lg.info('------>>>| scraped data: ' + str(data))
                        taobao.to_right_and_update_data(data, pipeline=tmp_sql_server)

                        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # avoid updating the server too frequently
                        index += 1
                        gc.collect()
                        continue

                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            MyShelfAndDownTime=item[2])
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[3],
                            old_taobao_price=item[4],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'])
                        # my_lg.info('------>>>| scraped data: ' + str(data))
                        taobao.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:
                        my_lg.info('------>>>| sleeping 5s...')
                        sleep(5)
                else:  # the returned data is empty
                    my_lg.error('Database connection failed; the database may be down or under maintenance')
                    sleep(10)
                    pass
                index += 1
                # try:
                #     del taobao
                # except:
                #     pass
                gc.collect()
                # on an overseas server this can be shortened, even set to 0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # keep the rate low and try to stay clear of user requests
            my_lg.info('All data updated'.center(100, '#'))

        # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after 0:00
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()
        return data

    def __del__(self):
        try:
            del self.my_lg
            del self.msg
            del self.my_pipeline
        except Exception:
            pass
        gc.collect()

# _short_url = 'http://m.tb.cn/h.WAjz5RP'
# _short_url = 'http://m.tb.cn/h.WA6JGoC'
_short_url = 'http://m.tb.cn/h.WA6Hp6H'

if __name__ == '__main__':
    while True:
        taobao_short_url = input('Enter a Taobao short link: ').replace(';', '')
        weitao = TaoBaoWeiTaoShareParse()
        loop = asyncio.get_event_loop()
        loop.run_until_complete(weitao._deal_with_api_info(taobao_short_url))
        try:
            del weitao
            loop.close()
        except Exception:
            pass
        gc.collect()
        restart_program()  # restart the process to avoid duplicate log output