    def set_wait_to_deal_with_url(self, wait_to_deal_with_url):
        '''
        Set the url to be crawled
        :param wait_to_deal_with_url:
        :return:
        '''
        self.wait_to_deal_with_url = wait_to_deal_with_url


if __name__ == '__main__':
    login_ali = LoginAndParse()
    login_ali.get_qrcode_url()
    login_ali.login()
    login_ali.set_self_driver_with_phantomjs()  # must stay outside the loop, otherwise a new phantomjs process is spawned on every iteration

    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    result = list(tmp_sql_server.select_ali_1688_all_goods_id())
    print('------>>> all matching goods_id returned by the database <<<------')
    print(result)
    print('--------------------------------------------------------')

    # while True:
    print('About to start the real-time update, please be patient...'.center(100, '#'))
    for item in result:     # update the data in real time
        tmp_url = 'https://detail.1688.com/offer/' + str(item[0]) + '.html'
        wait_to_deal_with_url = tmp_url
        login_ali.set_wait_to_deal_with_url(wait_to_deal_with_url)

        data = login_ali.deal_with_page_url()
        if data:
            data['goods_id'] = item[0]
            data['deal_with_time'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
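# --- Hedged sketch (illustration only, not part of the repo) -----------------
# Each row returned by select_ali_1688_all_goods_id() is indexed positionally,
# with item[0] holding the goods_id, and the detail-page url is rebuilt inline
# above. A tiny helper makes that construction explicit; the name
# build_offer_url is hypothetical.
def build_offer_url(goods_id):
    """Build the 1688 detail-page url for a goods_id, mirroring the inline concatenation above."""
    return 'https://detail.1688.com/offer/' + str(goods_id) + '.html'

# e.g. build_offer_url(123456789) -> 'https://detail.1688.com/offer/123456789.html'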
def run_forever():
    while True:     #### update the data in real time
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_ali_1688_all_goods_id())
        except TypeError as e:
            print('TypeError: the database connection failed... (it may be under maintenance)')
            result = None

        if result is None:
            pass
        else:
            print('------>>> all matching goods_id returned by the database <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('About to start the real-time update, please be patient...'.center(100, '#'))
            index = 1
            # Memory optimization: a single long-lived parser instance keeps growing,
            # so it is re-created periodically inside the loop and the old one released
            ali_1688 = ALi1688LoginAndParse()
            for item in result:     # update the data in real time
                data = {}
                if index % 5 == 0:
                    ali_1688 = ALi1688LoginAndParse()

                if index % 50 == 0:     # reconnect every 50 iterations to avoid a stale long-lived connection timing out
                    print('Resetting and establishing a new database connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('New database connection established successfully...')

                if tmp_sql_server.is_connect_success:
                    print('------>>>| Updating goods_id (%s) | --------->>>@ index (%d)' % (item[0], index))
                    data = ali_1688.get_ali_1688_data(item[0])
                    if isinstance(data, int):   # handle the case where the return value is the int 4041 separately
                        continue
                    else:
                        pass

                    if data.get('is_delete') == 1:   # handle goods that were already off-shelf when originally inserted
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            MyShelfAndDownTime=item[2]
                        )
                        # print('------>>>| scraped data: ', data)
                        ali_1688.to_right_and_update_data(data, pipeline=tmp_sql_server)

                        sleep(1.5)  # avoid hitting the server too frequently
                        index += 1
                        gc.collect()
                        continue

                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            MyShelfAndDownTime=item[2]
                        )
                        # print('------>>>| scraped data: ', data)
                        ali_1688.to_right_and_update_data(data, pipeline=tmp_sql_server)

                        sleep(.3)   # avoid hitting the server too frequently
                    else:   # the returned data is empty
                        pass
                else:   # the database connection failed
                    print('Database connection failed; the database may be down or under maintenance')
                    pass

                index += 1
                # try:
                #     del ali_1688
                # except:
                #     pass
                gc.collect()
                sleep(2.2)
            print('All data updated'.center(100, '#'))

        # sleep(60*60)
        if get_shanghai_time().hour == 0:   # do not update after 0:00
            sleep(60*60*5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()
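# --- Hedged sketch (illustration only, not the repo's actual helper) ---------
# The scheduler above gates the long pause on get_shanghai_time().hour == 0,
# i.e. it stops updating after midnight China time. The real get_shanghai_time()
# is imported from elsewhere in the repo; a minimal stand-in, assuming it only
# needs to return an aware datetime in UTC+8 (Asia/Shanghai has no DST), could be:
from datetime import datetime, timedelta, timezone

def get_shanghai_time_sketch():
    """Return the current time as an aware datetime in UTC+8."""
    return datetime.now(timezone(timedelta(hours=8)))

# e.g. get_shanghai_time_sketch().hour == 0 would trigger the 5.5-hour sleep above.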
def run_forever():
    while True:     #### update the data in real time
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_ali_1688_all_goods_id())
            result_2 = list(tmp_sql_server.select_old_table_all_goods_id())
            # print(result_2)
        except TypeError as e:
            print('TypeError: the database connection failed... (it may be under maintenance)')
            result = None

        if result is None:
            pass
        else:
            print('------>>> all matching goods_id returned by the database <<<------')
            print(result_2)
            print('--------------------------------------------------------')

            print('About to start the real-time update, please be patient...'.center(100, '#'))
            index = 1
            # Memory optimization: a single long-lived parser instance keeps growing,
            # so it is re-created periodically inside the loop and the old one released
            ali_1688 = ALi1688LoginAndParse()

            # new table GoodsInfoAutoGet
            new_table_ali_1688_all_goods_id_list = list(set([item[0] for item in result]))  # goods_id already in the new table
            print(new_table_ali_1688_all_goods_id_list)
            sleep(2)

            # old table
            old_table_ali_1688_all_goods_list = []
            for item in result_2:
                tmp_goods_id = ali_1688.get_goods_id_from_url(item[0])
                if tmp_goods_id != '' and tmp_goods_id not in new_table_ali_1688_all_goods_id_list:
                    old_table_ali_1688_all_goods_list.append([
                        'https://detail.1688.com/offer/' + tmp_goods_id + '.html',
                        item[1],
                        tmp_goods_id,
                    ])
                else:
                    print('@@@ original url: ', item[0])
            # print(old_table_ali_1688_all_goods_list)
            print('Number of old-table rows waiting to be migrated: ', len(old_table_ali_1688_all_goods_list))
            sleep(2)

            for item in old_table_ali_1688_all_goods_list:  # update the data in real time
                data = {}
                if index % 10 == 0:
                    ali_1688 = ALi1688LoginAndParse()

                if index % 50 == 0:     # reconnect every 50 iterations to avoid a stale long-lived connection timing out
                    print('Resetting and establishing a new database connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('New database connection established successfully...')

                if tmp_sql_server.is_connect_success:
                    goods_id = str(item[2])
                    # print(goods_id)
                    if goods_id in new_table_ali_1688_all_goods_id_list:
                        print('This goods_id already exists in the database, skipping!')
                        index += 1
                        gc.collect()
                        continue    # skip the sleep
                    else:
                        try:    # duplicates keep showing up, so check the table separately
                            is_in_db = list(tmp_sql_server.select_the_goods_id_is_in_ali_1688_table(goods_id=goods_id))
                        except:
                            is_in_db = []
                            pass
                        if is_in_db != []:
                            print('This goods_id already exists in the database, skipping!')
                            index += 1
                            gc.collect()
                            continue

                        print('------>>>| Inserting goods_id (%s) | --------->>>@ index (%d)' % (goods_id, index))
                        tt = ali_1688.get_ali_1688_data(goods_id)
                        if tt.get('is_delete') == 1 and tt.get('before') is False:  # off-shelf goods still need to be inserted
                            tt['goods_id'] = goods_id
                            tt['goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                            tt['username'] = '******'
                            tt['main_goods_id'] = item[1]

                            ali_1688.old_ali_1688_goods_insert_into_new_table(data=tt, pipeline=tmp_sql_server)

                            index += 1
                            gc.collect()
                            sleep(1.2)
                            continue
                        else:
                            pass

                        data = ali_1688.deal_with_data()
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                            data['username'] = '******'
                            data['main_goods_id'] = item[1]

                            ali_1688.old_ali_1688_goods_insert_into_new_table(data=data, pipeline=tmp_sql_server)
                        else:   # the returned data is empty
                            pass
                else:   # the database connection failed
                    print('Database connection failed; the database may be down or under maintenance')
                    pass

                index += 1
                # try:
                #     del ali_1688
                # except:
                #     pass
                gc.collect()
                sleep(2)
            print('All data updated'.center(100, '#'))

        # sleep(60*60)
        if get_shanghai_time().hour == 0:   # do not update after 0:00
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()
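# --- Hedged sketch (illustration only, not the repo's actual parser) ---------
# The migration above relies on ali_1688.get_goods_id_from_url() to pull a
# goods_id out of each old-table url; its real implementation lives elsewhere
# in the repo. Assuming old-table urls follow the
# 'https://detail.1688.com/offer/<goods_id>.html' pattern used throughout this
# file, a minimal extractor could look like this:
import re

def get_goods_id_from_url_sketch(url):
    """Return the numeric goods_id embedded in a 1688 offer url, or '' if none is found."""
    match = re.search(r'/offer/(\d+)\.html', str(url))
    return match.group(1) if match else ''

# e.g. get_goods_id_from_url_sketch('https://detail.1688.com/offer/123456789.html') -> '123456789'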