Example no. 1
0
        '''
        设置待爬取的url
        :param wait_to_deal_with_url:
        :return:
        '''
        self.wait_to_deal_with_url = wait_to_deal_with_url


if __name__ == '__main__':
    # Entry point: log in to 1688 once, then re-scrape every goods record
    # returned by the database and refresh its data.
    login_ali = LoginAndParse()
    login_ali.get_qrcode_url()
    login_ali.login()
    login_ali.set_self_driver_with_phantomjs()  # must stay outside the loop, otherwise many phantomjs processes would be spawned

    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    result = list(tmp_sql_server.select_ali_1688_all_goods_id())
    print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
    print(result)
    print('--------------------------------------------------------')

    # while True:
    print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
    for item in result:  # refresh each goods record in real time
        # item[0] is the goods_id selected from the DB (schema assumed from
        # usage below — TODO confirm against select_ali_1688_all_goods_id)
        tmp_url = 'https://detail.1688.com/offer/' + str(item[0]) + '.html'
        wait_to_deal_with_url = tmp_url
        login_ali.set_wait_to_deal_with_url(wait_to_deal_with_url)
        data = login_ali.deal_with_page_url()
        if data:
            data['goods_id'] = item[0]
            data['deal_with_time'] = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            # NOTE(review): the scraped sample appears truncated here — the
            # step that persists `data` is missing from this excerpt.
Example no. 2
0
def run_forever():
    """Continuously refresh 1688 goods data.

    Each cycle: read all goods ids from the DB, re-scrape every one and write
    the result back, then sleep — a long pause after midnight, a short one
    otherwise.  Runs until the process is killed; never returns.
    """
    while True:
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_ali_1688_all_goods_id())
        except TypeError:
            # select_* presumably returns None when the DB connection failed,
            # so list(None) raises TypeError — treat it as "no work this cycle".
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # The parser is re-created periodically below to cap its memory use.
            ali_1688 = ALi1688LoginAndParse()
            for item in result:  # item: (goods_id, is_delete, MyShelfAndDownTime)
                if index % 5 == 0:
                    # Recreate the parser every 5 items to release memory.
                    ali_1688 = ALi1688LoginAndParse()

                if index % 50 == 0:
                    # Reconnect every 50 items to avoid a stale long-lived
                    # connection going unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                    data = ali_1688.get_ali_1688_data(item[0])
                    if isinstance(data, int):
                        # An int return (e.g. 4041) signals a fetch failure —
                        # skip the item.  Unlike the original, index is still
                        # advanced so the periodic reset counters keep moving.
                        index += 1
                        gc.collect()
                        continue

                    if data.get('is_delete') == 1:
                        # Goods that were already off-shelf when first stored.
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            MyShelfAndDownTime=item[2],
                        )
                        ali_1688.to_right_and_update_data(data, pipeline=tmp_sql_server)

                        sleep(1.5)  # throttle updates against the server
                        index += 1
                        gc.collect()
                        continue

                    data = ali_1688.deal_with_data()
                    if data:
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            MyShelfAndDownTime=item[2],
                        )
                        ali_1688.to_right_and_update_data(data, pipeline=tmp_sql_server)

                        sleep(.3)  # throttle updates against the server
                    # empty dict: parse produced nothing to write for this id
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                index += 1
                gc.collect()
                sleep(2.2)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # Pause for the night: no updates after midnight.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Example no. 3
0
def run_forever():
    """Continuously migrate 1688 goods from the old table into the new one.

    Each cycle: read goods ids already present in the new table
    (GoodsInfoAutoGet) and all rows of the old table, skip anything already
    migrated, scrape the remainder and insert them into the new table, then
    sleep — a long pause after midnight, a short one otherwise.  Runs until
    the process is killed; never returns.
    """
    while True:
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_ali_1688_all_goods_id())
            result_2 = list(tmp_sql_server.select_old_table_all_goods_id())
        except TypeError:
            # select_* presumably returns None when the DB connection failed,
            # so list(None) raises TypeError — treat it as "no work this cycle".
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result_2)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # The parser is re-created periodically below to cap its memory use.
            ali_1688 = ALi1688LoginAndParse()

            # goods_ids already present in the new table (GoodsInfoAutoGet)
            new_table_ali_1688_all_goods_id_list = list(
                {item[0] for item in result})
            print(new_table_ali_1688_all_goods_id_list)
            sleep(2)

            # Old-table rows not yet migrated: [url, main_goods_id, goods_id]
            old_table_ali_1688_all_goods_list = []
            for item in result_2:
                tmp_goods_id = ali_1688.get_goods_id_from_url(item[0])
                if tmp_goods_id != '' and tmp_goods_id not in new_table_ali_1688_all_goods_id_list:
                    old_table_ali_1688_all_goods_list.append([
                        'https://detail.1688.com/offer/' + tmp_goods_id + '.html',
                        item[1],
                        tmp_goods_id,
                    ])
                else:
                    print('@@@ 原地址为: ', item[0])
            print('老表待转数据个数为: ', len(old_table_ali_1688_all_goods_list))
            sleep(2)

            for item in old_table_ali_1688_all_goods_list:
                if index % 10 == 0:
                    # Recreate the parser every 10 items to release memory.
                    ali_1688 = ALi1688LoginAndParse()

                if index % 50 == 0:
                    # Reconnect every 50 items to avoid a stale long-lived
                    # connection going unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    goods_id = str(item[2])
                    if goods_id in new_table_ali_1688_all_goods_id_list:
                        print('该goods_id已经存在于数据库中, 此处跳过!')
                        index += 1
                        gc.collect()
                        continue  # skip the trailing sleep as well

                    # Duplicates keep slipping past the in-memory check, so
                    # also ask the DB directly before inserting.
                    try:
                        is_in_db = list(
                            tmp_sql_server.select_the_goods_id_is_in_ali_1688_table(
                                goods_id=goods_id))
                    except Exception:  # narrowed from a bare except
                        is_in_db = []
                    if is_in_db:
                        print('该goods_id已经存在于数据库中, 此处跳过!')
                        index += 1
                        gc.collect()
                        continue

                    print(
                        '------>>>| 正在插入的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (goods_id, index))
                    tt = ali_1688.get_ali_1688_data(goods_id)
                    if isinstance(tt, int):
                        # An int return (e.g. 4041) signals a fetch failure;
                        # the original would crash on tt.get() below.
                        index += 1
                        gc.collect()
                        continue

                    if tt.get('is_delete') == 1 and tt.get('before') is False:
                        # Already off-shelf, but still has to be inserted.
                        tt['goods_id'] = goods_id
                        tt['goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                        tt['username'] = '******'
                        tt['main_goods_id'] = item[1]

                        ali_1688.old_ali_1688_goods_insert_into_new_table(
                            data=tt, pipeline=tmp_sql_server)

                        index += 1
                        gc.collect()
                        sleep(1.2)
                        continue

                    data = ali_1688.deal_with_data()
                    if data:
                        data['goods_id'] = goods_id
                        data['goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                        data['username'] = '******'
                        data['main_goods_id'] = item[1]

                        ali_1688.old_ali_1688_goods_insert_into_new_table(
                            data=data, pipeline=tmp_sql_server)
                    # empty dict: parse produced nothing to insert for this id
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                index += 1
                gc.collect()
                sleep(2)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # Pause for the night: no updates after midnight.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()