Code Example #1
    async def _get_new_ali_obj(self, index) -> None:
        if index % 10 == 0:
            # Do not keep sharing one object, otherwise webdriver access will raise errors!
            try:
                del self.ali_1688
            except:
                pass
            collect()
            self.ali_1688 = ALi1688LoginAndParse(logger=self.lg)
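
The recycle step above (drop the old parser, force a garbage-collection pass, then build a fresh instance every tenth item) can be reproduced in isolation. A minimal, runnable sketch, with a hypothetical DummyParser standing in for ALi1688LoginAndParse:

from gc import collect

class DummyParser:
    """Hypothetical stand-in for ALi1688LoginAndParse; imagine it holds a heavy webdriver."""
    def parse(self, goods_id):
        return {'goods_id': goods_id}

parser = DummyParser()
for index, goods_id in enumerate(['111', '222', '333'], start=1):
    if index % 10 == 0:
        # same recycle step as _get_new_ali_obj: drop the old instance,
        # force a GC pass, then create a fresh one so the driver is never shared
        del parser
        collect()
        parser = DummyParser()
    parser.parse(goods_id)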
Code Example #2
    async def _update_db(self):
        """
        常规数据实时更新
        :return:
        """
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                self.ali_1688 = ALi1688LoginAndParse(logger=self.lg)
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:  # all params consumed; exit normally
                        break

                    tasks = []
                    for item in slice_params_list:
                        goods_id = item[1]
                        db_goods_info_obj = ALDbGoodsInfoObj(item=item,
                                                             logger=self.lg)
                        self.lg.info('creating task, goods_id: {}'.format(goods_id))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(
                                    db_goods_info_obj=db_goods_info_obj,
                                    index=index)))
                        index += 1
                    if tasks != []:
                        await _get_async_task_result(tasks=tasks,
                                                     logger=self.lg)
                    else:
                        pass

                self.lg.info('All data updated'.center(100, '#'))
            if get_shanghai_time().hour == 0:  # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(5.5)
            try:
                del self.ali_1688
            except:
                pass
            collect()
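
The batching above relies on the project's TasksParamsListObj and _get_async_task_result helpers, which are not shown on this page. A minimal sketch of the same pattern (walk the params in fixed-size slices and keep only `concurrency` tasks in flight per slice) using only the standard asyncio module; update_one and update_in_batches are hypothetical stand-ins, not project code:

import asyncio

async def update_one(item, index):
    # hypothetical stand-in for _update_one_goods_info
    await asyncio.sleep(0)   # placeholder for the real network/db work
    return index, item

async def update_in_batches(params, concurrency=5):
    index = 1
    for start in range(0, len(params), concurrency):      # one slice per round, like TasksParamsListObj
        batch = params[start:start + concurrency]
        tasks = [asyncio.create_task(update_one(item, index + i))
                 for i, item in enumerate(batch)]
        index += len(batch)
        print(await asyncio.gather(*tasks))                # stands in for _get_async_task_result

asyncio.run(update_in_batches(list(range(12)), concurrency=5))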
Code Example #3
File: al.py  Project: yfeng2018/python-1
def get_one_1688_data(**kwargs):
    '''
    Fetch the data for a single 1688 url
    :param kwargs:
    :return:
    '''
    username = kwargs.get('username', '18698570079')
    wait_to_deal_with_url = kwargs.get('wait_to_deal_with_url', '')
    my_lg = kwargs.get('my_lg')

    login_ali = ALi1688LoginAndParse(logger=my_lg)
    goods_id = login_ali.get_goods_id_from_url(
        wait_to_deal_with_url)  # extract the goods_id
    if goods_id == '':  # if no goods_id can be extracted, return an error
        my_lg.info('The extracted goods_id is empty!')
        try:
            del login_ali  # recycle on every call
        except:
            pass
        gc.collect()

        return {'goods_id': ''}  # error 1: goods_id is empty

    tmp_result = login_ali.get_ali_1688_data(goods_id=goods_id)
    data = login_ali.deal_with_data()  # on success this returns a dict of parsed data
    if data == {} or tmp_result == {}:
        my_lg.info('The fetched data is empty!')
        try:
            del login_ali
        except:
            pass
        gc.collect()

        return {'goods_id': goods_id, 'msg': 'data is empty!'}  # error 2: fetch failed

    wait_to_save_data = add_base_info_2_processed_data(
        data=data,
        spider_url=wait_to_deal_with_url,
        username=username,
        goods_id=goods_id)
    try:
        del login_ali
    except:
        pass

    return wait_to_save_data
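
get_one_1688_data therefore has three return shapes: {'goods_id': ''} when no goods_id can be extracted from the url, a dict carrying a 'msg' key when the fetch or parse comes back empty, and otherwise the processed data dict. A caller-side sketch of telling those shapes apart; handle_one_1688_result is my own name, not part of the project:

def handle_one_1688_result(result: dict) -> bool:
    """Return True only when `result` looks like successfully processed goods data."""
    if result.get('goods_id') == '':
        print('error 1: no goods_id could be extracted from the url')
        return False
    if 'msg' in result:
        print('error 2: fetch/parse failed for goods_id', result['goods_id'])
        return False
    return True   # full data dict, ready to be saved

# e.g. handle_one_1688_result(get_one_1688_data(wait_to_deal_with_url=url, my_lg=my_lg))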
Code Example #4
    def _1688_keywords_spider(self, **kwargs):
        '''
        Fetch and store goods info for the given 1688 keyword
        :param kwargs:
        :return:
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = [
            'https://detail.1688.com/offer/{0}.html'.format(item)
            for item in goods_id_list
        ]

        self.my_lg.info('About to start fetching goods for this keyword, please wait...')

        for item in goods_url_list:
            result = False  # reset on every iteration
            try:
                goods_id = re.compile('offer/(.*?).html').findall(item)[0]
            except IndexError:
                self.my_lg.error('Failed to extract goods_id via regex, please check!')
                continue
            if goods_id in self.db_existed_goods_id_list:
                self.my_lg.info('goods_id [{0}] already exists in the db!'.format(goods_id))
                result = True  # it already existed in the db
                pass
            else:
                ali_1688 = ALi1688LoginAndParse(logger=self.my_lg)
                if self.add_goods_index % 20 == 0:  # reconnect every 20 iterations to avoid a stale, unresponsive long connection
                    self.my_lg.info('Resetting and establishing a new database connection...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('New database connection established...')

                if self.my_pipeline.is_connect_success:
                    goods_id = ali_1688.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.my_lg.error('@@@ original goods url: {0}'.format(item))
                        continue
                    else:
                        self.my_lg.info(
                            '------>>>| updating goods_id (%s) | --------->>>@ index (%s)'
                            % (goods_id, str(self.add_goods_index)))
                        tt = ali_1688.get_ali_1688_data(goods_id)
                        if tt.get('is_delete') == 1 and tt.get(
                                'before') is False:  # delisted item
                            # skip delisted goods here
                            continue

                        data = ali_1688.deal_with_data()
                        if data != {}:
                            data['goods_id'] = goods_id
                            data[
                                'goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                            data['username'] = '******'
                            data['main_goods_id'] = None

                            result = ali_1688.old_ali_1688_goods_insert_into_new_table(
                                data=data, pipeline=self.my_pipeline)
                        else:
                            pass

                else:  # database connection failed
                    self.my_lg.info('Database connection failed; the database may be down or under maintenance')
                    pass
                self.add_goods_index += 1
                try:
                    del ali_1688
                except:
                    pass
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:  # only when the goods_id was inserted or already existed in the db
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass

        self.my_lg.info('Finished fetching goods for this keyword!')

        return True
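
The goods_id extraction in this example uses re.compile('offer/(.*?).html') against the offer url. A standalone sketch of the same extraction; the escaped dot and digit class are my own tightening of the pattern, not the project's:

import re

OFFER_RE = re.compile(r'offer/(\d+)\.html')   # stricter variant of 'offer/(.*?).html'

def goods_id_from_offer_url(url: str) -> str:
    """Return the goods_id embedded in a detail.1688.com offer url, or '' if absent."""
    m = OFFER_RE.search(url)
    return m.group(1) if m else ''

print(goods_id_from_offer_url('https://detail.1688.com/offer/123456789.html'))  # -> 123456789
print(goods_id_from_offer_url('https://detail.1688.com/page/index.html'))       # -> ''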
Code Example #5
def run_forever():
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = 'select GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=2 order by ID desc'
        sql_str_2 = 'select GoodsOutUrl, goods_id from db_k85u.dbo.goodsinfo where OutGoodsType<=13 and onoffshelf=1 and not exists (select maingoodsid from gather.dbo.GoodsInfoAutoGet c where c.maingoodsid=goodsinfo.goods_id)'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
            result_2 = list(tmp_sql_server._select_table(sql_str=sql_str_2))
            # print(result_2)
        except TypeError:
            print('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> all matching goods_id returned by the database <<<------')
            print(result_2)
            print('--------------------------------------------------------')

            print('About to start the real-time data update, please wait...'.center(100, '#'))
            index = 1
            # memory optimisation: declaring this once outside the loop keeps a large object alive, so it is re-created inside the loop and the old instance released
            ali_1688 = ALi1688LoginAndParse()
            # new table GoodsInfoAutoGet
            new_table_ali_1688_all_goods_id_list = list(
                set([item[0] for item in result]))  # goods_id already in the new table
            print(new_table_ali_1688_all_goods_id_list)
            sleep(2)

            # old table
            old_table_ali_1688_all_goods_list = []
            for item in result_2:
                tmp_goods_id = ali_1688.get_goods_id_from_url(item[0])
                if tmp_goods_id != '' and tmp_goods_id not in new_table_ali_1688_all_goods_id_list:
                    old_table_ali_1688_all_goods_list.append([
                        'https://detail.1688.com/offer/' + tmp_goods_id +
                        '.html',
                        item[1],
                        tmp_goods_id,
                    ])
                else:
                    print('@@@ original url: ', item[0])
            # print(old_table_ali_1688_all_goods_list)
            print('Number of old-table rows to migrate: ', len(old_table_ali_1688_all_goods_list))
            sleep(2)

            for item in old_table_ali_1688_all_goods_list:  # real-time update
                if index % 10 == 0:
                    ali_1688 = ALi1688LoginAndParse()

                if index % 50 == 0:  # reconnect every 50 iterations to avoid a stale, unresponsive long connection
                    print('Resetting and establishing a new database connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('New database connection established...')

                if tmp_sql_server.is_connect_success:
                    goods_id = str(item[2])
                    # print(goods_id)
                    if goods_id in new_table_ali_1688_all_goods_id_list:
                        print('This goods_id already exists in the database, skipping!')
                        index += 1
                        gc.collect()
                        continue  # also skips the sleep

                    else:
                        sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=2 and GoodsID=%s'
                        try:  # duplicates keep appearing, so check this goods_id separately
                            is_in_db = list(
                                tmp_sql_server._select_table(
                                    sql_str=sql_str, params=(goods_id,)))
                        except:
                            is_in_db = []
                            pass
                        if is_in_db != []:
                            print('This goods_id already exists in the database, skipping!')
                            index += 1
                            gc.collect()
                            continue

                        print(
                            '------>>>| inserting goods_id (%s) | --------->>>@ index (%d)'
                            % (goods_id, index))
                        tt = ali_1688.get_ali_1688_data(goods_id)
                        if tt.get('is_delete') == 1 and tt.get(
                                'before') is False:  # delisted items still need to be inserted
                            tt['goods_id'] = goods_id
                            tt['goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                            tt['username'] = '******'
                            tt['main_goods_id'] = item[1]

                            ali_1688.old_ali_1688_goods_insert_into_new_table(
                                data=tt, pipeline=tmp_sql_server)

                            index += 1
                            gc.collect()
                            sleep(1.2)
                            continue
                        else:
                            pass

                        data = ali_1688.deal_with_data()
                        if data != {}:
                            data['goods_id'] = goods_id
                            data[
                                'goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                            data['username'] = '******'
                            data['main_goods_id'] = item[1]

                            ali_1688.old_ali_1688_goods_insert_into_new_table(
                                data=data, pipeline=tmp_sql_server)

                        else:  # the returned data is empty
                            pass
                else:  # database connection failed
                    print('Database connection failed; the database may be down or under maintenance')
                    pass
                index += 1
                # try:
                #     del ali_1688
                # except:
                #     pass
                gc.collect()
                sleep(2)
            print('All data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()
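
The old-table filtering above boils down to a set-membership check against the goods_id values already present in the new table. A minimal sketch with plain tuples standing in for the rows returned by _select_table:

# (GoodsID, ...) rows from the new table, (GoodsOutUrl, goods_id) rows from the old table
new_table_rows = [('111',), ('222',)]
old_table_rows = [('https://detail.1688.com/offer/222.html', 'm1'),
                  ('https://detail.1688.com/offer/333.html', 'm2')]

new_ids = {row[0] for row in new_table_rows}                  # goods_id already migrated

to_migrate = []
for url, main_goods_id in old_table_rows:
    goods_id = url.rsplit('/', 1)[-1].removesuffix('.html')   # stand-in for get_goods_id_from_url; str.removesuffix needs Python 3.9+
    if goods_id and goods_id not in new_ids:
        to_migrate.append(['https://detail.1688.com/offer/' + goods_id + '.html',
                           main_goods_id, goods_id])

print(len(to_migrate), to_migrate)   # only '333' is left to migrate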
Code Example #6
def run_forever():
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=2 order by ID desc'

        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> all matching goods_id returned by the database <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('About to start the real-time data update, please wait...'.center(100, '#'))
            index = 1
            # memory optimisation: declaring this once outside the loop keeps a large object alive, so it is re-created inside the loop and the old instance released
            ali_1688 = ALi1688LoginAndParse()
            for item in result:  # real-time update
                if index % 5 == 0:
                    ali_1688 = ALi1688LoginAndParse()

                if index % 50 == 0:  # reconnect every 50 iterations to avoid a stale, unresponsive long connection
                    print('Resetting and establishing a new database connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('New database connection established...')

                if tmp_sql_server.is_connect_success:
                    print(
                        '------>>>| updating goods_id (%s) | --------->>>@ index (%d)'
                        % (item[0], index))
                    data = ali_1688.get_ali_1688_data(item[0])
                    if isinstance(data, int) is True:  # handle the int 4041 return value separately
                        continue
                    else:
                        pass

                    if data.get('is_delete') == 1:  # handle goods that were already delisted when first inserted
                        data['goods_id'] = item[0]

                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])

                        # print('------>>>| scraped data: ', data)
                        ali_1688.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)

                        sleep(1.5)  # avoid hitting the server too frequently
                        index += 1
                        gc.collect()
                        continue

                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        '''To make this work, price and taobao_price must stay as first scraped; any later change is recorded in the _price_change_info field'''
                        # business logic:
                        #   in the company backend, when modify_time > conversion time and is_price_change=1, compare against the price-change record; if it is the same, do not prompt staff to adjust the price
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        # print('------>>>| scraped data: ', data)
                        ali_1688.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)

                        sleep(.3)  # avoid hitting the server too frequently
                    else:  # the returned data is empty
                        pass
                else:  # database connection failed
                    print('Database connection failed; the database may be down or under maintenance')
                    pass
                index += 1
                # try:
                #     del ali_1688
                # except:
                #     pass
                gc.collect()
                sleep(2.2)
            print('All data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()
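
_get_price_change_info itself is not shown on this page; judging from the call site it takes the old and new price pair and returns an (_is_price_change, _price_change_info) tuple. A hypothetical, simplified stand-in to illustrate the idea (the real helper's record format may differ):

from datetime import datetime

def sketch_price_change_info(old_price, old_taobao_price, new_price, new_taobao_price):
    """Hypothetical stand-in for _get_price_change_info: flag any change and record it."""
    changed = (old_price != new_price) or (old_taobao_price != new_taobao_price)
    info = []
    if changed:
        info.append({
            'old_price': old_price, 'new_price': new_price,
            'old_taobao_price': old_taobao_price, 'new_taobao_price': new_taobao_price,
            'time': datetime.now().isoformat(),
        })
    return int(changed), info

print(sketch_price_change_info(10.0, 9.5, 12.0, 9.5))   # -> (1, [{...}])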
Code Example #7
def run_forever():
    while True:
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/1688/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(
                tmp_sql_server._select_table(sql_str=al_select_str_6))
        except TypeError:
            my_lg.error('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> all matching goods_id returned by the database <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')
            my_lg.info('Number of items to update: {0}'.format(len(result)))

            my_lg.info('About to start the real-time data update, please wait...'.center(100, '#'))
            index = 1
            # memory optimisation: declaring this once outside the loop keeps a large object alive, so it is re-created inside the loop and the old instance released
            ali_1688 = ALi1688LoginAndParse(logger=my_lg)
            for item in result:  # real-time update
                if index % 5 == 0:
                    ali_1688 = ALi1688LoginAndParse(logger=my_lg)

                if index % 50 == 0:
                    my_lg.info('Resetting and establishing a new database connection...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('New database connection established...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| updating goods_id ({0}) | --------->>>@ index ({1})'
                        .format(item[0], index))
                    data = ali_1688.get_ali_1688_data(item[0])
                    if isinstance(data, int) is True:  # handle the int 4041 return value separately
                        continue
                    else:
                        pass

                    if data.get('is_delete') == 1:  # handle goods that were already delisted when first inserted
                        data['goods_id'] = item[0]

                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])
                        # my_lg.info('shelf time: {0}, delete time: {1}'.format(data['shelf_time'], data['delete_time']))
                        ali_1688.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)

                        sleep(1.5)  # avoid hitting the server too frequently
                        index += 1
                        gc.collect()
                        continue

                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])
                        # my_lg.info('shelf time: {0}, delete time: {1}'.format(data['shelf_time'], data['delete_time']))
                        '''To make this work, price and taobao_price must stay as first scraped; any later change is recorded in the _price_change_info field'''
                        # business logic:
                        #   in the company backend, when modify_time > conversion time and is_price_change=1, compare against the price-change record; if it is the same, do not prompt staff to adjust the price
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[2],
                                old_taobao_price=item[3],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[6]),
                                site_id=2)
                        except AttributeError:  # handle values that were already formatted
                            old_sku_info = item[6]
                        data['_is_price_change'], data[
                            'sku_info_trans_time'] = get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(
                                    data['sku_map'], site_id=2),
                                is_price_change=item[7]
                                if item[7] is not None else 0)

                        ali_1688.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                        sleep(.3)  # avoid hitting the server too frequently
                    else:  # the returned data is empty
                        pass
                else:  # database connection failed
                    my_lg.error('Database connection failed; the database may be down or under maintenance')
                    pass
                index += 1
                gc.collect()
                sleep(2.2)
            my_lg.info('All data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Code Example #8
def run_forever():
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_ali_1688_all_goods_id())
        except TypeError:
            print('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> all matching goods_id returned by the database <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('About to start the real-time data update, please wait...'.center(100, '#'))
            index = 1
            # memory optimisation: declaring this once outside the loop keeps a large object alive, so it is re-created inside the loop and the old instance released
            ali_1688 = ALi1688LoginAndParse()
            for item in result:  # real-time update
                data = {}
                if index % 5 == 0:
                    ali_1688 = ALi1688LoginAndParse()

                if index % 50 == 0:    # reconnect every 50 iterations to avoid a stale, unresponsive long connection
                    print('Resetting and establishing a new database connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('New database connection established...')

                if tmp_sql_server.is_connect_success:
                    print('------>>>| updating goods_id (%s) | --------->>>@ index (%d)' % (item[0], index))
                    data = ali_1688.get_ali_1688_data(item[0])
                    if isinstance(data, int) is True:     # handle the int 4041 return value separately
                        continue
                    else:
                        pass

                    if data.get('is_delete') == 1:        # handle goods that were already delisted when first inserted
                        data['goods_id'] = item[0]

                        data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            MyShelfAndDownTime=item[2]
                        )

                        # print('------>>>| scraped data: ', data)
                        ali_1688.to_right_and_update_data(data, pipeline=tmp_sql_server)

                        sleep(1.5)  # avoid hitting the server too frequently
                        index += 1
                        gc.collect()
                        continue

                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[1],
                            MyShelfAndDownTime=item[2]
                        )

                        # print('------>>>| scraped data: ', data)
                        ali_1688.to_right_and_update_data(data, pipeline=tmp_sql_server)

                        sleep(.3)       # avoid hitting the server too frequently
                    else:  # the returned data is empty
                        pass
                else:  # database connection failed
                    print('Database connection failed; the database may be down or under maintenance')
                    pass
                index += 1
                # try:
                #     del ali_1688
                # except:
                #     pass
                gc.collect()
                sleep(2.2)
            print('All data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:   # no updates after midnight
            sleep(60*60*5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()
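
Every run_forever variant on this page ends with the same throttle: after midnight Shanghai time it backs off for roughly 5.5 hours, otherwise it polls again after a few seconds. A standalone sketch of that schedule, with zoneinfo standing in for get_shanghai_time:

from datetime import datetime
from time import sleep
from zoneinfo import ZoneInfo   # Python 3.9+

def sleep_until_next_round():
    """Back off for 5.5 hours after midnight Shanghai time, otherwise poll again shortly."""
    now = datetime.now(ZoneInfo('Asia/Shanghai'))   # stand-in for get_shanghai_time()
    if now.hour == 0:            # no updates between 00:00 and roughly 05:30
        sleep(60 * 60 * 5.5)
    else:
        sleep(5)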