async def insert_into_table(self, tmp_item, category, current_page,
                                sql_cli, index):
        '''
        执行插入到淘宝天天特价的操作
        :param tmp_item:
        :param category:
        :param current_page:
        :param sql_cli:
        :param index:
        :return: index 加1
        '''
        tmp_url = 'https://item.taobao.com/item.htm?id=' + str(
            tmp_item.get('goods_id', ''))
        taobao = TaoBaoLoginAndParse(
            logger=self.lg,
            is_real_times_update_call=self.is_real_times_update_call)
        goods_id = taobao.get_goods_id_from_url(tmp_url)
        try:
            taobao.get_goods_data(goods_id=goods_id)
            goods_data = taobao.deal_with_data(goods_id=goods_id)
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
            index += 1

            return index

        if goods_data != {}:
            goods_data['goods_id'] = tmp_item.get('goods_id', '')
            goods_data['goods_url'] = tmp_url
            goods_data['schedule'] = [{
                'begin_time':
                tmp_item.get('start_time', ''),
                'end_time':
                tmp_item.get('end_time', ''),
            }]
            goods_data['tejia_begin_time'], goods_data[
                'tejia_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data.get('schedule', [])[0])
            goods_data['block_id'] = str(category)
            goods_data['tag_id'] = str(current_page)
            goods_data['father_sort'] = self.main_sort[category][0]
            goods_data['child_sort'] = ''
            # pprint(goods_data)

            if len(goods_data['all_img_url']) <= 1:
                self.lg.info('[goods_id: {}]主图个数<=1, pass'.format(goods_id))
                return index

            await taobao.insert_into_taobao_tiantiantejia_table(
                data=goods_data, pipeline=sql_cli)
        else:
            await async_sleep(4)  # 否则休息4秒
        index += 1
        # await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        return index
Exemple #2
0
def test_tb():
    goods_id = '533127076450'
    pc_url = 'https://item.taobao.com/item.htm?id={}'.format(goods_id)
    phone_url = 'https://h5.m.taobao.com/awp/core/detail.htm?id={}'.format(goods_id)
    print('pc_url: {}, phone_url: {}'.format(pc_url, phone_url))

    tb = TaoBaoLoginAndParse(is_real_times_update_call=True)
    goods_id = tb.get_goods_id_from_url(pc_url)
    ori_data = tb.get_goods_data(goods_id=goods_id)
    # pprint(ori_data)
    data = tb.deal_with_data(goods_id=goods_id)
    pprint(data)

    try:
        del tb
    except:
        pass
Exemple #3
0
    async def insert_into_table(self, tmp_item, category, current_page,
                                my_pipeline, index):
        '''
        执行插入到淘宝天天特价的操作
        :param tmp_item:
        :param category:
        :param current_page:
        :param my_pipeline:
        :param index:
        :return: index 加1
        '''
        tmp_url = 'https://item.taobao.com/item.htm?id=' + str(
            tmp_item.get('goods_id', ''))
        taobao = TaoBaoLoginAndParse(logger=self.my_lg)
        goods_id = taobao.get_goods_id_from_url(tmp_url)
        taobao.get_goods_data(goods_id=goods_id)
        goods_data = taobao.deal_with_data(goods_id=goods_id)

        if goods_data != {}:
            goods_data['goods_id'] = tmp_item.get('goods_id', '')
            goods_data['goods_url'] = tmp_url
            goods_data['schedule'] = [{
                'begin_time':
                tmp_item.get('start_time', ''),
                'end_time':
                tmp_item.get('end_time', ''),
            }]
            goods_data['tejia_begin_time'], goods_data[
                'tejia_end_time'] = await self.get_tejia_begin_time_and_tejia_end_time(
                    schedule=goods_data.get('schedule', [])[0])
            goods_data['block_id'] = str(category)
            goods_data['tag_id'] = str(current_page)
            goods_data['father_sort'] = self.main_sort[category][0]
            goods_data['child_sort'] = ''
            # pprint(goods_data)

            await taobao.insert_into_taobao_tiantiantejia_table(
                data=goods_data, pipeline=my_pipeline)
        else:
            await asyncio.sleep(4)  # 否则休息4秒
            pass
        index += 1

        return index
Exemple #4
0
def get_one_tb_data(**kwargs):
    '''
    抓取一个tb url的data
    :return: a dict
    '''
    username = kwargs.get('username', '18698570079')
    tb_url = kwargs.get('tb_url', '')
    my_lg = kwargs.get('my_lg')

    login_taobao = TaoBaoLoginAndParse(logger=my_lg)
    goods_id = login_taobao.get_goods_id_from_url(tb_url)  # 获取goods_id
    if goods_id == '':
        my_lg.info('获取到的goods_id为空!')
        try: del login_taobao  # 每次都回收一下
        except: pass
        gc.collect()

        return {'goods_id': ''}                                    # 错误1: goods_id为空!

    wait_to_deal_with_url = 'https://item.taobao.com/item.htm?id={0}'.format(goods_id)  # 构造成标准干净的淘宝商品地址
    tmp_result = login_taobao.get_goods_data(goods_id=goods_id)
    data = login_taobao.deal_with_data(goods_id=goods_id)  # 如果成功获取的话, 返回的是一个data的dict对象

    sleep(TAOBAO_SLEEP_TIME)  # 这个在服务器里面可以注释掉为.5s
    if data == {} or tmp_result == {}:
        my_lg.info('获取到的data为空!')
        try:del login_taobao
        except:pass
        gc.collect()

        return {'goods_id': goods_id, 'msg': 'data为空!'}           # 错误2: 抓取data为空!

    wait_to_save_data = add_base_info_2_processed_data(
        data=data,
        spider_url=wait_to_deal_with_url,
        username=username,
        goods_id=goods_id
    )
    try: del login_taobao
    except: pass

    return wait_to_save_data
Exemple #5
0
    def _taobao_keywords_spider(self, **kwargs):
        '''
        抓取goods_id_list的数据,并存储
        :param kwargs:
        :return:
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = [
            'https://item.taobao.com/item.htm?id=' + item
            for item in goods_id_list
        ]

        self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

        for item in goods_url_list:  # item为goods_url
            result = False  # 用于判断某个goods是否被插入的参数
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.my_lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True  # 原先存在的情况
                pass

            else:
                taobao = TaoBaoLoginAndParse(logger=self.my_lg)
                if self.add_goods_index % 20 == 0:  # 每50次重连一次,避免单次长连无响应报错
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')

                if self.my_pipeline.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.my_lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue

                    else:
                        self.my_lg.info(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                            % (goods_id, str(self.add_goods_index)))
                        tt = taobao.get_goods_data(goods_id)
                        data = taobao.deal_with_data(goods_id=goods_id)
                        if data != {}:
                            data['goods_id'] = goods_id
                            data[
                                'goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(
                                    goods_id)
                            data['username'] = '******'
                            data['main_goods_id'] = None

                            # print('------>>>| 爬取到的数据为: ', data)
                            result = taobao.old_taobao_goods_insert_into_new_table(
                                data, pipeline=self.my_pipeline)
                        else:
                            pass

                else:  # 表示返回的data值为空值
                    self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:  # 仅处理goods_id被插入或者原先已存在于db中
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass

        self.my_lg.info('该关键字的商品已经抓取完毕!')

        return True
Exemple #6
0
def run_forever():
    #### 实时更新数据
    while True:
        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # 使用sqlalchemy管理数据库连接池
        tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server.select_taobao_all_goods_id()
            result_2 = list(tmp_sql_server_2.select_old_table_all_goods_id())
            # print(result_2)
        except TypeError as e:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result_2)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            new_table_ali_1688_all_goods_id_list = [item[0] for item in result]
            for item in result_2:  # 实时更新数据
                data = {}
                taobao = TaoBaoLoginAndParse()
                if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                    print('正在重置,并与数据库建立新连接中...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()

                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item[0])
                    if goods_id == '':
                        print('@@@ 原商品的地址为: ', item[0])
                        continue
                    else:
                        if goods_id in new_table_ali_1688_all_goods_id_list:
                            print('该goods_id已经存在于数据库中, 此处跳过!')
                            continue

                        else:
                            print(
                                '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                                % (goods_id, index))
                            tt = taobao.get_goods_data(goods_id)
                            if tt.get('is_delete') == 1:  # 处理已下架的但是还是要插入的
                                tt['goods_id'] = goods_id
                                tt['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(
                                    goods_id)
                                tt['username'] = '******'
                                tt['main_goods_id'] = item[1]

                                # print('------>>>| 爬取到的数据为: ', data)
                                taobao.old_taobao_goods_insert_into_new_table(
                                    data=tt, pipeline=tmp_sql_server_2)

                                index += 1
                                gc.collect()
                                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                                continue
                            else:
                                pass

                            data = taobao.deal_with_data(goods_id=goods_id)
                            if data != {}:
                                data['goods_id'] = goods_id
                                data[
                                    'goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(
                                        goods_id)
                                data['username'] = '******'
                                data['main_goods_id'] = item[1]

                                # print('------>>>| 爬取到的数据为: ', data)
                                taobao.old_taobao_goods_insert_into_new_table(
                                    data, pipeline=tmp_sql_server_2)
                            else:
                                pass
                else:  # 表示返回的data值为空值
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                # try:
                #     del taobao
                # except:
                #     pass
                gc.collect()
                # 国外服务器上可以缩短时间, 可以设置为0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # 不能太频繁,与用户请求错开尽量
            print('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Exemple #7
0
    async def _crawl_and_save_these_goods(self, goods_url_list):
        '''
        采集该文章推荐的商品
        :param goods_url_list:
        :return:
        '''
        sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=1 or SiteID=3 or SiteID=4 or SiteID=6'

        try:
            result = self.my_pipeline._select_table(sql_str=sql_str)
        except TypeError:
            result = []

        self.my_lg.info('即将开始抓取该文章的goods, 请耐心等待...')
        index = 1

        db_all_goods_id_list = [item[0] for item in result]
        for item in goods_url_list:
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item.get('goods_url', ''))[0]
            except IndexError:
                self.my_lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in db_all_goods_id_list:
                self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                continue

            else:
                taobao = TaoBaoLoginAndParse(logger=self.my_lg)
                if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')

                if self.my_pipeline.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item.get('goods_url', ''))
                    if goods_id == '':
                        self.my_lg.info('@@@ 原商品的地址为: {0}'.format(item.get('goods_url', '')))
                        continue

                    else:
                        self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(index)))
                        tt = taobao.get_goods_data(goods_id)
                        data = taobao.deal_with_data(goods_id=goods_id)
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                            data['username'] = '******'
                            data['main_goods_id'] = None

                            # print('------>>>| 爬取到的数据为: ', data)
                            taobao.old_taobao_goods_insert_into_new_table(data, pipeline=self.my_pipeline)

                        else:
                            pass

                else:  # 表示返回的data值为空值
                    self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        self.my_lg.info('该文章的商品已经抓取完毕!')

        return True
    def _taobao_keywords_spider(self, **kwargs):
        '''
        抓取goods_id_list的数据,并存储
        :param kwargs:
        :return:
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = ['https://item.taobao.com/item.htm?id=' + item for item in goods_id_list]

        self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
        for item in goods_url_list:     # item为goods_url
            # 用于判断某个goods是否被插入的参数
            result = False
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True   # 原先存在的情况
                pass

            else:
                taobao = TaoBaoLoginAndParse(logger=self.lg, is_real_times_update_call=True)
                self.sql_cli = _block_get_new_db_conn(
                    db_obj=self.sql_cli,
                    index=self.add_goods_index,
                    logger=self.lg,
                    remainder=20,)
                if self.sql_cli.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue

                    else:
                        self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(self.add_goods_index)))
                        tt = taobao.get_goods_data(goods_id)
                        data = taobao.deal_with_data(goods_id=goods_id)
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            if not self.check_target_data_is_legal(target_data=data):
                                return False

                            result = taobao.old_taobao_goods_insert_into_new_table(data, pipeline=self.sql_cli)

                        else:
                            pass

                else:  # 表示返回的data值为空值
                    self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:
                # 仅处理goods_id被插入或者原先已存在于db中
                self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass

        self.lg.info('该关键字的商品已经抓取完毕!')

        return True