    async def insert_into_table(self, tmp_item, category, current_page,
                                sql_cli, index):
        '''
        Perform the insert into the Taobao tiantiantejia (daily special) table
        :param tmp_item:
        :param category:
        :param current_page:
        :param sql_cli:
        :param index:
        :return: index incremented by 1
        '''
        tmp_url = 'https://item.taobao.com/item.htm?id=' + str(
            tmp_item.get('goods_id', ''))
        taobao = TaoBaoLoginAndParse(
            logger=self.lg,
            is_real_times_update_call=self.is_real_times_update_call)
        goods_id = taobao.get_goods_id_from_url(tmp_url)
        try:
            taobao.get_goods_data(goods_id=goods_id)
            goods_data = taobao.deal_with_data(goods_id=goods_id)
        except Exception:
            self.lg.error('Encountered an error:', exc_info=True)
            index += 1

            return index

        if goods_data != {}:
            goods_data['goods_id'] = tmp_item.get('goods_id', '')
            goods_data['goods_url'] = tmp_url
            goods_data['schedule'] = [{
                'begin_time':
                tmp_item.get('start_time', ''),
                'end_time':
                tmp_item.get('end_time', ''),
            }]
            goods_data['tejia_begin_time'], goods_data[
                'tejia_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=goods_data.get('schedule', [])[0])
            goods_data['block_id'] = str(category)
            goods_data['tag_id'] = str(current_page)
            goods_data['father_sort'] = self.main_sort[category][0]
            goods_data['child_sort'] = ''
            # pprint(goods_data)

            if len(goods_data['all_img_url']) <= 1:
                self.lg.info('[goods_id: {}] main image count <= 1, skipping'.format(goods_id))
                return index

            await taobao.insert_into_taobao_tiantiantejia_table(
                data=goods_data, pipeline=sql_cli)
        else:
            await async_sleep(4)  # otherwise rest for 4 seconds
        index += 1
        # await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        return index
async def update_expired_goods_to_normal_goods(goods_id, index, tmp_sql_server,
                                               logger):
    '''
    Expired goods are not deleted; downgrade and update them as regular hot-promotion goods
    :param goods_id:
    :param index:
    :param tmp_sql_server:
    :param logger:
    :return: index
    '''
    # tmp_sql_server.delete_taobao_tiantiantejia_expired_goods_id(goods_id=item[0])
    # logger.info('Goods goods_id({0}) has expired; tiantiantejia end time was [{1}], deleted successfully!'.format(item[0], str(item[2].strftime('%Y-%m-%d %H:%M:%S'))))
    logger.info('++++++>>>| This goods item has expired, updating! |<<<++++++')
    logger.info('------>>>| Updating goods_id (%s) | --------->>>@ index (%s)' %
                (goods_id, str(index)))
    taobao = TaoBaoLoginAndParse(logger=logger)
    data_before = taobao.get_goods_data(goods_id)
    if data_before.get('is_delete') == 1:  # handle goods already in delisted state separately
        data_before['goods_id'] = goods_id
        data_before['schedule'] = []
        '''Do not update the special-offer time window'''
        # data_before['tejia_begin_time'], data_before['tejia_end_time'] = '', ''

        # logger.info('------>>>| Crawled data: %s' % str(data_before))
        await taobao.update_taobao_tiantiantejia_table(data_before,
                                                       pipeline=tmp_sql_server)

        await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # avoid hitting the server too frequently
        index += 1
        try:
            del taobao
        except:
            pass
        gc.collect()

        return index

    goods_data = taobao.deal_with_data(goods_id=goods_id)
    if goods_data != {}:
        goods_data['goods_id'] = goods_id
        await taobao.update_expired_goods_id_taobao_tiantiantejia_table(
            data=goods_data, pipeline=tmp_sql_server)
    else:
        await asyncio.sleep(4)  # otherwise rest for 4 seconds
    await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
    index += 1
    try:
        del taobao
    except:
        pass
    gc.collect()

    return index
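
# A minimal caller sketch for the index-threading pattern above: both helpers
# take the running counter in and hand it back incremented, so a sequential
# async loop can carry it across awaits. fake_insert() and the item shape are
# hypothetical stand-ins, not part of this repo.
import asyncio

async def fake_insert(tmp_item, index):
    await asyncio.sleep(0)  # pretend network/db work
    print('processed goods_id={} at index={}'.format(tmp_item['goods_id'], index))
    return index + 1

async def main():
    items = [{'goods_id': '1'}, {'goods_id': '2'}, {'goods_id': '3'}]
    index = 1
    for tmp_item in items:
        index = await fake_insert(tmp_item, index)  # counter survives each await

asyncio.run(main())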
Example #3
def test_tb():
    goods_id = '533127076450'
    pc_url = 'https://item.taobao.com/item.htm?id={}'.format(goods_id)
    phone_url = 'https://h5.m.taobao.com/awp/core/detail.htm?id={}'.format(goods_id)
    print('pc_url: {}, phone_url: {}'.format(pc_url, phone_url))

    tb = TaoBaoLoginAndParse(is_real_times_update_call=True)
    goods_id = tb.get_goods_id_from_url(pc_url)
    ori_data = tb.get_goods_data(goods_id=goods_id)
    # pprint(ori_data)
    data = tb.deal_with_data(goods_id=goods_id)
    pprint(data)

    try:
        del tb
    except:
        pass
Example #4
    async def insert_into_table(self, tmp_item, category, current_page,
                                my_pipeline, index):
        '''
        Perform the insert into the Taobao tiantiantejia (daily special) table
        :param tmp_item:
        :param category:
        :param current_page:
        :param my_pipeline:
        :param index:
        :return: index incremented by 1
        '''
        tmp_url = 'https://item.taobao.com/item.htm?id=' + str(
            tmp_item.get('goods_id', ''))
        taobao = TaoBaoLoginAndParse(logger=self.my_lg)
        goods_id = taobao.get_goods_id_from_url(tmp_url)
        taobao.get_goods_data(goods_id=goods_id)
        goods_data = taobao.deal_with_data(goods_id=goods_id)

        if goods_data != {}:
            goods_data['goods_id'] = tmp_item.get('goods_id', '')
            goods_data['goods_url'] = tmp_url
            goods_data['schedule'] = [{
                'begin_time':
                tmp_item.get('start_time', ''),
                'end_time':
                tmp_item.get('end_time', ''),
            }]
            goods_data['tejia_begin_time'], goods_data[
                'tejia_end_time'] = await self.get_tejia_begin_time_and_tejia_end_time(
                    schedule=goods_data.get('schedule', [])[0])
            goods_data['block_id'] = str(category)
            goods_data['tag_id'] = str(current_page)
            goods_data['father_sort'] = self.main_sort[category][0]
            goods_data['child_sort'] = ''
            # pprint(goods_data)

            await taobao.insert_into_taobao_tiantiantejia_table(
                data=goods_data, pipeline=my_pipeline)
        else:
            await asyncio.sleep(4)  # otherwise rest for 4 seconds
        index += 1

        return index
Example #5
def get_one_tb_data(**kwargs):
    '''
    Fetch the data for a single Taobao URL
    :return: a dict
    '''
    username = kwargs.get('username', '18698570079')
    tb_url = kwargs.get('tb_url', '')
    my_lg = kwargs.get('my_lg')

    login_taobao = TaoBaoLoginAndParse(logger=my_lg)
    goods_id = login_taobao.get_goods_id_from_url(tb_url)  # extract the goods_id
    if goods_id == '':
        my_lg.info('The extracted goods_id is empty!')
        try:
            del login_taobao  # reclaim it every time
        except:
            pass
        gc.collect()

        return {'goods_id': ''}                                    # error 1: goods_id is empty!

    wait_to_deal_with_url = 'https://item.taobao.com/item.htm?id={0}'.format(goods_id)  # build a clean, canonical Taobao item URL
    tmp_result = login_taobao.get_goods_data(goods_id=goods_id)
    data = login_taobao.deal_with_data(goods_id=goods_id)  # on success this returns a dict of data

    sleep(TAOBAO_SLEEP_TIME)  # on a server this can be reduced to .5s
    if data == {} or tmp_result == {}:
        my_lg.info('The fetched data is empty!')
        try:
            del login_taobao
        except:
            pass
        gc.collect()

        return {'goods_id': goods_id, 'msg': 'data is empty!'}           # error 2: fetched data is empty!

    wait_to_save_data = add_base_info_2_processed_data(
        data=data,
        spider_url=wait_to_deal_with_url,
        username=username,
        goods_id=goods_id
    )
    try:
        del login_taobao
    except:
        pass

    return wait_to_save_data
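
# Hypothetical call site for get_one_tb_data() above; the stdlib logger stands
# in for the project's own set_logger(), and the URL is just a sample id.
import logging

logging.basicConfig(level=logging.INFO)
my_lg = logging.getLogger('tb')

res = get_one_tb_data(
    tb_url='https://item.taobao.com/item.htm?id=533127076450',
    my_lg=my_lg)
if res.get('goods_id') == '':
    my_lg.info('bad url: no goods_id')           # error 1
elif res.get('msg'):
    my_lg.info('fetch failed: %s', res['msg'])   # error 2
else:
    my_lg.info('got a full goods data dict')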
Example #6
class TBUpdater(AsyncCrawler):
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(self,
                              *params,
                              **kwargs,
                              log_print=True,
                              log_save_path=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/')
        self.sql_cli = None
        self.goods_index = 1
        # concurrency level
        self.concurrency = 50
        # self.server_ip = 'http://0.0.0.0:5000'
        self.server_ip = 'http://118.31.39.97'

    async def _update_db(self):
        '''
        Update data in real time
        :return:
        '''
        while True:
            self.lg = await self._get_new_logger()
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                self.taobao = TaoBaoLoginAndParse(logger=self.lg)
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        break

                    tasks = []
                    for item in slice_params_list:
                        db_goods_info_obj = TBDbGoodsInfoObj(item=item,
                                                             logger=self.lg)
                        self.lg.info('creating task goods_id: {}'.format(
                            db_goods_info_obj.goods_id))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(
                                    db_goods_info_obj=db_goods_info_obj,
                                    index=index)))
                        index += 1

                    res = await _get_async_task_result(tasks=tasks,
                                                       logger=self.lg)
                    await self._except_sleep(res=res)

                self.lg.info('All data updated'.center(100, '#'))  # sleep(60*60)
            if get_shanghai_time().hour == 0:  # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(5.)
            try:
                del self.lg
                del result
            except:
                pass
            collect()

    async def _get_db_old_data(self) -> (list, None):
        '''
        Fetch the db data that needs updating
        :return:
        '''
        # self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        # use sqlalchemy to manage the db connection pool
        self.sql_cli = SqlPools()
        result = None
        try:
            # result = self.sql_cli._select_table(sql_str=tb_select_str_3,)
            result = await get_waited_2_update_db_data_from_server(
                server_ip=self.server_ip,
                _type='tb',
                child_type=0,
            )
        except TypeError:
            self.lg.error('TypeError: database connection failed... (possibly under maintenance)')
        except Exception:
            self.lg.error('Encountered an error:', exc_info=True)

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_new_tb_obj(self, index) -> None:
        if index % 10 == 0:
            try:
                del self.taobao
            except:
                pass
            collect()
            self.taobao = TaoBaoLoginAndParse(logger=self.lg)

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        '''
        Update a single goods item
        :return:
        '''
        res = False
        await self._get_new_tb_obj(index=index)
        self.sql_cli = await _get_new_db_conn(db_obj=self.sql_cli,
                                              index=index,
                                              logger=self.lg,
                                              db_conn_type=2,
                                              remainder=50)
        if self.sql_cli.is_connect_success:
            self.lg.info(
                '------>>>| Updating goods_id (%s) | --------->>>@ index (%s)' %
                (db_goods_info_obj.goods_id, str(index)))
            oo = self.taobao.get_goods_data(
                goods_id=db_goods_info_obj.goods_id)
            oo_is_delete = oo.get('is_delete', 0)  # recorded up front so an empty parse result for a delisted item does not trigger the sleep below
            data = self.taobao.deal_with_data(
                goods_id=db_goods_info_obj.goods_id)
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='tb',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,
                )
                res = to_right_and_update_tb_data(data=data,
                                                  pipeline=self.sql_cli,
                                                  logger=self.lg)

            else:
                if oo_is_delete == 1:
                    # found to be delisted on re-check: still mark res as True
                    res = True
                else:
                    self.lg.info('------>>>| Sleeping 8s...')
                    await async_sleep(delay=8, loop=self.loop)

        else:  # db connection failed
            self.lg.error('Database connection failed; the db may be down or under maintenance')
            await async_sleep(delay=10, loop=self.loop)

        index += 1
        self.goods_index = index
        collect()
        # on an overseas server this can be shortened, even to 0s
        await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # not too frequent; stagger with user requests as much as possible

        return [db_goods_info_obj.goods_id, res]

    async def _except_sleep(self, res):
        '''
        Sleep on abnormal failure counts
        :param res:
        :return:
        '''
        count = 0
        all_count_fail_sleep_time = 100.
        sleep_time = 50.
        for item in res:
            try:
                if not item[1]:
                    count += 1
            except IndexError:
                pass
        self.lg.info('Fail count: {}, concurrency: {}'.format(
            count, self.concurrency))
        if count / self.concurrency >= .9:
            # sleep strategy for a (near-)total failure
            self.lg.info('Crawl abnormal!! Sleeping {}s...'.format(all_count_fail_sleep_time))
            await async_sleep(all_count_fail_sleep_time)

        else:
            if count >= int(self.concurrency / 5):
                self.lg.info('Crawl abnormal!! Sleeping {}s...'.format(sleep_time))
                await async_sleep(sleep_time)

        return None

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
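
# The failure-ratio backoff in _except_sleep(), reduced to a pure function so
# the thresholds are easy to see. The numbers (>= 90% failed -> long sleep,
# >= 1/5 of the batch failed -> short sleep) come from the snippet above; the
# helper itself is illustrative, not repo code.
def pick_sleep_time(fail_count, concurrency,
                    all_fail_sleep=100., partial_fail_sleep=50.):
    if fail_count / concurrency >= .9:
        return all_fail_sleep
    if fail_count >= int(concurrency / 5):
        return partial_fail_sleep
    return 0.

assert pick_sleep_time(45, 50) == 100.  # near-total failure: long cool-down
assert pick_sleep_time(10, 50) == 50.   # 1/5 of the batch failed: short cool-down
assert pick_sleep_time(3, 50) == 0.     # healthy batch: no penalty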
Example #7
async def run_forever():
    #### update data in real time
    # ** must not be a global placed outside the loop, otherwise everything is logged to the same file and one-log-per-day is impossible
    lg = set_logger(logger_name=get_uuid1(),
                    log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/天天特价/' +
                    str(get_shanghai_time())[0:10] + '.txt',
                    console_log_level=INFO,
                    file_log_level=ERROR)

    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    # delisted goods are not handled, hence is_delete=0
    try:
        # todo: skip expired goods for now; the backend does not sync delisting, so their data cannot be looked up
        # tmp_sql_server._delete_table(sql_str=tb_delete_str_2, params=None)
        # await async_sleep(10)
        result = list(tmp_sql_server._select_table(sql_str=tb_select_str_7))
    except TypeError:
        lg.error('TypeError: database connection failed... (possibly under maintenance)')
        return None

    await _print_db_old_data(
        result=result,
        logger=lg,
    )

    index = 1
    for item in result:
        goods_id = item[0]
        tejia_end_time = item[2]

        tmp_sql_server = await _get_new_db_conn(
            db_obj=tmp_sql_server,
            index=index,
            logger=lg,
            db_conn_type=1,
        )
        if tmp_sql_server.is_connect_success:
            # lg.info(str(tejia_end_time))
            if tejia_end_time < get_shanghai_time():
                # expired goods are not deleted; downgrade and update them as regular hot-promotion goods
                # index = await update_expired_goods_to_normal_goods(
                #     goods_id=goods_id,
                #     index=index,
                #     tmp_sql_server=tmp_sql_server,
                #     logger=lg
                # )
                # expired: delist directly
                lg.info('@@ Expired, delisting [goods_id: {}]'.format(goods_id))
                _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=lg,
                    update_sql_str=tb_update_str_5,
                )
                index += 1

            else:
                # below: update the tiantiantejia goods info
                '''
                ** tiantiantejia does not delist goods early, so the special-offer time window is not updated
                '''
                # # first check whether this goods has been delisted early in its sub-category, and get its on/off-shelf times
                # if index % 6 == 0:
                #     try: del tmp_taobao_tiantiantejia
                #     except: pass
                #     collect()
                #     tmp_taobao_tiantiantejia = TaoBaoTianTianTeJia(logger=lg)
                #
                # tmp_body = await tmp_taobao_tiantiantejia.get_one_api_body(current_page=item[4], category=item[3])
                # if tmp_body == '':
                #     msg = 'The fetched tmp_body is an empty str! Failing category: ' + item[3]
                #     lg.error(msg)
                #     continue
                #
                # try:
                #     tmp_body = re.compile(r'\((.*?)\)').findall(tmp_body)[0]
                # except IndexError:
                #     msg = 'Error while re-filtering the body, please check! Failing category: ' + item[3]
                #     lg.error(msg)
                #     continue
                # tmp_sort_data = await tmp_taobao_tiantiantejia.get_sort_data_list(body=tmp_body)
                # if tmp_sort_data == 'no items':
                #     lg.info('The item_list from this api endpoint is "no items"! Please check')
                #     break
                # tejia_goods_list = await tmp_taobao_tiantiantejia.get_tiantiantejia_goods_list(data=tmp_sort_data)
                # # lg.info(str(tejia_goods_list))
                # await async_sleep(.45)
                # # lg.info('111')
                '''
                Observation: tiantiantejia goods already on the shelves are never delisted early by the platform, so do nothing here and skip
                '''
                # if is_in_child_sort(tejia_goods_list, goods_id=goods_id) is False:     # delisted early by the platform
                #     # tmp_sql_server.delete_taobao_tiantiantejia_expired_goods_id(goods_id=goods_id)
                #     # print('Goods goods_id[{0}] was delisted early by the platform, deleted successfully!'.format(goods_id))
                #     print('222')
                #     pass

                # else:       # the goods was not delisted early
                lg.info(
                    '------>>>| Updating goods_id (%s) | --------->>>@ index (%s)' %
                    (goods_id, str(index)))
                taobao = TaoBaoLoginAndParse(
                    logger=lg,
                    is_real_times_update_call=is_real_times_update_call)
                taobao.get_goods_data(goods_id)
                goods_data = taobao.deal_with_data(goods_id=goods_id)
                if goods_data != {}:
                    # tmp_time = await get_this_goods_id_tejia_time(tejia_goods_list, goods_id=goods_id)
                    # if tmp_time != []:
                    #     begin_time, end_time = tmp_time
                    #
                    #     goods_data['goods_id'] = goods_id
                    #     goods_data['schedule'] = [{
                    #         'begin_time': begin_time,
                    #         'end_time': end_time,
                    #     }]
                    #     goods_data['tejia_begin_time'], goods_data['tejia_end_time'] = await tmp_taobao_tiantiantejia.get_tejia_begin_time_and_tejia_end_time(schedule=goods_data.get('schedule', [])[0])
                    #     await taobao.update_taobao_tiantiantejia_table(data=goods_data, pipeline=tmp_sql_server)
                    # else:
                    #     lg.info('This goods_id is not among the goods from this api endpoint!!')
                    #     pass

                    goods_data['goods_id'] = goods_id
                    if goods_data.get('is_delete', 0) == 1:
                        lg.info('@ This goods item has been delisted...')

                    await taobao.update_taobao_tiantiantejia_table(
                        data=goods_data, pipeline=tmp_sql_server)

                else:
                    await async_sleep(4)

                await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                index += 1
                collect()

        else:
            lg.error('Database connection failed; the db may be down or under maintenance')
        collect()
    lg.info('All data updated'.center(100, '#'))  # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updates after midnight
        # sleep(60 * 60 * .5)
        await async_sleep(5 * 60)

    else:
        await async_sleep(60 * 1)
    collect()

    return True
Example #8
async def run_forever():
    #### update data in real time
    # ** must not be a global placed outside the loop, otherwise everything is logged to the same file and one-log-per-day is impossible
    my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/天天特价/' +
                       str(get_shanghai_time())[0:10] + '.txt',
                       console_log_level=INFO,
                       file_log_level=ERROR)

    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    # delisted goods are not handled, hence is_delete=0
    sql_str = '''
    select goods_id, is_delete, tejia_end_time, block_id, tag_id 
    from dbo.taobao_tiantiantejia 
    where site_id=19 and is_delete=0 and GETDATE()-modfiy_time>2 and MainGoodsID is not null
    '''

    try:
        result = list(tmp_sql_server._select_table(sql_str=sql_str))
    except TypeError:
        my_lg.error('TypeError: database connection failed... (possibly under maintenance)')
        return None

    my_lg.info('------>>> Below are all the matching goods_id returned by the database <<<------')
    my_lg.info(str(result))
    my_lg.info('--------------------------------------------------------')
    my_lg.info('Number of goods_id awaiting update: {0}'.format(len(result)))

    my_lg.info('About to start the real-time data update, please wait...'.center(100, '#'))
    index = 1
    # tmp_taobao_tiantiantejia = TaoBaoTianTianTeJia(logger=my_lg)
    for item in result:  # update data in real time
        if index % 50 == 0:
            my_lg.info('Resetting and establishing a new database connection...')
            # try: del tmp_sql_server
            # except: pass
            # gc.collect()
            tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
            my_lg.info('New database connection established...')

        if tmp_sql_server.is_connect_success:
            tejia_end_time = item[2]
            # my_lg.info(str(tejia_end_time))

            if item[1] == 1:  # goods already delisted when first seen: skip when scanned
                # tmp_sql_server.delete_taobao_tiantiantejia_expired_goods_id(goods_id=item[0])
                # my_lg.info('Goods goods_id[{0}] is sold out, deleted successfully!'.format(item[0]))
                my_lg.info(
                    '&&&&&& Goods ({0}) already has is_delete=1; no actual delete performed! index: ({1})'.
                    format(item[0], str(index)))
                index += 1

            elif tejia_end_time < datetime.datetime.now():
                # expired goods are not deleted; downgrade and update them as regular hot-promotion goods
                index = await update_expired_goods_to_normal_goods(
                    goods_id=item[0],
                    index=index,
                    tmp_sql_server=tmp_sql_server,
                    logger=my_lg)

            else:
                # below: update the tiantiantejia goods info
                '''
                ** tiantiantejia does not delist goods early, so the special-offer time window is not updated
                '''
                # # first check whether this goods has been delisted early in its sub-category, and get its on/off-shelf times
                # if index % 6 == 0:
                #     try: del tmp_taobao_tiantiantejia
                #     except: pass
                #     gc.collect()
                #     tmp_taobao_tiantiantejia = TaoBaoTianTianTeJia(logger=my_lg)
                #
                # tmp_body = await tmp_taobao_tiantiantejia.get_one_api_body(current_page=item[4], category=item[3])
                # if tmp_body == '':
                #     msg = 'The fetched tmp_body is an empty str! Failing category: ' + item[3]
                #     my_lg.error(msg)
                #     continue
                #
                # try:
                #     tmp_body = re.compile(r'\((.*?)\)').findall(tmp_body)[0]
                # except IndexError:
                #     msg = 'Error while re-filtering the body, please check! Failing category: ' + item[3]
                #     my_lg.error(msg)
                #     continue
                # tmp_sort_data = await tmp_taobao_tiantiantejia.get_sort_data_list(body=tmp_body)
                # if tmp_sort_data == 'no items':
                #     my_lg.info('The item_list from this api endpoint is "no items"! Please check')
                #     break
                # tejia_goods_list = await tmp_taobao_tiantiantejia.get_tiantiantejia_goods_list(data=tmp_sort_data)
                # # my_lg.info(str(tejia_goods_list))
                # await asyncio.sleep(.45)
                # # my_lg.info('111')
                '''
                Observation: tiantiantejia goods already on the shelves are never delisted early by the platform, so do nothing here and skip
                '''
                # if is_in_child_sort(tejia_goods_list, goods_id=item[0]) is False:     # delisted early by the platform
                #     # tmp_sql_server.delete_taobao_tiantiantejia_expired_goods_id(goods_id=item[0])
                #     # print('Goods goods_id[{0}] was delisted early by the platform, deleted successfully!'.format(item[0]))
                #     print('222')
                #     pass

                # else:       # the goods was not delisted early
                my_lg.info(
                    '------>>>| Updating goods_id (%s) | --------->>>@ index (%s)' %
                    (item[0], str(index)))
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                taobao.get_goods_data(item[0])
                goods_data = taobao.deal_with_data(goods_id=item[0])
                if goods_data != {}:
                    # tmp_time = await get_this_goods_id_tejia_time(tejia_goods_list, goods_id=item[0])
                    # if tmp_time != []:
                    #     begin_time, end_time = tmp_time
                    #
                    #     goods_data['goods_id'] = item[0]
                    #     goods_data['schedule'] = [{
                    #         'begin_time': begin_time,
                    #         'end_time': end_time,
                    #     }]
                    #     goods_data['tejia_begin_time'], goods_data['tejia_end_time'] = await tmp_taobao_tiantiantejia.get_tejia_begin_time_and_tejia_end_time(schedule=goods_data.get('schedule', [])[0])
                    #     await taobao.update_taobao_tiantiantejia_table(data=goods_data, pipeline=tmp_sql_server)
                    # else:
                    #     my_lg.info('This goods_id is not among the goods from this api endpoint!!')
                    #     pass

                    goods_data['goods_id'] = item[0]
                    '''Do not specifically update the on/off-shelf time window'''
                    # goods_data['schedule'] = [{
                    #     'begin_time': begin_time,
                    #     'end_time': end_time,
                    # }]
                    # goods_data['tejia_begin_time'], goods_data['tejia_end_time'] = await tmp_taobao_tiantiantejia.get_tejia_begin_time_and_tejia_end_time(schedule=goods_data.get('schedule', [])[0])
                    if goods_data.get('is_delete', 0) == 1:
                        my_lg.info('@ This goods item has been delisted...')

                    await taobao.update_taobao_tiantiantejia_table(
                        data=goods_data, pipeline=tmp_sql_server)

                else:
                    await asyncio.sleep(4)  # otherwise rest for 4 seconds

                await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                index += 1
                gc.collect()

        else:  # db connection failed
            my_lg.error('Database connection failed; the db may be down or under maintenance')
        gc.collect()
    my_lg.info('All data updated'.center(100, '#'))  # sleep(60*60)
    if get_shanghai_time().hour == 0:  # no updates after midnight
        # sleep(60 * 60 * .5)
        pass

    else:
        sleep(5)
    gc.collect()

    return True
Example #9
    def _taobao_keywords_spider(self, **kwargs):
        '''
        Crawl the data for goods_id_list and store it
        :param kwargs:
        :return:
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = [
            'https://item.taobao.com/item.htm?id=' + item
            for item in goods_id_list
        ]

        self.my_lg.info('About to start crawling goods for this keyword, please wait...')

        for item in goods_url_list:  # item is a goods_url
            result = False  # flag marking whether this goods was inserted
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.my_lg.error('Error extracting goods_id with re, please check!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.my_lg.info('goods_id [{0}] already exists in db!'.format(goods_id))
                result = True  # the already-existing case

            else:
                taobao = TaoBaoLoginAndParse(logger=self.my_lg)
                if self.add_goods_index % 20 == 0:  # reconnect every 20 iterations to avoid a long-lived connection going unresponsive
                    self.my_lg.info('Resetting and establishing a new database connection...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('New database connection established...')

                if self.my_pipeline.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.my_lg.error('@@@ Original goods URL: {0}'.format(item))
                        continue

                    else:
                        self.my_lg.info(
                            '------>>>| Updating goods_id (%s) | --------->>>@ index (%s)'
                            % (goods_id, str(self.add_goods_index)))
                        tt = taobao.get_goods_data(goods_id)
                        data = taobao.deal_with_data(goods_id=goods_id)
                        if data != {}:
                            data['goods_id'] = goods_id
                            data[
                                'goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(
                                    goods_id)
                            data['username'] = '******'
                            data['main_goods_id'] = None

                            # print('------>>>| Crawled data: ', data)
                            result = taobao.old_taobao_goods_insert_into_new_table(
                                data, pipeline=self.my_pipeline)
                        else:
                            pass

                else:  # db connection failed
                    self.my_lg.info('Database connection failed; the db may be down or under maintenance')
                self.add_goods_index += 1
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:  # only proceed when the goods_id was inserted or already existed in db
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id, keyword_id=keyword_id)

        self.my_lg.info('Goods for this keyword have been crawled!')

        return True
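
# The goods_id extraction used above, shown standalone; pure stdlib, so this
# runs as-is. URLs without an id parameter fall into the IndexError branch,
# mirroring the snippet's log-and-continue handling.
import re

for url in ('https://item.taobao.com/item.htm?id=533127076450',
            'https://item.taobao.com/item.htm'):
    try:
        goods_id = re.compile(r'id=(\d+)').findall(url)[0]
        print('goods_id:', goods_id)
    except IndexError:
        print('no goods_id in:', url)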
Example #10
def run_forever():
    #### update data in real time
    while True:
        # ** must not be a global placed outside the loop, otherwise everything is logged to the same file
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # use sqlalchemy to manage the db connection pool
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server.select_taobao_all_goods_id()

        except TypeError:
            my_lg.error('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> Below are all the matching goods_id returned by the database <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')

            my_lg.info('About to start the real-time data update, please wait...'.center(100, '#'))
            index = 1
            for item in result:  # update data in real time
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:  # reconnect every 50 iterations to avoid a long-lived connection going unresponsive
                    my_lg.info('Resetting and establishing a new database connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()

                    my_lg.info('New database connection established...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| Updating goods_id (%s) | --------->>>@ index (%s)'
                        % (item[0], str(index)))
                    data = taobao.get_goods_data(item[0])

                    if data.get('is_delete') == 1:  # handle goods that were already in delisted state when originally inserted
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])

                        # my_lg.info('------>>>| Crawled data: ' + str(data))
                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)

                        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # avoid hitting the server too frequently
                        index += 1
                        gc.collect()
                        continue

                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        # my_lg.info('------>>>| Crawled data: ' + str(data))
                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                    else:
                        my_lg.info('------>>>| Sleeping 5s...')
                        sleep(5)

                else:  # db connection failed
                    my_lg.error('Database connection failed; the db may be down or under maintenance')
                    sleep(10)

                index += 1
                # try:
                #     del taobao
                # except:
                #     pass
                gc.collect()
                # on an overseas server this can be shortened, even to 0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # not too frequent; stagger with user requests as much as possible
            my_lg.info('All data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()
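
# The "reconnect every N iterations" rhythm above (`index % 50 == 0`), shown
# with a stub pool class; FakePool stands in for SqlPools / the pipeline class.
class FakePool:
    is_connect_success = True

def refresh_conn(conn, index, every=50):
    if index % every == 0:
        print('resetting db connection at index', index)
        return FakePool()  # drop the old handle, hand back a fresh one
    return conn

conn = FakePool()
for index in range(1, 151):
    conn = refresh_conn(conn, index)  # resets at 50, 100 and 150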
Example #11
def run_forever():
    #### update data in real time
    while True:
        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # use sqlalchemy to manage the db connection pool
        tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server.select_taobao_all_goods_id()
            result_2 = list(tmp_sql_server_2.select_old_table_all_goods_id())
            # print(result_2)
        except TypeError:
            print('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> Below are all the matching goods_id returned by the database <<<------')
            print(result_2)
            print('--------------------------------------------------------')

            print('About to start the real-time data update, please wait...'.center(100, '#'))
            index = 1

            new_table_ali_1688_all_goods_id_list = [item[0] for item in result]
            for item in result_2:  # update data in real time
                data = {}
                taobao = TaoBaoLoginAndParse()
                if index % 50 == 0:  # reconnect every 50 iterations to avoid a long-lived connection going unresponsive
                    print('Resetting and establishing a new database connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()

                    print('New database connection established...')

                if tmp_sql_server.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item[0])
                    if goods_id == '':
                        print('@@@ Original goods URL: ', item[0])
                        continue
                    else:
                        if goods_id in new_table_ali_1688_all_goods_id_list:
                            print('This goods_id already exists in the database, skipping!')
                            continue

                        else:
                            print(
                                '------>>>| Updating goods_id (%s) | --------->>>@ index (%d)'
                                % (goods_id, index))
                            tt = taobao.get_goods_data(goods_id)
                            if tt.get('is_delete') == 1:  # handle delisted goods that still need inserting
                                tt['goods_id'] = goods_id
                                tt['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(
                                    goods_id)
                                tt['username'] = '******'
                                tt['main_goods_id'] = item[1]

                                # print('------>>>| Crawled data: ', data)
                                taobao.old_taobao_goods_insert_into_new_table(
                                    data=tt, pipeline=tmp_sql_server_2)

                                index += 1
                                gc.collect()
                                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                                continue
                            else:
                                pass

                            data = taobao.deal_with_data(goods_id=goods_id)
                            if data != {}:
                                data['goods_id'] = goods_id
                                data[
                                    'goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(
                                        goods_id)
                                data['username'] = '******'
                                data['main_goods_id'] = item[1]

                                # print('------>>>| Crawled data: ', data)
                                taobao.old_taobao_goods_insert_into_new_table(
                                    data, pipeline=tmp_sql_server_2)
                            else:
                                pass
                else:  # db connection failed
                    print('Database connection failed; the db may be down or under maintenance')
                index += 1
                # try:
                #     del taobao
                # except:
                #     pass
                gc.collect()
                # on an overseas server this can be shortened, even to 0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # not too frequent; stagger with user requests as much as possible
            print('All data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
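
# The dedupe above scans new_table_ali_1688_all_goods_id_list (a list) once
# per goods, which is O(n) per lookup; building a set gives O(1) membership
# checks. The rows below are made-up stand-ins for the two query results.
result = [('111',), ('222',), ('333',)]                      # new-table ids
result_2 = [('https://x/item.htm?id=222', 9),
            ('https://x/item.htm?id=444', 7)]                # old-table rows

existing_ids = {row[0] for row in result}  # set instead of list
for item in result_2:
    goods_id = item[0].split('id=')[-1]
    if goods_id in existing_ids:
        print('skip, already in db:', goods_id)
    else:
        print('would insert:', goods_id)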
Example #12
class TBUpdater(AsyncCrawler):
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/')
        self.sql_cli = None
        # 1 SqlServerMyPageInfoSaveItemPipeline | 2 SqlPools
        self.db_conn_type = 1
        self.goods_index = 1
        # concurrency level
        self.concurrency = 100
        self.concurrent_type = CONCURRENT_TYPE
        # 0 sqlserver | 1 new_my_server | 2 redis
        self.db_res_from = 2
        if 'armv7l-with-debian' in platform.platform():
            self.server_ip = 'http://0.0.0.0:80'
        else:
            self.server_ip = 'http://118.31.39.97'
            # self.server_ip = 'http://0.0.0.0:5000'

    async def _update_db(self):
        '''
        Update data in real time
        :return:
        '''
        while True:
            # long runs raise OSError: [Errno 24] Too many open files, so one-log-per-day is not used
            # self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
                self.taobao = TaoBaoLoginAndParse(logger=self.lg, is_real_times_update_call=True)
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        break

                    one_res, index = await self._get_one_res(
                        slice_params_list=slice_params_list,
                        index=index)
                    await self._except_sleep(res=one_res)

                self.lg.info('All data updated'.center(100, '#'))  # sleep(60*60)

            if get_shanghai_time().hour == 0:
                # no updates after midnight
                await async_sleep(60 * 60 * .5)
            else:
                await async_sleep(5.)
            try:
                # del self.lg
                del result
            except:
                pass
            collect()

    async def _get_db_old_data(self) -> (list, None):
        '''
        Fetch the db data that needs updating
        :return:
        '''
        if self.db_conn_type == 1:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        elif self.db_conn_type == 2:
            # use sqlalchemy to manage the db connection pool
            self.sql_cli = SqlPools()
        else:
            raise ValueError('invalid db_conn_type value!')

        result = None
        try:
            if self.db_res_from == 0:
                result = self.sql_cli._select_table(sql_str=tb_select_str_3,)

            elif self.db_res_from == 1:
                result = await get_waited_2_update_db_data_from_server(
                    server_ip=self.server_ip,
                    _type='tb',
                    child_type=0,)
            elif self.db_res_from == 2:
                # fetch a large slice by default, so even if the first items fail at a high rate the rest can still be updated
                result = get_waited_2_update_db_data_from_redis_server(
                    spider_name='tb0',
                    logger=self.lg,
                    slice_num=800,)
            else:
                raise ValueError('invalid self.db_res_from value!')

        except TypeError:
            self.lg.error('TypeError: database connection failed... (possibly under maintenance)')
        except Exception:
            self.lg.error('Encountered an error:', exc_info=True)

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_one_res(self, slice_params_list: list, index) -> tuple:
        """
        Get the one_res corresponding to slice_params_list
        :param slice_params_list:
        :param index:
        :return: (list, int)
        """
        def get_tasks_params_list(slice_params_list: list, index: int) -> list:
            tasks_params_list = []
            for item in slice_params_list:
                db_goods_info_obj = TBDbGoodsInfoObj(item=item, logger=self.lg)
                tasks_params_list.append({
                    'db_goods_info_obj': db_goods_info_obj,
                    'index': index,
                })
                index += 1

            return tasks_params_list

        def get_create_task_msg(k) -> str:
            return 'create task[where is goods_id: {}, index: {}] ...'.format(
                k['db_goods_info_obj'].goods_id,
                k['index'],)

        def get_now_args(k) -> list:
            return [
                'tb',
                k['db_goods_info_obj'].goods_id,
                k['index'],
                self.lg,
            ]

        async def handle_one_res(one_res: list):
            """
            Post-processing for one_res
            :param one_res:
            :return:
            """
            nonlocal slice_params_list

            # build the new new_slice_params_list
            new_slice_params_list = []
            for item in slice_params_list:
                goods_id = item[1]
                for i in one_res:
                    # self.lg.info(str(i))
                    try:
                        goods_id2 = i[1]
                        index = i[2]
                        if goods_id == goods_id2:
                            new_slice_params_list.append({
                                'index': index,
                                'before_goods_data': i[3],
                                'end_goods_data': i[4],
                                'item': item,
                            })
                            break
                        else:
                            continue
                    except IndexError:
                        continue

            # store in a blocking fashion to avoid mass db deadlocks under high concurrency
            tasks = []
            for k in new_slice_params_list:
                item = k['item']
                index = k['index']
                db_goods_info_obj = TBDbGoodsInfoObj(item=item, logger=self.lg)
                self.lg.info('create task[where is goods_id: {}, index: {}]...'.format(
                    db_goods_info_obj.goods_id,
                    index))
                tasks.append(self.loop.create_task(self._update_one_goods_info_in_db(
                    db_goods_info_obj=db_goods_info_obj,
                    index=index,
                    before_goods_data=k['before_goods_data'],
                    end_goods_data=k['end_goods_data'],)))

            # self.lg.error(str(one_res))
            # self.lg.error(str(tasks))
            one_res = await _get_async_task_result(
                tasks=tasks,
                logger=self.lg)
            # pprint(one_res)
            try:
                del new_slice_params_list
            except:
                pass

            return one_res

        # tasks = []
        # # method 1
        # for item in slice_params_list:
        #     db_goods_info_obj = TBDbGoodsInfoObj(item=item, logger=self.lg)
        #     self.lg.info('creating task goods_id: {}'.format(db_goods_info_obj.goods_id))
        #     tasks.append(self.loop.create_task(self._update_one_goods_info(
        #         db_goods_info_obj=db_goods_info_obj,
        #         index=index)))
        #     index += 1
        #
        # res = await _get_async_task_result(tasks=tasks, logger=self.lg)

        # method 2
        one_res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=get_tasks_params_list(
                slice_params_list=slice_params_list,
                index=index,),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=block_get_one_goods_info_task_by_external_type,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res2,
            one_default_res=(),
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,
            concurrent_type=self.concurrent_type,
        )
        # pprint(one_res)
        res = await handle_one_res(one_res=one_res)

        return (res, index)

    async def _update_one_goods_info_in_db(self,
                                           db_goods_info_obj,
                                           index,
                                           before_goods_data,
                                           end_goods_data):
        """
        更新单个goods
        :param db_goods_info_obj:
        :param index:
        :param before_goods_data:
        :param end_goods_data:
        :return:
        """
        res = False

        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            db_conn_type=self.db_conn_type,
            remainder=25,)
        if self.sql_cli.is_connect_success:
            self.lg.info('*' * 20 + ' updating goods_id: {}, index: {} ...'.format(
                db_goods_info_obj.goods_id,
                index, ))
            # recorded up front so an empty parse result for a delisted item does not trigger the sleep below
            before_goods_data_is_delete = before_goods_data.get('is_delete', 0)
            if end_goods_data != {}:
                data = get_goods_info_change_data(
                    target_short_name='tb',
                    logger=self.lg,
                    data=end_goods_data,
                    db_goods_info_obj=db_goods_info_obj, )
                res = to_right_and_update_tb_data(
                    data=data,
                    pipeline=self.sql_cli,
                    logger=self.lg,)

            else:  # the returned data is empty
                if before_goods_data_is_delete == 1:
                    # found to be delisted on re-check: still mark res as True
                    res = True
                else:
                    self.lg.info('goods_id: {}, blocking sleep 7s...'.format(
                        db_goods_info_obj.goods_id,))
                    await async_sleep(delay=7., loop=self.loop)
                    # a process-blocking sleep instead would hang the machine
                    # sleep(7.)

        else:
            self.lg.error('Database connection failed; the db may be down or under maintenance')
            await async_sleep(delay=5, loop=self.loop)

        await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        collect()

        return [db_goods_info_obj.goods_id, res]

    async def _get_new_tb_obj(self, index) -> None:
        if index % 10 == 0:
            try:
                del self.taobao
            except:
                pass
            collect()
            self.taobao = TaoBaoLoginAndParse(logger=self.lg, is_real_times_update_call=True)

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        '''
        Update a single goods item
        :return:
        '''
        res = False
        await self._get_new_tb_obj(index=index)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            db_conn_type=self.db_conn_type,
            remainder=25,)
        if self.sql_cli.is_connect_success:
            self.lg.info('------>>>| Updating goods_id (%s) | --------->>>@ index (%s)' % (
                db_goods_info_obj.goods_id,
                str(index)))
            oo = self.taobao.get_goods_data(goods_id=db_goods_info_obj.goods_id)
            oo_is_delete = oo.get('is_delete', 0)  # recorded up front so an empty parse result for a delisted item does not trigger the sleep below
            data = self.taobao.deal_with_data(goods_id=db_goods_info_obj.goods_id)
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='tb',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,)
                res = to_right_and_update_tb_data(
                    data=data,
                    pipeline=self.sql_cli,
                    logger=self.lg)

            else:
                if oo_is_delete == 1:
                    # found to be delisted on re-check: still mark res as True
                    res = True
                else:
                    self.lg.info('------>>>| Sleeping 8s...')
                    await async_sleep(delay=8, loop=self.loop)

        else:  # db connection failed
            self.lg.error('Database connection failed; the db may be down or under maintenance')
            await async_sleep(delay=10, loop=self.loop)

        index += 1
        self.goods_index = index
        collect()
        # on an overseas server this can be shortened, even to 0s
        await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # not too frequent; stagger with user requests as much as possible

        return [db_goods_info_obj.goods_id, res]

    async def _except_sleep(self, res):
        '''
        Sleep on abnormal failure counts
        :param res:
        :return:
        '''
        count = 0
        all_count_fail_sleep_time = 100.
        # originally slept 40.; no sleep now
        sleep_time = 0.
        for item in res:
            try:
                if not item[1]:
                    count += 1
            except IndexError:
                pass
        self.lg.info('Fail count: {}, concurrency: {}'.format(count, self.concurrency))
        if count/self.concurrency >= .96:
            # sleep strategy for a (near-)total failure
            self.lg.info('Crawl abnormal!! Sleeping {}s...'.format(all_count_fail_sleep_time))
            await async_sleep(all_count_fail_sleep_time)

        else:
            if count >= int(self.concurrency/5):
                self.lg.info('Crawl abnormal!! Sleeping {}s...'.format(sleep_time))
                await async_sleep(sleep_time)

        return None

    def __del__(self):
        try:
            del self.lg
        except:
            pass
        try:
            del self.sql_cli
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
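
# TasksParamsListObj slices the full result list into concurrency-sized
# batches and, as used above, signals exhaustion by raising AssertionError
# from __next__(); that protocol is this repo's convention. A plain generator
# sketch of the same batching:
def iter_batches(params_list, step):
    for i in range(0, len(params_list), step):
        yield params_list[i:i + step]

for batch in iter_batches(list(range(7)), step=3):
    print(batch)  # [0, 1, 2] then [3, 4, 5] then [6]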
Example #13
    async def _crawl_and_save_these_goods(self, goods_url_list):
        '''
        Crawl the goods recommended by this article
        :param goods_url_list:
        :return:
        '''
        sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=1 or SiteID=3 or SiteID=4 or SiteID=6'

        try:
            result = self.my_pipeline._select_table(sql_str=sql_str)
        except TypeError:
            result = []

        self.my_lg.info('About to start crawling goods for this article, please wait...')
        index = 1

        db_all_goods_id_list = [item[0] for item in result]
        for item in goods_url_list:
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item.get('goods_url', ''))[0]
            except IndexError:
                self.my_lg.error('Error extracting goods_id with re, please check!')
                continue

            if goods_id in db_all_goods_id_list:
                self.my_lg.info('goods_id [{0}] already exists in db!'.format(goods_id))
                continue

            else:
                taobao = TaoBaoLoginAndParse(logger=self.my_lg)
                if index % 50 == 0:  # reconnect every 50 iterations to avoid a long-lived connection going unresponsive
                    self.my_lg.info('Resetting and establishing a new database connection...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('New database connection established...')

                if self.my_pipeline.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item.get('goods_url', ''))
                    if goods_id == '':
                        self.my_lg.info('@@@ Original goods URL: {0}'.format(item.get('goods_url', '')))
                        continue

                    else:
                        self.my_lg.info('------>>>| Updating goods_id (%s) | --------->>>@ index (%s)' % (goods_id, str(index)))
                        tt = taobao.get_goods_data(goods_id)
                        data = taobao.deal_with_data(goods_id=goods_id)
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                            data['username'] = '******'
                            data['main_goods_id'] = None

                            # print('------>>>| Crawled data: ', data)
                            taobao.old_taobao_goods_insert_into_new_table(data, pipeline=self.my_pipeline)


                else:  # db connection failed
                    self.my_lg.info('Database connection failed; the db may be down or under maintenance')
                index += 1
                gc.collect()
                await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        self.my_lg.info('Goods for this article have been crawled!')

        return True
Example #14
    def _taobao_keywords_spider(self, **kwargs):
        '''
        Crawl the data for goods_id_list and store it
        :param kwargs:
        :return:
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = ['https://item.taobao.com/item.htm?id=' + item for item in goods_id_list]

        self.lg.info('About to start crawling goods for this keyword, please wait...')
        for item in goods_url_list:     # item is a goods_url
            # flag marking whether this goods was inserted
            result = False
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.lg.error('Error extracting goods_id with re, please check!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('goods_id [{0}] already exists in db!'.format(goods_id))
                result = True   # the already-existing case

            else:
                taobao = TaoBaoLoginAndParse(logger=self.lg, is_real_times_update_call=True)
                self.sql_cli = _block_get_new_db_conn(
                    db_obj=self.sql_cli,
                    index=self.add_goods_index,
                    logger=self.lg,
                    remainder=20,)
                if self.sql_cli.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.lg.error('@@@ Original goods url: {0}'.format(item))
                        continue

                    else:
                        self.lg.info('------>>>| goods_id being updated: (%s) | --------->>>@ index: (%s)' % (goods_id, str(self.add_goods_index)))
                        taobao.get_goods_data(goods_id)  # fetch first, then parse below
                        data = taobao.deal_with_data(goods_id=goods_id)
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            if not self.check_target_data_is_legal(target_data=data):
                                # note: illegal target data aborts the whole keyword run, not just this goods
                                return False

                            result = taobao.old_taobao_goods_insert_into_new_table(data, pipeline=self.sql_cli)

                else:  # database connection failed
                    self.lg.info('Database connection failed; the db may be down or under maintenance')
                self.add_goods_index += 1
                collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:
                # only handle a goods_id that was just inserted or already existed in the db
                self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)

        self.lg.info('Finished crawling the goods for this keyword!')

        return True
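
# _block_get_new_db_conn() used in _taobao_keywords_spider() above is defined
# elsewhere in this project. Based on the reconnect-every-N-iterations pattern
# used throughout this file, a minimal sketch (assumed signature, for
# illustration only) might look like this:
def _block_get_new_db_conn_sketch(db_obj, index, logger, remainder=50):
    '''Re-create the db connection every `remainder` iterations to avoid a stale long-lived connection.'''
    if index % remainder == 0:
        logger.info('Resetting and establishing a new database connection...')
        db_obj = SqlServerMyPageInfoSaveItemPipeline()
        logger.info('New database connection established successfully...')
    return db_obj
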
def run_forever():
    #### real-time data update
    while True:
        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # use sqlalchemy to manage the db connection pool
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server.select_taobao_all_goods_id()

        except TypeError:
            print('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> all matching goods_id returned by the db <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('About to start the real-time update, please wait...'.center(100, '#'))
            index = 1
            for item in result:  # real-time update of each goods
                taobao = TaoBaoLoginAndParse()
                if index % 50 == 0:  # reconnect every 50 iterations to avoid a stale long-lived connection erroring out
                    print('Resetting and establishing a new database connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()

                    print('New database connection established successfully...')

                if tmp_sql_server.is_connect_success:
                    print(
                        '------>>>| goods_id being updated: (%s) | --------->>>@ index: (%d)'
                        % (item[0], index))
                    taobao.get_goods_data(item[0])
                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        # print('------>>>| crawled data: ', data)
                        '''
                        Record the shelf/down time based on the last refreshed goods status
                        '''
                        # 1. is_delete 0 -> 1: record down_time (goods taken off the shelf)
                        # 2. is_delete 1 -> 0: record shelf_time (goods put back on the shelf)
                        my_shelf_and_down_time = {
                            'shelf_time': '',
                            'down_time': '',
                        }
                        if data['is_delete'] != item[1]:
                            if data['is_delete'] == 1 and item[1] == 0:
                                # is_delete went 0 -> 1: the goods changed from on-shelf to off-shelf
                                # (the original condition tested new==0 and old==1, which contradicted
                                # this comment and the rule above; fixed to match the stated intent)
                                my_shelf_and_down_time['down_time'] = str(
                                    get_shanghai_time())
                            else:
                                # is_delete went 1 -> 0: the goods changed from off-shelf back to on-shelf
                                my_shelf_and_down_time['shelf_time'] = str(
                                    get_shanghai_time())
                        else:
                            if item[2] is None or item[
                                    2] == '{"shelf_time": "", "down_time": ""}' or len(
                                        item[2]) == 35:  # 35 == len of that initial placeholder str
                                if data['is_delete'] == 0:  # currently on-shelf
                                    my_shelf_and_down_time['shelf_time'] = str(
                                        get_shanghai_time())
                                else:  # currently off-shelf
                                    my_shelf_and_down_time['down_time'] = str(
                                        get_shanghai_time())
                            else:
                                # otherwise keep the stored value unchanged
                                tmp_shelf_and_down_time = item[2]
                                my_shelf_and_down_time = json.loads(
                                    tmp_shelf_and_down_time)  # parse the json str into a dict
                        data['my_shelf_and_down_time'] = my_shelf_and_down_time
                        # print(my_shelf_and_down_time)

                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                else:  # database connection failed
                    print('Database connection failed; the db may be down or under maintenance')
                index += 1
                # try:
                #     del taobao
                # except:
                #     pass
                gc.collect()
                # on an overseas server this can be shortened, even set to 0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # don't poll too frequently; try to stagger with user requests
            print('All data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # stop updating after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
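
# The shelf/down-time bookkeeping above could be factored into a small helper; here
# is a sketch of the same state machine (a hypothetical refactor, for illustration,
# not the author's code; it drops the redundant len == 35 placeholder test):
def _get_shelf_and_down_time_sketch(new_is_delete, old_is_delete, old_json_str):
    '''is_delete 0 -> 1 stamps down_time; 1 -> 0 stamps shelf_time; otherwise keep the stored value.'''
    now = str(get_shanghai_time())
    res = {'shelf_time': '', 'down_time': ''}
    if new_is_delete != old_is_delete:
        if new_is_delete == 1:
            res['down_time'] = now   # just taken off the shelf
        else:
            res['shelf_time'] = now  # just put back on the shelf
    elif old_json_str is None or old_json_str == '{"shelf_time": "", "down_time": ""}':
        # still holding the initial placeholder: stamp the current state once
        if new_is_delete == 0:
            res['shelf_time'] = now
        else:
            res['down_time'] = now
    else:
        res = json.loads(old_json_str)  # keep the stored value unchanged
    return res
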
def run_forever():
    #### real-time data update
    while True:
        # ** must not be created as a global outside the loop; otherwise every run keeps logging to the same file
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # use sqlalchemy to manage the db connection pool
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server._select_table(sql_str=tb_select_str_3, )
        except TypeError:
            my_lg.error('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> all matching goods_id returned by the db <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')
            my_lg.info('Total items to update: {0}'.format(len(result)))

            my_lg.info('About to start the real-time update, please wait...'.center(100, '#'))
            index = 1
            for item in result:  # real-time update of each goods
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:  # reconnect every 50 iterations to avoid a stale long-lived connection erroring out
                    my_lg.info('Resetting and establishing a new database connection...')
                    tmp_sql_server = SqlPools()
                    my_lg.info('New database connection established successfully...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| goods_id being updated: (%s) | --------->>>@ index: (%s)'
                        % (item[0], str(index)))
                    oo = taobao.get_goods_data(item[0])
                    oo_is_delete = oo.get('is_delete', 0)  # used below to skip the error sleep for off-shelf goods
                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[2],
                                old_taobao_price=item[3],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[6]),
                                site_id=1)
                        except AttributeError:  # handle sku info that was already formatted
                            old_sku_info = item[6]
                        # note: this assignment overwrites the _is_price_change set by _get_price_change_info() above
                        data['_is_price_change'], data[
                            'sku_info_trans_time'] = get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(
                                    data['price_info_list'], site_id=1),
                                is_price_change=item[7]
                                if item[7] is not None else 0)

                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                    else:
                        if oo_is_delete != 1:
                            my_lg.info('------>>>| sleeping 4s...')
                            sleep(4)

                else:  # database connection failed
                    my_lg.error('Database connection failed; the db may be down or under maintenance')
                    sleep(10)

                index += 1
                gc.collect()
                # on an overseas server this can be shortened, even set to 0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # don't poll too frequently; try to stagger with user requests
            my_lg.info('All data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # stop updating after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()
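
# restart_program() is imported from elsewhere in this project; a common way to
# implement this kind of self-restart (a sketch, not necessarily the author's
# version) is to replace the current process with a fresh interpreter:
def restart_program_sketch():
    '''Re-exec the current script in place, inheriting the original argv.'''
    import os
    import sys
    python = sys.executable
    os.execl(python, python, *sys.argv)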