Example #1
0
 def _set_logger(self, logger):
     if logger is None:
         self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                                 '/网易严选/_/' +
                                 str(get_shanghai_time())[0:10] + '.txt',
                                 console_log_level=INFO,
                                 file_log_level=ERROR)
     else:
         self.my_lg = logger
Example #2
0
 def _set_logger(self, logger):
     if logger is None:
         self.my_lg = set_logger(
             log_file_name=MY_SPIDER_LOGS_PATH + '/聚美优品/拼团/' +
             self.get_log_file_name_from_time() + '.txt',
             console_log_level=INFO,
             file_log_level=ERROR)
     else:
         self.my_lg = logger
 def _set_logger(self, logger):
     if logger is None:
         self.lg = set_logger(logger_name=get_uuid1(),
                              log_file_name=MY_SPIDER_LOGS_PATH +
                              '/聚美优品/拼团/' + str(get_shanghai_time())[0:10] +
                              '.txt',
                              console_log_level=INFO,
                              file_log_level=ERROR)
     else:
         self.lg = logger
Example #4
0
 def _set_logger(self, logger):
     '''
     设置logger
     :param logger:
     :return:
     '''
     if logger is None:
         self.my_lg = set_logger(
             log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/comment/' + str(get_shanghai_time())[0:10] + '.txt',
             console_log_level=INFO,
             file_log_level=ERROR
         )
     else:
         self.my_lg = logger
Example #5
0
async def run_forever():
    #### 实时更新数据
    # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中, 不能实现每日一志
    my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/天天特价/' +
                       str(get_shanghai_time())[0:10] + '.txt',
                       console_log_level=INFO,
                       file_log_level=ERROR)

    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    # 由于不处理下架的商品,所以is_delete=0
    try:
        result = list(tmp_sql_server._select_table(sql_str=tb_select_str_7))
    except TypeError:
        my_lg.error('TypeError错误, 导致原因: 数据库连接失败...(可能维护中)')
        return None

    my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
    my_lg.info(str(result))
    my_lg.info('--------------------------------------------------------')
    my_lg.info('待更新的goods_id个数: {0}'.format(len(result)))

    my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
    index = 1
    # tmp_taobao_tiantiantejia = TaoBaoTianTianTeJia(logger=my_lg)
    for item in result:  # 实时更新数据
        if index % 50 == 0:
            my_lg.info('正在重置,并与数据库建立新连接中...')
            # try: del tmp_sql_server
            # except: pass
            # gc.collect()
            tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
            my_lg.info('与数据库的新连接成功建立...')

        if tmp_sql_server.is_connect_success:
            tejia_end_time = item[2]
            # my_lg.info(str(tejia_end_time))

            if item[1] == 1:  # 原先下架的商品,扫描到不处理
                # tmp_sql_server.delete_taobao_tiantiantejia_expired_goods_id(goods_id=item[0])
                # my_lg.info('该商品goods_id[{0}]已售完, 删除成功!'.format(item[0]))
                my_lg.info(
                    '&&&&&& 该商品({0})原先状态为is_delete=1, 不进行实际删除操作! 索引为({1})'.
                    format(item[0], str(index)))
                index += 1
                pass

            elif tejia_end_time < datetime.datetime.now():
                # 过期的不删除, 降为更新为常规爆款促销商品
                index = await update_expired_goods_to_normal_goods(
                    goods_id=item[0],
                    index=index,
                    tmp_sql_server=tmp_sql_server,
                    logger=my_lg)
                pass

            else:
                # 下面为天天特价商品信息更新
                '''
                ** 由于天天特价不会提前下架商品,就不对应更新特价时间段
                '''
                # # 先检查该商品在对应的子分类中是否已经被提前下架, 并获取到该商品的上下架时间
                # if index % 6 == 0:
                #     try: del tmp_taobao_tiantiantejia
                #     except: pass
                #     gc.collect()
                #     tmp_taobao_tiantiantejia = TaoBaoTianTianTeJia(logger=my_lg)
                #
                # tmp_body = await tmp_taobao_tiantiantejia.get_one_api_body(current_page=item[4], category=item[3])
                # if tmp_body == '':
                #     msg = '获取到的tmp_body为空str! 出错category为: ' + item[3]
                #     my_lg.error(msg)
                #     continue
                #
                # try:
                #     tmp_body = re.compile(r'\((.*?)\)').findall(tmp_body)[0]
                # except IndexError:
                #     msg = 're筛选body时出错, 请检查! 出错category为: ' + item[3]
                #     my_lg.error(msg)
                #     continue
                # tmp_sort_data = await tmp_taobao_tiantiantejia.get_sort_data_list(body=tmp_body)
                # if tmp_sort_data == 'no items':
                #     my_lg.info('该api接口获取到的item_list为no items!请检查')
                #     break
                # tejia_goods_list = await tmp_taobao_tiantiantejia.get_tiantiantejia_goods_list(data=tmp_sort_data)
                # # my_lg.info(str(tejia_goods_list))
                # await asyncio.sleep(.45)
                # # my_lg.info('111')
                '''
                研究发现已经上架的天天特价商品不会再被官方提前下架,所以此处什么都不做,跳过
                '''
                # if is_in_child_sort(tejia_goods_list, goods_id=item[0]) is False:     # 表示被官方提前下架
                #     # tmp_sql_server.delete_taobao_tiantiantejia_expired_goods_id(goods_id=item[0])
                #     # print('该商品goods_id[{0}]已被官方提前下架, 删除成功!'.format(item[0]))
                #     print('222')
                #     pass

                # else:       # 表示商品未被提前下架
                my_lg.info(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' %
                    (item[0], str(index)))
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                taobao.get_goods_data(item[0])
                goods_data = taobao.deal_with_data(goods_id=item[0])
                if goods_data != {}:
                    # tmp_time = await get_this_goods_id_tejia_time(tejia_goods_list, goods_id=item[0])
                    # if tmp_time != []:
                    #     begin_time, end_time = tmp_time
                    #
                    #     goods_data['goods_id'] = item[0]
                    #     goods_data['schedule'] = [{
                    #         'begin_time': begin_time,
                    #         'end_time': end_time,
                    #     }]
                    #     goods_data['tejia_begin_time'], goods_data['tejia_end_time'] = await tmp_taobao_tiantiantejia.get_tejia_begin_time_and_tejia_end_time(schedule=goods_data.get('schedule', [])[0])
                    #     await taobao.update_taobao_tiantiantejia_table(data=goods_data, pipeline=tmp_sql_server)
                    # else:
                    #     my_lg.info('该goods_id不在该api接口的商品中!!')
                    #     pass

                    goods_data['goods_id'] = item[0]
                    '''不专门更新上下架时间段'''
                    # goods_data['schedule'] = [{
                    #     'begin_time': begin_time,
                    #     'end_time': end_time,
                    # }]
                    # goods_data['tejia_begin_time'], goods_data['tejia_end_time'] = await tmp_taobao_tiantiantejia.get_tejia_begin_time_and_tejia_end_time(schedule=goods_data.get('schedule', [])[0])
                    if goods_data.get('is_delete', 0) == 1:
                        my_lg.info('@该商品已下架...')

                    await taobao.update_taobao_tiantiantejia_table(
                        data=goods_data, pipeline=tmp_sql_server)

                else:
                    await asyncio.sleep(4)  # 否则休息4秒
                    pass

                await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                index += 1
                gc.collect()

        else:  # 表示返回的data值为空值
            my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
            pass
        gc.collect()
    my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
    if get_shanghai_time().hour == 0:  # 0点以后不更新
        # sleep(60 * 60 * .5)
        pass

    else:
        sleep(5)
    gc.collect()

    return True
Example #6
0
def run_forever():
    while True:
        # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/天猫/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR
        )

        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        #  and GETDATE()-ModfiyTime>0.2
        sql_str = '''
        select SiteID, GoodsID, IsDelete, Price, TaoBaoPrice, shelf_time, delete_time
        from dbo.GoodsInfoAutoGet 
        where (SiteID=3 or SiteID=4 or SiteID=6) and MainGoodsID is not null 
        order by ID desc'''

        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info('--------------------------------------------------------')

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
            tmall = TmallParse(logger=my_lg)
            for item in result:  # 实时更新数据
                if index % 5 == 0:
                    try:del tmall
                    except: pass
                    tmall = TmallParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:    # 每10次重连一次,避免单次长连无响应报错
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    tmp_item = []
                    if item[0] == 3:        # 从数据库中取出时,先转换为对应的类型
                        tmp_item.append(0)
                    elif item[0] == 4:
                        tmp_item.append(1)
                    elif item[0] == 6:
                        tmp_item.append(2)
                    tmp_item.append(item[1])
                    data = tmall.get_goods_data(goods_id=tmp_item)
                    if isinstance(data, int):       # 单独处理return 4041
                        index += 1
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        continue

                    if data.get('is_delete') == 1:  # 单独处理下架商品
                        data['goods_id'] = item[1]

                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])

                        # my_lg.info('------>>>| 爬取到的数据为: %s' % str(data))
                        tmall.to_right_and_update_data(data, pipeline=tmp_sql_server)

                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        index += 1
                        gc.collect()
                        continue

                    data = tmall.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])
                        data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                            old_price=item[3],
                            old_taobao_price=item[4],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price']
                        )
                        # my_lg.info(str(data['_is_price_change']) + ' ' +str(data['_price_change_info']))

                        # my_lg.info('------>>>| 爬取到的数据为: %s' % str(data))
                        tmall.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:  # 表示返回的data值为空值
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:  # 表示返回的data值为空值
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
            
        if get_shanghai_time().hour == 0:   # 0点以后不更新
            sleep(60*60*5.5)
        else:
            sleep(5)
        gc.collect()
def run_forever():
    while True:
        # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中
        my_lg = set_logger(
            logger_name=get_uuid1(),
            log_file_name=MY_SPIDER_LOGS_PATH + '/蜜芽/实时更新/' +
            str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )

        #### 实时更新数据
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=mia_select_str_5))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            mia = MiaParse()
            for item in result:
                goods_id = item[1]
                if index % 5 == 0:
                    try:
                        del mia
                    except:
                        pass
                    mia = MiaParse()
                    collect()

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    mia.get_goods_data(goods_id=goods_id)
                    data = mia.deal_with_data()
                    db_goods_info_obj = MIADbGoodsInfoObj(item=item,
                                                          logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:  # 单独处理下架商品
                            my_lg.info('@@@ 该商品已下架...')
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        else:
                            data = get_goods_info_change_data(
                                target_short_name='mia',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )

                        mia._to_right_and_update_data(data, pipeline=sql_cli)
                    else:  # 表示返回的data值为空值
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:  # 表示返回的data值为空值
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        try:
            del my_lg
        except:
            pass
        collect()
Example #8
0
from api import IpPoolsObj

from fzutils.log_utils import set_logger
from fzutils.time_utils import get_shanghai_time
from fzutils.data.pickle_utils import (
    deserializate_pickle_object,
    serialize_obj_item_2_dict,
)
from fzutils.safe_utils import get_uuid3
from fzutils.sql_utils import BaseRedisCli
from fzutils.data.list_utils import list_remove_repeat_dict
from fzutils.linux_utils import _get_simulate_logger
from fzutils.ip_utils import get_local_external_network_ip

lg = set_logger(log_file_name=SPIDER_LOG_PATH +
                str(get_shanghai_time())[0:10] + '.log',
                console_log_level=INFO,
                file_log_level=ERROR)
redis_cli = BaseRedisCli()
_key = get_uuid3(proxy_list_key_name)  # 存储proxy_list的key
_h_key = get_uuid3(high_proxy_list_key_name)
# 本地外网ip值
local_ip = ''
# ip pool obj
ip_pools_obj = IpPoolsObj(_k=high_proxy_list_key_name)


def get_proxy_process_data():
    '''
    抓取代理并更新redis中的值
    :return:
    '''
Example #9
0
async def run_forever():
    #### 实时更新数据
    # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中, 不能实现每日一志
    lg = set_logger(logger_name=get_uuid1(),
                    log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/天天特价/' +
                    str(get_shanghai_time())[0:10] + '.txt',
                    console_log_level=INFO,
                    file_log_level=ERROR)

    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    # 由于不处理下架的商品,所以is_delete=0
    try:
        # todo 先不处理过期的因为后台没有同步下架会导致其无法查到数据
        # tmp_sql_server._delete_table(sql_str=tb_delete_str_2, params=None)
        # await async_sleep(10)
        result = list(tmp_sql_server._select_table(sql_str=tb_select_str_7))
    except TypeError:
        lg.error('TypeError错误, 导致原因: 数据库连接失败...(可能维护中)')
        return None

    await _print_db_old_data(
        result=result,
        logger=lg,
    )

    index = 1
    for item in result:
        goods_id = item[0]
        tejia_end_time = item[2]

        tmp_sql_server = await _get_new_db_conn(
            db_obj=tmp_sql_server,
            index=index,
            logger=lg,
            db_conn_type=1,
        )
        if tmp_sql_server.is_connect_success:
            # lg.info(str(tejia_end_time))
            if tejia_end_time < get_shanghai_time():
                # 过期的不删除, 降为更新为常规爆款促销商品
                # index = await update_expired_goods_to_normal_goods(
                #     goods_id=goods_id,
                #     index=index,
                #     tmp_sql_server=tmp_sql_server,
                #     logger=lg
                # )
                # 过期直接下架
                lg.info('@@ 过期下架[goods_id: {}]'.format(goods_id))
                _handle_goods_shelves_in_auto_goods_table(
                    goods_id=goods_id,
                    logger=lg,
                    update_sql_str=tb_update_str_5,
                )
                index += 1

            else:
                # 下面为天天特价商品信息更新
                '''
                ** 由于天天特价不会提前下架商品,就不对应更新特价时间段
                '''
                # # 先检查该商品在对应的子分类中是否已经被提前下架, 并获取到该商品的上下架时间
                # if index % 6 == 0:
                #     try: del tmp_taobao_tiantiantejia
                #     except: pass
                #     collect()
                #     tmp_taobao_tiantiantejia = TaoBaoTianTianTeJia(logger=lg)
                #
                # tmp_body = await tmp_taobao_tiantiantejia.get_one_api_body(current_page=item[4], category=item[3])
                # if tmp_body == '':
                #     msg = '获取到的tmp_body为空str! 出错category为: ' + item[3]
                #     lg.error(msg)
                #     continue
                #
                # try:
                #     tmp_body = re.compile(r'\((.*?)\)').findall(tmp_body)[0]
                # except IndexError:
                #     msg = 're筛选body时出错, 请检查! 出错category为: ' + item[3]
                #     lg.error(msg)
                #     continue
                # tmp_sort_data = await tmp_taobao_tiantiantejia.get_sort_data_list(body=tmp_body)
                # if tmp_sort_data == 'no items':
                #     lg.info('该api接口获取到的item_list为no items!请检查')
                #     break
                # tejia_goods_list = await tmp_taobao_tiantiantejia.get_tiantiantejia_goods_list(data=tmp_sort_data)
                # # lg.info(str(tejia_goods_list))
                # await async_sleep(.45)
                # # lg.info('111')
                '''
                研究发现已经上架的天天特价商品不会再被官方提前下架,所以此处什么都不做,跳过
                '''
                # if is_in_child_sort(tejia_goods_list, goods_id=goods_id) is False:     # 表示被官方提前下架
                #     # tmp_sql_server.delete_taobao_tiantiantejia_expired_goods_id(goods_id=goods_id)
                #     # print('该商品goods_id[{0}]已被官方提前下架, 删除成功!'.format(goods_id))
                #     print('222')
                #     pass

                # else:       # 表示商品未被提前下架
                lg.info(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' %
                    (goods_id, str(index)))
                taobao = TaoBaoLoginAndParse(
                    logger=lg,
                    is_real_times_update_call=is_real_times_update_call)
                taobao.get_goods_data(goods_id)
                goods_data = taobao.deal_with_data(goods_id=goods_id)
                if goods_data != {}:
                    # tmp_time = await get_this_goods_id_tejia_time(tejia_goods_list, goods_id=goods_id)
                    # if tmp_time != []:
                    #     begin_time, end_time = tmp_time
                    #
                    #     goods_data['goods_id'] = goods_id
                    #     goods_data['schedule'] = [{
                    #         'begin_time': begin_time,
                    #         'end_time': end_time,
                    #     }]
                    #     goods_data['tejia_begin_time'], goods_data['tejia_end_time'] = await tmp_taobao_tiantiantejia.get_tejia_begin_time_and_tejia_end_time(schedule=goods_data.get('schedule', [])[0])
                    #     await taobao.update_taobao_tiantiantejia_table(data=goods_data, pipeline=tmp_sql_server)
                    # else:
                    #     lg.info('该goods_id不在该api接口的商品中!!')
                    #     pass

                    goods_data['goods_id'] = goods_id
                    if goods_data.get('is_delete', 0) == 1:
                        lg.info('@该商品已下架...')

                    await taobao.update_taobao_tiantiantejia_table(
                        data=goods_data, pipeline=tmp_sql_server)

                else:
                    await async_sleep(4)

                await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                index += 1
                collect()

        else:
            lg.error('数据库连接失败,数据库可能关闭或者维护中')
            pass
        collect()
    lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
    if get_shanghai_time().hour == 0:  # 0点以后不更新
        # sleep(60 * 60 * .5)
        await async_sleep(5 * 60)

    else:
        await async_sleep(60 * 1)
    collect()

    return True
 def _set_logger(self):
     self.my_lg = set_logger(
         log_file_name=MY_SPIDER_LOGS_PATH + '/折800/秒杀实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
         console_log_level=INFO,
         file_log_level=INFO
     )
Example #11
0
import requests
from pprint import pprint
from json import loads, dumps
import re
import asyncio
from logging import INFO, ERROR

from fzutils.log_utils import set_logger
from fzutils.time_utils import get_shanghai_time
from fzutils.cp_utils import get_taobao_sign_and_body

MY_SPIDER_LOGS_PATH = '/Users/afa/myFiles/my_spider_logs/电商项目'

my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/微淘/' +
                   str(get_shanghai_time())[0:10] + '.txt',
                   console_log_level=INFO,
                   file_log_level=ERROR)

headers = {
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'user-agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'accept': '*/*',
    'referer':
    'https://market.m.taobao.com/apps/market/content/index.html?ut_sk=1.VmYadv9DXkkDAFZm0VV4JBNq_21380790_1527298517854.Copy.33&params=%7B%22csid%22%3A%2254a52aea54b7c29d289a0e36b2bf2f51%22%7D&wh_weex=true&contentId=200668154273&source=weitao_2017_nocover&data_prefetch=true&suid=3D763077-A7BF-43BC-9092-C17B35E896F9&wx_navbar_transparent=false&wx_navbar_hidden=false&sourceType=other&un=bc80c9f324602d31384c4a342af87869&share_crt_v=1&sp_tk=o6R2Q0ZDMHZvaDBlS6Ok&cpp=1&shareurl=true&spm=a313p.22.68.948703884987&short_name=h.WAjz5RP&app=chrome',
    'authority': 'h5api.m.taobao.com',
    # cookie得注释掉, 否则为非法请求
    # 'cookie': 't=70c4fb481898a67a66d437321f7b5cdf; cna=nbRZExTgqWsCAXPCa6QA5B86; l=AkFBuFEM2rj4GbU8Mjl3KsFo0YZa/7Vg; thw=cn; tracknick=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; _cc_=UIHiLt3xSw%3D%3D; tg=0; enc=OFbfiyN19GGi1GicxsjVmrZoFzlt9plbuviK5OuthXYfocqTD%2BL079G%2BIt4OMg6ZrbV4veSg5SQEpzuMUgLe0w%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; miid=763730917900964122; mt=ci%3D-1_1; cookie2=16c0da3976ab60d7c87ef7cea1e83cb2; v=0; _tb_token_=dd9fe0edb4b3; tk_trace=oTRxOWSBNwn9dPy4KVJVbutfzK5InlkjwbWpxHegXyGxPdWTLVRjn23RuZzZtB1ZgD6Khe0jl%2BAoo68rryovRBE2Yp933GccTPwH%2FTbWVnqEfudSt0ozZPG%2BkA1iKeVv2L5C1tkul3c1pEAfoOzBoBsNsJyRfZ0FH5AEyz0CWtQgYlWnUAkbLeBYDpeNMwsdmBZ5GYwOAPdU1B2IUBU8G0MXGQCqFCjZt1pjb2TJN2uXIiZePpK9SWkwA%2FlD1sTTfYGTmnCo0YJ7IAG%2BnJtbITMYZ3mzYjFZtYlGojOqye861%2FNFDJbTR41FruF%2BHJRnt%2BHJNgFj3F7IDGXJCs8K; linezing_session=4ic7MPhjlPi65fN5BzW36xB7_1527299424026Fe7K_1; isg=BDo6U2SENb2uULiLxiJ4XA6ri2ZWbZPa3G9M1kQz602YN9pxLHsO1QBGg8PrpzZd; _m_h5_tk=53d85a4f43d72bc623586c142f0c5293_1527305714711; _m_h5_tk_enc=cc75764d122f72920ae715c9102701a8'
}
Example #12
0
def run_forever():
    while True:
        # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/天猫/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(
                tmp_sql_server._select_table(sql_str=tm_select_str_3))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info(
                '--------------------------------------------------------')

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
            tmall = TmallParse(logger=my_lg)
            for item in result:  # 实时更新数据
                if index % 5 == 0:
                    try:
                        del tmall
                    except:
                        pass
                    tmall = TmallParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:  # 每10次重连一次,避免单次长连无响应报错
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(item[1]), str(index)))
                    tmp_item = []
                    if item[0] == 3:  # 从数据库中取出时,先转换为对应的类型
                        tmp_item.append(0)
                    elif item[0] == 4:
                        tmp_item.append(1)
                    elif item[0] == 6:
                        tmp_item.append(2)
                    tmp_item.append(item[1])
                    oo = tmall.get_goods_data(goods_id=tmp_item)
                    oo_is_delete = oo.get('is_detele', 0)  # 避免下面解析data错误休眠
                    if isinstance(oo, int):  # 单独处理return 4041
                        index += 1
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        continue

                    data = tmall.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[2],
                                shelf_time=item[5],
                                delete_time=item[6])
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        site_id = tmall._from_tmall_type_get_site_id(
                            type=data['type'])
                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[7]),
                                site_id=site_id)
                        except AttributeError:  # 处理已被格式化过的
                            old_sku_info = item[7]
                        data['_is_price_change'], data[
                            'sku_info_trans_time'] = get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(
                                    data['price_info_list'], site_id=site_id),
                                is_price_change=item[8]
                                if item[8] is not None else 0)

                        tmall.to_right_and_update_data(data,
                                                       pipeline=tmp_sql_server)
                    else:  # 表示返回的data值为空值
                        if oo_is_delete == 1:
                            pass
                        else:
                            my_lg.info('------>>>| 休眠8s中...')
                            sleep(8)

                else:  # 表示返回的data值为空值
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Example #13
0
def run_forever():
    while True:
        # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/网易考拉/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)
        #### 实时更新数据
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=kl_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
            kaola = KaoLaParse(logger=my_lg, is_real_times_update_call=True)
            for item in result:  # 实时更新数据
                goods_id = item[1]
                if index % 5 == 0:
                    try:
                        del kaola
                    except:
                        pass
                    kaola = KaoLaParse(logger=my_lg,
                                       is_real_times_update_call=True)
                    collect()

                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    logger=my_lg,
                    remainder=10,
                )
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    db_goods_info_obj = KLDbGoodsInfoObj(item=item,
                                                         logger=my_lg)
                    data = kaola._get_goods_data(goods_id=goods_id)
                    if data.get('is_delete', 0) == 1:
                        # 单独处理下架商品
                        data['goods_id'] = goods_id
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=db_goods_info_obj.is_delete,
                                shelf_time=db_goods_info_obj.shelf_time,
                                delete_time=db_goods_info_obj.delete_time,
                            )

                        try:
                            kaola.to_right_and_update_data(data,
                                                           pipeline=sql_cli)
                        except Exception:
                            my_lg.error(exc_info=True)

                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        index += 1
                        collect()
                        continue

                    data = kaola._deal_with_data()
                    if data != {}:
                        if data.get('is_delete', 0) == 1:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        else:
                            data = get_goods_info_change_data(
                                target_short_name='kl',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )
                        kaola.to_right_and_update_data(data, pipeline=sql_cli)

                    else:  # 表示返回的data值为空值
                        my_lg.info('------>>>| 休眠3s中...')
                        sleep(3.)

                else:  # 表示返回的data值为空值
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:
            # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
Example #14
0
def run_forever():
    while True:
        # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' +
            str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )

        #### 实时更新数据
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=yx_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            _block_print_db_old_data(result=result, logger=my_lg)
            index = 1
            # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
            yanxuan = YanXuanParse(logger=my_lg)
            for item in result:  # 实时更新数据
                if index % 5 == 0:
                    try:
                        del yanxuan
                    except:
                        pass
                    yanxuan = YanXuanParse(logger=my_lg)
                    collect()

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])

                    data = yanxuan._deal_with_data()
                    db_goods_info_obj = YXDbGoodsInfoObj(item=item,
                                                         logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # 单独处理下架商品
                            my_lg.info('@@@ 该商品已下架...')
                            sql_cli._update_table_2(
                                sql_str=yx_update_str_2,
                                params=(db_goods_info_obj.goods_id, ),
                                logger=my_lg,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        else:
                            data = get_goods_info_change_data(
                                target_short_name='yx',
                                logger=my_lg,
                                data=data,
                                db_goods_info_obj=db_goods_info_obj,
                            )

                        yanxuan.to_right_and_update_data(data,
                                                         pipeline=sql_cli)
                    else:  # 表示返回的data值为空值
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:  # 表示返回的data值为空值
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
def run_forever():
    while True:
        # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR
        )

        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server._select_table(sql_str=yx_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
            yanxuan = YanXuanParse(logger=my_lg)
            for item in result:  # 实时更新数据
                if index % 5 == 0:
                    try:
                        del yanxuan
                    except:
                        pass
                    yanxuan = YanXuanParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:  # 每10次重连一次,避免单次长连无响应报错
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])

                    data = yanxuan._deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]
                        data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=item[2],
                            shelf_time=item[5],
                            delete_time=item[6])
                        if data.get('is_delete') == 1:  # 单独处理下架商品
                            my_lg.info('@@@ 该商品已下架...')
                            tmp_sql_server._update_table_2(sql_str=yx_update_str_2, params=(item[1],), logger=my_lg)
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        else:
                            data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price']
                            )
                            try:
                                old_sku_info = format_price_info_list(price_info_list=json_2_dict(item[7]), site_id=30)
                            except AttributeError:  # 处理已被格式化过的
                                old_sku_info = item[7]
                            data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(data['price_info_list'], site_id=30),
                                is_price_change=item[8] if item[8] is not None else 0
                            )

                        yanxuan.to_right_and_update_data(data, pipeline=tmp_sql_server)
                    else:  # 表示返回的data值为空值
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:  # 表示返回的data值为空值
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        gc.collect()
Example #16
0
def run_forever():
    while True:
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/1688/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)
        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(
                tmp_sql_server._select_table(sql_str=al_select_str_6))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')
            my_lg.info('待更新个数: {0}'.format(len(result)))

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
            ali_1688 = ALi1688LoginAndParse(logger=my_lg)
            for item in result:  # 实时更新数据
                if index % 5 == 0:
                    ali_1688 = ALi1688LoginAndParse(logger=my_lg)

                if index % 50 == 0:
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'
                        .format(item[0], index))
                    data = ali_1688.get_ali_1688_data(item[0])
                    if isinstance(data, int) is True:  # 单独处理返回tt为4041
                        continue
                    else:
                        pass

                    if data.get('is_delete') == 1:  # 单独处理【原先插入】就是 下架状态的商品
                        data['goods_id'] = item[0]

                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])
                        # my_lg.info('上架时间:{0}, 下架时间:{1}'.format(data['shelf_time'], data['delete_time']))
                        ali_1688.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)

                        sleep(1.5)  # 避免服务器更新太频繁
                        index += 1
                        gc.collect()
                        continue

                    data = ali_1688.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])
                        # my_lg.info('上架时间:{0}, 下架时间:{1}'.format(data['shelf_time'], data['delete_time']))
                        '''为了实现这个就必须保证price, taobao_price在第一次抓下来后一直不变,变得记录到_price_change_info字段中'''
                        # 业务逻辑
                        #   公司后台 modify_time > 转换时间,is_price_change=1, 然后对比pricechange里面的数据,要是一样就不提示平台员工改价格
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[2],
                                old_taobao_price=item[3],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[6]),
                                site_id=2)
                        except AttributeError:  # 处理已被格式化过的
                            old_sku_info = item[6]
                        data['_is_price_change'], data[
                            'sku_info_trans_time'] = get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(
                                    data['sku_map'], site_id=2),
                                is_price_change=item[7]
                                if item[7] is not None else 0)

                        ali_1688.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                        sleep(.3)  # 避免服务器更新太频繁
                    else:  # 表示返回的data值为空值
                        pass
                else:  # 表示返回的data值为空值
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                sleep(2.2)
            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Example #17
0
app = Flask(__name__, root_path=getcwd())
# 存储tb user_id
tb_shop_id_list = []
tb_shop_info_list = []
company_id_bloom_filter = BloomFilter(capacity=500000, error_rate=.00001)
# 获取db company_id
company_id_bloom_filter = _get_db_company_unique_id_list_by_site_id(
    site_id=13,
    bloom_filter=company_id_bloom_filter,
)[1]

lg = set_logger(
    logger_name=get_uuid1(),
    log_file_name='/Users/afa/myFiles/my_spider_logs/fz_server/' +
    str(get_shanghai_time())[0:10] + '.txt',
    console_log_level=INFO,
    file_log_level=INFO,
)


@app.route('/', methods=['GET', 'POST'])
def home():
    return '欢迎来到 fzhook_server 主页!'


@app.route('/dy', methods=['GET', 'POST'])
def dy():
    """
    获取dy 无水印视频
    :return:
Example #18
0
    flowfilter,
    io,
)
import random
import typing
from logging import INFO, ERROR

from fzutils.log_utils import set_logger
from fzutils.spider.app_utils import get_mitm_flow_request_headers_user_agent
from fzutils.spider.async_always import *

LOG_SAVE_PATH = '/Users/afa/myFiles/my_spider_logs/mitmproxy/tb/'
logger = ctx.log
lg = set_logger(
    logger_name=get_uuid1(),
    log_file_name=LOG_SAVE_PATH + str(get_shanghai_time())[0:10] + '.txt',
    file_log_level=INFO,
)


class Writer:
    def __init__(self, path: str) -> None:
        self.f: typing.IO[bytes] = open(path, "wb")
        self.w = io.FlowWriter(self.f)

    def response(self, flow: http.HTTPFlow) -> None:
        response = flow.response
        # logger.info(str(response.text))
        # logger.info(response.headers)
        if random.choice([True, False]):
            self.w.add(flow)
def run_forever():
    #### 实时更新数据
    while True:
        # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        sql_str = '''
        select GoodsID, IsDelete, Price, TaoBaoPrice, shelf_time, delete_time 
        from dbo.GoodsInfoAutoGet 
        where SiteID=1 and MainGoodsID is not null
        order by ID desc'''

        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # 使用sqlalchemy管理数据库连接池
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server._select_table(sql_str=sql_str, )
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:  # 实时更新数据
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlPools()

                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (item[0], str(index)))
                    data = taobao.get_goods_data(item[0])

                    if data.get('is_delete') == 1:  # 单独处理【原先插入】就是 下架状态的商品
                        data['goods_id'] = item[0]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])

                        # my_lg.info('------>>>| 爬取到的数据为: ' + str(data))
                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)

                        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # 避免服务器更新太频繁
                        index += 1
                        gc.collect()
                        continue

                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[2],
                                old_taobao_price=item[3],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        # my_lg.info('------>>>| 爬取到的数据为: ' + str(data))
                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                    else:
                        my_lg.info('------>>>| 休眠5s中...')
                        sleep(5)

                else:  # 表示返回的data值为空值
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(10)
                    pass

                index += 1
                # try:
                #     del taobao
                # except:
                #     pass
                gc.collect()
                # 国外服务器上可以缩短时间, 可以设置为0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # 不能太频繁,与用户请求错开尽量
            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()
Example #20
0
def run_forever():
    while True:
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/jd/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)
        #### 实时更新数据
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        # and GETDATE()-ModfiyTime>1 and IsDelete=0
        try:
            result = list(tmp_sql_server._select_table(sql_str=jd_select_str_1))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            continue

        my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
        my_lg.info(str(result))
        my_lg.info('--------------------------------------------------------')
        my_lg.info('总计待更新个数:{}'.format(len(result)))

        my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
        index = 1

        # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
        jd = JdParse(logger=my_lg)
        for item in result:  # 实时更新数据
            # # 释放内存,在外面声明就会占用很大的,所以此处优化内存的方法是声明后再删除释放
            # jd = JdParse()
            if index % 10 == 0:
                try: del jd
                except: pass
                gc.collect()
                jd = JdParse(logger=my_lg)

            if index % 50 == 0:    # 每50次重连一次,避免单次长连无响应报错
                my_lg.info('正在重置,并与数据库建立新连接中...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                my_lg.info('与数据库的新连接成功建立...')

            if tmp_sql_server.is_connect_success:
                my_lg.info('------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'.format(item[1], index))
                tmp_item = []
                if item[0] == 7 or item[0] == 8:        # 从数据库中取出时,先转换为对应的类型
                    tmp_item.append(0)
                elif item[0] == 9:
                    tmp_item.append(1)
                elif item[0] == 10:
                    tmp_item.append(2)

                tmp_item.append(item[1])
                jd.get_goods_data(goods_id=tmp_item)
                data = jd.deal_with_data(goods_id=tmp_item)
                if data != {}:
                    data['goods_id'] = item[1]

                    data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                        tmp_data=data,
                        is_delete=item[2],
                        shelf_time=item[5],
                        delete_time=item[6])
                    my_lg.info('上架时间: {0}, 下架时间: {1}'.format(data['shelf_time'], data['delete_time']))

                    data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                        old_price=item[3],
                        old_taobao_price=item[4],
                        new_price=data['price'],
                        new_taobao_price=data['taobao_price'])

                    site_id = jd._from_jd_type_get_site_id_value(jd_type=data['jd_type'])
                    try:
                        old_sku_info = format_price_info_list(
                            price_info_list=json_2_dict(item[7]),
                            site_id=site_id)
                    except AttributeError:  # 处理已被格式化过的
                        old_sku_info = item[7]
                    data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                        old_sku_info=old_sku_info,
                        new_sku_info=format_price_info_list(data['price_info_list'], site_id=site_id),
                        is_price_change=item[8] if item[8] is not None else 0
                    )

                    jd.to_right_and_update_data(data, pipeline=tmp_sql_server)
                else:  # 表示返回的data值为空值
                    pass
            else:  # 表示返回的data值为空值
                my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                pass
            index += 1
            gc.collect()
            sleep(1.2)
        my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        try: del jd
        except: pass
        if get_shanghai_time().hour == 0:   # 0点以后不更新
            sleep(60*60*5.5)
        else:
            sleep(5)
        gc.collect()
Example #21
0
 def _set_logger(self):
     self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                             '/all_comment/_/' +
                             str(get_shanghai_time())[0:10] + '.txt',
                             console_log_level=INFO,
                             file_log_level=ERROR)
def run_forever():
    #### 实时更新数据
    while True:
        # ** 不能写成全局变量并放在循环中, 否则会一直记录到同一文件中
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # 使用sqlalchemy管理数据库连接池
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server._select_table(sql_str=tb_select_str_3, )
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:  # 实时更新数据
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:  # 每50次重连一次,避免单次长连无响应报错
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlPools()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (item[0], str(index)))
                    oo = taobao.get_goods_data(item[0])
                    oo_is_delete = oo.get('is_delete', 0)  # 避免下面解析data错误休眠
                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[2],
                                old_taobao_price=item[3],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[6]),
                                site_id=1)
                        except AttributeError:  # 处理已被格式化过的
                            old_sku_info = item[6]
                        data['_is_price_change'], data[
                            'sku_info_trans_time'] = get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(
                                    data['price_info_list'], site_id=1),
                                is_price_change=item[7]
                                if item[7] is not None else 0)

                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                    else:
                        if oo_is_delete == 1:
                            pass
                        else:
                            my_lg.info('------>>>| 休眠5s中...')
                            sleep(4)

                else:  # 表示返回的data值为空值
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(10)
                    pass

                index += 1
                gc.collect()
                # 国外服务器上可以缩短时间, 可以设置为0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # 不能太频繁,与用户请求错开尽量
            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # 0点以后不更新
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()