Example #1
 def __init__(self, logger=None):
     super().__init__()
     self.result_data = {}
     self.msg = ''
     if logger is None:
         self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                                 '/阿里1688/comment/' +
                                 str(get_shanghai_time())[0:10] + '.txt',
                                 console_log_level=INFO,
                                 file_log_level=ERROR)
     else:
         self.my_lg = logger
     self.my_phantomjs = MyPhantomjs()
     # code that can be executed dynamically
     self._exec_code = '''
     self.driver.find_element_by_css_selector('div.tab-item.filter:nth-child(2)').click() 
     sleep(1.5)
     # scroll down by 10000 pixels
     js = 'document.body.scrollTop=10000'
     self.driver.execute_script(js)
     sleep(3)
     '''
     self.headers = {
         'accept-encoding': 'gzip, deflate, br',
         'accept-language': 'zh-CN,zh;q=0.9',
         'user-agent': HEADERS[randint(0,
                                       len(HEADERS) - 1)],
         'accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
         'referer': 'https://detail.1688.com/offer/45579899125.html',
     }
     self.page_size = '30'
 def __init__(self):
     self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                             '/all_comment/_/' +
                             str(get_shanghai_time())[0:10] + '.txt',
                             console_log_level=INFO,
                             file_log_level=ERROR)
     self.msg = ''
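Most of these constructors pick a random User-Agent with HEADERS[randint(0, len(HEADERS) - 1)]. The standard-library random.choice does the same thing in one call; a minimal sketch, assuming only that HEADERS is a plain list of User-Agent strings (the two entries below are placeholders, not the project's real list):

from random import choice, randint

# Placeholder stand-in for the project's HEADERS list of User-Agent strings.
HEADERS = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
]

ua_by_index = HEADERS[randint(0, len(HEADERS) - 1)]  # pattern used in the examples above
ua_by_choice = choice(HEADERS)                       # equivalent, and cannot go out of range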
Example #3
 def __init__(self, logger=None):
     self.headers = {
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         # 'Accept-Encoding:': 'gzip',
         'Accept-Language': 'zh-CN,zh;q=0.9',
         'Cache-Control': 'max-age=0',
         'Connection': 'keep-alive',
         'Host': 's.h5.jumei.com',
         'Referer': 'http://s.h5.jumei.com/yiqituan/list',
         'User-Agent': HEADERS[randint(0, len(HEADERS)-1)],  # pick a random request header
         'X-Requested-With': 'XMLHttpRequest',
     }
     self.msg = ''
     if logger is None:
         self.my_lg = set_logger(
             log_file_name=MY_SPIDER_LOGS_PATH + '/聚美优品/拼团/' + self.get_log_file_name_from_time() + '.txt',
             console_log_level=INFO,
             file_log_level=ERROR
         )
     else:
         self.my_lg = logger
     self.tab_dict = {
         '母婴健康': 'coutuan_baby',
         '家居': 'coutuan_furniture',
         '饰品配饰': 'coutuan_jewellery',
         '内衣': 'coutuan_underwear',
         '食品保健': 'coutuan_food',
         '美妆': 'coutuan_makeup',
         '女装': 'coutuan_ladies',
         '礼品箱包': 'coutuan_bag',
         '数码家电': 'coutuan_3c',
         '鞋类': 'coutuan_shose',
         '下期预告': 'coutuan_pre',
     }
 def _set_logger(self, logger):
     if logger is None:
         self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                                 '/淘宝/淘抢购/' +
                                 str(get_shanghai_time())[0:10] + '.txt',
                                 console_log_level=INFO,
                                 file_log_level=ERROR)
     else:
         self.my_lg = logger
Example #5
 def _set_logger(self, logger):
     '''
     Set the logger.
     :param logger:
     :return:
     '''
     if logger is None:
         self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                                 '/淘宝/comment/' +
                                 str(get_shanghai_time())[0:10] + '.txt',
                                 console_log_level=INFO,
                                 file_log_level=ERROR)
     else:
         self.my_lg = logger
Example #6
 def __init__(self, logger=None):
     self.headers = {
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         # 'Accept-Encoding:': 'gzip',
         'Accept-Language': 'zh-CN,zh;q=0.8',
         'Cache-Control': 'max-age=0',
         'Connection': 'keep-alive',
         'Host': 'acs.m.taobao.com',
         'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],  # pick a random request header (avoid hard-coding the list length)
     }
     self.result_data = {}
     if logger is None:
         self.my_lg = set_logger(
             log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/_/' + str(get_shanghai_time())[0:10] + '.txt',
             console_log_level=INFO,
             file_log_level=ERROR
         )
     else:
         self.my_lg = logger
     self.msg = ''
Example #7
 def __init__(self):
     self.headers = {
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         # 'Accept-Encoding:': 'gzip',
         'Accept-Language': 'zh-CN,zh;q=0.9',
         'Cache-Control': 'max-age=0',
         'Connection': 'keep-alive',
         'Host': 's.h5.jumei.com',
         'Referer': 'http://s.h5.jumei.com/yiqituan/list',
         'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],  # pick a random request header
         'X-Requested-With': 'XMLHttpRequest',
     }
     self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                             '/聚美优品/拼团/' +
                             self.get_log_file_name_from_time() + '.txt',
                             console_log_level=INFO,
                             file_log_level=ERROR)
     self.msg = ''
     self.api_all_goods_id = {}  # pre-stored item_list for each tab and index
 def __init__(self, logger=None):
     super().__init__()
     self.result_data = {}
     self.msg = ''
     if logger is None:
         self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                                 '/淘宝/comment/' +
                                 str(get_shanghai_time())[0:10] + '.txt',
                                 console_log_level=INFO,
                                 file_log_level=ERROR)
     else:
         self.my_lg = logger
     self.headers = {
         'accept-encoding': 'gzip, deflate, br',
         'accept-language': 'zh-CN,zh;q=0.9',
         'user-agent': HEADERS[randint(0,
                                       len(HEADERS) - 1)],
         'accept': '*/*',
         'referer': 'https://item.taobao.com/item.htm?id=555635098639',
     }
     self.page_size = '20'  # fixed value
     self.comment_page_switch_sleep_time = 1.5  # sleep time before switching to the next comment page
 def __init__(self, logger=None):
     self.headers = {
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
         'Accept-Language': 'zh-CN,zh;q=0.9',
         'Cache-Control': 'max-age=0',
         'Connection': 'keep-alive',
         'Host': 's.h5.jumei.com',
         'Referer':
         'https://s.h5.jumei.com/yiqituan/detail?item_id=ht180321p2453550t4&type=global_deal',
         'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],  # avoid hard-coding the header-list length
         'X-Requested-With': 'XMLHttpRequest',
     }
     self.result_data = {}
     if logger is None:
         self.my_lg = set_logger(
             log_file_name=MY_SPIDER_LOGS_PATH + '/聚美优品/拼团/' +
             self.get_log_file_name_from_time() + '.txt',
             console_log_level=INFO,
             file_log_level=ERROR)
     else:
         self.my_lg = logger
     self.msg = ''
 def __init__(self, logger=None):
     self.headers = {
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         # 'Accept-Encoding:': 'gzip',
         'Accept-Language': 'zh-CN,zh;q=0.8',
         'Cache-Control': 'max-age=0',
         'Connection': 'keep-alive',
         'Host': 'h5api.m.taobao.com',
         'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],  # pick a random request header
     }
     if logger is None:
         self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                                 '/淘宝/天天特价/' +
                                 str(get_shanghai_time())[0:10] + '.txt',
                                 console_log_level=INFO,
                                 file_log_level=ERROR)
     else:
         self.my_lg = logger
     self.msg = ''
     self.main_sort = {
         '495000': ['时尚女装', 'mtopjsonp2'],
         '496000': ['潮流男装', 'mtopjsonp4'],
         '499000': ['性感内衣', 'mtopjsonp5'],
         '508000': ['家居百货', 'mtopjsonp6'],
         '502000': ['品质母婴', 'mtopjsonp7'],
         '503000': ['食品饮料', 'mtopjsonp8'],
         '497000': ['男女鞋品', 'mtopjsonp9'],  # ['497000', '498000']
         '498000': ['男女鞋品', 'mtopjsonp9'],
         '505000': ['美容美妆', 'mtopjsonp10'],
         '500000': ['箱包配饰', 'mtopjsonp11'],  # ['500000', '501000']
         '501000': ['箱包配饰', 'mtopjsonp11'],
         '504000': ['数码电器', 'mtopjsonp12'],
         '506000': ['户外运动', 'mtopjsonp13'],  # ['506000', '507000']
         '507000': ['户外运动', 'mtopjsonp13'],
     }
async def run_forever():
    #### Update the data in real time
    # ** Do not create the logger as a global outside the loop, otherwise every run keeps writing to the same file and one log file per day is impossible
    my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/天天特价/' +
                       str(get_shanghai_time())[0:10] + '.txt',
                       console_log_level=INFO,
                       file_log_level=ERROR)

    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
    # off-shelf goods are not processed, hence is_delete=0
    sql_str = '''
    select goods_id, is_delete, tejia_end_time, block_id, tag_id 
    from dbo.taobao_tiantiantejia 
    where site_id=19 and is_delete=0 and GETDATE()-modfiy_time>2 and MainGoodsID is not null
    '''

    try:
        result = list(tmp_sql_server._select_table(sql_str=sql_str))
    except TypeError:
        my_lg.error('TypeError错误, 导致原因: 数据库连接失败...(可能维护中)')
        return None

    my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
    my_lg.info(str(result))
    my_lg.info('--------------------------------------------------------')
    my_lg.info('待更新的goods_id个数: {0}'.format(len(result)))

    my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
    index = 1
    # tmp_taobao_tiantiantejia = TaoBaoTianTianTeJia(logger=my_lg)
    for item in result:  # update each item in real time
        if index % 50 == 0:
            my_lg.info('正在重置,并与数据库建立新连接中...')
            # try: del tmp_sql_server
            # except: pass
            # gc.collect()
            tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
            my_lg.info('与数据库的新连接成功建立...')

        if tmp_sql_server.is_connect_success:
            tejia_end_time = item[2]
            # my_lg.info(str(tejia_end_time))

            if item[1] == 1:  # goods that were already marked off-shelf are skipped
                # tmp_sql_server.delete_taobao_tiantiantejia_expired_goods_id(goods_id=item[0])
                # my_lg.info('该商品goods_id[{0}]已售完, 删除成功!'.format(item[0]))
                my_lg.info(
                    '&&&&&& 该商品({0})原先状态为is_delete=1, 不进行实际删除操作! 索引为({1})'.
                    format(item[0], str(index)))
                index += 1
                pass

            elif tejia_end_time < datetime.datetime.now():
                # expired items are not deleted; they are downgraded to regular hot-promotion goods instead
                index = await update_expired_goods_to_normal_goods(
                    goods_id=item[0],
                    index=index,
                    tmp_sql_server=tmp_sql_server,
                    logger=my_lg)
                pass

            else:
                # below: update the Tiantiantejia (daily special) goods info
                '''
                ** Since Tiantiantejia never takes goods off the shelf early, the special-price time range is not updated here
                '''
                # # first check whether the goods were taken down early in the corresponding sub-category, and get their on/off-shelf times
                # if index % 6 == 0:
                #     try: del tmp_taobao_tiantiantejia
                #     except: pass
                #     gc.collect()
                #     tmp_taobao_tiantiantejia = TaoBaoTianTianTeJia(logger=my_lg)
                #
                # tmp_body = await tmp_taobao_tiantiantejia.get_one_api_body(current_page=item[4], category=item[3])
                # if tmp_body == '':
                #     msg = '获取到的tmp_body为空str! 出错category为: ' + item[3]
                #     my_lg.error(msg)
                #     continue
                #
                # try:
                #     tmp_body = re.compile(r'\((.*?)\)').findall(tmp_body)[0]
                # except IndexError:
                #     msg = 're筛选body时出错, 请检查! 出错category为: ' + item[3]
                #     my_lg.error(msg)
                #     continue
                # tmp_sort_data = await tmp_taobao_tiantiantejia.get_sort_data_list(body=tmp_body)
                # if tmp_sort_data == 'no items':
                #     my_lg.info('该api接口获取到的item_list为no items!请检查')
                #     break
                # tejia_goods_list = await tmp_taobao_tiantiantejia.get_tiantiantejia_goods_list(data=tmp_sort_data)
                # # my_lg.info(str(tejia_goods_list))
                # await asyncio.sleep(.45)
                # # my_lg.info('111')
                '''
                Investigation shows that Tiantiantejia goods already on the shelf are never taken down early by the platform, so do nothing here and skip
                '''
                # if is_in_child_sort(tejia_goods_list, goods_id=item[0]) is False:     # taken down early by the platform
                #     # tmp_sql_server.delete_taobao_tiantiantejia_expired_goods_id(goods_id=item[0])
                #     # print('该商品goods_id[{0}]已被官方提前下架, 删除成功!'.format(item[0]))
                #     print('222')
                #     pass

                # else:       # the goods were not taken down early
                my_lg.info(
                    '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' %
                    (item[0], str(index)))
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                taobao.get_goods_data(item[0])
                goods_data = taobao.deal_with_data(goods_id=item[0])
                if goods_data != {}:
                    # tmp_time = await get_this_goods_id_tejia_time(tejia_goods_list, goods_id=item[0])
                    # if tmp_time != []:
                    #     begin_time, end_time = tmp_time
                    #
                    #     goods_data['goods_id'] = item[0]
                    #     goods_data['schedule'] = [{
                    #         'begin_time': begin_time,
                    #         'end_time': end_time,
                    #     }]
                    #     goods_data['tejia_begin_time'], goods_data['tejia_end_time'] = await tmp_taobao_tiantiantejia.get_tejia_begin_time_and_tejia_end_time(schedule=goods_data.get('schedule', [])[0])
                    #     await taobao.update_taobao_tiantiantejia_table(data=goods_data, pipeline=tmp_sql_server)
                    # else:
                    #     my_lg.info('该goods_id不在该api接口的商品中!!')
                    #     pass

                    goods_data['goods_id'] = item[0]
                    '''the on/off-shelf time range is not specifically updated'''
                    # goods_data['schedule'] = [{
                    #     'begin_time': begin_time,
                    #     'end_time': end_time,
                    # }]
                    # goods_data['tejia_begin_time'], goods_data['tejia_end_time'] = await tmp_taobao_tiantiantejia.get_tejia_begin_time_and_tejia_end_time(schedule=goods_data.get('schedule', [])[0])
                    if goods_data.get('is_delete', 0) == 1:
                        my_lg.info('@该商品已下架...')

                    await taobao.update_taobao_tiantiantejia_table(
                        data=goods_data, pipeline=tmp_sql_server)

                else:
                    await asyncio.sleep(4)  # otherwise sleep for 4 seconds
                    pass

                await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                index += 1
                gc.collect()

        else:  # the database connection failed
            my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
            pass
        gc.collect()
    my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
    if get_shanghai_time().hour == 0:  # do not update after midnight (hour 0)
        # sleep(60 * 60 * .5)
        pass

    else:
        sleep(5)
    gc.collect()

    return True
Example #12
def run_forever():
    #### Update the data in real time
    while True:
        # ** Do not create the logger as a global outside the loop, otherwise it keeps writing to the same file (see the sketch after this example)
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # use SQLAlchemy to manage the database connection pool
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server.select_taobao_all_goods_id()

        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:  # update each item in real time
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:  # reconnect every 50 iterations to avoid an unresponsive long-lived connection
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()

                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (item[0], str(index)))
                    data = taobao.get_goods_data(item[0])

                    if data.get('is_delete') == 1:  # handle separately goods that were already off-shelf when first inserted
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])

                        # my_lg.info('------>>>| 爬取到的数据为: ' + str(data))
                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)

                        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # avoid hitting the server too frequently
                        index += 1
                        gc.collect()
                        continue

                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        # my_lg.info('------>>>| 爬取到的数据为: ' + str(data))
                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                    else:
                        my_lg.info('------>>>| 休眠5s中...')
                        sleep(5)

                else:  # the database connection failed
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(10)
                    pass

                index += 1
                # try:
                #     del taobao
                # except:
                #     pass
                gc.collect()
                # on an overseas server this delay can be shortened, even set to 0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # not too frequent; try to stagger with user requests
            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # do not update after midnight (hour 0)
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()
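The comment at the top of this loop is the key design point: the logger is re-created on every pass of while True, so the dated file name is re-evaluated and each day's entries land in their own file. A minimal sketch of just that pattern, assuming only the set_logger call signature used throughout these examples (log_file_name, console_log_level, file_log_level) and the MY_SPIDER_LOGS_PATH / get_shanghai_time helpers shown in Example #16; the '/demo/' sub-directory is a placeholder:

from time import sleep
from logging import INFO, ERROR
from my_logging import set_logger
from my_utils import get_shanghai_time

MY_SPIDER_LOGS_PATH = '/Users/afa/myFiles/my_spider_logs/电商项目'

def run_forever():
    while True:
        # Re-created inside the loop: the date in the file name is re-evaluated
        # each pass, so entries written after midnight go into a new daily file.
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/demo/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)
        my_lg.info('one update pass finished')
        sleep(5)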
Example #13
# coding:utf-8
'''
@author = super_fazai
@File    : test.py
@Time    : 2017/10/11 14:24
@connect : [email protected]
'''

from my_logging import set_logger

my_lg = set_logger(log_file_name='test.txt')
my_lg.exception('啊')
my_lg.info('test')
Example #14
def run_forever():
    while True:
        # ** Do not create the logger as a global outside the loop, otherwise it keeps writing to the same file
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/天猫/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        #### Update the data in real time
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = '''
        select SiteID, GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice 
        from dbo.GoodsInfoAutoGet 
        where (SiteID=3 or SiteID=4 or SiteID=6) and GETDATE()-ModfiyTime>0.2 and MainGoodsID is not null order by ID desc
        '''

        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info('总计待更新个数: {0}'.format(len(result)))
            my_lg.info(
                '--------------------------------------------------------')

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # memory optimization: one long-lived instance keeps growing, so it is periodically deleted and re-created to release memory
            tmall = TmallParse(logger=my_lg)
            for item in result:  # update each item in real time
                if index % 5 == 0:
                    try:
                        del tmall
                    except:
                        pass
                    tmall = TmallParse(logger=my_lg)
                    gc.collect()

                if index % 10 == 0:  # reconnect every 10 iterations to avoid an unresponsive long-lived connection
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(item[1]), str(index)))
                    tmp_item = []
                    if item[0] == 3:  # map the SiteID read from the database to the parser's type code
                        tmp_item.append(0)
                    elif item[0] == 4:
                        tmp_item.append(1)
                    elif item[0] == 6:
                        tmp_item.append(2)
                    tmp_item.append(item[1])
                    data = tmall.get_goods_data(goods_id=tmp_item)
                    if isinstance(data, int):  # handle the 4041 return code separately
                        index += 1
                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        continue

                    if data.get('is_delete') == 1:  # handle off-shelf goods separately
                        data['goods_id'] = item[1]

                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[2],
                                MyShelfAndDownTime=item[3])

                        # my_lg.info('------>>>| 爬取到的数据为: %s' % str(data))
                        tmall.to_right_and_update_data(data,
                                                       pipeline=tmp_sql_server)

                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        index += 1
                        gc.collect()
                        continue

                    data = tmall.deal_with_data()
                    if data != {}:
                        data['goods_id'] = item[1]

                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[2],
                                MyShelfAndDownTime=item[3])
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[4],
                                old_taobao_price=item[5],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])
                        # my_lg.info(str(data['_is_price_change']) + ' ' +str(data['_price_change_info']))

                        # my_lg.info('------>>>| 爬取到的数据为: %s' % str(data))
                        tmall.to_right_and_update_data(data,
                                                       pipeline=tmp_sql_server)
                    else:  # the returned data is empty
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:  # the database connection failed
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)
                    pass
                index += 1
                gc.collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)

        if get_shanghai_time().hour == 0:  # do not update after midnight (hour 0)
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
 def _set_logger(self):
     self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                             '/聚美优品/拼团/' + str(get_shanghai_time())[0:10] +
                             '.txt',
                             console_log_level=INFO,
                             file_log_level=ERROR)
Example #16
import requests
from pprint import pprint
from json import loads, dumps
import re
import asyncio
from my_logging import set_logger
from logging import INFO, ERROR
from my_utils import (
    get_shanghai_time,
    get_taobao_sign_and_body,
)

MY_SPIDER_LOGS_PATH = '/Users/afa/myFiles/my_spider_logs/电商项目'

my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/微淘/' +
                   str(get_shanghai_time())[0:10] + '.txt',
                   console_log_level=INFO,
                   file_log_level=ERROR)

headers = {
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'user-agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    'accept': '*/*',
    'referer':
    'https://market.m.taobao.com/apps/market/content/index.html?ut_sk=1.VmYadv9DXkkDAFZm0VV4JBNq_21380790_1527298517854.Copy.33&params=%7B%22csid%22%3A%2254a52aea54b7c29d289a0e36b2bf2f51%22%7D&wh_weex=true&contentId=200668154273&source=weitao_2017_nocover&data_prefetch=true&suid=3D763077-A7BF-43BC-9092-C17B35E896F9&wx_navbar_transparent=false&wx_navbar_hidden=false&sourceType=other&un=bc80c9f324602d31384c4a342af87869&share_crt_v=1&sp_tk=o6R2Q0ZDMHZvaDBlS6Ok&cpp=1&shareurl=true&spm=a313p.22.68.948703884987&short_name=h.WAjz5RP&app=chrome',
    'authority': 'h5api.m.taobao.com',
    # the cookie must stay commented out, otherwise the request is treated as invalid
    # 'cookie': 't=70c4fb481898a67a66d437321f7b5cdf; cna=nbRZExTgqWsCAXPCa6QA5B86; l=AkFBuFEM2rj4GbU8Mjl3KsFo0YZa/7Vg; thw=cn; tracknick=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; _cc_=UIHiLt3xSw%3D%3D; tg=0; enc=OFbfiyN19GGi1GicxsjVmrZoFzlt9plbuviK5OuthXYfocqTD%2BL079G%2BIt4OMg6ZrbV4veSg5SQEpzuMUgLe0w%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; miid=763730917900964122; mt=ci%3D-1_1; cookie2=16c0da3976ab60d7c87ef7cea1e83cb2; v=0; _tb_token_=dd9fe0edb4b3; tk_trace=oTRxOWSBNwn9dPy4KVJVbutfzK5InlkjwbWpxHegXyGxPdWTLVRjn23RuZzZtB1ZgD6Khe0jl%2BAoo68rryovRBE2Yp933GccTPwH%2FTbWVnqEfudSt0ozZPG%2BkA1iKeVv2L5C1tkul3c1pEAfoOzBoBsNsJyRfZ0FH5AEyz0CWtQgYlWnUAkbLeBYDpeNMwsdmBZ5GYwOAPdU1B2IUBU8G0MXGQCqFCjZt1pjb2TJN2uXIiZePpK9SWkwA%2FlD1sTTfYGTmnCo0YJ7IAG%2BnJtbITMYZ3mzYjFZtYlGojOqye861%2FNFDJbTR41FruF%2BHJRnt%2BHJNgFj3F7IDGXJCs8K; linezing_session=4ic7MPhjlPi65fN5BzW36xB7_1527299424026Fe7K_1; isg=BDo6U2SENb2uULiLxiJ4XA6ri2ZWbZPa3G9M1kQz602YN9pxLHsO1QBGg8PrpzZd; _m_h5_tk=53d85a4f43d72bc623586c142f0c5293_1527305714711; _m_h5_tk_enc=cc75764d122f72920ae715c9102701a8'
}
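For completeness, a short sketch of how a headers dict like the one above is typically used together with my_lg: pass it to requests.get and log the outcome. This reuses the requests import, headers dict, and my_lg logger defined in the snippet above; the URL below is a placeholder for illustration, not the real API endpoint:

# Placeholder URL for illustration only; the real endpoint is not shown above.
demo_url = 'https://h5api.m.taobao.com/h5/mtop.demo.api/1.0/'

try:
    resp = requests.get(demo_url, headers=headers, timeout=10)
    my_lg.info('status code: {0}'.format(resp.status_code))
except Exception:
    my_lg.exception('request failed')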