Example #1
    def __init__(self, file_path, comment_path):

        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }

        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3  # round the current hour down to a multiple of 3
        if num == 0:  # special case for the 0-3 AM window
            # time window selection
            date = datetime.now() - timedelta(days=1)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=1)  # yesterday's date
            yesterday = str(yesterday).split(' ')[0]
        else:
            # time window selection
            date = datetime.now() - timedelta(days=0)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=0)  # today's date
            yesterday = str(yesterday).split(' ')[0]
        # start date (y-m-d), the bound further from now: news_start_time
        self.start_time = news_start_time
        # end date (y-m-d), the bound closer to now: yesterday
        self.end_time = yesterday
        logging.log(31, '爬取时间段:{}到{}'.format(self.start_time, self.end_time))
        # time range for crawling comments
        # self.comment_start_time = yesterday  # one day of replies
        # self.comment_start_time = '2019-08-01'  # one day of replies
        self.comment_start_time = ''  # replies with no time limit
        self.comment_end_time = yesterday
        # self.comment_end_time = yesterday
        # spider work flag
        self.is_work = True
        self.file_name_time = self.get_file_name_time()
        self.commnet_port_url = 'http://comment.sina.com.cn/page/info?version=1&format=json&channel=ty&newsid=comos-{}&group=0&compress=0&ie=utf-8&oe=utf-8&page={}&page_size=10&t_size=3&h_size=3&thread=1&callback=jsonp_1542676393124&_=1542676393124'
        self.page_num = 1
        self.file_path = file_path
        self.comment_apth = comment_path
        self.hdfsclient = HdfsClient(url='http://jq-chance-05:9870',
                                     user='******')
        hour = str(datetime.now()).split(' ')[-1].split(':')[0]
        if str(hour) != '00':
            two_hour_ago = int(hour) - 2
            if len(str(two_hour_ago)) == 1:
                two_hour_ago = '0' + str(two_hour_ago)
            self.hour_name = str(two_hour_ago) + '_' + str(hour)
        else:
            self.hour_name = '22_24'
        self.hdfsclient.makedirs('{}/{}/{}'.format(
            self.file_path,
            self.file_name_time.split(' ')[0].replace('-', ''),
            self.hour_name))  # create the daily folder
        self.hdfsclient.makedirs('{}/{}/{}'.format(
            self.comment_apth,
            self.file_name_time.split(' ')[0].replace('-', ''),
            self.hour_name))  # create the daily folder
        self.time_time = str(time.time()).split('.')[0]
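
The constructor above (and its twins in Examples 5 and 13) derives the crawl window and the two-hour folder name by splitting str(datetime.now()). For comparison, here is a minimal sketch of the same two rules using plain datetime arithmetic; it is not part of the original examples and the function names are illustrative.

from datetime import datetime, timedelta

def crawl_window():
    """Return (start_date, end_date) as 'Y-m-d' strings; a run between
    00:00 and 02:59 falls back to yesterday, matching the num == 0 branch
    in the constructor above."""
    now = datetime.now()
    day = now - timedelta(days=1) if now.hour < 3 else now
    date_str = day.strftime('%Y-%m-%d')
    return date_str, date_str

def hour_bucket():
    """Name the two-hour window as '<hour-2>_<hour>' (e.g. '08_10'), with a
    midnight run mapped to '22_24', matching self.hour_name above."""
    hour = datetime.now().hour
    return '22_24' if hour == 0 else '{:02d}_{:02d}'.format(hour - 2, hour)
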
Example #2
    def __init__(self, redis_example):
        # time section
        # date the spider starts crawling from
        date = datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]
        # date the spider stops crawling at
        current_time = datetime.now()  # current date
        current_day = str(current_time).split(' ')[0]
        print('爬取时间段:{}到{}'.format(news_start_time, current_day))
        logging.info('爬取时间段:{}到{}'.format(news_start_time, current_day))
        # start date (y-m-d), the bound further from now: news_start_time
        self.start_time = news_start_time
        # end date (y-m-d), the bound closer to now: current_day
        self.end_time = current_day

        # spider work flag
        self.is_break = False
        self.redis_example = redis_example
        self.pid = os.getpid()
        # connect to HDFS
        self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/ecommerce/{}'.format(
                time.strftime('%Y%m%d')))  # create the daily folder
        self.time_data = str(time.time()).split('.')[0]
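
Examples 2, 3, 8 and 10 build the same yesterday-to-today window by splitting the string form of datetime.now(). A minimal sketch of the equivalent using date arithmetic; it is not part of the original code and the function name is illustrative.

from datetime import date, timedelta

def daily_window():
    """Yesterday and today as 'YYYY-MM-DD' strings plus the 'YYYYMMDD'
    folder name used for the daily HDFS directory."""
    today = date.today()
    start = (today - timedelta(days=1)).isoformat()  # e.g. '2019-09-15'
    end = today.isoformat()                          # e.g. '2019-09-16'
    folder = today.strftime('%Y%m%d')                # e.g. '20190916'
    return start, end, folder
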
Example #3
    def __init__(self, redis_example):
        # time window selection
        date = datetime.datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]

        now = datetime.datetime.now() - timedelta(days=0)  # current date
        now_date = str(now).split(' ')[0]
        print('爬取时间段:{}到{}'.format(news_start_time, now_date))
        logging.info('爬取时间段:{}到{}'.format(news_start_time, now_date))

        # start date (y-m-d), the bound further from now: news_start_time
        self.start_time = news_start_time
        # end date (y-m-d), the bound closer to now: now_date
        self.end_time = now_date
        # spider work flag
        self.is_work = False
        self.redis_example = redis_example
        self.pid = os.getpid()

        # connect to HDFS
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/ecommerce/{}'.format(
                time.strftime('%Y%m%d')))  # create the daily folder
        self.time_data = str(time.time()).split('.')[0]
Example #4
    def __init__(self, file_path, comment_path):

        # time window selection
        date = datetime.now() - timedelta(days=7)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=1)  # yesterday's date
        yesterday = str(yesterday).split(' ')[0]
        # print('爬取时间段:{}到{}'.format(news_start_time, yesterday))

        logging.info('爬取时间段:{}到{}'.format(news_start_time, yesterday))

        # start date (y-m-d), the bound further from now: news_start_time
        self.start_time = news_start_time
        # self.start_time = '2019-09-09'
        # end date (y-m-d), the bound closer to now: yesterday
        self.end_time = yesterday
        # self.end_time = '2019-09-16'
        # spider work flag
        self.is_work = True
        self.is_stop = False
        self.file_name_time = self.get_file_name_time()
        self.file_path = file_path
        self.comment_apth = comment_path
        self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000',
                                     user='******')
        self.hdfsclient.makedirs('{}/{}'.format(
            self.file_path,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # create the daily folder
        self.hdfsclient.makedirs('{}/{}'.format(
            self.comment_apth,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # create the daily folder
        self.time_time = str(time.time()).split('.')[0]
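
The daily-folder pattern repeated in Examples 2, 3, 4, 7, 8 and 9 is simply '<base>/<YYYYMMDD>'. A tiny sketch of that path construction; the base path shown is only one taken from the snippets above, and daily_folder is an illustrative helper, not part of the original code.

import time

def daily_folder(base_path):
    """Return the dated folder the examples create with hdfsclient.makedirs,
    e.g. '/user/cspider_daily/nike_daily/ecommerce/20190916'."""
    return '{}/{}'.format(base_path, time.strftime('%Y%m%d'))

# usage: hdfsclient.makedirs(daily_folder('/user/cspider_daily/nike_daily/ecommerce'))
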
Example #5
    def __init__(self, file_path):

        self.headers_one = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3  # round the current hour down to a multiple of 3
        if num == 0:  # special case for the 0-3 AM window
            # time window selection
            date = datetime.now() - timedelta(days=1)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=1)  # yesterday's date
            yesterday = str(yesterday).split(' ')[0]
        else:
            # time window selection
            date = datetime.now() - timedelta(days=0)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=0)  # today's date
            yesterday = str(yesterday).split(' ')[0]
        # start date (y-m-d), the bound further from now: news_start_time
        self.start_time = news_start_time
        # self.start_time = '2019-09-09'
        # end date (y-m-d), the bound closer to now: yesterday
        self.end_time = yesterday
        # self.end_time = '2019-09-16'
        logger.info('爬取时间段:{}到{}'.format(self.start_time, self.end_time))
        # logging.info('爬取时间段:{}到{}'.format(self.start_time, self.end_time))
        # time range for crawling comments
        self.comment_start_time = yesterday  # one day of replies
        # self.comment_start_time = '2019-09-09'  # one day of replies
        # self.comment_start_time = ''  # replies with no time limit
        self.comment_end_time = yesterday
        # self.comment_end_time = '2019-09-16'
        # spider work flag
        self.is_work = True
        self.set_list = []  # de-duplication list
        self.file_name_time = self.get_file_name_time()
        self.file_path = file_path
        self.hdfsclient = HdfsClient(url='http://jq-chance-05:9870', user='******')

        hour = str(datetime.now()).split(' ')[-1].split(':')[0]
        if str(hour) != '00':
            two_hour_ago = int(hour) - 2
            if len(str(two_hour_ago)) == 1:
                two_hour_ago = '0' + str(two_hour_ago)
            self.hour_name = str(two_hour_ago) + '_' + str(hour)
        else:
            self.hour_name = '22_24'

        self.hdfsclient.makedirs('{}/{}/{}'.format(self.file_path, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name))  # create the daily folder
        self.time_time = str(time.time()).split('.')[0]
Example #6
    def __init__(self, redis_example):
        # time section: crawl by the hour
        date_time = str(datetime.now() - timedelta(days=1)).split('.')[0]
        start_time_test = time.strftime('%Y-%m-%d 00:00:00')

        end_time = time.strftime('%Y-%m-%d %H:%M:%S')
        a = end_time.split(' ')[1].split(':')[0]

        if a == '00':
            start_time_data = date_time
            hours_name = '22_24'
            wen_jian_jia_date = str(datetime.now() - timedelta(
                days=1)).split('.')[0].split(' ')[0].replace('-', '')
        else:
            two_hours_ago = int(a) - 2
            if len(str(two_hours_ago)) == 1:
                two_hour_ago = '0' + str(two_hours_ago)
            else:
                two_hour_ago = str(two_hours_ago)
            hours_name = str(two_hour_ago) + '_' + str(a)
            start_time_data = start_time_test
            wen_jian_jia_date = time.strftime('%Y%m%d')
        print('爬取时间段:{}到{}'.format(start_time_data, end_time))
        logging.info('爬取时间段:{}到{}'.format(start_time_data, end_time))
        # start time, the bound further from now: start_time_data
        self.start_time = start_time_data
        # end time, the bound closer to now: end_time
        self.end_time = end_time

        # spider work flag
        self.is_break = False
        self.redis_example = redis_example
        self.pid = os.getpid()

        self.h2_name = hours_name
        self.date_time = wen_jian_jia_date
        # connect to HDFS
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format(
                wen_jian_jia_date, hours_name))  # create the daily folder
        self.time_data = str(time.time()).split('.')[0]
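
Examples 6 and 11 share this two-hour scheduling block: a run at hour 00 covers all of yesterday and is filed under '22_24' in yesterday's folder, while any other run covers today from midnight up to now. A compact sketch of the same decision, not taken from the original code; hourly_window is an illustrative name.

from datetime import datetime, timedelta

def hourly_window(now=None):
    """Return (start, end, bucket, folder) following the rule above."""
    now = now or datetime.now()
    end = now.strftime('%Y-%m-%d %H:%M:%S')
    if now.hour == 0:
        day = now - timedelta(days=1)
        start = day.strftime('%Y-%m-%d %H:%M:%S')  # yesterday at this time
        bucket = '22_24'
    else:
        day = now
        start = now.strftime('%Y-%m-%d 00:00:00')  # today at midnight
        bucket = '{:02d}_{:02d}'.format(now.hour - 2, now.hour)
    return start, end, bucket, day.strftime('%Y%m%d')
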
Example #7
    def __init__(self, file_path, comment_path):

        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }

        # time window selection
        date = datetime.now() - timedelta(days=7)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=0)  # today's date
        yesterday = str(yesterday).split(' ')[0]
        # start date (y-m-d), the bound further from now: news_start_time
        self.start_time = news_start_time
        # end date (y-m-d), the bound closer to now: yesterday
        self.end_time = yesterday
        logging.log(31, '爬取时间段:{}到{}'.format(self.start_time, self.end_time))
        # time range for crawling comments
        # self.comment_start_time = yesterday  # one day of replies
        # self.comment_start_time = '2019-08-01'  # one day of replies
        self.comment_start_time = ''  # replies with no time limit
        self.comment_end_time = yesterday
        # self.comment_end_time = yesterday
        # spider work flag
        self.is_work = True
        self.commnet_port_url = 'http://comment.sina.com.cn/page/info?version=1&format=json&channel=ty&newsid=comos-{}&group=0&compress=0&ie=utf-8&oe=utf-8&page={}&page_size=10&t_size=3&h_size=3&thread=1&callback=jsonp_1542676393124&_=1542676393124'
        self.page_num = 1
        self.file_path = file_path
        self.comment_apth = comment_path
        self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000',
                                     user='******')
        self.hdfsclient.makedirs('{}/{}'.format(
            self.file_path,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # create the daily folder
        self.hdfsclient.makedirs('{}/{}'.format(
            self.comment_apth,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # create the daily folder
        self.time_time = str(time.time()).split('.')[0]

        self.article_queue = Queue()
        self.comment_queue = Queue()
        self.total_item = ''
Example #8
    def __init__(self):
        # time window selection
        date = datetime.datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]

        now_date = datetime.datetime.now() - timedelta(days=0)  # current time
        now_time = str(now_date).split(' ')[0]
        print('爬取时间段:{}到{}'.format(news_start_time, now_time))
        # start date (y-m-d), the bound further from now: news_start_time
        self.start_time = news_start_time
        # end date (y-m-d), the bound closer to now: now_time
        self.end_time = now_time
        self.is_break = False
        self.pid = os.getpid()

        # connect to HDFS
        self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/weibo/{}'.format(
                time.strftime('%Y%m%d')))  # create the daily folder
Example #9
    def __init__(self, file_path):

        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }

        # time window selection
        date = datetime.now() - timedelta(days=7)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=1)  # yesterday's date
        yesterday = str(yesterday).split(' ')[0]
        # start date (y-m-d), the bound further from now: news_start_time
        self.start_time = news_start_time
        # self.start_time = '2019-09-09'
        # end date (y-m-d), the bound closer to now: yesterday
        self.end_time = yesterday
        # self.end_time = '2019-09-16'
        logger.info('爬取时间段:{}到{}'.format(self.start_time, self.end_time))
        # logging.info('爬取时间段:{}到{}'.format(self.start_time, self.end_time))
        # time range for crawling comments
        self.comment_start_time = yesterday  # one day of replies
        # self.comment_start_time = '2019-09-09'  # one day of replies
        # self.comment_start_time = ''  # replies with no time limit
        self.comment_end_time = yesterday
        # self.comment_end_time = '2019-09-16'
        # spider work flag
        self.is_work = True
        self.set_list = []  # de-duplication list
        self.file_path = file_path
        self.hdfsclient = HdfsClient(url='http://jq-chance-05:9870',
                                     user='******')
        self.hdfsclient.makedirs('{}/{}'.format(
            self.file_path,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # create the daily folder
        self.time_time = str(time.time()).split('.')[0]
        self.source_date = ''
        self.source_time = ''
Example #10
    def __init__(self):
        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }

        self.start_url = ''
        # comment API URL template
        self.commnet_port_url = ''

        # time section: crawl by the hour
        # date the spider starts crawling from
        date = datetime.now() - timedelta(days=7)
        news_start_time = str(date).split(' ')[0]

        # date the spider stops crawling at
        current_time = datetime.now()  # current date
        current_day = str(current_time).split(' ')[0]

        print('爬取时间段:{}到{}'.format(news_start_time, current_day))
        logging.info('爬取时间段:{}到{}'.format(news_start_time, current_day))

        # start date (y-m-d), the bound further from now: news_start_time
        self.start_time = news_start_time
        # end date (y-m-d), the bound closer to now: current_day
        self.end_time = current_day

        # spider work flag 1
        self.is_break = False
        # spider work flag 2
        self.is_work = False
        # connect to HDFS
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/forum/{}'.format(
                time.strftime('%Y%m%d')))  # create the daily folder
        self.time_data = str(time.time()).split('.')[0]
Example #11
class Spider(object):
    """
    A spider template.
    """

    def __init__(self):

        # time section: crawl by the hour
        date_time = str(datetime.now() - timedelta(days=1)).split('.')[0]
        start_time_test = time.strftime('%Y-%m-%d 00:00:00')

        end_time = time.strftime('%Y-%m-%d %H:%M:%S')
        a = end_time.split(' ')[1].split(':')[0]

        if a == '00':
            start_time_data = date_time
            hours_name = '22_24'
            wen_jian_jia_date = str(datetime.now() - timedelta(days=1)).split('.')[0].split(' ')[0].replace('-', '')
        else:
            two_hours_ago = int(a) - 2
            if len(str(two_hours_ago)) == 1:
                two_hour_ago = '0' + str(two_hours_ago)
            else:
                two_hour_ago = str(two_hours_ago)
            hours_name = str(two_hour_ago) + '_' + str(a)
            start_time_data = start_time_test
            wen_jian_jia_date = time.strftime('%Y%m%d')
        print('爬取时间段:{}到{}'.format(start_time_data, end_time))
        logging.info('爬取时间段:{}到{}'.format(start_time_data, end_time))

        # start time, the bound further from now: start_time_data
        self.start_time = start_time_data
        # end time, the bound closer to now: end_time
        self.end_time = end_time
        self.pid = os.getpid()

        self.h2_name = hours_name
        self.date_time = wen_jian_jia_date
        # spider work flag
        self.is_break = False
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******')
        self.hdfsclient.makedirs('/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format(wen_jian_jia_date, hours_name))  # create the daily folder
        self.time_data = str(time.time()).split('.')[0]

    # strip all HTML tags
    def re_html(self, data):
        # remove HTML tags from the scraped data
        try:
            message = str(data)
            re_h = re.compile(r'</?\w+[^>]*>')  # HTML tag pattern
            ret1 = re_h.sub('', message)
            ret2 = re.sub('<a.*></a>', '', ret1)
            return ret2
        except:
            pass

    # strip non-digit characters from the monthly sales figure
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    def parse_url(self, data):
        # build a dict to hold the product data
        goods_dict = dict()
        goods_dict['平台'] = data['平台']
        goods_dict['关键词'] = data['关键词']
        goods_dict['商品名'] = data['商品名']
        goods_dict['商品图片'] = data['商品图片']
        goods_dict['URL'] = data['URL']
        goods_dict['shop_name'] = data['shop_name']
        goods_dict['goods_id'] = data['goods_id']
        goods_dict['品牌'] = data['品牌']
        goods_dict['月销量'] = data['月销量']
        # logger.log(31, '--------********正在抓取的商品是:%s********--------' % goods_dict)
        self.parse_goods_price(goods_dict)

    # parse product price information
    def parse_goods_price(self, goods_dict):
        try:
            goods_url = 'https://p.3.cn/prices/mgets?callback=jQuery6465631&source=jshop&skuids=J_{}'.format(goods_dict['goods_id'])
            headers = {
                'content-type': 'application/json;charset=utf-8',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                'accept-encoding': 'gzip, deflate, br',
                'accept-language': 'zh-CN,zh;q=0.9',
                'cache-control': 'no-cache',
                'pragma': 'no-cache',
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            try:
                time.sleep(0.2)
                response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
            price_data = re.search(r'\[{".*"}]', response.text)
            goods_dict['价格'] = json.loads(price_data.group())[0]['p']
            # print(goods_dict)
            self.parse_comment_num(goods_dict)
        except:
            print(22222222222222222222222, traceback.format_exc())

    # fetch the product comment count
    def parse_comment_num(self, goods_dict):
        try:
            productId = goods_dict['goods_id']
            referer_url = goods_dict['URL']
            comment_url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv46&productId={}&score=0&sortType=6&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1'.format(productId)
            headers = {
                'content-type': 'text/html;charset=GBK',
                'authority': 'sclub.jd.com',
                'method': 'GET',
                # 'cookie': 'shshshfpa=32a16413-dbf0-50ea-e5b3-fc0700600c82-1555380265; shshshfpb=xpfj85AdZf7nEIXa%2FfPnKQA%3D%3D; user-key=76e73b75-478f-450a-843d-e6bc97ab6f57; TrackID=1JkU9AvzDgHTRRBHhgHdYahMQFpg9HwywXxp4mumaDTg3wgCwgl-Om3llO2sZlBTQ7ojPYO3q3E7f1jiEFu3roH67lDo9yP-tEUKh5hPh0R0; pinId=0ng4x50EOTPaVd8k7Hb6MA; pin=t15239619067; unick=t152*****067; _tp=WXVubGec3KjciXDtJzPQhA%3D%3D; _pst=t15239619067; mt_xid=V2_52007VwMWVllaW1scThxaBGIDEFFYXlRbGEwdbARlBkJVVVBVRhwZHV4ZYgRGVEEIVgpMVRxbAWYEQlNfUFQPF3kaXQVvHxNXQVhaSx9JEl8NbAAbYl9oUmoWQRhYBGULEFRVWltTGkkcWgZiMxdb; unpl=V2_ZzNtbRBSRkd2CBFULxxcBmIBFV0SUxYRfFsTAHweWAdiChReclRCFX0UR1FnGVQUZwYZXktcQRRFCEdkeB5fA2AFEFlBZxVLK14bADlNDEY1WnwHBAJfFn0PTlJ7GFQFYwIabXJUQyV1CXZUfx1YB24CEVpHUUIQdQpFUX0fXQJiByJtRWdzJXEMQFF6GGwEVwIiHxYLSxV2CkdTNhlYAWMBG1xBUEYTdA1GVngcWgNmBBdZclZzFg%3d%3d; __jdv=122270672|google-search|t_262767352_googlesearch|cpc|kwd-296971091509_0_c44c21f1e4124361a5d58bde66534872|1555655309636; cn=1; _gcl_au=1.1.1967935789.1555659711; __jdc=122270672; areaId=2; __jdu=15553802647041324770645; __jda=122270672.15553802647041324770645.1555380265.1556415731.1556518168.15; ipLoc-djd=2-2830-51800-0; wlfstk_smdl=zn0664dqolt95jf7g1wjtft1hao7l0yl; 3AB9D23F7A4B3C9B=HPX726VSHMRMSR3STZRR7N5NRDNPYWVN43VETWWM5H7ZKTJNQRUDNAN3OFAJHRA4GMFUVMZ4HQPSNV63PBO6R5QDQI; shshshfp=4a332a1f062877da491a157dabe360b2; shshshsID=60254c5e3d13551f63eed3d934c61d6d_8_1556519503209; __jdb=122270672.11.15553802647041324770645|15.1556518168; JSESSIONID=831DC446C63444F227CAFCFFA4085E88.s1',
                'referer': referer_url,
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            }
            try:
                time.sleep(0.2)
                response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
            comment_data = re.search(r'{"productAttr":null.*]}', response.text)
            comment_number = json.loads(comment_data.group())['productCommentSummary']['commentCount']
            goods_dict['comment_num'] = json.loads(comment_data.group())['productCommentSummary']['commentCountStr']
            # print(comment_number)
            if int(comment_number) == 0:
                logger.log(31, '************该商品没有评论数据*********')
            else:
                pages = int(math.ceil(float(int(comment_number) / 10)))
                self.goods_comments(goods_dict, pages)
        except:
            print(33333333333333333333333, traceback.format_exc())

    # parse product comments
    def goods_comments(self, goods_dict, pages):
        try:
            is_break = self.is_break
            # print(goods_dict)
            productId = goods_dict['goods_id']
            referer_url = goods_dict['URL']
            headers = {
                'content-type': 'text/html;charset=GBK',
                'authority': 'sclub.jd.com',
                'method': 'GET',
                # 'cookie': 'shshshfpa=32a16413-dbf0-50ea-e5b3-fc0700600c82-1555380265; shshshfpb=xpfj85AdZf7nEIXa%2FfPnKQA%3D%3D; user-key=76e73b75-478f-450a-843d-e6bc97ab6f57; TrackID=1JkU9AvzDgHTRRBHhgHdYahMQFpg9HwywXxp4mumaDTg3wgCwgl-Om3llO2sZlBTQ7ojPYO3q3E7f1jiEFu3roH67lDo9yP-tEUKh5hPh0R0; pinId=0ng4x50EOTPaVd8k7Hb6MA; pin=t15239619067; unick=t152*****067; _tp=WXVubGec3KjciXDtJzPQhA%3D%3D; _pst=t15239619067; mt_xid=V2_52007VwMWVllaW1scThxaBGIDEFFYXlRbGEwdbARlBkJVVVBVRhwZHV4ZYgRGVEEIVgpMVRxbAWYEQlNfUFQPF3kaXQVvHxNXQVhaSx9JEl8NbAAbYl9oUmoWQRhYBGULEFRVWltTGkkcWgZiMxdb; unpl=V2_ZzNtbRBSRkd2CBFULxxcBmIBFV0SUxYRfFsTAHweWAdiChReclRCFX0UR1FnGVQUZwYZXktcQRRFCEdkeB5fA2AFEFlBZxVLK14bADlNDEY1WnwHBAJfFn0PTlJ7GFQFYwIabXJUQyV1CXZUfx1YB24CEVpHUUIQdQpFUX0fXQJiByJtRWdzJXEMQFF6GGwEVwIiHxYLSxV2CkdTNhlYAWMBG1xBUEYTdA1GVngcWgNmBBdZclZzFg%3d%3d; __jdv=122270672|google-search|t_262767352_googlesearch|cpc|kwd-296971091509_0_c44c21f1e4124361a5d58bde66534872|1555655309636; cn=1; _gcl_au=1.1.1967935789.1555659711; __jdc=122270672; areaId=2; __jdu=15553802647041324770645; __jda=122270672.15553802647041324770645.1555380265.1556415731.1556518168.15; ipLoc-djd=2-2830-51800-0; wlfstk_smdl=zn0664dqolt95jf7g1wjtft1hao7l0yl; 3AB9D23F7A4B3C9B=HPX726VSHMRMSR3STZRR7N5NRDNPYWVN43VETWWM5H7ZKTJNQRUDNAN3OFAJHRA4GMFUVMZ4HQPSNV63PBO6R5QDQI; shshshfp=4a332a1f062877da491a157dabe360b2; shshshsID=60254c5e3d13551f63eed3d934c61d6d_8_1556519503209; __jdb=122270672.11.15553802647041324770645|15.1556518168; JSESSIONID=831DC446C63444F227CAFCFFA4085E88.s1',
                'referer': referer_url,
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            }
            if int(pages) >= 50:
                pages_num = 50
            else:
                pages_num = pages
            # fetch the comment pages (50 pages at most; the first page index is 0)
            for i in range(0, int(pages_num)):
                comment_url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv46&productId={}&score=0&sortType=6&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1'.format(productId, i)
                # print(comment_url)
                try:
                    time.sleep(0.2)
                    response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
                except:
                    try:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
                    except:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
                # print("正在抓取的页面是: %s" % comment_url)
                comments = response.text
                # print(comments)  # {"productAttr":null,.*]}
                comment = re.search(r'{"productAttr":null.*]}', comments)
                # print(comment.group())
                items = json.loads(comment.group())['comments']
                if int(len(items)) == 0:
                    break
                else:
                    goods_comment_dict = dict()
                    for item in items:
                        date_data = item['creationTime'].split(' ', 1)[0]
                        time_data = item['creationTime'].split(' ', 1)[1]
                        # print(date, time)
                        try:
                            content = self.re_html(item['content'])
                        except:
                            content = ''
                        # appended (follow-up) comment
                        try:
                            comments_2 = item['afterUserComment']['content']
                        except:
                            comments_2 = ''
                        # check whether the comment time falls inside the crawl window
                        if self.start_time <= item['creationTime']:
                            goods_comment_dict['platform'] = goods_dict['平台']
                            goods_comment_dict['date'] = date_data.strip()
                            goods_comment_dict['time'] = time_data.strip()
                            goods_comment_dict['keyword'] = goods_dict['关键词']
                            goods_comment_dict['name'] = goods_dict['商品名']
                            goods_comment_dict['imageurl'] = goods_dict['商品图片']
                            goods_comment_dict['audiourl'] = ''
                            goods_comment_dict['url'] = goods_dict['URL']
                            goods_comment_dict['shop_name'] = goods_dict['shop_name']
                            goods_comment_dict['user_name'] = item['nickname']
                            goods_comment_dict['author_id'] = ''
                            goods_comment_dict['content'] = content + ';' + comments_2
                            goods_comment_dict['content_id'] = str(item['id'])
                            goods_comment_dict['brand'] = goods_dict['品牌']
                            goods_comment_dict['price'] = goods_dict['价格']
                            goods_comment_dict['sales'] = goods_dict['月销量']
                            goods_comment_dict['focus_count'] = ''
                            goods_comment_dict['comment_num'] = goods_dict['comment_num']
                            goods_comment_dict['views'] = ''
                            goods_comment_dict['likes'] = item['usefulVoteCount']
                            try:
                                goods_comment_dict['comments_count'] = item['replyCount']
                            except:
                                goods_comment_dict['comments_count'] = ''
                            goods_comment_dict['reposts_count'] = ''
                            goods_comment_dict['topic_id'] = str(goods_dict['goods_id'])
                            try:
                                goods_comment_dict['type'] = item['productColor']
                            except:
                                goods_comment_dict['type'] = ''
                            try:
                                goods_comment_dict['size'] = item['productSize']
                            except:
                                goods_comment_dict['size'] = ''
                            goods_comment_dict['file_code'] = '51'
                            # print('********--------开始写入商品数据--------********')
                            # print(goods_comment_dict)
                            item = json.dumps(dict(goods_comment_dict), ensure_ascii=False) + '\n'
                            self.hdfsclient.new_write('/user/cspider_daily/nike_2h/ecommerce/{}/{}/51_{}_jingdong_nike{}.json'.format(self.date_time, self.h2_name, time.strftime('%Y%m%d'), self.pid), item, encoding='utf-8')
                        if date_data.strip() < self.start_time:
                            is_break = True
                    if is_break:
                        break
        except:
            print(444444444444444444444444444, traceback.format_exc())

    def run(self, data):
        self.parse_url(data)
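
Example 11 repeats the same nested try/except three times to retry a request, and then digs the JSON payload out of a JSONP response with re.search. A minimal sketch of both ideas as reusable helpers; this is not part of the original code, and fetch_with_retries, jsonp_payload and the keyword defaults are illustrative only.

import json
import re
import time

import requests

def fetch_with_retries(url, headers=None, proxies=None, attempts=3, delay=0.2):
    """GET url, retrying up to `attempts` times like the nested try/except
    blocks above; re-raises the last error if every attempt fails."""
    last_error = None
    for _ in range(attempts):
        try:
            time.sleep(delay)
            return requests.get(url, headers=headers, proxies=proxies,
                                allow_redirects=False, timeout=30)
        except requests.RequestException as exc:
            last_error = exc
    raise last_error

def jsonp_payload(text):
    """Extract the JSON object wrapped in a JSONP callback, e.g.
    'fetchJSON_comment98vv46({...});' -> a parsed dict."""
    match = re.search(r'\{.*\}', text, re.S)
    return json.loads(match.group()) if match else None
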
Example #12
class Spider(object):
    """
    A spider template.
    """
    def __init__(self, file_path):

        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }

        # time window selection
        date = datetime.now() - timedelta(days=7)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=1)  # yesterday's date
        yesterday = str(yesterday).split(' ')[0]
        # start date (y-m-d), the bound further from now: news_start_time
        self.start_time = news_start_time
        # self.start_time = '2019-09-09'
        # end date (y-m-d), the bound closer to now: yesterday
        self.end_time = yesterday
        # self.end_time = '2019-09-16'
        logging.log(31, '爬取时间段:{}到{}'.format(self.start_time, self.end_time))
        # time range for crawling comments
        self.comment_start_time = yesterday  # one day of replies
        # self.comment_start_time = '2019-08-01'  # one day of replies
        # self.comment_start_time = ''  # replies with no time limit
        self.comment_end_time = yesterday
        # self.comment_end_time = yesterday
        # spider work flag
        self.is_work = True
        self.file_path = file_path
        self.hdfsclient = HdfsClient(url='http://*****:*****')
        # [remainder of __init__ masked in the original example; a later method
        # that parses the search-result list resumes below]
        li_list = data.xpath('.//ul[@class="ui-list"]/li')
        for li in li_list:
            title = li.xpath('.//h2/a/text()')[0]
            news_url = li.xpath('.//h2/a/@href')[0]
            topic_time = ''.join(
                li.xpath('.//div[@class="ui-topic-attr"]/span[2]/text()')
            ).strip().split(':')[1]
            last_rreplay_time = li.xpath(
                './/div[@class="ui-topic-attr"]/span[3]/text()')[0].split(
                    ':')[1]
            views_replay = li.xpath(
                './/div[@class="ui-topic-attr"]/span[4]/text()')[0]
            # time window check ---------------
            get_time = self.time_change(topic_time)
            get_news_time = time.mktime(time.strptime(get_time, "%Y-%m-%d"))
            end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d"))
            if self.start_time != '':
                start_time = time.mktime(
                    time.strptime(self.start_time, "%Y-%m-%d"))
            else:
                start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d"))
            if float(get_news_time) < float(start_time):
                self.is_work = False

            if float(start_time) <= float(get_news_time) < float(
                    end_time):  # content inside the time window
                self.get_topic_page(news_url, title, views_replay, keyword)

        a_list = data.xpath('.//a[@class="ui-page-cell"]')  # 翻页判断和操作
        for a in a_list:
            get_text = a.xpath('.//parent::a/text()')
            get_text = ''.join(get_text)
            if '下一页' in get_text:
                next_url = 'https://www.libaclub.com/' + a.xpath(
                    './/parent::a/@href')[0]
                self.get_search_page(next_url, keyword)

    def get_topic_page(self, url, title, views_replay, keyword):
        logger.log(31, '主贴url: ' + url)
        response = requests.get(url, headers=self.headers_one, timeout=60)
        data = etree.HTML(response.content.decode('utf-8', 'ignore'))
        div_list = data.xpath('.//div[@class="ui-topic"]')
        total_item = ''
        for div in div_list:
            content = div.xpath(
                './/div[@class="ui-topic-content fn-break"]/text()')[0]
            item = {}
            item['platform'] = '篱笆网'
            date_all = div.xpath(
                './/div[@class="ui-topic-operate"]/div[@class="fn-left"]/text()'
            )[0]

            item['date'] = date_all.split(' ')[0]
            item['time'] = date_all.split(' ')[1]
            try:
                item['author'] = div.xpath(
                    './/div[@class="ui-topic-author"]/p[1]/a/text()')[0]
                item['author_id'] = div.xpath(
                    './/div[@class="ui-topic-author"]/p[1]/a/@href')[0].split(
                        '/')[-1]
            except:
                item['author'] = div.xpath(
                    './/div[@class="ui-topic-author"]/p[@class="ui-topic-author-name ui-topic-author-anonymityName"]/text()'
                )[0]
                item['author_id'] = ''
            try:
                item['post_client'] = div.xpath(
                    './/div[@class="from-iphone"]/a/text()')[0]
            except:
                item['post_client'] = ''
            item['title'] = title
            item['content'] = content.strip()
            item['content_id'] = div.xpath(
                './/div[@class="ui-topic-operate"]/div[@class="fn-right"]/a[1]/@href'
            )[0].split('/')[-1].split('.')[0].split('_')[-1]
            item['brand'] = ''
            item['carseries'] = ''
            try:
                item['from'] = div.xpath(
                    './/div[@class="ui-topic-author"]/p[4]/text()')[0]
            except:
                item['from'] = ''
            item['series_url'] = ''
            item['url'] = url

            floor = div.xpath('.//span[@class="ui-dropdown-self"]/text()')[0]
            item['floor'] = floor
            item['identification'] = ''
            item['favorite'] = ''
            try:
                item['signin_time'] = div.xpath(
                    './/div[@class="ui-topic-author"]/p[3]/text()')[0]
            except:
                item['signin_time'] = ''

            if floor == '楼主':
                item['views'] = views_replay.split('/')[0]
                item['reply_no'] = views_replay.split('/')[1]
                self.source_date = date_all.split(' ')[0]
                self.source_time = date_all.split(' ')[1]
                item['is_topics'] = '是'

            else:
                item['reply_no'] = ''
                item['views'] = ''
                item['is_topics'] = '否'
            item['source_date'] = self.source_date
            item['source_time'] = self.source_time
            item['likes'] = ''
            item['is_elite'] = ''
            item['topic_count'] = ''
            item['reply_count'] = ''
            item['pick_count'] = ''
            item['follows'] = ''
            item['topic_categroy'] = ''
            item['topic_type'] = ''
            item['insert_time'] = str(datetime.now()).split('.')[0]
            item['update_time'] = str(datetime.now()).split('.')[0]
            item['topic_id'] = url.split('.h')[0].split('_')[-2]
            item['reply_floor'] = ''
            item['keyword'] = keyword
            item['file_code'] = '185'
            item['reposts_count'] = ''
            # print(item)
            item = json.dumps(dict(item), ensure_ascii=False) + '\n'
            total_item += item

        self.__write_news_jsonfile(total_item)

        if data.xpath('.//a[@class="ui-paging-next"]/@href'):  # 判断是否有下一页, 翻页操作
            next_page_url = 'https://www.libaclub.com' + data.xpath(
                './/a[@class="ui-paging-next"]/@href')[0]
            self.get_topic_page(next_page_url, title, views_replay, keyword)

    # write items out as a JSON file
    def __write_news_jsonfile(self, item):

        # with open('./../libawang/{}_liba_news_nike.json'.format(str(datetime.now()).split(' ')[0]), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        self.hdfsclient.new_write('{}/{}/185_{}_{}_liba_news.json'.format(
            self.file_path,
            str(datetime.now()).split(' ')[0].replace('-', ''),
            str(datetime.now()).split(' ')[0].replace('-', '_'),
            self.time_time),
                                  item,
                                  encoding='utf-8')

    def time_change(self, str_time):
        """
        Convert relative time expressions ('分钟前' minutes ago, '小时前' hours ago, '昨天' yesterday, '前天' the day before yesterday, '天前' days ago) into the standard Y-m-d date format.
        :param str_time:
        :return:
        """
        if '秒' in str_time or '刚刚' in str_time:
            get_time = str(datetime.now()).split('.')[0]
            return get_time

        elif '分钟' in str_time:
            get_time_num = re.search('\d{1,2}', str_time).group(0)
            get_time_num = int(get_time_num) * 60
            int_time = int(str(time.time()).split('.')[0]) - get_time_num
            # convert to local time
            time_local = time.localtime(float(int_time))
            # reformat into the new time format (e.g. 2016-05-05)
            dt = time.strftime("%Y-%m-%d", time_local)  # "%Y-%m-%d %H:%M:%S"
            return dt

        elif '小时' in str_time:
            get_time_num = re.search('\d{1,2}', str_time).group(0)
            get_time_num = int(get_time_num) * 60 * 60
            # print(get_time_num)
            int_time = int(str(time.time()).split('.')[0]) - get_time_num
            # convert to local time
            time_local = time.localtime(float(int_time))
            # reformat into the new time format (e.g. 2016-05-05)
            dt = time.strftime("%Y-%m-%d", time_local)  # "%Y-%m-%d %H:%M:%S"
            return dt

        elif '昨天' in str_time:
            try:
                part_time = str_time.split(' ')[1]
                yesterday = datetime.now() - timedelta(days=1)  # yesterday's date
                yesterday = str(yesterday).split(' ')[0]
            except:
                yesterday = datetime.now() - timedelta(days=1)  # yesterday's date
                yesterday = str(yesterday).split(' ')[0]
            return yesterday

        elif '前天' in str_time:
            part_time = str_time.split(' ')[1]
            two_days_ago = datetime.now() - timedelta(days=2)  # the day before yesterday
            two_days_ago = str(two_days_ago).split(
                ' ')[0] + ' ' + part_time.replace('点', ':').replace('分', '')
            return two_days_ago

        elif '天前' in str_time:
            part_time = str_time.split('天前')[0]
            two_days_ago = datetime.now() - timedelta(
                days=int(part_time))  # N days ago
            two_days_ago = str(two_days_ago).split(' ')[0]
            return two_days_ago

        elif '年' in str_time:
            str_time = str_time.replace('年',
                                        '-').replace('月',
                                                     '-').replace('日', '')
            return str_time

        elif ' ' in str_time and '202' not in str_time:
            str_time = str(
                datetime.now()).split('-')[0] + '-' + str_time.split(' ')[0]
            return str_time
        else:
            # str_time = '2019-' + str_time.replace('月', '-').replace('日', '')
            return str_time

    def run(self):
        url = 'https://www.libaclub.com/facade.php?act=search&searchAction=keyword&keyword={}&sId=&timetype=2&timeBegin=1563938285&timeEnd=1566530285&sid=0&searchScope=0&orderBy=0&page=1'
        url_list = get_config_para('nike_daily_keywords')
        logger.log(31, url_list)
        for item in url_list:
            # print(1)
            keyword = item['keywords']
            logger.log(31, keyword)
            if keyword:
                search_url = url.format(keyword)
                try:
                    self.get_search_page(search_url, keyword)
                except:
                    logger.error(traceback.format_exc())
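
The time_change method above normalizes relative Chinese timestamps ('N分钟前', '昨天', 'N天前', ...) into dates. A condensed sketch of the same mapping; it is not the original implementation and only covers the relative cases, passing anything else through unchanged.

import re
from datetime import datetime, timedelta

def normalize_relative_time(text):
    """Map relative expressions to a 'YYYY-MM-DD' date string."""
    now = datetime.now()
    match = re.search(r'\d{1,2}', text)
    if '秒' in text or '刚刚' in text:
        return now.strftime('%Y-%m-%d')
    if '分钟' in text and match:
        return (now - timedelta(minutes=int(match.group()))).strftime('%Y-%m-%d')
    if '小时' in text and match:
        return (now - timedelta(hours=int(match.group()))).strftime('%Y-%m-%d')
    if '昨天' in text:
        return (now - timedelta(days=1)).strftime('%Y-%m-%d')
    if '前天' in text:
        return (now - timedelta(days=2)).strftime('%Y-%m-%d')
    if '天前' in text and match:
        return (now - timedelta(days=int(match.group()))).strftime('%Y-%m-%d')
    return text  # absolute dates are returned unchanged
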
Example #13
    def __init__(self, file_path, comment_path, need_time):

        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }
        # comment API URL template
        self.comment_port_url = 'http://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/{}/comments/newList?ibc=newspc&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&offset={}&callback=jsonp_1542355418897&_=1542355418898'

        # # get_now_time = time.time() - 86400
        # get_now_time = time.time() - int(need_time)
        # time_local = time.localtime(float(get_now_time))
        # # reformat into the new time format (2016-05-05 20:28:54)
        # dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        # end_t = time.time()
        # time_local = time.localtime(float(end_t))
        # # reformat into the new time format (2016-05-05 20:28:54)
        # end_dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        # # end_time = str(end_time).split(' ')[0]
        # logging.log(31, '爬取时间段:{}到{}'.format(dt, str(datetime.now())))
        # # start time (y-m-d), the bound further from now
        # self.start_time = dt
        # # self.start_time = '2019-09-09 00:01'
        # # end time (y-m-d), the bound closer to now
        # self.end_time = end_dt
        # # self.end_time = '2019-09-16 12:57'

        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3  # round the current hour down to a multiple of 3
        if num == 0:  # special case for the 0-3 AM window
            # time window selection
            date = datetime.now() - timedelta(days=1)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=1)  # yesterday's date
            yesterday = str(yesterday).split(' ')[0]
        else:
            # time window selection
            date = datetime.now() - timedelta(days=0)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=0)  # today's date
            yesterday = str(yesterday).split(' ')[0]
        # start time, the bound further from now: news_start_time
        self.start_time = news_start_time + ' 0:00'
        # end time, the bound closer to now: yesterday
        self.end_time = yesterday + ' 23:59'
        # spider work flag
        self.is_work = True
        self.file_name_time = self.get_file_name_time()
        self.file_path = file_path
        self.comment_apth = comment_path
        self.hdfsclient = HdfsClient(url='http://jq-chance-05:9870',
                                     user='******')
        hour = str(datetime.now()).split(' ')[-1].split(':')[0]
        if str(hour) != '00':
            two_hour_ago = int(hour) - 2
            if len(str(two_hour_ago)) == 1:
                two_hour_ago = '0' + str(two_hour_ago)
            self.hour_name = str(two_hour_ago) + '_' + str(hour)
        else:
            self.hour_name = '22_24'
        self.hdfsclient.makedirs('{}/{}/{}'.format(
            self.file_path,
            self.file_name_time.split(' ')[0].replace('-', ''),
            self.hour_name))  # create the daily folder
        self.hdfsclient.makedirs('{}/{}/{}'.format(
            self.comment_apth,
            self.file_name_time.split(' ')[0].replace('-', ''),
            self.hour_name))  # create the daily folder
        self.time_time = str(time.time()).split('.')[0]
Example #14
class Spider(object):
    """
    NetEase Sports news spider.
    """
    def __init__(self, file_path, comment_path, need_time):

        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }
        # comment API URL template
        self.comment_port_url = 'http://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/{}/comments/newList?ibc=newspc&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&offset={}&callback=jsonp_1542355418897&_=1542355418898'

        # # get_now_time = time.time() - 86400
        # get_now_time = time.time() - int(need_time)
        # time_local = time.localtime(float(get_now_time))
        # # reformat into the new time format (2016-05-05 20:28:54)
        # dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        # end_t = time.time()
        # time_local = time.localtime(float(end_t))
        # # reformat into the new time format (2016-05-05 20:28:54)
        # end_dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        # # end_time = str(end_time).split(' ')[0]
        # logging.log(31, '爬取时间段:{}到{}'.format(dt, str(datetime.now())))
        # # start time (y-m-d), the bound further from now
        # self.start_time = dt
        # # self.start_time = '2019-09-09 00:01'
        # # end time (y-m-d), the bound closer to now
        # self.end_time = end_dt
        # # self.end_time = '2019-09-16 12:57'

        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3  # round the current hour down to a multiple of 3
        if num == 0:  # special case for the 0-3 AM window
            # time window selection
            date = datetime.now() - timedelta(days=1)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=1)  # yesterday's date
            yesterday = str(yesterday).split(' ')[0]
        else:
            # time window selection
            date = datetime.now() - timedelta(days=0)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=0)  # today's date
            yesterday = str(yesterday).split(' ')[0]
        # start time, the bound further from now: news_start_time
        self.start_time = news_start_time + ' 0:00'
        # end time, the bound closer to now: yesterday
        self.end_time = yesterday + ' 23:59'
        # spider work flag
        self.is_work = True
        self.file_name_time = self.get_file_name_time()
        self.file_path = file_path
        self.comment_apth = comment_path
        self.hdfsclient = HdfsClient(url='http://*****:*****')
        # [remainder of __init__ masked in the original example; a later method
        # that walks the article list resumes below]
        li_list = data.xpath('.//ul[@class="articleList"]/li')
        for li in li_list:
            title = li.xpath('.//a/text()')[0]
            news_url = li.xpath('.//a/@href')[0]
            try:
                self.get_news_info_page(news_url, '', '')
            except:
                try:
                    self.get_news_info_page(news_url, '', '')
                except:
                    logger.error(traceback.format_exc())

    # fetch the news detail page
    def get_news_info_page(self, news_url, comment_count, page_list):
        logger.log(31, '文章url:  ' + news_url)
        item = {}
        response = requests.get(news_url, headers=self.headers_one)
        status_code = response.status_code
        if status_code == 200:
            try:
                data = response.content.decode('gbk')
            except (UnicodeDecodeError, ):
                data = response.content.decode('utf-8')
            data = etree.HTML(data)
            news_id = news_url.split('/')[-1].split('.')[0]
            try:
                title = data.xpath('.//div[@id="epContentLeft"]/h1/text()')[0]
            except:
                title = data.xpath('.//h1/text()')[0]
            try:
                date_all = data.xpath(
                    './/div[@class="post_time_source"]/text()')[0]
                date_all = re.findall('\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
                                      date_all)[0]
            except:
                date_all = data.xpath(
                    './/div[@class="headline"]/span/text()')[0]

            # get the comment count
            try:
                comment_response = requests.get('http://comment.tie.163.com/' +
                                                str(news_id) + '.html',
                                                headers=self.headers_one)
                # print('http://comment.tie.163.com/' + str(news_id) + '.html')
                # comment_data = comment_response.content.decode()
                count = re.search('"tcount":\d{0,10}',
                                  comment_response.text).group(0)
                count = count.split(":")[1]
                comment_id = news_id
            except AttributeError:
                headers = {
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
                    'Host': 'comment.tie.163.com',
                    'Accept':
                    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                    'Upgrade-Insecure-Requests': '1',
                    'Accept-Language': 'zh-CN,zh;q=0.9',
                }
                comment_id = re.search('docId" :  "(.*)?",',
                                       response.text).group(1)
                # print(comment_id)
                # print('http://comment.tie.163.com/' + str(comment_id) + '.html')
                comment_response = requests.get('http://comment.tie.163.com/' +
                                                str(comment_id) + '.html',
                                                headers=headers)
                count = re.search('"tcount":\d{0,10}',
                                  comment_response.text).group(0)
                count = count.split(":")[1]
            except:
                # print(traceback.format_exc())
                count = ''

            # site
            item['platform'] = '网易新闻'
            # date
            # time window check ---------------
            get_news_time = time.mktime(
                time.strptime(str(date_all.split(' ')[0]).strip(), "%Y-%m-%d"))
            # end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d %H:%M"))
            if self.start_time != '':
                start_time = time.mktime(
                    time.strptime(self.start_time.split(' ')[0], "%Y-%m-%d"))
            else:
                start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d"))
            if float(get_news_time) < float(start_time):
                print('时间不符合')
            elif float(start_time) <= float(get_news_time):

                date = date_all.strip().split(' ')[0]
                item['date'] = date
                news_time = date_all.strip().split(' ')[1]
                item['time'] = news_time
                item['title'] = title
                # source
                try:
                    source = data.xpath(
                        './/div[@class="post_time_source"]/a/text()')[0]
                    item['article_source'] = source  # article source
                except:
                    item['article_source'] = ''
                try:
                    item['article_author'] = data.xpath(
                        './/span[@class="ep-editor"]/text()')[0]
                except:
                    item['article_author'] = ''
                # article body content
                content = data.xpath(
                    './/div[@id="endText"]/p/text() | .//div[@id="endText"]/p/a/text() |.//div[@class="overview"]//p/text()'
                )
                images_url = data.xpath('.//div[@id="endText"]//img/@src')

                content = ''.join(content)
                content = content.replace('\n', '')
                content = content.replace(' ', '')
                item['content'] = content
                item['keyword'] = ''
                item['views'] = ''
                item['comments_count'] = count
                item['likes'] = ''
                item['clicks'] = ''
                item['article_url'] = news_url  # article detail URL
                item['dislikes'] = ''  # number of downvotes
                item['series_url'] = ''  # car series home page
                item['list_url'] = page_list  # article list URL
                if 'buy' in page_list:
                    news_type = '购车'
                elif 'nauto' in page_list:
                    news_type = '新车'
                elif 'drive' in page_list:
                    news_type = '试驾'
                elif 'buyers_guides' in page_list:
                    news_type = '导购'
                elif 'auto_newenergy' in page_list:
                    news_type = '新能源'
                elif 'news' in page_list:
                    news_type = '行业'
                else:
                    news_type = ''

                item['article_type_1st'] = news_type  # article type
                item['article_type_2nd'] = ''  # article type
                item['insert_time'] = str(
                    datetime.now()).split('.')[0]  # first crawl time
                item['update_time'] = str(
                    datetime.now()).split('.')[0]  # last crawl time
                content_id = news_url.split('/')[-1].split('.')[0]
                item['content_id'] = content_id
                item['topic_id'] = str(content_id)  # main post id
                item['author_id'] = ''  # author id
                item['content_id'] = str(content_id)
                item['file_code'] = '15'
                item['reposts_count'] = ''
                item['imageurl'] = images_url
                item['audiourl'] = []
                # print(item)
                self.__write_news_jsonfile(item)

                # call the comment-crawling function
                # http://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/E0IBEEA10008856S/comments/newList?ibc=newspc&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&offset=0&callback=jsonp_1542355418897&_=1542355418898
                self.is_get_comment = True
                self.comment_page_num = 30
                self.get_comment_info(
                    self.comment_port_url.format(comment_id, "0"), news_id,
                    date, news_time, title, news_url)
                # with open('./../wangyi/json_file/{}/{}_news_id.json'.format(self.file_name_time.split(' ')[0],self.file_name_time.split(' ')[0]),'a') as f:
                #     com_item = {}
                #     com_item['url'] = self.comment_port_url.format(comment_id, "0")
                #     com_item['news_id'] = news_id
                #     com_item['date'] = date
                #     com_item['news_time'] = news_time
                #     com_item['title'] = title
                #     com_item['news_url'] = news_url
                #     f.write(str(com_item) + '\n')

    # 获取评论信息
    def get_comment_info(self, url, news_id, source_date, source_time,
                         source_title, source_url):
        # time.sleep(1)

        s = requests.session()
        s.keep_alive = False
        response = s.get(url, headers=self.headers_one)
        status_code = response.status_code
        if status_code == 200:
            data = response.content.decode()
            try:
                data = re.findall(r'{"commentIds.*newListSize":\d{0,10}}',
                                  data)[0]
                data = json.loads(data)
                comment_data = data['comments']
                comment_id = data['commentIds']
                if comment_id:
                    total_item = ''
                    for comment_info in comment_data.items():
                        # print(comment_info)
                        item = {}
                        comment_info = comment_info[1]
                        # 网站
                        item['platform'] = '网易新闻'
                        # 日期时间
                        date_all = comment_info['createTime']
                        get_date = date_all[:-3]
                        #  评论部分做时间判断部分---------------
                        logger.log(31, date_all)
                        logger.log(31, get_date)
                        get_news_time = time.mktime(
                            time.strptime(str(get_date), "%Y-%m-%d %H:%M"))
                        # self.start_time / self.end_time are plain Y-m-d day
                        # strings, so parse them with "%Y-%m-%d"; add a day so
                        # comments posted on the end day itself stay in range.
                        end_time = time.mktime(
                            time.strptime(self.end_time, "%Y-%m-%d")) + 86400
                        if self.start_time != '':
                            start_time = time.mktime(
                                time.strptime(self.start_time, "%Y-%m-%d"))
                        else:
                            start_time = time.mktime(
                                time.strptime('2010-1-1', "%Y-%m-%d"))
                        if float(get_news_time) < float(start_time):
                            self.is_get_comment = False  # 返回的回答消息是按时间进行排序的,所以当时间小于指定时间时,就停止爬取,
                            break
                        elif float(start_time) <= float(
                                get_news_time) <= float(end_time):
                            item['date'] = get_date
                            comment_time = date_all.split(' ')[1]
                            item['time'] = comment_time
                            # 发帖作者
                            try:
                                author = comment_info['user']['nickname']
                            except KeyError:
                                author = comment_info['user']['location'] + '网友'
                            item['author'] = author

                            item['author_id'] = comment_info['user'][
                                'userId']  # 用户id
                            # 内容
                            content = comment_info['content']
                            item['content'] = content
                            # 点赞数
                            item['likes'] = comment_info['vote']
                            # 原文发布日期时间
                            item['source_date'] = source_date
                            item['source_time'] = source_time
                            # 原文标题
                            item['title'] = source_title
                            # 原文url
                            item['source_url'] = source_url
                            item['keyword'] = ''
                            item['floor'] = ''
                            item[
                                'comment_url'] = 'http://comment.tie.163.com/' + str(
                                    news_id) + '.html'
                            item['comments_count'] = ''
                            item['views'] = ''
                            item['dislikes'] = comment_info['against']  # 踩人数
                            item['insert_time'] = str(
                                datetime.now()).split('.')[0]  # 初始爬取时间
                            item['update_time'] = str(
                                datetime.now()).split('.')[0]  # 最后爬取时间
                            content_id = source_url.split('/')[-1].split(
                                '.')[0]
                            item['topic_id'] = str(content_id)  # 主贴id
                            item['content_id'] = comment_info[
                                'commentId']  # 评论id
                            item['file_code'] = '29'
                            item['reposts_count'] = ''
                            item = json.dumps(dict(item),
                                              ensure_ascii=False) + '\n'
                            total_item += item
                            # print(item)
                    self.__write_comment_jsonfile(total_item)
                    if self.is_get_comment:
                        self.comment_page_num += 30
                        # print(self.comment_page_num, '111111111111111111111111')
                        self.get_comment_info(
                            self.comment_port_url.format(
                                news_id, str(self.comment_page_num)), news_id,
                            source_date, source_time, source_title, source_url)
                else:
                    logger.log(31, '评论爬取完毕')
                    self.comment_page_num = 30
            except:
                logger.error(traceback.format_exc())
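
    # Illustrative sketch (hypothetical helper, not part of the original spider):
    # get_comment_info above pages through the comment API recursively, bumping
    # the offset by 30 per call.  The same walk written as an iterator; it
    # assumes self.comment_port_url is the template used above, i.e. it takes a
    # news id and an offset via .format(news_id, offset).
    def fetch_comment_pages(self, news_id, page_size=30, max_pages=100):
        offset = 0
        for _ in range(max_pages):
            url = self.comment_port_url.format(news_id, str(offset))
            resp = requests.get(url, headers=self.headers_one)
            if resp.status_code != 200:
                break
            found = re.findall(r'{"commentIds.*newListSize":\d{0,10}}',
                               resp.content.decode())
            if not found:
                break
            data = json.loads(found[0])
            if not data.get('commentIds'):
                break  # no more comment pages
            yield data['comments']  # caller filters by time and writes the file
            offset += page_size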

    def get_file_name_time(self):
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:
            num = 24
            a = str(datetime.now() - timedelta(days=1))  # 昨天时间
        num = a.split(' ')[0] + ' ' + str(num)
        return num
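
    # Illustrative sketch (hypothetical helper, not in the original code): the
    # bucket name built by get_file_name_time is just the crawl date plus the
    # hour floored to a multiple of 3, with the 00:00-03:00 run attributed to
    # "24" of the previous day.  Written compactly:
    def bucket_name(self, now=None):
        now = now or datetime.now()
        bucket = (now.hour // 3) * 3
        if bucket == 0:
            bucket = 24
            now = now - timedelta(days=1)
        return '{} {}'.format(str(now).split(' ')[0], bucket)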

    # 写入json文件
    def __write_news_jsonfile(self, item):
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./../wangyi/json_file/{}/{}_wangyi_news.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        self.hdfsclient.new_write('{}/{}/{}/15_{}_{}_wangyi_news.json'.format(
            self.file_path,
            self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name,
            str(datetime.now()).split(' ')[0].replace('-', '_'),
            self.time_time),
                                  item,
                                  encoding='utf-8')

    def __write_comment_jsonfile(self, item):

        # with open('./../wangyi/json_file/{}/{}_wangyi_news_comment.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        self.hdfsclient.new_write(
            '{}/{}/{}/29_{}_{}_wangyi_news_comment.json'.format(
                self.comment_apth,
                self.file_name_time.split(' ')[0].replace('-',
                                                          ''), self.hour_name,
                str(datetime.now()).split(' ')[0].replace('-', '_'),
                self.time_time),
            item,
            encoding='utf-8')

    def run(self):
        # self.get_list_page('http://sports.163.com/special/0005rt/news_json.js?0.4744335570460496')
        #
        self.get_list_page_two(
            'http://sports.163.com/special/0005rt/sportsgd.html')
        for i in range(2, 5):
            if len(str(i)) == 1:
                i = '0' + str(i)
            self.get_list_page_two(
                'http://sports.163.com/special/0005rt/sportsgd_{}.html'.format(
                    str(i)))
Ejemplo n.º 15
0
class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self):
        # 时间部分
        # 爬虫开始抓取的日期
        date = datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]

        # 爬虫结束的抓取日期
        current_time = datetime.now()  # 当前日期
        current_day = str(current_time).split(' ')[0]

        print('爬取时间段:{}到{}'.format(news_start_time, current_day))
        logging.info('爬取时间段:{}到{}'.format(news_start_time, current_day))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = current_day
        # 标记爬虫工作
        self.is_work = False
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/ecommerce/{}'.format(
                time.strftime('%Y%m%d')))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile(r'</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            ret4 = re.sub(r'品牌:', '', ret3)
            ret5 = re.sub(r'\xa0', '', ret4)
            ret6 = re.sub(r'&rarr;_&rarr;', '', ret5)
            ret7 = re.sub(r'&hellip;', '', ret6)
            ret8 = re.sub(r'https:', '', ret7)
            return ret8
        except:
            pass
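
    # Usage sketch (illustrative only): re_html strips tags plus a few fixed
    # noise tokens, e.g.
    #
    #   self.re_html('<p>Nike&hellip;鞋</p>')   # -> 'Nike鞋'
    #
    # If any step raises, the bare except falls through to pass and the method
    # returns None, so callers that concatenate the result should guard for that.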

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 根据关键词搜索请求得到商品信息
    def parse_goods(self, key_word):
        try:
            # 根据关键词,例如:洗发水,抓取商品信息
            url = 'https://list.mogujie.com/search?q={}&cKey=43&page=1&sort=pop'.format(
                key_word)
            headers = {
                # 'authority': 'list.mogujie.com',
                # 'method': 'GET',
                # 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                # 'accept-encoding': 'gzip, deflate, br',
                # 'accept-language': 'zh-CN,zh;q=0.9',
                # 'cache-control': 'no-cache',
                'cookie':
                '__mgjuuid=ebddbce7-601f-4f3d-a860-d5ba8f411688; _TDeParam=1-1RjCYYeGOiwg6JI5UDopvg',
                'pragma':
                'no-cache',
                'upgrade-insecure-requests':
                '1',
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            try:
                time.sleep(0.1)
                response = requests.get(url=url,
                                        headers=headers,
                                        allow_redirects=False,
                                        timeout=20)
            except:
                try:
                    time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=20)
                except:
                    time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=20)
            # print(response.text)
            # 判断比对获取的是否是关键词关联搜索的商品
            rewriteKeyword = json.loads(response.text)['result']
            if 'searchRewrite' in rewriteKeyword:
                if rewriteKeyword['searchRewrite'][
                        'rewriteKeyword'] == key_word.replace(' ', ''):
                    # 获取商品总数
                    goods_num = json.loads(
                        response.text)['result']['wall']['total']
                    # 商品总页数
                    page_num = int(math.ceil(float(int(goods_num) / 75)))
                    for i in range(1, page_num + 1):
                        logger.log(
                            31,
                            '------正在抓取关键词: %s 的第: %s 页商品数据, 商品总页数是: %s ------'
                            % (key_word, i, page_num))
                        goods_url = 'https://list.mogujie.com/search?q={}&cKey=43&page={}&sort=pop'.format(
                            key_word, i)
                        try:
                            time.sleep(0.2)
                            response1 = requests.get(url=goods_url,
                                                     headers=headers,
                                                     allow_redirects=False,
                                                     timeout=20)
                        except:
                            try:
                                time.sleep(0.2)
                                response1 = requests.get(url=goods_url,
                                                         headers=headers,
                                                         proxies=proxies,
                                                         allow_redirects=False,
                                                         timeout=20)
                            except:
                                time.sleep(0.2)
                                response1 = requests.get(url=goods_url,
                                                         headers=headers,
                                                         proxies=proxies,
                                                         allow_redirects=False,
                                                         timeout=20)
                        # 获取商品列表信息节点
                        goods_list = json.loads(
                            response1.text)['result']['wall']['docs']
                        # print(len(goods_list))
                        # 遍历商品信息节点列表
                        for node in goods_list:
                            goods_dict = dict()
                            goods_dict['platform'] = '蘑菇街'
                            goods_dict['keyword'] = key_word
                            goods_dict['url'] = node['link']
                            goods_dict['imageurl'] = node['img']
                            goods_dict['audiourl'] = ''
                            goods_dict['name'] = node['title']
                            goods_dict['sales'] = ''
                            goods_dict['price'] = node['price']
                            goods_dict['itemID'] = node['tradeItemId']
                            goods_dict['brand'] = ''
                            goods_dict['focus_count'] = node['cfav']
                            # print(goods_dict)
                            self.parse_goods_details(goods_dict)
                else:
                    logger.log(31,
                               '------关键词: %s 搜索不到对应的商品数据--------' % key_word)
            else:
                # 获取商品总数
                goods_num = json.loads(
                    response.text)['result']['wall']['total']
                # 商品总页数
                page_num = int(math.ceil(float(int(goods_num) / 75)))
                for i in range(1, page_num + 1):
                    # logger.log(31, '------正在抓取关键词: %s 的第: %s 页商品数据, 商品总页数是: %s ------' % (key_word, i, page_num))
                    goods_url = 'https://list.mogujie.com/search?q={}&cKey=43&page={}&sort=pop'.format(
                        key_word, i)
                    try:
                        time.sleep(0.2)
                        response1 = requests.get(url=goods_url,
                                                 headers=headers,
                                                 allow_redirects=False,
                                                 timeout=20)
                    except:
                        try:
                            time.sleep(0.2)
                            response1 = requests.get(url=goods_url,
                                                     headers=headers,
                                                     proxies=proxies,
                                                     allow_redirects=False,
                                                     timeout=20)
                        except:
                            time.sleep(0.2)
                            response1 = requests.get(url=goods_url,
                                                     headers=headers,
                                                     proxies=proxies,
                                                     allow_redirects=False,
                                                     timeout=20)
                    # 获取商品列表信息节点
                    goods_list = json.loads(
                        response1.text)['result']['wall']['docs']
                    # print(len(goods_list))
                    # 遍历商品信息节点列表
                    for node in goods_list:
                        goods_dict = dict()
                        goods_dict['platform'] = '蘑菇街'
                        goods_dict['keyword'] = key_word
                        goods_dict['url'] = node['link']
                        goods_dict['imageurl'] = node['img']
                        goods_dict['audiourl'] = ''
                        goods_dict['name'] = node['title']
                        goods_dict['sales'] = ''
                        goods_dict['price'] = node['price']
                        goods_dict['itemID'] = node['tradeItemId']
                        goods_dict['brand'] = ''
                        goods_dict['focus_count'] = node['cfav']
                        # print(goods_dict)
                        self.parse_goods_details(goods_dict)
        except:
            print(111111111111111111111, traceback.format_exc())
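
    # Illustrative sketch (hypothetical helper, not part of the original spider):
    # the nested try/except blocks above retry each request, first without a
    # proxy and then through the module-level `proxies` the code already uses.
    # The same pattern can be factored into one helper:
    def request_with_retry(self, url, headers, retries=3, delay=0.2, timeout=20):
        last_error = None
        for attempt in range(retries):
            try:
                time.sleep(delay)
                # first attempt goes direct, later attempts go through the proxy
                kwargs = {} if attempt == 0 else {'proxies': proxies}
                return requests.get(url=url, headers=headers,
                                    allow_redirects=False, timeout=timeout,
                                    **kwargs)
            except Exception as err:
                last_error = err
        raise last_error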

    # 解析商品评论人数
    def parse_goods_details(self, goods_dict):
        try:
            headers = {
                'cookie':
                '__mgjuuid=7e841984-d679-49eb-9994-89abaec55322; _mwp_h5_token_enc=36d248108519bf86cf2fa681dbc521f8; _mwp_h5_token=3c71c26a371458b615f433396b39eccf_1564968570925; _ga=GA1.2.2057442167.1565061045; _gid=GA1.2.2144070558.1565061045; __mgjref=https%3A%2F%2Fshop.mogu.com%2Fdetail%2F1m6os9s%3Facm%3D3.ms.1_4_1m6os9s.43.1185-68998.4aiUQrym0Gs9T.sd_117-swt_43-imt_6-t_4aiUQrym0Gs9T-lc_4-pit_1-qid_21841-dit_170-idx_0-dm1_5001%26ptp%3D31.nXjSr.0.0.wLDh8N89',
                'pragma':
                'no-cache',
                'Referer':
                goods_dict['url'],
                'upgrade-insecure-requests':
                '1',
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            # print(goods_dict)
            url = 'https://rate.mogu.com/jsonp/pc.rate.ratelist/v2?pageSize=20&sort=1&isNewDetail=1&itemId={}&type=1&marketType=market_mogujie&page=1'.format(
                goods_dict['itemID'])
            try:
                time.sleep(0.2)
                response = requests.get(url=url,
                                        headers=headers,
                                        allow_redirects=False,
                                        timeout=20)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=20)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=20)
            # print(response.url)
            # print(response.text)
            commnent_num_data = re.search(r'{".*"success":true}',
                                          response.text)
            num_data = commnent_num_data.group()
            # print(num_data)
            if 'total' not in num_data:
                pass
                # logger.log(31, '-----------没有商品评论数据-----------')
            else:
                goods_dict['achieve_num'] = json.loads(
                    num_data)['data']['total']
                # 获取评论页数
                page_num = int(
                    math.ceil(float(int(goods_dict['achieve_num']) / 20)))
                # print(goods_dict['achieve_num'], page_num)
                self.goods_comments(goods_dict, page_num)
        except:
            print(2222222222222222222, traceback.format_exc())
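
    # Illustrative note: page_num above is a ceiling division of the review
    # count by the 20-reviews-per-page size; an equivalent integer-only form is
    #
    #   page_num = (int(goods_dict['achieve_num']) + 20 - 1) // 20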

    # 解析商品评论
    def goods_comments(self, goods_dict, page_num):
        try:
            is_break = self.is_work
            headers = {
                'cookie':
                '__mgjuuid=7e841984-d679-49eb-9994-89abaec55322; _mwp_h5_token_enc=36d248108519bf86cf2fa681dbc521f8; _mwp_h5_token=3c71c26a371458b615f433396b39eccf_1564968570925; _ga=GA1.2.2057442167.1565061045; _gid=GA1.2.2144070558.1565061045; __mgjref=https%3A%2F%2Fshop.mogu.com%2Fdetail%2F1m6os9s%3Facm%3D3.ms.1_4_1m6os9s.43.1185-68998.4aiUQrym0Gs9T.sd_117-swt_43-imt_6-t_4aiUQrym0Gs9T-lc_4-pit_1-qid_21841-dit_170-idx_0-dm1_5001%26ptp%3D31.nXjSr.0.0.wLDh8N89',
                'pragma':
                'no-cache',
                'Referer':
                goods_dict['url'],
                'upgrade-insecure-requests':
                '1',
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            # 抓取商品评论链接(总共50页,第一页从1开始)
            for i in range(1, int(page_num) + 1):
                comment_url = 'https://rate.mogu.com/jsonp/pc.rate.ratelist/v2?pageSize=20&sort=1&isNewDetail=1&itemId={}&type=1&marketType=market_mogujie&page={}'.format(
                    goods_dict['itemID'], i)
                # print(comment_url)
                # response = requests.get(url=comment_url, headers=headers, proxies=random.choice(proxies), timeout=10)
                try:
                    time.sleep(0.2)
                    response = requests.get(url=comment_url,
                                            headers=headers,
                                            allow_redirects=False,
                                            timeout=20)
                except:
                    try:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=20)
                    except:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=20)
                # print(comment_data)
                comment = re.search(r'{".*"success":true}', response.text)
                # print(comment.group())
                items = json.loads(comment.group())['data']['list']
                # print(len(items))

                goods_comment = dict()
                for item in items:
                    # print(item)
                    date_data = item['time'].replace('年', '-').replace(
                        '月', '-').replace('日', '')
                    if len(date_data.split('-')) == 2:
                        date_data_test = time.strftime('%Y') + '-' + date_data
                    else:
                        date_data_test = date_data
                    # print(date_data_test)
                    # 判断评论时间是否在规定的抓取时间内
                    if self.start_time <= date_data_test.strip():
                        goods_comment['platform'] = goods_dict['platform']
                        goods_comment['date'] = date_data_test.strip()
                        goods_comment['time'] = ''
                        goods_comment['keyword'] = goods_dict['keyword']
                        goods_comment['name'] = goods_dict['name']
                        goods_comment['imageurl'] = goods_dict['imageurl']
                        goods_comment['audiourl'] = goods_dict['audiourl']
                        goods_comment['url'] = goods_dict['url']
                        goods_comment['shop_name'] = ''
                        goods_comment['user_name'] = item['user']['uname']
                        goods_comment['content'] = item['content']
                        goods_comment['content_id'] = item['rateId']
                        goods_comment['brand'] = goods_dict['brand']
                        goods_comment['price'] = goods_dict['price']
                        goods_comment['sales'] = goods_dict['sales']
                        goods_comment['focus_count'] = goods_dict[
                            'focus_count']
                        goods_comment['comment_num'] = goods_dict[
                            'achieve_num']
                        goods_comment['views'] = ''
                        goods_comment['likes'] = ''
                        goods_comment['comments_count'] = ''
                        goods_comment['reposts_count'] = ''
                        goods_comment['author_id'] = item['user']['uid']
                        goods_comment['topic_id'] = goods_dict['itemID']
                        try:
                            goods_comment['type'] = item['style'].split(
                                ':')[1].replace(' 尺码', '')
                        except:
                            goods_comment['type'] = ''
                        try:
                            goods_comment['size'] = item['style'].split(':')[2]
                        except:
                            goods_comment['size'] = ''
                        goods_comment['file_code'] = '177'
                        # logger.log(31, '--------------正在写入符合时间的商品评论-----------------------')
                        # print(goods_comment)
                        item = json.dumps(dict(goods_comment),
                                          ensure_ascii=False) + '\n'
                        self.hdfsclient.new_write(
                            '/user/cspider_daily/nike_daily/ecommerce/{}/177_{}_{}_MoGujie_nike.json'
                            .format(time.strftime('%Y%m%d'),
                                    time.strftime('%Y%m%d'), self.time_data),
                            item,
                            encoding='utf-8')
                    if date_data_test.strip() < self.start_time:
                        is_break = True
                if is_break:
                    break
        except:
            print(3333333333333333333, traceback.format_exc())
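
    # Illustrative sketch (hypothetical helper): the loop above turns review
    # dates written with 年/月/日 into "Y-m-d" strings, prefixing the current
    # year when the year is omitted.  Factored out, under those assumptions:
    def normalize_review_date(self, raw):
        date_data = raw.replace('年', '-').replace('月', '-').replace('日', '')
        if len(date_data.split('-')) == 2:  # year missing, assume current year
            date_data = time.strftime('%Y') + '-' + date_data
        return date_data.strip()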

    # 读取excel获取关键词
    def parse_xlsx(self):
        # 设置路径
        path = './快消采集关键词_v3_20200330.xlsx'
        # 打开execl
        workbook = xlrd.open_workbook(path)

        # 根据sheet索引或者名称获取sheet内容
        Data_sheet = workbook.sheets()[0]  # 通过索引获取

        rowNum = Data_sheet.nrows  # sheet行数
        colNum = Data_sheet.ncols  # sheet列数

        # 获取所有单元格的内容
        list = []
        for i in range(rowNum):
            rowlist = []
            for j in range(colNum):
                rowlist.append(Data_sheet.cell_value(i, j))
            list.append(rowlist)

        for data in list[1::]:
            brand = data[0]
            # print(brand)
            yield {
                '关键词': brand,
            }

    def run(self):
        key_word_list = []
        for item in self.parse_xlsx():
            # print(item)
            key_word_list.append(item)
        for item_dat in key_word_list:
            # print(item_dat['关键词'])
            self.parse_goods(item_dat['关键词'])
Ejemplo n.º 16
0
class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, redis_example):
        self.headers = {
            'Content-Type':
            'text/html; charset=utf-8',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Cache-Control':
            'no-cache',
            # 'Cookie': 'vip_rip=101.86.55.85; vip_address=%257B%2522pname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522cname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522pid%2522%253A%2522103101%2522%252C%2522cid%2522%253A%2522103101101%2522%257D; vip_province=103101; vip_province_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_code=103101101; vip_wh=VIP_SH; mars_pid=0; mars_sid=a369b0e73f9656dbd3eda470968f6cd2; _smt_uid=5d4156d3.52d69d05; user_class=a; VipUINFO=luc%3Aa%7Csuc%3Aa%7Cbct%3Ac_new%7Chct%3Ac_new%7Cbdts%3A0%7Cbcts%3A0%7Ckfts%3A0%7Cc10%3A0%7Crcabt%3A0%7Cp2%3A0%7Cp3%3A1%7Cp4%3A0%7Cp5%3A1; vipte_viewed_=6917921732696396695%2C793920209978892%2C2161644495; visit_id=4C5B033907F8247A18F2811FF8D147F0; _jzqco=%7C%7C%7C%7C%7C1.15943944.1564563154491.1564740333894.1564740386032.1564740333894.1564740386032.0.0.0.24.24; mars_cid=1564563151837_048422ec87f93127ee1eced568a171af',
            'Host':
            'category.vip.com',
            'Pragma':
            'no-cache',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
        }
        # 时间部分,按小时抓取
        date_time = str(datetime.now() - timedelta(days=1)).split('.')[0]
        start_time_test = time.strftime('%Y-%m-%d 00:00:00')

        end_time = time.strftime('%Y-%m-%d %H:%M:%S')
        a = end_time.split(' ')[1].split(':')[0]

        if a == '00':
            start_time_data = date_time
            hours_name = '22_24'
            wen_jian_jia_date = str(datetime.now() - timedelta(
                days=1)).split('.')[0].split(' ')[0].replace('-', '')
        else:
            two_hours_ago = int(a) - 2
            if len(str(two_hours_ago)) == 1:
                two_hour_ago = '0' + str(two_hours_ago)
            else:
                two_hour_ago = str(two_hours_ago)
            hours_name = str(two_hour_ago) + '_' + str(a)
            start_time_data = start_time_test
            wen_jian_jia_date = time.strftime('%Y%m%d')
        print('爬取时间段:{}到{}'.format(start_time_data, end_time))
        logging.info('爬取时间段:{}到{}'.format(start_time_data, end_time))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = start_time_data
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = end_time
        # 标记爬虫工作
        self.is_break = False
        self.redis_example = redis_example
        self.pid = os.getpid()

        self.h2_name = hours_name
        self.date_time = wen_jian_jia_date
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format(
                wen_jian_jia_date, hours_name))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 时间戳转换时间
    def time_change(self, data):
        # 替换抓取数据中的html标签
        try:
            timeStamp = float(int(data) / 1000)
            timeArray = time.localtime(timeStamp)
            otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
            return otherStyleTime
        except:
            pass
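
    # Usage sketch (illustrative): the code treats postTime as milliseconds, so
    # time_change divides by 1000 before formatting to local time, e.g. with an
    # assumed sample input:
    #
    #   self.time_change(1564974326000)   # -> '2019-08-05 11:05:26' in UTC+8
    #
    # datetime.fromtimestamp(int(data) / 1000).strftime('%Y-%m-%d %H:%M:%S')
    # would produce the same string.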

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 获取评论量
    def parse_comments_num(self, goods_dict):
        try:
            headers = {
                # 'Cookie': 'vip_rip=101.86.55.85; vip_address=%257B%2522pname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522cname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522pid%2522%253A%2522103101%2522%252C%2522cid%2522%253A%2522103101101%2522%257D; vip_province=103101; vip_province_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_code=103101101; vip_wh=VIP_SH; mars_pid=0; mars_sid=a369b0e73f9656dbd3eda470968f6cd2; _smt_uid=5d4156d3.52d69d05; VipDFT=1; visit_id=2221152ECC2AD948DF7AB8D56322CE59; vipAc=cf3c0da6d5b52c0f6088b0148efbdb22; vipshop_passport_src=https%3A%2F%2Fdetail.vip.com%2Fdetail-1710618487-6918048587083491095.html; PASSPORT_ACCESS_TOKEN=1FDEBDAAF470FFB2C3C6A9EEAF7256FBA60D1F08; VipRUID=298018734; VipUID=0f94f94cc1ea26b39e78438380499d64; VipRNAME=152*****067; VipLID=0%7C1564973676%7C4b447f; VipDegree=D1; user_class=c; VipUINFO=luc%3Ac%7Csuc%3Ac%7Cbct%3Ac_new%7Chct%3Ac_new%7Cbdts%3A0%7Cbcts%3A0%7Ckfts%3A0%7Cc10%3A0%7Crcabt%3A0%7Cp2%3A0%7Cp3%3A1%7Cp4%3A0%7Cp5%3A1; PHPSESSID=b9bnc95dlt7r4eg2r196td02i4; vipte_viewed_=6917921732696396695%2C793920209978892%2C2161644495%2C6918048587083491095%2C6917922115290256471; VipCI_te=0%7C%7C1564974326; _jzqco=%7C%7C%7C%7C%7C1.15943944.1564563154491.1564974076993.1564974326073.1564974076993.1564974326073.0.0.0.39.39; waitlist=%7B%22pollingId%22%3A%22F90BE7CF-3F21-4012-800F-E1F26000E5BF%22%2C%22pollingStamp%22%3A1564974516121%7D; mars_cid=1564563151837_048422ec87f93127ee1eced568a171af',
                'Host':
                'detail.vip.com',
                'Pragma':
                'no-cache',
                'Referer':
                goods_dict['url'],
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            url = 'https://detail.vip.com/v2/mapi?_path=rest/content/reputation/getCountBySpuId&spuId={}&brandId={}&app_name=shop_pc'.format(
                goods_dict['spuId'], goods_dict['brandId'])
            try:
                time.sleep(0.1)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            achieve_num_data = json.loads(response.text)['data']
            goods_dict['achieve_num'] = achieve_num_data
            if int(achieve_num_data) == 0:
                page_num = int(
                    math.ceil(float((int(achieve_num_data) + 1) / 10)))
                # logger.log(31, '评论数是: %s , 评论页数是: %s ' % (goods_dict['achieve_num'], str(page_num)))
                # print(goods_dict)
                self.parse_comments(goods_dict, page_num)
            else:
                page_num = int(math.ceil(float(int(achieve_num_data) / 10)))
                # logger.log(31, '评论数是: %s , 评论页数是: %s ' % (goods_dict['achieve_num'], str(page_num)))
                # print(goods_dict)
                self.parse_comments(goods_dict, page_num)
        except:
            print(222222222222222222222222, traceback.format_exc())

    # 抓取商品评论
    def parse_comments(self, goods_dict, page_num):
        try:
            if page_num == 0:
                pass
                # logger.log(31, '0000000000000000没有商品评论信息000000000000000000')
            else:
                is_break = self.is_break
                headers = {
                    # 'Cookie': 'vip_rip=101.86.55.85; vip_address=%257B%2522pname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522cname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522pid%2522%253A%2522103101%2522%252C%2522cid%2522%253A%2522103101101%2522%257D; vip_province=103101; vip_province_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_code=103101101; vip_wh=VIP_SH; mars_pid=0; mars_sid=a369b0e73f9656dbd3eda470968f6cd2; _smt_uid=5d4156d3.52d69d05; VipDFT=1; visit_id=2221152ECC2AD948DF7AB8D56322CE59; vipAc=cf3c0da6d5b52c0f6088b0148efbdb22; vipshop_passport_src=https%3A%2F%2Fdetail.vip.com%2Fdetail-1710618487-6918048587083491095.html; PASSPORT_ACCESS_TOKEN=1FDEBDAAF470FFB2C3C6A9EEAF7256FBA60D1F08; VipRUID=298018734; VipUID=0f94f94cc1ea26b39e78438380499d64; VipRNAME=152*****067; VipLID=0%7C1564973676%7C4b447f; VipDegree=D1; user_class=c; VipUINFO=luc%3Ac%7Csuc%3Ac%7Cbct%3Ac_new%7Chct%3Ac_new%7Cbdts%3A0%7Cbcts%3A0%7Ckfts%3A0%7Cc10%3A0%7Crcabt%3A0%7Cp2%3A0%7Cp3%3A1%7Cp4%3A0%7Cp5%3A1; PHPSESSID=b9bnc95dlt7r4eg2r196td02i4; vipte_viewed_=6917921732696396695%2C793920209978892%2C2161644495%2C6918048587083491095%2C6917922115290256471; VipCI_te=0%7C%7C1564974326; _jzqco=%7C%7C%7C%7C%7C1.15943944.1564563154491.1564974076993.1564974326073.1564974076993.1564974326073.0.0.0.39.39; waitlist=%7B%22pollingId%22%3A%22F90BE7CF-3F21-4012-800F-E1F26000E5BF%22%2C%22pollingStamp%22%3A1564974516121%7D; mars_cid=1564563151837_048422ec87f93127ee1eced568a171af',
                    'Host':
                    'detail.vip.com',
                    'Pragma':
                    'no-cache',
                    'Referer':
                    goods_dict['url'],
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
                }
                for i in range(1, int(page_num) + 1):
                    # logger.log(31, '*************************抓取评论第:%s 页' % i)
                    url = 'https://detail.vip.com/v2/mapi?_path=rest/content/reputation/queryBySpuId&spuId={}&brandId={}&page={}&pageSize=10&app_name=shop_pc&keyWordNlp=%E6%9C%80%E6%96%B0'.format(
                        goods_dict['spuId'], goods_dict['brandId'], i)
                    try:
                        time.sleep(0.1)
                        response = requests.get(url=url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                    except:
                        try:
                            time.sleep(0.1)
                            response = requests.get(url=url,
                                                    headers=headers,
                                                    proxies=proxies,
                                                    allow_redirects=False,
                                                    timeout=30)
                        except:
                            time.sleep(0.1)
                            response = requests.get(url=url,
                                                    headers=headers,
                                                    proxies=proxies,
                                                    allow_redirects=False,
                                                    timeout=30)
                    # 商品评价列表
                    comments_list = json.loads(response.text)['data']
                    if int(len(comments_list)) == 0:
                        break
                    else:
                        comment_dict = dict()
                        for item in comments_list:
                            date_data = self.time_change(
                                item['reputation']['postTime'])
                            # print(date_data)
                            if self.start_time <= date_data:
                                comment_dict['platform'] = goods_dict[
                                    'platform']
                                comment_dict['date'] = date_data.split(' ')[0]
                                comment_dict['time'] = date_data.split(' ')[1]
                                comment_dict['keyword'] = goods_dict['keyword']
                                comment_dict['name'] = goods_dict['name']
                                comment_dict['imageurl'] = goods_dict['商品图片']
                                comment_dict['audiourl'] = ''
                                comment_dict['url'] = goods_dict['url']
                                comment_dict['shop_name'] = ''
                                comment_dict['user_name'] = item[
                                    'reputationUser']['authorName']
                                comment_dict['author_id'] = str(
                                    item['reputationUser']['userIdentity'])
                                comment_dict['content'] = item['reputation'][
                                    'content']
                                comment_dict['content_id'] = str(
                                    item['reputation']['reputationId'])
                                comment_dict['brand'] = goods_dict['brand']
                                comment_dict['price'] = goods_dict['price']
                                comment_dict['sales'] = goods_dict['sales']
                                comment_dict['focus_count'] = ''
                                comment_dict['comment_num'] = goods_dict[
                                    'achieve_num']
                                comment_dict['views'] = ''
                                comment_dict['likes'] = ''
                                comment_dict['comments_count'] = ''
                                comment_dict['reposts_count'] = ''
                                comment_dict['topic_id'] = str(
                                    goods_dict['url'].split('-')[2].replace(
                                        '.html', ''))
                                try:
                                    comment_dict['type'] = item[
                                        'reputationProduct']['colorInfo']
                                except:
                                    comment_dict['type'] = ''
                                try:
                                    comment_dict['size'] = item[
                                        'reputationProduct']['size']
                                except:
                                    comment_dict['size'] = ''
                                comment_dict['file_code'] = '179'
                                # logger.log(31, '---------------正在写入符合时间的商品评论---------------------')
                                # print(comment_dict)
                                # self.write_Nike_jsonfile(comment_dict)
                                item = json.dumps(dict(comment_dict),
                                                  ensure_ascii=False) + '\n'
                                self.hdfsclient.new_write(
                                    '/user/cspider_daily/nike_2h/ecommerce/{}/{}/179_{}_WPH_nike{}.json'
                                    .format(self.date_time, self.h2_name,
                                            time.strftime('%Y%m%d'), self.pid),
                                    item,
                                    encoding='utf-8')
                            if date_data < self.start_time:
                                is_break = True
                        if is_break:
                            break
        except:
            print(33333333333333333333, traceback.format_exc())

    # def parse_xlsx(self):
    #     # 设置路径
    #     path = './快消采集关键词_0916_v3-1.xlsx'
    #     # 打开execl
    #     workbook = xlrd.open_workbook(path)
    #
    #     # 根据sheet索引或者名称获取sheet内容
    #     Data_sheet = workbook.sheets()[0]  # 通过索引获取
    #
    #     # print(Data_sheet.name)  # 获取sheet名称
    #     rowNum = Data_sheet.nrows  # sheet行数
    #     colNum = Data_sheet.ncols  # sheet列数
    #
    #     # 获取所有单元格的内容
    #     list = []
    #     for i in range(rowNum):
    #         rowlist = []
    #         for j in range(colNum):
    #             rowlist.append(Data_sheet.cell_value(i, j))
    #         list.append(rowlist)
    #
    #     for data in list[1::]:
    #         brand = data[0]
    #         # print(brand)
    #         yield {
    #             '关键词': brand,
    #         }

    def run(self, lock):
        for num in range(1000000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('WPH_nike_url')
            if str(redis_url_num) == '0':
                print(
                    '**********Redis消息队列中url为空.....进程 {} 抓取结束.....***********'.
                    format(str(os.getpid())))
                lock.release()
                break

            item = self.redis_example.brpop('WPH_nike_url', timeout=3600)[1]
            lock.release()
            item1 = json.loads(item.decode())
            # print(item)
            self.parse_comments_num(item1)
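
    # Illustrative sketch (hypothetical name, not part of the original code):
    # brpop with a timeout returns None when the queue stays empty, so indexing
    # [1] directly can raise TypeError.  A defensive variant of the worker loop
    # above, using the same 'WPH_nike_url' queue, could look like this:
    def run_safe(self, lock):
        while True:
            lock.acquire()
            try:
                popped = self.redis_example.brpop('WPH_nike_url', timeout=3600)
            finally:
                lock.release()
            if popped is None:  # queue stayed empty for the whole timeout
                print('Redis queue empty, process {} exiting'.format(self.pid))
                break
            self.parse_comments_num(json.loads(popped[1].decode()))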
Ejemplo n.º 17
0
class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, redis_example):
        # 时间部分
        # 爬虫开始抓取的日期
        date = datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]
        # 爬虫结束的抓取日期
        current_time = datetime.now()  # 当前日期
        current_day = str(current_time).split(' ')[0]
        print('爬取时间段:{}到{}'.format(news_start_time, current_day))
        logging.info('爬取时间段:{}到{}'.format(news_start_time, current_day))
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = current_day

        # 标记爬虫工作
        self.is_break = False
        self.redis_example = redis_example
        self.pid = os.getpid()
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/ecommerce/{}'.format(
                time.strftime('%Y%m%d')))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile(r'</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            ret4 = re.sub(r'品牌:', '', ret3)
            ret5 = re.sub(r'\xa0', '', ret4)
            ret6 = re.sub(r'&rarr;_&rarr;', '', ret5)
            ret7 = re.sub(r'&hellip;&hellip;', '', ret6)
            ret8 = re.sub(r'":', '', ret7)
            return ret8
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 过滤url里面的#detail
    def re_detail(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'#detail', '', message)
            return ret1
        except:
            pass

    # 解析请求得到的商品信息
    def parse_goods_url(self, data):
        goods_dict = dict()
        goods_dict['平台'] = '天猫'
        goods_dict['URL'] = data['URL']
        goods_dict['商品名'] = data['商品名']
        goods_dict['价格'] = data['价格']
        goods_dict['shop_name'] = data['shop_name']
        goods_dict['关键词'] = data['关键词']
        goods_dict['品牌'] = data['品牌']
        goods_dict['itemId'] = data['itemId']
        goods_dict['sellerId'] = data['sellerId']
        goods_dict['imageurl'] = data['商品图片']
        goods_dict['audiourl'] = ''
        # logger.log(31, '***************************正在抓取的商品是:%s.................' % goods_dict)
        self.parse_goods_details(goods_dict)

    # 解析商品详情信息
    def parse_goods_details(self, goods_dict):
        try:
            url = 'https://h5api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?jsv=2.4.8&appKey=12574478&sign=88ccc8f2a1382304046c12960f2711c9&api=mtop.taobao.detail.getdetail&v=6.0&dataType=jsonp&ttid=2017%40taobao_h5_6.6.0&AntiCreep=true&type=jsonp&data=%7B%22itemNumId%22%3A%22{}%22%7D'.format(
                goods_dict['itemId'])
            headers = {'User-Agent': random.choice(user_agent_list)}
            try:
                time.sleep(0.3)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.3)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            timeout=30)
                except:
                    time.sleep(0.3)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            timeout=30)
            if 'apiStack' not in response.text:
                # print('******该商品已下架******')
                pass
            else:
                json_data = json.loads(
                    json.loads(
                        response.text,
                        strict=False)['data']['apiStack'][0]['value'])['item']

                commentCount = json.loads(
                    response.text)['data']['item']['commentCount']  # 评价数
                favcount = json.loads(
                    response.text)['data']['item']['favcount']  # 收藏数
                sellCount = json_data['sellCount']  # 月销量
                spuId = json_data['spuId']
                goods_dict['spuid'] = spuId
                goods_dict['月销量'] = sellCount
                goods_dict['人气数'] = favcount
                goods_dict['评价人数'] = commentCount
                if int(self.re_not_number(goods_dict['评价人数'])) == 0:
                    # logger.log(31, '---------该商品没有评价数据-------')
                    response.close()
                    pass
                else:
                    pages_num = int(
                        math.ceil(float(int(goods_dict['评价人数']) / 20)))
                    response.close()
                    self.goods_comments(goods_dict, pages_num)
        except:
            print(5555555555555555555555, traceback.format_exc())
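
    # Illustrative note: the detail endpoint nests the live sales data as a JSON
    # string inside data.apiStack[0].value, which is why the code above calls
    # json.loads twice.  The assumed shape, roughly:
    #
    #   outer = json.loads(response.text, strict=False)
    #   inner = json.loads(outer['data']['apiStack'][0]['value'])['item']
    #   sell_count, spu_id = inner['sellCount'], inner['spuId']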

    # 解析商品评论
    def goods_comments(self, goods_dict, pages_num):
        try:
            is_break = self.is_break
            # print(goods_dict)
            itemId = goods_dict['itemId']
            sellerId = goods_dict['sellerId']
            spuId = goods_dict['spuid']

            headers = {
                'cookie':
                't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB',
                'pragma': 'no-cache',
                'upgrade-insecure-requests': '1',
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer':
                'https://detail.tmall.com/item.htm?spm=a230r.1.14.16.26804e4ck29eWS&id=597034992998&ns=1&abbucket=1',
                'User-Agent': random.choice(user_agent_list)
            }
            if int(pages_num) >= 99:
                pages = 99
            else:
                pages = pages_num
            # logger.log(31, '-------------评论总页数是:%s --------------' % pages)
            # 抓取商品评论链接(总共99页,从1开始)
            for i in range(1, int(pages) + 1):
                comment_url = 'https://rate.tmall.com/list_detail_rate.htm?itemId={}&spuId={}&sellerId={}&order=1&currentPage={}'.format(
                    itemId, spuId, sellerId, i)
                # print(comment_url)
                # response = requests.get(url=comment_url, headers=headers, proxies=random.choice(proxies), timeout=10)
                try:
                    time.sleep(0.2)
                    response = requests.get(url=comment_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    try:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                    except:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                comment_data = response.text
                # logger.log(31, '开始抓取评论')
                # print(comment_data)
                comment = re.search(
                    r'{"rateDetail":{"rateCount":{"total":.*"tags":\[]}}',
                    comment_data)
                # print(comment.group())
                items = json.loads(comment.group())['rateDetail']['rateList']
                # print(items)
                goods_data = dict()
                logger.log(31, '--------********开始写入商品数据********--------')
                for item in items:
                    date_data = item['rateDate'].split(' ', 1)[0]
                    time_data = item['rateDate'].split(' ', 1)[1]
                    # print('评论时间', date_data, time_data)
                    try:
                        content = item['appendComment']['content']
                    except:
                        content = ''

                    # 判断评论时间是否在规定的抓取时间内
                    if self.start_time <= date_data.strip():
                        goods_data['platform'] = goods_dict['平台']
                        goods_data['date'] = date_data.strip()
                        goods_data['time'] = time_data.strip()
                        goods_data['keyword'] = goods_dict['关键词']
                        goods_data['name'] = goods_dict['商品名']
                        goods_data['url'] = goods_dict['URL']
                        goods_data['shop_name'] = goods_dict['shop_name']
                        goods_data['user_name'] = item['displayUserNick']
                        goods_data['content'] = self.re_html(
                            item['rateContent']) + ';' + str(
                                self.re_html(content))
                        goods_data['content_id'] = str(item['id'])
                        goods_data['brand'] = goods_dict['品牌']
                        goods_data['price'] = goods_dict['价格']
                        goods_data['sales'] = goods_dict['月销量']
                        goods_data['focus_count'] = goods_dict['人气数']
                        goods_data['comment_num'] = goods_dict['评价人数']
                        goods_data['views'] = ''
                        goods_data['likes'] = ''
                        goods_data['comments_count'] = ''
                        goods_data['author_id'] = ''
                        goods_data['reposts_count'] = ''
                        goods_data['topic_id'] = str(goods_dict['itemId'])
                        # Determine type and size from auctionSku
                        test_data = item['auctionSku']
                        if '分类' in test_data:
                            goods_data['type'] = test_data.split(
                                ':')[1].replace(';尺码:',
                                                '').replace(';鞋码', '').replace(
                                                    ';尺码', '')
                            try:
                                goods_data['size'] = test_data.split(
                                    ':')[2].split(';')[0]
                            except:
                                try:
                                    goods_data['size'] = test_data.split(
                                        ':')[2]
                                except:
                                    goods_data['size'] = ''
                        else:
                            goods_data['type'] = ''
                            goods_data['size'] = ''
                        goods_data['imageurl'] = goods_dict['imageurl']
                        goods_data['audiourl'] = goods_dict['audiourl']
                        goods_data['file_code'] = '50'
                        # print(goods_data)
                        # item = json.dumps(dict(goods_data), ensure_ascii=False) + '\n'
                        # with open('./json_data/{}_tmall_goods_data_{}.json'.format(time.strftime('%Y%m%d'), self.pid), 'ab') as f:
                        #     f.write(item.encode("utf-8"))
                        item = json.dumps(dict(goods_data),
                                          ensure_ascii=False) + '\n'
                        self.hdfsclient.new_write(
                            '/user/cspider_daily/nike_daily/ecommerce/{}/50_{}_{}_Tmall_nike{}.json'
                            .format(time.strftime('%Y%m%d'),
                                    time.strftime('%Y%m%d'), self.time_data,
                                    self.pid),
                            item,
                            encoding='utf-8')
                    if date_data.strip() < self.start_time:
                        is_break = True
                if is_break:
                    break
        except:
            print(7777777777777777777, traceback.format_exc())
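
    # Hedged sketch (not part of the original code): the nested try/except retry
    # pattern used above for requests.get() could be factored into a helper like
    # this; `proxies` is whatever mapping the caller already passes to requests.
    @staticmethod
    def _get_with_retry(url, headers, proxies, retries=3, delay=0.2, timeout=30):
        """Issue a GET request, retrying up to `retries` times before raising."""
        for attempt in range(retries):
            try:
                time.sleep(delay)
                return requests.get(url=url,
                                    headers=headers,
                                    proxies=proxies,
                                    allow_redirects=False,
                                    timeout=timeout)
            except Exception:
                if attempt == retries - 1:
                    raise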

    def run(self, lock):
        for num in range(1000000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('Tmall_day_url')
            if str(redis_url_num) == '0':
                print('******* The Redis queue of Tmall urls is empty; process {} is waiting... *******'.
                      format(str(os.getpid())))
            item = self.redis_example.brpop('Tmall_day_url', timeout=600)
            lock.release()
            if item is None:  # brpop returns None when the timeout expires
                continue
            item1 = json.loads(item[1].decode())
            # print('Crawling goods item:', item1)
            self.parse_goods_url(item1)
Ejemplo n.º 18
0
    def __init__(self, file_path, comment_path):
        self.headers_two = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            # 'Connection':'keep-alive',
            'Cookie':
            'cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%2216730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201542776168%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201542776168%7D%7D; UM_distinctid=16730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461; JSESSIONID=208cee9fea61049d61e7d18f9e9c275ecf530a9e308a94dde36658adc01a0594; wuid=154945905891357; wuid_createAt=2018-11-21 12:56:9',
            'Host': 'www.baidu.com',
            'Referer': 'http://www.yidianzixun.com/channel/c11',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.proxies = ['218.95.55.154:4243']

        # Deduplication list of already-seen ids
        self.set_list = []
        # URLs that raised errors
        self.error_url_list = []
        self.headers_one = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Host':
            'www.baidu.com',
            # 'Proxy-Connection': 'keep-alive',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }
        self.user_agent = [
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6',
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6',
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1',
        ]

        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:  # handle the three-hour block that starts at midnight
            # time-window section
            date = datetime.now() - timedelta(days=1)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=1)  # yesterday's date
            yesterday = str(yesterday).split(' ')[0]
        else:
            # time-window section
            date = datetime.now() - timedelta(days=0)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=0)  # today's date
            yesterday = str(yesterday).split(' ')[0]
        # Start date (y-m-d, the earlier bound): news_start_time
        self.start_time = news_start_time
        # End date (y-m-d, the later bound): yesterday
        self.end_time = yesterday
        try:
            self.page_ip = proxies.res_ip()
            print('ip: ', self.page_ip)
            # self.page_ip = '116.248.160.138:4261'
        except:
            time.sleep(3)
            print('Error while fetching a proxy IP: {}'.format(traceback.format_exc()))
            logger.error('Error while fetching a proxy IP: {}'.format(traceback.format_exc()))
            self.page_ip = proxies.res_ip()
        self.ip_count = 0

        # Time range for fetching comments
        # self.comment_start_time = yesterday  # replies from a single day
        self.comment_start_time = ''  # no lower bound on reply time
        self.comment_end_time = yesterday
        # self.comment_end_time = yesterday
        self.is_get_comment = True

        self.file_name_time = self.get_file_name_time()
        self.file_path = file_path
        self.comment_apth = comment_path
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        hour = str(datetime.now()).split(' ')[-1].split(':')[0]
        if str(hour) != '00':
            two_hour_ago = int(hour) - 2
            if len(str(two_hour_ago)) == 1:
                two_hour_ago = '0' + str(two_hour_ago)
            self.hour_name = str(two_hour_ago) + '_' + str(hour)
        else:
            self.hour_name = '22_24'
        self.hdfsclient.makedirs('{}/{}/{}'.format(
            self.file_path,
            self.file_name_time.split(' ')[0].replace('-', ''),
            self.hour_name))  # create the daily folder
        self.hdfsclient.makedirs('{}/{}/{}'.format(
            self.comment_apth,
            self.file_name_time.split(' ')[0].replace('-', ''),
            self.hour_name))  # create the daily folder
        self.time_time = str(time.time()).split('.')[0]
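
# Hedged sketch (illustrative, not part of the original class): the two-hour
# window name computed above ("22_24" at midnight, otherwise "<hour-2>_<hour>")
# written as a standalone helper with zero-padding handled for every hour.
def build_hour_window(now=None):
    now = now or datetime.now()
    hour = now.hour
    if hour == 0:
        return '22_24'
    return '{:02d}_{:02d}'.format(max(hour - 2, 0), hour)
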
class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, file_path, comment_path):

        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }

        self.start_url = ''
        # Comment API URL template
        self.commnet_port_url = ''
        # # Time-window section
        date = datetime.now() - timedelta(days=2)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=0)  # today's date
        yesterday = str(yesterday).split(' ')[0]
        # print('Crawl window: {} to {}'.format(news_start_time, yesterday))
        #
        # logging.info('Crawl window: {} to {}'.format(news_start_time, yesterday))
        #
        # # Start date (y-m-d, the earlier bound): news_start_time
        self.start_time = news_start_time
        # self.start_time = '2010-03-20'
        # # End date (y-m-d, the later bound): yesterday
        self.end_time = yesterday
        # self.end_time = '2019-12-09 14:08'
        # # Mark the crawler as working

        # get_now_time = time.time() - 86400  # seconds in one day (or three hours)
        # # get_now_time = time.time() - 8640000  # one hundred days
        # print(get_now_time)
        # time_local = time.localtime(float(get_now_time))
        # # convert to the target time format (2016-05-05 20:28:54)
        # dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        # print(dt)
        # end_t = time.time()
        # print(end_t)
        # time_local = time.localtime(float(end_t))
        # # convert to the target time format (2016-05-05 20:28:54)
        # end_dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        # print(end_dt)
        # # end_time = str(end_time).split(' ')[0]
        # print('Crawl window: {} to {}'.format(dt, end_dt))

        # logging.info('Crawl window: {} to {}'.format(dt, str(datetime.now())))
        # Start date (y-m-d, the earlier bound)
        # self.start_time = dt
        # self.start_time = '2019-09-09 00:22'
        # End date (y-m-d, the later bound)
        # self.end_time = end_dt
        # self.end_time = '2019-09-16 10:22'

        self.is_work = True

        self.xhsapi = XhsApi('8ac1d719cd0a2d16')
        # Proxy server
        proxyHost = "http-cla.abuyun.com"
        proxyPort = "9030"

        # Proxy tunnel credentials
        proxyUser = "******"
        proxyPass = "******"

        proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
            "user": proxyUser,
            "pass": proxyPass,
        }

        self.proxies = {"http": proxyMeta, "https": proxyMeta}
        self.set_list = []
        self.info = seeeion_id_list
        # try:
        #     os.mkdir('./json_file/{}'.format(str(datetime.now()).split(' ')[0]))
        # except:
        #     pass

        with open('./session_id_list_topic.json') as f:
            session_id = f.read()

        self.session_id_list = eval(session_id)
        self.session_id_error = []
        self.error_count = 0
        self.file_path = file_path
        self.comment_path = comment_path
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs('{}/{}'.format(
            self.file_path,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # create the daily folder
        self.hdfsclient.makedirs('{}/{}'.format(
            self.comment_path,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # create the daily folder
        self.time_time = str(time.time()).split('.')[0]

    # def get_session_id(self):
    #     register_smid_ret = self.xhsapi.register_smid_proxy(self.ip)
    #     print('register_smid_ret:' + register_smid_ret)
    #     smid = json.loads(register_smid_ret)['detail']['deviceId']
    #     print('smid:' + smid)
    #     self.xhsapi.set_smid(smid)
    #     # 激活用户
    #     active_user_ret = self.xhsapi.active_user_proxy(self.ip)
    #     print('active_user_ret:' + active_user_ret)
    #     # 设置session id
    #     session_id = json.loads(active_user_ret)['data']['session']
    #     print('session_id:' + session_id)
    #     item = {
    #         'deviceId': "abbd5bf5-3a82-3fcd-b8b8-4e4c48f68950",
    #         'device_fingerprint': "201908191457046c8b8bd154ae84d8f7c9f8e912c573870183341147f781ee",
    #         'device_fingerprint1': "201908191457046c8b8bd154ae84d8f7c9f8e912c573870183341147f781ee",
    #         'sid': "session.1566198308579055731492",
    #         'search_id': "A9F65F9019EF946464D38BF16C0E250A",
    #     }
    #     item['device_fingerprint'] = smid
    #     item['device_fingerprint1'] = smid
    #     item['sid'] = "session." + session_id
    #     print(item)

    def get_sid(self):
        register_smid_ret = self.xhsapi.register_smid()
        print('register_smid_ret:' + register_smid_ret)
        smid = json.loads(register_smid_ret)['detail']['deviceId']
        print('smid:' + smid)
        self.xhsapi.set_smid(smid)
        # Activate the user
        active_user_ret = self.xhsapi.active_user()
        print('active_user_ret:' + active_user_ret)
        # Extract the session id
        session_id = json.loads(active_user_ret)['data']['session']
        print('session_id:' + session_id)

        return smid, session_id

    def change_ip(self):
        logger.log(31, 'Switching proxy IP')
        url = 'http://proxy.abuyun.com/switch-ip'
        time.sleep(random.randint(1, 15))
        response = requests.get(url, proxies=self.proxies)
        logger.log(31, 'Now using IP: ' + response.text)

    def res_ip(self):
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Cache-Control':
            'max-age=0',
            'Connection':
            'keep-alive',
            'Host':
            'webapi.http.zhimacangku.com',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        # provider quota: 500 IPs, 5-25 minute lifetime
        import time
        time.sleep(3)
        url = 'http://webapi.http.zhimacangku.com/getip?num=1&type=1&pro=310000&city=0&yys=0&port=1&time=2&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions='
        ip_pro = requests.get(url, headers=headers)
        # print(ip_pro.text)
        # ip_data = json.loads(ip_pro.text)
        ip = ip_pro.text.strip()

        # ip = str(ip_data['data'][0]['ip']) + ':' + str(ip_data['data'][0]['port'])
        return ip
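
    # Hedged sketch (not in the original): accepting either the plain-text reply
    # used above or the JSON shape shown in the commented-out lines, whichever
    # the provider happens to return.
    @staticmethod
    def _parse_ip_reply(body):
        body = body.strip()
        try:
            ip_data = json.loads(body)
            return str(ip_data['data'][0]['ip']) + ':' + str(ip_data['data'][0]['port'])
        except (ValueError, KeyError, IndexError, TypeError):
            return body  # plain "ip:port" text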

    def get_serach_list(self, page, keyword):
        info = random.choice(self.session_id_list)
        # info = self.sid_info
        print(self.session_id_list.index(info))
        parms = {
            'keyword': keyword,
            'platform': 'android',
            'filters': '',
            # 'sort': '',  # default (composite) ranking
            # 'sort': 'popularity_descending',  # hottest first
            'sort': 'time_descending',  # newest first
            'page': page,
            'page_size': '20',
            'source': 'explore_feed',
            # 'search_id': info['search_id'],
            'api_extra': '',
            'deviceId': info['deviceId'],
            'device_fingerprint': info['device_fingerprint'],
            'device_fingerprint1': info['device_fingerprint1'],
            'versionName': '5.35.1',
            'channel': 'YingYongBao',
            'sid': info['sid'],
            'lang': 'zh',
            't': str(round(time.time())),
        }

        url = 'https://www.xiaohongshu.com/api/sns/v9/search/notes'
        for i in range(10):
            res = self.xhsapi.get_sign(url, parms)
            print(1111, res)
            if len(res['shield']) == 32:
                break
        res = self.xhsapi.get_sign(url, parms)
        print(res['sign'])
        parms['sign'] = res['sign']
        headers = {
            # 'authorization': info['sid'],
            # 'device_id': info['deviceId'],
            'user-agent':
            'Dalvik/2.1.0 (Linux; U; Android 6.0; DIG-AL00 Build/HUAWEIDIG-AL00) Resolution/720*1280 Version/6.8.0.3 Build/6080103 Device/(HUAWEI;DIG-AL00) NetType/WiFi',
            'shield': res['shield'],
            'Host': 'www.xiaohongshu.com',
            'accept-encoding': 'gzip',
            'Connection': 'Keep-Alive',
        }

        response = requests.get(url, params=parms, headers=headers)
        print(response.url)
        if '"result":0' in response.text and 'msg:' in response.text:
            del self.session_id_list[self.session_id_list.index(info)]
            return
        json_text = json.loads(response.text)
        print(json_text)
        note_list = json_text["data"]["notes"]
        for note in note_list:
            title = note["title"]
            if not title:
                title = note["desc"]
            id = note["id"]
            print(title)
            time.sleep(0.1)
            if id not in self.set_list:

                try:
                    self.get_note(id, keyword)
                except:
                    print(traceback.format_exc())
                    try:
                        self.get_note(id, keyword)
                    except:
                        print(traceback.format_exc())
                self.set_list.append(id)
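
    # Hedged note (not part of the original): self.set_list is a plain list, so
    # the "id not in self.set_list" check above is O(n) per note; keeping the
    # seen ids in a set() instead would make the deduplication lookup O(1).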

    def get_note(self, note_id, keyword, index=0):
        info = random.choice(self.info)
        # info = self.sid_info
        logger.log(31, 'session_id index:  ' + str(self.info.index(info)))

        self.xhsapi.set_smid(info['device_fingerprint'])
        self.xhsapi.set_session_id(info['sid'].split('.')[-1])
        note_ret = self.xhsapi.get_note(note_id)
        # print(333333, note_ret)

        if '参数错误' in note_ret:
            logger.log(31, 'API returned "参数错误" (parameter error); retrying.....')
            self.get_note(note_id, keyword, index)
            return
        # print(response.text)
        # if '"result":0' in response.text and 'msg:' in response.text:
        #     logger.log(31, '无效id:', info)
        #     del self.session_id_list[self.session_id_list.index(info)]
        #     return
        if '{"msg":"","result":0,"success":true}' in note_ret:
            self.session_id_error.append(info)
            if self.session_id_error.count(info) > 5:
                logger.log(31, 'Invalid session id: ' + str(info))
                # del self.info[self.info.index(info)]
            if self.error_count > 5:
                self.change_ip()
                self.error_count = 0
            self.error_count += 1
            self.get_note(note_id, keyword, index)
            return

        json_text = json.loads(note_ret)
        # print(11111, json_text)
        data = json_text["data"][0]['note_list'][0]
        item = {}
        item['platform'] = '小红书'
        # print(222222, data)
        date_all = data['time']
        time_local = time.localtime(float(date_all))
        # convert to the target time format (e.g. 2016-05-05 20:28)
        dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        logger.log(31, "note time:    " + str(dt))
        # # time-window check ---------------
        get_news_time = time.mktime(
            time.strptime(str(dt).split(' ')[0], "%Y-%m-%d"))
        end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d"))
        if self.start_time != '':
            start_time = time.mktime(time.strptime(self.start_time,
                                                   "%Y-%m-%d"))
        else:
            start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d"))
        if float(get_news_time) < float(start_time) and index > 1:
            logger.log(31, 'Note is outside the crawl time window')
            # self.redis_example.sadd('xiaohongshu_out_day_url', note_id)
            self.is_work = False
            return
        elif float(start_time) <= float(get_news_time) <= float(end_time):

            logging.log(31, 'Note is inside the crawl time window')
            news_date = dt.split(' ')[0]
            news_time = dt.split(' ')[1]
            item['date'] = news_date
            item['time'] = news_time
            title = data['share_info']["title"]
            item['title'] = title
            item['content'] = data["desc"] + '#今日份AJ女生'
            note_id = data["id"]
            item['content_id'] = note_id
            item['article_author'] = data["user"]["nickname"]
            item['clicks'] = ''
            item['views'] = data['view_count']
            comments = data["comments_count"]
            item['comments_count'] = comments
            item['likes'] = data["liked_count"]
            item['dislikes'] = ''
            item['keyword'] = keyword
            article_url = data['share_info']["link"]
            item['article_url'] = article_url
            item['series_url'] = ''
            item['list_url'] = ''
            item['article_type'] = ''
            item['article_source'] = ''
            item['insert_time'] = str(datetime.now()).split('.')[0]
            item['update_time'] = str(datetime.now()).split('.')[0]
            item['topic_id'] = note_id
            item['author_id'] = data["user"]["id"]
            item['file_code'] = '28'
            item['reposts_count'] = data['shared_count']
            if data['topics']:
                item['topic'] = data['topics'][0]['name']
                item['get_topic_id'] = data['topics'][0]['id']
                item['get_topic_url'] = data['topics'][0]['link']
            else:
                item['topic'] = ''
                item['get_topic_id'] = ''
                item['get_topic_url'] = ''
            # if '韩束' not in item['title'] and '韩束' not in item['content']:
            #     print('检索文章没有包含关键词,判定不符合数据......')
            #     return
            # else:
            #     print('符合检索关键词的文章......')
            #     print(item)
            logging.log(31, item)
            self.write_news_jsonfile(item)
            # self.queue.put(item)
            if int(comments) > 0:
                try:
                    self.get_note_comment(note_id, keyword, article_url,
                                          news_date, news_time, title)
                except:
                    if '503 Service Temporarily' in traceback.format_exc(
                    ) or 'requests.exceptions.SSLError' in traceback.format_exc(
                    ):
                        self.change_ip()
                    logging.error(traceback.format_exc())
                    try:
                        self.get_note_comment(note_id, keyword, article_url,
                                              news_date, news_time, title)
                    except:
                        if '503 Service Temporarily' in traceback.format_exc(
                        ) or 'requests.exceptions.SSLError' in traceback.format_exc(
                        ):
                            self.change_ip()
                        logging.error(traceback.format_exc())
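
    # Hedged sketch (not in the original): the epoch-based window check used in
    # get_note()/get_note_comment() above, expressed as a small helper. Assumes
    # the same 'YYYY-MM-DD' strings held in self.start_time / self.end_time.
    def _in_window(self, day_str):
        day = time.mktime(time.strptime(day_str, "%Y-%m-%d"))
        start = time.mktime(time.strptime(self.start_time or '2010-1-1', "%Y-%m-%d"))
        end = time.mktime(time.strptime(self.end_time, "%Y-%m-%d"))
        return start <= day <= end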

    # @retry(stop_max_attempt_number=2, retry_on_exception=retry_if_key_error)
    def get_note_comment(self,
                         note_id,
                         keyword,
                         article_url,
                         news_date,
                         news_time,
                         title,
                         start='',
                         now_page=1):
        if start:
            response = self.xhsapi.get_note_comments(note_id, 20, start)
        else:
            response = self.xhsapi.get_note_comments(note_id, 20)
        # if '"result":0' in response.text and 'msg:' in response.text:
        #     del self.session_id_list[self.session_id_list.index(s)]
        #     return

        data = json.loads(response)
        # print(data)
        try:
            comment_list = data['data']["comments"]
        except:
            logging.log(31, data)
            logging.error(traceback.format_exc())
            return
        comment_count = data['data']["comment_count_l1"]

        last_comment_id = ''
        # total_item = ''
        for comment in comment_list:
            item = {}
            item['platform'] = '小红书'
            item['source_date'] = news_date
            item['source_time'] = news_time
            date_all = comment['time']
            # convert to local time
            time_local = time.localtime(float(date_all))
            # convert to the target time format (e.g. 2016-05-05 20:28)
            comment_date = time.strftime("%Y-%m-%d %H:%M",
                                         time_local)  # "%Y-%m-%d %H:%M:%S"
            # # time-window check ---------------
            # get_news_time = time.mktime(time.strptime(str(comment_date), "%Y-%m-%d %H:%M"))
            # # end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d %H:%M"))
            # if self.start_time != '':
            #     start_time = time.mktime(time.strptime(self.start_time, "%Y-%m-%d %H:%M"))
            # else:
            #     start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d %H:%M"))
            # if float(get_news_time) < float(start_time):
            #     self.is_work = False
            #     return
            #
            # if float(start_time) <= float(get_news_time):

            get_news_time = time.mktime(
                time.strptime(str(comment_date).split(' ')[0], "%Y-%m-%d"))
            end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d"))
            if self.start_time != '':
                start_time = time.mktime(
                    time.strptime(self.start_time, "%Y-%m-%d"))
            else:
                start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d"))
            if float(get_news_time) < float(start_time):
                self.is_get_comment = False  # replies come back ordered by time, so once one is older than the start time, stop fetching
                # break
            elif float(start_time) <= float(get_news_time) <= float(end_time):
                item['date'] = comment_date.split(' ')[0]
                item['time'] = comment_date.split(' ')[1]
                item['title'] = title
                item['author'] = comment['user']["nickname"]
                item['author_id'] = comment['user']["userid"]
                item['content'] = comment["content"]
                comment_id = comment["id"]
                last_comment_id = comment_id
                item['content_id'] = comment_id
                item['floor'] = ''
                item['keyword'] = keyword
                item['source_url'] = article_url
                item['comment_url'] = article_url
                item['views'] = ''
                item['comments_count'] = ''
                item['likes'] = comment["like_count"]
                item['dislikes'] = ''
                item['insert_time'] = str(datetime.now()).split('.')[0]
                item['update_time'] = str(datetime.now()).split('.')[0]
                item['topic_id'] = note_id
                item['file_code'] = '42'
                item['reposts_count'] = ''
                # print(item)
                # print(11111111, item)
                # item = json.dumps(dict(item), ensure_ascii=False) + '\n'
                # total_item = total_item + item
                # self.comment_queue.put(item)
                self.write_comment_jsonfile(item)
        # self.comment_queue.put

        # print(last_comment_id)
        all_page_num = math.ceil(float(int(comment_count) / 20))
        if int(all_page_num) > now_page and self.is_work:
            now_page += 1
            time.sleep(0.1)
            try:
                self.get_note_comment(note_id, keyword, article_url, news_date,
                                      news_time, title, last_comment_id,
                                      now_page)
            except:
                try:
                    self.get_note_comment(note_id, keyword, article_url,
                                          news_date, news_time, title,
                                          last_comment_id, now_page)
                except:
                    pass
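
    # Hedged sketch (illustrative, not part of the original): the page-count
    # arithmetic used above, math.ceil(float(int(comment_count) / 20)), done
    # with integer math only.
    @staticmethod
    def _page_count(total, page_size=20):
        return (int(total) + page_size - 1) // page_size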

    def get_user(self, user_id, page, num):
        info = random.choice(self.info)
        # info = self.sid_info
        print(self.info.index(info))
        self.xhsapi.set_smid(info['device_fingerprint'])
        self.xhsapi.set_session_id(info['sid'].split('.')[-1])
        # response = requests.get(url, params=parms, headers=headers)
        note_ret = self.xhsapi.get_user_note(user_id, page, num)
        print(1111, note_ret)
        if '参数错误' in note_ret:
            logger.log(31, 'API returned "参数错误" (parameter error); retrying.....')
            self.get_user(user_id, page, num)
            return
        # if '"result":0' in response.text and 'msg:' in response.text:
        #     logger.log(31, '无效id:', info)
        #     del self.session_id_list[self.session_id_list.index(info)]
        #     return
        if '{"msg":"","result":0,"success":true}' in note_ret:
            self.change_ip()
        #     self.session_id_error.append(info)
        #     if self.session_id_error.count(info) > 5:
        #         logger.log(31, '无效id:' + str(info))
        #         del self.session_id_list[self.session_id_list.index(info)]
        #     if self.error_count > 5:
        #         self.change_ip()
        #         self.error_count = 0
        #     self.error_count += 1
        #     self.get_user(user_id, page, num)
        #     return
        data = json.loads(note_ret)
        notes = data['data']['notes']
        if not notes:
            with open('uses_id', 'a') as f:
                f.write(user_id + '\n')
        else:
            for index, note in enumerate(notes):
                # item = {}
                # print(note)
                id = note['id']
                if not self.is_work:
                    return
                try:
                    time.sleep(1)
                    self.get_note(id, '', index)
                except:
                    if '503 Service Temporarily' in traceback.format_exc(
                    ) or 'requests.exceptions.SSLError' in traceback.format_exc(
                    ):
                        self.change_ip()
                    logging.error(traceback.format_exc())
                    try:
                        time.sleep(1)
                        self.get_note(id, '', index)
                    except:
                        if '503 Service Temporarily' in traceback.format_exc(
                        ) or 'requests.exceptions.SSLError' in traceback.format_exc(
                        ):
                            self.change_ip()
                        logging.error(traceback.format_exc())
                        try:
                            time.sleep(1)
                            self.get_note(id, '', index)
                        except:
                            if '503 Service Temporarily' in traceback.format_exc(
                            ) or 'requests.exceptions.SSLError' in traceback.format_exc(
                            ):
                                self.change_ip()
                            logging.error(traceback.format_exc())
                time.sleep(1)
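
    # Hedged sketch (not part of the original): the repeated
    # "call get_note, switch IP on 503/SSL errors, retry" blocks above written
    # as a single loop.
    def _get_note_with_retry(self, note_id, keyword, index, attempts=3):
        for _ in range(attempts):
            try:
                time.sleep(1)
                self.get_note(note_id, keyword, index)
                return
            except Exception:
                err = traceback.format_exc()
                if '503 Service Temporarily' in err or 'requests.exceptions.SSLError' in err:
                    self.change_ip()
                logging.error(err)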

    def get_topic(self, topic_id, page, num):
        """
        get topice info
        :param user_id:
        :param page:
        :param num:
        :return:
        """

        info = random.choice(self.info)
        # info = self.sid_info
        logging.log(31, self.info.index(info))
        self.xhsapi.set_smid(info['device_fingerprint'])
        self.xhsapi.set_session_id(info['sid'].split('.')[-1])
        # response = requests.get(url, params=parms, headers=headers)
        note_ret = self.xhsapi.get_topic_notes(topic_id, page, num)
        # logging.log(31, note_ret)
        if '参数错误' in note_ret:
            logger.log(31, 'API returned "参数错误" (parameter error); retrying.....')
            self.get_topic(topic_id, page, num)
            return
        # if '"result":0' in response.text and 'msg:' in response.text:
        #     logger.log(31, '无效id:', info)
        #     del self.session_id_list[self.session_id_list.index(info)]
        #     return
        if '{"msg":"","result":0,"success":true}' in note_ret:
            self.change_ip()
            #     self.session_id_error.append(info)
            #     if self.session_id_error.count(info) > 5:
            #         logger.log(31, '无效id:' + str(info))
            #         del self.session_id_list[self.session_id_list.index(info)]
            #     if self.error_count > 5:
            #         self.change_ip()
            #         self.error_count = 0
            #     self.error_count += 1
            self.get_topic(topic_id, page, num)
            return
        #     return
        data = json.loads(note_ret)
        notes = data['data']['noteList']
        if not notes:
            # with open('uses_id', 'a') as f:
            #     f.write(topic_id + '\n')
            pass
        else:
            for index, note in enumerate(notes):
                # item = {}
                # print(note)
                id = note['id']
                # if not self.is_work:
                #     return
                for i in range(10):
                    try:
                        time.sleep(1)
                        self.get_note(id, '', index)
                        break
                    except:
                        time.sleep(3)
                        logging.error(traceback.format_exc())
                time.sleep(1)

    def get_topic_v2(self, topic_id, page):
        info = random.choice(self.info)
        # info = self.sid_info
        logging.log(31, self.info.index(info))
        self.xhsapi.set_smid(info['device_fingerprint'])
        self.xhsapi.set_session_id(info['sid'].split('.')[-1])

        parms = {
            # 'sort': 'hot',  # hottest first
            'sort': 'time',  # newest first
            'page': page,
            'pageSize': '6',
            # 'sid': info['sid'],
            'sid': 'session.1570584984409448341951',
        }

        url = 'https://www.xiaohongshu.com/fe_api/burdock/v1/page/{}/notes'.format(
            topic_id)
        for i in range(10):
            res = self.xhsapi.get_xsign(url, parms)
            if len(res['shield']) == 32:
                break

        res = self.xhsapi.get_sign(url, parms)
        print(res)
        parms['sign'] = res['sign']
        headers = {
            # 'authorization': info['sid'],
            # 'device_id': info['deviceId'],
            'user-agent':
            'Dalvik/2.1.0 (Linux; U; Android 6.0; DIG-AL00 Build/HUAWEIDIG-AL00) Resolution/720*1280 Version/6.8.0.3 Build/6080103 Device/(HUAWEI;DIG-AL00) NetType/WiFi',
            'shield': res['shield'],
            'Host': 'www.xiaohongshu.com',
            'accept-encoding': 'gzip',
            'Connection': 'Keep-Alive',
        }

    # Write items to JSON files on HDFS
    def write_news_jsonfile(self, item):
        # print(item)
        logging.log(31, 'Writing article data')
        data = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./json_file/{}/28_{}_xiaohongshu_article_topic_time.json'.format(str(datetime.now()).split(' ')[0], str(datetime.now()).split(' ')[0]), 'ab') as f:
        #     f.write(data.encode("utf-8"))
        try:
            self.hdfsclient.new_write(
                '{}/{}/28_{}_{}_xiaohongshu_article_topic_time.json'.format(
                    self.file_path,
                    str(datetime.now()).split(' ')[0].replace('-', ''),
                    str(datetime.now()).split(' ')[0].replace('-', '_'),
                    self.time_time),
                data,
                encoding='utf-8')
        except urllib3.exceptions.NewConnectionError:
            # retry with the original dict so it is not serialized twice
            self.write_news_jsonfile(item)

    def write_comment_jsonfile(self, item):
        # print(item)
        logging.log(31, 'Writing comment data')
        data = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./json_file/{}/42_{}_xiaohongshu_comment_topic_time.json'.format(str(datetime.now()).split(' ')[0], str(datetime.now()).split(' ')[0]), 'ab') as f:
        #     f.write(data.encode("utf-8"))
        try:
            self.hdfsclient.new_write(
                '{}/{}/42_{}_{}_xiaohongshu_comment_topic_time.json'.format(
                    self.comment_path,
                    str(datetime.now()).split(' ')[0].replace('-', ''),
                    str(datetime.now()).split(' ')[0].replace('-', '_'),
                    self.time_time),
                data,
                encoding='utf-8')
        except urllib3.exceptions.NewConnectionError:
            # retry with the original dict so it is not serialized twice
            self.write_comment_jsonfile(item)

    def get_file_name_time(self):
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:
            num = 24
            a = str(datetime.now() - timedelta(days=1))  # 昨天时间
        num = a.split(' ')[0] + ' ' + str(num)
        return num
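
    # Hedged note (illustrative, not part of the original): get_file_name_time()
    # buckets the current hour into three-hour blocks and returns strings such as
    # '2020-03-05 9' for 09:00-11:59; between 00:00 and 02:59 it returns
    # yesterday's date followed by '24'.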

    def run(self):
        # excel_file = xlrd.open_workbook(r'./韩束小红书投放.xlsx')
        # excel_form = excel_file.sheet_by_name('KOC')
        # for i in range(2, 102):
        #     rows = excel_form.row_values(i)
        #     print(rows)
        #     name = rows[2]
        #     user_url = rows[3]
        #     flows = rows[4]
        #     likes = rows[5]
        #     direction = rows[6]
        #     date_time = rows[7]
        #     print(user_url)
        #     print(date_time)
        #     user_id = user_url.split('/')[-1].split('?')[0]
        #     self.is_work = True
        #     self.tiezi_list = []
        #     print(user_id)
        #     if len(str(date_time)) > 5:
        #         date_time = str(date_time)[:4]
        #     get_date = '2020-' + str(date_time).replace('.', '-')
        #     print(get_date)
        #     # str_time = time.mktime(time.strptime(get_date, "%Y-%m-%d"))
        #     # print(str_time)
        #     # self.start_time = get_date
        #     # self.end_time = get_date
        #     for i in range(1, 400):
        #         if not self.is_work:
        #             break
        #         try:
        #             time.sleep(1)
        #             self.get_topic(user_id, i, '10')
        #         except:
        #             if '503 Service Temporarily' in traceback.format_exc() or 'requests.exceptions.SSLError' in traceback.format_exc():
        #                 self.change_ip()
        #             try:
        #                 time.sleep(1)
        #                 self.get_topic(user_id, i, '10')
        #             except:
        #                 if '503 Service Temporarily' in traceback.format_exc() or 'requests.exceptions.SSLError' in traceback.format_exc():
        #                     self.change_ip()
        #                 try:
        #                     time.sleep(1)
        #                     self.get_topic(user_id, i, '10')
        #                 except:
        #                     if '503 Service Temporarily' in traceback.format_exc() or 'requests.exceptions.SSLError' in traceback.format_exc():
        #                         self.change_ip()
        #                     print(traceback.format_exc())
        #         time.sleep(1)

        # self.get_note('5ce2a1ea0000000006016cd9')
        #
        # self.get_comment('5ce2a1ea0000000006016cd9', 20)
        for i in range(1, 400):

            logging.log(31, 'Topic note page: ' + str(i))
            if not self.is_work and i > 3:
                break
            for j in range(10):
                try:
                    self.get_topic('5e60bd92dd0a2a00013fe218', i, 6)
                    break
                except:
                    self.change_ip()
                    logging.error(traceback.format_exc())
Ejemplo n.º 20
0
import datetime
import time
import requests
import logging
from lxml import etree
import json
from datetime import timedelta
now_table = str(datetime.datetime.now()).split(' ')[0]

ss_name = str(datetime.datetime.now()).split('.')[0]
ss_name = ss_name.replace(':', '-').split('-')
del ss_name[-1]
ss_names = "-".join(ss_name)

from with_hdfs import HdfsClient
hdfs = HdfsClient(url='http://192.168.1.209:14000', user='******')
sjc = str(time.time()).split('.')[0]
daily_date = str(datetime.datetime.now()).split(' ')[0].replace('-', '')


# Compute the crawl window automatically from the system time ---------------------------
date = datetime.datetime.now() - timedelta(days=1)
news_start_time = str(date).split(' ')[0]
now_time = str(datetime.datetime.now()).split(' ')[0]
logging.info('Crawl window: {} to {}'.format(news_start_time, now_time))

# --------------------------------------------------------------------------------------
# Timestamp marking the start of the earlier days' data to crawl
tmp_date_list = news_start_time.split('-')
dateC = datetime.datetime(int(tmp_date_list[0]), int(tmp_date_list[1]), int(tmp_date_list[2]))
news_start_time = time.mktime(dateC.timetuple())
    def __init__(self, file_path, comment_path):

        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }

        self.start_url = ''
        # 评论接口模板
        self.commnet_port_url = ''
        # # 时间判断部分
        date = datetime.now() - timedelta(days=2)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=0)  # 昨天时间
        yesterday = str(yesterday).split(' ')[0]
        # print('爬取时间段:{}到{}'.format(news_start_time, yesterday))
        #
        # logging.info('爬取时间段:{}到{}'.format(news_start_time, yesterday))
        #
        # # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # self.start_time = '2010-03-20'
        # # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = yesterday
        # self.end_time = '2019-12-09 14:08'
        # # 标记爬虫工作

        # get_now_time = time.time() - 86400  # 一天或者三小时 的秒数
        # # get_now_time = time.time() - 8640000  # 一百天
        # print(get_now_time)
        # time_local = time.localtime(float(get_now_time))
        # # 转换成新的时间格式(2016-05-05 20:28:54)
        # dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        # print(dt)
        # end_t = time.time()
        # print(end_t)
        # time_local = time.localtime(float(end_t))
        # # 转换成新的时间格式(2016-05-05 20:28:54)
        # end_dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        # print(end_dt)
        # # end_time = str(end_time).split(' ')[0]
        # print('爬取时间段:{}到{}'.format(dt, end_dt))

        # logging.info('爬取时间段:{}到{}'.format(dt, str(datetime.now())))
        # 定义开始时间 y-m-d  离现在时间远
        # self.start_time = dt
        # self.start_time = '2019-09-09 00:22'
        # 定义结束时间 y-m-d  离现在时间近
        # self.end_time = end_dt
        # self.end_time = '2019-09-16 10:22'

        self.is_work = True

        self.xhsapi = XhsApi('8ac1d719cd0a2d16')
        # 代理服务器
        proxyHost = "http-cla.abuyun.com"
        proxyPort = "9030"

        # 代理隧道验证信息
        proxyUser = "******"
        proxyPass = "******"

        proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
            "user": proxyUser,
            "pass": proxyPass,
        }

        self.proxies = {"http": proxyMeta, "https": proxyMeta}
        self.set_list = []
        self.info = seeeion_id_list
        # try:
        #     os.mkdir('./json_file/{}'.format(str(datetime.now()).split(' ')[0]))
        # except:
        #     pass

        with open('./session_id_list_topic.json') as f:
            session_id = f.read()

        self.session_id_list = eval(session_id)
        self.session_id_error = []
        self.error_count = 0
        self.file_path = file_path
        self.comment_path = comment_path
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs('{}/{}'.format(
            self.file_path,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # 创建每日文件夹
        self.hdfsclient.makedirs('{}/{}'.format(
            self.comment_path,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # 创建每日文件夹
        self.time_time = str(time.time()).split('.')[0]
Ejemplo n.º 22
0
class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self):
        # 时间判断部分
        date = datetime.datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]

        now_date = datetime.datetime.now() - timedelta(days=0)  # 当前时间
        now_time = str(now_date).split(' ')[0]
        print('爬取时间段:{}到{}'.format(news_start_time, now_time))
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = now_time
        self.is_break = False
        self.pid = os.getpid()

        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/weibo/{}'.format(
                time.strftime('%Y%m%d')))  # 创建每日文件夹

    # Clean data via try/except (as written, the assignment cannot fail, so the
    # input is returned unchanged)
    def clean_data(self, data):
        try:
            clean_data = data
        except:
            clean_data = ''
        return clean_data

    # Convert Weibo's created_at timestamp to 'YYYY-MM-DD HH:MM:SS'
    def changetime(self, timestr):
        fmt2 = '%a %b %d  %H:%M:%S %z %Y'
        timestrp = time.strptime(timestr, fmt2)
        # temp_time = time.strftime("%Y-%m-%d %H:%M:%S", timestrp)
        # logger.info(f"last time {temp_time}, continue request")
        timestampstr = time.mktime(timestrp)
        timeArray = time.localtime(int(timestampstr))
        otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        # print(otherStyleTime)  # 2013--10--10 23:40:00
        return otherStyleTime
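
    # Hedged sketch (not part of the original): a one-step version using
    # datetime.strptime, assuming created_at strings such as
    # 'Tue May 31 17:46:55 +0800 2011'. Unlike changetime() it keeps the
    # timestamp in its own UTC offset instead of converting to local time.
    @staticmethod
    def changetime_v2(timestr):
        dt = datetime.datetime.strptime(timestr, '%a %b %d %H:%M:%S %z %Y')
        return dt.strftime('%Y-%m-%d %H:%M:%S')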

    # Request the user's age (birthday) from the API
    def parse_age(self, uid):
        try:
            weibo_dict = dict()
            # parameters required by the API
            data = {
                'access_token': code,  # access token
                'uid': '{}'.format(uid),  # user id to query
            }
            url = 'https://c.api.weibo.com/2/users/birthday/other.json'
            try:
                time.sleep(0.1)
                response = requests.get(url, data, timeout=30)
            except:
                try:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
                except:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
            # print('000000000000000000', response.text)
            if 'birthday_visible' in response.text:
                # print('有用户年龄')
                age_data = json.loads(response.text)['birthday']
                birthday_visible = json.loads(
                    response.text)['birthday_visible']
                if int(birthday_visible) == 3:
                    if age_data == '':
                        weibo_dict['age'] = ''
                        # print('111111111111111111', weibo_dict)
                        self.parse_weibo(weibo_dict, uid)
                    elif int(age_data) >= 1900:
                        weibo_dict['age'] = age_data
                        # print('111111111111111111', weibo_dict)
                        self.parse_weibo(weibo_dict, uid)
                    else:
                        weibo_dict['age'] = ''
                        # print('111111111111111111', weibo_dict)
                        self.parse_weibo(weibo_dict, uid)
            else:
                weibo_dict['age'] = ''
                # print('111111111111111111', weibo_dict)
                self.parse_weibo(weibo_dict, uid)

        except:
            # retry once; if the retry fails as well, log the traceback
            try:
                self.parse_age(uid)
            except:
                print(00000000000000, traceback.format_exc())
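
    # Hedged sketch (not part of the original): a helper for the repeated
    # "try: value = item[...][...]  except: value = ''" pattern used throughout
    # parse_weibo() below, e.g.
    # weibo_dict['gender'] = self._safe_get(item, 'user', 'gender')
    @staticmethod
    def _safe_get(mapping, *keys, default=''):
        value = mapping
        for key in keys:
            try:
                value = value[key]
            except (KeyError, TypeError, IndexError):
                return default
        return value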

    # Fetch a user's Weibo timeline from the API by uid
    def parse_weibo(self, weibo_dict, uid):
        try:
            is_break = self.is_break
            date = time.strftime("%Y%m%d")
            st = int(time.mktime(time.strptime(
                date, '%Y%m%d')))  # custom start point: today at 00:00:00
            et = st - 86400  # custom end point: 24 hours earlier

            url = 'https://c.api.weibo.com/2/statuses/user_timeline/other.json'  # API endpoint
            # parameters required by the API
            data = {
                'access_token': code,  # access token
                'uid': '{}'.format(uid),
                'endtime':
                '{}'.format(st),  # on the first pass the cut-off is today's midnight
                'count': 20
            }  # records per page, capped at 100 (values above 100 are treated as 100, default 20); results run backwards in time from the end point towards the previous midnight
            try:
                time.sleep(0.1)
                response = requests.get(url, data, timeout=30)
            except:
                try:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
                except:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
            # print(weibo_dict)
            # print(response.text)
            if 'statuses' in response.text:
                data_list = json.loads(response.text, strict=False)['statuses']
                # print(len(data_list))
                for item in data_list:
                    date_time_data = item['created_at']
                    # print(self.changetime(date_time_data))
                    try:
                        date_data = self.changetime(date_time_data).split(
                            ' ')[0]
                    except:
                        date_data = ''
                    try:
                        time_data = self.changetime(date_time_data).split(
                            ' ')[1]
                    except:
                        time_data = ''
                    # print(date_data, time_data)
                    weibo_dict['platform'] = '微博'
                    weibo_dict['keyword'] = str(uid)
                    weibo_dict['date'] = date_data.strip()
                    weibo_dict['time'] = time_data.strip()
                    weibo_dict['weibo_id'] = str(item['id'])
                    weibo_dict['mid'] = str(item['mid'])
                    weibo_dict['idstr'] = str(item['idstr'])
                    try:
                        weibo_dict['content'] = item['longText'][
                            'longTextContent'].replace('\u200b', ' ').replace(
                                '\u200e', ' ').replace('\u200c',
                                                       ' ').replace('\n', ' ')
                    except:
                        weibo_dict['content'] = item['text'].replace(
                            '\u200b',
                            ' ').replace('\u200e',
                                         ' ').replace('\u200c',
                                                      ' ').replace('\n', ' ')
                    weibo_dict['source'] = item['source']
                    weibo_dict['favorited'] = item['favorited']
                    weibo_dict['truncated'] = item['truncated']
                    try:
                        location_data = item['user']['location']
                    except:
                        location_data = ''
                    try:
                        weibo_dict['province_name'] = location_data.split(
                            ' ')[0]
                        weibo_dict['address'] = location_data.split(' ')[1]
                    except:
                        weibo_dict['province_name'] = location_data
                        weibo_dict['address'] = ''
                    # print(weibo_dict['province_name'], weibo_dict['address'])
                    try:
                        weibo_dict['pinyin'] = item['pinyin']
                    except:
                        weibo_dict['pinyin'] = ''
                    weibo_dict['uid'] = str(item['user']['id'])
                    try:
                        weibo_dict['screen_name'] = item['user']['screen_name']
                    except:
                        weibo_dict['screen_name'] = ''
                    try:
                        weibo_dict['name'] = item['user']['name']
                    except:
                        weibo_dict['name'] = ''
                    try:
                        weibo_dict['province'] = item['user']['province']
                    except:
                        weibo_dict['province'] = ''
                    try:
                        weibo_dict['city'] = item['user']['city']
                    except:
                        weibo_dict['city'] = ''
                    try:
                        weibo_dict['location'] = item['user']['location']
                    except:
                        weibo_dict['location'] = ''
                    try:
                        weibo_dict['gender'] = item['user']['gender']
                    except:
                        weibo_dict['gender'] = ''
                    try:
                        weibo_dict['allow_all_act_msg'] = item['user'][
                            'allow_all_act_msg']
                    except:
                        weibo_dict['allow_all_act_msg'] = ''
                    try:
                        weibo_dict['geo_enabled'] = item['user']['geo_enabled']
                    except:
                        weibo_dict['geo_enabled'] = ''
                    try:
                        weibo_dict['verified'] = item['user']['verified']
                    except:
                        weibo_dict['verified'] = ''
                    try:
                        weibo_dict['verified_reason'] = item['user'][
                            'verified_reason']
                    except:
                        weibo_dict['verified_reason'] = ''
                    weibo_dict['likes'] = item['attitudes_count']
                    try:
                        weibo_dict['views'] = item['views']
                    except:
                        weibo_dict['views'] = ''
                    try:
                        weibo_dict['retweeted_status'] = str(
                            item['retweeted_status'])
                    except:
                        weibo_dict['retweeted_status'] = ''
                    weibo_dict['reposts_count'] = item['reposts_count']
                    weibo_dict['comments_count'] = item['comments_count']
                    weibo_dict['attitudes_count'] = item['attitudes_count']
                    weibo_dict['visible'] = str(item['visible'])
                    weibo_dict['pic_ids'] = str(item['pic_ids'])
                    try:
                        weibo_dict['ad'] = item['ad']
                    except:
                        weibo_dict['ad'] = ''
                    weibo_dict['isLongText'] = item['isLongText']
                    weibo_dict['url'] = 'http://m.weibo.cn/' + str(
                        item['user']['id']) + '/' + str(item['idstr'])
                    try:
                        weibo_dict['followers_count'] = item['user'][
                            'followers_count']
                    except:
                        weibo_dict['followers_count'] = ''
                    try:
                        weibo_dict['favourites_count'] = item['user'][
                            'favourites_count']
                    except:
                        weibo_dict['favourites_count'] = ''
                    try:
                        weibo_dict['friends_count'] = item['user'][
                            'friends_count']
                    except:
                        weibo_dict['friends_count'] = ''
                    try:
                        weibo_dict['statuses_count'] = item['user'][
                            'statuses_count']
                    except:
                        weibo_dict['statuses_count'] = ''
                    try:
                        weibo_dict['bi_followers_count'] = item['user'][
                            'bi_followers_count']
                    except:
                        weibo_dict['bi_followers_count'] = ''
                    try:
                        weibo_dict['avatar_large'] = item['user'][
                            'avatar_large']
                    except:
                        weibo_dict['avatar_large'] = ''
                    try:
                        weibo_dict['avatar_hd'] = item['user']['avatar_hd']
                    except:
                        weibo_dict['avatar_hd'] = ''
                    try:
                        weibo_dict['retweeted_time'] = item[
                            'retweeted_status']['created_at']
                    except:
                        weibo_dict['retweeted_time'] = ''
                    try:
                        weibo_dict['retweeted_post_id'] = item[
                            'retweeted_status']['id']
                    except:
                        weibo_dict['retweeted_post_id'] = ''
                    try:
                        weibo_dict['retweeted_author'] = item[
                            'retweeted_status']['in_reply_to_screen_name']
                    except:
                        weibo_dict['retweeted_author'] = ''
                    try:
                        weibo_dict['retweeted_author_id'] = item[
                            'retweeted_status']['in_reply_to_status_id']
                    except:
                        weibo_dict['retweeted_author_id'] = ''
                    try:
                        weibo_dict['profile_url'] = item['user']['profile_url']
                    except:
                        weibo_dict['profile_url'] = ''
                    try:
                        weibo_dict['domain'] = item['user']['domain']
                    except:
                        weibo_dict['domain'] = ''
                    try:
                        weibo_dict['user_url'] = item['user']['domain']
                    except:
                        weibo_dict['user_url'] = ''
                    weibo_dict['author_url'] = 'http://m.weibo.cn/' + str(
                        item['user']['id'])
                    weibo_dict['tags'] = self.parse_tags(weibo_dict)

                    # 图片列表判断
                    img_list = item['pic_ids']
                    if len(img_list) == 0:
                        weibo_dict['imageurl'] = ''
                        weibo_dict['audiourl'] = ''
                    else:
                        weibo_img = []
                        original_pic = item['original_pic'].split(
                            'large/')[0] + 'large/'
                        for img in img_list:
                            img_data = original_pic + img + '.jpg'
                            weibo_img.append(img_data)
                        weibo_dict['imageurl'] = weibo_img
                        weibo_dict['audiourl'] = ''

                    # print(weibo_dict['imageurl'])
                    self.write_goods_jsonfile(weibo_dict)
                    index_num = data_list.index(item)
                    if index_num == len(data_list) - 1:
                        # print(index_num)
                        last_time = self.changetime(
                            data_list[int(index_num)]['created_at'])
                        last_date = self.changetime(data_list[int(
                            index_num)]['created_at']).split(' ')[0]
                        # print(last_time)
                        # print(last_date)
                        if self.start_time <= last_date:
                            # 将其转换为时间数组
                            timeArray = time.strptime(last_time,
                                                      "%Y-%m-%d %H:%M:%S")
                            # 转换为时间戳:
                            timeStamp = int(time.mktime(timeArray))
                            # print('最后一个时间%s转换成时间戳是: ' % last_time, timeStamp)
                            self.parse_weibo_data(weibo_dict, uid, timeStamp)
                            # pass
                        if self.start_time > last_date:
                            is_break = True
                    if is_break:
                        break
        except:
            print('error while parsing the uid timeline:', traceback.format_exc())

    # 根据关键词搜索请求得到微博信息
    def parse_weibo_data(self, weibo_dict, uid, timeStamp):
        try:
            is_break = self.is_break
            url = 'https://c.api.weibo.com/2/search/statuses/limited.json'  # API endpoint
            # parameters carried with the API request
            data = {
                'access_token': code,  # access token
                'uid': '{}'.format(uid),
                'endtime': '{}'.format(
                    timeStamp),  # for the first request this is midnight of the current day, e.g. '2019-10-23 00:00:00'
                'count': 20
            }  # records per page: capped at 100 (values above 100 are treated as 100), default 20; results run backwards in time from endtime
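            # How paging works here: each request returns statuses created before `endtime`;
            # after a page is written out, the `created_at` of the last status is converted
            # to a Unix timestamp and passed back into parse_weibo_data as the new `endtime`,
            # so the crawl walks backwards in time until the last status is older than
            # self.start_time. The nested try/except around requests.get() below is a simple
            # retry of up to three attempts.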
            try:
                time.sleep(0.1)
                response = requests.get(url, data, timeout=30)
            except:
                try:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
                except:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
            # print(response.text)
            if 'statuses' in response.text:
                data_list = json.loads(response.text, strict=False)['statuses']
                # print(len(data_list))
                for item in data_list:
                    date_time_data = item['created_at']
                    # print(self.changetime(date_time_data))
                    try:
                        date_data = self.changetime(date_time_data).split(
                            ' ')[0]
                    except:
                        date_data = ''
                    try:
                        time_data = self.changetime(date_time_data).split(
                            ' ')[1]
                    except:
                        time_data = ''
                    # print(date_data, time_data)
                    weibo_dict['platform'] = '微博'
                    weibo_dict['keyword'] = str(uid)
                    weibo_dict['date'] = date_data.strip()
                    weibo_dict['time'] = time_data.strip()
                    weibo_dict['weibo_id'] = str(item['id'])
                    weibo_dict['mid'] = str(item['mid'])
                    weibo_dict['idstr'] = str(item['idstr'])
                    try:
                        weibo_dict['content'] = item['longText'][
                            'longTextContent'].replace('\u200b', ' ').replace(
                                '\u200e', ' ').replace('\u200c',
                                                       ' ').replace('\n', ' ')
                    except:
                        weibo_dict['content'] = item['text'].replace(
                            '\u200b',
                            ' ').replace('\u200e',
                                         ' ').replace('\u200c',
                                                      ' ').replace('\n', ' ')
                    weibo_dict['source'] = item['source']
                    weibo_dict['favorited'] = item['favorited']
                    weibo_dict['truncated'] = item['truncated']
                    try:
                        location_data = item['user']['location']
                    except:
                        location_data = ''
                    try:
                        weibo_dict['province_name'] = location_data.split(
                            ' ')[0]
                        weibo_dict['address'] = location_data.split(' ')[1]
                    except:
                        weibo_dict['province_name'] = location_data
                        weibo_dict['address'] = ''
                    # print(weibo_dict['province_name'], weibo_dict['address'])
                    try:
                        weibo_dict['pinyin'] = item['pinyin']
                    except:
                        weibo_dict['pinyin'] = ''
                    weibo_dict['uid'] = str(item['user']['id'])
                    try:
                        weibo_dict['screen_name'] = item['user']['screen_name']
                    except:
                        weibo_dict['screen_name'] = ''
                    try:
                        weibo_dict['name'] = item['user']['name']
                    except:
                        weibo_dict['name'] = ''
                    try:
                        weibo_dict['province'] = item['user']['province']
                    except:
                        weibo_dict['province'] = ''
                    try:
                        weibo_dict['city'] = item['user']['city']
                    except:
                        weibo_dict['city'] = ''
                    try:
                        weibo_dict['location'] = item['user']['location']
                    except:
                        weibo_dict['location'] = ''
                    try:
                        weibo_dict['gender'] = item['user']['gender']
                    except:
                        weibo_dict['gender'] = ''
                    try:
                        weibo_dict['allow_all_act_msg'] = item['user'][
                            'allow_all_act_msg']
                    except:
                        weibo_dict['allow_all_act_msg'] = ''
                    try:
                        weibo_dict['geo_enabled'] = item['user']['geo_enabled']
                    except:
                        weibo_dict['geo_enabled'] = ''
                    try:
                        weibo_dict['verified'] = item['user']['verified']
                    except:
                        weibo_dict['verified'] = ''
                    try:
                        weibo_dict['verified_reason'] = item['user'][
                            'verified_reason']
                    except:
                        weibo_dict['verified_reason'] = ''
                    weibo_dict['likes'] = item['attitudes_count']
                    try:
                        weibo_dict['views'] = item['views']
                    except:
                        weibo_dict['views'] = ''
                    try:
                        weibo_dict['retweeted_status'] = str(
                            item['retweeted_status'])
                    except:
                        weibo_dict['retweeted_status'] = ''
                    weibo_dict['reposts_count'] = item['reposts_count']
                    weibo_dict['comments_count'] = item['comments_count']
                    weibo_dict['attitudes_count'] = item['attitudes_count']
                    weibo_dict['visible'] = str(item['visible'])
                    weibo_dict['pic_ids'] = str(item['pic_ids'])
                    try:
                        weibo_dict['ad'] = item['ad']
                    except:
                        weibo_dict['ad'] = ''
                    weibo_dict['isLongText'] = item['isLongText']
                    weibo_dict['url'] = 'http://m.weibo.cn/' + str(
                        item['user']['id']) + '/' + str(item['idstr'])
                    try:
                        weibo_dict['followers_count'] = item['user'][
                            'followers_count']
                    except:
                        weibo_dict['followers_count'] = ''
                    try:
                        weibo_dict['favourites_count'] = item['user'][
                            'favourites_count']
                    except:
                        weibo_dict['favourites_count'] = ''
                    try:
                        weibo_dict['friends_count'] = item['user'][
                            'friends_count']
                    except:
                        weibo_dict['friends_count'] = ''
                    try:
                        weibo_dict['statuses_count'] = item['user'][
                            'statuses_count']
                    except:
                        weibo_dict['statuses_count'] = ''
                    try:
                        weibo_dict['bi_followers_count'] = item['user'][
                            'bi_followers_count']
                    except:
                        weibo_dict['bi_followers_count'] = ''
                    try:
                        weibo_dict['avatar_large'] = item['user'][
                            'avatar_large']
                    except:
                        weibo_dict['avatar_large'] = ''
                    try:
                        weibo_dict['avatar_hd'] = item['user']['avatar_hd']
                    except:
                        weibo_dict['avatar_hd'] = ''
                    try:
                        weibo_dict['retweeted_time'] = item[
                            'retweeted_status']['created_at']
                    except:
                        weibo_dict['retweeted_time'] = ''
                    try:
                        weibo_dict['retweeted_post_id'] = item[
                            'retweeted_status']['id']
                    except:
                        weibo_dict['retweeted_post_id'] = ''
                    try:
                        weibo_dict['retweeted_author'] = item[
                            'retweeted_status']['in_reply_to_screen_name']
                    except:
                        weibo_dict['retweeted_author'] = ''
                    try:
                        weibo_dict['retweeted_author_id'] = item[
                            'retweeted_status']['in_reply_to_status_id']
                    except:
                        weibo_dict['retweeted_author_id'] = ''
                    try:
                        weibo_dict['profile_url'] = item['user']['profile_url']
                    except:
                        weibo_dict['profile_url'] = ''
                    try:
                        weibo_dict['domain'] = item['user']['domain']
                    except:
                        weibo_dict['domain'] = ''
                    try:
                        weibo_dict['user_url'] = item['user']['domain']
                    except:
                        weibo_dict['user_url'] = ''
                    weibo_dict['author_url'] = 'http://m.weibo.cn/' + str(
                        item['user']['id'])
                    weibo_dict['tags'] = self.parse_tags(weibo_dict)

                    # 图片列表判断
                    img_list = item['pic_ids']
                    if len(img_list) == 0:
                        weibo_dict['imageurl'] = ''
                        weibo_dict['audiourl'] = ''
                    else:
                        weibo_img = []
                        original_pic = item['original_pic'].split(
                            'large/')[0] + 'large/'
                        for img in img_list:
                            img_data = original_pic + img + '.jpg'
                            weibo_img.append(img_data)
                        weibo_dict['imageurl'] = weibo_img
                        weibo_dict['audiourl'] = ''

                    # print(weibo_dict['imageurl'])
                    self.write_goods_jsonfile(weibo_dict)
                    index_num = data_list.index(item)
                    if index_num == len(data_list) - 1:
                        # print(index_num)
                        last_time = self.changetime(
                            data_list[int(index_num)]['created_at'])
                        last_date = self.changetime(data_list[int(
                            index_num)]['created_at']).split(' ')[0]
                        # print(last_time)
                        # print(last_date)
                        if self.start_time <= last_date:
                            # a = "2019-10-27 23:37:07"
                            # 将其转换为时间数组
                            timeArray = time.strptime(last_time,
                                                      "%Y-%m-%d %H:%M:%S")
                            # 转换为时间戳:
                            timeStamp1 = int(time.mktime(timeArray))
                            # print('最后一个时间%s转换成时间戳是: ' % last_time, timeStamp)
                            self.parse_weibo_data(weibo_dict, uid, timeStamp1)
                        if self.start_time > last_date:
                            is_break = True
                    if is_break:
                        break
        except:
            print('parse_weibo_data error:', traceback.format_exc())

    # 请求获取tags
    def parse_tags(self, weibo_dict):
        try:
            # parameters carried with the API request
            data = {
                'access_token': code,  # access token
                'uids': '{}'.format(weibo_dict['uid']),  # user id to look up tags for (not a search keyword)
            }
            url = 'https://c.api.weibo.com/2/tags/tags_batch/other.json'  # API endpoint
            try:
                time.sleep(0.1)
                response = requests.get(url, data, timeout=30)
            except:
                try:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
                except:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
            # print(response.text)
            if 'tags' in response.text:
                tags = re.search(r'"tags":\[{.*?"}\]',
                                 response.text).group().replace('"tags":', '')
                return tags
            else:
                return ''
        except:
            print('parse_tags error:', traceback.format_exc())

    # 写入json文件
    def write_goods_jsonfile(self, item):
        # print(item)
        item_data = json.dumps(dict(item), ensure_ascii=False) + '\n'
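        # One JSON object per line (JSON Lines), written straight to HDFS under
        # /user/cspider_daily/nike_daily/weibo/<YYYYMMDD>/; the commented-out block
        # below is the equivalent local-file fallback.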
        self.hdfsclient.new_write(
            '/user/cspider_daily/nike_daily/weibo/{}/104_{}_weibo_nike_uid.json'
            .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d')),
            item_data,
            encoding='utf-8')
        # item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./104_new_weibo_uid_{}.json'.format(time.strftime('%Y%m%d')), 'ab') as f:
        #     f.write(item.encode("utf-8"))

    def run(self, keyword):
        print(keyword)
        self.parse_age(keyword)
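    # Hedged usage sketch (names below are assumptions, not taken from the original file):
    # assuming the enclosing spider class can be constructed directly and that each
    # "keyword" passed to run() is a Weibo uid, a driver could look roughly like this:
    #
    #     spider = WeiboUidSpider()        # hypothetical class name
    #     for uid in ['1234567890']:       # hypothetical uid list
    #         spider.run(uid)              # run() hands the uid to parse_age()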
Ejemplo n.º 23
0
    def __init__(self, redis_example):
        self.headers = {
            'Content-Type':
            'text/html; charset=utf-8',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Cache-Control':
            'no-cache',
            # 'Cookie': 'vip_rip=101.86.55.85; vip_address=%257B%2522pname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522cname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522pid%2522%253A%2522103101%2522%252C%2522cid%2522%253A%2522103101101%2522%257D; vip_province=103101; vip_province_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_code=103101101; vip_wh=VIP_SH; mars_pid=0; mars_sid=a369b0e73f9656dbd3eda470968f6cd2; _smt_uid=5d4156d3.52d69d05; user_class=a; VipUINFO=luc%3Aa%7Csuc%3Aa%7Cbct%3Ac_new%7Chct%3Ac_new%7Cbdts%3A0%7Cbcts%3A0%7Ckfts%3A0%7Cc10%3A0%7Crcabt%3A0%7Cp2%3A0%7Cp3%3A1%7Cp4%3A0%7Cp5%3A1; vipte_viewed_=6917921732696396695%2C793920209978892%2C2161644495; visit_id=4C5B033907F8247A18F2811FF8D147F0; _jzqco=%7C%7C%7C%7C%7C1.15943944.1564563154491.1564740333894.1564740386032.1564740333894.1564740386032.0.0.0.24.24; mars_cid=1564563151837_048422ec87f93127ee1eced568a171af',
            'Host':
            'category.vip.com',
            'Pragma':
            'no-cache',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
        }
        # 时间部分,按小时抓取
        date_time = str(datetime.now() - timedelta(days=1)).split('.')[0]
        start_time_test = time.strftime('%Y-%m-%d 00:00:00')

        end_time = time.strftime('%Y-%m-%d %H:%M:%S')
        a = end_time.split(' ')[1].split(':')[0]

        if a == '00':
            start_time_data = date_time
            hours_name = '22_24'
            wen_jian_jia_date = str(datetime.now() - timedelta(
                days=1)).split('.')[0].split(' ')[0].replace('-', '')
        else:
            two_hours_ago = int(a) - 2
            if len(str(two_hours_ago)) == 1:
                two_hour_ago = '0' + str(two_hours_ago)
            else:
                two_hour_ago = str(two_hours_ago)
            hours_name = str(two_hour_ago) + '_' + str(a)
            start_time_data = start_time_test
            wen_jian_jia_date = time.strftime('%Y%m%d')
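        # hours_name labels the two-hour crawl window as '<HH-2>_<HH>' (e.g. '08_10');
        # when the run starts at midnight it falls back to yesterday's '22_24' window
        # and the folder date (wen_jian_jia_date) is set to yesterday as well.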
        print('爬取时间段:{}到{}'.format(start_time_data, end_time))
        logging.info('爬取时间段:{}到{}'.format(start_time_data, end_time))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = start_time_data
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = end_time
        # 标记爬虫工作
        self.is_break = False
        self.redis_example = redis_example
        self.pid = os.getpid()

        self.h2_name = hours_name
        self.date_time = wen_jian_jia_date
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format(
                wen_jian_jia_date, hours_name))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]
Ejemplo n.º 24
0
class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self):
        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }

        self.start_url = ''
        # 评论接口模板
        self.commnet_port_url = ''

        # 时间部分,按小时抓取
        # 爬虫开始抓取的日期
        date = datetime.now() - timedelta(days=7)
        news_start_time = str(date).split(' ')[0]

        # 爬虫结束的抓取日期
        current_time = datetime.now()  # 当前日期
        current_day = str(current_time).split(' ')[0]

        print('爬取时间段:{}到{}'.format(news_start_time, current_day))
        logging.info('爬取时间段:{}到{}'.format(news_start_time, current_day))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = current_day

        # 标记爬虫工作1
        self.is_break = False
        # 标记爬虫工作2
        self.is_work = False
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/forum/{}'.format(
                time.strftime('%Y%m%d')))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile(r'</?\w+[^>]*>')  # HTML tag pattern
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'                                -', '', ret1)
            ret3 = re.sub(
                r'                                                            ',
                '', ret2)
            ret4 = re.sub(r"hot\(.*\d?','", '', ret3)
            ret5 = re.sub(r'\[', '', ret4)
            ret6 = re.sub(r'\]', '', ret5)
            ret7 = re.sub(r"',", "", ret6)
            ret8 = re.sub(r"'", "", ret7)
            return ret8
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass
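    # e.g. re_not_number('1234人付款') -> '1234', re_not_number('第 5 页') -> '5'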

    # 匹配具体时间
    def clean_date(self, x):
        now = datetime.now()
        if str(x).find('昨天') != -1:
            x = datetime.strftime(now + timedelta(days=-1),
                                  '%Y-%m-%d %H:%M:%S')
        elif str(x).find('前天') != -1:
            x = datetime.strftime(now + timedelta(days=-2),
                                  '%Y-%m-%d %H:%M:%S')
        elif str(x).find('天前') != -1:
            x = datetime.strftime(
                now + timedelta(days=-int(str(x).replace('天前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('小时前') != -1:
            x = datetime.strftime(
                now + timedelta(hours=-int(str(x).replace('小时前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('分钟前') != -1:
            x = datetime.strftime(
                now + timedelta(minutes=-int(str(x).replace('分钟前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('今天') != -1:
            x = str(x).replace('今天', now.strftime('%Y-%m-%d') + ' ')
        elif str(x).find('刚刚') != -1:
            x = now.strftime('%Y-%m-%d %H:%M:%S')
        elif str(x).find('秒前') != -1:
            x = now.strftime('%Y-%m-%d %H:%M:%S')
        elif str(x).find('月前') != -1:
            x = datetime.strftime(
                now + timedelta(weeks=-4 * int(str(x).replace('月前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('周前') != -1:
            x = datetime.strftime(
                now + timedelta(weeks=-int(str(x).replace('周前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('[') != -1:
            x = x.replace('[', '').replace(']', '')
        elif str(x).find('月') != -1:
            x = x.replace('月', '-').replace('日', '')
        return x
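    # Behaviour of clean_date on typical relative stamps (per the branches above):
    #   '3小时前'    -> now minus 3 hours, formatted '%Y-%m-%d %H:%M:%S'
    #   '昨天 10:05' -> yesterday at the *current* clock time (the original time of day
    #                   is dropped; parse_reply re-attaches it from the raw string)
    #   '今天 10:05' -> '今天' replaced by today's date plus a space
    #   '刚刚' / '30秒前' -> the current timestamp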

    def parse_goods_id(self, key_word):
        try:
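            # Two-step search on bbs.dahe.cn: first POST the GBK-encoded keyword to the
            # search form to obtain a searchid from the redirect URL, then page through
            # the results with that searchid and a page number.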
            # key_word_data = urllib.parse.quote(key_word)
            url = 'http://bbs.dahe.cn/search.php?mod=forum'
            headers = {
                'Content-Type':
                'application/x-www-form-urlencoded',
                'Cookie':
                's8hO_404f_saltkey=tvEEW5wV; s8hO_404f_lastvisit=1568680094; s8hO_404f_sid=IHtErs; PHPSESSID=nr01ffrg19e81likscg0lmejb2; __asc=be50d61716d3cda6bb0dc6485ed; __auc=be50d61716d3cda6bb0dc6485ed; Hm_lvt_49fc517ed1175ad0089c07fe695a54c4=1568684010; s8hO_404f_lastact=1568683853%09search.php%09forum; Hm_lpvt_49fc517ed1175ad0089c07fe695a54c4=1568684168',
                'Host':
                'bbs.dahe.cn',
                'Origin':
                'http://bbs.dahe.cn',
                'Pragma':
                'no-cache',
                'Referer':
                'http://bbs.dahe.cn/search.php?mod=forum',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            form_data = {
                'formhash': '89e49222',
                'srchtxt': key_word.encode('gbk'),
                'searchsubmit': 'yes'
            }
            try:
                time.sleep(0.2)
                response = requests.post(url=url,
                                         headers=headers,
                                         data=form_data)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.post(url=url,
                                             headers=headers,
                                             proxies=proxies,
                                             data=form_data)
                except:
                    time.sleep(0.2)
                    response = requests.post(url=url,
                                             headers=headers,
                                             proxies=proxies,
                                             data=form_data)
            response.encoding = 'gbk'
            print(response.url)
            searchid = self.re_not_number(response.url.split('&')[1])
            print('关键词解析对应id是:', searchid)
            is_break = self.is_break
            insert_time = time.strftime('%Y-%m-%d %H:%M:%S')
            url = 'http://bbs.dahe.cn/search.php?mod=forum&searchid={}&orderby=dateline&ascdesc=desc&searchsubmit=yes&page={}'
            # print(url)
            headers = {
                'Content-Type':
                'text/html; charset=gbk',
                # 'Cookie': 's8hO_404f_saltkey=T4WK2597; s8hO_404f_lastvisit=1566265382; PHPSESSID=hp8k3kq01k4p4et54us1vljsu7; Hm_lvt_49fc517ed1175ad0089c07fe695a54c4=1566269243; yfx_c_g_u_id_10000033=_ck19082010472216611967379906556; __auc=d9a596fe16cacec003e8f31e310; s8hO_404f_atarget=1; __asc=cbf1082316cb721670e06723157; zycna=tzGXcwYAChsBAXxONRbq5Xoc; yfx_c_g_u_id_10000007=_ck19082210393212688365475513495; yfx_f_l_v_t_10000007=f_t_1566441572262__r_t_1566441572262__v_t_1566441572262__r_c_0; wdcid=0cb840f230762783; s8hO_404f_yy_ad_status=2; yfx_f_l_v_t_10000033=f_t_1566269242659__r_t_1566440515358__v_t_1566442626841__r_c_1; s8hO_404f_st_t=0%7C1566443342%7Ce4370d9ec8f238172511195afa70bf43; s8hO_404f_forum_lastvisit=D_1496_1566440306D_1880_1566440345D_2988_1566443342; s8hO_404f_st_p=0%7C1566443988%7C5efa9cc93f4efcd80a2db1e41de54594; s8hO_404f_visitedfid=261D2988D1889D1006D780D1875D1213D1778D1880D1496; s8hO_404f_viewid=tid_1240948; s8hO_404f_sendmail=1; s8hO_404f_sid=HXxXR3; s8hO_404f_lastact=1566444218%09search.php%09forum; Hm_lpvt_49fc517ed1175ad0089c07fe695a54c4=1566444478',
                'Host':
                'bbs.dahe.cn',
                'Pragma':
                'no-cache',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            try:
                time.sleep(0.2)
                response1 = requests.get(url=url.format(searchid, 1),
                                         headers=headers,
                                         allow_redirects=False)
            except:
                try:
                    time.sleep(0.2)
                    response1 = requests.get(url=url.format(searchid, 1),
                                             headers=headers,
                                             allow_redirects=False,
                                             proxies=proxies)
                except:
                    time.sleep(0.2)
                    response1 = requests.get(url=url.format(searchid, 1),
                                             headers=headers,
                                             allow_redirects=False,
                                             proxies=proxies)
            response1.encoding = 'gbk'
            # print(response.text)
            # 将响应转换成一个element对象
            html = etree.HTML(response1.text)
            # 获取帖子总数
            topic_num = self.re_not_number(
                self.re_html(
                    html.xpath('//div[@class="sttl mbn"]/h2/em/text()')))
            if int(topic_num) == 0:
                logger.log(
                    31, '*******-------关键词:%s 搜索不到内容-------*******' % key_word)
            else:
                # 获取帖子页数
                pages_num = int(math.ceil(float(int(topic_num) / 40)))
                logger.log(
                    31, '---关键词: %s ,搜到的帖子总数是: %s ,帖子总页数是: %s ---' %
                    (key_word, topic_num, pages_num))
                for i in range(1, int(pages_num) + 1):
                    topic_url = url.format(searchid, i)  # the template takes searchid and page number only
                    # logger.log(31, '抓取第%s页数商品数据' % i)
                    try:
                        time.sleep(0.2)
                        response2 = requests.get(url=topic_url,
                                                 headers=headers,
                                                 allow_redirects=False)
                    except:
                        try:
                            time.sleep(0.2)
                            response2 = requests.get(url=topic_url,
                                                     headers=headers,
                                                     allow_redirects=False,
                                                     proxies=proxies)
                        except:
                            time.sleep(0.2)
                            response2 = requests.get(url=topic_url,
                                                     headers=headers,
                                                     allow_redirects=False,
                                                     proxies=proxies)
                    # 将响应转换成一个element对象
                    html1 = etree.HTML(response2.text)
                    # 获取帖子列表
                    topic_list = html1.xpath('//div[@class="tl"]/div[2]/ul/li')
                    # print(len(topic_list))
                    da_he_dict = dict()
                    # 遍历帖子列表
                    for data in topic_list:
                        date_time_data = data.xpath('./p[3]/span[1]/text()')[0]
                        # print(date_time_data)
                        date_data_test = date_time_data.split(' ')[0].strip()
                        # print(date_data_test)
                        # 年, 月, 日
                        year_data = date_data_test.split('-')[0]
                        month_test = date_data_test.split('-')[1]
                        day_test = date_data_test.split('-')[2]
                        if len(month_test) == 2:
                            month_data = month_test
                        else:
                            month_data = '0' + month_test
                        if len(day_test) == 2:
                            day_data = day_test
                        else:
                            day_data = '0' + day_test
                        date_data = (year_data + '-' + month_data + '-' +
                                     day_data).strip()
                        time_data = (date_time_data.split(' ')[1] +
                                     ':00').strip()
                        if self.start_time <= date_data:
                            da_he_dict['platform'] = '大河网'
                            da_he_dict['source_date'] = date_data
                            da_he_dict['source_time'] = time_data
                            da_he_dict['date'] = date_data
                            da_he_dict['time'] = time_data
                            da_he_dict['insert_time'] = insert_time
                            da_he_dict['author'] = data.xpath(
                                './p[3]/span[2]/a/text()')[0]
                            da_he_dict[
                                'author_url'] = 'http://bbs.dahe.cn/' + data.xpath(
                                    './p[3]/span[2]/a/@href')[0]
                            da_he_dict['author_id'] = self.re_not_number(
                                data.xpath('./p[3]/span[2]/a/@href')[0])
                            da_he_dict['title'] = self.re_html(
                                data.xpath('./h3/a//text()'))
                            da_he_dict[
                                'url'] = 'http://bbs.dahe.cn/' + data.xpath(
                                    './h3/a/@href')[0]
                            da_he_dict['brand'] = ''
                            da_he_dict['carseries'] = ''
                            da_he_dict['series_url'] = ''
                            # print(da_he_dict)
                            self.parse_topic_data(da_he_dict)
                        if date_data < self.start_time:
                            is_break = True
                    if is_break:
                        break
        except:
            print('parse_goods_id error:', traceback.format_exc())

    # 解析帖子内容
    def parse_topic_data(self, da_he_dict):
        try:
            url = da_he_dict['url']
            headers = {
                'Content-Type':
                'text/html; charset=gbk',
                'Host':
                'bbs.dahe.cn',
                'Pragma':
                'no-cache',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            # print(url)
            logger.log(31, url)
            try:
                # time.sleep(0.5)
                response = requests.get(url=url,
                                        headers=headers,
                                        allow_redirects=False)
            except:
                try:
                    # time.sleep(0.5)
                    response = requests.get(url=url,
                                            headers=headers,
                                            allow_redirects=False,
                                            proxies=proxies)
                except:
                    # time.sleep(0.5)
                    response = requests.get(url=url,
                                            headers=headers,
                                            allow_redirects=False,
                                            proxies=proxies)
            response.encoding = 'gbk'
            # 将响应转换成一个element对象
            html = etree.HTML(response.text)
            # print(response.text)
            # # 获取发帖时间
            # time_data_test = self.clean_date(self.re_html(html.xpath('//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div[1]/div/div[2]/em/text()|//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div[1]/div/div[2]/em/span/text()')).replace('\\xa0', ' ').replace('发表于  ', '').replace('发表于 ', ''))
            # # print(url)
            # print(time_data_test)
            # time_data = time_data_test.split(' ')[1]
            # lang = len(time_data_test.split(':'))
            # if int(lang) == 3:
            #     time_data_1 = time_data
            # else:
            #     time_data_1 = time_data.split(':')[0] + ':' + time_data.split(':')[1] + ':' + '00'
            # print(da_he_dict['date'], '--------', time_data_1)
            # da_he_dict['source_time'] = time_data_1
            # da_he_dict['time'] = time_data_1
            # 获取浏览数,回复数
            reply_data = html.xpath(
                '//div[@id="postlist" and @class="pl bm"]/table[1]/tr/td[1]/div/span/text()'
            )
            # print(reply_data)
            da_he_dict['reply_no'] = reply_data[4]
            da_he_dict['views'] = reply_data[1]
            # 获取发帖人客户端
            post_client = html.xpath(
                '//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div[1]/div/div[2]/span[1]/a//text()'
            )
            if post_client == []:
                da_he_dict['post_client'] = ''
            else:
                da_he_dict['post_client'] = post_client[0] + post_client[1]
            da_he_dict['content'] = self.re_html(
                html.xpath(
                    '//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div[2]/div/div[1]/table/tr/td//text()'
                )).replace('\\r',
                           '').replace('\\n',
                                       '').replace('\\u3000',
                                                   '').replace('\\xa0', '')
            da_he_dict['imageurl'] = html.xpath(
                '//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div[2]/div/div[1]//img/@src'
            )
            da_he_dict['audiourl'] = ''
            da_he_dict['content_id'] = da_he_dict['url'].split('-')[1]
            da_he_dict['from'] = ''
            da_he_dict['is_topics'] = '是'
            da_he_dict['floor'] = html.xpath(
                '//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div/strong/a/text()'
            )[0].strip()
            da_he_dict['identification'] = ''
            da_he_dict['favorite'] = ''
            da_he_dict['signin_time'] = ''
            da_he_dict['likes'] = ''
            # 判断是否是热帖
            is_elite = html.xpath(
                '//div[@id="postlist" and @class="pl bm"]/div[1]/div/img/@title'
            )
            if is_elite == []:
                da_he_dict['is_elite'] = '否'
            else:
                da_he_dict['is_elite'] = '是'
            da_he_dict['topic_count'] = ''
            da_he_dict['reply_count'] = ''
            da_he_dict['pick_count'] = ''
            da_he_dict['follows'] = ''
            da_he_dict['topic_categroy'] = ''
            da_he_dict['topic_type'] = ''
            da_he_dict['reposts_count'] = ''
            da_he_dict['update_time'] = time.strftime('%Y-%m-%d %H:%M:%S')
            da_he_dict['topic_id'] = da_he_dict['url'].split('-')[1]
            da_he_dict['file_code'] = '182'
            # logger.log(31, '----------------正在写入主贴----------------')
            # print(da_he_dict)
            item = json.dumps(dict(da_he_dict), ensure_ascii=False) + '\n'
            self.hdfsclient.new_write(
                '/user/cspider_daily/nike_daily/forum/{}/182_{}_{}_dahe_Nike.json'
                .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d'),
                        self.time_data),
                item,
                encoding='utf-8')

            if int(da_he_dict['reply_no']) == 0:
                # logger.log(31, '没有回帖')
                pass
            else:
                # 获取回帖页数
                pages_num = int(
                    math.ceil(float(int(da_he_dict['reply_no']) / 10)))
                # logger.log(31, '回帖数: %s 回帖总页数是:%s' % (da_he_dict['reply_no'], pages_num))
                self.parse_reply(pages_num, da_he_dict)
        except:
            print('parse_topic_data error:', traceback.format_exc())

    # 抓取回帖内容
    def parse_reply(self, pages_num, da_he_dict):
        try:
            is_work = self.is_work
            start_time = time.strftime('%Y-%m-%d %H:%M:%S')
            headers = {
                'Content-Type':
                'text/html; charset=gbk',
                'Host':
                'bbs.dahe.cn',
                'Pragma':
                'no-cache',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            for i in range(pages_num, 0, -1):
                url = 'http://bbs.dahe.cn/thread-{}-{}-1.html'.format(
                    da_he_dict['topic_id'], i)
                try:
                    # time.sleep(1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            allow_redirects=False)
                except:
                    try:
                        # time.sleep(1)
                        response = requests.get(url=url,
                                                headers=headers,
                                                allow_redirects=False,
                                                proxies=proxies)
                    except:
                        # time.sleep(1)
                        response = requests.get(url=url,
                                                headers=headers,
                                                allow_redirects=False,
                                                proxies=proxies)
                response.encoding = 'gbk'
                # 将响应转换成一个element对象
                html = etree.HTML(response.text)
                reply_dict = dict()
                # 获取回帖列表
                reply_list = html.xpath(
                    '//div[@id="postlist" and @class="pl bm"]/div')
                # print(len(reply_list))
                for item in reply_list[::-1]:
                    floor_data = self.re_html(
                        item.xpath(
                            './table/tr[1]/td[2]/div/strong/a/text()|./tr[1]/td[2]/div[1]/strong/a//text()'
                        )).replace('\\r', '').replace('\\n', '').replace(
                            '#', '').replace(' ', '')
                    # print(floor_data)
                    url_data = response.url
                    floor_test = floor_data
                    date_time_test = item.xpath(
                        './table/tr[1]/td[2]/div[1]/div/div[2]/em/text()|./table/tr[1]/td[2]/div[1]/div/div[2]/em/span/text()'
                    )
                    # print(date_time_test)
                    if date_time_test == []:
                        pass
                    else:
                        date_time_data = self.re_html(date_time_test).replace(
                            '发表于  ', '').replace('\\xa0',
                                                 ' ').replace('发表于 ', '')
                        # print(date_time_data)
                        if re.search(r'前天|昨天', date_time_data) != None:
                            datetime_data = self.clean_date(
                                date_time_data.split(' ')[0]).split(' ')[
                                    0] + ' ' + date_time_data.split(' ')[1]
                        elif re.search(r'天前', date_time_data) != None:
                            datetime_data = self.clean_date(date_time_data)
                        else:
                            datetime_data = date_time_data
                        # print(datetime_data)
                        # 发表日期
                        date_data = datetime_data.split(' ')[0].strip()
                        date_data_test = date_data.split('-')
                        if len(date_data_test[1]) == 1 and len(
                                date_data_test[2]) == 1:
                            date_data_parse = date_data_test[
                                0] + '-0' + date_data_test[
                                    1] + '-0' + date_data_test[2]
                        elif len(date_data_test[1]) == 1 and len(
                                date_data_test[2]) != 1:
                            date_data_parse = date_data_test[
                                0] + '-0' + date_data_test[
                                    1] + '-' + date_data_test[2]
                        elif len(date_data_test[1]) != 1 and len(
                                date_data_test[2]) == 1:
                            date_data_parse = date_data_test[
                                0] + '-' + date_data_test[
                                    1] + '-0' + date_data_test[2]
                        else:
                            date_data_parse = date_data_test[
                                0] + '-' + date_data_test[
                                    1] + '-' + date_data_test[2]
                        # 发表时间
                        time_data_test = datetime_data.split(' ')[1]
                        lang = len(time_data_test.split(':'))
                        if int(lang) == 3:
                            time_data = time_data_test.strip()
                        else:
                            time_data = (time_data_test.split(':')[0] + ':' +
                                         time_data_test.split(':')[1] + ':' +
                                         '00').strip()
                        # print(date_data, '*******', time_data)
                        if self.start_time <= date_data_parse.strip():
                            reply_dict['platform'] = da_he_dict['platform']
                            reply_dict['source_date'] = da_he_dict['date']
                            reply_dict['source_time'] = da_he_dict['time']
                            reply_dict['date'] = date_data_parse.strip()
                            reply_dict['time'] = time_data
                            reply_dict['author'] = item.xpath(
                                './table/tr[1]/td[1]/div/div[1]/div/a/text()'
                            )[0]
                            reply_dict[
                                'author_url'] = 'http://bbs.dahe.cn/' + item.xpath(
                                    './table/tr[1]/td[1]/div/div[1]/div/a/@href'
                                )[0]
                            reply_dict['author_id'] = self.re_not_number(
                                item.xpath(
                                    './table/tr[1]/td[1]/div/div[1]/div/a/@href'
                                )[0])
                            reply_dict['post_client'] = da_he_dict[
                                'post_client']
                            reply_dict['title'] = da_he_dict['title']
                            reply_dict['content'] = self.re_html(
                                item.xpath(
                                    './table/tr[1]/td[2]/div[2]/div/div[1]/table/tr/td//text()'
                                )).replace('\\r', '')
                            reply_dict['imageurl'] = ''
                            reply_dict['audiourl'] = ''
                            reply_dict['content_id'] = self.re_not_number(
                                item.xpath('./@id')[0])
                            reply_dict['brand'] = ''
                            reply_dict['carseries'] = ''
                            reply_dict['from'] = ''
                            reply_dict['series_url'] = ''
                            reply_dict['url'] = url_data
                            reply_dict['is_topics'] = '否'
                            reply_dict['floor'] = floor_test
                            reply_dict['identification'] = ''
                            reply_dict['favorite'] = ''
                            reply_dict['signin_time'] = ''
                            reply_dict['reply_no'] = ''
                            reply_dict['views'] = ''
                            reply_dict['likes'] = ''
                            reply_dict['is_elite'] = da_he_dict['is_elite']
                            reply_dict['topic_count'] = ''
                            reply_dict['reply_count'] = ''
                            reply_dict['pick_count'] = ''
                            reply_dict['follows'] = ''
                            reply_dict['topic_categroy'] = ''
                            reply_dict['topic_type'] = ''
                            reply_dict['reposts_count'] = ''
                            reply_dict['insert_time'] = start_time
                            reply_dict['update_time'] = time.strftime(
                                '%Y-%m-%d %H:%M:%S')
                            reply_dict['topic_id'] = da_he_dict['topic_id']
                            reply_dict['file_code'] = '182'
                            # logger.log(31, '******************开始写入回帖数据**********************')
                            # print(reply_dict)
                            item = json.dumps(dict(reply_dict),
                                              ensure_ascii=False) + '\n'
                            self.hdfsclient.new_write(
                                '/user/cspider_daily/nike_daily/forum/{}/182_{}_{}_dahe_Nike.json'
                                .format(time.strftime('%Y%m%d'),
                                        time.strftime('%Y%m%d'),
                                        self.time_data),
                                item,
                                encoding='utf-8')

                        if date_data_parse.strip() < self.start_time:
                            is_work = True
                    if is_work:
                        break
        except:
            print('parse_reply error:', traceback.format_exc())

    # 读取excel获取关键词
    def parse_xlsx(self):
        # 设置路径
        path = './快消采集关键词_v12_20200119.xlsx'
        # 打开execl
        workbook = xlrd.open_workbook(path)

        # 根据sheet索引或者名称获取sheet内容
        Data_sheet = workbook.sheets()[0]  # 通过索引获取

        rowNum = Data_sheet.nrows  # sheet行数
        colNum = Data_sheet.ncols  # sheet列数

        # collect the contents of every cell (use `rows`, not the built-in name `list`)
        rows = []
        for i in range(rowNum):
            rowlist = []
            for j in range(colNum):
                rowlist.append(Data_sheet.cell_value(i, j))
            rows.append(rowlist)

        # skip the header row
        for data in rows[1:]:
            brand = data[0]
            # print(brand)
            yield {
                '关键词': brand,
            }
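    # parse_xlsx skips the header row and yields one {'关键词': <brand>} dict per row;
    # run() below collects them and passes each keyword to parse_goods_id().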

    def run(self):
        key_word_list = []
        for item in self.parse_xlsx():
            # print(item)
            key_word_list.append(item)
        for item_data in key_word_list:
            time.sleep(10)
            # print(item_data['关键词'])
            self.parse_goods_id(item_data['关键词'])
Ejemplo n.º 25
0
class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, redis_example):
        # 时间部分,按小时抓取
        date_time = str(datetime.now() - timedelta(days=1)).split('.')[0]
        start_time_test = time.strftime('%Y-%m-%d 00:00:00')

        end_time = time.strftime('%Y-%m-%d %H:%M:%S')
        a = end_time.split(' ')[1].split(':')[0]

        if a == '00':
            start_time_data = date_time
            hours_name = '22_24'
            wen_jian_jia_date = str(datetime.now() - timedelta(
                days=1)).split('.')[0].split(' ')[0].replace('-', '')
        else:
            two_hours_ago = int(a) - 2
            if len(str(two_hours_ago)) == 1:
                two_hour_ago = '0' + str(two_hours_ago)
            else:
                two_hour_ago = str(two_hours_ago)
            hours_name = str(two_hour_ago) + '_' + str(a)
            start_time_data = start_time_test
            wen_jian_jia_date = time.strftime('%Y%m%d')
        print('爬取时间段:{}到{}'.format(start_time_data, end_time))
        logging.info('爬取时间段:{}到{}'.format(start_time_data, end_time))
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = start_time_data
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = end_time

        # 标记爬虫工作
        self.is_break = False
        self.redis_example = redis_example
        self.pid = os.getpid()

        self.h2_name = hours_name
        self.date_time = wen_jian_jia_date
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format(
                wen_jian_jia_date, hours_name))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile(r'</?\w+[^>]*>')  # HTML tag pattern
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            ret4 = re.sub(r'品牌:', '', ret3)
            ret5 = re.sub(r'\xa0', '', ret4)
            ret6 = re.sub(r'&rarr;_&rarr;', '', ret5)
            ret7 = re.sub(r'&hellip;&hellip;', '', ret6)
            ret8 = re.sub(r'":', '', ret7)
            return ret8
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 过滤url里面的#detail
    def re_detail(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'#detail', '', message)
            return ret1
        except:
            pass

    # 过滤品牌
    def re_pin_pai(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile(r'</?\w+[^>]*>')  # HTML tag pattern
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'<li title.*?>', '', ret1)
            ret3 = re.sub(r'品牌:&nbsp;', '', ret2)
            return ret3
        except:
            pass

    # 解析请求得到的商品信息
    def parse_goods_url(self, data):
        goods_dict = dict()
        goods_dict['平台'] = '淘宝'
        goods_dict['URL'] = data['URL']
        goods_dict['商品名'] = data['商品名']
        try:
            goods_dict['品牌'] = data['品牌']
        except:
            goods_dict['品牌'] = ''
        goods_dict['价格'] = data['价格']
        goods_dict['shop_name'] = data['shop_name']
        goods_dict['月销量'] = data['月销量'].replace('人付款', '')
        goods_dict['关键词'] = data['关键词']
        goods_dict['itemId'] = data['itemId']
        goods_dict['sellerId'] = data['sellerId']
        goods_dict['imageurl'] = data['商品图片']
        goods_dict['audiourl'] = ''
        # logger.log(31, '************************正在抓取的商品是:%s................' % goods_dict)
        self.goods_collection_num(goods_dict)

    # 抓取商品收藏数(人气)
    def goods_collection_num(self, goods_dict):
        try:
            url = 'https://count.taobao.com/counter3?callback=jsonp235&keys=ICCP_1_{}'.format(
                goods_dict['itemId'])
            headers = {
                'content-type': 'application/x-javascript',
                'cookie':
                't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _uab_collina=155540168306791903478476; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _m_h5_tk=f32553e159b195a4f17c00010f2bcd2e_1564547678304; _m_h5_tk_enc=3268f7bf49fd78b94768c96e3ef51817; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b227365617263686170703b32223a2266303666326434356635336264366335613639393662663834646632366531644349582b684f6f46454c47473571714e726f325955526f4c4f4459314d5467794d6a4d334f7a453d227d; l=cBaOcPD7qg21z_uyBOfZKurza779uIdf1sPzaNbMiICPO_fh5wONWZFb8t8MCnGVLsI2535t6zUaBXYaGyUIh2nk8b8CgsDd.; isg=BOzsOXiZnf59JomJ--wm9a9SvcreDZFEZ8nHSkYsbxe0UY9baraQ30WjcVnMWcin; JSESSIONID=A9F406FD84CDFD576728A12ECBD98A53',
                'upgrade-insecure-requests': '1',
                'user-agent': random.choice(user_agent_list)
            }

            try:
                time.sleep(0.3)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.3)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.3)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            re_collection_num = re.search(r'":\d{1,20}', response.text)
            # print(re_collection_num.group())
            goods_dict['人气数'] = self.re_html(re_collection_num.group())
            # print(goods_dict)
            response.close()
            self.parse_goods_comment_num(goods_dict)
        except:
            print(444444444444444444, traceback.format_exc())

    # Fetch the goods review count
    def parse_goods_comment_num(self, goods_dict):
        try:
            url = 'https://rate.taobao.com/detailCommon.htm?auctionNumId={}&userNumId={}'.format(
                goods_dict['itemId'], goods_dict['sellerId'])
            headers = {
                'cookie':
                't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB',
                'pragma': 'no-cache',
                'upgrade-insecure-requests': '1',
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer':
                'https://item.taobao.com/item.htm?spm=a230r.1.14.31.26804e4c03W4qw&id=563490255667&ns=1&abbucket=1',
                'User-Agent': random.choice(user_agent_list)
            }
            try:
                time.sleep(0.2)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            # print('11111')
            # print(response.text)
            if 'total' in response.text:
                re_comment_num = json.loads(
                    response.text.replace('(', '').replace(')', ''))
                # print(re_comment_num)
                goods_dict['评价人数'] = re_comment_num['data']['count']['total']
                # print(goods_dict['评价人数'])
                if int(self.re_not_number(goods_dict['评价人数'])) == 0:
                    # logger.log(31, 'This item has no review data')
                    pass
                else:
                    itemId = goods_dict['itemId']
                    sellerId = goods_dict['sellerId']
                    headers1 = {
                        'cookie':
                        't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB',
                        'pragma': 'no-cache',
                        'upgrade-insecure-requests': '1',
                        'Content-Type': 'application/x-www-form-urlencoded',
                        'Referer':
                        'https://item.taobao.com/item.htm?spm=a230r.1.14.31.26804e4c03W4qw&id=563490255667&ns=1&abbucket=1',
                        'User-Agent': random.choice(user_agent_list)
                    }
                    comment_url = 'https://rate.taobao.com/feedRateList.htm?auctionNumId={}&userNumId={}&currentPageNum=1&pageSize=20&rateType=&orderType=feedbackdate&attribute=&sku=&hasSku=false&folded=0'.format(
                        itemId, sellerId)
                    try:
                        time.sleep(0.3)
                        response1 = requests.get(url=comment_url,
                                                 headers=headers1,
                                                 proxies=proxies,
                                                 allow_redirects=False,
                                                 timeout=30)
                    except:
                        try:
                            time.sleep(0.3)
                            response1 = requests.get(url=comment_url,
                                                     headers=headers1,
                                                     proxies=proxies,
                                                     allow_redirects=False,
                                                     timeout=30)
                        except:
                            time.sleep(0.3)
                            response1 = requests.get(url=comment_url,
                                                     headers=headers1,
                                                     proxies=proxies,
                                                     allow_redirects=False,
                                                     timeout=30)
                    re_pages = re.search(
                        r'{"qnaDisabled":true,"watershed":.*"maxPage":.*}',
                        response1.text)
                    comment_nums = json.loads(re_pages.group())['total']
                    if int(comment_nums) == 0:
                        pass
                    else:
                        pages_num = int(
                            math.ceil(float(int(comment_nums) / 20)))
                        response.close()
                        response1.close()
                        self.goods_comments(goods_dict, pages_num)
        except:
            print(5555555555555555555555, traceback.format_exc())

    # Parse goods reviews
    def goods_comments(self, goods_dict, pages_num):
        try:
            is_break = self.is_break
            # print(goods_dict)
            itemId = goods_dict['itemId']
            sellerId = goods_dict['sellerId']

            headers = {
                'cookie':
                't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB',
                'pragma': 'no-cache',
                'upgrade-insecure-requests': '1',
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer':
                'https://item.taobao.com/item.htm?spm=a230r.1.14.31.26804e4c03W4qw&id=563490255667&ns=1&abbucket=1',
                'User-Agent': random.choice(user_agent_list)
            }
            # print('Total number of review pages: %s' % pages_num)
            # Crawl the review pages (up to 99 pages, starting from page 1)
            for i in range(1, int(pages_num) + 1):
                comment_url = 'https://rate.taobao.com/feedRateList.htm?auctionNumId={}&userNumId={}&currentPageNum={}&pageSize=20&rateType=&orderType=feedbackdate&attribute=&sku=&hasSku=false&folded=0'.format(
                    itemId, sellerId, i)
                # print(comment_url)
                # response = requests.get(url=comment_url, headers=headers, proxies=random.choice(proxies), timeout=10)
                try:
                    time.sleep(0.3)
                    response = requests.get(url=comment_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    try:
                        time.sleep(0.3)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                    except:
                        time.sleep(0.3)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                comment_data = response.text
                # print('Start crawling the reviews')
                # print(comment_data)
                comment = re.search(
                    r'{"qnaDisabled":true,"watershed":.*"maxPage":.*}',
                    comment_data)
                # print(comment.group())
                items = json.loads(comment.group())['comments']
                # print(items)
                goods_data = dict()
                for item in items:
                    # if item['date'] != None:
                    # time_test = item['date'].split(' ')[0].replace('年', '-').replace('月', '-').replace('日', '') + ' ' + item['date'].split(' ')[1] + ':00'
                    date_data = item['date'].split(' ')[0].replace(
                        '年', '-').replace('月', '-').replace('日', '')
                    try:
                        time_data = item['date'].split(' ')[1] + ':00'
                    except:
                        time_data = '00:00:00'
                    # print('review time', date_data, time_data)
                    try:
                        content = item['content']
                    except:
                        content = ''
                    # Follow-up (appended) review
                    try:
                        comments_2 = item['appendList'][0]['content']
                    except:
                        comments_2 = ''
                    time_test = date_data + ' ' + time_data
                    # Check whether the review time falls inside the crawl window
                    if self.start_time <= time_test:
                        goods_data['platform'] = goods_dict['平台']
                        goods_data['date'] = date_data.strip()
                        goods_data['time'] = time_data.strip()
                        goods_data['keyword'] = goods_dict['关键词']
                        goods_data['name'] = goods_dict['商品名']
                        goods_data['url'] = goods_dict['URL']
                        goods_data['shop_name'] = goods_dict['shop_name']
                        goods_data['user_name'] = item['user']['nick']
                        goods_data['content'] = content + ';' + comments_2
                        goods_data['content_id'] = str(item['rateId'])
                        goods_data['brand'] = goods_dict['品牌']
                        goods_data['price'] = goods_dict['价格']
                        goods_data['sales'] = goods_dict['月销量']
                        goods_data['focus_count'] = goods_dict['人气数']
                        goods_data['comment_num'] = goods_dict['评价人数']
                        goods_data['views'] = ''
                        goods_data['likes'] = item['useful']
                        goods_data['comments_count'] = ''
                        goods_data['author_id'] = ''
                        goods_data['reposts_count'] = ''
                        goods_data['topic_id'] = str(goods_dict['itemId'])
                        # Parse type and size from the SKU string, e.g. '颜色分类:黑色高帮&nbsp;&nbsp尺码:37'
                        test_data = item['auction']['sku']
                        if '码' in test_data:
                            goods_data['type'] = test_data.split(
                                ':')[1].replace('尺码', '').replace(
                                    '&nbsp;&nbsp',
                                    '').replace('鞋码', '').replace(';尺码', '')
                            goods_data['size'] = test_data.split(':')[2]
                        else:
                            goods_data['type'] = ''
                            goods_data['size'] = ''
                        # print('Start writing goods data')
                        # print(goods_data)
                        goods_data['imageurl'] = goods_dict['imageurl']
                        goods_data['audiourl'] = goods_dict['audiourl']
                        goods_data['file_code'] = '55'
                        # logger.log(31, 'Start writing goods data')
                        # print(goods_data)
                        item = json.dumps(dict(goods_data),
                                          ensure_ascii=False) + '\n'
                        self.hdfsclient.new_write(
                            '/user/cspider_daily/nike_2h/ecommerce/{}/{}/55_{}_TaoBao_nike{}.json'
                            .format(self.date_time, self.h2_name,
                                    time.strftime('%Y%m%d'), str(self.pid)),
                            item,
                            encoding='utf-8')
                    if date_data.strip() < self.start_time:
                        is_break = True
                if is_break:
                    break
        except:
            print(7777777777777777777, traceback.format_exc())
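
    # Hedged sketch (hypothetical helper, not part of the original class): each request above is
    # retried by nesting try/except blocks three levels deep; the same behaviour could be written
    # once. `request_with_retry` is a name introduced here for illustration only.
    def request_with_retry(self, url, headers, retries=3, delay=0.3):
        # Try the GET up to `retries` times, sleeping `delay` seconds before each attempt, and
        # re-raise the last error so the caller's except block still sees the failure.
        for attempt in range(retries):
            try:
                time.sleep(delay)
                return requests.get(url=url, headers=headers, proxies=proxies,
                                    allow_redirects=False, timeout=30)
            except Exception:
                if attempt == retries - 1:
                    raise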

    def run(self, lock):
        for i in range(1000000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('tao_bao_2h')
            if str(redis_url_num) == '0':
                print(
                    '****** The Redis URL queue is empty, the program is waiting... process {} is waiting... ******'.format(
                        str(os.getpid())))
            item = self.redis_example.brpop('tao_bao_2h', timeout=3600)[1]
            lock.release()
            item1 = json.loads(item.decode())
            # print(item)
            self.parse_goods_url(item1)
Ejemplo n.º 26
0
def get_file_name_time():
    a = str(datetime.now())
    hour = a.split(' ')[-1].split(':')[0]
    num = int(hour) / 3
    num = int(num) * 3
    if num == 0:
        num = 24
        a = str(datetime.now() - timedelta(days=1))  # yesterday
    num = a.split(' ')[0] + ' ' + str(num)
    return num
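
# Usage note (illustrative): get_file_name_time() returns 'YYYY-MM-DD N' where N is the current
# three-hour block (3, 6, ..., 21), or yesterday's date with 24 for the 00:00-02:59 block;
# a call at 14:05 would return something like '2019-10-29 12'.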


file_path = '/user/cspider_daily/nike_2h/qa'
file_name_time = get_file_name_time()
hdfsclient = HdfsClient(url='http://jq-chance-05:9870', user='******')
hour = str(datetime.now()).split(' ')[-1].split(':')[0]
if str(hour) != '00':
    two_hour_ago = int(hour) - 2
    if len(str(two_hour_ago)) == 1:
        two_hour_ago = '0' + str(two_hour_ago)
    hour_name = str(two_hour_ago) + '_' + str(hour)
else:
    hour_name = '22_24'
hdfsclient.makedirs('{}/{}/{}'.format(
    file_path,
    file_name_time.split(' ')[0].replace('-', ''), hour_name))  # create the daily folder
time_time = str(time.time()).split('.')[0]


def app_run(keyword, queue, all_set_list):
Ejemplo n.º 27
0
class Spider(object):
    """
    A spider template.
    """
    def __init__(self, redis_example):
        # Time section: crawl in hourly batches
        date_time = str(datetime.now() - timedelta(days=1)).split('.')[0]
        start_time_test = time.strftime('%Y-%m-%d 00:00:00')

        end_time = time.strftime('%Y-%m-%d %H:%M:%S')
        a = end_time.split(' ')[1].split(':')[0]

        if a == '00':
            start_time_data = date_time
            hours_name = '22_24'
            wen_jian_jia_date = str(datetime.now() - timedelta(
                days=1)).split('.')[0].split(' ')[0].replace('-', '')
        else:
            two_hours_ago = int(a) - 2
            if len(str(two_hours_ago)) == 1:
                two_hour_ago = '0' + str(two_hours_ago)
            else:
                two_hour_ago = str(two_hours_ago)
            hours_name = str(two_hour_ago) + '_' + str(a)
            start_time_data = start_time_test
            wen_jian_jia_date = time.strftime('%Y%m%d')
        print('Crawl window: {} to {}'.format(start_time_data, end_time))
        logging.info('Crawl window: {} to {}'.format(start_time_data, end_time))

        # Start of the crawl window (y-m-d, the earlier bound)
        self.start_time = start_time_data
        # End of the crawl window (y-m-d, the later bound)
        self.end_time = end_time
        # Flag for whether the spider is still working
        self.is_work = True
        self.redis_example = redis_example
        self.pid = os.getpid()

        self.h2_name = hours_name
        self.date_time = wen_jian_jia_date
        # Connect to HDFS
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format(
                wen_jian_jia_date, hours_name))  # create the daily folder
        self.time_data = str(time.time()).split('.')[0]

    # Strip all HTML tags
    def re_html(self, data):
        # Remove HTML markup and stray escapes from scraped text
        try:
            message = str(data)
            re_h = re.compile(r'</?\w+[^>]*>')  # HTML tag pattern
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            ret4 = re.sub(r'品牌:', '', ret3)
            ret5 = re.sub(r'\xa0', '', ret4)
            ret6 = re.sub(r'&rarr;_&rarr;', '', ret5)
            ret7 = re.sub(r'&hellip;&hellip;', '', ret6)
            ret8 = re.sub(r'":', '', ret7)
            return ret8
        except:
            pass

    # Keep only the digits in the monthly sales figure
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # Convert a 13-digit (millisecond) timestamp to a date string
    def time_change(self, data):
        timeStamp = float(int(data) / 1000)
        timeArray = time.localtime(timeStamp)
        otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        return otherStyleTime
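
    # Usage note (illustrative): time_change converts a 13-digit millisecond timestamp into a
    # local-time string, e.g. time_change(1564567924778) -> '2019-07-31 18:12:04' on a UTC+8 host.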

    # Fetch the brand info
    def parse_brand(self, goods_dict):
        try:
            headers = {
                'content-type':
                'text/html;charset=UTF-8',
                'cookie':
                'kaola_user_key=b87e28b9-e7fc-43ba-8ca7-42abae97a079; _ntes_nnid=116c0ca91001bfb53c23f45f9e55ac87,1568617522153; _ga=GA1.2.290138937.1568617522; _klhtxd_=31; _ga=GA1.3.290138937.1568617522; __da_ntes_utma=2525167.417648162.1568617522.1568617522.1568617522.1; davisit=1; __da_ntes_utmz=2525167.1568617522.1.1.; __da_ntes_utmfc=utmcsr%3D(direct)%7Cutmccn%3D(direct)%7Cutmcmd%3D(none); usertrack=CrGZAV2DFzgLhl54AwtSAg==; KAOLA_NEW_USER_COOKIE=yes; cna=MQj5FQMZD0sCAXxONRZeF0y0; WM_TID=beYPJ03r5ilFBUFUFEZo5jCUV1mKk4PC; t=cf5d799c2331f5cabed38ae64e05e79e; KAOLA_USER_ID=109999078912652422; [email protected]; JSESSIONID-WKL-8IO=0zc3WMz%2Bz0rQe5Jcv1xai4OAOScJJgZviUPXMI3RUo2IYlneCBZYhem2pXj85vvoJ8Z%2B2yMxkJZ%2BDbqGhohayCkj0RWfrbvXgwt00Wju%2BMWVg7WjBsfPPuM6Bq0yJI1vkeq%5C17ndJLsLrHGeY1Sf0k231zopBvGmtXomvGZ5J9TWLbPq%3A1586842936344; davisit=2; __da_ntes_utmb=2525167.1.10.1586756536; _samesite_flag_=true; cookie2=1f50b0bd27965ea6d4731440eb0ab6b2; _tb_token_=57e48eee49e7; csg=7c23ee4b; NTES_OSESS=REPpP5MMDS0ti.Kjs4kXCagwqwIe5DsWd2J6spGZnnoVWWhz6L9pI2HlXPVp_85PuZGCsnYofZ0FK56aZ.uX88iBgdi0zJZsRBB8fdi_YIZfYxQlVYg4kvmcVqVCqK9kxhu.Yzv4Avj3rW.UPrCYFGfnrd5TZovCzX0lNqe3j5rAEWHpYRLXj1PsCx_75evCuvl01iv5jej2sgH2yqYAm2a0p; kaola_csg=93dad892; kaola-user-beta-traffic=12217883524; firstLogin=0; hb_MA-AE38-1FCC6CD7201B_source=search.kaola.com; NTES_KAOLA_RV=1537539_1586756945560_0|2884042_1586756792280_0|5522516_1586513810003_0|5705591_1585881322711_0|8317307_1585880658885_0|5553701_1585880652352_0|8517421_1585879009306_0|1467929_1571291229258_0|5218698_1569811431977_0|5536790_1569811422334_0|5457794_1569811411408_0|5115159_1569811404628_0|2843760_1569566707083_0|5481268_1569489750583_0|2723610_1569488978899_0|2546067_1569485553114_0|1758828_1569485116618_0|1616628_1569482665961_0|5111078_1569482641632_0|2482224_1569482624326_0; isg=BHV1IQtJR6edB6MO8FzlgBdJhPHvWigPBiZuAfeb4ewRzpfAv0AP1GEMGNLdjkG8',
                'pragma':
                'no-cache',
                'referer':
                'https://search.kaola.com/search.html?key=AlphaBounce&oldQuery=AIR%2520MAX&searchRefer=searchbutton&zn=top',
                'upgrade-insecure-requests':
                '1',
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            url = goods_dict['url']
            try:
                # time.sleep(0.2)
                response1 = requests.get(url=url,
                                         headers=headers,
                                         proxies=proxies,
                                         allow_redirects=False,
                                         timeout=30)
            except:
                try:
                    # time.sleep(0.2)
                    response1 = requests.get(url=url,
                                             headers=headers,
                                             proxies=proxies,
                                             allow_redirects=False,
                                             timeout=30)
                except:
                    # time.sleep(0.2)
                    response1 = requests.get(url=url,
                                             headers=headers,
                                             proxies=proxies,
                                             allow_redirects=False,
                                             timeout=30)
            html1 = etree.HTML(response1.text)
            # Brand
            try:
                goods_dict['brand'] = html1.xpath(
                    '//dt[@class="orig-country"]/a/text()')[0].split(' ')[0]
            except:
                goods_dict['brand'] = ''
            # print(goods_dict)
            self.goods_comments(goods_dict)
        except:
            print(9999999999999999999999, traceback.format_exc())

    # Fetch the first page of goods reviews
    def goods_comments(self, goods_dict):
        try:
            if int(goods_dict['achieve_num']) == 0:
                pass
                # logger.log(31, 'No reviews for this item')
            else:
                goods_id = goods_dict['goods_id']
                comment_url = 'https://goods.kaola.com/commentAjax/comment_list_new.json'
                # print(comment_url, goods_id)
                headers = {
                    'authority':
                    'goods.kaola.com',
                    'method':
                    'POST',
                    'path':
                    '/commentAjax/comment_list_new.json',
                    'scheme':
                    'https',
                    'accept':
                    '*/*',
                    'accept-encoding':
                    'gzip, deflate, br',
                    'accept-language':
                    'zh-CN,zh;q=0.9',
                    'cache-control':
                    'no-cache',
                    'content-length':
                    '220',
                    'content-type':
                    'application/x-www-form-urlencoded',
                    'cookie':
                    'kaola_user_key=b87e28b9-e7fc-43ba-8ca7-42abae97a079; _ntes_nnid=116c0ca91001bfb53c23f45f9e55ac87,1568617522153; _ga=GA1.2.290138937.1568617522; _klhtxd_=31; _ga=GA1.3.290138937.1568617522; __da_ntes_utma=2525167.417648162.1568617522.1568617522.1568617522.1; davisit=1; __da_ntes_utmz=2525167.1568617522.1.1.; __da_ntes_utmfc=utmcsr%3D(direct)%7Cutmccn%3D(direct)%7Cutmcmd%3D(none); usertrack=CrGZAV2DFzgLhl54AwtSAg==; KAOLA_NEW_USER_COOKIE=yes; cna=MQj5FQMZD0sCAXxONRZeF0y0; WM_TID=beYPJ03r5ilFBUFUFEZo5jCUV1mKk4PC; t=cf5d799c2331f5cabed38ae64e05e79e; KAOLA_USER_ID=109999078912652422; [email protected]; JSESSIONID-WKL-8IO=0zc3WMz%2Bz0rQe5Jcv1xai4OAOScJJgZviUPXMI3RUo2IYlneCBZYhem2pXj85vvoJ8Z%2B2yMxkJZ%2BDbqGhohayCkj0RWfrbvXgwt00Wju%2BMWVg7WjBsfPPuM6Bq0yJI1vkeq%5C17ndJLsLrHGeY1Sf0k231zopBvGmtXomvGZ5J9TWLbPq%3A1586842936344; davisit=2; __da_ntes_utmb=2525167.1.10.1586756536; _samesite_flag_=true; cookie2=1f50b0bd27965ea6d4731440eb0ab6b2; _tb_token_=57e48eee49e7; csg=7c23ee4b; NTES_OSESS=REPpP5MMDS0ti.Kjs4kXCagwqwIe5DsWd2J6spGZnnoVWWhz6L9pI2HlXPVp_85PuZGCsnYofZ0FK56aZ.uX88iBgdi0zJZsRBB8fdi_YIZfYxQlVYg4kvmcVqVCqK9kxhu.Yzv4Avj3rW.UPrCYFGfnrd5TZovCzX0lNqe3j5rAEWHpYRLXj1PsCx_75evCuvl01iv5jej2sgH2yqYAm2a0p; kaola_csg=93dad892; kaola-user-beta-traffic=12217883524; firstLogin=0; hb_MA-AE38-1FCC6CD7201B_source=search.kaola.com; NTES_KAOLA_RV=1537539_1586756945560_0|2884042_1586756792280_0|5522516_1586513810003_0|5705591_1585881322711_0|8317307_1585880658885_0|5553701_1585880652352_0|8517421_1585879009306_0|1467929_1571291229258_0|5218698_1569811431977_0|5536790_1569811422334_0|5457794_1569811411408_0|5115159_1569811404628_0|2843760_1569566707083_0|5481268_1569489750583_0|2723610_1569488978899_0|2546067_1569485553114_0|1758828_1569485116618_0|1616628_1569482665961_0|5111078_1569482641632_0|2482224_1569482624326_0; isg=BHV1IQtJR6edB6MO8FzlgBdJhPHvWigPBiZuAfeb4ewRzpfAv0AP1GEMGNLdjkG8',
                    'origin':
                    'https://goods.kaola.com',
                    'pragma':
                    'no-cache',
                    'referer':
                    'https://goods.kaola.com/review/{}.html'.format(
                        str(goods_id)),
                    'user-agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
                    'x-requested-with':
                    'XMLHttpRequest'
                }
                form_data = {
                    'goodsId': '{}'.format(str(goods_id)),
                    'grade': '0',
                    'tagType': '0',
                    'hasContent': '0',
                    'paginationContext': 'null',
                    'pageNo': '1',
                    'pageSize': '20',
                }
                try:
                    # time.sleep(0.2)
                    response = requests.post(url=comment_url,
                                             headers=headers,
                                             data=form_data,
                                             proxies=proxies,
                                             allow_redirects=False,
                                             timeout=30)
                except:
                    try:
                        # time.sleep(0.2)
                        response = requests.post(url=comment_url,
                                                 headers=headers,
                                                 data=form_data,
                                                 proxies=proxies,
                                                 allow_redirects=False,
                                                 timeout=30)
                    except:
                        # time.sleep(0.2)
                        response = requests.post(url=comment_url,
                                                 headers=headers,
                                                 data=form_data,
                                                 proxies=proxies,
                                                 allow_redirects=False,
                                                 timeout=30)
                # print(response.text)
                data = json.loads(response.text)
                # Get the review list
                comments_list = data['data']['commentPage']['result']
                if int(len(comments_list)) == 0:
                    return
                else:
                    # Current page number
                    page_data = data['data']['commentPage']['pageNo']
                    # Total number of review pages
                    pages_num = data['data']['commentPage']['totalPage']
                    # logger.log(31, 'Page 1 of reviews')
                    for item in comments_list:
                        kao_la_dict = dict()
                        time_data = self.time_change(item['createTime'])
                        # print(data_time_data)
                        try:
                            content = item['commentContent'].replace('\n', ' ')
                        except:
                            content = ''
                        # Follow-up (appended) review
                        try:
                            comments_2 = item['replyList'][0]['replyContent']
                        except:
                            comments_2 = ''
                        if self.start_time <= time_data:
                            kao_la_dict['platform'] = goods_dict['platform']
                            kao_la_dict['date'] = time_data.split(' ')[0]
                            kao_la_dict['time'] = time_data.split(' ')[1]
                            kao_la_dict['keyword'] = goods_dict['keyword']
                            kao_la_dict['name'] = goods_dict['name']
                            kao_la_dict['imageurl'] = goods_dict['商品图片']
                            kao_la_dict['audiourl'] = ''
                            kao_la_dict['url'] = goods_dict['url']
                            kao_la_dict['shop_name'] = goods_dict['shop_name']
                            kao_la_dict['user_name'] = ''
                            kao_la_dict['content'] = content + ';' + comments_2
                            kao_la_dict['content_id'] = str(
                                item['goodsCommentId'])
                            kao_la_dict['brand'] = goods_dict['brand']
                            kao_la_dict['price'] = goods_dict['price']
                            kao_la_dict['sales'] = goods_dict['sales']
                            kao_la_dict['focus_count'] = ''
                            kao_la_dict['comment_num'] = goods_dict[
                                'achieve_num']
                            kao_la_dict['views'] = ''
                            kao_la_dict['likes'] = item['zanCount']
                            kao_la_dict['comments_count'] = ''
                            kao_la_dict['author_id'] = ''
                            kao_la_dict['reposts_count'] = ''
                            kao_la_dict['topic_id'] = str(item['goodsId'])
                            try:
                                kao_la_dict['type'] = item['skuPropertyList'][
                                    1]['propertyValue']
                            except:
                                kao_la_dict['type'] = ''
                            try:
                                kao_la_dict['size'] = item['skuPropertyList'][
                                    0]['propertyValue']
                            except:
                                kao_la_dict['size'] = ''
                            kao_la_dict['file_code'] = '176'
                            # print(kao_la_dict)
                            item = json.dumps(dict(kao_la_dict),
                                              ensure_ascii=False) + '\n'
                            self.hdfsclient.new_write(
                                '/user/cspider_daily/nike_2h/ecommerce/{}/{}/176_{}_KaoLa_nike{}.json'
                                .format(self.date_time, self.h2_name,
                                        time.strftime('%Y%m%d'), self.pid),
                                item,
                                encoding='utf-8')
                        else:
                            pass
                    if int(page_data) < int(pages_num):
                        # Get the id of the last review on page 1 and the cursor for the next page
                        lastId = data['data']['paginationContext']['lastId']
                        lastPage = data['data']['paginationContext'][
                            'lastPage']
                        # print(lastId, lastPage)
                        self.goods_comments_2(lastId, lastPage, goods_id,
                                              goods_dict,
                                              int(page_data) + 1)
                    else:
                        pass
        except:
            print(22222222222222222, traceback.format_exc())

    # Fetch every review page after the first
    def goods_comments_2(self, lastId, lastPage, goods_id, goods_dict, i):
        try:
            comment_url = 'https://goods.kaola.com/commentAjax/comment_list_new.json'
            # print(comment_url, goods_id, lastId, lastPage)
            headers = {
                'authority':
                'goods.kaola.com',
                'method':
                'POST',
                'path':
                '/commentAjax/comment_list_new.json',
                'scheme':
                'https',
                'accept':
                '*/*',
                'accept-encoding':
                'gzip, deflate, br',
                'accept-language':
                'zh-CN,zh;q=0.9',
                'cache-control':
                'no-cache',
                'content-length':
                '247',
                'content-type':
                'application/x-www-form-urlencoded',
                'cookie':
                'kaola_user_key=b87e28b9-e7fc-43ba-8ca7-42abae97a079; _ntes_nnid=116c0ca91001bfb53c23f45f9e55ac87,1568617522153; _ga=GA1.2.290138937.1568617522; _klhtxd_=31; _ga=GA1.3.290138937.1568617522; __da_ntes_utma=2525167.417648162.1568617522.1568617522.1568617522.1; davisit=1; __da_ntes_utmz=2525167.1568617522.1.1.; __da_ntes_utmfc=utmcsr%3D(direct)%7Cutmccn%3D(direct)%7Cutmcmd%3D(none); usertrack=CrGZAV2DFzgLhl54AwtSAg==; KAOLA_NEW_USER_COOKIE=yes; cna=MQj5FQMZD0sCAXxONRZeF0y0; WM_TID=beYPJ03r5ilFBUFUFEZo5jCUV1mKk4PC; t=cf5d799c2331f5cabed38ae64e05e79e; KAOLA_USER_ID=109999078912652422; [email protected]; JSESSIONID-WKL-8IO=0zc3WMz%2Bz0rQe5Jcv1xai4OAOScJJgZviUPXMI3RUo2IYlneCBZYhem2pXj85vvoJ8Z%2B2yMxkJZ%2BDbqGhohayCkj0RWfrbvXgwt00Wju%2BMWVg7WjBsfPPuM6Bq0yJI1vkeq%5C17ndJLsLrHGeY1Sf0k231zopBvGmtXomvGZ5J9TWLbPq%3A1586842936344; davisit=2; __da_ntes_utmb=2525167.1.10.1586756536; _samesite_flag_=true; cookie2=1f50b0bd27965ea6d4731440eb0ab6b2; _tb_token_=57e48eee49e7; csg=7c23ee4b; NTES_OSESS=REPpP5MMDS0ti.Kjs4kXCagwqwIe5DsWd2J6spGZnnoVWWhz6L9pI2HlXPVp_85PuZGCsnYofZ0FK56aZ.uX88iBgdi0zJZsRBB8fdi_YIZfYxQlVYg4kvmcVqVCqK9kxhu.Yzv4Avj3rW.UPrCYFGfnrd5TZovCzX0lNqe3j5rAEWHpYRLXj1PsCx_75evCuvl01iv5jej2sgH2yqYAm2a0p; kaola_csg=93dad892; kaola-user-beta-traffic=12217883524; firstLogin=0; hb_MA-AE38-1FCC6CD7201B_source=search.kaola.com; NTES_KAOLA_RV=1537539_1586756945560_0|2884042_1586756792280_0|5522516_1586513810003_0|5705591_1585881322711_0|8317307_1585880658885_0|5553701_1585880652352_0|8517421_1585879009306_0|1467929_1571291229258_0|5218698_1569811431977_0|5536790_1569811422334_0|5457794_1569811411408_0|5115159_1569811404628_0|2843760_1569566707083_0|5481268_1569489750583_0|2723610_1569488978899_0|2546067_1569485553114_0|1758828_1569485116618_0|1616628_1569482665961_0|5111078_1569482641632_0|2482224_1569482624326_0; isg=BHV1IQtJR6edB6MO8FzlgBdJhPHvWigPBiZuAfeb4ewRzpfAv0AP1GEMGNLdjkG8',
                'origin':
                'https://goods.kaola.com',
                'pragma':
                'no-cache',
                'referer':
                'https://goods.kaola.com/review/{}.html'.format(str(goods_id)),
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
                'x-requested-with':
                'XMLHttpRequest'
            }
            form_data = {
                'goodsId': '{}'.format(str(goods_id)),
                'grade': '0',
                'tagType': '0',
                'hasContent': '0',
                'showSelfGoodsComment': 'false',
                'paginationContext': {
                    "lastId": '{}'.format(lastId),
                    "lastPage": '{}'.format(lastPage)
                },
                'pageNo': '{}'.format(i),
                'pageSize': '20',
                'hasInitCommentTab': 'true'
            }
            try:
                # time.sleep(0.2)
                response = requests.post(url=comment_url,
                                         headers=headers,
                                         data=form_data,
                                         proxies=proxies,
                                         allow_redirects=False,
                                         timeout=30)
            except:
                try:
                    # time.sleep(0.2)
                    response = requests.post(url=comment_url,
                                             headers=headers,
                                             data=form_data,
                                             proxies=proxies,
                                             allow_redirects=False,
                                             timeout=30)
                except:
                    # time.sleep(0.2)
                    response = requests.post(url=comment_url,
                                             headers=headers,
                                             data=form_data,
                                             proxies=proxies,
                                             allow_redirects=False,
                                             timeout=30)
            data = json.loads(response.text)
            # print(data)
            # Get the review list
            comments_list = data['data']['commentPage']['result']
            # logger.log(31, 'Page {} of reviews'.format(i))
            if int(len(comments_list)) == 0:
                return
            else:
                # Current page number
                page_data = data['data']['commentPage']['pageNo']
                # Total number of review pages
                pages_num = data['data']['commentPage']['totalPage']
                for item in comments_list:
                    kao_la_goods = dict()
                    time_data = self.time_change(item['createTime'])
                    try:
                        content = item['commentContent'].replace('\n', ' ')
                    except:
                        content = ''
                    # Follow-up (appended) review
                    try:
                        comments_2 = item['replyList'][0]['replyContent']
                    except:
                        comments_2 = ''
                    if self.start_time <= time_data:
                        kao_la_goods['platform'] = goods_dict['platform']
                        kao_la_goods['date'] = time_data.split(' ')[0]
                        kao_la_goods['time'] = time_data.split(' ')[1]
                        kao_la_goods['keyword'] = goods_dict['keyword']
                        kao_la_goods['name'] = goods_dict['name']
                        kao_la_goods['imageurl'] = goods_dict['商品图片']
                        kao_la_goods['audiourl'] = ''
                        kao_la_goods['url'] = goods_dict['url']
                        kao_la_goods['shop_name'] = goods_dict['shop_name']
                        kao_la_goods['user_name'] = ''
                        kao_la_goods['content'] = content + ';' + comments_2
                        kao_la_goods['content_id'] = str(
                            item['goodsCommentId'])
                        kao_la_goods['brand'] = goods_dict['brand']
                        kao_la_goods['price'] = goods_dict['price']
                        kao_la_goods['sales'] = goods_dict['sales']
                        kao_la_goods['focus_count'] = ''
                        kao_la_goods['comment_num'] = goods_dict['achieve_num']
                        kao_la_goods['views'] = ''
                        kao_la_goods['likes'] = item['zanCount']
                        kao_la_goods['comments_count'] = ''
                        kao_la_goods['author_id'] = ''
                        kao_la_goods['reposts_count'] = ''
                        kao_la_goods['topic_id'] = str(item['goodsId'])
                        try:
                            kao_la_goods['type'] = item['skuPropertyList'][1][
                                'propertyValue']
                        except:
                            kao_la_goods['type'] = ''
                        try:
                            kao_la_goods['size'] = item['skuPropertyList'][0][
                                'propertyValue']
                        except:
                            kao_la_goods['size'] = ''
                        kao_la_goods['file_code'] = '176'
                        # print(kao_la_goods)
                        item = json.dumps(dict(kao_la_goods),
                                          ensure_ascii=False) + '\n'
                        self.hdfsclient.new_write(
                            '/user/cspider_daily/nike_2h/ecommerce/{}/{}/176_{}_KaoLa_nike{}.json'
                            .format(self.date_time, self.h2_name,
                                    time.strftime('%Y%m%d'), self.pid),
                            item,
                            encoding='utf-8')
                    else:
                        pass

                if int(page_data) < int(pages_num):
                    # Get the id of the last review on this page and the cursor for the next page
                    lastId = data['data']['paginationContext']['lastId']
                    lastPage = data['data']['paginationContext']['lastPage']
                    i += 1
                    self.goods_comments_2(lastId, lastPage, goods_id,
                                          goods_dict, i)
                else:
                    pass
        except:
            print(3333333333333333333, traceback.format_exc())
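
    # Hedged sketch (hypothetical helper, not part of the original class): goods_comments_2 pages
    # by calling itself once per page, so an item with very many review pages could approach
    # Python's default recursion limit (~1000 frames). The same lastId/lastPage cursor can be
    # followed with a plain loop; `fetch_page` is any callable taking (page_no, last_id, last_page)
    # and returning the parsed JSON shown above.
    def iter_comment_pages(self, fetch_page):
        # Yield the comment list of every page, following the cursor iteratively instead of recursing.
        page_no, last_id, last_page = 1, 'null', 'null'
        while True:
            data = fetch_page(page_no, last_id, last_page)
            page = data['data']['commentPage']
            if not page['result']:
                return
            yield page['result']
            if int(page['pageNo']) >= int(page['totalPage']):
                return
            ctx = data['data']['paginationContext']
            last_id, last_page = ctx['lastId'], ctx['lastPage']
            page_no += 1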

    # # Read the keywords from the Excel file
    # def parse_xlsx(self):
    #     # Path to the workbook
    #     path = './快消采集关键词_0916_v3-1.xlsx'
    #     # Open the workbook
    #     workbook = xlrd.open_workbook(path)
    #
    #     # Get the sheet by index or by name
    #     Data_sheet = workbook.sheets()[0]  # by index
    #
    #     rowNum = Data_sheet.nrows  # number of rows
    #     colNum = Data_sheet.ncols  # number of columns
    #
    #     # Read the contents of every cell
    #     list = []
    #     for i in range(rowNum):
    #         rowlist = []
    #         for j in range(colNum):
    #             rowlist.append(Data_sheet.cell_value(i, j))
    #         list.append(rowlist)
    #
    #     for data in list[1::]:
    #         brand = data[0]
    #         # print(brand)
    #         yield {
    #             '关键词': brand,
    #         }

    def run(self, lock):
        for num in range(1000000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('kaola_2h_url')
            if str(redis_url_num) == '0':
                print(
                    '****** The Redis URL queue is empty... process {} has finished crawling ******'
                    .format(str(os.getpid())))
                return
            item = self.redis_example.brpop('kaola_2h_url', timeout=3600)[1]
            lock.release()
            item1 = json.loads(item.decode())
            # print(item)
            self.parse_brand(item1)
Ejemplo n.º 28
0
    def __init__(self, file_path, comment_path, need_time):

        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }
        # Comment API URL template
        self.comment_port_url = 'http://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/{}/comments/newList?ibc=newspc&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&offset={}&callback=jsonp_1542355418897&_=1542355418898'

        # # Time window section
        # date = datetime.now() - timedelta(days=30)
        # news_start_time = str(date).split(' ')[0]
        # yesterday = datetime.now() - timedelta(days=1)  # yesterday
        # yesterday = str(yesterday).split(' ')[0]

        # # Start of the crawl window (y-m-d, the earlier bound): news_start_time
        # self.start_time = news_start_time
        # # End of the crawl window (y-m-d, the later bound): yesterday
        # self.end_time = yesterday
        # print('Crawl window: {} to {}'.format(self.start_time, self.end_time))
        #
        # logging.info('Crawl window: {} to {}'.format(self.start_time, self.end_time))
        # # Time range for crawling comments
        # # self.comment_start_time = yesterday  # replies from one day
        # self.comment_start_time = '2019-08-01'  # replies from one day
        # # self.comment_start_time = ''  # replies with no time limit
        #
        # self.comment_end_time = yesterday
        # # self.comment_end_time = yesterday
        # get_now_time = time.time() - 86400
        get_now_time = time.time() - int(need_time)
        time_local = time.localtime(float(get_now_time))
        # Convert to a formatted time string (e.g. 2016-05-05 20:28:54)
        dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        end_t = time.time()
        time_local = time.localtime(float(end_t))
        # Convert to a formatted time string (e.g. 2016-05-05 20:28:54)
        end_dt = time.strftime("%Y-%m-%d %H:%M",
                               time_local)  # "%Y-%m-%d %H:%M:%S"
        # end_time = str(end_time).split(' ')[0]
        logging.log(31, 'Crawl window: {} to {}'.format(dt, str(datetime.now())))
        # Start of the crawl window (the earlier bound)
        self.start_time = dt
        # self.start_time = '2019-09-09 00:01'
        # End of the crawl window (the later bound)
        self.end_time = end_dt
        # self.end_time = '2019-09-16 12:57'
        # Flag for whether the spider is still working
        self.is_work = True
        self.file_name_time = self.get_file_name_time()
        self.file_path = file_path
        self.comment_apth = comment_path
        self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000',
                                     user='******')
        self.hdfsclient.makedirs('{}/{}'.format(
            self.file_path,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # create the daily folder
        self.hdfsclient.makedirs('{}/{}'.format(
            self.comment_apth,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # create the daily folder
        self.time_time = str(time.time()).split('.')[0]
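
        # Usage note (illustrative): need_time is a window length in seconds, e.g. need_time=7200
        # puts self.start_time two hours before self.end_time (both formatted '%Y-%m-%d %H:%M').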
Ejemplo n.º 29
0
import uuid
import sys

canss = sys.argv[1]

ss_name = str(datetime.datetime.now()).split('.')[0]
ss_name = ss_name.replace(':', '-').split('-')
del ss_name[-1]
ss_names = "-".join(ss_name)

zhu_date_st = 'a'
zhu_time_st = 'b'

from with_hdfs import HdfsClient

hdfs = HdfsClient(url='http://192.168.1.209:14000', user='******')
sjc = str(time.time()).split('.')[0]
daily_date = str(datetime.datetime.now() -
                 timedelta(hours=2)).split(' ')[0].replace('-', '')

st_1 = str(datetime.datetime.now()).split(' ')[1].split(':')[0]
st_2 = str(datetime.datetime.now() -
           timedelta(hours=2)).split(' ')[1].split(':')[0]
st_3 = st_2 + '_' + st_1

# Compute the time window automatically from the system time ---------------------------------------
date = datetime.datetime.now() - timedelta(days=1)
news_start_time = str(date).split(' ')[0]
now_time = str(datetime.datetime.now()).split(' ')[0]
logging.info('Crawl window: {} to {}'.format(news_start_time, now_time))
Ejemplo n.º 30
0
class YiDianSpider(object):
    def __init__(self, file_path, comment_path):
        self.headers_two = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            # 'Connection':'keep-alive',
            'Cookie':
            'cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%2216730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201542776168%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201542776168%7D%7D; UM_distinctid=16730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461; JSESSIONID=208cee9fea61049d61e7d18f9e9c275ecf530a9e308a94dde36658adc01a0594; wuid=154945905891357; wuid_createAt=2018-11-21 12:56:9',
            'Host': 'www.baidu.com',
            'Referer': 'http://www.yidianzixun.com/channel/c11',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.proxies = ['218.95.55.154:4243']

        # Deduplication list (already-seen URLs)
        self.set_list = []
        #
        self.error_url_list = []
        self.headers_one = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Host':
            'www.baidu.com',
            # 'Proxy-Connection': 'keep-alive',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }
        self.user_agent = [
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6',
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6',
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1',
        ]

        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:  # handle the midnight (hour 0) case
            # time window section
            date = datetime.now() - timedelta(days=1)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=1)  # yesterday
            yesterday = str(yesterday).split(' ')[0]
        else:
            # time window section
            date = datetime.now() - timedelta(days=0)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=0)  # yesterday
            yesterday = str(yesterday).split(' ')[0]
        # Start of the crawl window (y-m-d, the earlier bound): news_start_time
        self.start_time = news_start_time
        # End of the crawl window (y-m-d, the later bound): yesterday
        self.end_time = yesterday
        try:
            self.page_ip = proxies.res_ip()
            print('ip: ', self.page_ip)
            # self.page_ip = '116.248.160.138:4261'
        except:
            time.sleep(3)
            print('Error while requesting a proxy IP: {}'.format(traceback.format_exc()))
            logger.error('Error while requesting a proxy IP: {}'.format(traceback.format_exc()))
            self.page_ip = proxies.res_ip()
        self.ip_count = 0

        # Time range for crawling comments
        # self.comment_start_time = yesterday  # replies from one day
        self.comment_start_time = ''  # replies with no time limit
        self.comment_end_time = yesterday
        # self.comment_end_time = yesterday
        self.is_get_comment = True

        self.file_name_time = self.get_file_name_time()
        self.file_path = file_path
        self.comment_apth = comment_path
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        hour = str(datetime.now()).split(' ')[-1].split(':')[0]
        if str(hour) != '00':
            two_hour_ago = int(hour) - 2
            if len(str(two_hour_ago)) == 1:
                two_hour_ago = '0' + str(two_hour_ago)
            self.hour_name = str(two_hour_ago) + '_' + str(hour)
        else:
            self.hour_name = '22_24'
        self.hdfsclient.makedirs('{}/{}/{}'.format(
            self.file_path,
            self.file_name_time.split(' ')[0].replace('-', ''),
            self.hour_name))  # create the daily folder
        self.hdfsclient.makedirs('{}/{}/{}'.format(
            self.comment_apth,
            self.file_name_time.split(' ')[0].replace('-', ''),
            self.hour_name))  # create the daily folder
        self.time_time = str(time.time()).split('.')[0]

    def get_channel_id(self):
        url = 'http://www.yidianzixun.com/channel/c11'
        try:
            response = requests.get(url,
                                    proxies={'http': self.page_ip},
                                    timeout=30)
            data = response.content.decode()
            data = re.search('channel_id(.*?)汽车', data).group(0)
            channel_id = re.search(r'\d{8,15}', data).group(0)
            cookies = response.headers['Set-Cookie']
            print(cookies)
            session_id = re.search(r'JSESSIONID=([a-z0-9]{30,80});', cookies).group(1)

            return channel_id, session_id
        except:
            print(traceback.format_exc())

            if self.ip_count < 10:
                self.page_ip = proxies.res_ip()
                print('switching ip: ', self.page_ip)
                self.ip_count += 1
                time.sleep(5)
                return self.get_channel_id()  # propagate the retried result to the caller
            else:
                raise IndexError

    def get_news_list_port(self, url):
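        """Request the channel news-list JSON endpoint, then fetch every
        unseen article URL with get_news_page_info (docids containing 'V_'
        are skipped)."""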
        headers_port = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Host': 'www.yidianzixun.com',
            'Connection': 'keep-alive',
            # 'Upgrade-Insecure-Requests': '1',
            'Referer': 'http://www.yidianzixun.com/',
            'User-Agent':
            'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Mobile Safari/537.36',
            'Cookie':
            'wuid=289836166779454; wuid_createAt=2019-10-29 16:11:43; Hm_lvt_15fafbae2b9b11d280c79eff3b840e45=1572336703; Hm_lpvt_15fafbae2b9b11d280c79eff3b840e45=1572336703; JSESSIONID=7c64fe11dc634f7bce6816ca76a196fb915ea8d8a307a0a41b26269846df44ef',
            'X-Requested-With': 'XMLHttpRequest'
        }

        # headers_port['Cookie'] = cookie
        print(url)
        response = requests.get(url,
                                headers=headers_port,
                                proxies={'http': self.page_ip})
        # print(response.url)
        # print(response.text)
        data = response.content.decode()
        data = json.loads(data)
        data = data['result']
        # print(data)
        for news in data:
            item = {}
            try:
                title = news['title']
            except:
                continue
            item['title'] = title
            itemid = news['docid']
            url = 'http://www.yidianzixun.com/article/' + itemid
            print(url)
            news_date = news['date']
            if 'V_' not in itemid:
                if url not in self.set_list:
                    # self.write_news_jsonfile(item)
                    try:
                        self.get_news_page_info(url)
                    except:
                        print(traceback.format_exc())
                    self.set_list.append(url)

    # Compute the spt token value normally generated by page JavaScript
    def get_spt(self, start, channel_id):
        # start = 10
        end = start + 10
        n = "/home/q/news_list_for_channel?channel_id=11756176923&cstart=0&cend=10&infinite=true&refresh=1&__from__=pc&multi=5"
        e = str(channel_id)
        # ctx = execjs.compile(
        #     '''
        #     function good (n,e,i,t){
        #         for (var o = "sptoken", a = "", c = 1; c < arguments.length; c++){
        #             o += arguments[c];
        #         }
        #         for (var c = 0; c < o.length; c++) {
        #             var r = 10 ^ o.charCodeAt(c);
        #             a += String.fromCharCode(r)
        #         }
        #         return a
        #     }
        #     '''
        # )
        # spt = ctx.call('good', n, e, start, end)
        # return spt
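        # A minimal pure-Python sketch of the disabled JS routine above
        # (assumption: concatenate "sptoken" with every argument, then XOR each
        # character code with 10), kept commented out like the original:
        # o = 'sptoken' + n + e + str(start) + str(end)
        # return ''.join(chr(10 ^ ord(c)) for c in o)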

    def get_news_page_info(self, url):
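        """Download an article page and extract title, source, body text and
        publish date, then write the record and crawl its comments."""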
        item = {}
        response = requests.get(url)
        print(response.url)
        data = etree.HTML(response.content.decode())
        title = data.xpath('.//h2/text()')[0]
        if data.xpath('.//a[@class="doc-source"]/text()'):
            source = data.xpath('.//a[@class="doc-source"]/text()')[0]
        else:
            source = data.xpath('.//div[@class="meta"]/span[1]/text()')[0]
        # date_time = data.xpath('.//div[@class="meta"]/span[2]/text()')[0]
        if data.xpath('.//div[@id="imedia-article"]//text()'):
            content = data.xpath('.//div[@id="imedia-article"]//text()')
        elif data.xpath('.//div[@id="imedia-article"]/article/p//text()'):
            content = data.xpath(
                './/div[@id="imedia-article"]/article/p//text()')
        elif data.xpath(
                './/div[@id="imedia-article"]/section/section//text()'):
            content = data.xpath(
                './/div[@id="imedia-article"]/section/section//text()')
        elif data.xpath('.//div[@class="content-bd"]/div/div//text()'):
            content = data.xpath('.//div[@class="content-bd"]/div/div//text()')
        elif data.xpath('.//div[@class="content-bd"]/p//text()'):
            content = data.xpath('.//div[@class="content-bd"]/p//text()')
        elif data.xpath('.//div[@class="content-bd"]/div/div/text()'):
            content = data.xpath('.//div[@class="content-bd"]/div/div//text()')
        elif data.xpath('.//div[@class="content-bd"]/section//text()'):
            content = data.xpath('.//div[@class="content-bd"]/section//text()')
        elif data.xpath('.//div[@class="content-bd"]/section/text()'):
            content = data.xpath('.//div[@class="content-bd"]/section/text()')
        elif data.xpath('.//div[@class="content-bd"]//text()'):
            content = data.xpath('.//div[@class="content-bd"]//text()')
        else:
            content = data.xpath(
                './/div[@id="imedia-article"]/section/section/section/p//text()'
            )
        content = ''.join(content)

        # get_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        item['platform'] = '一点资讯'
        item['title'] = title
        item['article_source'] = source  # article source
        item['article_author'] = ''  # article author
        item['content'] = content
        # if len(data.xpath('.//div[@class="meta"]/span')) == 3:
        #     date_all = data.xpath('.//div[@class="meta"]/span[3]/text()')[0]
        # elif len(data.xpath('.//div[@class="meta"]/span')) == 2:
        #     date_all = data.xpath('.//div[@class="meta"]/span[2]/text()')[0]
        # else:
        date_all = data.xpath('.//div[@class="meta"]/span//text()')
        date_all = ''.join(date_all).strip()

        try:
            if date_all == '昨天' or '小时前' in date_all:
                yesterday = datetime.now() - timedelta(days=1)  # yesterday
                yesterday = str(yesterday).split(' ')[0]
                # print(date_all,  yesterday)
                item['date'] = yesterday
            elif date_all == '2天前':
                yesterday = datetime.now() - timedelta(days=2)  # two days ago
                yesterday = str(yesterday).split(' ')[0]
                # print(date_all, yesterday)
                item['date'] = yesterday
            elif date_all == '3天前':
                yesterday = datetime.now() - timedelta(days=3)  # three days ago
                yesterday = str(yesterday).split(' ')[0]
                # print(date_all, yesterday)
                item['date'] = yesterday
            else:
                news_date = re.search(r'\d{4}\.\d{1,2}\.\d{1,2}',
                                      date_all).group(0)
                # print(222222, news_date)
                # print(33333, date_all)
                item['date'] = news_date.replace('.', '-')
        except:
            item['date'] = self.comment_end_time
        # print(item)
        item['time'] = ''
        item['likes'] = ''
        item['clicks'] = ''
        item['views'] = ''
        item['keyword'] = ''
        item['comments_count'] = ''
        item['article_url'] = url  # article detail URL
        item['dislikes'] = ''  # number of dislikes
        item['series_url'] = ''  # car-series homepage
        item['list_url'] = 'http://www.yidianzixun.com/channel/c11'  # article list URL
        item['article_type_1st'] = ''  # article category, level 1
        item['article_type_2nd'] = ''  # article category, level 2
        item['insert_time'] = str(datetime.now()).split('.')[0]  # first crawl time
        item['update_time'] = str(datetime.now()).split('.')[0]  # latest crawl time
        item['content_id'] = url.split('/')[-1].split('?')[0]  # article id
        item['topic_id'] = url.split('/')[-1].split('?')[0]  # root post id
        item['author_id'] = ''  # author id
        item['file_code'] = '26'  # file code

        # Date filtering --------------- this spider variant keeps every article
        # instead of restricting by the list date:
        # if date_all == '昨天' or date_all == '2天前' or date_all == '3天前' or '小时前' in date_all:
        # print(date_all, '时间符合')
        # print(item)
        self.write_news_jsonfile(item)
        news_id = url.split('/')[-1]
        self.is_get_comment = True
        self.get_commnet_info(news_id, title, url, item['date'])

    # Fetch comment data for an article
    def get_commnet_info(self,
                         news_id,
                         title,
                         source_url,
                         source_date,
                         last_comment_id=''):
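        """Fetch one page (30 items) of comments for a news id, keep those
        inside the configured time window, and recurse with the last comment
        id to page through older comments."""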
        item = {}
        url = 'http://www.yidianzixun.com/home/q/getcomments?_=1542864983174&docid={}&s=&count=30&last_comment_id={}&appid=web_yidian'.format(
            str(news_id), last_comment_id)
        response = requests.get(url)
        data = json.loads(response.content.decode())
        comments = data['comments']
        if comments:
            total_item = ''
            for comment in comments:
                # print(comment)
                # print('爬取评论中')
                item['platform'] = '一点资讯'
                item['title'] = title
                content = comment['comment']
                item['content'] = content
                author = comment['nickname']
                item['author'] = author
                date_all = comment['createAt']
                comment_date = date_all.split(' ')[0]
                comment_time = date_all.split(' ')[1]
                # time-window filtering for comments ---------------
                get_news_time = time.mktime(
                    time.strptime(str(comment_date), "%Y-%m-%d"))
                end_time = time.mktime(
                    time.strptime(self.comment_end_time, "%Y-%m-%d"))
                if self.comment_start_time != '':
                    start_time = time.mktime(
                        time.strptime(self.comment_start_time, "%Y-%m-%d"))
                else:
                    start_time = time.mktime(
                        time.strptime('2010-1-1', "%Y-%m-%d"))
                if float(get_news_time) < float(start_time):
                    self.is_get_comment = False  # comments are returned in time order, so stop once one falls before the start of the window
                    break
                elif float(start_time) <= float(get_news_time) <= float(
                        end_time):

                    item['date'] = comment_date
                    item['time'] = comment_time
                    item['source_date'] = source_date
                    item['source_time'] = ''
                    item['source_url'] = source_url
                    item['floor'] = ''
                    item['keyword'] = ''
                    item['comment_url'] = source_url
                    item['views'] = ''
                    item['comments_count'] = ''
                    item['likes'] = ''
                    item['author_id'] = comment['userid']  # user id
                    item['dislikes'] = ''  # number of dislikes
                    item['insert_time'] = str(
                        datetime.now()).split('.')[0]  # first crawl time
                    item['update_time'] = str(
                        datetime.now()).split('.')[0]  # latest crawl time
                    item['content_id'] = comment['comment_id']  # content id
                    item['topic_id'] = source_url.split('/')[-1].split('?')[
                        0]  # root post id
                    item['file_code'] = '40'  # file code
                    item = json.dumps(dict(item), ensure_ascii=False) + '\n'
                    total_item += item

            self.write_comment_jsonfile(total_item)
            if len(comments) == 30 and self.is_get_comment:
                last_comment_id = comments[-1]['comment_id']
                print('paging through comments')
                self.get_commnet_info(news_id,
                                      title,
                                      source_url,
                                      source_date,
                                      last_comment_id=last_comment_id)

    def write_news_jsonfile(self, item):
        logger.log(31, 'writing news data ......')
        # serialize the record to a JSON line; new_write expects a string
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./../yidianzixun/26_{}_yidianzixun_news.json'.format(str(now_time)), 'ab') as f:
        #     f.write(item.encode('utf-8'))
        self.hdfsclient.new_write(
            '{}/{}/{}/26_{}_{}_yidianzixun_news.json'.format(
                self.file_path,
                self.file_name_time.split(' ')[0].replace('-',
                                                          ''), self.hour_name,
                str(datetime.now()).split(' ')[0].replace('-', '_'),
                self.time_time),
            item,
            encoding='utf-8')

    def write_comment_jsonfile(self, item):
        logger.log(31, 'writing comment data ......')
        # the comment records are already serialized to JSON lines in
        # get_commnet_info, so no further json.dumps is needed here
        # with open('./../yidianzixun/40_{}_yidianzixun_commnet.json'.format(str(now_time)), 'ab') as f:
        #     f.write(item.encode('utf-8'))
        self.hdfsclient.new_write(
            '{}/{}/{}/40_{}_{}_yidianzixun_commnet.json'.format(
                self.comment_apth,
                self.file_name_time.split(' ')[0].replace('-',
                                                          ''), self.hour_name,
                str(datetime.now()).split(' ')[0].replace('-', '_'),
                self.time_time),
            item,
            encoding='utf-8')

    def get_news_url(self, num):
        """
        从百度搜索关键词,然后获取符合的新闻的url, 提取抓取数量
        """
        # 时间
        get_time = time.time()
        str_time = str(get_time)[:-4]
        date = datetime.now() - timedelta(days=7)
        a = str(date)[:-7]
        timeArray = time.strptime(a, "%Y-%m-%d %H:%M:%S")
        # 转换为时间戳:
        timeStamp = int(time.mktime(timeArray))
        end_time = str(timeStamp) + '.' + str_time.split('.')[1]
        print(str_time, end_time)
        # url = 'https://www.baidu.com/s?q1=汽车&q2=&q3=&q4=&gpc=stf%3D{}%2C{}%7Cstftype%3D1&ft=&q5=&q6=www.yidianzixun.com&tn=baiduadv&pn={}'.format(end_time, str_time, num)
        url = 'https://www.baidu.com/s?wd=site%3A(www.yidianzixun.com)%20%E6%B1%BD%E8%BD%A6&pn={}&oq=site%3A(www.yidianzixun.com)%20%E6%B1%BD%E8%BD%A6&ct=2097152&tn=baiduadv&ie=utf-8&si=(www.yidianzixun.com)&rsv_pq=e948db9e00097fcd&rsv_t=1273sdRx9rzb35pYERweuGf1mV6RO2BZZUthjhhdYlSidhjyUjzN%2FuD2LYJ1%2Fso&gpc=stf%3D{}%2C{}%7Cstftype%3D2&tfflag=1'.format(
            num, end_time, str_time)
        print(url)
        # ip = random.choice(self.proxies_list)
        response = requests.get(url,
                                headers=self.headers_one,
                                verify=False,
                                timeout=30)  # , proxies={'https': ip}
        content = etree.HTML(response.content.decode())
        if content.xpath('.//h3[@class="t"]/a/@href'):
            url_list = content.xpath('.//h3[@class="t"]/a/@href')
            print(url_list)
            print(len(url_list))
            for url_ch in url_list:
                response = requests.get(url_ch,
                                        headers=self.headers_two,
                                        allow_redirects=False)
                print(response.status_code)
                news_url = response.headers['Location']
                print(news_url)
                if news_url not in self.set_list:
                    try:
                        self.get_news_page_info(news_url)
                    except Exception as e:
                        print(e)
                        time.sleep(15)
                    self.set_list.append(news_url)

    def get_file_name_time(self):
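        """Return the current date plus the 3-hour bucket number used in file
        names; the 00:00-03:00 run is attributed to the previous day as '24'."""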
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:
            num = 24
            a = str(datetime.now() - timedelta(days=1))  # yesterday's date
        num = a.split(' ')[0] + ' ' + str(num)
        return num

    def run(self):
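        """Entry point: iterate over the channel list endpoints (Sports, NBA,
        Finance), fetch each list repeatedly and rotate the proxy between
        rounds."""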

        url = 'http://www.yidianzixun.com/home/q/news_list_for_channel'
        get_time = time.time()
        get_time = ''.join(str(get_time).split('.'))

        url_list = [
            # Sports
            'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171666&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C%3C%3C%3A%3B%3A&appid=web_yidian&_={}',
            # NBA
            'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171682&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C28%3A%3B%3A&appid=web_yidian&_={}',
            # Finance
            'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171698&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C32%3A%3B%3A&appid=web_yidian&_={}'
        ]
        for get_url in url_list:
            for i in range(2):
                try:
                    for j in range(30):
                        url = get_url.format(
                            str(time.time()).replace('.', '')[:-4])
                        try:
                            self.get_news_list_port(url)
                        except requests.exceptions.ProxyError:
                            print(traceback.format_exc())
                            break

                except TypeError:
                    print(traceback.format_exc())
                    logger.error('content parsing error')
                except:
                    print(traceback.format_exc())
                    logger.error('other error')

                time.sleep(10)
                self.page_ip = proxies.res_ip()