Python HdfsClient.new_write Exemples, with_hdfs.HdfsClient.new_write Python Exemples

Exemple #1

0

Afficher le fichier

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, redis_example):
        self.headers = {
            'Content-Type':
            'text/html; charset=utf-8',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Cache-Control':
            'no-cache',
            # 'Cookie': 'vip_rip=101.86.55.85; vip_address=%257B%2522pname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522cname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522pid%2522%253A%2522103101%2522%252C%2522cid%2522%253A%2522103101101%2522%257D; vip_province=103101; vip_province_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_code=103101101; vip_wh=VIP_SH; mars_pid=0; mars_sid=a369b0e73f9656dbd3eda470968f6cd2; _smt_uid=5d4156d3.52d69d05; user_class=a; VipUINFO=luc%3Aa%7Csuc%3Aa%7Cbct%3Ac_new%7Chct%3Ac_new%7Cbdts%3A0%7Cbcts%3A0%7Ckfts%3A0%7Cc10%3A0%7Crcabt%3A0%7Cp2%3A0%7Cp3%3A1%7Cp4%3A0%7Cp5%3A1; vipte_viewed_=6917921732696396695%2C793920209978892%2C2161644495; visit_id=4C5B033907F8247A18F2811FF8D147F0; _jzqco=%7C%7C%7C%7C%7C1.15943944.1564563154491.1564740333894.1564740386032.1564740333894.1564740386032.0.0.0.24.24; mars_cid=1564563151837_048422ec87f93127ee1eced568a171af',
            'Host':
            'category.vip.com',
            'Pragma':
            'no-cache',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
        }
        # 时间部分,按小时抓取
        date_time = str(datetime.now() - timedelta(days=1)).split('.')[0]
        start_time_test = time.strftime('%Y-%m-%d 00:00:00')

        end_time = time.strftime('%Y-%m-%d %H:%M:%S')
        a = end_time.split(' ')[1].split(':')[0]

        if a == '00':
            start_time_data = date_time
            hours_name = '22_24'
            wen_jian_jia_date = str(datetime.now() - timedelta(
                days=1)).split('.')[0].split(' ')[0].replace('-', '')
        else:
            two_hours_ago = int(a) - 2
            if len(str(two_hours_ago)) == 1:
                two_hour_ago = '0' + str(two_hours_ago)
            else:
                two_hour_ago = str(two_hours_ago)
            hours_name = str(two_hour_ago) + '_' + str(a)
            start_time_data = start_time_test
            wen_jian_jia_date = time.strftime('%Y%m%d')
        print('爬取时间段：{}到{}'.format(start_time_data, end_time))
        logging.info('爬取时间段：{}到{}'.format(start_time_data, end_time))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = start_time_data
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = end_time
        # 标记爬虫工作
        self.is_break = False
        self.redis_example = redis_example
        self.pid = os.getpid()

        self.h2_name = hours_name
        self.date_time = wen_jian_jia_date
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format(
                wen_jian_jia_date, hours_name))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 时间戳转换时间
    def time_change(self, data):
        # 替换抓取数据中的html标签
        try:
            timeStamp = float(int(data) / 1000)
            timeArray = time.localtime(timeStamp)
            otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
            return otherStyleTime
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 获取评论量
    def parse_comments_num(self, goods_dict):
        try:
            headers = {
                # 'Cookie': 'vip_rip=101.86.55.85; vip_address=%257B%2522pname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522cname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522pid%2522%253A%2522103101%2522%252C%2522cid%2522%253A%2522103101101%2522%257D; vip_province=103101; vip_province_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_code=103101101; vip_wh=VIP_SH; mars_pid=0; mars_sid=a369b0e73f9656dbd3eda470968f6cd2; _smt_uid=5d4156d3.52d69d05; VipDFT=1; visit_id=2221152ECC2AD948DF7AB8D56322CE59; vipAc=cf3c0da6d5b52c0f6088b0148efbdb22; vipshop_passport_src=https%3A%2F%2Fdetail.vip.com%2Fdetail-1710618487-6918048587083491095.html; PASSPORT_ACCESS_TOKEN=1FDEBDAAF470FFB2C3C6A9EEAF7256FBA60D1F08; VipRUID=298018734; VipUID=0f94f94cc1ea26b39e78438380499d64; VipRNAME=152*****067; VipLID=0%7C1564973676%7C4b447f; VipDegree=D1; user_class=c; VipUINFO=luc%3Ac%7Csuc%3Ac%7Cbct%3Ac_new%7Chct%3Ac_new%7Cbdts%3A0%7Cbcts%3A0%7Ckfts%3A0%7Cc10%3A0%7Crcabt%3A0%7Cp2%3A0%7Cp3%3A1%7Cp4%3A0%7Cp5%3A1; PHPSESSID=b9bnc95dlt7r4eg2r196td02i4; vipte_viewed_=6917921732696396695%2C793920209978892%2C2161644495%2C6918048587083491095%2C6917922115290256471; VipCI_te=0%7C%7C1564974326; _jzqco=%7C%7C%7C%7C%7C1.15943944.1564563154491.1564974076993.1564974326073.1564974076993.1564974326073.0.0.0.39.39; waitlist=%7B%22pollingId%22%3A%22F90BE7CF-3F21-4012-800F-E1F26000E5BF%22%2C%22pollingStamp%22%3A1564974516121%7D; mars_cid=1564563151837_048422ec87f93127ee1eced568a171af',
                'Host':
                'detail.vip.com',
                'Pragma':
                'no-cache',
                'Referer':
                goods_dict['url'],
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            url = 'https://detail.vip.com/v2/mapi?_path=rest/content/reputation/getCountBySpuId&spuId={}&brandId={}&app_name=shop_pc'.format(
                goods_dict['spuId'], goods_dict['brandId'])
            try:
                time.sleep(0.1)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            achieve_num_data = json.loads(response.text)['data']
            goods_dict['achieve_num'] = achieve_num_data
            if int(achieve_num_data) == 0:
                page_num = int(
                    math.ceil(float((int(achieve_num_data) + 1) / 10)))
                # logger.log(31, '评论数是: %s , 评论页数是: %s ' % (goods_dict['achieve_num'], str(page_num)))
                # print(goods_dict)
                self.parse_comments(goods_dict, page_num)
            else:
                page_num = int(math.ceil(float(int(achieve_num_data) / 10)))
                # logger.log(31, '评论数是: %s , 评论页数是: %s ' % (goods_dict['achieve_num'], str(page_num)))
                # print(goods_dict)
                self.parse_comments(goods_dict, page_num)
        except:
            print(222222222222222222222222, traceback.format_exc())

    # 抓取商品评论
    def parse_comments(self, goods_dict, page_num):
        try:
            if page_num == 0:
                pass
                # logger.log(31, '0000000000000000没有商品评论信息000000000000000000')
            else:
                is_break = self.is_break
                headers = {
                    # 'Cookie': 'vip_rip=101.86.55.85; vip_address=%257B%2522pname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522cname%2522%253A%2522%255Cu4e0a%255Cu6d77%255Cu5e02%2522%252C%2522pid%2522%253A%2522103101%2522%252C%2522cid%2522%253A%2522103101101%2522%257D; vip_province=103101; vip_province_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_name=%E4%B8%8A%E6%B5%B7%E5%B8%82; vip_city_code=103101101; vip_wh=VIP_SH; mars_pid=0; mars_sid=a369b0e73f9656dbd3eda470968f6cd2; _smt_uid=5d4156d3.52d69d05; VipDFT=1; visit_id=2221152ECC2AD948DF7AB8D56322CE59; vipAc=cf3c0da6d5b52c0f6088b0148efbdb22; vipshop_passport_src=https%3A%2F%2Fdetail.vip.com%2Fdetail-1710618487-6918048587083491095.html; PASSPORT_ACCESS_TOKEN=1FDEBDAAF470FFB2C3C6A9EEAF7256FBA60D1F08; VipRUID=298018734; VipUID=0f94f94cc1ea26b39e78438380499d64; VipRNAME=152*****067; VipLID=0%7C1564973676%7C4b447f; VipDegree=D1; user_class=c; VipUINFO=luc%3Ac%7Csuc%3Ac%7Cbct%3Ac_new%7Chct%3Ac_new%7Cbdts%3A0%7Cbcts%3A0%7Ckfts%3A0%7Cc10%3A0%7Crcabt%3A0%7Cp2%3A0%7Cp3%3A1%7Cp4%3A0%7Cp5%3A1; PHPSESSID=b9bnc95dlt7r4eg2r196td02i4; vipte_viewed_=6917921732696396695%2C793920209978892%2C2161644495%2C6918048587083491095%2C6917922115290256471; VipCI_te=0%7C%7C1564974326; _jzqco=%7C%7C%7C%7C%7C1.15943944.1564563154491.1564974076993.1564974326073.1564974076993.1564974326073.0.0.0.39.39; waitlist=%7B%22pollingId%22%3A%22F90BE7CF-3F21-4012-800F-E1F26000E5BF%22%2C%22pollingStamp%22%3A1564974516121%7D; mars_cid=1564563151837_048422ec87f93127ee1eced568a171af',
                    'Host':
                    'detail.vip.com',
                    'Pragma':
                    'no-cache',
                    'Referer':
                    goods_dict['url'],
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
                }
                for i in range(1, int(page_num) + 1):
                    # logger.log(31, '*************************抓取评论第：%s 页' % i)
                    url = 'https://detail.vip.com/v2/mapi?_path=rest/content/reputation/queryBySpuId&spuId={}&brandId={}&page={}&pageSize=10&app_name=shop_pc&keyWordNlp=%E6%9C%80%E6%96%B0'.format(
                        goods_dict['spuId'], goods_dict['brandId'], i)
                    try:
                        time.sleep(0.1)
                        response = requests.get(url=url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                    except:
                        try:
                            time.sleep(0.1)
                            response = requests.get(url=url,
                                                    headers=headers,
                                                    proxies=proxies,
                                                    allow_redirects=False,
                                                    timeout=30)
                        except:
                            time.sleep(0.1)
                            response = requests.get(url=url,
                                                    headers=headers,
                                                    proxies=proxies,
                                                    allow_redirects=False,
                                                    timeout=30)
                    # 商品评价列表
                    comments_list = json.loads(response.text)['data']
                    if int(len(comments_list)) == 0:
                        break
                    else:
                        comment_dict = dict()
                        for item in comments_list:
                            date_data = self.time_change(
                                item['reputation']['postTime'])
                            # print(date_data)
                            if self.start_time <= date_data:
                                comment_dict['platform'] = goods_dict[
                                    'platform']
                                comment_dict['date'] = date_data.split(' ')[0]
                                comment_dict['time'] = date_data.split(' ')[1]
                                comment_dict['keyword'] = goods_dict['keyword']
                                comment_dict['name'] = goods_dict['name']
                                comment_dict['imageurl'] = goods_dict['商品图片']
                                comment_dict['audiourl'] = ''
                                comment_dict['url'] = goods_dict['url']
                                comment_dict['shop_name'] = ''
                                comment_dict['user_name'] = item[
                                    'reputationUser']['authorName']
                                comment_dict['author_id'] = str(
                                    item['reputationUser']['userIdentity'])
                                comment_dict['content'] = item['reputation'][
                                    'content']
                                comment_dict['content_id'] = str(
                                    item['reputation']['reputationId'])
                                comment_dict['brand'] = goods_dict['brand']
                                comment_dict['price'] = goods_dict['price']
                                comment_dict['sales'] = goods_dict['sales']
                                comment_dict['focus_count'] = ''
                                comment_dict['comment_num'] = goods_dict[
                                    'achieve_num']
                                comment_dict['views'] = ''
                                comment_dict['likes'] = ''
                                comment_dict['comments_count'] = ''
                                comment_dict['reposts_count'] = ''
                                comment_dict['topic_id'] = str(
                                    goods_dict['url'].split('-')[2].replace(
                                        '.html', ''))
                                try:
                                    comment_dict['type'] = item[
                                        'reputationProduct']['colorInfo']
                                except:
                                    comment_dict['type'] = ''
                                try:
                                    comment_dict['size'] = item[
                                        'reputationProduct']['size']
                                except:
                                    comment_dict['size'] = ''
                                comment_dict['file_code'] = '179'
                                # logger.log(31, '---------------正在写入符合时间的商品评论---------------------')
                                # print(comment_dict)
                                # self.write_Nike_jsonfile(comment_dict)
                                item = json.dumps(dict(comment_dict),
                                                  ensure_ascii=False) + '\n'
                                self.hdfsclient.new_write(
                                    '/user/cspider_daily/nike_2h/ecommerce/{}/{}/179_{}_WPH_nike{}.json'
                                    .format(self.date_time, self.h2_name,
                                            time.strftime('%Y%m%d'), self.pid),
                                    item,
                                    encoding='utf-8')
                            if self.start_time > date_data.split(
                                    ' ')[0].strip():
                                is_break = True
                        if is_break:
                            break
        except:
            print(33333333333333333333, traceback.format_exc())

    # def parse_xlsx(self):
    #     # 设置路径
    #     path = './快消采集关键词_0916_v3-1.xlsx'
    #     # 打开execl
    #     workbook = xlrd.open_workbook(path)
    #
    #     # 根据sheet索引或者名称获取sheet内容
    #     Data_sheet = workbook.sheets()[0]  # 通过索引获取
    #
    #     # print(Data_sheet.name)  # 获取sheet名称
    #     rowNum = Data_sheet.nrows  # sheet行数
    #     colNum = Data_sheet.ncols  # sheet列数
    #
    #     # 获取所有单元格的内容
    #     list = []
    #     for i in range(rowNum):
    #         rowlist = []
    #         for j in range(colNum):
    #             rowlist.append(Data_sheet.cell_value(i, j))
    #         list.append(rowlist)
    #
    #     for data in list[1::]:
    #         brand = data[0]
    #         # print(brand)
    #         yield {
    #             '关键词': brand,
    #         }

    def run(self, lock):
        for num in range(1000000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('WPH_nike_url')
            if str(redis_url_num) == '0':
                print(
                    '**********Redis消息队列中url为空.....进程 {} 抓取结束.....***********'.
                    format(str(os.getpid())))

            item = self.redis_example.brpop('WPH_nike_url', timeout=3600)[1]
            lock.release()
            item1 = json.loads(item.decode())
            # print(item)
            self.parse_comments_num(item1)

Exemple #2

0

Afficher le fichier

Fichier : shop_Su_ning.py Projet : yuzhujiutian/nike_spider

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self):
        # 时间部分
        # 爬虫开始抓取的日期
        date = datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]

        # 爬虫结束的抓取日期
        current_time = datetime.now()  # 当前日期
        current_day = str(current_time).split(' ')[0]

        print('爬取时间段：{}到{}'.format(news_start_time, current_day))
        logging.info('爬取时间段：{}到{}'.format(news_start_time, current_day))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = current_day
        # 标记爬虫工作
        self.is_break = False
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/ecommerce/{}'.format(
                time.strftime('%Y%m%d')))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            ret4 = re.sub(r'品牌:', '', ret3)
            ret5 = re.sub(r'\xa0', '', ret4)
            ret6 = re.sub(r'&rarr;_&rarr;', '', ret5)
            ret7 = re.sub(r'&hellip;', '', ret6)
            ret8 = re.sub(r'https:', '', ret7)
            return ret8
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 过滤商品价格
    def re_price(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'pcData\(', '', message)
            ret2 = re.sub(r'\)', '', ret1)
            return ret2
        except:
            pass

    # 过滤商品品牌信息
    def re_brand(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'"brandName":', '', message)
            ret2 = re.sub(r'&amp;', '', ret1)
            ret3 = re.sub(r'"', '', ret2)
            return ret3
        except:
            pass

    # 根据关键词搜索请求得到商品信息
    def parse_url(self, data):
        # 创建一个字典接收数据
        goods_dict = dict()
        goods_dict['平台'] = data['平台']
        goods_dict['关键词'] = data['关键词']
        goods_dict['URL'] = data['URL']
        goods_dict['商品图片'] = data['商品图片']
        goods_dict['商品名'] = data['商品名']
        goods_dict['shop_name'] = data['shop_name']
        goods_dict['品牌'] = data['品牌']
        goods_dict['月销量'] = data['月销量']
        goods_dict['价格'] = data['价格']
        goods_dict['评论人数'] = data['评论人数']
        # logger.log(31, '--------********正在抓取的商品是：%s********--------' % goods_dict)
        self.parse_goods_details(goods_dict)

    # 解析商品品牌信息
    def parse_goods_details(self, goods_dict):
        try:
            headers = {
                'Content-Type':
                'text/html;charset=utf-8',
                # 'Connection': 'keep-alive',
                # 'Cookie': 'SN_SESSION_ID=c55ac35a-f7d1-4b0c-b48a-f88e8bb896f4; useMp4=1.701108; _snvd=1555383181562rH9y3n/THLV; cityCode=021; districtId=12113; cityId=9264; hm_guid=ac41a4ae-4373-4445-ab29-65e90c29b272; _df_ud=60a62287-237d-4cf0-ada4-d39a276f2c2d; _device_session_id=p_2fb27762-ef79-4f07-9f25-e0acad62907a; _cp_dt=bf4a6a96-909f-450a-b7ca-2d8d0b363cee-86574; _snsr=direct%7Cdirect%7C%7C%7C; _snzwt=THiw3Z16a429d6f24nzVa227f; _snmc=1; city=1000267; province=20; district=10002671; provinceCode=20; districtCode=01; streetCode=0210199; SN_CITY=20_021_1000267_9264_01_12113_1_1; authId=si0BE64747CDCB0EC1B819BB87E6D52FC1; secureToken=E180078268FCC770B6CFC47BFC919E55; _snms=155592217017833779; smhst=651484555|0000000000a10607567457|0000000000a10010138536|0070176294a601763915|0000000000a102374199|0000000000a101822787|0000000000a11012720481|0070752460a11024165323|0070745700a193148008|0000000000a861276981|0000000000a11028136288|0070705161a11002911104|0070756234a101822780|0000000000; _snma=1%7C15553832315961909%7C1555383231596%7C1555923318059%7C1555923324804%7C140%7C9; _snmp=155592332389716467; _snmb=155591411681863515%7C1555923324825%7C1555923324807%7C37',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            }
            # print(goods_dict)
            goods_url = goods_dict['URL']
            # response = requests.get(url=goods_url, headers=headers, proxies=random.choice(proxies), timeout=10)
            try:
                time.sleep(0.2)
                response = requests.get(url=goods_url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except (requests.exceptions.ConnectionError, ConnectionResetError):
                try:
                    time.sleep(0.2)
                    response = requests.get(url=goods_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except (requests.exceptions.ConnectionError,
                        ConnectionResetError):
                    time.sleep(0.2)
                    response = requests.get(url=goods_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            # print(response.text)
            # print('正在抓取的页面是: %s' % goods_url)
            data = response.text
            # print(html)
            # 用正则截取价格和评论链接里需要的两串ID
            partNumber = re.search(r'"partNumber".*?,', data)
            vendorCode = re.search(r'"vendorCode".*?,', data)
            # print(partNumber.group(), vendorCode.group())
            goods_dict['partNumber'] = self.re_not_number(partNumber.group())
            goods_dict['vendorCode'] = self.re_not_number(vendorCode.group())
            # print(goods_dict)
            if int(self.re_not_number(goods_dict['评论人数'])) == 0:
                logger.log(31, '-------------没有商品评论信息------------')
            else:
                # 获取评论页数
                page_num = int(
                    math.ceil(
                        float(
                            int(self.re_not_number(goods_dict['评论人数'])) / 10)))
                self.goods_comments(goods_dict, page_num)
        except:
            print(2222222222222222222, traceback.format_exc())

    # 解析商品评论
    def goods_comments(self, goods_dict, page_num):
        try:
            is_break = self.is_break

            partNumber = goods_dict['partNumber']
            vendorCode = goods_dict['vendorCode']
            headers = {
                'Content-Type':
                'application/javascript;charset=UTF-8',
                # 'Connection': 'keep-alive',
                # 'Cookie': '_snvd=1555383181562rH9y3n/THLV; cityCode=021; districtId=12113; cityId=9264; hm_guid=ac41a4ae-4373-4445-ab29-65e90c29b272; _df_ud=60a62287-237d-4cf0-ada4-d39a276f2c2d; _device_session_id=p_2fb27762-ef79-4f07-9f25-e0acad62907a; _cp_dt=bf4a6a96-909f-450a-b7ca-2d8d0b363cee-86574; city=1000267; province=20; district=10002671; provinceCode=20; districtCode=01; streetCode=0210199; SN_CITY=20_021_1000267_9264_01_12113_1_1; tradeMA=127; route=3798b42173574ff4536b1645bfa56286; _snzwt=THusFg16a66e65b60nBjXc7ab; _snsr=direct%7Cdirect%7C%7C%7C; _snmc=1; _snms=155652264991095847; authId=si07DE872B7B580CBB2CB11C7105B450A8; secureToken=5C8868551C3103287B59ADEDD6B90567; smhst=192279908|0000000000a600733096|0000000000a600479244|0000000000a10700388709|0070547159a651484540|0000000000a826233089|0000000000a10243606506|0000000000a101822738|0000000000a101822744|0000000000a160764310|0000000000a122819279|0000000000a651484555|0000000000a10607567457|0000000000a10010138536|0070176294a601763915|0000000000a102374199|0000000000a101822787|0000000000a11012720481|0070752460a11024165323|0070745700a193148008|0000000000; _snma=1%7C15553832315961909%7C1555383231596%7C1556524706411%7C1556524786984%7C224%7C15; _snmp=155652478697968344; _snmb=155652102706620667%7C1556524786995%7C1556524786988%7C28',
                'Host':
                'review.suning.com',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            }
            if int(page_num) >= 50:
                pages = 50
            else:
                pages = page_num
            # 抓取商品评论链接(总共50页,第一页从1开始)
            for i in range(1, int(pages) + 1):
                comment_url = 'https://review.suning.com/ajax/cluster_review_lists/style--{}-{}-newest-{}-default-10-----reviewList.htm?callback=reviewList'.format(
                    partNumber, vendorCode, i)
                # print(comment_url)
                try:
                    time.sleep(0.2)
                    response = requests.get(url=comment_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except (requests.exceptions.ConnectionError,
                        ConnectionResetError):
                    try:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                    except (requests.exceptions.ConnectionError,
                            ConnectionResetError):
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                comment_data = response.text
                # print(comment_data)
                comment = re.search(r'{"commodityReviews":.*"reCloudDrill":0}',
                                    comment_data)
                # print(comment.group())
                items = json.loads(comment.group())['commodityReviews']
                if len(items) == 0:
                    break
                else:
                    goods_comment = dict()
                    for item in items:
                        # print(item)
                        date_data = item['publishTime'].split(' ')[0]
                        time_data = item['publishTime'].split(' ')[1]
                        # print(date_data, time_data)
                        try:
                            content = self.re_html(item['content'])
                        except:
                            content = ''
                        # 追加评论
                        try:
                            content_add = item['againReview']['againContent']
                        except:
                            content_add = ''

                        # 判断评论时间是否在规定的抓取时间内
                        if self.start_time <= date_data.strip():
                            goods_comment['platform'] = goods_dict['平台']
                            goods_comment['date'] = date_data.strip()
                            goods_comment['time'] = time_data.strip()
                            goods_comment['keyword'] = goods_dict['关键词']
                            goods_comment['name'] = goods_dict['商品名']
                            goods_comment['imageurl'] = goods_dict['商品图片']
                            goods_comment['audiourl'] = ''
                            goods_comment['url'] = goods_dict['URL']
                            goods_comment['shop_name'] = goods_dict[
                                'shop_name']
                            goods_comment['user_name'] = item['userInfo'][
                                'nickName']
                            goods_comment[
                                'content'] = content + ';' + content_add
                            goods_comment['content_id'] = str(
                                item['commodityReviewId'])
                            goods_comment['brand'] = goods_dict['品牌']
                            goods_comment['price'] = goods_dict['价格']
                            goods_comment['sales'] = goods_dict['月销量']
                            goods_comment['focus_count'] = ''
                            goods_comment['comment_num'] = goods_dict['评论人数']
                            goods_comment['views'] = ''
                            goods_comment['author_id'] = ''
                            goods_comment['reposts_count'] = ''
                            goods_comment['topic_id'] = str(
                                goods_dict['URL'].split('/')[4].replace(
                                    '.html', ''))
                            test_data = item['commodityInfo']['charaterId1']
                            if test_data == '尺码':
                                goods_comment['type'] = item['commodityInfo'][
                                    'charaterDesc2']
                                goods_comment['size'] = item['commodityInfo'][
                                    'charaterDesc1']
                            else:
                                goods_comment['type'] = item['commodityInfo'][
                                    'charaterDesc1']
                                goods_comment['size'] = item['commodityInfo'][
                                    'charaterDesc2']
                            self.likes_comments(goods_comment)
                        if date_data.strip() < self.start_time:
                            is_break = True
                    if is_break:
                        break
        except:
            print(3333333333333333333, traceback.format_exc())

    # 解析商品评论的点赞数和回复数
    def likes_comments(self, goods_comment):
        try:
            comment_id = goods_comment['content_id']
            url = 'https://review.suning.com/ajax/useful_count/635960739-usefulCnt.htm'.format(
                comment_id)
            headers = {
                'Content-Type':
                'application/javascript;charset=UTF-8',
                # 'Cookie': 'tradeMA=55; _snvd=1565067528273QvL8ia7lwZC; SN_CITY=20_021_1000267_9264_01_12113_2_0; cityCode=021; districtId=12113; cityId=9264; hm_guid=ca34f536-186e-4619-aa8f-6c8808ee39a6; _df_ud=e64b917e-c77c-46e0-9d10-d84c86c93f3a; _device_session_id=p_806c72c6-6fa6-462d-bf88-f660c7094c1a; _cp_dt=21f7906b-c341-404f-996b-4d4f2e32e4af-70039; route=e46977517568f7cad53fbfe19eaf4774; _snmc=1; _snsr=direct%7Cdirect%7C%7C%7C; authId=siC700F4CB8ABB1C2E87F1FA1E9650CF7A; secureToken=F9331FD98F503CE8898949382003910A; _snzwt=THs64g16ce02abb69OAUS9a89; _snms=156712934067680848; smhst=690105206|0000000000a10118749983|0000000000a10689501376|0070222946a10949954840|0000000000a10966209249|0000000000a10757523126|0000000000a10620476914|0000000000a11180422688|0000000000a10966225829|0000000000a769909849|0070230352a10580507394|0070222946a826193435|0000000000a10163182478|0000000000a10964625880|0000000000a10571100966|0070074453; _snma=1%7C156506752678869586%7C1565067526788%7C1567129356201%7C1567129676548%7C137%7C12; _snmp=156712967506243164; _snmb=156712899210934272%7C1567129676573%7C1567129676552%7C8',
                'Host':
                'review.suning.com',
                'Pragma':
                'no-cache',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            try:
                time.sleep(0.2)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            # print(response.text)
            likes_comments_data = json.loads(
                response.text.replace('usefulCnt(', '').replace(')', ''))
            goods_comment['likes'] = likes_comments_data[
                'reviewUsefuAndReplylList'][0]['usefulCount']
            goods_comment['comments_count'] = likes_comments_data[
                'reviewUsefuAndReplylList'][0]['replyCount']
            goods_comment['file_code'] = '53'
            # logger.log(31, '****-------正在写入符合时间的商品评论信息-------****')
            item = json.dumps(dict(goods_comment), ensure_ascii=False) + '\n'
            self.hdfsclient.new_write(
                '/user/cspider_daily/nike_daily/ecommerce/{}/53_{}_{}_Suning_nike_1.json'
                .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d'),
                        self.time_data),
                item,
                encoding='utf-8')

        except:
            print(4444444444444444444444, traceback.format_exc())

    def run(self):
        f = open('./{}_suning_shop_img_url.json'.format('20200407'),
                 'r',
                 encoding='utf-8')
        goods_data_list = []
        for line in f.readlines():
            dic_data = json.loads(line)
            goods_data_list.append(dic_data)
        for data in goods_data_list:
            self.parse_url(data)

Exemple #3

0

Afficher le fichier

Fichier : xiaohongshu_API_topic_spider.py Projet : yuzhujiutian/nike_spider

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, file_path, comment_path):

        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }

        self.start_url = ''
        # 评论接口模板
        self.commnet_port_url = ''
        # # 时间判断部分
        date = datetime.now() - timedelta(days=2)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=0)  # 昨天时间
        yesterday = str(yesterday).split(' ')[0]
        # print('爬取时间段：{}到{}'.format(news_start_time, yesterday))
        #
        # logging.info('爬取时间段：{}到{}'.format(news_start_time, yesterday))
        #
        # # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # self.start_time = '2010-03-20'
        # # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = yesterday
        # self.end_time = '2019-12-09 14:08'
        # # 标记爬虫工作

        # get_now_time = time.time() - 86400  # 一天或者三小时 的秒数
        # # get_now_time = time.time() - 8640000  # 一百天
        # print(get_now_time)
        # time_local = time.localtime(float(get_now_time))
        # # 转换成新的时间格式(2016-05-05 20:28:54)
        # dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        # print(dt)
        # end_t = time.time()
        # print(end_t)
        # time_local = time.localtime(float(end_t))
        # # 转换成新的时间格式(2016-05-05 20:28:54)
        # end_dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        # print(end_dt)
        # # end_time = str(end_time).split(' ')[0]
        # print('爬取时间段：{}到{}'.format(dt, end_dt))

        # logging.info('爬取时间段：{}到{}'.format(dt, str(datetime.now())))
        # 定义开始时间 y-m-d  离现在时间远
        # self.start_time = dt
        # self.start_time = '2019-09-09 00:22'
        # 定义结束时间 y-m-d  离现在时间近
        # self.end_time = end_dt
        # self.end_time = '2019-09-16 10:22'

        self.is_work = True

        self.xhsapi = XhsApi('8ac1d719cd0a2d16')
        # 代理服务器
        proxyHost = "http-cla.abuyun.com"
        proxyPort = "9030"

        # 代理隧道验证信息
        proxyUser = "******"
        proxyPass = "******"

        proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
            "user": proxyUser,
            "pass": proxyPass,
        }

        self.proxies = {"http": proxyMeta, "https": proxyMeta}
        self.set_list = []
        self.info = seeeion_id_list
        # try:
        #     os.mkdir('./json_file/{}'.format(str(datetime.now()).split(' ')[0]))
        # except:
        #     pass

        with open('./session_id_list_topic.json') as f:
            session_id = f.read()

        self.session_id_list = eval(session_id)
        self.session_id_error = []
        self.error_count = 0
        self.file_path = file_path
        self.comment_path = comment_path
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs('{}/{}'.format(
            self.file_path,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # 创建每日文件夹
        self.hdfsclient.makedirs('{}/{}'.format(
            self.comment_path,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # 创建每日文件夹
        self.time_time = str(time.time()).split('.')[0]

    # def get_session_id(self):
    #     register_smid_ret = self.xhsapi.register_smid_proxy(self.ip)
    #     print('register_smid_ret:' + register_smid_ret)
    #     smid = json.loads(register_smid_ret)['detail']['deviceId']
    #     print('smid:' + smid)
    #     self.xhsapi.set_smid(smid)
    #     # 激活用户
    #     active_user_ret = self.xhsapi.active_user_proxy(self.ip)
    #     print('active_user_ret:' + active_user_ret)
    #     # 设置session id
    #     session_id = json.loads(active_user_ret)['data']['session']
    #     print('session_id:' + session_id)
    #     item = {
    #         'deviceId': "abbd5bf5-3a82-3fcd-b8b8-4e4c48f68950",
    #         'device_fingerprint': "201908191457046c8b8bd154ae84d8f7c9f8e912c573870183341147f781ee",
    #         'device_fingerprint1': "201908191457046c8b8bd154ae84d8f7c9f8e912c573870183341147f781ee",
    #         'sid': "session.1566198308579055731492",
    #         'search_id': "A9F65F9019EF946464D38BF16C0E250A",
    #     }
    #     item['device_fingerprint'] = smid
    #     item['device_fingerprint1'] = smid
    #     item['sid'] = "session." + session_id
    #     print(item)

    def get_sid(self):
        register_smid_ret = self.xhsapi.register_smid()
        print('register_smid_ret:' + register_smid_ret)
        smid = json.loads(register_smid_ret)['detail']['deviceId']
        print('smid:' + smid)
        self.xhsapi.set_smid(smid)
        # 激活用户
        active_user_ret = self.xhsapi.active_user()
        print('active_user_ret:' + active_user_ret)
        # 设置session id
        session_id = json.loads(active_user_ret)['data']['session']
        print('session_id:' + session_id)

        return smid, session_id

    def change_ip(self):
        logger.log(31, '开始切换ip')
        url = 'http://proxy.abuyun.com/switch-ip'
        time.sleep(random.randint(1, 15))
        response = requests.get(url, proxies=self.proxies)
        logger.log(31, '现使用ip：' + response.text)

    def res_ip(self):
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Cache-Control':
            'max-age=0',
            'Connection':
            'keep-alive',
            'Host':
            'webapi.http.zhimacangku.com',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        # 5-25分 500个ip
        import time
        time.sleep(3)
        url = 'http://webapi.http.zhimacangku.com/getip?num=1&type=1&pro=310000&city=0&yys=0&port=1&time=2&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions='
        ip_pro = requests.get(url, headers=headers)
        # print(ip_pro.text)
        # ip_data = json.loads(ip_pro.text)
        ip = ip_pro.text.strip()

        # ip = str(ip_data['data'][0]['ip']) + ':' + str(ip_data['data'][0]['port'])
        return ip

    def get_serach_list(self, page, keyword):
        info = random.choice(self.session_id_list)
        # info = self.sid_info
        print(self.session_id_list.index(info))
        parms = {
            'keyword': keyword,
            'platform': 'android',
            'filters': '',
            # 'sort': '',  # 综合 排序
            # 'sort': 'popularity_descending',  # 最热 排序
            'sort': 'time_descending',  # 最新 排序
            'page': page,
            'page_size': '20',
            'source': 'explore_feed',
            # 'search_id': info['search_id'],
            'api_extra': '',
            'deviceId': info['deviceId'],
            'device_fingerprint': info['device_fingerprint'],
            'device_fingerprint1': info['device_fingerprint1'],
            'versionName': '5.35.1',
            'channel': 'YingYongBao',
            'sid': info['sid'],
            'lang': 'zh',
            't': str(round(time.time())),
        }

        url = 'https://www.xiaohongshu.com/api/sns/v9/search/notes'
        for i in range(10):
            res = self.xhsapi.get_sign(url, parms)
            print(1111, res)
            if len(res['shield']) == 32:
                break
        res = self.xhsapi.get_sign(url, parms)
        print(res['sign'])
        parms['sign'] = res['sign']
        headers = {
            # 'authorization': info['sid'],
            # 'device_id': info['deviceId'],
            'user-agent':
            'Dalvik/2.1.0 (Linux; U; Android 6.0; DIG-AL00 Build/HUAWEIDIG-AL00) Resolution/720*1280 Version/6.8.0.3 Build/6080103 Device/(HUAWEI;DIG-AL00) NetType/WiFi',
            'shield': res['shield'],
            'Host': 'www.xiaohongshu.com',
            'accept-encoding': 'gzip',
            'Connection': 'Keep-Alive',
        }

        response = requests.get(url, params=parms, headers=headers)
        print(response.url)
        if '"result":0' in response.text and 'msg:' in response.text:
            del self.session_id_list[self.session_id_list.index(info)]
            return
        json_text = json.loads(response.text)
        print(json_text)
        note_list = json_text["data"]["notes"]
        for note in note_list:
            title = note["title"]
            if not title:
                title = note["desc"]
            id = note["id"]
            print(title)
            time.sleep(0.1)
            if id not in self.set_list:

                try:
                    self.get_note(id, keyword)
                except:
                    print(traceback.format_exc())
                    try:
                        self.get_note(id, keyword)
                    except:
                        print(traceback.format_exc())
                self.set_list.append(id)

    def get_note(self, note_id, keyword, index=0):
        info = random.choice(self.info)
        # info = self.sid_info
        logger.log(31, 'session_id下标:  ' + str(self.info.index(info)))

        self.xhsapi.set_smid(info['device_fingerprint'])
        self.xhsapi.set_session_id(info['sid'].split('.')[-1])
        note_ret = self.xhsapi.get_note(note_id)
        # print(333333, note_ret)

        if '参数错误' in note_ret:
            logger.log(31, '参数错误，重试.....')
            self.get_note(note_id, keyword, index)
            return
        # print(response.text)
        # if '"result":0' in response.text and 'msg:' in response.text:
        #     logger.log(31, '无效id：', info)
        #     del self.session_id_list[self.session_id_list.index(info)]
        #     return
        if '{"msg":"","result":0,"success":true}' in note_ret:
            self.session_id_error.append(info)
            if self.session_id_error.count(info) > 5:
                logger.log(31, '无效id：' + str(info))
                # del self.info[self.info.index(info)]
            if self.error_count > 5:
                self.change_ip()
                self.error_count = 0
            self.error_count += 1
            self.get_note(note_id, keyword, index)
            return

        json_text = json.loads(note_ret)
        # print(11111, json_text)
        data = json_text["data"][0]['note_list'][0]
        item = {}
        item['platform'] = '小红书'
        # print(222222, data)
        date_all = data['time']
        time_local = time.localtime(float(date_all))
        # 转换成新的时间格式(2016-05-05 20:28:54)
        dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        logger.log(31, "时间:    " + str(dt))
        # # 做时间判断部分---------------
        get_news_time = time.mktime(
            time.strptime(str(dt).split(' ')[0], "%Y-%m-%d"))
        end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d"))
        if self.start_time != '':
            start_time = time.mktime(time.strptime(self.start_time,
                                                   "%Y-%m-%d"))
        else:
            start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d"))
        if float(get_news_time) < float(start_time) and index > 1:
            logger.log(31, '不符合时间')
            # self.redis_example.sadd('xiaohongshu_out_day_url', note_id)
            self.is_work = False
            return
        elif float(start_time) <= float(get_news_time) <= float(end_time):

            logging.log(31, '符合时间')
            news_date = dt.split(' ')[0]
            news_time = dt.split(' ')[1]
            item['date'] = news_date
            item['time'] = news_time
            title = data['share_info']["title"]
            item['title'] = title
            item['content'] = data["desc"] + '#今日份AJ女生'
            note_id = data["id"]
            item['content_id'] = note_id
            item['article_author'] = data["user"]["nickname"]
            item['clicks'] = ''
            item['views'] = data['view_count']
            comments = data["comments_count"]
            item['comments_count'] = comments
            item['likes'] = data["liked_count"]
            item['dislikes'] = ''
            item['keyword'] = keyword
            article_url = data['share_info']["link"]
            item['article_url'] = article_url
            item['series_url'] = ''
            item['list_url'] = ''
            item['article_type'] = ''
            item['article_source'] = ''
            item['insert_time'] = str(datetime.now()).split('.')[0]
            item['update_time'] = str(datetime.now()).split('.')[0]
            item['topic_id'] = note_id
            item['author_id'] = data["user"]["id"]
            item['file_code'] = '28'
            item['reposts_count'] = data['shared_count']
            if data['topics']:
                item['topic'] = data['topics'][0]['name']
                item['get_topic_id'] = data['topics'][0]['id']
                item['get_topic_url'] = data['topics'][0]['link']
            else:
                item['topic'] = ''
                item['get_topic_id'] = ''
                item['get_topic_url'] = ''
            # if '韩束' not in item['title'] and '韩束' not in item['content']:
            #     print('检索文章没有包含关键词，判定不符合数据......')
            #     return
            # else:
            #     print('符合检索关键词的文章......')
            #     print(item)
            logging.log(31, item)
            self.write_news_jsonfile(item)
            # self.queue.put(item)
            if int(comments) > 0:
                try:
                    self.get_note_comment(note_id, keyword, article_url,
                                          news_date, news_time, title)
                except:
                    if '503 Service Temporarily' in traceback.format_exc(
                    ) or 'requests.exceptions.SSLError' in traceback.format_exc(
                    ):
                        self.change_ip()
                    logging.error(traceback.format_exc())
                    try:
                        self.get_note_comment(note_id, keyword, article_url,
                                              news_date, news_time, title)
                    except:
                        if '503 Service Temporarily' in traceback.format_exc(
                        ) or 'requests.exceptions.SSLError' in traceback.format_exc(
                        ):
                            self.change_ip()
                        logging.error(traceback.format_exc())

    # @retry(stop_max_attempt_number=2, retry_on_exception=retry_if_key_error)
    def get_note_comment(self,
                         note_id,
                         keyword,
                         article_url,
                         news_date,
                         news_time,
                         title,
                         start='',
                         now_page=1):
        if start:
            response = self.xhsapi.get_note_comments(note_id, 20, start)
        else:
            response = self.xhsapi.get_note_comments(note_id, 20)
        # if '"result":0' in response.text and 'msg:' in response.text:
        #     del self.session_id_list[self.session_id_list.index(s)]
        #     return

        data = json.loads(response)
        # print(data)
        try:
            comment_list = data['data']["comments"]
        except:
            logging.log(31, data)
            logging.error(traceback.format_exc())
            return
        comment_count = data['data']["comment_count_l1"]

        last_comment_id = ''
        # total_item = ''
        for comment in comment_list:
            item = {}
            item['platform'] = '小红书'
            item['source_date'] = news_date
            item['source_time'] = news_time
            date_all = comment['time']
            # #转换成localtime
            time_local = time.localtime(float(date_all))
            # 转换成新的时间格式(2016-05-05 20:28:54)
            comment_date = time.strftime("%Y-%m-%d %H:%M",
                                         time_local)  # "%Y-%m-%d %H:%M:%S"
            # # 做时间判断部分---------------
            # get_news_time = time.mktime(time.strptime(str(comment_date), "%Y-%m-%d %H:%M"))
            # # end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d %H:%M"))
            # if self.start_time != '':
            #     start_time = time.mktime(time.strptime(self.start_time, "%Y-%m-%d %H:%M"))
            # else:
            #     start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d %H:%M"))
            # if float(get_news_time) < float(start_time):
            #     self.is_work = False
            #     return
            #
            # if float(start_time) <= float(get_news_time):

            get_news_time = time.mktime(
                time.strptime(str(comment_date).split(' ')[0], "%Y-%m-%d"))
            end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d"))
            if self.start_time != '':
                start_time = time.mktime(
                    time.strptime(self.start_time, "%Y-%m-%d"))
            else:
                start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d"))
            if float(get_news_time) < float(start_time):
                self.is_get_comment = False  # 返回的回答消息是按时间进行排序的，所以当时间小于指定时间时，就停止爬取，
                # break
            elif float(start_time) <= float(get_news_time) <= float(end_time):
                item['date'] = comment_date.split(' ')[0]
                item['time'] = comment_date.split(' ')[1]
                item['title'] = title
                item['author'] = comment['user']["nickname"]
                item['author_id'] = comment['user']["userid"]
                item['content'] = comment["content"]
                comment_id = comment["id"]
                last_comment_id = comment_id
                item['content_id'] = comment_id
                item['floor'] = ''
                item['keyword'] = keyword
                item['source_url'] = article_url
                item['comment_url'] = article_url
                item['views'] = ''
                item['comments_count'] = ''
                item['likes'] = comment["like_count"]
                item['dislikes'] = ''
                item['insert_time'] = str(datetime.now()).split('.')[0]
                item['update_time'] = str(datetime.now()).split('.')[0]
                item['topic_id'] = note_id
                item['file_code'] = '42'
                item['reposts_count'] = ''
                # print(item)
                # print(11111111, item)
                # item = json.dumps(dict(item), ensure_ascii=False) + '\n'
                # total_item = total_item + item
                # self.comment_queue.put(item)
                self.write_comment_jsonfile(item)
        # self.comment_queue.put

        # print(last_comment_id)
        all_page_num = math.ceil(float(int(comment_count) / 20))
        if int(all_page_num) > now_page and self.is_work:
            now_page += 1
            time.sleep(0.1)
            try:
                self.get_note_comment(note_id, keyword, article_url, news_date,
                                      news_time, title, last_comment_id,
                                      now_page)
            except:
                try:
                    self.get_note_comment(note_id, keyword, article_url,
                                          news_date, news_time, title,
                                          last_comment_id, now_page)
                except:
                    pass

    def get_user(self, user_id, page, num):
        info = random.choice(self.info)
        # info = self.sid_info
        print(self.info.index(info))
        self.xhsapi.set_smid(info['device_fingerprint'])
        self.xhsapi.set_session_id(info['sid'].split('.')[-1])
        # response = requests.get(url, params=parms, headers=headers)
        note_ret = self.xhsapi.get_user_note(user_id, page, num)
        print(1111, note_ret)
        if '参数错误' in note_ret:
            logger.log(31, '参数错误，重试.....')
            self.get_user(user_id, page, num)
            return
        # if '"result":0' in response.text and 'msg:' in response.text:
        #     logger.log(31, '无效id：', info)
        #     del self.session_id_list[self.session_id_list.index(info)]
        #     return
        if '{"msg":"","result":0,"success":true}' in note_ret:
            self.change_ip()
        #     self.session_id_error.append(info)
        #     if self.session_id_error.count(info) > 5:
        #         logger.log(31, '无效id：' + str(info))
        #         del self.session_id_list[self.session_id_list.index(info)]
        #     if self.error_count > 5:
        #         self.change_ip()
        #         self.error_count = 0
        #     self.error_count += 1
        #     self.get_user(user_id, page, num)
        #     return
        data = json.loads(note_ret)
        notes = data['data']['notes']
        if not notes:
            with open('uses_id', 'a') as f:
                f.write(user_id + '\n')
        else:
            for index, note in enumerate(notes):
                # item = {}
                # print(note)
                id = note['id']
                if not self.is_work:
                    return
                try:
                    time.sleep(1)
                    self.get_note(id, '', index)
                except:
                    if '503 Service Temporarily' in traceback.format_exc(
                    ) or 'requests.exceptions.SSLError' in traceback.format_exc(
                    ):
                        self.change_ip()
                    logging.error(traceback.format_exc())
                    try:
                        time.sleep(1)
                        self.get_note(id, '', index)
                    except:
                        if '503 Service Temporarily' in traceback.format_exc(
                        ) or 'requests.exceptions.SSLError' in traceback.format_exc(
                        ):
                            self.change_ip()
                        logging.error(traceback.format_exc())
                        try:
                            time.sleep(1)
                            self.get_note(id, '', index)
                        except:
                            if '503 Service Temporarily' in traceback.format_exc(
                            ) or 'requests.exceptions.SSLError' in traceback.format_exc(
                            ):
                                self.change_ip()
                            logging.error(traceback.format_exc())
                time.sleep(1)

    def get_topic(self, topic_id, page, num):
        """
        get topice info
        :param user_id:
        :param page:
        :param num:
        :return:
        """

        info = random.choice(self.info)
        # info = self.sid_info
        logging.log(31, self.info.index(info))
        self.xhsapi.set_smid(info['device_fingerprint'])
        self.xhsapi.set_session_id(info['sid'].split('.')[-1])
        # response = requests.get(url, params=parms, headers=headers)
        note_ret = self.xhsapi.get_topic_notes(topic_id, page, num)
        # logging.log(31, note_ret)
        if '参数错误' in note_ret:
            logger.log(31, '参数错误，重试.....')
            self.get_topic(topic_id, page, num)
            return
        # if '"result":0' in response.text and 'msg:' in response.text:
        #     logger.log(31, '无效id：', info)
        #     del self.session_id_list[self.session_id_list.index(info)]
        #     return
        if '{"msg":"","result":0,"success":true}' in note_ret:
            self.change_ip()
            #     self.session_id_error.append(info)
            #     if self.session_id_error.count(info) > 5:
            #         logger.log(31, '无效id：' + str(info))
            #         del self.session_id_list[self.session_id_list.index(info)]
            #     if self.error_count > 5:
            #         self.change_ip()
            #         self.error_count = 0
            #     self.error_count += 1
            self.get_topic(topic_id, page, num)
            return
        #     return
        data = json.loads(note_ret)
        notes = data['data']['noteList']
        if not notes:
            # with open('uses_id', 'a') as f:
            #     f.write(topic_id + '\n')
            pass
        else:
            for index, note in enumerate(notes):
                # item = {}
                # print(note)
                id = note['id']
                # if not self.is_work:
                #     return
                for i in range(10):
                    try:
                        time.sleep(1)
                        self.get_note(id, '', index)
                        break
                    except:
                        time.sleep(3)
                        logging.error(traceback.format_exc())
                time.sleep(1)

    def get_topic_v2(self, topic_id, page):
        info = random.choice(self.info)
        # info = self.sid_info
        logging.log(31, self.info.index(info))
        self.xhsapi.set_smid(info['device_fingerprint'])
        self.xhsapi.set_session_id(info['sid'].split('.')[-1])

        parms = {

            # 'sort': 'hot',  # 最热 排序
            'sort': 'time',  # 最新 排序
            'page': page,  # 最新 排序
            'pageSize': '6',
            # 'sid': info['sid'],
            'sid': 'session.1570584984409448341951',
        }

        url = 'https://www.xiaohongshu.com/fe_api/burdock/v1/page/{}/notes'.format(
            topic_id)
        for i in range(10):
            res = self.xhsapi.get_xsign(url, parms)
            # if len(res['shield']) == 32:
            break

        res = self.xhsapi.get_sign(url, parms)
        print(res)
        parms['sign'] = res['sign']
        headers = {
            # 'authorization': info['sid'],
            # 'device_id': info['deviceId'],
            'user-agent':
            'Dalvik/2.1.0 (Linux; U; Android 6.0; DIG-AL00 Build/HUAWEIDIG-AL00) Resolution/720*1280 Version/6.8.0.3 Build/6080103 Device/(HUAWEI;DIG-AL00) NetType/WiFi',
            'shield': res['shield'],
            'Host': 'www.xiaohongshu.com',
            'accept-encoding': 'gzip',
            'Connection': 'Keep-Alive',
        }

    # 写入json文件
    def write_news_jsonfile(self, item):
        # print(item)
        logging.log(31, '写入数据')
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./json_file/{}/28_{}_xiaohongshu_article_topic_time.json'.format(str(datetime.now()).split(' ')[0], str(datetime.now()).split(' ')[0]), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        try:
            self.hdfsclient.new_write(
                '{}/{}/28_{}_{}_xiaohongshu_article_topic_time.json'.format(
                    self.file_path,
                    str(datetime.now()).split(' ')[0].replace('-', ''),
                    str(datetime.now()).split(' ')[0].replace('-', '_'),
                    self.time_time),
                item,
                encoding='utf-8')
        except urllib3.exceptions.NewConnectionError:
            self.write_news_jsonfile(item)

    def write_comment_jsonfile(self, item):
        # print(item)
        logging.log(31, '写入评论')
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./json_file/{}/42_{}_xiaohongshu_comment_topic_time.json'.format(str(datetime.now()).split(' ')[0], str(datetime.now()).split(' ')[0]), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        try:
            self.hdfsclient.new_write(
                '{}/{}/42_{}_{}_xiaohongshu_comment_topic_time.json'.format(
                    self.comment_path,
                    str(datetime.now()).split(' ')[0].replace('-', ''),
                    str(datetime.now()).split(' ')[0].replace('-', '_'),
                    self.time_time),
                item,
                encoding='utf-8')
        except urllib3.exceptions.NewConnectionError:
            self.write_comment_jsonfile(item)

    def get_file_name_time(self):
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:
            num = 24
            a = str(datetime.now() - timedelta(days=1))  # 昨天时间
        num = a.split(' ')[0] + ' ' + str(num)
        return num

    def run(self):
        # excel_file = xlrd.open_workbook(r'./韩束小红书投放.xlsx')
        # excel_form = excel_file.sheet_by_name('KOC')
        # for i in range(2, 102):
        #     rows = excel_form.row_values(i)
        #     print(rows)
        #     name = rows[2]
        #     user_url = rows[3]
        #     flows = rows[4]
        #     likes = rows[5]
        #     direction = rows[6]
        #     date_time = rows[7]
        #     print(user_url)
        #     print(date_time)
        #     user_id = user_url.split('/')[-1].split('?')[0]
        #     self.is_work = True
        #     self.tiezi_list = []
        #     print(user_id)
        #     if len(str(date_time)) > 5:
        #         date_time = str(date_time)[:4]
        #     get_date = '2020-' + str(date_time).replace('.', '-')
        #     print(get_date)
        #     # str_time = time.mktime(time.strptime(get_date, "%Y-%m-%d"))
        #     # print(str_time)
        #     # self.start_time = get_date
        #     # self.end_time = get_date
        #     for i in range(1, 400):
        #         if not self.is_work:
        #             break
        #         try:
        #             time.sleep(1)
        #             self.get_topic(user_id, i, '10')
        #         except:
        #             if '503 Service Temporarily' in traceback.format_exc() or 'requests.exceptions.SSLError' in traceback.format_exc():
        #                 self.change_ip()
        #             try:
        #                 time.sleep(1)
        #                 self.get_topic(user_id, i, '10')
        #             except:
        #                 if '503 Service Temporarily' in traceback.format_exc() or 'requests.exceptions.SSLError' in traceback.format_exc():
        #                     self.change_ip()
        #                 try:
        #                     time.sleep(1)
        #                     self.get_topic(user_id, i, '10')
        #                 except:
        #                     if '503 Service Temporarily' in traceback.format_exc() or 'requests.exceptions.SSLError' in traceback.format_exc():
        #                         self.change_ip()
        #                     print(traceback.format_exc())
        #         time.sleep(1)

        # self.get_note('5ce2a1ea0000000006016cd9')
        #
        # self.get_comment('5ce2a1ea0000000006016cd9', 20)
        for i in range(1, 400):

            logging.log(31, '主贴翻页：' + str(i))
            if not self.is_work and i > 3:
                break
            for j in range(10):
                try:
                    self.get_topic('5e60bd92dd0a2a00013fe218', i, 6)
                    break
                except:
                    self.change_ip()
                    logging.error(traceback.format_exc())

Exemple #4

0

Afficher le fichier

Fichier : liba_dayli.py Projet : yuzhujiutian/nike_spider

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, file_path):

        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }

        # 时间判断部分
        date = datetime.now() - timedelta(days=7)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=1)  # 昨天时间
        yesterday = str(yesterday).split(' ')[0]
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # self.start_time = '2019-09-09'
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = yesterday
        # self.end_time = '2019-09-16'
        logging.log(31, '爬取时间段：{}到{}'.format(self.start_time, self.end_time))
        # 定义评论的抓取时间范围
        self.comment_start_time = yesterday  # 一天回复
        # self.comment_start_time = '2019-08-01'  # 一天回复
        # self.comment_start_time = ''  # 不限定时间回复
        self.comment_end_time = yesterday
        # self.comment_end_time = yesterday
        # 标记爬虫工作
        self.is_work = True
        self.file_path = file_path
        self.hdfsclient = HdfsClient(url='http://*****:*****@class="ui-list"]/li')
        for li in li_list:
            title = li.xpath('.//h2/a/text()')[0]
            news_url = li.xpath('.//h2/a/@href')[0]
            topic_time = ''.join(
                li.xpath('.//div[@class="ui-topic-attr"]/span[2]/text()')
            ).strip().split('：')[1]
            last_rreplay_time = li.xpath(
                './/div[@class="ui-topic-attr"]/span[3]/text()')[0].split(
                    '：')[1]
            views_replay = li.xpath(
                './/div[@class="ui-topic-attr"]/span[4]/text()')[0]
            # 做时间判断部分---------------
            get_time = self.time_change(topic_time)
            get_news_time = time.mktime(time.strptime(get_time, "%Y-%m-%d"))
            end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d"))
            if self.start_time != '':
                start_time = time.mktime(
                    time.strptime(self.start_time, "%Y-%m-%d"))
            else:
                start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d"))
            if float(get_news_time) < float(start_time):
                self.is_work = False

            if float(start_time) <= float(get_news_time) < float(
                    end_time):  # 符合时间段的内容
                self.get_topic_page(news_url, title, views_replay, keyword)

        a_list = data.xpath('.//a[@class="ui-page-cell"]')  # 翻页判断和操作
        for a in a_list:
            get_text = a.xpath('.//parent::a/text()')
            get_text = ''.join(get_text)
            if '下一页' in get_text:
                next_url = 'https://www.libaclub.com/' + a.xpath(
                    './/parent::a/@href')[0]
                self.get_search_page(next_url, keyword)

    def get_topic_page(self, url, title, views_replay, keyword):
        logger.log(31, '主贴url: ' + url)
        response = requests.get(url, headers=self.headers_one, timeout=60)
        data = etree.HTML(response.content.decode('utf-8', 'ignore'))
        div_list = data.xpath('.//div[@class="ui-topic"]')
        total_item = ''
        for div in div_list:
            content = div.xpath(
                './/div[@class="ui-topic-content fn-break"]/text()')[0]
            item = {}
            item['platform'] = '篱笆网'
            date_all = div.xpath(
                './/div[@class="ui-topic-operate"]/div[@class="fn-left"]/text()'
            )[0]

            item['date'] = date_all.split(' ')[0]
            item['time'] = date_all.split(' ')[1]
            try:
                item['author'] = div.xpath(
                    './/div[@class="ui-topic-author"]/p[1]/a/text()')[0]
                item['author_id'] = div.xpath(
                    './/div[@class="ui-topic-author"]/p[1]/a/@href')[0].split(
                        '/')[-1]
            except:
                item['author'] = div.xpath(
                    './/div[@class="ui-topic-author"]/p[@class="ui-topic-author-name ui-topic-author-anonymityName"]/text()'
                )[0]
                item['author_id'] = ''
            try:
                item['post_client'] = div.xpath(
                    './/div[@class="from-iphone"]/a/text()')[0]
            except:
                item['post_client'] = ''
            item['title'] = title
            item['content'] = content.strip()
            item['content_id'] = div.xpath(
                './/div[@class="ui-topic-operate"]/div[@class="fn-right"]/a[1]/@href'
            )[0].split('/')[-1].split('.')[0].split('_')[-1]
            item['brand'] = ''
            item['carseries'] = ''
            try:
                item['from'] = div.xpath(
                    './/div[@class="ui-topic-author"]/p[4]/text()')[0]
            except:
                item['from'] = ''
            item['series_url'] = ''
            item['url'] = url

            floor = div.xpath('.//span[@class="ui-dropdown-self"]/text()')[0]
            item['floor'] = floor
            item['identification'] = ''
            item['favorite'] = ''
            try:
                item['signin_time'] = div.xpath(
                    './/div[@class="ui-topic-author"]/p[3]/text()')[0]
            except:
                item['signin_time'] = ''

            if floor == '楼主':
                item['views'] = views_replay.split('/')[0]
                item['reply_no'] = views_replay.split('/')[1]
                self.source_date = date_all.split(' ')[0]
                self.source_time = date_all.split(' ')[1]
                item['is_topics'] = '是'

            else:
                item['reply_no'] = ''
                item['views'] = ''
                item['is_topics'] = '否'
            item['source_date'] = self.source_date
            item['source_time'] = self.source_time
            item['likes'] = ''
            item['is_elite'] = ''
            item['topic_count'] = ''
            item['reply_count'] = ''
            item['pick_count'] = ''
            item['follows'] = ''
            item['topic_categroy'] = ''
            item['topic_type'] = ''
            item['insert_time'] = str(datetime.now()).split('.')[0]
            item['update_time'] = str(datetime.now()).split('.')[0]
            item['topic_id'] = url.split('.h')[0].split('_')[-2]
            item['reply_floor'] = ''
            item['keyword'] = keyword
            item['file_code'] = '185'
            item['reposts_count'] = ''
            # print(item)
            item = json.dumps(dict(item), ensure_ascii=False) + '\n'
            total_item += item

        self.__write_news_jsonfile(total_item)

        if data.xpath('.//a[@class="ui-paging-next"]/@href'):  # 判断是否有下一页， 翻页操作
            next_page_url = 'https://www.libaclub.com' + data.xpath(
                './/a[@class="ui-paging-next"]/@href')[0]
            self.get_topic_page(next_page_url, title, views_replay, keyword)

    # 写入json文件
    def __write_news_jsonfile(self, item):

        # with open('./../libawang/{}_liba_news_nike.json'.format(str(datetime.now()).split(' ')[0]), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        self.hdfsclient.new_write('{}/{}/185_{}_{}_liba_news.json'.format(
            self.file_path,
            str(datetime.now()).split(' ')[0].replace('-', ''),
            str(datetime.now()).split(' ')[0].replace('-', '_'),
            self.time_time),
                                  item,
                                  encoding='utf-8')

    def time_change(self, str_time):
        """
        时间可是转换， 将‘分钟前’，‘小时前’，‘昨天’，‘前天’, '天前'，转换成标准时间格式Y-m-d h:m:s
        :param str_time:
        :return:
        """
        if '秒' in str_time or '刚刚' in str_time:
            get_time = str(datetime.now()).split('.')[0]
            return get_time

        elif '分钟' in str_time:
            get_time_num = re.search('\d{1,2}', str_time).group(0)
            get_time_num = int(get_time_num) * 60
            int_time = int(str(time.time()).split('.')[0]) - get_time_num
            # #转换成localtime
            time_local = time.localtime(float(int_time))
            # 转换成新的时间格式(2016-05-05 20:28:54)
            dt = time.strftime("%Y-%m-%d", time_local)  # "%Y-%m-%d %H:%M:%S"
            return dt

        elif '小时' in str_time:
            get_time_num = re.search('\d{1,2}', str_time).group(0)
            get_time_num = int(get_time_num) * 60 * 60
            # print(get_time_num)
            int_time = int(str(time.time()).split('.')[0]) - get_time_num
            # #转换成localtime
            time_local = time.localtime(float(int_time))
            # 转换成新的时间格式(2016-05-05 20:28:54)
            dt = time.strftime("%Y-%m-%d", time_local)  # "%Y-%m-%d %H:%M:%S"
            return dt

        elif '昨天' in str_time:
            try:
                part_time = str_time.split(' ')[1]
                yesterday = datetime.now() - timedelta(days=1)  # 昨天时间
                yesterday = str(yesterday).split(' ')[0]
            except:
                yesterday = datetime.now() - timedelta(days=1)  # 昨天时间
                yesterday = str(yesterday).split(' ')[0]
            return yesterday

        elif '前天' in str_time:
            part_time = str_time.split(' ')[1]
            two_days_ago = datetime.now() - timedelta(days=2)  # 昨天时间
            two_days_ago = str(two_days_ago).split(
                ' ')[0] + ' ' + part_time.replace('点', ':').replace('分', '')
            return two_days_ago

        elif '天前' in str_time:
            part_time = str_time.split('天前')[0]
            two_days_ago = datetime.now() - timedelta(
                days=int(part_time))  # 昨天时间
            two_days_ago = str(two_days_ago).split(' ')[0]
            return two_days_ago

        elif '年' in str_time:
            str_time = str_time.replace('年',
                                        '-').replace('月',
                                                     '-').replace('日', '')
            return str_time

        elif ' ' in str_time and '202' not in str_time:
            str_time = str(
                datetime.now()).split('-')[0] + '-' + str_time.split(' ')[0]
            return str_time
        else:
            # str_time = '2019-' + str_time.replace('月', '-').replace('日', '')
            return str_time

    def run(self):
        url = 'https://www.libaclub.com/facade.php?act=search&searchAction=keyword&keyword={}&sId=&timetype=2&timeBegin=1563938285&timeEnd=1566530285&sid=0&searchScope=0&orderBy=0&page=1'
        url_list = get_config_para('nike_daily_keywords')
        logger.log(31, url_list)
        for item in url_list:
            # print(1)
            keyword = item['keywords']
            logger.log(31, keyword)
            if keyword:
                search_url = url.format(keyword)
                try:
                    self.get_search_page(search_url, keyword)
                except:
                    logger.error(traceback.format_exc())

Exemple #5

0

Afficher le fichier

Fichier : da_he.py Projet : yuzhujiutian/nike_spider

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self):
        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }

        self.start_url = ''
        # 评论接口模板
        self.commnet_port_url = ''

        # 时间部分,按小时抓取
        # 爬虫开始抓取的日期
        date = datetime.now() - timedelta(days=7)
        news_start_time = str(date).split(' ')[0]

        # 爬虫结束的抓取日期
        current_time = datetime.now()  # 当前日期
        current_day = str(current_time).split(' ')[0]

        print('爬取时间段：{}到{}'.format(news_start_time, current_day))
        logging.info('爬取时间段：{}到{}'.format(news_start_time, current_day))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = current_day

        # 标记爬虫工作1
        self.is_break = False
        # 标记爬虫工作2
        self.is_work = False
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/forum/{}'.format(
                time.strftime('%Y%m%d')))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'                                -', '', ret1)
            ret3 = re.sub(
                r'                                                            ',
                '', ret2)
            ret4 = re.sub(r"hot\(.*\d?','", '', ret3)
            ret5 = re.sub(r'\[', '', ret4)
            ret6 = re.sub(r'\]', '', ret5)
            ret7 = re.sub(r"',", "", ret6)
            ret8 = re.sub(r"'", "", ret7)
            return ret8
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 匹配具体时间
    def clean_date(self, x):
        now = datetime.now()
        if str(x).find('昨天') != -1:
            x = datetime.strftime(now + timedelta(days=-1),
                                  '%Y-%m-%d %H:%M:%S')
        elif str(x).find('前天') != -1:
            x = datetime.strftime(now + timedelta(days=-2),
                                  '%Y-%m-%d %H:%M:%S')
        elif str(x).find('天前') != -1:
            x = datetime.strftime(
                now + timedelta(days=-int(str(x).replace('天前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('小时前') != -1:
            x = datetime.strftime(
                now + timedelta(hours=-int(str(x).replace('小时前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('分钟前') != -1:
            x = datetime.strftime(
                now + timedelta(minutes=-int(str(x).replace('分钟前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('今天') != -1:
            x = str(x).replace('今天', now.strftime('%Y-%m-%d') + ' ')
        elif str(x).find('刚刚') != -1:
            x = now.strftime('%Y-%m-%d %H:%M:%S')
        elif str(x).find('秒前') != -1:
            x = now.strftime('%Y-%m-%d %H:%M:%S')
        elif str(x).find('月前') != -1:
            x = datetime.strftime(
                now + timedelta(weeks=-4 * int(str(x).replace('月前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('周前') != -1:
            x = datetime.strftime(
                now + timedelta(weeks=-int(str(x).replace('周前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('[') != -1:
            x = x.replace('[', '').replace(']', '')
        elif str(x).find('月') != -1:
            x = x.replace('月', '-').replace('日', '')
        return x

    def parse_goods_id(self, key_word):
        try:
            # key_word_data = urllib.parse.quote(key_word)
            url = 'http://bbs.dahe.cn/search.php?mod=forum'
            headers = {
                'Content-Type':
                'application/x-www-form-urlencoded',
                'Cookie':
                's8hO_404f_saltkey=tvEEW5wV; s8hO_404f_lastvisit=1568680094; s8hO_404f_sid=IHtErs; PHPSESSID=nr01ffrg19e81likscg0lmejb2; __asc=be50d61716d3cda6bb0dc6485ed; __auc=be50d61716d3cda6bb0dc6485ed; Hm_lvt_49fc517ed1175ad0089c07fe695a54c4=1568684010; s8hO_404f_lastact=1568683853%09search.php%09forum; Hm_lpvt_49fc517ed1175ad0089c07fe695a54c4=1568684168',
                'Host':
                'bbs.dahe.cn',
                'Origin':
                'http://bbs.dahe.cn',
                'Pragma':
                'no-cache',
                'Referer':
                'http://bbs.dahe.cn/search.php?mod=forum',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            form_data = {
                'formhash': '89e49222',
                'srchtxt': key_word.encode('gbk'),
                'searchsubmit': 'yes'
            }
            try:
                time.sleep(0.2)
                response = requests.post(url=url,
                                         headers=headers,
                                         data=form_data)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.post(url=url,
                                             headers=headers,
                                             proxies=proxies,
                                             data=form_data)
                except:
                    time.sleep(0.2)
                    response = requests.post(url=url,
                                             headers=headers,
                                             proxies=proxies,
                                             data=form_data)
            response.encoding = 'gbk'
            print(response.url)
            searchid = self.re_not_number(response.url.split('&')[1])
            print('关键词解析对应id是：', searchid)
            is_break = self.is_break
            insert_time = time.strftime('%Y-%m-%d %H:%M:%S')
            url = 'http://bbs.dahe.cn/search.php?mod=forum&searchid={}&orderby=dateline&ascdesc=desc&searchsubmit=yes&page={}'
            # print(url)
            headers = {
                'Content-Type':
                'text/html; charset=gbk',
                # 'Cookie': 's8hO_404f_saltkey=T4WK2597; s8hO_404f_lastvisit=1566265382; PHPSESSID=hp8k3kq01k4p4et54us1vljsu7; Hm_lvt_49fc517ed1175ad0089c07fe695a54c4=1566269243; yfx_c_g_u_id_10000033=_ck19082010472216611967379906556; __auc=d9a596fe16cacec003e8f31e310; s8hO_404f_atarget=1; __asc=cbf1082316cb721670e06723157; zycna=tzGXcwYAChsBAXxONRbq5Xoc; yfx_c_g_u_id_10000007=_ck19082210393212688365475513495; yfx_f_l_v_t_10000007=f_t_1566441572262__r_t_1566441572262__v_t_1566441572262__r_c_0; wdcid=0cb840f230762783; s8hO_404f_yy_ad_status=2; yfx_f_l_v_t_10000033=f_t_1566269242659__r_t_1566440515358__v_t_1566442626841__r_c_1; s8hO_404f_st_t=0%7C1566443342%7Ce4370d9ec8f238172511195afa70bf43; s8hO_404f_forum_lastvisit=D_1496_1566440306D_1880_1566440345D_2988_1566443342; s8hO_404f_st_p=0%7C1566443988%7C5efa9cc93f4efcd80a2db1e41de54594; s8hO_404f_visitedfid=261D2988D1889D1006D780D1875D1213D1778D1880D1496; s8hO_404f_viewid=tid_1240948; s8hO_404f_sendmail=1; s8hO_404f_sid=HXxXR3; s8hO_404f_lastact=1566444218%09search.php%09forum; Hm_lpvt_49fc517ed1175ad0089c07fe695a54c4=1566444478',
                'Host':
                'bbs.dahe.cn',
                'Pragma':
                'no-cache',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            try:
                time.sleep(0.2)
                response1 = requests.get(url=url.format(searchid, 1),
                                         headers=headers,
                                         allow_redirects=False)
            except:
                try:
                    time.sleep(0.2)
                    response1 = requests.get(url=url.format(searchid, 1),
                                             headers=headers,
                                             allow_redirects=False,
                                             proxies=proxies)
                except:
                    time.sleep(0.2)
                    response1 = requests.get(url=url.format(searchid, 1),
                                             headers=headers,
                                             allow_redirects=False,
                                             proxies=proxies)
            response1.encoding = 'gbk'
            # print(response.text)
            # 将响应转换成一个element对象
            html = etree.HTML(response1.text)
            # 获取帖子总数
            topic_num = self.re_not_number(
                self.re_html(
                    html.xpath('//div[@class="sttl mbn"]/h2/em/text()')))
            if int(topic_num) == 0:
                logger.log(
                    31, '*******-------关键词:%s 搜索不到内容-------*******' % key_word)
            else:
                # 获取帖子页数
                pages_num = int(math.ceil(float(int(topic_num) / 40)))
                logger.log(
                    31, '---关键词: %s ,搜到的帖子总数是: %s ,帖子总页数是: %s ---' %
                    (key_word, topic_num, pages_num))
                for i in range(1, int(pages_num) + 1):
                    topic_url = url.format(searchid, key_word, i)
                    # logger.log(31, '抓取第%s页数商品数据' % i)
                    try:
                        time.sleep(0.2)
                        response2 = requests.get(url=topic_url,
                                                 headers=headers,
                                                 allow_redirects=False)
                    except:
                        try:
                            time.sleep(0.2)
                            response2 = requests.get(url=topic_url,
                                                     headers=headers,
                                                     allow_redirects=False,
                                                     proxies=proxies)
                        except:
                            time.sleep(0.2)
                            response2 = requests.get(url=topic_url,
                                                     headers=headers,
                                                     allow_redirects=False,
                                                     proxies=proxies)
                    # 将响应转换成一个element对象
                    html1 = etree.HTML(response2.text)
                    # 获取帖子列表
                    topic_list = html1.xpath('//div[@class="tl"]/div[2]/ul/li')
                    # print(len(topic_list))
                    da_he_dict = dict()
                    # 遍历帖子列表
                    for data in topic_list:
                        date_time_data = data.xpath('./p[3]/span[1]/text()')[0]
                        # print(date_time_data)
                        date_data_test = date_time_data.split(' ')[0].strip()
                        # print(date_data_test)
                        # 年, 月， 日
                        year_data = date_data_test.split('-')[0]
                        month_test = date_data_test.split('-')[1]
                        day_test = date_data_test.split('-')[2]
                        if len(month_test) == 2:
                            month_data = month_test
                        else:
                            month_data = '0' + month_test
                        if len(day_test) == 2:
                            day_data = day_test
                        else:
                            day_data = '0' + day_test
                        date_data = (year_data + '-' + month_data + '-' +
                                     day_data).strip()
                        time_data = (date_time_data.split(' ')[1] +
                                     ':00').strip()
                        if self.start_time <= date_data:
                            da_he_dict['platform'] = '大河网'
                            da_he_dict['source_date'] = date_data
                            da_he_dict['source_time'] = time_data
                            da_he_dict['date'] = date_data
                            da_he_dict['time'] = time_data
                            da_he_dict['insert_time'] = insert_time
                            da_he_dict['author'] = data.xpath(
                                './p[3]/span[2]/a/text()')[0]
                            da_he_dict[
                                'author_url'] = 'http://bbs.dahe.cn/' + data.xpath(
                                    './p[3]/span[2]/a/@href')[0]
                            da_he_dict['author_id'] = self.re_not_number(
                                data.xpath('./p[3]/span[2]/a/@href')[0])
                            da_he_dict['title'] = self.re_html(
                                data.xpath('./h3/a//text()'))
                            da_he_dict[
                                'url'] = 'http://bbs.dahe.cn/' + data.xpath(
                                    './h3/a/@href')[0]
                            da_he_dict['brand'] = ''
                            da_he_dict['carseries'] = ''
                            da_he_dict['series_url'] = ''
                            # print(da_he_dict)
                            self.parse_topic_data(da_he_dict)
                        if date_data < self.start_time:
                            is_break = True
                    if is_break:
                        break
        except:
            print(111111111111111111111, traceback.format_exc())

    # 解析帖子内容
    def parse_topic_data(self, da_he_dict):
        try:
            url = da_he_dict['url']
            headers = {
                'Content-Type':
                'text/html; charset=gbk',
                'Host':
                'bbs.dahe.cn',
                'Pragma':
                'no-cache',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            # print(url)
            logger.log(31, url)
            try:
                # time.sleep(0.5)
                response = requests.get(url=url,
                                        headers=headers,
                                        allow_redirects=False)
            except:
                try:
                    # time.sleep(0.5)
                    response = requests.get(url=url,
                                            headers=headers,
                                            allow_redirects=False,
                                            proxies=proxies)
                except:
                    # time.sleep(0.5)
                    response = requests.get(url=url,
                                            headers=headers,
                                            allow_redirects=False,
                                            proxies=proxies)
            response.encoding = 'gbk'
            # 将响应转换成一个element对象
            html = etree.HTML(response.text)
            # print(response.text)
            # # 获取发帖时间
            # time_data_test = self.clean_date(self.re_html(html.xpath('//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div[1]/div/div[2]/em/text()|//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div[1]/div/div[2]/em/span/text()')).replace('\\xa0', ' ').replace('发表于  ', '').replace('发表于 ', ''))
            # # print(url)
            # print(time_data_test)
            # time_data = time_data_test.split(' ')[1]
            # lang = len(time_data_test.split(':'))
            # if int(lang) == 3:
            #     time_data_1 = time_data
            # else:
            #     time_data_1 = time_data.split(':')[0] + ':' + time_data.split(':')[1] + ':' + '00'
            # print(da_he_dict['date'], '--------', time_data_1)
            # da_he_dict['source_time'] = time_data_1
            # da_he_dict['time'] = time_data_1
            # 获取浏览数，回复数
            reply_data = html.xpath(
                '//div[@id="postlist" and @class="pl bm"]/table[1]/tr/td[1]/div/span/text()'
            )
            # print(reply_data)
            da_he_dict['reply_no'] = reply_data[4]
            da_he_dict['views'] = reply_data[1]
            # 获取发帖人客户端
            post_client = html.xpath(
                '//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div[1]/div/div[2]/span[1]/a//text()'
            )
            if post_client == []:
                da_he_dict['post_client'] = ''
            else:
                da_he_dict['post_client'] = post_client[0] + post_client[1]
            da_he_dict['content'] = self.re_html(
                html.xpath(
                    '//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div[2]/div/div[1]/table/tr/td//text()'
                )).replace('\\r',
                           '').replace('\\n',
                                       '').replace('\\u3000',
                                                   '').replace('\\xa0', '')
            da_he_dict['imageurl'] = html.xpath(
                '//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div[2]/div/div[1]//img/@src'
            )
            da_he_dict['audiourl'] = ''
            da_he_dict['content_id'] = da_he_dict['url'].split('-')[1]
            da_he_dict['from'] = ''
            da_he_dict['is_topics'] = '是'
            da_he_dict['floor'] = html.xpath(
                '//div[@id="postlist" and @class="pl bm"]/div[1]/table/tr[1]/td[2]/div/strong/a/text()'
            )[0].strip()
            da_he_dict['identification'] = ''
            da_he_dict['favorite'] = ''
            da_he_dict['signin_time'] = ''
            da_he_dict['likes'] = ''
            # 判断是否是热帖
            is_elite = html.xpath(
                '//div[@id="postlist" and @class="pl bm"]/div[1]/div/img/@title'
            )
            if is_elite == []:
                da_he_dict['is_elite'] = '否'
            else:
                da_he_dict['is_elite'] = '是'
            da_he_dict['topic_count'] = ''
            da_he_dict['reply_count'] = ''
            da_he_dict['pick_count'] = ''
            da_he_dict['follows'] = ''
            da_he_dict['topic_categroy'] = ''
            da_he_dict['topic_type'] = ''
            da_he_dict['reposts_count'] = ''
            da_he_dict['update_time'] = time.strftime('%Y-%m-%d %H:%M:%S')
            da_he_dict['topic_id'] = da_he_dict['url'].split('-')[1]
            da_he_dict['file_code'] = '182'
            # logger.log(31, '----------------正在写入主贴----------------')
            # print(da_he_dict)
            item = json.dumps(dict(da_he_dict), ensure_ascii=False) + '\n'
            self.hdfsclient.new_write(
                '/user/cspider_daily/nike_daily/forum/{}/182_{}_{}_dahe_Nike.json'
                .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d'),
                        self.time_data),
                item,
                encoding='utf-8')

            if int(da_he_dict['reply_no']) == 0:
                # logger.log(31, '没有回帖')
                pass
            else:
                # 获取回帖页数
                pages_num = int(
                    math.ceil(float(int(da_he_dict['reply_no']) / 10)))
                # logger.log(31, '回帖数: %s 回帖总页数是：%s' % (da_he_dict['reply_no'], pages_num))
                self.parse_reply(pages_num, da_he_dict)
        except:
            print(222222222222222222222, traceback.format_exc())

    # 抓取回帖内容
    def parse_reply(self, pages_num, da_he_dict):
        try:
            is_work = self.is_work
            start_time = time.strftime('%Y-%m-%d %H:%M:%S')
            headers = {
                'Content-Type':
                'text/html; charset=gbk',
                'Host':
                'bbs.dahe.cn',
                'Pragma':
                'no-cache',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            for i in range(pages_num, 0, -1):
                url = 'http://bbs.dahe.cn/thread-{}-{}-1.html'.format(
                    da_he_dict['topic_id'], i)
                try:
                    # time.sleep(1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            allow_redirects=False)
                except:
                    try:
                        # time.sleep(1)
                        response = requests.get(url=url,
                                                headers=headers,
                                                allow_redirects=False,
                                                proxies=proxies)
                    except:
                        # time.sleep(1)
                        response = requests.get(url=url,
                                                headers=headers,
                                                allow_redirects=False,
                                                proxies=proxies)
                response.encoding = 'gbk'
                # 将响应转换成一个element对象
                html = etree.HTML(response.text)
                reply_dict = dict()
                # 获取回帖列表
                reply_list = html.xpath(
                    '//div[@id="postlist" and @class="pl bm"]/div')
                # print(len(reply_list))
                for item in reply_list[::-1]:
                    floor_data = self.re_html(
                        item.xpath(
                            './table/tr[1]/td[2]/div/strong/a/text()|./tr[1]/td[2]/div[1]/strong/a//text()'
                        )).replace('\\r', '').replace('\\n', '').replace(
                            '#', '').replace(' ', '')
                    # print(floor_data)
                    url_data = response.url
                    floor_test = floor_data
                    date_time_test = item.xpath(
                        './table/tr[1]/td[2]/div[1]/div/div[2]/em/text()|./table/tr[1]/td[2]/div[1]/div/div[2]/em/span/text()'
                    )
                    # print(date_time_test)
                    if date_time_test == []:
                        pass
                    else:
                        date_time_data = self.re_html(date_time_test).replace(
                            '发表于  ', '').replace('\\xa0',
                                                 ' ').replace('发表于 ', '')
                        # print(date_time_data)
                        if re.search(r'前天|昨天', date_time_data) != None:
                            datetime_data = self.clean_date(
                                date_time_data.split(' ')[0]).split(' ')[
                                    0] + ' ' + date_time_data.split(' ')[1]
                        elif re.search(r'天前', date_time_data) != None:
                            datetime_data = self.clean_date(date_time_data)
                        else:
                            datetime_data = date_time_data
                        # print(datetime_data)
                        # 发表日期
                        date_data = datetime_data.split(' ')[0].strip()
                        date_data_test = date_data.split('-')
                        if len(date_data_test[1]) == 1 and len(
                                date_data_test[2]) == 1:
                            date_data_parse = date_data_test[
                                0] + '-0' + date_data_test[
                                    1] + '-0' + date_data_test[2]
                        elif len(date_data_test[1]) == 1 and len(
                                date_data_test[2]) != 1:
                            date_data_parse = date_data_test[
                                0] + '-0' + date_data_test[
                                    1] + '-' + date_data_test[2]
                        elif len(date_data_test[1]) != 1 and len(
                                date_data_test[2]) == 1:
                            date_data_parse = date_data_test[
                                0] + '-' + date_data_test[
                                    1] + '-0' + date_data_test[2]
                        else:
                            date_data_parse = date_data_test[
                                0] + '-' + date_data_test[
                                    1] + '-' + date_data_test[2]
                        # 发表时间
                        time_data_test = datetime_data.split(' ')[1]
                        lang = len(time_data_test.split(':'))
                        if int(lang) == 3:
                            time_data = time_data_test.strip()
                        else:
                            time_data = (time_data_test.split(':')[0] + ':' +
                                         time_data_test.split(':')[1] + ':' +
                                         '00').strip()
                        # print(date_data, '*******', time_data)
                        if self.start_time <= date_data_parse.strip():
                            reply_dict['platform'] = da_he_dict['platform']
                            reply_dict['source_date'] = da_he_dict['date']
                            reply_dict['source_time'] = da_he_dict['time']
                            reply_dict['date'] = date_data_parse.strip()
                            reply_dict['time'] = time_data
                            reply_dict['author'] = item.xpath(
                                './table/tr[1]/td[1]/div/div[1]/div/a/text()'
                            )[0]
                            reply_dict[
                                'author_url'] = 'http://bbs.dahe.cn/' + item.xpath(
                                    './table/tr[1]/td[1]/div/div[1]/div/a/@href'
                                )[0]
                            reply_dict['author_id'] = self.re_not_number(
                                item.xpath(
                                    './table/tr[1]/td[1]/div/div[1]/div/a/@href'
                                )[0])
                            reply_dict['post_client'] = da_he_dict[
                                'post_client']
                            reply_dict['title'] = da_he_dict['title']
                            reply_dict['content'] = self.re_html(
                                item.xpath(
                                    './table/tr[1]/td[2]/div[2]/div/div[1]/table/tr/td//text()'
                                )).replace('\\r', '')
                            reply_dict['imageurl'] = ''
                            reply_dict['audiourl'] = ''
                            reply_dict['content_id'] = self.re_not_number(
                                item.xpath('./@id')[0])
                            reply_dict['brand'] = ''
                            reply_dict['carseries'] = ''
                            reply_dict['from'] = ''
                            reply_dict['series_url'] = ''
                            reply_dict['url'] = url_data
                            reply_dict['is_topics'] = '否'
                            reply_dict['floor'] = floor_test
                            reply_dict['identification'] = ''
                            reply_dict['favorite'] = ''
                            reply_dict['signin_time'] = ''
                            reply_dict['reply_no'] = ''
                            reply_dict['views'] = ''
                            reply_dict['likes'] = ''
                            reply_dict['is_elite'] = da_he_dict['is_elite']
                            reply_dict['topic_count'] = ''
                            reply_dict['reply_count'] = ''
                            reply_dict['pick_count'] = ''
                            reply_dict['follows'] = ''
                            reply_dict['topic_categroy'] = ''
                            reply_dict['topic_type'] = ''
                            reply_dict['reposts_count'] = ''
                            reply_dict['insert_time'] = start_time
                            reply_dict['update_time'] = time.strftime(
                                '%Y-%m-%d %H:%M:%S')
                            reply_dict['topic_id'] = da_he_dict['topic_id']
                            reply_dict['file_code'] = '182'
                            # logger.log(31, '******************开始写入回帖数据**********************')
                            # print(reply_dict)
                            item = json.dumps(dict(reply_dict),
                                              ensure_ascii=False) + '\n'
                            self.hdfsclient.new_write(
                                '/user/cspider_daily/nike_daily/forum/{}/182_{}_{}_dahe_Nike.json'
                                .format(time.strftime('%Y%m%d'),
                                        time.strftime('%Y%m%d'),
                                        self.time_data),
                                item,
                                encoding='utf-8')

                        if date_data < self.start_time:
                            is_work = True
                    if is_work:
                        break
        except:
            print(333333333333333333333, traceback.format_exc())

    # 读取excel获取关键词
    def parse_xlsx(self):
        # 设置路径
        path = './快消采集关键词_v12_20200119.xlsx'
        # 打开execl
        workbook = xlrd.open_workbook(path)

        # 根据sheet索引或者名称获取sheet内容
        Data_sheet = workbook.sheets()[0]  # 通过索引获取

        rowNum = Data_sheet.nrows  # sheet行数
        colNum = Data_sheet.ncols  # sheet列数

        # 获取所有单元格的内容
        list = []
        for i in range(rowNum):
            rowlist = []
            for j in range(colNum):
                rowlist.append(Data_sheet.cell_value(i, j))
            list.append(rowlist)

        for data in list[1::]:
            brand = data[0]
            # print(brand)
            yield {
                '关键词': brand,
            }

    def run(self):
        key_word_list = []
        for item in self.parse_xlsx():
            # print(item)
            key_word_list.append(item)
        for item_data in key_word_list:
            time.sleep(10)
            # print(item_data['关键词'])
            self.parse_goods_id(item_data['关键词'])

Exemple #6

0

Afficher le fichier

class YiDianSpider(object):
    def __init__(self, file_path, comment_path):
        self.headers_two = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            # 'Connection':'keep-alive',
            'Cookie':
            'cn_1255169715_dplus=%7B%22distinct_id%22%3A%20%2216730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201542776168%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201542776168%7D%7D; UM_distinctid=16730471952668-0ecf0ba7ae41cb-414f0120-15f900-16730471953461; JSESSIONID=208cee9fea61049d61e7d18f9e9c275ecf530a9e308a94dde36658adc01a0594; wuid=154945905891357; wuid_createAt=2018-11-21 12:56:9',
            'Host': 'www.baidu.com',
            'Referer': 'http://www.yidianzixun.com/channel/c11',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        self.proxies = ['218.95.55.154:4243']

        # 去重列表
        self.set_list = []
        #
        self.error_url_list = []
        self.headers_one = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Host':
            'www.baidu.com',
            # 'Proxy-Connection': 'keep-alive',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }
        self.user_agent = [
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6',
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6',
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1',
        ]

        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:  # 对于凌晨 0 点的判断
            # 时间判断部分
            date = datetime.now() - timedelta(days=1)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=1)  # 昨天时间
            yesterday = str(yesterday).split(' ')[0]
        else:
            # 时间判断部分
            date = datetime.now() - timedelta(days=0)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=0)  # 昨天时间
            yesterday = str(yesterday).split(' ')[0]
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = yesterday
        try:
            self.page_ip = proxies.res_ip()
            print('ip: ', self.page_ip)
            # self.page_ip = '116.248.160.138:4261'
        except:
            time.sleep(3)
            print('调用ip时发生错误：{}'.format(traceback.format_exc()))
            logger.error('调用ip时发生错误：{}'.format(traceback.format_exc()))
            self.page_ip = proxies.res_ip()
        self.ip_count = 0

        # 定义评论的抓取时间范围
        # self.comment_start_time = yesterday  # 一天回复
        self.comment_start_time = ''  # 不限定时间回复
        self.comment_end_time = yesterday
        # self.comment_end_time = yesterday
        self.is_get_comment = True

        self.file_name_time = self.get_file_name_time()
        self.file_path = file_path
        self.comment_apth = comment_path
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        hour = str(datetime.now()).split(' ')[-1].split(':')[0]
        if str(hour) != '00':
            two_hour_ago = int(hour) - 2
            if len(str(two_hour_ago)) == 1:
                two_hour_ago = '0' + str(two_hour_ago)
            self.hour_name = str(two_hour_ago) + '_' + str(hour)
        else:
            self.hour_name = '22_24'
        self.hdfsclient.makedirs('{}/{}/{}'.format(
            self.file_path,
            self.file_name_time.split(' ')[0].replace('-', ''),
            self.hour_name))  # 创建每日文件夹
        self.hdfsclient.makedirs('{}/{}/{}'.format(
            self.comment_apth,
            self.file_name_time.split(' ')[0].replace('-', ''),
            self.hour_name))  # 创建每日文件夹
        self.time_time = str(time.time()).split('.')[0]

    def get_channel_id(self):
        url = 'http://www.yidianzixun.com/channel/c11'
        try:
            response = requests.get(url,
                                    proxies={'http': self.page_ip},
                                    timeout=30)
            data = response.content.decode()
            data = re.search('channel_id(.*?)汽车', data).group(0)
            channel_id = re.search('\d{8,15}', data).group(0)
            cokies = response.headers['Set-Cookie']
            print(cokies)
            id = re.search('JSESSIONID=([a-z0-9]{30,80});', cokies).group(1)

            return channel_id, id
        except:
            print(traceback.format_exc())

            if self.ip_count < 10:
                self.page_ip = proxies.res_ip()
                print('跟换ip中: ', self.page_ip)
                self.ip_count += 1
                time.sleep(5)
                self.get_channel_id()
            else:
                raise IndexError

    def get_news_list_port(self, url):
        headers_port = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Host': 'www.yidianzixun.com',
            'Connection': 'keep-alive',
            # 'Upgrade-Insecure-Requests': '1',
            'Referer': 'http://www.yidianzixun.com/',
            'User-Agent':
            'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Mobile Safari/537.36',
            'Cookie':
            'wuid=289836166779454; wuid_createAt=2019-10-29 16:11:43; Hm_lvt_15fafbae2b9b11d280c79eff3b840e45=1572336703; Hm_lpvt_15fafbae2b9b11d280c79eff3b840e45=1572336703; JSESSIONID=7c64fe11dc634f7bce6816ca76a196fb915ea8d8a307a0a41b26269846df44ef',
            'X-Requested-With': 'XMLHttpRequest'
        }

        # headers_port['Cookie'] = cookie
        print(url)
        response = requests.get(url,
                                headers=headers_port,
                                proxies={'http': self.page_ip})
        # print(response.url)
        # print(response.text)
        data = response.content.decode()
        data = json.loads(data)
        data = data['result']
        # print(data)
        for news in data:
            item = {}
            try:
                title = news['title']
            except:
                continue
            item['title'] = title
            itemid = news['docid']
            url = 'http://www.yidianzixun.com/article/' + itemid
            print(url)
            news_date = news['date']
            if 'V_' not in itemid:
                if url not in self.set_list:
                    # self.write_news_jsonfile(item)
                    try:
                        self.get_news_page_info(url)
                    except:
                        print(traceback.format_exc())
                    self.set_list.append(url)

    # 获取通过js生成的spt的值
    def get_spt(self, start, channel_id):
        # start = 10
        end = start + 10
        n = "/home/q/news_list_for_channel?channel_id=11756176923&cstart=0&cend=10&infinite=true&refresh=1&__from__=pc&multi=5"
        e = str(channel_id)
        # ctx = execjs.compile(
        #     '''
        #     function good (n,e,i,t){
        #         for (var o = "sptoken", a = "", c = 1; c < arguments.length; c++){
        #             o += arguments[c];
        #         }
        #         for (var c = 0; c < o.length; c++) {
        #             var r = 10 ^ o.charCodeAt(c);
        #             a += String.fromCharCode(r)
        #         }
        #         return a
        #     }
        #     '''
        # )
        # spt = ctx.call('good', n, e, start, end)
        # return spt

    def get_news_page_info(self, url):
        item = {}
        response = requests.get(url)
        print(response.url)
        data = etree.HTML(response.content.decode())
        title = data.xpath('.//h2/text()')[0]
        if data.xpath('.//a[@class="doc-source"]/text()'):
            source = data.xpath('.//a[@class="doc-source"]/text()')[0]
        else:
            source = data.xpath('.//div[@class="meta"]/span[1]/text()')[0]
        # date_time = data.xpath('.//div[@class="meta"]/span[2]/text()')[0]
        if data.xpath('.//div[@id="imedia-article"]//text()'):
            content = data.xpath('.//div[@id="imedia-article"]//text()')
        elif data.xpath('.//div[@id="imedia-article"]/article/p//text()'):
            content = data.xpath(
                './/div[@id="imedia-article"]/article/p//text()')
        elif data.xpath(
                './/div[@id="imedia-article"]/section/section//text()'):
            content = data.xpath(
                './/div[@id="imedia-article"]/section/section//text()')
        elif data.xpath('.//div[@class="content-bd"]/div/div//text()'):
            content = data.xpath('.//div[@class="content-bd"]/div/div//text()')
        elif data.xpath('.//div[@class="content-bd"]/p//text()'):
            content = data.xpath('.//div[@class="content-bd"]/p//text()')
        elif data.xpath('.//div[@class="content-bd"]/div/div/text()'):
            content = data.xpath('.//div[@class="content-bd"]/div/div//text()')
        elif data.xpath('.//div[@class="content-bd"]/section//text()'):
            content = data.xpath('.//div[@class="content-bd"]/section//text()')
        elif data.xpath('.//div[@class="content-bd"]/section/text()'):
            content = data.xpath('.//div[@class="content-bd"]/section/text()')
        elif data.xpath('.//div[@class="content-bd"]//text()'):
            content = data.xpath('.//div[@class="content-bd"]//text()')
        else:
            content = data.xpath(
                './/div[@id="imedia-article"]/section/section/section/p//text()'
            )
        content = ''.join(content)

        # get_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        item['platform'] = '一点资讯'
        item['title'] = title
        item['article_source'] = source  # 文章来源
        item['article_author'] = ''  # 文章作者
        item['content'] = content
        # if len(data.xpath('.//div[@class="meta"]/span')) == 3:
        #     date_all = data.xpath('.//div[@class="meta"]/span[3]/text()')[0]
        # elif len(data.xpath('.//div[@class="meta"]/span')) == 2:
        #     date_all = data.xpath('.//div[@class="meta"]/span[2]/text()')[0]
        # else:
        date_all = data.xpath('.//div[@class="meta"]/span//text()')
        date_all = ''.join(date_all).strip()

        try:
            if date_all == '昨天' or '小时前' in date_all:
                yesterday = datetime.now() - timedelta(days=1)  # 昨天时间
                yesterday = str(yesterday).split(' ')[0]
                # print(date_all,  yesterday)
                item['date'] = yesterday
            elif date_all == '2天前':
                yesterday = datetime.now() - timedelta(days=2)  # 2前天时间
                yesterday = str(yesterday).split(' ')[0]
                # print(date_all, yesterday)
                item['date'] = yesterday
            elif date_all == '3天前':
                yesterday = datetime.now() - timedelta(days=3)  # 3前天时间
                yesterday = str(yesterday).split(' ')[0]
                # print(date_all, yesterday)
                item['date'] = yesterday
            else:
                news_date = re.search(r'\d{4}\.\d{1,2}\.\d{1,2}',
                                      date_all).group(0)
                # print(222222, news_date)
                # print(33333, date_all)
                item['date'] = news_date.replace('.', '-')
        except:
            item['date'] = self.comment_end_time
        # print(item)
        item['time'] = ''
        item['likes'] = ''
        item['clicks'] = ''
        item['views'] = ''
        item['keyword'] = ''
        item['comments_count'] = ''
        item['article_url'] = url  # 文章详情URL
        item['dislikes'] = ''  # 踩人数
        item['series_url'] = ''  # 车系首页
        item['list_url'] = 'http://www.yidianzixun.com/channel/c11'  # 文章列表URL
        item['article_type_1st'] = ''  # 文章类型
        item['article_type_2nd'] = ''  # 文章类型
        item['insert_time'] = str(datetime.now()).split('.')[0]  # 初始爬取时间
        item['update_time'] = str(datetime.now()).split('.')[0]  # 最后爬取时间
        item['content_id'] = url.split('/')[-1].split('?')[0]  # 文章id
        item['topic_id'] = url.split('/')[-1].split('?')[0]  # 主贴id
        item['author_id'] = ''  # 作者id
        item['file_code'] = '26'  # 文件编号

        # 做时间判断部分---------------  这个部分区分于另外一个部分
        # if date_all == '昨天' or date_all == '2天前' or date_all == '3天前' or '小时前' in date_all:
        # print(date_all, '时间符合')
        # print(item)
        self.write_news_jsonfile(item)
        news_id = url.split('/')[-1]
        self.is_get_comment = True
        self.get_commnet_info(news_id, title, url, item['date'])

    # 获取评论信息
    def get_commnet_info(self,
                         news_id,
                         title,
                         source_url,
                         source_date,
                         last_comment_id=''):
        item = {}
        url = 'http://www.yidianzixun.com/home/q/getcomments?_=1542864983174&docid={}&s=&count=30&last_comment_id={}&appid=web_yidian'.format(
            str(news_id), last_comment_id)
        response = requests.get(url)
        data = json.loads(response.content.decode())
        comments = data['comments']
        if comments:
            total_item = ''
            for comment in comments:
                # print(comment)
                # print('爬取评论中')
                item['platform'] = '一点资讯'
                item['title'] = title
                content = comment['comment']
                item['content'] = content
                author = comment['nickname']
                item['author'] = author
                date_all = comment['createAt']
                comment_date = date_all.split(' ')[0]
                comment_time = date_all.split(' ')[1]
                #  评论部分做时间判断部分---------------
                get_news_time = time.mktime(
                    time.strptime(str(comment_date), "%Y-%m-%d"))
                end_time = time.mktime(
                    time.strptime(self.comment_end_time, "%Y-%m-%d"))
                if self.comment_start_time != '':
                    start_time = time.mktime(
                        time.strptime(self.comment_start_time, "%Y-%m-%d"))
                else:
                    start_time = time.mktime(
                        time.strptime('2010-1-1', "%Y-%m-%d"))
                if float(get_news_time) < float(start_time):
                    self.is_get_comment = False  # 返回的回答消息是按时间进行排序的，所以当时间小于指定时间时，就停止爬取，
                    break
                elif float(start_time) <= float(get_news_time) <= float(
                        end_time):

                    item['date'] = comment_date
                    item['time'] = comment_time
                    item['source_date'] = source_date
                    item['source_time'] = ''
                    item['source_url'] = source_url
                    item['floor'] = ''
                    item['keyword'] = ''
                    item['comment_url'] = source_url
                    item['views'] = ''
                    item['comments_count'] = ''
                    item['likes'] = ''
                    item['author_id'] = comment['userid']  # 用户id
                    item['dislikes'] = ''  # 踩人数
                    item['insert_time'] = str(
                        datetime.now()).split('.')[0]  # 初始爬取时间
                    item['update_time'] = str(
                        datetime.now()).split('.')[0]  # 最后爬取时间
                    item['content_id'] = comment['comment_id']  # 内容id
                    item['topic_id'] = source_url.split('/')[-1].split('?')[
                        0]  # 主贴id
                    item['file_code'] = '40'  # 文件编号
                    item = json.dumps(dict(item), ensure_ascii=False) + '\n'
                    total_item += item

            self.write_comment_jsonfile(total_item)
            if len(comments) == 30 and self.is_get_comment:
                last_comment_id = comments[-1]['comment_id']
                print('评论翻页')
                self.get_commnet_info(news_id,
                                      title,
                                      source_url,
                                      source_date,
                                      last_comment_id=last_comment_id)

    def write_news_jsonfile(self, item):
        logger.log(31, '正在写入新闻数据......')

        # with open('./../yidianzixun/26_{}_yidianzixun_news.json'.format(str(now_time)), 'ab') as f:
        #     f.write(item.encode('utf-8'))
        self.hdfsclient.new_write(
            '{}/{}/{}/26_{}_{}_yidianzixun_news.json'.format(
                self.file_path,
                self.file_name_time.split(' ')[0].replace('-',
                                                          ''), self.hour_name,
                str(datetime.now()).split(' ')[0].replace('-', '_'),
                self.time_time),
            item,
            encoding='utf-8')

    def write_comment_jsonfile(self, item):
        logger.log(31, '正在写入评论数据......')
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./../yidianzixun/40_{}_yidianzixun_commnet.json'.format(str(now_time)), 'ab') as f:
        #     f.write(item.encode('utf-8'))
        self.hdfsclient.new_write(
            '{}/{}/{}/40_{}_{}_yidianzixun_commnet.json'.format(
                self.comment_apth,
                self.file_name_time.split(' ')[0].replace('-',
                                                          ''), self.hour_name,
                str(datetime.now()).split(' ')[0].replace('-', '_'),
                self.time_time),
            item,
            encoding='utf-8')

    def get_news_url(self, num):
        """
        从百度搜索关键词，然后获取符合的新闻的url, 提取抓取数量
        """
        # 时间
        get_time = time.time()
        str_time = str(get_time)[:-4]
        date = datetime.now() - timedelta(days=7)
        a = str(date)[:-7]
        timeArray = time.strptime(a, "%Y-%m-%d %H:%M:%S")
        # 转换为时间戳:
        timeStamp = int(time.mktime(timeArray))
        end_time = str(timeStamp) + '.' + str_time.split('.')[1]
        print(str_time, end_time)
        # url = 'https://www.baidu.com/s?q1=汽车&q2=&q3=&q4=&gpc=stf%3D{}%2C{}%7Cstftype%3D1&ft=&q5=&q6=www.yidianzixun.com&tn=baiduadv&pn={}'.format(end_time, str_time, num)
        url = 'https://www.baidu.com/s?wd=site%3A(www.yidianzixun.com)%20%E6%B1%BD%E8%BD%A6&pn={}&oq=site%3A(www.yidianzixun.com)%20%E6%B1%BD%E8%BD%A6&ct=2097152&tn=baiduadv&ie=utf-8&si=(www.yidianzixun.com)&rsv_pq=e948db9e00097fcd&rsv_t=1273sdRx9rzb35pYERweuGf1mV6RO2BZZUthjhhdYlSidhjyUjzN%2FuD2LYJ1%2Fso&gpc=stf%3D{}%2C{}%7Cstftype%3D2&tfflag=1'.format(
            num, end_time, str_time)
        print(url)
        # ip = random.choice(self.proxies_list)
        response = requests.get(url,
                                headers=self.headers_one,
                                verify=False,
                                timeout=30)  # , proxies={'https': ip}
        content = etree.HTML(response.content.decode())
        if content.xpath('.//h3[@class="t"]/a/@href'):
            url_list = content.xpath('.//h3[@class="t"]/a/@href')
            print(url_list)
            print(len(url_list))
            for url_ch in url_list:
                response = requests.get(url_ch,
                                        headers=self.headers_two,
                                        allow_redirects=False)
                print(response.status_code)
                news_url = response.headers['Location']
                print(news_url)
                if news_url not in self.set_list:
                    try:
                        self.get_news_page_info(news_url)
                    except Exception as e:
                        print(e)
                        time.sleep(15)
                    self.set_list.append(news_url)

    def get_file_name_time(self):
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:
            num = 24
            a = str(datetime.now() - timedelta(days=1))  # 昨天时间
        num = a.split(' ')[0] + ' ' + str(num)
        return num

    def run(self):

        url = 'http://www.yidianzixun.com/home/q/news_list_for_channel'
        get_time = time.time()
        get_time = ''.join(str(get_time).split('.'))

        url_list = [
            # 体育
            'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171666&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C%3C%3C%3A%3B%3A&appid=web_yidian&_={}',
            # NBA
            'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171682&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C28%3A%3B%3A&appid=web_yidian&_={}',
            # 财经
            'http://www.yidianzixun.com/home/q/news_list_for_channel?channel_id=13402171698&cstart=0&cend=10&infinite=true&refresh=1&__from__=wap&_spt=yz~eaod%3B9%3E%3A8%3B%3D%3B%3C32%3A%3B%3A&appid=web_yidian&_={}'
        ]
        for get_url in url_list:
            for i in range(2):
                try:
                    for j in range(30):
                        url = get_url.format(
                            str(time.time()).replace('.', '')[:-4])
                        try:
                            self.get_news_list_port(url)
                        except requests.exceptions.ProxyError:
                            print(traceback.format_exc())
                            break

                except TypeError:
                    print(traceback.format_exc())
                    logger.error('内容解析错误')
                except:
                    print(traceback.format_exc())
                    logger.error('其他错误')

                time.sleep(10)
                self.page_ip = proxies.res_ip()

Exemple #7

0

Afficher le fichier

Fichier : mo_gu_jie.py Projet : yuzhujiutian/nike_spider

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self):
        # 时间部分
        # 爬虫开始抓取的日期
        date = datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]

        # 爬虫结束的抓取日期
        current_time = datetime.now()  # 当前日期
        current_day = str(current_time).split(' ')[0]

        print('爬取时间段：{}到{}'.format(news_start_time, current_day))
        logging.info('爬取时间段：{}到{}'.format(news_start_time, current_day))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = current_day
        # 标记爬虫工作
        self.is_work = False
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/ecommerce/{}'.format(
                time.strftime('%Y%m%d')))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            ret4 = re.sub(r'品牌:', '', ret3)
            ret5 = re.sub(r'\xa0', '', ret4)
            ret6 = re.sub(r'&rarr;_&rarr;', '', ret5)
            ret7 = re.sub(r'&hellip;', '', ret6)
            ret8 = re.sub(r'https:', '', ret7)
            return ret8
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 根据关键词搜索请求得到商品信息
    def parse_goods(self, key_word):
        try:
            # 根据关键词,例如：洗发水,抓取商品信息
            url = 'https://list.mogujie.com/search?q={}&cKey=43&page=1&sort=pop'.format(
                key_word)
            headers = {
                # 'authority': 'list.mogujie.com',
                # 'method': 'GET',
                # 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                # 'accept-encoding': 'gzip, deflate, br',
                # 'accept-language': 'zh-CN,zh;q=0.9',
                # 'cache-control': 'no-cache',
                'cookie':
                '__mgjuuid=ebddbce7-601f-4f3d-a860-d5ba8f411688; _TDeParam=1-1RjCYYeGOiwg6JI5UDopvg',
                'pragma':
                'no-cache',
                'upgrade-insecure-requests':
                '1',
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            try:
                time.sleep(0.1)
                response = requests.get(url=url,
                                        headers=headers,
                                        allow_redirects=False,
                                        timeout=20)
            except:
                try:
                    time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=20)
                except:
                    time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=20)
            # print(response.text)
            # 判断比对获取的是否是关键词关联搜索的商品
            rewriteKeyword = json.loads(response.text)['result']
            if 'searchRewrite' in rewriteKeyword:
                if rewriteKeyword['searchRewrite'][
                        'rewriteKeyword'] == key_word.replace(' ', ''):
                    # 获取商品总数
                    goods_num = json.loads(
                        response.text)['result']['wall']['total']
                    # 商品总页数
                    page_num = int(math.ceil(float(int(goods_num) / 75)))
                    for i in range(1, page_num + 1):
                        logger.log(
                            31,
                            '------正在抓取关键词: %s 的第: %s 页商品数据， 商品总页数是: %s ------'
                            % (key_word, i, page_num))
                        goods_url = 'https://list.mogujie.com/search?q={}&cKey=43&page={}&sort=pop'.format(
                            key_word, i)
                        try:
                            time.sleep(0.2)
                            response1 = requests.get(url=goods_url,
                                                     headers=headers,
                                                     allow_redirects=False,
                                                     timeout=20)
                        except:
                            try:
                                time.sleep(0.2)
                                response1 = requests.get(url=goods_url,
                                                         headers=headers,
                                                         proxies=proxies,
                                                         allow_redirects=False,
                                                         timeout=20)
                            except:
                                time.sleep(0.2)
                                response1 = requests.get(url=goods_url,
                                                         headers=headers,
                                                         proxies=proxies,
                                                         allow_redirects=False,
                                                         timeout=20)
                        # 获取商品列表信息节点
                        goods_list = json.loads(
                            response1.text)['result']['wall']['docs']
                        # print(len(goods_list))
                        # 遍历商品信息节点列表
                        for node in goods_list:
                            goods_dict = dict()
                            goods_dict['platform'] = '蘑菇街'
                            goods_dict['keyword'] = key_word
                            goods_dict['url'] = node['link']
                            goods_dict['imageurl'] = node['img']
                            goods_dict['audiourl'] = ''
                            goods_dict['name'] = node['title']
                            goods_dict['sales'] = ''
                            goods_dict['price'] = node['price']
                            goods_dict['itemID'] = node['tradeItemId']
                            goods_dict['brand'] = ''
                            goods_dict['focus_count'] = node['cfav']
                            # print(goods_dict)
                            self.parse_goods_details(goods_dict)
                else:
                    logger.log(31,
                               '------关键词: %s 搜索不到对应的商品数据--------' % key_word)
            else:
                # 获取商品总数
                goods_num = json.loads(
                    response.text)['result']['wall']['total']
                # 商品总页数
                page_num = int(math.ceil(float(int(goods_num) / 75)))
                for i in range(1, page_num + 1):
                    # logger.log(31, '------正在抓取关键词: %s 的第: %s 页商品数据， 商品总页数是: %s ------' % (key_word, i, page_num))
                    goods_url = 'https://list.mogujie.com/search?q={}&cKey=43&page={}&sort=pop'.format(
                        key_word, i)
                    try:
                        time.sleep(0.2)
                        response1 = requests.get(url=goods_url,
                                                 headers=headers,
                                                 allow_redirects=False,
                                                 timeout=20)
                    except:
                        try:
                            time.sleep(0.2)
                            response1 = requests.get(url=goods_url,
                                                     headers=headers,
                                                     proxies=proxies,
                                                     allow_redirects=False,
                                                     timeout=20)
                        except:
                            time.sleep(0.2)
                            response1 = requests.get(url=goods_url,
                                                     headers=headers,
                                                     proxies=proxies,
                                                     allow_redirects=False,
                                                     timeout=20)
                    # 获取商品列表信息节点
                    goods_list = json.loads(
                        response1.text)['result']['wall']['docs']
                    # print(len(goods_list))
                    # 遍历商品信息节点列表
                    for node in goods_list:
                        goods_dict = dict()
                        goods_dict['platform'] = '蘑菇街'
                        goods_dict['keyword'] = key_word
                        goods_dict['url'] = node['link']
                        goods_dict['imageurl'] = node['img']
                        goods_dict['audiourl'] = ''
                        goods_dict['name'] = node['title']
                        goods_dict['sales'] = ''
                        goods_dict['price'] = node['price']
                        goods_dict['itemID'] = node['tradeItemId']
                        goods_dict['brand'] = ''
                        goods_dict['focus_count'] = node['cfav']
                        # print(goods_dict)
                        self.parse_goods_details(goods_dict)
        except:
            print(111111111111111111111, traceback.format_exc())

    # 解析商品评论人数
    def parse_goods_details(self, goods_dict):
        try:
            headers = {
                'cookie':
                '__mgjuuid=7e841984-d679-49eb-9994-89abaec55322; _mwp_h5_token_enc=36d248108519bf86cf2fa681dbc521f8; _mwp_h5_token=3c71c26a371458b615f433396b39eccf_1564968570925; _ga=GA1.2.2057442167.1565061045; _gid=GA1.2.2144070558.1565061045; __mgjref=https%3A%2F%2Fshop.mogu.com%2Fdetail%2F1m6os9s%3Facm%3D3.ms.1_4_1m6os9s.43.1185-68998.4aiUQrym0Gs9T.sd_117-swt_43-imt_6-t_4aiUQrym0Gs9T-lc_4-pit_1-qid_21841-dit_170-idx_0-dm1_5001%26ptp%3D31.nXjSr.0.0.wLDh8N89',
                'pragma':
                'no-cache',
                'Referer':
                goods_dict['url'],
                'upgrade-insecure-requests':
                '1',
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            # print(goods_dict)
            url = 'https://rate.mogu.com/jsonp/pc.rate.ratelist/v2?pageSize=20&sort=1&isNewDetail=1&itemId={}&type=1&marketType=market_mogujie&page=1'.format(
                goods_dict['itemID'])
            try:
                time.sleep(0.2)
                response = requests.get(url=url,
                                        headers=headers,
                                        allow_redirects=False,
                                        timeout=20)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=20)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=20)
            # print(response.url)
            # print(response.text)
            commnent_num_data = re.search(r'{".*"success":true}',
                                          response.text)
            num_data = commnent_num_data.group()
            # print(num_data)
            if 'total' not in num_data:
                pass
                # logger.log(31, '-----------没有商品评论数据-----------')
            else:
                goods_dict['achieve_num'] = json.loads(
                    num_data)['data']['total']
                # 获取评论页数
                page_num = int(
                    math.ceil(float(int(goods_dict['achieve_num']) / 20)))
                # print(goods_dict['achieve_num'], page_num)
                self.goods_comments(goods_dict, page_num)
        except:
            print(2222222222222222222, traceback.format_exc())

    # 解析商品评论
    def goods_comments(self, goods_dict, page_num):
        try:
            is_break = self.is_work
            headers = {
                'cookie':
                '__mgjuuid=7e841984-d679-49eb-9994-89abaec55322; _mwp_h5_token_enc=36d248108519bf86cf2fa681dbc521f8; _mwp_h5_token=3c71c26a371458b615f433396b39eccf_1564968570925; _ga=GA1.2.2057442167.1565061045; _gid=GA1.2.2144070558.1565061045; __mgjref=https%3A%2F%2Fshop.mogu.com%2Fdetail%2F1m6os9s%3Facm%3D3.ms.1_4_1m6os9s.43.1185-68998.4aiUQrym0Gs9T.sd_117-swt_43-imt_6-t_4aiUQrym0Gs9T-lc_4-pit_1-qid_21841-dit_170-idx_0-dm1_5001%26ptp%3D31.nXjSr.0.0.wLDh8N89',
                'pragma':
                'no-cache',
                'Referer':
                goods_dict['url'],
                'upgrade-insecure-requests':
                '1',
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            # 抓取商品评论链接(总共50页,第一页从1开始)
            for i in range(1, int(page_num) + 1):
                comment_url = 'https://rate.mogu.com/jsonp/pc.rate.ratelist/v2?pageSize=20&sort=1&isNewDetail=1&itemId={}&type=1&marketType=market_mogujie&page={}'.format(
                    goods_dict['itemID'], i)
                # print(comment_url)
                # response = requests.get(url=comment_url, headers=headers, proxies=random.choice(proxies), timeout=10)
                try:
                    time.sleep(0.2)
                    response = requests.get(url=comment_url,
                                            headers=headers,
                                            allow_redirects=False,
                                            timeout=20)
                except:
                    try:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=20)
                    except:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=20)
                # print(comment_data)
                comment = re.search(r'{".*"success":true}', response.text)
                # print(comment.group())
                items = json.loads(comment.group())['data']['list']
                # print(len(items))

                goods_comment = dict()
                for item in items:
                    # print(item)
                    date_data = item['time'].replace('年', '-').replace(
                        '月', '-').replace('日', '')
                    if len(date_data.split('-')) == 2:
                        date_data_test = time.strftime('%Y') + '-' + date_data
                    else:
                        date_data_test = date_data
                    # print(date_data_test)
                    # 判断评论时间是否在规定的抓取时间内
                    if self.start_time <= date_data_test.strip():
                        goods_comment['platform'] = goods_dict['platform']
                        goods_comment['date'] = date_data_test.strip()
                        goods_comment['time'] = ''
                        goods_comment['keyword'] = goods_dict['keyword']
                        goods_comment['name'] = goods_dict['name']
                        goods_comment['imageurl'] = goods_dict['imageurl']
                        goods_comment['audiourl'] = goods_dict['audiourl']
                        goods_comment['url'] = goods_dict['url']
                        goods_comment['shop_name'] = ''
                        goods_comment['user_name'] = item['user']['uname']
                        goods_comment['content'] = item['content']
                        goods_comment['content_id'] = item['rateId']
                        goods_comment['brand'] = goods_dict['brand']
                        goods_comment['price'] = goods_dict['price']
                        goods_comment['sales'] = goods_dict['sales']
                        goods_comment['focus_count'] = goods_dict[
                            'focus_count']
                        goods_comment['comment_num'] = goods_dict[
                            'achieve_num']
                        goods_comment['views'] = ''
                        goods_comment['likes'] = ''
                        goods_comment['comments_count'] = ''
                        goods_comment['reposts_count'] = ''
                        goods_comment['author_id'] = item['user']['uid']
                        goods_comment['topic_id'] = goods_dict['itemID']
                        try:
                            goods_comment['type'] = item['style'].split(
                                ':')[1].replace(' 尺码', '')
                        except:
                            goods_comment['type'] = ''
                        try:
                            goods_comment['size'] = item['style'].split(':')[2]
                        except:
                            goods_comment['size'] = ''
                        goods_comment['file_code'] = '177'
                        # logger.log(31, '--------------正在写入符合时间的商品评论-----------------------')
                        # print(goods_comment)
                        item = json.dumps(dict(goods_comment),
                                          ensure_ascii=False) + '\n'
                        self.hdfsclient.new_write(
                            '/user/cspider_daily/nike_daily/ecommerce/{}/177_{}_{}_MoGujie_nike.json'
                            .format(time.strftime('%Y%m%d'),
                                    time.strftime('%Y%m%d'), self.time_data),
                            item,
                            encoding='utf-8')
                    if date_data.strip() < self.start_time:
                        is_break = True
                if is_break:
                    break
        except:
            print(3333333333333333333, traceback.format_exc())

    # 读取excel获取关键词
    def parse_xlsx(self):
        # 设置路径
        path = './快消采集关键词_v3_20200330.xlsx'
        # 打开execl
        workbook = xlrd.open_workbook(path)

        # 根据sheet索引或者名称获取sheet内容
        Data_sheet = workbook.sheets()[0]  # 通过索引获取

        rowNum = Data_sheet.nrows  # sheet行数
        colNum = Data_sheet.ncols  # sheet列数

        # 获取所有单元格的内容
        list = []
        for i in range(rowNum):
            rowlist = []
            for j in range(colNum):
                rowlist.append(Data_sheet.cell_value(i, j))
            list.append(rowlist)

        for data in list[1::]:
            brand = data[0]
            # print(brand)
            yield {
                '关键词': brand,
            }

    def run(self):
        key_word_list = []
        for item in self.parse_xlsx():
            # print(item)
            key_word_list.append(item)
        for item_dat in key_word_list:
            # print(item_dat['关键词'])
            self.parse_goods(item_dat['关键词'])

Exemple #8

0

Afficher le fichier

Fichier : keyword_Su_ning_2h.py Projet : yuzhujiutian/nike_spider

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, redis_example):
        # 时间部分,按小时抓取
        date_time = str(datetime.now() - timedelta(days=1)).split('.')[0]
        start_time_test = time.strftime('%Y-%m-%d 00:00:00')

        end_time = time.strftime('%Y-%m-%d %H:%M:%S')
        a = end_time.split(' ')[1].split(':')[0]

        if a == '00':
            start_time_data = date_time
            hours_name = '22_24'
            wen_jian_jia_date = str(datetime.now() - timedelta(
                days=1)).split('.')[0].split(' ')[0].replace('-', '')
        else:
            two_hours_ago = int(a) - 2
            if len(str(two_hours_ago)) == 1:
                two_hour_ago = '0' + str(two_hours_ago)
            else:
                two_hour_ago = str(two_hours_ago)
            hours_name = str(two_hour_ago) + '_' + str(a)
            start_time_data = start_time_test
            wen_jian_jia_date = time.strftime('%Y%m%d')
        print('爬取时间段：{}到{}'.format(start_time_data, end_time))
        logging.info('爬取时间段：{}到{}'.format(start_time_data, end_time))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = start_time_data
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = end_time
        # 标记爬虫工作
        self.is_break = False
        self.redis_example = redis_example
        self.pid = os.getpid()

        self.h2_name = hours_name
        self.date_time = wen_jian_jia_date
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format(
                wen_jian_jia_date, hours_name))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            ret4 = re.sub(r'品牌:', '', ret3)
            ret5 = re.sub(r'\xa0', '', ret4)
            ret6 = re.sub(r'&rarr;_&rarr;', '', ret5)
            ret7 = re.sub(r'&hellip;', '', ret6)
            ret8 = re.sub(r'https:', '', ret7)
            return ret8
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 过滤商品价格
    def re_price(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'pcData\(', '', message)
            ret2 = re.sub(r'\)', '', ret1)
            return ret2
        except:
            pass

    # 过滤商品品牌信息
    def re_brand(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'"brandName":', '', message)
            ret2 = re.sub(r'&amp;', '', ret1)
            ret3 = re.sub(r'"', '', ret2)
            return ret3
        except:
            pass

    # 根据关键词搜索请求得到商品信息
    def parse_goods_url(self, data):
        goods_dict = dict()
        goods_dict['平台'] = data['平台']
        goods_dict['关键词'] = data['关键词']
        goods_dict['URL'] = data['URL']
        goods_dict['商品名'] = data['商品名']
        goods_dict['商品图片'] = data['商品图片']
        goods_dict['shop_name'] = data['shop_name']
        # logger.log(31, '--------********正在抓取的商品是：%s********--------' % goods_dict)
        self.parse_goods_details(goods_dict)

    # 解析商品品牌信息
    def parse_goods_details(self, goods_dict):
        try:
            headers = {
                'Content-Type':
                'text/html;charset=utf-8',
                # 'Connection': 'keep-alive',
                # 'Cookie': 'SN_SESSION_ID=c55ac35a-f7d1-4b0c-b48a-f88e8bb896f4; useMp4=1.701108; _snvd=1555383181562rH9y3n/THLV; cityCode=021; districtId=12113; cityId=9264; hm_guid=ac41a4ae-4373-4445-ab29-65e90c29b272; _df_ud=60a62287-237d-4cf0-ada4-d39a276f2c2d; _device_session_id=p_2fb27762-ef79-4f07-9f25-e0acad62907a; _cp_dt=bf4a6a96-909f-450a-b7ca-2d8d0b363cee-86574; _snsr=direct%7Cdirect%7C%7C%7C; _snzwt=THiw3Z16a429d6f24nzVa227f; _snmc=1; city=1000267; province=20; district=10002671; provinceCode=20; districtCode=01; streetCode=0210199; SN_CITY=20_021_1000267_9264_01_12113_1_1; authId=si0BE64747CDCB0EC1B819BB87E6D52FC1; secureToken=E180078268FCC770B6CFC47BFC919E55; _snms=155592217017833779; smhst=651484555|0000000000a10607567457|0000000000a10010138536|0070176294a601763915|0000000000a102374199|0000000000a101822787|0000000000a11012720481|0070752460a11024165323|0070745700a193148008|0000000000a861276981|0000000000a11028136288|0070705161a11002911104|0070756234a101822780|0000000000; _snma=1%7C15553832315961909%7C1555383231596%7C1555923318059%7C1555923324804%7C140%7C9; _snmp=155592332389716467; _snmb=155591411681863515%7C1555923324825%7C1555923324807%7C37',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            }
            goods_url = goods_dict['URL']
            # print(goods_url)
            try:
                time.sleep(0.2)
                response = requests.get(url=goods_url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=goods_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=goods_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            # print(response.text)
            # print('正在抓取的页面是: %s' % goods_url)
            data = response.text
            # 用正则匹配商品品牌
            re_brand = re.search(r'"brandName":".*?"', data)
            try:
                brand_name = str(self.re_brand(re_brand.group()))
            except:
                brand_name = ''
            # print(html)
            # 用正则截取价格和评论链接里需要的两串ID
            partNumber = re.search(r'"partNumber".*?,', data)
            vendorCode = re.search(r'"vendorCode".*?,', data)
            # print(partNumber.group(), vendorCode.group())
            goods_dict['品牌'] = brand_name
            goods_dict['月销量'] = ''
            goods_dict['partNumber'] = self.re_not_number(partNumber.group())
            goods_dict['vendorCode'] = self.re_not_number(vendorCode.group())
            # print(goods_dict)
            self.goods_price(goods_dict)
        except:
            print(2222222222222222222, traceback.format_exc())

    # 抓取商品价格
    def goods_price(self, goods_dict):
        try:
            headers = {
                'Content-Type':
                'text/html; charset=UTF-8',
                # 'Connection': 'keep-alive',
                # 'Cookie': '_snvd=1555383181562rH9y3n/THLV; cityCode=021; districtId=12113; cityId=9264; hm_guid=ac41a4ae-4373-4445-ab29-65e90c29b272; _df_ud=60a62287-237d-4cf0-ada4-d39a276f2c2d; _device_session_id=p_2fb27762-ef79-4f07-9f25-e0acad62907a; _cp_dt=bf4a6a96-909f-450a-b7ca-2d8d0b363cee-86574; city=1000267; province=20; district=10002671; provinceCode=20; districtCode=01; streetCode=0210199; SN_CITY=20_021_1000267_9264_01_12113_1_1; smhst=826233089|0000000000a10243606506|0000000000a101822738|0000000000a101822744|0000000000a160764310|0000000000a122819279|0000000000a651484555|0000000000a10607567457|0000000000a10010138536|0070176294a601763915|0000000000a102374199|0000000000a101822787|0000000000a11012720481|0070752460a11024165323|0070745700a193148008|0000000000a861276981|0000000000a11028136288|0070705161a11002911104|0070756234a101822780|0000000000; _snzwt=THr1rb16a47c548027ZyP23a2; authId=si18DAC1A401B32915224307F589BD81BC; secureToken=B185BBC1B1CC477DFED182771566D4D2; _snmc=1; _snsr=direct%7Cdirect%7C%7C%7C; _snms=155598227334334657; _snma=1%7C15553832315961909%7C1555383231596%7C1555982274025%7C1555982278050%7C179%7C10; _snmp=155598227667474202; _snmb=155598227327394678%7C1555982278116%7C1555982278059%7C3',
                'Host':
                'pas.suning.com',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            }
            partNumber = goods_dict['partNumber']
            vendorCode = goods_dict['vendorCode']
            # print(partNumber, vendorCode)
            price_url = 'https://pas.suning.com/nspcsale_0_{}_{}_{}_20_021_0210101_315587_1000267_9264_12113_Z001___R9000361_1.39_0___000165956__.html?callback=pcData'.format(
                partNumber, partNumber, vendorCode)

            try:
                time.sleep(0.2)
                response = requests.get(url=price_url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=price_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=price_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            # print(response.text)
            # print(data)
            re_price_data = self.re_price(response.text)
            price_data = json.loads(re_price_data)['data']['price'][
                'saleInfo'][0]['promotionPrice']
            # print(price_data)
            if price_data.find('-') > 0:
                goods_dict['价格'] = price_data.split('-')[0]
            else:
                goods_dict['价格'] = price_data
            # print(goods_dict)
            self.parse_comment_num(goods_dict)
        except:
            print(33333333333333333333, traceback.format_exc())

    # 解析商品评价人数
    def parse_comment_num(self, goods_dict):
        try:
            headers = {
                'Content-Type':
                'application/javascript;charset=UTF-8',
                # 'Connection': 'keep-alive',
                # 'Cookie': '_snvd=1555383181562rH9y3n/THLV; cityCode=021; districtId=12113; cityId=9264; hm_guid=ac41a4ae-4373-4445-ab29-65e90c29b272; _df_ud=60a62287-237d-4cf0-ada4-d39a276f2c2d; _device_session_id=p_2fb27762-ef79-4f07-9f25-e0acad62907a; _cp_dt=bf4a6a96-909f-450a-b7ca-2d8d0b363cee-86574; city=1000267; province=20; district=10002671; provinceCode=20; districtCode=01; streetCode=0210199; SN_CITY=20_021_1000267_9264_01_12113_1_1; tradeMA=127; smhst=651484540|0000000000a826233089|0000000000a10243606506|0000000000a101822738|0000000000a101822744|0000000000a160764310|0000000000a122819279|0000000000a651484555|0000000000a10607567457|0000000000a10010138536|0070176294a601763915|0000000000a102374199|0000000000a101822787|0000000000a11012720481|0070752460a11024165323|0070745700a193148008|0000000000a861276981|0000000000a11028136288|0070705161a11002911104|0070756234a101822780|0000000000; _snms=155641574161962867; route=3798b42173574ff4536b1645bfa56286; _snzwt=THusFg16a66e65b60nBjXc7ab; authId=siB5D783545E4E209EB3048D028A03A056; secureToken=CE1EA8B12441816214F909A8C65BED87; _snma=1%7C15553832315961909%7C1555383231596%7C1556415743033%7C1556504534582%7C192%7C14; _snmc=1; _snsr=direct%7Cdirect%7C%7C%7C; _snmp=155650453353821025; _snmb=155650453458862957%7C1556504534606%7C1556504534588%7C1',
                'Host':
                'review.suning.com',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            }
            comment_num_url = 'https://review.suning.com/ajax/cluster_review_satisfy/style--{}-{}-----satisfy.htm?callback=satisfy'.format(
                goods_dict['partNumber'], goods_dict['vendorCode'])

            try:
                time.sleep(0.2)
                response = requests.get(url=comment_num_url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=comment_num_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=comment_num_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            re_comment_num = re.search(r'{"reviewCounts".*"reCloudDrill":0}',
                                       response.text)
            goods_dict['评论人数'] = json.loads(
                re_comment_num.group())['reviewCounts'][0]['totalCount']
            # print(goods_dict)
            if int(self.re_not_number(goods_dict['评论人数'])) == 0:
                logger.log(31, '-------------没有商品评论--------------')
                # print(goods_data)
                # self.write_Nike_index_jsonfile(goods_data)
            else:
                # 获取评论页数
                page_num = int(
                    math.ceil(
                        float(
                            int(self.re_not_number(goods_dict['评论人数'])) / 10)))
                # print(goods_dict['评论人数'], page_num)
                self.goods_comments(goods_dict, page_num)
        except:
            print(444444444444444444, traceback.format_exc())

    # 解析商品评论
    def goods_comments(self, goods_dict, page_num):
        try:
            is_break = self.is_break
            # print(goods_dict)
            partNumber = goods_dict['partNumber']
            vendorCode = goods_dict['vendorCode']
            headers = {
                'Content-Type':
                'application/javascript;charset=UTF-8',
                # 'Connection': 'keep-alive',
                # 'Cookie': '_snvd=1555383181562rH9y3n/THLV; cityCode=021; districtId=12113; cityId=9264; hm_guid=ac41a4ae-4373-4445-ab29-65e90c29b272; _df_ud=60a62287-237d-4cf0-ada4-d39a276f2c2d; _device_session_id=p_2fb27762-ef79-4f07-9f25-e0acad62907a; _cp_dt=bf4a6a96-909f-450a-b7ca-2d8d0b363cee-86574; city=1000267; province=20; district=10002671; provinceCode=20; districtCode=01; streetCode=0210199; SN_CITY=20_021_1000267_9264_01_12113_1_1; tradeMA=127; route=3798b42173574ff4536b1645bfa56286; _snzwt=THusFg16a66e65b60nBjXc7ab; _snsr=direct%7Cdirect%7C%7C%7C; _snmc=1; _snms=155652264991095847; authId=si07DE872B7B580CBB2CB11C7105B450A8; secureToken=5C8868551C3103287B59ADEDD6B90567; smhst=192279908|0000000000a600733096|0000000000a600479244|0000000000a10700388709|0070547159a651484540|0000000000a826233089|0000000000a10243606506|0000000000a101822738|0000000000a101822744|0000000000a160764310|0000000000a122819279|0000000000a651484555|0000000000a10607567457|0000000000a10010138536|0070176294a601763915|0000000000a102374199|0000000000a101822787|0000000000a11012720481|0070752460a11024165323|0070745700a193148008|0000000000; _snma=1%7C15553832315961909%7C1555383231596%7C1556524706411%7C1556524786984%7C224%7C15; _snmp=155652478697968344; _snmb=155652102706620667%7C1556524786995%7C1556524786988%7C28',
                'Host':
                'review.suning.com',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            }
            if int(page_num) >= 50:
                pages = 50
            else:
                pages = page_num
            # 抓取商品评论链接(总共50页,第一页从1开始)
            for i in range(1, int(pages) + 1):
                comment_url = 'https://review.suning.com/ajax/cluster_review_lists/style--{}-{}-newest-{}-default-10-----reviewList.htm?callback=reviewList'.format(
                    partNumber, vendorCode, i)
                # print(comment_url)
                try:
                    time.sleep(0.2)
                    response = requests.get(url=comment_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    try:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                    except:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                comment_data = response.text
                # print(comment_data)
                comment = re.search(r'{"commodityReviews":.*"reCloudDrill":0}',
                                    comment_data)
                items = json.loads(comment.group())['commodityReviews']
                # print(len(items))
                if len(items) == 0:
                    break
                else:
                    goods_comment = dict()
                    for data in items:
                        # print(data)
                        date_data = data['publishTime'].split(' ')[0]
                        time_data = data['publishTime'].split(' ')[1]
                        # print(date_data.strip(), time_data.strip())
                        try:
                            content = self.re_html(data['content'])
                        except:
                            content = ''
                        # 追加评论
                        try:
                            content_add = data['againReview']['againContent']
                        except:
                            content_add = ''

                        # 判断评论时间是否在规定的抓取时间内
                        if self.start_time <= data['publishTime']:
                            goods_comment['platform'] = goods_dict['平台']
                            goods_comment['date'] = date_data.strip()
                            goods_comment['time'] = time_data.strip()
                            goods_comment['keyword'] = goods_dict['关键词']
                            goods_comment['name'] = goods_dict['商品名']
                            goods_comment['imageurl'] = goods_dict['商品图片']
                            goods_comment['audiourl'] = ''
                            goods_comment['url'] = goods_dict['URL']
                            goods_comment['shop_name'] = goods_dict[
                                'shop_name']
                            goods_comment['user_name'] = data['userInfo'][
                                'nickName']
                            goods_comment[
                                'content'] = content + ';' + content_add
                            goods_comment['content_id'] = str(
                                data['commodityReviewId'])
                            goods_comment['brand'] = goods_dict['品牌']
                            goods_comment['price'] = goods_dict['价格']
                            goods_comment['sales'] = goods_dict['月销量']
                            goods_comment['focus_count'] = ''
                            goods_comment['comment_num'] = goods_dict['评论人数']
                            goods_comment['views'] = ''
                            goods_comment['author_id'] = ''
                            goods_comment['reposts_count'] = ''
                            goods_comment['topic_id'] = str(
                                goods_dict['URL'].split('/')[4].replace(
                                    '.html', ''))
                            test_data = data['commodityInfo']['charaterId1']
                            if test_data == '尺码':
                                goods_comment['type'] = data['commodityInfo'][
                                    'charaterDesc2']
                                goods_comment['size'] = data['commodityInfo'][
                                    'charaterDesc1']
                            else:
                                goods_comment['type'] = data['commodityInfo'][
                                    'charaterDesc1']
                                goods_comment['size'] = data['commodityInfo'][
                                    'charaterDesc2']
                            self.likes_comments(goods_comment)
                        if date_data.strip() < self.start_time:
                            is_break = True
                    if is_break:
                        break
        except:
            print(555555555555555555, traceback.format_exc())

    # 解析商品评论的点赞数和回复数
    def likes_comments(self, goods_comment):
        try:
            comment_id = goods_comment['topic_id']
            url = 'https://review.suning.com/ajax/useful_count/{}-usefulCnt.htm'.format(
                comment_id)
            headers = {
                'Content-Type':
                'application/javascript;charset=UTF-8',
                # 'Cookie': 'tradeMA=55; _snvd=1565067528273QvL8ia7lwZC; SN_CITY=20_021_1000267_9264_01_12113_2_0; cityCode=021; districtId=12113; cityId=9264; hm_guid=ca34f536-186e-4619-aa8f-6c8808ee39a6; _df_ud=e64b917e-c77c-46e0-9d10-d84c86c93f3a; _device_session_id=p_806c72c6-6fa6-462d-bf88-f660c7094c1a; _cp_dt=21f7906b-c341-404f-996b-4d4f2e32e4af-70039; route=e46977517568f7cad53fbfe19eaf4774; _snmc=1; _snsr=direct%7Cdirect%7C%7C%7C; authId=siC700F4CB8ABB1C2E87F1FA1E9650CF7A; secureToken=F9331FD98F503CE8898949382003910A; _snzwt=THs64g16ce02abb69OAUS9a89; _snms=156712934067680848; smhst=690105206|0000000000a10118749983|0000000000a10689501376|0070222946a10949954840|0000000000a10966209249|0000000000a10757523126|0000000000a10620476914|0000000000a11180422688|0000000000a10966225829|0000000000a769909849|0070230352a10580507394|0070222946a826193435|0000000000a10163182478|0000000000a10964625880|0000000000a10571100966|0070074453; _snma=1%7C156506752678869586%7C1565067526788%7C1567129356201%7C1567129676548%7C137%7C12; _snmp=156712967506243164; _snmb=156712899210934272%7C1567129676573%7C1567129676552%7C8',
                'Host':
                'review.suning.com',
                'Pragma':
                'no-cache',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            try:
                time.sleep(0.2)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)

            likes_comments_data = json.loads(
                response.text.replace('usefulCnt(', '').replace(')', ''))
            goods_comment['likes'] = likes_comments_data[
                'reviewUsefuAndReplylList'][0]['usefulCount']
            goods_comment['comments_count'] = likes_comments_data[
                'reviewUsefuAndReplylList'][0]['replyCount']
            goods_comment['file_code'] = '53'
            # logger.log(31, '****-------正在写入符合时间的商品评论信息-------****')
            # print(goods_comment)
            item = json.dumps(dict(goods_comment), ensure_ascii=False) + '\n'
            self.hdfsclient.new_write(
                '/user/cspider_daily/nike_2h/ecommerce/{}/{}/53_{}_Suning_nike_{}.json'
                .format(self.date_time, self.h2_name, time.strftime('%Y%m%d'),
                        self.pid),
                item,
                encoding='utf-8')
        except:
            print(6666666666666666666666666, traceback.format_exc())

    def run(self, lock):
        for num in range(1000000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('su_ning_2h')
            if str(redis_url_num) == '0':
                print(
                    '**************\nRedis消息队列中url为空，程序等待中.....\n进程等待中......\n*************'
                )
            item = self.redis_example.brpop('su_ning_2h', timeout=3600)[1]
            lock.release()
            item1 = json.loads(item.decode())
            # print(item)
            self.parse_goods_url(item1)

Exemple #9

0

Afficher le fichier

Fichier : 19_floor_hours.py Projet : tiger7456/nike_spider

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, file_path):

        self.headers_one = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:   # 对于凌晨 0 点的判断
            # 时间判断部分
            date = datetime.now() - timedelta(days=1)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=1)  # 昨天时间
            yesterday = str(yesterday).split(' ')[0]
        else:
            # 时间判断部分
            date = datetime.now() - timedelta(days=0)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=0)  # 昨天时间
            yesterday = str(yesterday).split(' ')[0]
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # self.start_time = '2019-09-09'
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = yesterday
        # self.end_time = '2019-09-16'
        logger.info('爬取时间段：{}到{}'.format(self.start_time, self.end_time))
        # logging.info('爬取时间段：{}到{}'.format(self.start_time, self.end_time))
        # 定义评论的抓取时间范围
        self.comment_start_time = yesterday  # 一天回复
        # self.comment_start_time = '2019-09-09'  # 一天回复
        # self.comment_start_time = ''  # 不限定时间回复
        self.comment_end_time = yesterday
        # self.comment_end_time = '2019-09-16'
        # 标记爬虫工作
        self.is_work = True
        self.set_list = []  #去重列表
        self.file_name_time = self.get_file_name_time()
        self.file_path = file_path
        self.hdfsclient = HdfsClient(url='http://*****:*****@class="detailtz clearall"]/li[@class="clearall"]')
        for li in li_list:
            print(11111)
            title = ''.join(li.xpath('.//p[@class="title"]//text()')).strip()
            news_date = li.xpath('.//span[@class="postdate"]/text()')[0].strip()
            news_url = 'https:' + li.xpath('.//p[@class="title"]/a/@href')[0].strip()
            views = li.xpath('.//span[@class="fr"]/span/text()')[0].split('(')[-1].split(')')[0]
            replay_no = li.xpath('.//span[@class="fr"]/a/text()')[0].split('(')[-1].split(')')[0]
            # print(title, news_date, news_url, views, replay_no)
            time.sleep(3)
            # # 做时间判断部分---------------
            # get_news_time = time.mktime(time.strptime(news_date, "%Y-%m-%d"))
            # end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d"))
            # if self.start_time != '':
            #     start_time = time.mktime(time.strptime(self.start_time, "%Y-%m-%d"))
            # else:
            #     start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d"))
            # if float(get_news_time) < float(start_time):
            #     self.is_work = False
            #
            # if float(start_time) <= float(get_news_time) < float(end_time):  # 符合时间段的内容
            print(news_date)
            if '1天' in news_date:
                print(222222, news_url)
                if news_url not in self.set_list:   # 去重判断
                    self.get_news_page(news_url, title, views, replay_no, keyword)
                    self.set_list.append(news_url)

        if data.xpath('.//a[@class="page-next"]/@href') and self.is_work:
            next_page_url = 'https:' + data.xpath('.//a[@class="page-next"]/@href')[0].strip()
            self.get_search_page(next_page_url, keyword)

    def get_news_page(self, url, title, views, replay_no, keyword, is_first=True):
        """
        帖子详情页
        :param url:
        :return:
        """
        heasers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie': 'f9big=u62; _Z3nY0d4C_=37XgPK9h; JSESSIONID=4AB05FA49E2A1477353FD49E96A7DC94; sajssdk_2015_cross_new_user=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216cadbc7b78349-00406e52f50a5c-7373e61-2304000-16cadbc7b797b8%22%2C%22%24device_id%22%3A%2216cadbc7b78349-00406e52f50a5c-7373e61-2304000-16cadbc7b797b8%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _DM_SID_=f9eb430f8631d9542bb6023a0185fd5e; _DM_S_=2446eea1926edb3b390dd3a02a7bcfb2; f19big=ip48; _dm_userinfo=%7B%22uid%22%3A0%2C%22stage%22%3A%22%22%2C%22city%22%3A%22%E4%B8%8A%E6%B5%B7%3A%E4%B8%8A%E6%B5%B7%22%2C%22ip%22%3A%22124.78.53.22%22%2C%22sex%22%3A%22%22%2C%22frontdomain%22%3A%22www.19lou.com%22%2C%22category%22%3A%22%22%7D; pm_count=%7B%7D; dayCount=%5B%5D; Hm_lvt_5185a335802fb72073721d2bb161cd94=1566282908; screen=682; _dm_tagnames=%5B%7B%22k%22%3A%22%E8%80%90%E5%85%8B%22%2C%22c%22%3A1%7D%2C%7B%22k%22%3A%22baoma%22%2C%22c%22%3A3%7D%2C%7B%22k%22%3A%22baoma%22%2C%22c%22%3A2%7D%5D; Hm_lpvt_5185a335802fb72073721d2bb161cd94=1566283069',
            'Host': 'www.19lou.com',
            'Referer': 'https://www.19lou.com/search/thread?keyword=%E8%80%90%E5%85%8B',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '******',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }
        logger.info(url)
        response = requests.get(url, headers=heasers, timeout=120)
        # print(response.text)
        data = etree.HTML(response.content.decode('gb2312', 'ignore'))
        if data.xpath('.//div[@class="user-info thread-side"]'):  # 文章类帖子

            if data.xpath('.//div[@itemprop="replyPost"]'):  # 判断文章回复
                reply_list = data.xpath('.//div[@itemprop="replyPost"]')
                for replay in reply_list:
                    content_list = replay.xpath('.//div[@class="post-cont"]//text()')
                    content = ''.join(content_list).strip()
                    # print(floor)
                    # print(content)
                    item = {}
                    item['platform'] = '19楼'
                    date_all = replay.xpath('.//div[@class="u-add link0 clearall"]/span[@class="fl"]/text()')[0]
                    print(11111, date_all)
                    item['date'] = date_all.split(' ')[0].replace('发表于', '')
                    item['time'] = date_all.split(' ')[1]
                    item['author'] = replay.xpath('.//a[@class="name"]/span/text()')[0]
                    try:
                        item['author_id'] = replay.xpath('.//a[@class="name"]/@href')[0].split('-')[1]
                    except:
                        item['author_id'] = ''
                    item['post_client'] = ''
                    item['title'] = title
                    item['content'] = content
                    item['content_id'] = replay.xpath('.//parent::div/@id')[0]
                    item['brand'] = ''
                    item['carseries'] = ''
                    item['from'] = ''
                    item['series_url'] = ''
                    item['url'] = url
                    item['is_topics'] = '否'
                    item['floor'] = replay.xpath('.//a[@itemprop="postSequenceNumber"]/text()')[0]
                    item['identification'] = ''
                    item['favorite'] = ''
                    item['signin_time'] = ''
                    item['reply_no'] = ''
                    item['views'] = ''
                    item['likes'] = ''
                    item['is_elite'] = ''
                    item['topic_count'] = ''
                    item['reply_count'] = ''
                    item['pick_count'] = ''
                    item['follows'] = ''
                    item['topic_categroy'] = ''
                    item['topic_type'] = ''
                    item['insert_time'] = str(datetime.now()).split('.')[0]
                    item['update_time'] = str(datetime.now()).split('.')[0]
                    item['topic_id'] = url.split('-')[3]
                    item['reply_floor'] = ''
                    item['keyword'] = keyword
                    item['file_code'] = '186'
                    item['reposts_count'] = ''
                    # print(item)
                    self.__write_news_jsonfile(item)

            if is_first:  # 文章类帖子，首页要抓取文章内容
                content_list = data.xpath('.//div[@class="post-cont"]//text()')
                content = ''.join(content_list).strip()
                # print(floor)
                # print(content)
                item = {}
                item['platform'] = '19楼'
                date_all = data.xpath('.//span[@class="u-add-ft"]/@title')[0]
                item['date'] = date_all.split(' ')[0].replace('发表于', '')
                item['time'] = date_all.split(' ')[1]
                item['author'] = data.xpath('.//div[@class="user-name"]/a/span/text()')[0]
                try:
                    item['author_id'] = data.xpath('.//div[@class="user-name"]/a/@href')[0].split('-')[1]
                except:
                    item['author_id'] = ''
                item['post_client'] = ''
                item['title'] = title
                item['content'] = content
                item['content_id'] = url.split('-')[3]
                item['brand'] = ''
                item['carseries'] = ''
                item['from'] = ''
                item['series_url'] = ''
                item['url'] = url
                item['is_topics'] = '是'
                item['floor'] = '楼主'
                item['identification'] = ''
                item['favorite'] = ''
                item['signin_time'] = ''
                item['reply_no'] = replay_no
                item['views'] = views
                item['likes'] = ''
                item['is_elite'] = ''
                item['topic_count'] = ''
                item['reply_count'] = ''
                item['pick_count'] = ''
                item['follows'] = ''
                item['topic_categroy'] = ''
                item['topic_type'] = ''
                item['insert_time'] = str(datetime.now()).split('.')[0]
                item['update_time'] = str(datetime.now()).split('.')[0]
                item['topic_id'] = url.split('-')[3]
                item['reply_floor'] = ''
                item['keyword'] = keyword
                item['file_code'] = '186'
                item['reposts_count'] = ''
                # print(item)
                self.__write_news_jsonfile(item)

        else:   # 论坛类帖子
            div_list = data.xpath('.//div[@id="view-bd"]/div[@id and @itemprop]')
            for div in div_list:
                content_list = div.xpath('.//div[@class="thread-cont"]//text()')
                content = ''.join(content_list).strip()
                floor = div.xpath('.//div[@class="cont-hd clearall"]/a[@data-pid]/text() | .//span[@itemprop="postSequenceNumber"]/text() | .//em[@itemprop="postSequenceNumber"]/text()')
                floor = ''.join(floor).strip()
                # print(floor)
                # print(content)
                item = {}
                item['platform'] = '19楼'
                if floor == '楼主':
                    date_all = div.xpath('.//li[@title]/@title')[0]
                else:
                    date_all = div.xpath('.//p[@class="fl link1"]/span/text()')[0]

                item['date'] = date_all.split(' ')[0].replace('发表于', '')
                item['time'] = date_all.split(' ')[1]
                item['author'] = div.xpath('.//div[@class="uname"]/a/@title')[0]
                try:
                    item['author_id'] = div.xpath('.//div[@class="uname"]/a/@href')[0].split('-')[1]
                except:
                    item['author_id'] = ''
                try:
                    item['post_client'] = div.xpath('.//p[@class="forum-source fl link0"]/a/text()')[0]
                except:
                    item['post_client'] = ''
                item['title'] = title
                item['content'] = content
                item['content_id'] = div.xpath('.//div[@id and @class="cont"]/@id')[0].replace('pid', '')
                item['brand'] = ''
                item['carseries'] = ''
                item['from'] = ''
                item['series_url'] = ''
                item['url'] = url
                if floor == '楼主':
                    is_topics ='是'
                else:
                    is_topics = '否'
                item['is_topics'] = is_topics
                item['floor'] = floor
                item['identification'] = ''
                item['favorite'] = ''
                item['signin_time'] = div.xpath('.//dl/dd[@class="color6" and @itemprop]/text()')[0]
                if is_topics == '是':
                    item['reply_no'] = replay_no
                    item['views'] = views
                else:
                    item['reply_no'] = ''
                    item['views'] = ''
                item['likes'] = ''
                item['is_elite'] = ''
                item['topic_count'] = ''
                item['reply_count'] = ''
                item['pick_count'] = ''
                item['follows'] = ''
                item['topic_categroy'] = ''
                item['topic_type'] = ''
                item['insert_time'] = str(datetime.now()).split('.')[0]
                item['update_time'] = str(datetime.now()).split('.')[0]
                item['topic_id'] = url.split('-')[3]
                item['reply_floor'] = ''
                item['keyword'] = keyword
                item['file_code'] = '186'
                item['reposts_count'] = ''

                # print(item)
                self.__write_news_jsonfile(item)

        if data.xpath('.//a[@class="page-next"]/@href'):
            next_page_url = 'https:' + data.xpath('.//a[@class="page-next"]/@href')[0].strip()
            self.get_news_page(next_page_url, title, views, replay_no, keyword, is_first=False)

    def get_file_name_time(self):
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:
            num = 24
            a = str(datetime.now() - timedelta(days=1))  # 昨天时间
        num = a.split(' ')[0] + ' ' + str(num)
        return num

    # 写入json文件
    def __write_news_jsonfile(self, item):
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./../19/{}_floor_news.json'.format(str(datetime.now()).split(' ')[0]), 'ab') as f:
        # with open('{}/{}_floor_news_adidas.json'.format(self.file_path, str(datetime.now()).split(' ')[0]), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        self.hdfsclient.new_write('{}/{}/{}/186_{}_{}_floor_news.json'.format(self.file_path, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name, str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item,encoding='utf-8')


    def run(self):
        url = 'https://www.19lou.com/search/thread?keyword={}&sorts=0&timeType=1&fids=undefined&usesearchtype=1'
        url_list = get_config_para('nike_daily_keywords')
        logger.log(31, url_list)
        for item in url_list:
            # print(1)
            keyword = item['keywords']
            logger.log(31, keyword)
            if keyword:
                search_url = url.format(keyword.strip())
                logger.info('搜索url:' + search_url)
                try:
                    self.get_search_page(search_url, keyword)
                except:
                    logger.error(traceback.format_exc())

Exemple #10

0

Afficher le fichier

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, file_path, comment_path):

        self.headers_one = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:   # 对于凌晨 0 点的判断
            # 时间判断部分
            date = datetime.now() - timedelta(days=1)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=1)  # 昨天时间
            yesterday = str(yesterday).split(' ')[0]
        else:
            # 时间判断部分
            date = datetime.now() - timedelta(days=0)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=0)  # 昨天时间
            yesterday = str(yesterday).split(' ')[0]
        # print('爬取时间段：{}到{}'.format(news_start_time, yesterday))
        #
        logger.log(31, '爬取时间段：{}到{}'.format(news_start_time, yesterday))
        #
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = yesterday
        # 标记爬虫工作
        self.is_work = True

        self.xhsapi = XhsApi('8ac1d719cd0a2d16')
        # 代理服务器
        proxyHost = "http-cla.abuyun.com"
        proxyPort = "9030"

        # 代理隧道验证信息
        proxyUser = "******"
        proxyPass = "******"

        proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
            "user": proxyUser,
            "pass": proxyPass,
        }

        self.proxies = {
            "http": proxyMeta,
            "https": proxyMeta
        }
        self.set_list = []
        # self.info = seeeion_id_list
        self.file_name_time = self.get_file_name_time()
        pool = redis.ConnectionPool(host='192.168.1.208', port=6379, password='******')  # 实现一个Redis连接池
        self.redis_example = redis.Redis(connection_pool=pool)
        self.error_count = 0

        with open('./session_id_list_hour.json') as f:
            session_id = f.read()
        self.session_id_list = eval(session_id)
        self.file_path = file_path
        self.comment_apth = comment_path
        self.hdfsclient = HdfsClient(url='http://jq-chance-05:9870', user='******')
        hour = str(datetime.now()).split(' ')[-1].split(':')[0]
        if str(hour) != '00':
            two_hour_ago = int(hour) - 2
            if len(str(two_hour_ago)) == 1:
                two_hour_ago = '0' + str(two_hour_ago)
            self.hour_name = str(two_hour_ago) + '_' + str(hour)
        else:
            self.hour_name = '22_24'
        self.hdfsclient.makedirs('{}/{}/{}'.format(self.file_path, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name))  # 创建每日文件夹
        self.hdfsclient.makedirs('{}/{}/{}'.format(self.comment_apth, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name))  # 创建每日文件夹
        self.time_time = str(time.time()).split('.')[0]
        self.session_id_error = []

    def change_ip(self):
        logger.log(31, '开始切换ip')
        url = 'http://proxy.abuyun.com/switch-ip'
        time.sleep(random.randint(1, 15))
        response = requests.get(url, proxies=self.proxies)
        logger.log(31, '现使用ip：'+ response.text)

    def res_ip_three_hour(self):
        """
        25分钟-3小时
        :return:
        """
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'webapi.http.zhimacangku.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        # 5-25分 500个ip
        url = 'http://http.tiqu.alicdns.com/getip3?num=1&type=1&pro=110000&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions=&gm=4'
        time.sleep(random.randint(3, 6))
        ip_pro = requests.get(url, headers=headers)
        ip = ip_pro.text.strip()
        return ip

    def get_serach_list(self, page, keyword):
        info = random.choice(self.session_id_list)
        # info = self.sid_info
        logger.log(31, 'session_id下标:  ' + str(self.session_id_list.index(info)))
        self.xhsapi.set_smid(info['device_fingerprint'])
        self.xhsapi.set_session_id(info['sid'].split('.')[-1])
        search_ret = self.xhsapi.search(keyword, 1, 20)
        # print(222222222222, search_ret)
        if '{"msg":"","result":0,"success":true}' in search_ret:
            self.session_id_error.append(info)
            if self.session_id_error.count(info) > 5:
                logger.log(31, '无效id：' + str(info))
                del self.session_id_list[self.session_id_list.index(info)]
            if self.error_count > 5:
                self.change_ip()
                self.error_count = 0
            self.error_count += 1
            self.get_serach_list(page, keyword)
            return

        if '参数错误' in search_ret:
            logger.log(31, '参数错误，重试.....')
            self.get_serach_list(page, keyword)
            return

        json_text = json.loads(search_ret)
        # print(json_text)
        note_list = json_text["data"]["notes"]
        for note in note_list:
            title = note["title"]
            if not title:
                title = note["desc"]
            id = note["id"]
            time.sleep(0.1)
            if id not in self.set_list and not self.redis_example.sismember('xiaohongshu_out_day_url_hour', id):
                logger.log(31, '标题：  ' + title)
                try:
                    self.get_note(id, keyword)
                except:
                    print(traceback.format_exc())
                self.set_list.append(id)
            else:
                logger.log(31, '根据去重列表和从redis中判断时间不符合......' + str(id))

    def get_note(self, note_id, keyword):
        info = random.choice(self.session_id_list)
        # info = self.sid_info
        logger.log(31, 'session_id下标:  ' + str(self.session_id_list.index(info)))

        self.xhsapi.set_smid(info['device_fingerprint'])
        self.xhsapi.set_session_id(info['sid'].split('.')[-1])
        note_ret = self.xhsapi.get_note(note_id)
        # print(333333, note_ret)

        if '参数错误' in note_ret:
            logger.log(31, '参数错误，重试.....')
            self.get_note(note_id, keyword)
            return
        # print(response.text)
        # if '"result":0' in response.text and 'msg:' in response.text:
        #     logger.log(31, '无效id：', info)
        #     del self.session_id_list[self.session_id_list.index(info)]
        #     return
        if '{"msg":"","result":0,"success":true}' in note_ret:

            self.session_id_error.append(info)
            if self.session_id_error.count(info) > 5:
                logger.log(31, '无效id：' + str(info))
                del self.session_id_list[self.session_id_list.index(info)]
            if self.error_count > 5:
                self.change_ip()
                self.error_count = 0
            self.error_count += 1
            self.get_note(note_id, keyword)
            return

        json_text = json.loads(note_ret)
        # print(11111, json_text)
        data = json_text["data"][0]['note_list'][0]
        item = {}
        item['platform'] = '小红书'
        # print(222222, data)
        date_all = data['time']
        time_local = time.localtime(float(date_all))
        # 转换成新的时间格式(2016-05-05 20:28:54)
        dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        logger.log(31, "时间:    " + str(dt))
        # # 做时间判断部分---------------
        get_news_time = time.mktime(time.strptime(str(dt).split(' ')[0], "%Y-%m-%d"))
        end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d"))
        if self.start_time != '':
            start_time = time.mktime(time.strptime(self.start_time, "%Y-%m-%d"))
        else:
            start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d"))
        if float(get_news_time) < float(start_time):
            logger.log(31, '不符合时间')
            self.redis_example.sadd('xiaohongshu_out_day_url_hour', note_id)

        elif float(start_time) <= float(get_news_time) <= float(end_time):

            # print('符合时间')
            news_date = dt.split(' ')[0]
            news_time = dt.split(' ')[1]
            item['date'] = news_date
            item['time'] = news_time
            title = data['share_info']["title"]
            item['title'] = title
            item['content'] = data["desc"]
            note_id = data["id"]
            item['content_id'] = note_id
            item['article_author'] = data["user"]["nickname"]
            item['clicks'] = ''
            item['views'] = data['view_count']
            comments = data["comments_count"]
            item['comments_count'] = comments
            item['likes'] = data["liked_count"]
            item['dislikes'] = ''
            item['keyword'] = keyword
            article_url = data['share_info']["link"]
            item['article_url'] = article_url
            item['series_url'] = ''
            item['list_url'] = ''
            item['article_type'] = ''
            item['article_source'] = ''
            item['insert_time'] = str(datetime.now()).split('.')[0]
            item['update_time'] = str(datetime.now()).split('.')[0]
            item['topic_id'] = note_id
            item['author_id'] = data["user"]["id"]
            item['file_code'] = '28'
            item['reposts_count'] = data['shared_count']
            # print(item)
            self.write_news_jsonfile(item)
            # self.queue.put(item)
            if int(comments) > 0:
                try:
                    self.get_note_comment(note_id, keyword, article_url, news_date, news_time, title)
                except:
                    logging.error(traceback.format_exc())
                    try:
                        self.get_note_comment(note_id, keyword, article_url, news_date, news_time, title)
                    except:
                        logging.error(traceback.format_exc())

    # @retry(stop_max_attempt_number=2, retry_on_exception=retry_if_key_error)
    def get_note_comment(self, note_id, keyword, article_url, news_date, news_time, title, start='', now_page=1):
        if start:
            response = self.xhsapi.get_note_comments(note_id, 20, start)
        else:
            response = self.xhsapi.get_note_comments(note_id, 20)
        # if '"result":0' in response.text and 'msg:' in response.text:
        #     del self.session_id_list[self.session_id_list.index(s)]
        #     return

        data = json.loads(response)
        # print(data)
        comment_list = data['data']["comments"]
        comment_count = data['data']["comment_count_l1"]

        last_comment_id = ''
        total_item = ''
        for comment in comment_list:
            item = {}
            item['platform'] = '小红书'
            item['source_date'] = news_date
            item['source_time'] = news_time
            date_all = comment['time']
            # #转换成localtime
            time_local = time.localtime(float(date_all))
            # 转换成新的时间格式(2016-05-05 20:28:54)
            comment_date = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
            # # 做时间判断部分---------------
            # get_news_time = time.mktime(time.strptime(str(comment_date), "%Y-%m-%d %H:%M"))
            # # end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d %H:%M"))
            # if self.start_time != '':
            #     start_time = time.mktime(time.strptime(self.start_time, "%Y-%m-%d %H:%M"))
            # else:
            #     start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d %H:%M"))
            # if float(get_news_time) < float(start_time):
            #     self.is_work = False
            #     return
            #
            # if float(start_time) <= float(get_news_time):

            get_news_time = time.mktime(time.strptime(str(comment_date).split(' ')[0], "%Y-%m-%d"))
            end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d"))
            if self.start_time != '':
                start_time = time.mktime(time.strptime(self.start_time, "%Y-%m-%d"))
            else:
                start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d"))
            if float(get_news_time) < float(start_time):
                self.is_get_comment = False  # 返回的回答消息是按时间进行排序的，所以当时间小于指定时间时，就停止爬取，
                break
            elif float(start_time) <= float(get_news_time) <= float(end_time):
                item['date'] = comment_date.split(' ')[0]
                item['time'] = comment_date.split(' ')[1]
                item['title'] = title
                item['author'] = comment['user']["nickname"]
                item['author_id'] = comment['user']["userid"]
                item['content'] = comment["content"]
                comment_id = comment["id"]
                last_comment_id = comment_id
                item['content_id'] = comment_id
                item['floor'] = ''
                item['keyword'] = keyword
                item['source_url'] = article_url
                item['comment_url'] = article_url
                item['views'] = ''
                item['comments_count'] = ''
                item['likes'] = comment["like_count"]
                item['dislikes'] = ''
                item['insert_time'] = str(datetime.now()).split('.')[0]
                item['update_time'] = str(datetime.now()).split('.')[0]
                item['topic_id'] = note_id
                item['file_code'] = '42'
                item['reposts_count'] = ''
                # print(item)
                # print(11111111, item)
                item = json.dumps(dict(item), ensure_ascii=False) + '\n'
                total_item = total_item + item
        self.write_comment_jsonfile(total_item)
        # self.comment_queue.put

        # print(last_comment_id)
        all_page_num = math.ceil(float(int(comment_count)/20))
        if int(all_page_num) > now_page and self.is_work:
            now_page += 1
            time.sleep(0.1)
            try:
                self.get_note_comment(note_id, keyword, article_url, news_date, news_time, title, last_comment_id, now_page)
            except:
                try:
                    self.get_note_comment(note_id, keyword, article_url, news_date, news_time, title, last_comment_id, now_page)
                except:
                    pass

    # 写入json文件
    def write_news_jsonfile(self, item):
        logging.log(31, '写入文章数据，')
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./28_{}_xiaohongshu_article.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        self.hdfsclient.new_write('{}/{}/{}/28_{}_{}_xiaohongshu_article.json'.format(self.file_path, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name, str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item,encoding='utf-8')

    def write_comment_jsonfile(self, item):
        logging.log(31, '写入评论数据')
        # item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./42_{}_xiaohongshu_comment.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        self.hdfsclient.new_write('{}/{}/{}/42_{}_{}_xiaohongshu_comment.json'.format(self.comment_apth, self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name, str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item,encoding='utf-8')

    def get_file_name_time(self):
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:
            num = 24
            a = str(datetime.now() - timedelta(days=1))  # 昨天时间
        num = a.split(' ')[0] + ' ' + str(num)
        return num

    def run(self):
        try:
            url_list = get_config_para('nike_daily_keywords')

            for index, item in enumerate(url_list):
                # print(1)
                keyword = item['keywords']
                logger.log(31, '关键词:' + keyword)
                for i in range(1, 20):
                    print('获取搜索列表页: ', i)
                    try:
                        self.get_serach_list(str(i), keyword)
                    except:
                        logger.error(traceback.format_exc())
        except:
            print(traceback.format_exc())
            logger.critical(traceback.format_exc())

Exemple #11

0

Afficher le fichier

class AiQiYi(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, file_path, comment_path):

        # 时间判断部分
        date = datetime.now() - timedelta(days=7)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=1)  # 昨天时间
        yesterday = str(yesterday).split(' ')[0]
        # print('爬取时间段：{}到{}'.format(news_start_time, yesterday))

        logging.info('爬取时间段：{}到{}'.format(news_start_time, yesterday))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # self.start_time = '2019-09-09'
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = yesterday
        # self.end_time = '2019-09-16'
        # 标记爬虫工作
        self.is_work = True
        self.is_stop = False
        self.file_name_time = self.get_file_name_time()
        self.file_path = file_path
        self.comment_apth = comment_path
        self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000',
                                     user='******')
        self.hdfsclient.makedirs('{}/{}'.format(
            self.file_path,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # 创建每日文件夹
        self.hdfsclient.makedirs('{}/{}'.format(
            self.comment_apth,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # 创建每日文件夹
        self.time_time = str(time.time()).split('.')[0]

    def get_video_list(self, url, keyword):
        logger.info('搜索url:  ' + url + '   ' + keyword)
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=120)
        data = etree.HTML(response.content.decode())
        # print(response.text)
        video_list = data.xpath('.//div[@class="qy-search-result-item"]')
        for video in video_list:
            try:
                video_url = video.xpath(
                    './/h3[@class="qy-search-result-tit title-line"]/a/@href'
                )[0]

                try:
                    video_time = video.xpath(
                        './/div[@class="qy-search-result-info half"]/span[@class="info-des"]/text()'
                    )[0]
                except:
                    video_time = str(datetime.now()).split(' ')[0]
                logger.info('视频时间：' + video_time)
                # 做时间判断部分---------------
                get_news_time = time.mktime(
                    time.strptime(video_time, "%Y-%m-%d"))
                end_time = time.mktime(time.strptime(self.end_time,
                                                     "%Y-%m-%d"))

                start_time = time.mktime(
                    time.strptime(self.start_time, "%Y-%m-%d"))
                if float(get_news_time) < float(start_time):
                    self.is_stop = True  # 返回的回答消息是按时间进行排序的，所以当时间小于指定时间时，就停止爬取，
                    break

                elif float(start_time) <= float(get_news_time) <= float(
                        end_time):
                    try:
                        if 'http' not in video_url:
                            video_url = 'https:' + video_url
                        print(2222222, video_url)
                        self.get_video_page(video_url, keyword)
                    except:
                        print(traceback.format_exc())
                        logger.error(traceback.format_exc())
            except:
                print(traceback.format_exc())

        if data.xpath('.//a[@data-key="down"]') and not self.is_stop:
            next_page = data.xpath('.//a[@data-key="down"]/@href')[0]
            next_page = 'https://so.iqiyi.com' + next_page.strip()
            print(next_page)
            self.get_video_list(next_page, keyword)

    def get_video_page(self, url, keyword):
        logger.info('视频url: ' + url)

        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=120)
        # data = etree.HTML(response.content.decode())
        # page_info = data.xpath('.//div[@id="iqiyi-main"]/div/@page-info')
        # print(page_info)
        if response.status_code == 200:
            text = response.text

            get_page_info = re.search("page-info=[\S\s]*video-info", text)[0]
            try:
                page_info = get_page_info[11:][:-13]
                page_info = json.loads(page_info)
            except:
                try:
                    page_info = get_page_info[11:][:-14]
                    page_info = json.loads(page_info)
                except:
                    # print(get_page_info)
                    logger.error(traceback.format_exc())
            # print(page_info)
            video_info = re.search("video-info=[\S\s]*}'>", text)[0]
            video_info = video_info[12:][:-2]
            video_info = json.loads(video_info)
            item = {}
            item['platform'] = '爱奇艺'
            date_all = video_info['firstPublishTime']
            date_all = str(date_all)[:-3]
            # #转换成localtime
            time_local = time.localtime(float(date_all))
            # 转换成新的时间格式(2016-05-05 20:28:54)
            dt = time.strftime("%Y-%m-%d %H:%M:%S",
                               time_local)  # "%Y-%m-%d %H:%M:%S"
            item['date'] = dt.split(' ')[0]  # 发布日期
            item['time'] = dt.split(' ')[1]  # 发布时间
            item['title'] = video_info['name']  # 视频标题
            item['description'] = video_info['description']  # 视频描述
            try:
                item['source_author'] = video_info['user']['name']  # 来源/上传者
                item['followers_count'] = video_info['user'][
                    'followerCount']  # 粉丝数
            except:
                item['source_author'] = ''
                item['followers_count'] = ''
            item['clicks'] = ''  # 点击数
            item['play'] = ''  # 播放量
            item['keyword'] = keyword  # 关键词
            item['url'] = url  # URL
            try:
                item['categroy'] = video_info['categories'][0]['name']  # 视频分类
            except KeyError:
                item['categroy'] = ''
            video_id = video_info['tvId']
            likes = self.get_likes_count(video_id)  # 获取点赞数
            item['likes'] = likes  # 点赞数
            page = 1
            comment_count = self.get_comment_count(video_id, page)  # 获取评论数
            item['comments_count'] = comment_count  # 评论数
            item['topic_id'] = url.split('/')[-1].split('.')[0]  # 主贴id
            item['author_id'] = video_info['user']['id']  # 作者id
            item['content_id'] = url.split('/')[-1].split('.')[0]
            item['file_code'] = '111'
            item['reposts_count'] = ''
            # print(item)
            self.write_news_jsonfile(item)
            if int(comment_count) > 0:
                self.get_comment(video_id, page, url, video_info['name'],
                                 comment_count, keyword,
                                 dt.split(' ')[0],
                                 dt.split(' ')[1])  # 获取评论

    def get_likes_count(self, video_id):
        url = 'http://iface2.iqiyi.com/like/count?businessType=14&entityId={}&qyid=63204618cb07f6722139214f3b31f1b0&callback=jsonp_1550734824178_93496'.format(
            str(video_id))
        response = requests.get(url)
        text = response.text
        text = text[30:][:-2]
        text = json.loads(text)
        likes = text['data']
        return likes

    def get_comment_count(self, video_id, page):
        """
        获取评论数量
        :param video_id:
        :param page:
        :return:
        """
        # http://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118&agent_version=9.11.5&authcookie=null&business_type=17&content_id=31067882509&hot_size=10&last_id=&page=1&page_size=10&types=hot,time&callback=jsonp_1550734826037_45721
        url = 'http://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118&agent_version=9.11.5&authcookie=null&business_type=17&content_id={}&hot_size=10&last_id=&page={}&page_size=20&types=hot,time&callback=jsonp_1550734826037_45721'.format(
            str(video_id), str(page))
        response = requests.get(url)
        text = response.text
        text = text[31:][:-14]
        text = json.loads(text)
        # print(text)
        comment_count = text['data']['count']
        # print(comment_count)
        return comment_count

    def get_comment(self, video_id, page, source_url, title, comment_count,
                    keyword, source_date, source_time):
        """
        获取评论内容, 和上面的分开写是为了方便调用和修改
        :param video_id:
        :param page:
        :return:
        """
        # http://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118&agent_version=9.11.5&authcookie=null&business_type=17&content_id=31067882509&hot_size=10&last_id=&page=1&page_size=10&types=hot,time&callback=jsonp_1550734826037_45721
        url = 'http://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118&agent_version=9.11.5&authcookie=null&business_type=17&content_id={}&hot_size=10&last_id=&page={}&page_size=20&types=hot,time&callback=jsonp_1550734826037_45721'.format(
            str(video_id), page)
        response = requests.get(url)
        text = response.text
        text = text[31:][:-14]
        text = json.loads(text)
        # print(22222, text)
        comments_list = text['data']['comments']
        for comment in comments_list:
            # print(comment)
            item = {}
            item['platform'] = '爱奇艺'

            time_all = comment['addTime']
            # #转换成localtime
            time_local = time.localtime(float(time_all))
            # 转换成新的时间格式(2016-05-05 20:28:54)
            dt = time.strftime("%Y-%m-%d %H:%M:%S",
                               time_local)  # "%Y-%m-%d %H:%M:%S"
            # print(dt)
            item['date'] = str(dt).split(' ')[0]
            item['time'] = str(dt).split(' ')[1]
            item['title'] = title
            item['author'] = comment['userInfo']['uname']
            item['content'] = comment['content']
            item['floor'] = comment['floor']
            item['keyword'] = keyword
            item['comment_url'] = source_url
            item['source_url'] = source_url
            item['comments_count'] = ''
            item['likes'] = comment['likes']
            item['views'] = ''
            item['topic_id'] = source_url.split('/')[-1].split('.')[0]  # 主贴id
            item['author_id'] = comment['userInfo']['uid']  # 作者id
            item['content_id'] = comment['id']  # 作者id
            item['file_code'] = '112'
            item['source_date'] = source_date
            item['source_time'] = source_time
            item['reposts_count'] = ''
            self.write_comment_jsonfile(item)
        if int(comment_count) > 20 * page:  # 判断评论数量，进行翻页操作
            page += 1
            self.get_comment(video_id, page, source_url, title, comment_count,
                             keyword, source_date, source_time)

    def get_file_name_time(self):
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:
            num = 24
            a = str(datetime.now() - timedelta(days=1))  # 昨天时间
        num = a.split(' ')[0] + ' ' + str(num)
        return num

    # 写入json文件
    def write_news_jsonfile(self, item):
        logging.log(31, '写入数据......')
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./../aiqiyi/json_file/{}/{}_aiqiyi_video_adidas.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        try:
            # str_time = time.time()
            self.hdfsclient.new_write(
                '{}/{}/111_{}_{}_aiqiyi_video.json'.format(
                    self.file_path,
                    str(datetime.now()).split(' ')[0].replace('-', ''),
                    str(datetime.now()).split(' ')[0].replace('-', '_'),
                    self.time_time),
                item,
                encoding='utf-8')
            # get_time = time.time() - str_time
            # print('用时：', get_time)
        except:
            logging.log(31, '视频数据写入重试中.....')
            self.write_news_jsonfile(item)

    def write_comment_jsonfile(self, item):
        logging.log(31, '写入评论数据......')
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./../aiqiyi/json_file/{}/{}_aiqiyi_video_comment_adidas.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        try:
            self.hdfsclient.new_write(
                '{}/{}/112_{}_{}_aiqiyi_video_comment.json'.format(
                    self.comment_apth,
                    str(datetime.now()).split(' ')[0].replace('-', ''),
                    str(datetime.now()).split(' ')[0].replace('-', '_'),
                    self.time_time),
                item,
                encoding='utf-8')

        except:
            logging.log(31, '评论数据写入重试中.....')
            self.write_comment_jsonfile(item)

    def run(self):

        url_list = get_config_para('nike_daily_keywords')
        logger.log(31, url_list)
        for item in url_list:
            # print(1)
            keyword = item['keywords']
            logger.log(31, keyword)
            # https://so.iqiyi.com/so/q_%E5%A5%A5%E8%BF%AAA3%E4%B8%A4%E5%8E%A2_ctg_%E6%B1%BD%E8%BD%A6_t_0_page_1_p_1_qc_0_rd__site_iqiyi_m_4_bitrate_?af=true
            # for keyword in cols:
            url = 'https://so.iqiyi.com/so/q_{}_ctg__t_0_page_1_p_1_qc_0_rd_2_site_iqiyi_m_4_bitrate_?af=true'.format(
                keyword)
            self.is_stop = False
            self.get_video_list(url, keyword)

Exemple #12

0

Afficher le fichier

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, redis_example):
        # 时间部分
        # 爬虫开始抓取的日期
        date = datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]

        # 爬虫结束的抓取日期
        current_time = datetime.now()  # 当前日期
        current_day = str(current_time).split(' ')[0]

        print('爬取时间段：{}到{}'.format(news_start_time, current_day))
        logging.info('爬取时间段：{}到{}'.format(news_start_time, current_day))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = current_day
        # 标记爬虫工作
        self.is_work = False
        self.redis_example = redis_example
        self.pid = os.getpid()
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/ecommerce/{}'.format(
                time.strftime('%Y%m%d')))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            ret4 = re.sub(r'品牌:', '', ret3)
            ret5 = re.sub(r'\xa0', '', ret4)
            ret6 = re.sub(r'&rarr;_&rarr;', '', ret5)
            ret7 = re.sub(r'&hellip;', '', ret6)
            ret8 = re.sub(r'https:', '', ret7)
            ret9 = re.sub(r'\[', '', ret8)
            ret10 = re.sub(r'\]', '', ret9)
            ret11 = re.sub(r"'", "", ret10)
            return ret11
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    def parse_goods_url(self, items):
        goods_dict = dict()
        goods_dict['平台'] = items['平台']
        goods_dict['关键词'] = items['关键词']
        goods_dict['商品名'] = items['商品名']
        goods_dict['商品图片'] = items['商品图片']
        goods_dict['URL'] = items['URL']
        goods_dict['shop_name'] = items['shop_name']
        goods_dict['价格'] = items['价格']
        goods_dict['goods_id'] = items['goods_id']
        goods_dict['品牌'] = items['品牌']
        goods_dict['月销量'] = ''
        # logger.log(31, '--------********正在抓取的商品是：%s********--------' % goods_dict)
        self.parse_comment_num(goods_dict)

    # # 解析商品品牌信息
    # def parse_goods_details(self, items):
    #     try:
    #         # print(goods_dict)
    #         goods_url = items['URL']
    #         # print(goods_url)
    #
    #         # 截取评论拼接url里面的productId
    #         productId = items['goods_id']
    #         # print(productId)
    #         headers = {
    #             'content-type': 'text/html; charset=gbk',
    #             'cookie': 'shshshfpa=32a16413-dbf0-50ea-e5b3-fc0700600c82-1555380265; shshshfpb=xpfj85AdZf7nEIXa%2FfPnKQA%3D%3D; user-key=76e73b75-478f-450a-843d-e6bc97ab6f57; TrackID=1JkU9AvzDgHTRRBHhgHdYahMQFpg9HwywXxp4mumaDTg3wgCwgl-Om3llO2sZlBTQ7ojPYO3q3E7f1jiEFu3roH67lDo9yP-tEUKh5hPh0R0; pinId=0ng4x50EOTPaVd8k7Hb6MA; pin=t15239619067; unick=t152*****067; _tp=WXVubGec3KjciXDtJzPQhA%3D%3D; _pst=t15239619067; mt_xid=V2_52007VwMWVllaW1scThxaBGIDEFFYXlRbGEwdbARlBkJVVVBVRhwZHV4ZYgRGVEEIVgpMVRxbAWYEQlNfUFQPF3kaXQVvHxNXQVhaSx9JEl8NbAAbYl9oUmoWQRhYBGULEFRVWltTGkkcWgZiMxdb; unpl=V2_ZzNtbRBSRkd2CBFULxxcBmIBFV0SUxYRfFsTAHweWAdiChReclRCFX0UR1FnGVQUZwYZXktcQRRFCEdkeB5fA2AFEFlBZxVLK14bADlNDEY1WnwHBAJfFn0PTlJ7GFQFYwIabXJUQyV1CXZUfx1YB24CEVpHUUIQdQpFUX0fXQJiByJtRWdzJXEMQFF6GGwEVwIiHxYLSxV2CkdTNhlYAWMBG1xBUEYTdA1GVngcWgNmBBdZclZzFg%3d%3d; __jdv=122270672|google-search|t_262767352_googlesearch|cpc|kwd-296971091509_0_c44c21f1e4124361a5d58bde66534872|1555655309636; cn=1; _gcl_au=1.1.1967935789.1555659711; __jdc=122270672; areaId=2; __jdu=15553802647041324770645; __jda=122270672.15553802647041324770645.1555380265.1556415731.1556518168.15; ipLoc-djd=2-2830-51800-0; wlfstk_smdl=zn0664dqolt95jf7g1wjtft1hao7l0yl; 3AB9D23F7A4B3C9B=HPX726VSHMRMSR3STZRR7N5NRDNPYWVN43VETWWM5H7ZKTJNQRUDNAN3OFAJHRA4GMFUVMZ4HQPSNV63PBO6R5QDQI; shshshfp=4a332a1f062877da491a157dabe360b2; shshshsID=60254c5e3d13551f63eed3d934c61d6d_5_1556518922567; __jdb=122270672.8.15553802647041324770645|15.1556518168',
    #             'upgrade-insecure-requests': '1',
    #             'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
    #         }
    #         try:
    #             time.sleep(0.2)
    #             response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
    #         except (requests.exceptions.ProxyError, requests.exceptions.ConnectionError, ConnectionResetError):
    #             try:
    #                 time.sleep(0.2)
    #                 response = requests.get(url=goods_url, headers=headers, allow_redirects=False, timeout=30)
    #             except:
    #                 time.sleep(0.2)
    #                 response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
    #         # print(response.content)
    #         # 将响应转换成一个element对象
    #         html = etree.HTML(response.content)
    #         # print(html)
    #         # 获取所有品牌节点列表
    #         try:
    #             pin_pai = html.xpath('//div[@class="p-parameter"]/ul[1]/li/@title')[0]
    #         except:
    #             pin_pai = ''
    #         # print(pin_pai_list)
    #         goods_dict = dict()
    #         goods_dict['平台'] = items['平台']
    #         goods_dict['关键词'] = items['关键词']
    #         goods_dict['URL'] = items['URL']
    #         goods_dict['价格'] = items['价格']
    #         goods_dict['商品名'] = items['商品名']
    #         goods_dict['品牌'] = pin_pai
    #         goods_dict['月销量'] = ''
    #         goods_dict['shop_name'] = items['shop_name']
    #         goods_dict['productId'] = productId
    #         # print(goods_dict)
    #         self.parse_comment_num(goods_dict)
    #     except:
    #         print(111111111111111111111111, traceback.format_exc())

    # 抓取商品评论数
    def parse_comment_num(self, goods_dict):
        try:
            productId = goods_dict['goods_id']
            referer_url = goods_dict['URL']
            comment_url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv46&productId={}&score=0&sortType=6&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1'.format(
                productId)
            headers = {
                'content-type':
                'text/html;charset=GBK',
                'authority':
                'sclub.jd.com',
                'method':
                'GET',
                # 'cookie': 'shshshfpa=32a16413-dbf0-50ea-e5b3-fc0700600c82-1555380265; shshshfpb=xpfj85AdZf7nEIXa%2FfPnKQA%3D%3D; user-key=76e73b75-478f-450a-843d-e6bc97ab6f57; TrackID=1JkU9AvzDgHTRRBHhgHdYahMQFpg9HwywXxp4mumaDTg3wgCwgl-Om3llO2sZlBTQ7ojPYO3q3E7f1jiEFu3roH67lDo9yP-tEUKh5hPh0R0; pinId=0ng4x50EOTPaVd8k7Hb6MA; pin=t15239619067; unick=t152*****067; _tp=WXVubGec3KjciXDtJzPQhA%3D%3D; _pst=t15239619067; mt_xid=V2_52007VwMWVllaW1scThxaBGIDEFFYXlRbGEwdbARlBkJVVVBVRhwZHV4ZYgRGVEEIVgpMVRxbAWYEQlNfUFQPF3kaXQVvHxNXQVhaSx9JEl8NbAAbYl9oUmoWQRhYBGULEFRVWltTGkkcWgZiMxdb; unpl=V2_ZzNtbRBSRkd2CBFULxxcBmIBFV0SUxYRfFsTAHweWAdiChReclRCFX0UR1FnGVQUZwYZXktcQRRFCEdkeB5fA2AFEFlBZxVLK14bADlNDEY1WnwHBAJfFn0PTlJ7GFQFYwIabXJUQyV1CXZUfx1YB24CEVpHUUIQdQpFUX0fXQJiByJtRWdzJXEMQFF6GGwEVwIiHxYLSxV2CkdTNhlYAWMBG1xBUEYTdA1GVngcWgNmBBdZclZzFg%3d%3d; __jdv=122270672|google-search|t_262767352_googlesearch|cpc|kwd-296971091509_0_c44c21f1e4124361a5d58bde66534872|1555655309636; cn=1; _gcl_au=1.1.1967935789.1555659711; __jdc=122270672; areaId=2; __jdu=15553802647041324770645; __jda=122270672.15553802647041324770645.1555380265.1556415731.1556518168.15; ipLoc-djd=2-2830-51800-0; wlfstk_smdl=zn0664dqolt95jf7g1wjtft1hao7l0yl; 3AB9D23F7A4B3C9B=HPX726VSHMRMSR3STZRR7N5NRDNPYWVN43VETWWM5H7ZKTJNQRUDNAN3OFAJHRA4GMFUVMZ4HQPSNV63PBO6R5QDQI; shshshfp=4a332a1f062877da491a157dabe360b2; shshshsID=60254c5e3d13551f63eed3d934c61d6d_8_1556519503209; __jdb=122270672.11.15553802647041324770645|15.1556518168; JSESSIONID=831DC446C63444F227CAFCFFA4085E88.s1',
                'referer':
                referer_url,
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            }
            try:
                time.sleep(0.1)
                response = requests.get(url=comment_url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.1)
                    response = requests.get(url=comment_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.1)
                    response = requests.get(url=comment_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            comment_data = re.search(r'{"productAttr":null.*]}', response.text)
            if 'commentCount' in response.text:
                comment_number = json.loads(comment_data.group(
                ))['productCommentSummary']['commentCount']
                goods_dict['comment_num'] = json.loads(comment_data.group(
                ))['productCommentSummary']['commentCountStr']
                if int(comment_number) == 0:
                    # print('****************该商品没有评论数据*********')
                    # logger.log(31, '****************该商品没有评论数据*********')
                    pass
                else:
                    pages = int(math.ceil(float(int(comment_number) / 10)))
                    self.goods_comments(goods_dict, pages)
        except:
            print(33333333333333333333333, traceback.format_exc())

    # 解析商品评论
    def goods_comments(self, goods_dict, pages):
        try:
            is_break = self.is_work
            # print(goods_dict)
            productId = goods_dict['goods_id']
            headers = {
                'content-type':
                'text/html;charset=GBK',
                'authority':
                'sclub.jd.com',
                'method':
                'GET',
                # 'cookie': 'shshshfpa=32a16413-dbf0-50ea-e5b3-fc0700600c82-1555380265; shshshfpb=xpfj85AdZf7nEIXa%2FfPnKQA%3D%3D; user-key=76e73b75-478f-450a-843d-e6bc97ab6f57; TrackID=1JkU9AvzDgHTRRBHhgHdYahMQFpg9HwywXxp4mumaDTg3wgCwgl-Om3llO2sZlBTQ7ojPYO3q3E7f1jiEFu3roH67lDo9yP-tEUKh5hPh0R0; pinId=0ng4x50EOTPaVd8k7Hb6MA; pin=t15239619067; unick=t152*****067; _tp=WXVubGec3KjciXDtJzPQhA%3D%3D; _pst=t15239619067; mt_xid=V2_52007VwMWVllaW1scThxaBGIDEFFYXlRbGEwdbARlBkJVVVBVRhwZHV4ZYgRGVEEIVgpMVRxbAWYEQlNfUFQPF3kaXQVvHxNXQVhaSx9JEl8NbAAbYl9oUmoWQRhYBGULEFRVWltTGkkcWgZiMxdb; unpl=V2_ZzNtbRBSRkd2CBFULxxcBmIBFV0SUxYRfFsTAHweWAdiChReclRCFX0UR1FnGVQUZwYZXktcQRRFCEdkeB5fA2AFEFlBZxVLK14bADlNDEY1WnwHBAJfFn0PTlJ7GFQFYwIabXJUQyV1CXZUfx1YB24CEVpHUUIQdQpFUX0fXQJiByJtRWdzJXEMQFF6GGwEVwIiHxYLSxV2CkdTNhlYAWMBG1xBUEYTdA1GVngcWgNmBBdZclZzFg%3d%3d; __jdv=122270672|google-search|t_262767352_googlesearch|cpc|kwd-296971091509_0_c44c21f1e4124361a5d58bde66534872|1555655309636; cn=1; _gcl_au=1.1.1967935789.1555659711; __jdc=122270672; areaId=2; __jdu=15553802647041324770645; __jda=122270672.15553802647041324770645.1555380265.1556415731.1556518168.15; ipLoc-djd=2-2830-51800-0; wlfstk_smdl=zn0664dqolt95jf7g1wjtft1hao7l0yl; 3AB9D23F7A4B3C9B=HPX726VSHMRMSR3STZRR7N5NRDNPYWVN43VETWWM5H7ZKTJNQRUDNAN3OFAJHRA4GMFUVMZ4HQPSNV63PBO6R5QDQI; shshshfp=4a332a1f062877da491a157dabe360b2; shshshsID=60254c5e3d13551f63eed3d934c61d6d_8_1556519503209; __jdb=122270672.11.15553802647041324770645|15.1556518168; JSESSIONID=831DC446C63444F227CAFCFFA4085E88.s1',
                'referer':
                '{}'.format(goods_dict['URL']),
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            }
            if int(pages) >= 50:
                pages_num = 59
            else:
                pages_num = pages
            # 抓取商品评论链接(总共50页,第一页从0开始)
            for i in range(0, int(pages_num)):
                comment_url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv46&productId={}&score=0&sortType=6&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1'.format(
                    productId, i)
                # print(comment_url)
                try:
                    time.sleep(0.1)
                    response = requests.get(url=comment_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except (requests.exceptions.ProxyError,
                        requests.exceptions.ConnectionError,
                        ConnectionResetError):
                    try:
                        time.sleep(0.1)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                    except:
                        time.sleep(0.1)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                # logger.log(31, "正在抓取的页面是: %s" % comment_url)
                comments = response.text
                comment = re.search(
                    r'{"productAttr":null.*"afterDays":0}]}|{"productAttr":null.*]}',
                    comments)
                # 总销量
                if 'comments' in comments:
                    items = json.loads(comment.group())['comments']
                    # print(pages_num, len(items))
                    if int(len(items)) == 0:
                        break
                    else:
                        for item in items:
                            date_data = item['creationTime'].split(
                                ' ')[0].strip()
                            time_data = item['creationTime'].split(
                                ' ')[1].strip()
                            # print(date, time)
                            try:
                                content = self.re_html(item['content'])
                            except:
                                content = ''
                            # 追加评论
                            try:
                                comments_2 = item['afterUserComment'][
                                    'content']
                            except:
                                comments_2 = ''

                            # 判断评论时间是否在规定的抓取时间内
                            if self.start_time <= date_data:
                                goods_comment_dict = dict()
                                goods_comment_dict['platform'] = goods_dict[
                                    '平台']
                                goods_comment_dict['date'] = date_data.strip()
                                goods_comment_dict['time'] = time_data.strip()
                                goods_comment_dict['keyword'] = goods_dict[
                                    '关键词']
                                goods_comment_dict['name'] = goods_dict['商品名']
                                goods_comment_dict['imageurl'] = goods_dict[
                                    '商品图片']
                                goods_comment_dict['audiourl'] = ''
                                goods_comment_dict['url'] = goods_dict['URL']
                                goods_comment_dict['shop_name'] = goods_dict[
                                    'shop_name']
                                goods_comment_dict['user_name'] = item[
                                    'nickname']
                                goods_comment_dict['author_id'] = ''
                                goods_comment_dict[
                                    'content'] = content + ';' + comments_2
                                goods_comment_dict['content_id'] = str(
                                    item['id'])
                                goods_comment_dict['brand'] = goods_dict['品牌']
                                goods_comment_dict['price'] = goods_dict['价格']
                                goods_comment_dict['sales'] = goods_dict['月销量']
                                goods_comment_dict['focus_count'] = ''
                                goods_comment_dict['comment_num'] = goods_dict[
                                    'comment_num']
                                goods_comment_dict['views'] = ''
                                goods_comment_dict['likes'] = item[
                                    'usefulVoteCount']
                                try:
                                    goods_comment_dict[
                                        'comments_count'] = item['replyCount']
                                except:
                                    goods_comment_dict['comments_count'] = ''
                                goods_comment_dict['reposts_count'] = ''
                                goods_comment_dict['topic_id'] = str(
                                    goods_dict['goods_id'])
                                try:
                                    goods_comment_dict['type'] = item[
                                        'productColor']
                                except:
                                    goods_comment_dict['type'] = ''
                                try:
                                    goods_comment_dict['size'] = item[
                                        'productSize']
                                except:
                                    goods_comment_dict['size'] = ''
                                goods_comment_dict['file_code'] = '51'
                                # logger.log(31, '-----------正在写入符合时间的商品信息----------------')
                                # print(goods_comment_dict)
                                item = json.dumps(dict(goods_comment_dict),
                                                  ensure_ascii=False) + '\n'
                                self.hdfsclient.new_write(
                                    '/user/cspider_daily/nike_daily/ecommerce/{}/51_{}_{}_jingdong_nike{}.json'
                                    .format(time.strftime('%Y%m%d'),
                                            time.strftime('%Y%m%d'),
                                            self.time_data, self.pid),
                                    item,
                                    encoding='utf-8')
                            if date_data.strip() < self.start_time:
                                is_break = True
                        if is_break:
                            break
        except:
            print(22222222222222222222222, traceback.format_exc())

    def run(self, lock):
        for num in range(1000000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('JingDong_day_url')
            if str(redis_url_num) == '0':
                print(
                    '*****************Redis消息队列中url为空，程序等待中.....进程 {} 等待中......******************'
                    .format(str(os.getpid())))
            item = self.redis_example.brpop('JingDong_day_url', timeout=600)[1]
            lock.release()
            item1 = json.loads(item.decode())
            # print(item)
            self.parse_goods_url(item1)

Exemple #13

0

Afficher le fichier

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, redis_example):
        # 时间判断部分
        date = datetime.datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]

        now = datetime.datetime.now() - timedelta(days=0)  # 昨天时间
        now_date = str(now).split(' ')[0]
        print('爬取时间段：{}到{}'.format(news_start_time, now_date))
        logging.info('爬取时间段：{}到{}'.format(news_start_time, now_date))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = now_date
        # 标记爬虫工作
        self.is_work = False
        self.redis_example = redis_example
        self.pid = os.getpid()

        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/ecommerce/{}'.format(
                time.strftime('%Y%m%d')))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\[', '', ret1)
            ret3 = re.sub(r'\]', '', ret2)
            ret4 = re.sub(r'广告这些是您在亚马逊上看到的商品广告。点击广告，您将前往商品所在页面。了解更多关于广告的信息',
                          '', ret3)
            ret5 = re.sub(r'\\xa0', '', ret4)
            ret6 = re.sub(r'海外购满200元免运费', '', ret5)
            ret7 = re.sub(r'更多购买选择', '', ret6)
            ret8 = re.sub(r'品牌', '', ret7)
            ret9 = re.sub(r'"append","#cm_cr-review_list",', '', ret8)
            ret10 = re.sub(r'"', '', ret9)
            return ret10
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 过滤user_name
    def re_user_name(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'前\d+名评论人', '', message)
            ret2 = re.sub(r'\n', '', ret1)
            return ret2
        except:
            pass

    # 匹配具体时间
    def clean_date(self, x):
        now = datetime.datetime.now()
        if str(x).find('昨天') != -1:
            x = datetime.datetime.strftime(now + datetime.timedelta(days=-1),
                                           '%Y-%m-%d %H:%M:%S')
        elif str(x).find('前天') != -1:
            x = datetime.datetime.strftime(now + datetime.timedelta(days=-2),
                                           '%Y-%m-%d %H:%M:%S')
        elif str(x).find('天前') != -1:
            x = datetime.datetime.strftime(
                now + datetime.timedelta(days=-int(str(x).replace('天前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('小时前') != -1:
            x = datetime.datetime.strftime(
                now +
                datetime.timedelta(hours=-int(str(x).replace('小时前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('分钟前') != -1:
            x = datetime.datetime.strftime(
                now +
                datetime.timedelta(minutes=-int(str(x).replace('分钟前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('今天') != -1:
            x = str(x).replace('今天', now.strftime('%Y-%m-%d') + ' ')
        elif str(x).find('刚刚') != -1:
            x = now.strftime('%Y-%m-%d %H:%M:%S')
        elif str(x).find('秒前') != -1:
            x = now.strftime('%Y-%m-%d %H:%M:%S')
        elif str(x).find('月前') != -1:
            x = datetime.datetime.strftime(
                now +
                datetime.timedelta(weeks=-4 * int(str(x).replace('月前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('周前') != -1:
            x = datetime.datetime.strftime(
                now + datetime.timedelta(weeks=-int(str(x).replace('周前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('[') != -1:
            x = x.replace('[', '').replace(']', '')
        elif str(x).find('月') != -1:
            x = x.replace('月', '-').replace('日', '')
        return x

    # 抓取商品品牌信息
    def parse_goods_brand(self, goods_dict):
        try:
            # print(goods_dict)
            url = goods_dict['url']
            # print('*************************商品详情页' + url)
            headers = {
                # 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
                # 'Cookie': 'x-wl-uid=1TZhgwHTJAuLn8oreMzKQn1F14u+yWLnkVnV1mHxoFBZVluB35GzI3vNZyOaUXm1eXxDdVSvG/jk=; session-id=461-0953337-2517812; ubid-acbcn=462-0558053-9620064; i18n-prefs=CNY; lc-acbcn=zh_CN; x-amz-captcha-1=1565689220474259; x-amz-captcha-2=O0HfV0HAdNq8q0k6ODz5yA==; session-token=Masvfy+QDMESO49Iacs+I77sObVPwrSbsVNucyNsgXupKeHI3zVO2/zgQTAUsJUOOcC8swOMHILZfrVmo85e45fYuETObv3I2N3CYtSgBaET4WZ1l7qnzkzQ0yWNVcqvgtSbNDZXWNii93OIcke5QSx0Y3kmJZaGk5+H9Nn2rD7c2YStoxaV/0yQ0UsfRfwj; csm-hit=tb:s-SKSGNJDF9HE5MK9C3DDT|1566530133484&t:1566530133820&adb:adblk_yes; session-id-time=2082729601l',
                'Host':
                'www.amazon.cn',
                'Pragma':
                'no-cache',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            try:
                # time.sleep(0.1)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    # time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    # time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            # response.encoding = 'utf-8'
            # print(response.text)
            asin_id = response.url.split('/')[4]
            # print(asin_id)
            # 将响应转换成一个element对象
            html = etree.HTML(response.text)
            # 获取商品评论数
            re_comments_num = html.xpath(
                '//span[@class="a-size-base a-color-secondary"]/text()')
            # print(re_comments_num)
            if re_comments_num == []:
                # logger.log(31, '--------------没有商品评论信息----------------')
                pass
            else:
                comments_num = self.re_not_number(re_comments_num[0])
                # print('评论数: ', comments_num)
                # 评价人数
                goods_dict['achieve_num'] = comments_num
                # 获取商品品牌信息
                brand_data1 = re.search(r'品牌</td><td class="value">.*?</td>',
                                        response.text)

                if brand_data1 != None:
                    brand_name = self.re_html(brand_data1.group())
                else:
                    brand_data2 = html.xpath(
                        '//div[@id="ppd"]/div[2]/div[2]/div/div/div[1]/div[1]/div/a/text()'
                    )
                    if brand_data2 == []:
                        brand_name = ''
                    else:
                        try:
                            brand_name = brand_data2[0].split(' ')[0]
                        except:
                            brand_name = ''
                # 商品品牌
                goods_dict['brand'] = brand_name
                # 销量
                goods_dict['sales'] = ''
                goods_dict['asin_id'] = asin_id
                # print(goods_dict)

                # 抓取页数
                page_num = int(
                    math.ceil(
                        float(
                            int(self.re_not_number(goods_dict['achieve_num']))
                            / 10)))
                # print('***---回复数: %s,页数：%s ***---' % (comments_num, page_num))
                # 抓取评论量
                self.parse_amazon_comment(page_num, goods_dict)

        except:
            print(22222222222222222222, traceback.format_exc())

    # 抓取页数大于0的评论
    def parse_amazon_comment(self, page_num, goods_dict):
        try:
            is_break = self.is_work
            # print(goods_dict['url'])
            headers = {
                'Content-Type':
                'application/x-www-form-urlencoded;charset=UTF-8',
                'Cookie':
                'session-id=457-6049818-5407509; i18n-prefs=CNY; ubid-acbcn=461-1543774-5730813; x-wl-uid=1D2HfAfNoe4eUdJ6ZzyM2fnvna5QixxATqyW5m655FgD9MFQ0BQOrYAub+2t2juEPWKvSIO9wETU=; lc-acbcn=zh_CN; session-token=q7jDZTzYPSN0ujucLEDRVnx7QbLwQdbfOyVVn5sdYL1XaQm73hM1Kf01JGRuR/3AZ1IX24BUDL3mq5DGmIiN1UAQ/DtCP/HhHNLIw9ct8KzziVH+J5r2FrvA0ObuVLGlqYbghQbq2Ddhm8zB/AHX7OHvGD0LWTIaDpfYJ62e2fz813rIz0IkwKLvoFjSiT+G; session-id-time=2082729601l; csm-hit=tb:Q0KNXH65T2X9SHESP2YH+s-9R0M13527VFRJHPP284C|1574144443485&t:1574144443485&adb:adblk_yes',
                'Host': 'www.amazon.cn',
                'Origin': 'https://www.amazon.cn',
                'Pragma': 'no-cache',
                'Referer':
                'https://www.amazon.cn/Nike-%E8%80%90%E5%85%8B-Revolution-4-%E7%94%B7%E5%A3%AB%E8%B7%91%E6%AD%A5%E9%9E%8B/product-reviews/B079QP634Q/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest'
            }
            if int(page_num) >= 50:
                pages = 50
            else:
                pages = page_num
            for i in range(1, int(pages) + 1):
                # print('***---抓取评论页为第%s页***---' % i)
                url = 'https://www.amazon.cn/hz/reviews-render/ajax/reviews/get/ref=cm_cr_getr_d_paging_btm_prev_{}'.format(
                    i)
                # print(url)
                form_data = {
                    'sortBy': 'recent',
                    'reviewerType': 'all_reviews',
                    'formatType': '',
                    'mediaType': '',
                    'filterByStar': '',
                    'pageNumber': i,
                    'filterByLanguage': '',
                    'filterByKeyword': '',
                    'shouldAppend': 'undefined',
                    'deviceType': 'desktop',
                    'reftag': 'cm_cr_getr_d_paging_btm_prev_{}'.format(i),
                    'pageSize': '10',
                    'asin': goods_dict['asin_id'],
                    'scope': 'reviewsAjax{}'.format(int(i) - 1)
                }
                try:
                    # time.sleep(0.1)
                    response = requests.post(url=url,
                                             headers=headers,
                                             data=form_data,
                                             proxies=proxies,
                                             allow_redirects=False,
                                             timeout=30)
                except:
                    try:
                        # time.sleep(0.1)
                        response = requests.post(url=url,
                                                 headers=headers,
                                                 data=form_data,
                                                 proxies=proxies,
                                                 allow_redirects=False,
                                                 timeout=30)
                    except:
                        # time.sleep(0.1)
                        response = requests.post(url=url,
                                                 headers=headers,
                                                 data=form_data,
                                                 proxies=proxies,
                                                 allow_redirects=False,
                                                 timeout=30)
                comment_data = response.text.split('&&&')[5:-5]
                # print(comment_data)
                comment_dict = dict()
                for item in comment_data:
                    # print(goods_dict['url'])
                    data = self.re_html(item.replace(' ', ''))
                    # print(data)
                    # 帖子id
                    topic_id = re.search(r'<divid=\\".*?\\',
                                         item.replace(' ',
                                                      '')).group().replace(
                                                          '<divid=\\"',
                                                          '').replace(
                                                              '\\', '')
                    # 评价用户名
                    user_name = self.re_user_name(
                        re.sub(r'\d.\d颗星，最多5颗星', '-',
                               data).split('-')[0].replace('\\n', ''))
                    # 评论日期
                    date_data_test = re.search(
                        r'\d{1,4}年\d{1,4}月\d{1,4}日', data).group().replace(
                            '年', '-').replace('月', '-').replace('日', '')
                    a = date_data_test.split('-')[1]
                    b = date_data_test.split('-')[2]
                    if int(len(a)) == 1 and int(len(b)) != 1:
                        date_data = date_data_test.split(
                            '-')[0] + '-0' + date_data_test.split(
                                '-')[1] + '-' + date_data_test.split('-')[2]
                    elif int(len(a)) != 1 and int(len(b)) == 1:
                        date_data = date_data_test.split(
                            '-')[0] + '-' + date_data_test.split(
                                '-')[1] + '-0' + date_data_test.split('-')[2]
                    elif int(len(a)) == 1 and int(len(b)) == 1:
                        date_data = date_data_test.split(
                            '-')[0] + '-0' + date_data_test.split(
                                '-')[1] + '-0' + date_data_test.split('-')[2]
                    else:
                        date_data = date_data_test
                    # 评价时间
                    time_data = ''
                    # 作者id
                    author_id = ''
                    # print(achieve_content_data)
                    # print(data)
                    test_type_sisz = re.search(
                        '\d{1,4}年\d{1,2}月\d{1,2}日.*?有帮助', data)
                    # print(test_type_sisz.group())
                    # 鞋子类型
                    try:
                        type_data = test_type_sisz.group().split(':')[2].split(
                            '已确认购买')[0].replace('颜色', '')
                    except:
                        type_data = ''
                    # 鞋子尺码
                    try:
                        size = data.split(':')[1].replace('颜色', '')
                    except:
                        size = ''
                    # print(type)
                    # print(size)
                    # 判断评论时间是否在规定的抓取时间内
                    if self.start_time <= date_data.strip():
                        comment_dict['platform'] = goods_dict['platform']
                        comment_dict['date'] = date_data.strip()
                        comment_dict['time'] = time_data.strip()
                        comment_dict['keyword'] = goods_dict['keyword']
                        comment_dict['name'] = goods_dict['name']
                        comment_dict['imageurl'] = goods_dict['商品图片']
                        comment_dict['audiourl'] = ''
                        comment_dict['url'] = goods_dict['url']
                        comment_dict['shop_name'] = ''
                        comment_dict['user_name'] = self.re_user_name(
                            user_name)
                        try:
                            comment_dict['content'] = data.split(
                                '已确认购买')[1].split('有帮助')[0].split('\\n')[0]
                        except:
                            comment_dict['content'] = ''
                        comment_dict['content_id'] = str(topic_id)
                        comment_dict['brand'] = goods_dict['brand']
                        comment_dict['price'] = goods_dict['price']
                        comment_dict['sales'] = goods_dict['sales']
                        comment_dict['focus_count'] = ''
                        comment_dict['comment_num'] = goods_dict['achieve_num']
                        comment_dict['views'] = ''
                        comment_dict['likes'] = ''
                        comment_dict['comments_count'] = ''
                        comment_dict['reposts_count'] = ''
                        comment_dict['author_id'] = str(author_id)
                        comment_dict['topic_id'] = str(
                            goods_dict['url'].split('/')[4])
                        comment_dict['type'] = type_data
                        comment_dict['size'] = size
                        comment_dict['file_code'] = '54'
                        # print('***********正在写入符合时间的评论*******************')
                        # print(comment_dict)
                        # items = json.dumps(dict(comment_dict), ensure_ascii=False) + '\n'
                        # with open('./json_data/54_{}_{}_amazon_nike{}.json'.format(time.strftime('%Y%m%d'), self.time_data, self.pid), 'ab') as f:
                        #     f.write(items.encode("utf-8"))
                        item = json.dumps(dict(comment_dict),
                                          ensure_ascii=False) + '\n'
                        self.hdfsclient.new_write(
                            '/user/cspider_daily/nike_daily/ecommerce/{}/54_{}_{}_amazon_nike{}.json'
                            .format(time.strftime('%Y%m%d'),
                                    time.strftime('%Y%m%d'), self.time_data,
                                    self.pid),
                            item,
                            encoding='utf-8')
                    if date_data.strip() < self.start_time:
                        is_break = True
                if is_break:
                    break
        except:
            print(444444444444444444444, traceback.format_exc())

    def run(self, lock):
        for num in range(100000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('anazon_day_url')
            if str(redis_url_num) == '0':
                print(
                    '********************\nRedis消息队列中url为空.....\n进程 {} 抓取结束......\n********************'
                    .format(str(os.getpid())))

            item = self.redis_example.brpop('anazon_day_url', timeout=600)[1]
            lock.release()
            item1 = json.loads(item.decode())
            # print(item1)
            self.parse_goods_brand(item1)

Exemple #14

0

Afficher le fichier

class Spider(object):
    """
    get 文章
    """
    def __init__(self, file_path, comment_path, need_time):

        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }

        # # 时间判断部分
        # date = datetime.now() - timedelta(days=300)
        # news_start_time = str(date).split(' ')[0]
        # yesterday = datetime.now() - timedelta(days=1)  # 昨天时间
        # yesterday = str(yesterday).split(' ')[0]
        #
        # # 定义开始时间 y-m-d  离现在时间远  news_start_time
        # self.start_time = news_start_time
        # # 定义结束时间 y-m-d  离现在时间近  yesterday
        # self.end_time = yesterday
        # print('爬取时间段：{}到{}'.format(self.start_time, self.end_time))
        #
        # logging.info('爬取时间段：{}到{}'.format(self.start_time, self.end_time))
        #
        # # 定义评论的抓取时间范围
        # # self.comment_start_time = yesterday  # 一天回复
        # self.comment_start_time = '2019-08-01'  # 一天回复
        # # self.comment_start_time = ''  # 不限定时间回复
        #
        # self.comment_end_time = yesterday
        # # self.comment_end_time = yesterday

        # get_now_time = time.time() - 86400
        get_now_time = time.time() - int(need_time)
        time_local = time.localtime(float(get_now_time))
        # 转换成新的时间格式(2016-05-05 20:28:54)
        dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        end_t = time.time()
        time_local = time.localtime(float(end_t))
        # 转换成新的时间格式(2016-05-05 20:28:54)
        end_dt = time.strftime("%Y-%m-%d %H:%M",
                               time_local)  # "%Y-%m-%d %H:%M:%S"
        # end_time = str(end_time).split(' ')[0]
        logger.log(31, '爬取时间段：{}到{}'.format(dt, end_dt))
        # 定义开始时间 y-m-d  离现在时间远
        self.start_time = dt
        # self.start_time = '2019-09-09 12:01'
        # 定义结束时间 y-m-d  离现在时间近
        self.end_time = end_dt
        # self.end_time = '2019-09-16 12:01'
        # 标记爬虫工作
        self.is_work = True
        self.file_name_time = self.get_file_name_time()
        self.file_path = file_path
        self.comment_apth = comment_path
        self.hdfsclient = HdfsClient(url='http://*****:*****@class="clearfix"]/li')
        self.is_work = True

        for li in li_list:
            title = li.xpath('.//div[@class="news-intro"]/text()')[0]
            # print(title)
            views = li.xpath('.//span[@class="tip-view"]/text()')[0]
            comments_count = li.xpath(
                './/span[@class="tip-comment"]/text()')[0]
            date_all = li.xpath('.//span[@class="tip-date"]/text()')[0]
            date_all = self.time_change(date_all)
            # 做时间判断部分---------------
            logger.log(31, '时间' + date_all)
            if len(date_all) == 10:
                date_all += ' 12:01:01'
            if len(date_all) == 7:
                date_all += '-01 12:01:01'
            get_news_time = time.mktime(
                time.strptime(date_all[:-3], "%Y-%m-%d %H:%M"))
            end_time = time.mktime(
                time.strptime(self.end_time, "%Y-%m-%d %H:%M"))
            if self.start_time != '':
                start_time = time.mktime(
                    time.strptime(self.start_time, "%Y-%m-%d %H:%M"))
            else:
                start_time = time.mktime(
                    time.strptime('2010-1-1', "%Y-%m-%d %H:%M"))
            if float(get_news_time) < float(start_time):
                self.is_work = False  # 返回的回答消息是按时间进行排序的，所以当时间小于指定时间时，就停止爬取，
            elif float(start_time) <= float(get_news_time) <= float(end_time):
                # print(views, comments_count, date_all)
                news_url = 'http://www.dunkhome.com' + li.xpath(
                    './/a[@title]/@href')[0].strip()
                self.get_news_info(news_url, views, comments_count, title)

        if self.is_work:  # 判断是否要继续进行翻页
            # 列表页翻页
            next_page = data.xpath('.//a[@rel="next"]/@href')[0]
            # print(next_page)
            next_page_url = 'http://www.dunkhome.com' + next_page
            self.get_news_page_list_two(next_page_url)
        else:
            logger.log(31, '版块抓取到指定时间......')

    def get_news_page_list_two(self, url):
        """
        社区版块 列表页
        :param url:
        :return:
        """
        response = requests.get(url, headers=self.headers_one)
        data = etree.HTML(response.content.decode())
        li_list = data.xpath(
            './/div[@class="items"]/div[@class="evaluation-item s-object-item s-eva-item"]'
        )
        self.is_work = True
        for li in li_list:
            title = li.xpath('.//h6/text()')[0]
            # print(title)
            views = ''
            comments_count = li.xpath('.//a[@class="item-comment"]/text()')[0]
            date_all = li.xpath('.//span[@class="item-time" ]/text()')[0]
            date_all = self.time_change(date_all)
            # 做时间判断部分---------------
            # print(date_all)
            if len(date_all) == 16:
                date_all += ':12'
            get_news_time = time.mktime(
                time.strptime(date_all[:-3], "%Y-%m-%d %H:%M"))
            end_time = time.mktime(
                time.strptime(self.end_time, "%Y-%m-%d %H:%M"))
            if self.start_time != '':
                start_time = time.mktime(
                    time.strptime(self.start_time, "%Y-%m-%d %H:%M"))
            else:
                start_time = time.mktime(
                    time.strptime('2010-1-1', "%Y-%m-%d %H:%M"))
            if float(get_news_time) < float(start_time):
                self.is_work = False  # 返回的回答消息是按时间进行排序的，所以当时间小于指定时间时，就停止爬取，
            elif float(start_time) <= float(get_news_time) <= float(end_time):
                # print(views, comments_count, date_all)
                news_url = 'http://www.dunkhome.com' + li.xpath(
                    './/div[@class="item-content"]/a/@href')[0].strip()
                try:
                    self.get_news_info(news_url, views, comments_count, title)
                except:
                    pass

        if self.is_work:  # 判断是否要继续进行翻页

            # 列表页翻页
            next_page = data.xpath('.//a[@rel="next"]/@href')[0]
            # print(next_page)
            next_page_url = 'http://www.dunkhome.com' + next_page
            self.get_news_list_page(next_page_url)
        else:
            logger.log(31, '版块抓取到指定时间......')

    def get_news_info(self, url, views, comments_count, title):
        logger.log(31, url)
        response = requests.get(url, headers=self.headers_one)
        data = etree.HTML(response.content.decode())
        content_list = data.xpath(
            './/div[@class="content s-news-content"]//p/text() | .//div[@class="show-content"]//p/text()'
        )
        # print(content_list)
        item = {}
        item['platform'] = 'get'
        try:
            date_all = data.xpath('.//div[@class="fl"]/span/text()')[0]
            item['date'] = date_all.split(' ')[0]
        except:
            date_all = data.xpath('.//span[@class="i-time"]/text()')[0]
            item['date'] = date_all.split(' ')[0]
        item['time'] = date_all.split(' ')[1]
        item['title'] = title
        item['content'] = ''.join(content_list)
        item['content_id'] = url.split('/')[-1]
        try:
            item['article_author'] = data.xpath(
                './/span[@class="s-name"]/text()')[0]
        except:
            try:
                item['article_author'] = data.xpath(
                    './/span[@class="i-nickname"]/text()')[0]
            except:
                item['article_author'] = ''
        item['clicks'] = ''
        item['views'] = views
        item['comments_count'] = comments_count
        try:
            item['likes'] = data.xpath('.//span[@class="item-like"]/text()')[0]
        except:
            item['likes'] = ''
        item['dislikes'] = ''
        item['keyword'] = ''
        item['article_url'] = url
        item['series_url'] = ''
        item['list_url'] = ''
        item['article_type'] = ''
        item['article_source'] = ''
        item['insert_time'] = str(datetime.now()).split('.')[0]
        item['update_time'] = str(datetime.now()).split('.')[0]
        topic_id = url.split('/')[-1]
        item['topic_id'] = url.split('/')[-1]
        item['content_id'] = url.split('/')[-1]
        item['reposts_count'] = ''
        item['file_code'] = '154'
        try:
            item['author_id'] = data.xpath(
                './/div[@class="t-user-avator"]/a/@href')[0].split('/')[-1]
        except:
            try:
                item['author_id'] = data.xpath(
                    './/div[@class="avator"]/img/@src')[0].split(
                        '/')[-1].split('.')[0].split('_')[-1]
            except:
                item['author_id'] = ''
        # print(item)
        self.write_news_jsonfile(item)
        if int(comments_count) > 0:
            all_page = math.ceil(float(int(comments_count)) / 10)

            for i in range(1, int(all_page) + 1):
                comment_url = url + '?page=' + str(i)
                self.get_comment(comment_url, url, title, topic_id)

    def get_comment(self, url, news_url, title, topic_id):
        # print(111111111111111111111111)
        response = requests.get(url, headers=self.headers_one)
        data = etree.HTML(response.content.decode())
        li_list = data.xpath('.//div[@class="comment-list"]/ul/li')
        for li in li_list:
            content_id = li.xpath('.//parent::li/@data-id')[0]
            # print(etree.tostring(li))
            content = li.xpath('.//div[@class="c-message"]//p/text()')[0]
            item = {}
            item['platform'] = 'get'
            item['source_date'] = ''
            item['source_time'] = ''
            date_all = li.xpath(
                './/div[@class="c-nickname"]/text()')[0].strip()
            date_all = self.time_change(date_all)
            #  评论部分做时间判断部分---------------
            get_news_time = time.mktime(
                time.strptime(date_all[:-3], "%Y-%m-%d %H:%M"))
            end_time = time.mktime(
                time.strptime(self.end_time, "%Y-%m-%d %H:%M"))
            if self.start_time != '':
                start_time = time.mktime(
                    time.strptime(self.start_time, "%Y-%m-%d %H:%M"))
            else:
                start_time = time.mktime(
                    time.strptime('2010-1-1', "%Y-%m-%d %H:%M"))
            if float(get_news_time) < float(start_time):
                self.is_get_comment = False  # 返回的回答消息是按时间进行排序的，所以当时间小于指定时间时，就停止爬取，
                break
            elif float(start_time) <= float(get_news_time) <= float(end_time):

                item['date'] = date_all.split(' ')[0]
                item['time'] = ''
                item['title'] = title
                item['author'] = li.xpath(
                    './/div[@class="c-nickname"]/span/text()')[0].strip()
                item['author_id'] = li.xpath(
                    './/div[@data-user-id]/@data-user-id')[0]
                item['content'] = content
                item['content_id'] = content_id
                item['floor'] = ''
                item['keyword'] = ''
                item['source_url'] = news_url
                item['comment_url'] = ''
                item['views'] = ''
                item['comments_count'] = ''
                try:
                    item['likes'] = li.xpath(
                        './/a[@class="item-like"]/text()')[0]
                except:
                    item['likes'] = ''
                item['dislikes'] = ''
                item['insert_time'] = str(datetime.now()).split('.')[0]
                item['update_time'] = str(datetime.now()).split('.')[0]
                item['topic_id'] = topic_id
                item['reposts_count'] = ''
                item['file_code'] = '155'
                # print(item)
                self.write_comment_jsonfile(item)

    def time_change(self, str_time):
        """
        时间可是转换， 将‘分钟前’，‘小时前’，‘昨天’，‘前天’，转换成标准时间格式Y-m-d h:m:s
        :param str_time:
        :return:
        """
        # print(str_time, 55555555555)
        if '秒' in str_time or '刚刚' in str_time:
            get_time = str(datetime.now()).split('.')[0]
            return get_time

        elif '分钟' in str_time:
            get_time_num = re.search('\d{1,2}', str_time).group(0)
            get_time_num = int(get_time_num) * 60
            # print(get_time_num)
            int_time = int(str(time.time()).split('.')[0]) - get_time_num
            # #转换成localtime
            time_local = time.localtime(float(int_time))
            # 转换成新的时间格式(2016-05-05 20:28:54)
            dt = time.strftime("%Y-%m-%d %H:%M:%S",
                               time_local)  # "%Y-%m-%d %H:%M:%S"
            return dt

        elif '小时' in str_time:
            get_time_num = re.search('\d{1,2}', str_time).group(0)
            get_time_num = int(get_time_num) * 60 * 60
            # print(get_time_num)
            int_time = int(str(time.time()).split('.')[0]) - get_time_num
            # #转换成localtime
            time_local = time.localtime(float(int_time))
            # 转换成新的时间格式(2016-05-05 20:28:54)
            dt = time.strftime("%Y-%m-%d %H:%M:%S",
                               time_local)  # "%Y-%m-%d %H:%M:%S"
            return dt

        elif '今天' in str_time:
            part_time = str_time.split(' ')[1]
            yesterday = datetime.now() - timedelta(days=0)  # 今天时间
            dt = str(yesterday).split(' ')[0] + ' ' + part_time
            return dt

        elif '昨天' in str_time:
            part_time = str_time.split(' ')[1]
            yesterday = datetime.now() - timedelta(days=1)  # 昨天时间
            yesterday = str(yesterday).split(' ')[0] + ' ' + part_time
            return yesterday

        elif '前天' in str_time:
            part_time = str_time.split(' ')[1]
            two_days_ago = datetime.now() - timedelta(days=2)  # 前天时间
            two_days_ago = str(two_days_ago).split(
                ' ')[0] + ' ' + part_time.replace('点', ':').replace('分', '')
            return two_days_ago

        elif '天前' in str_time:
            get_time_num = re.search('\d{1,2}', str_time).group(0)
            get_time_num = int(get_time_num) * 60 * 60 * 24
            # print(get_time_num)
            int_time = int(str(time.time()).split('.')[0]) - get_time_num
            # #转换成localtime
            time_local = time.localtime(float(int_time))
            # 转换成新的时间格式(2016-05-05 20:28:54)
            dt = time.strftime("%Y-%m-%d %H:%M:%S",
                               time_local)  # "%Y-%m-%d %H:%M:%S"
            return dt

        elif '201' not in str_time:
            str_time = '2019-' + str_time
            return str_time
        else:
            return str_time

    # 写入json文件
    def write_news_jsonfile(self, item):
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./../get/json_file/{}/{}_get_news.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        self.hdfsclient.new_write('{}/{}/{}/154_{}_{}_get_news.json'.format(
            self.file_path,
            self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name,
            str(datetime.now()).split(' ')[0].replace('-', '_'),
            self.time_time),
                                  item,
                                  encoding='utf-8')

    def write_comment_jsonfile(self, item):
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./../get/json_file/{}/{}_get_news_comments.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        self.hdfsclient.new_write(
            '{}/{}/{}/155_{}_{}_get_news_comments.json'.format(
                self.comment_apth,
                self.file_name_time.split(' ')[0].replace('-',
                                                          ''), self.hour_name,
                str(datetime.now()).split(' ')[0].replace('-', '_'),
                self.time_time),
            item,
            encoding='utf-8')

    def get_file_name_time(self):
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:
            num = 24
            a = str(datetime.now() - timedelta(days=1))  # 昨天时间
        num = a.split(' ')[0] + ' ' + str(num)
        return num

    def run(self):
        url = 'http://www.dunkhome.com/news'
        self.get_news_list_page(url)
        url = 'http://www.dunkhome.com/evaluations'
        self.get_news_page_list_two(url)

Exemple #15

0

Afficher le fichier

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self):
        # 时间判断部分
        date = datetime.datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]

        now_date = datetime.datetime.now() - timedelta(days=0)  # 当前时间
        now_time = str(now_date).split(' ')[0]
        print('爬取时间段：{}到{}'.format(news_start_time, now_time))
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = now_time
        self.is_break = False
        self.pid = os.getpid()

        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/weibo/{}'.format(
                time.strftime('%Y%m%d')))  # 创建每日文件夹

    # 使用try清理数据
    def clean_data(self, data):
        try:
            clean_data = data
        except:
            clean_data = ''
        return clean_data

    # 时间格式转换
    def changetime(self, timestr):
        fmt2 = '%a %b %d  %H:%M:%S %z %Y'
        timestrp = time.strptime(timestr, fmt2)
        # temp_time = time.strftime("%Y-%m-%d %H:%M:%S", timestrp)
        # logger.info(f"last time {temp_time}, continue request")
        timestampstr = time.mktime(timestrp)
        timeArray = time.localtime(int(timestampstr))
        otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        # print(otherStyleTime)  # 2013--10--10 23:40:00
        return otherStyleTime

    # 请求获取用户age
    def parse_age(self, uid):
        try:
            weibo_dict = dict()
            # 请求接口需要的携带参数
            data = {
                'access_token': code,  # 访问许可
                'uid': '{}'.format(uid),  # 搜索关键词
            }
            url = 'https://c.api.weibo.com/2/users/birthday/other.json'
            try:
                time.sleep(0.1)
                response = requests.get(url, data, timeout=30)
            except:
                try:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
                except:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
            # print('000000000000000000', response.text)
            if 'birthday_visible' in response.text:
                # print('有用户年龄')
                age_data = json.loads(response.text)['birthday']
                birthday_visible = json.loads(
                    response.text)['birthday_visible']
                if int(birthday_visible) == 3:
                    if age_data == '':
                        weibo_dict['age'] = ''
                        # print('111111111111111111', weibo_dict)
                        self.parse_weibo(weibo_dict, uid)
                    elif int(age_data) >= 1900:
                        weibo_dict['age'] = age_data
                        # print('111111111111111111', weibo_dict)
                        self.parse_weibo(weibo_dict, uid)
                    else:
                        weibo_dict['age'] = ''
                        # print('111111111111111111', weibo_dict)
                        self.parse_weibo(weibo_dict, uid)
            else:
                weibo_dict['age'] = ''
                # print('111111111111111111', weibo_dict)
                self.parse_weibo(weibo_dict, uid)

        except:
            self.parse_age(uid)
            try:
                self.parse_age(uid)
            except:
                print(00000000000000, traceback.format_exc())

    # 根据关键词搜索请求得到微博信息
    def parse_weibo(self, weibo_dict, uid):
        try:
            is_break = self.is_break
            date = time.strftime("%Y%m%d")
            st = int(time.mktime(time.strptime(
                date, '%Y%m%d')))  # 自定义起始时间 '2019-10-21 00:00:00'
            et = st - 86400  # 自定义终止时间 '2018-11-26 00:00:00'

            url = 'https://c.api.weibo.com/2/statuses/user_timeline/other.json'  # 接口链接
            # 请求接口需要的携带参数
            data = {
                'access_token': code,  # 访问许可
                'uid': '{}'.format(uid),
                'endtime':
                '{}'.format(st),  # 首次采集终止点为当前日期的零点，'2019-10-23 00:00:00'
                'count': 20
            }  # 单页返回的记录条数，最大不超过100，超过100以100处理，默认为20。数据是从当前零点往前一天零点时间递减出现
            try:
                time.sleep(0.1)
                response = requests.get(url, data, timeout=30)
            except:
                try:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
                except:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
            # print(weibo_dict)
            # print(response.text)
            if 'statuses' in response.text:
                data_list = json.loads(response.text, strict=False)['statuses']
                # print(len(data_list))
                for item in data_list:
                    date_time_data = item['created_at']
                    # print(self.changetime(date_time_data))
                    try:
                        date_data = self.changetime(date_time_data).split(
                            ' ')[0]
                    except:
                        date_data = ''
                    try:
                        time_data = self.changetime(date_time_data).split(
                            ' ')[1]
                    except:
                        time_data = ''
                    # print(date_data, time_data)
                    weibo_dict['platform'] = '微博'
                    weibo_dict['keyword'] = str(uid)
                    weibo_dict['date'] = date_data.strip()
                    weibo_dict['time'] = time_data.strip()
                    weibo_dict['weibo_id'] = str(item['id'])
                    weibo_dict['mid'] = str(item['mid'])
                    weibo_dict['idstr'] = str(item['idstr'])
                    try:
                        weibo_dict['content'] = item['longText'][
                            'longTextContent'].replace('\u200b', ' ').replace(
                                '\u200e', ' ').replace('\u200c',
                                                       ' ').replace('\n', ' ')
                    except:
                        weibo_dict['content'] = item['text'].replace(
                            '\u200b',
                            ' ').replace('\u200e',
                                         ' ').replace('\u200c',
                                                      ' ').replace('\n', ' ')
                    weibo_dict['source'] = item['source']
                    weibo_dict['favorited'] = item['favorited']
                    weibo_dict['truncated'] = item['truncated']
                    try:
                        location_data = item['user']['location']
                    except:
                        location_data = ''
                    try:
                        weibo_dict['province_name'] = location_data.split(
                            ' ')[0]
                        weibo_dict['address'] = location_data.split(' ')[1]
                    except:
                        weibo_dict['province_name'] = location_data
                        weibo_dict['address'] = ''
                    # print(weibo_dict['province_name'], weibo_dict['address'])
                    try:
                        weibo_dict['pinyin'] = item['pinyin']
                    except:
                        weibo_dict['pinyin'] = ''
                    weibo_dict['uid'] = str(item['user']['id'])
                    try:
                        weibo_dict['screen_name'] = item['user']['screen_name']
                    except:
                        weibo_dict['screen_name'] = ''
                    try:
                        weibo_dict['name'] = item['user']['name']
                    except:
                        weibo_dict['name'] = ''
                    try:
                        weibo_dict['province'] = item['user']['province']
                    except:
                        weibo_dict['province'] = ''
                    try:
                        weibo_dict['city'] = item['user']['city']
                    except:
                        weibo_dict['city'] = ''
                    try:
                        weibo_dict['location'] = item['user']['location']
                    except:
                        weibo_dict['location'] = ''
                    try:
                        weibo_dict['gender'] = item['user']['gender']
                    except:
                        weibo_dict['gender'] = ''
                    try:
                        weibo_dict['allow_all_act_msg'] = item['user'][
                            'allow_all_act_msg']
                    except:
                        weibo_dict['allow_all_act_msg'] = ''
                    try:
                        weibo_dict['geo_enabled'] = item['user']['geo_enabled']
                    except:
                        weibo_dict['geo_enabled'] = ''
                    try:
                        weibo_dict['verified'] = item['user']['verified']
                    except:
                        weibo_dict['verified'] = ''
                    try:
                        weibo_dict['verified_reason'] = item['user'][
                            'verified_reason']
                    except:
                        weibo_dict['verified_reason'] = ''
                    weibo_dict['likes'] = item['attitudes_count']
                    try:
                        weibo_dict['views'] = item['views']
                    except:
                        weibo_dict['views'] = ''
                    try:
                        weibo_dict['retweeted_status'] = str(
                            item['retweeted_status'])
                    except:
                        weibo_dict['retweeted_status'] = ''
                    weibo_dict['reposts_count'] = item['reposts_count']
                    weibo_dict['comments_count'] = item['comments_count']
                    weibo_dict['attitudes_count'] = item['attitudes_count']
                    weibo_dict['visible'] = str(item['visible'])
                    weibo_dict['pic_ids'] = str(item['pic_ids'])
                    try:
                        weibo_dict['ad'] = item['ad']
                    except:
                        weibo_dict['ad'] = ''
                    weibo_dict['isLongText'] = item['isLongText']
                    weibo_dict['url'] = 'http://m.weibo.cn/' + str(
                        item['user']['id']) + '/' + str(item['idstr'])
                    try:
                        weibo_dict['followers_count'] = item['user'][
                            'followers_count']
                    except:
                        weibo_dict['followers_count'] = ''
                    try:
                        weibo_dict['favourites_count'] = item['user'][
                            'favourites_count']
                    except:
                        weibo_dict['favourites_count'] = ''
                    try:
                        weibo_dict['friends_count'] = item['user'][
                            'friends_count']
                    except:
                        weibo_dict['friends_count'] = ''
                    try:
                        weibo_dict['statuses_count'] = item['user'][
                            'statuses_count']
                    except:
                        weibo_dict['statuses_count'] = ''
                    try:
                        weibo_dict['bi_followers_count'] = item['user'][
                            'bi_followers_count']
                    except:
                        weibo_dict['bi_followers_count'] = ''
                    try:
                        weibo_dict['avatar_large'] = item['user'][
                            'avatar_large']
                    except:
                        weibo_dict['avatar_large'] = ''
                    try:
                        weibo_dict['avatar_hd'] = item['user']['avatar_hd']
                    except:
                        weibo_dict['avatar_hd'] = ''
                    try:
                        weibo_dict['retweeted_time'] = item[
                            'retweeted_status']['created_at']
                    except:
                        weibo_dict['retweeted_time'] = ''
                    try:
                        weibo_dict['retweeted_post_id'] = item[
                            'retweeted_status']['id']
                    except:
                        weibo_dict['retweeted_post_id'] = ''
                    try:
                        weibo_dict['retweeted_author'] = item[
                            'retweeted_status']['in_reply_to_screen_name']
                    except:
                        weibo_dict['retweeted_author'] = ''
                    try:
                        weibo_dict['retweeted_author_id'] = item[
                            'retweeted_status']['in_reply_to_status_id']
                    except:
                        weibo_dict['retweeted_author_id'] = ''
                    try:
                        weibo_dict['profile_url'] = item['user']['profile_url']
                    except:
                        weibo_dict['profile_url'] = ''
                    try:
                        weibo_dict['domain'] = item['user']['domain']
                    except:
                        weibo_dict['domain'] = ''
                    try:
                        weibo_dict['user_url'] = item['user']['domain']
                    except:
                        weibo_dict['user_url'] = ''
                    weibo_dict['author_url'] = 'http://m.weibo.cn/' + str(
                        item['user']['id'])
                    weibo_dict['tags'] = self.parse_tags(weibo_dict)

                    # 图片列表判断
                    img_list = item['pic_ids']
                    if len(img_list) == 0:
                        weibo_dict['imageurl'] = ''
                        weibo_dict['audiourl'] = ''
                    else:
                        weibo_img = []
                        original_pic = item['original_pic'].split(
                            'large/')[0] + 'large/'
                        for img in img_list:
                            img_data = original_pic + img + '.jpg'
                            weibo_img.append(img_data)
                        weibo_dict['imageurl'] = weibo_img
                        weibo_dict['audiourl'] = ''

                    # print(weibo_dict['imageurl'])
                    self.write_goods_jsonfile(weibo_dict)
                    index_num = data_list.index(item)
                    if index_num == len(data_list) - 1:
                        # print(index_num)
                        last_time = self.changetime(
                            data_list[int(index_num)]['created_at'])
                        last_date = self.changetime(data_list[int(
                            index_num)]['created_at']).split(' ')[0]
                        # print(last_time)
                        # print(last_date)
                        if self.start_time <= last_date:
                            # 将其转换为时间数组
                            timeArray = time.strptime(last_time,
                                                      "%Y-%m-%d %H:%M:%S")
                            # 转换为时间戳:
                            timeStamp = int(time.mktime(timeArray))
                            # print('最后一个时间%s转换成时间戳是: ' % last_time, timeStamp)
                            self.parse_weibo_data(weibo_dict, uid, timeStamp)
                            # pass
                        if self.start_time > last_date:
                            is_break = True
                    if is_break:
                        break
        except:
            print(111111111111111111111111, traceback.format_exc())

    # 根据关键词搜索请求得到微博信息
    def parse_weibo_data(self, weibo_dict, uid, timeStamp):
        try:
            is_break = self.is_break
            url = 'https://c.api.weibo.com/2/search/statuses/limited.json'  # 接口链接
            # 请求接口需要的携带参数
            data = {
                'access_token': code,  # 访问许可
                'uid': '{}'.format(uid),
                'endtime': '{}'.format(
                    timeStamp),  # 首次采集终止点为当前日期的零点，'2019-10-23 00:00:00'
                'count': 20
            }  # 单页返回的记录条数，最大不超过100，超过100以100处理，默认为20。数据是从当前零点往前一天零点时间递减出现
            try:
                time.sleep(0.1)
                response = requests.get(url, data, timeout=30)
            except:
                try:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
                except:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
            # print(response.text)
            if 'statuses' in response.text:
                data_list = json.loads(response.text, strict=False)['statuses']
                # print(len(data_list))
                for item in data_list:
                    date_time_data = item['created_at']
                    # print(self.changetime(date_time_data))
                    try:
                        date_data = self.changetime(date_time_data).split(
                            ' ')[0]
                    except:
                        date_data = ''
                    try:
                        time_data = self.changetime(date_time_data).split(
                            ' ')[1]
                    except:
                        time_data = ''
                    # print(date_data, time_data)
                    weibo_dict['platform'] = '微博'
                    weibo_dict['keyword'] = str(uid)
                    weibo_dict['date'] = date_data.strip()
                    weibo_dict['time'] = time_data.strip()
                    weibo_dict['weibo_id'] = str(item['id'])
                    weibo_dict['mid'] = str(item['mid'])
                    weibo_dict['idstr'] = str(item['idstr'])
                    try:
                        weibo_dict['content'] = item['longText'][
                            'longTextContent'].replace('\u200b', ' ').replace(
                                '\u200e', ' ').replace('\u200c',
                                                       ' ').replace('\n', ' ')
                    except:
                        weibo_dict['content'] = item['text'].replace(
                            '\u200b',
                            ' ').replace('\u200e',
                                         ' ').replace('\u200c',
                                                      ' ').replace('\n', ' ')
                    weibo_dict['source'] = item['source']
                    weibo_dict['favorited'] = item['favorited']
                    weibo_dict['truncated'] = item['truncated']
                    try:
                        location_data = item['user']['location']
                    except:
                        location_data = ''
                    try:
                        weibo_dict['province_name'] = location_data.split(
                            ' ')[0]
                        weibo_dict['address'] = location_data.split(' ')[1]
                    except:
                        weibo_dict['province_name'] = location_data
                        weibo_dict['address'] = ''
                    # print(weibo_dict['province_name'], weibo_dict['address'])
                    try:
                        weibo_dict['pinyin'] = item['pinyin']
                    except:
                        weibo_dict['pinyin'] = ''
                    weibo_dict['uid'] = str(item['user']['id'])
                    try:
                        weibo_dict['screen_name'] = item['user']['screen_name']
                    except:
                        weibo_dict['screen_name'] = ''
                    try:
                        weibo_dict['name'] = item['user']['name']
                    except:
                        weibo_dict['name'] = ''
                    try:
                        weibo_dict['province'] = item['user']['province']
                    except:
                        weibo_dict['province'] = ''
                    try:
                        weibo_dict['city'] = item['user']['city']
                    except:
                        weibo_dict['city'] = ''
                    try:
                        weibo_dict['location'] = item['user']['location']
                    except:
                        weibo_dict['location'] = ''
                    try:
                        weibo_dict['gender'] = item['user']['gender']
                    except:
                        weibo_dict['gender'] = ''
                    try:
                        weibo_dict['allow_all_act_msg'] = item['user'][
                            'allow_all_act_msg']
                    except:
                        weibo_dict['allow_all_act_msg'] = ''
                    try:
                        weibo_dict['geo_enabled'] = item['user']['geo_enabled']
                    except:
                        weibo_dict['geo_enabled'] = ''
                    try:
                        weibo_dict['verified'] = item['user']['verified']
                    except:
                        weibo_dict['verified'] = ''
                    try:
                        weibo_dict['verified_reason'] = item['user'][
                            'verified_reason']
                    except:
                        weibo_dict['verified_reason'] = ''
                    weibo_dict['likes'] = item['attitudes_count']
                    try:
                        weibo_dict['views'] = item['views']
                    except:
                        weibo_dict['views'] = ''
                    try:
                        weibo_dict['retweeted_status'] = str(
                            item['retweeted_status'])
                    except:
                        weibo_dict['retweeted_status'] = ''
                    weibo_dict['reposts_count'] = item['reposts_count']
                    weibo_dict['comments_count'] = item['comments_count']
                    weibo_dict['attitudes_count'] = item['attitudes_count']
                    weibo_dict['visible'] = str(item['visible'])
                    weibo_dict['pic_ids'] = str(item['pic_ids'])
                    try:
                        weibo_dict['ad'] = item['ad']
                    except:
                        weibo_dict['ad'] = ''
                    weibo_dict['isLongText'] = item['isLongText']
                    weibo_dict['url'] = 'http://m.weibo.cn/' + str(
                        item['user']['id']) + '/' + str(item['idstr'])
                    try:
                        weibo_dict['followers_count'] = item['user'][
                            'followers_count']
                    except:
                        weibo_dict['followers_count'] = ''
                    try:
                        weibo_dict['favourites_count'] = item['user'][
                            'favourites_count']
                    except:
                        weibo_dict['favourites_count'] = ''
                    try:
                        weibo_dict['friends_count'] = item['user'][
                            'friends_count']
                    except:
                        weibo_dict['friends_count'] = ''
                    try:
                        weibo_dict['statuses_count'] = item['user'][
                            'statuses_count']
                    except:
                        weibo_dict['statuses_count'] = ''
                    try:
                        weibo_dict['bi_followers_count'] = item['user'][
                            'bi_followers_count']
                    except:
                        weibo_dict['bi_followers_count'] = ''
                    try:
                        weibo_dict['avatar_large'] = item['user'][
                            'avatar_large']
                    except:
                        weibo_dict['avatar_large'] = ''
                    try:
                        weibo_dict['avatar_hd'] = item['user']['avatar_hd']
                    except:
                        weibo_dict['avatar_hd'] = ''
                    try:
                        weibo_dict['retweeted_time'] = item[
                            'retweeted_status']['created_at']
                    except:
                        weibo_dict['retweeted_time'] = ''
                    try:
                        weibo_dict['retweeted_post_id'] = item[
                            'retweeted_status']['id']
                    except:
                        weibo_dict['retweeted_post_id'] = ''
                    try:
                        weibo_dict['retweeted_author'] = item[
                            'retweeted_status']['in_reply_to_screen_name']
                    except:
                        weibo_dict['retweeted_author'] = ''
                    try:
                        weibo_dict['retweeted_author_id'] = item[
                            'retweeted_status']['in_reply_to_status_id']
                    except:
                        weibo_dict['retweeted_author_id'] = ''
                    try:
                        weibo_dict['profile_url'] = item['user']['profile_url']
                    except:
                        weibo_dict['profile_url'] = ''
                    try:
                        weibo_dict['domain'] = item['user']['domain']
                    except:
                        weibo_dict['domain'] = ''
                    try:
                        weibo_dict['user_url'] = item['user']['domain']
                    except:
                        weibo_dict['user_url'] = ''
                    weibo_dict['author_url'] = 'http://m.weibo.cn/' + str(
                        item['user']['id'])
                    weibo_dict['tags'] = self.parse_tags(weibo_dict)

                    # 图片列表判断
                    img_list = item['pic_ids']
                    if len(img_list) == 0:
                        weibo_dict['imageurl'] = ''
                        weibo_dict['audiourl'] = ''
                    else:
                        weibo_img = []
                        original_pic = item['original_pic'].split(
                            'large/')[0] + 'large/'
                        for img in img_list:
                            img_data = original_pic + img + '.jpg'
                            weibo_img.append(img_data)
                        weibo_dict['imageurl'] = weibo_img
                        weibo_dict['audiourl'] = ''

                    # print(weibo_dict['imageurl'])
                    self.write_goods_jsonfile(weibo_dict)
                    index_num = data_list.index(item)
                    if index_num == len(data_list) - 1:
                        # print(index_num)
                        last_time = self.changetime(
                            data_list[int(index_num)]['created_at'])
                        last_date = self.changetime(data_list[int(
                            index_num)]['created_at']).split(' ')[0]
                        # print(last_time)
                        # print(last_date)
                        if self.start_time <= last_date:
                            # a = "2019-10-27 23:37:07"
                            # 将其转换为时间数组
                            timeArray = time.strptime(last_time,
                                                      "%Y-%m-%d %H:%M:%S")
                            # 转换为时间戳:
                            timeStamp1 = int(time.mktime(timeArray))
                            # print('最后一个时间%s转换成时间戳是: ' % last_time, timeStamp)
                            self.parse_weibo_data(weibo_dict, uid, timeStamp1)
                        if self.start_time > last_date:
                            is_break = True
                    if is_break:
                        break
        except:
            print(22222222222222222222, traceback.format_exc())

    # 请求获取tags
    def parse_tags(self, weibo_dict):
        try:
            # 请求接口需要的携带参数
            data = {
                'access_token': code,  # 访问许可
                'uids': '{}'.format(weibo_dict['uid']),  # 搜索关键词
            }
            url = 'https://c.api.weibo.com/2/tags/tags_batch/other.json'  # 接口链接
            try:
                time.sleep(0.1)
                response = requests.get(url, data, timeout=30)
            except:
                try:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
                except:
                    time.sleep(0.1)
                    response = requests.get(url, data, timeout=30)
            # print(response.text)
            if 'tags' in response.text:
                tags = re.search(r'"tags":\[{.*?"}\]',
                                 response.text).group().replace('"tags":', '')
                return tags
            else:
                return ''
        except:
            print(555555555555555555555555, traceback.format_exc())

    # 写入json文件
    def write_goods_jsonfile(self, item):
        # print(item)
        item_data = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.hdfsclient.new_write(
            '/user/cspider_daily/nike_daily/weibo/{}/104_{}_weibo_nike_uid.json'
            .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d')),
            item_data,
            encoding='utf-8')
        # item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./104_new_weibo_uid_{}.json'.format(time.strftime('%Y%m%d')), 'ab') as f:
        #     f.write(item.encode("utf-8"))

    def run(self, keyword):
        print(keyword)
        self.parse_age(keyword)

Exemple #16

0

Afficher le fichier

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, file_path, comment_path):

        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }

        # 时间判断部分
        date = datetime.now() - timedelta(days=7)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=0)  # 昨天时间
        yesterday = str(yesterday).split(' ')[0]
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = yesterday
        logging.log(31, '爬取时间段：{}到{}'.format(self.start_time, self.end_time))
        # 定义评论的抓取时间范围
        # self.comment_start_time = yesterday  # 一天回复
        # self.comment_start_time = '2019-08-01'  # 一天回复
        self.comment_start_time = ''  # 不限定时间回复
        self.comment_end_time = yesterday
        # self.comment_end_time = yesterday
        # 标记爬虫工作
        self.is_work = True
        self.commnet_port_url = 'http://comment.sina.com.cn/page/info?version=1&format=json&channel=ty&newsid=comos-{}&group=0&compress=0&ie=utf-8&oe=utf-8&page={}&page_size=10&t_size=3&h_size=3&thread=1&callback=jsonp_1542676393124&_=1542676393124'
        self.page_num = 1
        self.file_path = file_path
        self.comment_apth = comment_path
        self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000',
                                     user='******')
        self.hdfsclient.makedirs('{}/{}'.format(
            self.file_path,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # 创建每日文件夹
        self.hdfsclient.makedirs('{}/{}'.format(
            self.comment_apth,
            str(datetime.now()).split(' ')[0].replace('-', '')))  # 创建每日文件夹
        self.time_time = str(time.time()).split('.')[0]

        self.article_queue = Queue()
        self.comment_queue = Queue()
        self.total_item = ''

    def get_list_page(self, url):
        logger.log(31, '列表页url:  ' + url)
        response = requests.get(url, headers=self.headers_one)
        data = json.loads(response.text[46:-14])
        list_data = data['result']['data']
        for li_data in list_data:
            news_url = li_data['url']
            ctime = li_data['ctime']
            time_local = time.localtime(float(ctime))
            # 转换成新的时间格式(2016-05-05 20:28:54)
            dt = time.strftime("%Y-%m-%d %H:%M:%S",
                               time_local)  # "%Y-%m-%d %H:%M:%S"
            #
            try:
                self.get_news_info(
                    news_url, '',
                    'http://sports.sina.com.cn/roll/index.shtml#pageid=13&lid=2503&k=&num=50&page=1',
                    dt)
            except:
                logger.error(traceback.format_exc())

    def get_news_info(self, url, news_type, page_list, date_all):
        logger.log(31, '新闻url:  ' + url)
        item = dict()
        response = requests.get(url, headers=self.headers_one, timeout=60)
        try:
            data = etree.HTML(response.content.decode())
            # 网站
            item['platform'] = '新浪新闻'
            # 标题
            try:
                title = data.xpath('.//h1[@class="main-title"]/text()')[0]
            except:
                title = data.xpath('.//h2/text()')[0]
            item['title'] = title
            # date_all = data.xpath('.//div[@class="date-source"]/span/text()')[0].replace('年', '-').replace('月', '-').replace('日', '')
            date = date_all.split(' ')[0]
            news_time = date_all.split(' ')[1]
            # print(date)
            item['date'] = date
            item['time'] = news_time
            # 文章来源
            try:
                article_source = data.xpath(
                    './/div[@class="date-source"]/a/text()')[0]
            except:
                article_source = data.xpath(
                    './/p[@class="from"]/span[2]//text()')
                article_source = ''.join(article_source)
            item['article_source'] = article_source
            # article_author
            try:
                article_author = data.xpath(
                    './/div[@class="show_author"]/text()')
            except:
                article_author = ''
            if article_author:
                item['article_author'] = article_author[0]
            else:
                item['article_author'] = ''
            # 内容
            try:
                content = data.xpath(
                    './/div[@id="article_content"]/div[1]/div/p/text()')
            except:
                content = data.xpath('.//em[@class="vdiCont"]//text()')
            content = ''.join(content)
            # 翻页数据
            next_page = data.xpath(
                './/div[@data-sudaclick="content_pagination_p"]/a/@href')
            if len(next_page) > 3:
                next_page = next_page[1:][:-2]
                for page_url in next_page:
                    print('获取翻页数据')
                    next_content = self.get_next_page(page_url)
                    content = content + next_content

            item['content'] = content

            # 从接口处获取评论数
            news_id = re.search('(\w{7}\d{7})', url).group(0)
            try:
                comment_count = self.get_commnet_count(news_id)
            except AttributeError:
                comment_count = '0'
            item['comments_count'] = comment_count
            item['clicks'] = ''
            item['views'] = ''
            item['likes'] = ''
            item['keyword'] = ''
            item['article_url'] = url  # 文章详情URL
            item['dislikes'] = ''  # 踩人数
            item['series_url'] = ''  # 车系首页
            item['list_url'] = page_list  # 文章列表URL
            # item['article_type'] = news_type  # 文章类型
            item['article_type_1st'] = news_type  # 文章类型
            item['article_type_2nd'] = ''  # 文章类型
            item['insert_time'] = str(datetime.now()).split('.')[0]  # 初始爬取时间
            item['update_time'] = str(datetime.now()).split('.')[0]  # 最后爬取时间
            content_id = url.split('/')[-1].split('.')[0].split('_')[-1].split(
                '-')[-1]
            # content_id = re.search('\d{5,8}', content_id).group(0)
            item['content_id'] = str(content_id)  # 文章id
            item['topic_id'] = str(content_id)  # 主贴id
            item['author_id'] = ''  # 作者id
            item['file_code'] = '17'
            item['reposts_count'] = ''
            # 做时间判断部分---------------
            get_news_time = time.mktime(time.strptime(date, "%Y-%m-%d"))
            end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d"))
            if self.start_time != '':
                start_time = time.mktime(
                    time.strptime(self.start_time, "%Y-%m-%d"))
            else:
                start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d"))
            if float(get_news_time) < float(start_time):
                self.is_work = False
                return

            if float(start_time) <= float(get_news_time) <= float(end_time):
                self.write_news_jsonfile(item)

                # self.article_queue.put(item)
                if int(comment_count) > 0:
                    self.is_get_comment = True

                    while True:
                        if self.is_get_comment:
                            self.get_comments_info(news_id, title, date,
                                                   news_time, url)
                        else:
                            self.page_num = 1
                            break
        except IndexError:
            time.sleep(5)
            logger.error('网页请求404 url: {},   {}'.format(
                url, traceback.format_exc()))

    # 获取翻页数据
    def get_next_page(self, url):
        response = requests.get(url, headers=self.headers_one, timeout=60)
        try:
            data = etree.HTML(response.content)
            # 内容
            content = data.xpath(
                './/div[@id="article_content"]/div[1]/div/p/text()')
            content = ''.join(content)
            return content
        except:
            content = ''
            return content

    # 获取评论数
    def get_commnet_count(self, news_id):
        response = requests.get(self.commnet_port_url.format(news_id, str(1)))
        data = response.content.decode()
        data = data[20:][:-1]
        # print(11111,data)
        data = json.loads(data)
        # print(222222,data)
        # data = re.search('"qreply": \d{0,9}', data).group(0)
        try:
            comment_count = data['result']['count']['show']
        except:
            comment_count = 0
        return comment_count

    # 获取评论信息
    def get_comments_info(self,
                          news_id,
                          title,
                          source_date,
                          source_time,
                          source_url,
                          page_id="1"):

        url = self.commnet_port_url.format(news_id, str(self.page_num))
        response = requests.get(url)
        data = response.content.decode()
        # data = re.search(r'{"result.*}\)', data).group(0)
        data = data[20:][:-1]
        data = json.loads(data)
        comments_list = data['result']['cmntlist']
        if comments_list:
            for comment in comments_list:
                item = {}
                item['platform'] = u'新浪新闻'
                item['source_date'] = source_date
                item['source_time'] = source_time
                date_all = comment['time']
                date = date_all.split(' ')[0]
                commnet_time = date_all.split(' ')[1]
                item['date'] = date
                item['time'] = commnet_time
                #  评论部分做时间判断部分---------------
                get_news_time = time.mktime(
                    time.strptime(str(date), "%Y-%m-%d"))
                end_time = time.mktime(
                    time.strptime(self.comment_end_time, "%Y-%m-%d"))
                if self.comment_start_time != '':
                    start_time = time.mktime(
                        time.strptime(self.comment_start_time, "%Y-%m-%d"))
                else:
                    start_time = time.mktime(
                        time.strptime('2010-1-1', "%Y-%m-%d"))
                if float(get_news_time) < float(start_time):
                    self.is_get_comment = False  # 返回的回答消息是按时间进行排序的，所以当时间小于指定时间时，就停止爬取，
                    break
                elif float(start_time) <= float(get_news_time) <= float(
                        end_time):

                    item['title'] = title
                    author = comment['nick']
                    item['author'] = author
                    item['author_id'] = comment['uid']  # 用户id
                    content = comment['content']

                    item['content'] = content
                    item['floor'] = ''
                    item['keyword'] = ''
                    item['source_url'] = source_url
                    comment_url = 'http://comment5.news.sina.com.cn/comment/skin/default.html?channel=ty&newsid=comos-{}&group=0'.format(
                        news_id)
                    item['comment_url'] = comment_url
                    item['views'] = ''
                    item['comments_count'] = ''
                    likes = comment['agree']
                    item['likes'] = likes
                    item['dislikes'] = ''  # 踩人数
                    item['insert_time'] = str(
                        datetime.now()).split('.')[0]  # 初始爬取时间
                    item['update_time'] = str(
                        datetime.now()).split('.')[0]  # 最后爬取时间
                    item['content_id'] = str(uuid.uuid4()).replace('-', '')
                    topic_id = source_url.split('/')[-1].split('.')[0].split(
                        '_')[-1].split('-')[-1]
                    # topic_id = re.search('\d{5,8}', topic_id).group(0)
                    item['topic_id'] = topic_id  # 主贴id
                    item['file_code'] = '31'
                    item['reposts_count'] = ''
                    # self.write_comment_jsonfile(item)
                    # print(11111111, item)
                    item = json.dumps(dict(item), ensure_ascii=False) + '\n'
                    self.total_item = self.total_item + item
                    # self.comment_queue.put(item)
            self.write_comment_jsonfile()
            if self.is_get_comment:
                self.page_num += 1
                # self.get_comments_info(news_id, title, source_date, source_time, source_url,page_id=str(self.page_num))
        else:
            self.page_num = 1
            logger.log(31, '评论抓取完毕   ' + url)
            self.is_get_comment = False

    # ------------------------------------------------新能源模块--------------------------------------------------------

    def write_news_jsonfile(self, item):

        # q_size = self.article_queue.qsize()
        # total_item = ''
        # if q_size > 0:
        #     for i in range(q_size):
        #         item = self.article_queue.get()
        #         # print('写入数据中......')
        print('写入新闻数据......')
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # total_item += item
        try:
            self.hdfsclient.new_write('{}/{}/17_{}_{}_sina_news.json'.format(
                self.file_path,
                str(datetime.now()).split(' ')[0].replace('-', ''),
                str(datetime.now()).split(' ')[0].replace('-', '_'),
                self.time_time),
                                      item,
                                      encoding='utf-8')
        except:

            logging.error(traceback.format_exc())
            self.write_news_jsonfile(item)
            return
        # else:
        #     pass

    def write_comment_jsonfile(self):
        # q_size = self.comment_queue.qsize()
        # total_item = ''
        # if q_size > 0:
        #     print(q_size)
        #     for i in range(q_size):
        #         item = self.comment_queue.get()
        #         print(2222222, item)
        #         # print('写入数据中......')
        #         item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        #         total_item = total_item + item
        #
        #     # try:
        #     #
        #     #     self.hdfsclient.new_write('{}/{}/31_{}_{}_sina_comment.json'.format(self.comment_apth, str(datetime.now()).split(' ')[0].replace('-', ''), str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), total_item,encoding='utf-8')
        #     # except:
        #     #     logging.error(traceback.format_exc())
        # else:
        #     pass
        # print(3333333, total_item)
        try:
            self.hdfsclient.new_write(
                '{}/{}/31_{}_{}_sina_comment.json'.format(
                    self.comment_apth,
                    str(datetime.now()).split(' ')[0].replace('-', ''),
                    str(datetime.now()).split(' ')[0].replace('-', '_'),
                    self.time_time),
                self.total_item,
                encoding='utf-8')
        except:
            print('写入重试中......')
            self.write_comment_jsonfile()
            return
        print('写入成功......')
        self.total_item = ''

    def run(self):
        for i in range(1, 100):
            if self.is_work:
                url = 'http://feed.mix.sina.com.cn/api/roll/get?pageid=13&lid=2503&k=&num=50&page={}&r=0.6019004029484454&callback=jQuery311016308312964736538_1566799251373&_=1566799251388'.format(
                    str(i))
                self.get_list_page(url)
                # self.write_news_jsonfile()
            else:
                logger.log(31, '爬取到指定时间......')

Exemple #17

0

Afficher le fichier

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, redis_example):
        # 时间部分,按小时抓取
        date_time = str(datetime.now() - timedelta(days=1)).split('.')[0]
        start_time_test = time.strftime('%Y-%m-%d 00:00:00')

        end_time = time.strftime('%Y-%m-%d %H:%M:%S')
        a = end_time.split(' ')[1].split(':')[0]

        if a == '00':
            start_time_data = date_time
            hours_name = '22_24'
            wen_jian_jia_date = str(datetime.now() - timedelta(
                days=1)).split('.')[0].split(' ')[0].replace('-', '')
        else:
            two_hours_ago = int(a) - 2
            if len(str(two_hours_ago)) == 1:
                two_hour_ago = '0' + str(two_hours_ago)
            else:
                two_hour_ago = str(two_hours_ago)
            hours_name = str(two_hour_ago) + '_' + str(a)
            start_time_data = start_time_test
            wen_jian_jia_date = time.strftime('%Y%m%d')
        print('爬取时间段：{}到{}'.format(start_time_data, end_time))
        logging.info('爬取时间段：{}到{}'.format(start_time_data, end_time))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = start_time_data
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = end_time
        # 标记爬虫工作
        self.is_work = True
        self.redis_example = redis_example
        self.pid = os.getpid()

        self.h2_name = hours_name
        self.date_time = wen_jian_jia_date
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format(
                wen_jian_jia_date, hours_name))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            ret4 = re.sub(r'品牌:', '', ret3)
            ret5 = re.sub(r'\xa0', '', ret4)
            ret6 = re.sub(r'&rarr;_&rarr;', '', ret5)
            ret7 = re.sub(r'&hellip;&hellip;', '', ret6)
            ret8 = re.sub(r'":', '', ret7)
            return ret8
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 13位时间戳转换成日期
    def time_change(self, data):
        timeStamp = float(int(data) / 1000)
        timeArray = time.localtime(timeStamp)
        otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        return otherStyleTime

    # 获取品牌信息
    def parse_brand(self, goods_dict):
        try:
            headers = {
                'content-type':
                'text/html;charset=UTF-8',
                'cookie':
                'kaola_user_key=b87e28b9-e7fc-43ba-8ca7-42abae97a079; _ntes_nnid=116c0ca91001bfb53c23f45f9e55ac87,1568617522153; _ga=GA1.2.290138937.1568617522; _klhtxd_=31; _ga=GA1.3.290138937.1568617522; __da_ntes_utma=2525167.417648162.1568617522.1568617522.1568617522.1; davisit=1; __da_ntes_utmz=2525167.1568617522.1.1.; __da_ntes_utmfc=utmcsr%3D(direct)%7Cutmccn%3D(direct)%7Cutmcmd%3D(none); usertrack=CrGZAV2DFzgLhl54AwtSAg==; KAOLA_NEW_USER_COOKIE=yes; cna=MQj5FQMZD0sCAXxONRZeF0y0; WM_TID=beYPJ03r5ilFBUFUFEZo5jCUV1mKk4PC; t=cf5d799c2331f5cabed38ae64e05e79e; KAOLA_USER_ID=109999078912652422; [email protected]; JSESSIONID-WKL-8IO=0zc3WMz%2Bz0rQe5Jcv1xai4OAOScJJgZviUPXMI3RUo2IYlneCBZYhem2pXj85vvoJ8Z%2B2yMxkJZ%2BDbqGhohayCkj0RWfrbvXgwt00Wju%2BMWVg7WjBsfPPuM6Bq0yJI1vkeq%5C17ndJLsLrHGeY1Sf0k231zopBvGmtXomvGZ5J9TWLbPq%3A1586842936344; davisit=2; __da_ntes_utmb=2525167.1.10.1586756536; _samesite_flag_=true; cookie2=1f50b0bd27965ea6d4731440eb0ab6b2; _tb_token_=57e48eee49e7; csg=7c23ee4b; NTES_OSESS=REPpP5MMDS0ti.Kjs4kXCagwqwIe5DsWd2J6spGZnnoVWWhz6L9pI2HlXPVp_85PuZGCsnYofZ0FK56aZ.uX88iBgdi0zJZsRBB8fdi_YIZfYxQlVYg4kvmcVqVCqK9kxhu.Yzv4Avj3rW.UPrCYFGfnrd5TZovCzX0lNqe3j5rAEWHpYRLXj1PsCx_75evCuvl01iv5jej2sgH2yqYAm2a0p; kaola_csg=93dad892; kaola-user-beta-traffic=12217883524; firstLogin=0; hb_MA-AE38-1FCC6CD7201B_source=search.kaola.com; NTES_KAOLA_RV=1537539_1586756945560_0|2884042_1586756792280_0|5522516_1586513810003_0|5705591_1585881322711_0|8317307_1585880658885_0|5553701_1585880652352_0|8517421_1585879009306_0|1467929_1571291229258_0|5218698_1569811431977_0|5536790_1569811422334_0|5457794_1569811411408_0|5115159_1569811404628_0|2843760_1569566707083_0|5481268_1569489750583_0|2723610_1569488978899_0|2546067_1569485553114_0|1758828_1569485116618_0|1616628_1569482665961_0|5111078_1569482641632_0|2482224_1569482624326_0; isg=BHV1IQtJR6edB6MO8FzlgBdJhPHvWigPBiZuAfeb4ewRzpfAv0AP1GEMGNLdjkG8',
                'pragma':
                'no-cache',
                'referer':
                'https://search.kaola.com/search.html?key=AlphaBounce&oldQuery=AIR%2520MAX&searchRefer=searchbutton&zn=top',
                'upgrade-insecure-requests':
                '1',
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            url = goods_dict['url']
            try:
                # time.sleep(0.2)
                response1 = requests.get(url=url,
                                         headers=headers,
                                         proxies=proxies,
                                         allow_redirects=False,
                                         timeout=30)
            except:
                try:
                    # time.sleep(0.2)
                    response1 = requests.get(url=url,
                                             headers=headers,
                                             proxies=proxies,
                                             allow_redirects=False,
                                             timeout=30)
                except:
                    # time.sleep(0.2)
                    response1 = requests.get(url=url,
                                             headers=headers,
                                             proxies=proxies,
                                             allow_redirects=False,
                                             timeout=30)
            html1 = etree.HTML(response1.text)
            # 品牌
            try:
                goods_dict['brand'] = html1.xpath(
                    '//dt[@class="orig-country"]/a/text()')[0].spilt(' ')[0]
            except:
                goods_dict['brand'] = ''
            # print(goods_dict)
            self.goods_comments(goods_dict)
        except:
            print(9999999999999999999999, traceback.format_exc())

    # 抓取第一页商品评论
    def goods_comments(self, goods_dict):
        try:
            if int(goods_dict['achieve_num']) == 0:
                pass
                # logger.log(31, '**********---------没有商品评论------************')
            else:
                goods_id = goods_dict['goods_id']
                comment_url = 'https://goods.kaola.com/commentAjax/comment_list_new.json'
                # print(comment_url, goods_id)
                headers = {
                    'authority':
                    'goods.kaola.com',
                    'method':
                    'POST',
                    'path':
                    '/commentAjax/comment_list_new.json',
                    'scheme':
                    'https',
                    'accept':
                    '*/*',
                    'accept-encoding':
                    'gzip, deflate, br',
                    'accept-language':
                    'zh-CN,zh;q=0.9',
                    'cache-control':
                    'no-cache',
                    'content-length':
                    '220',
                    'content-type':
                    'application/x-www-form-urlencoded',
                    'cookie':
                    'kaola_user_key=b87e28b9-e7fc-43ba-8ca7-42abae97a079; _ntes_nnid=116c0ca91001bfb53c23f45f9e55ac87,1568617522153; _ga=GA1.2.290138937.1568617522; _klhtxd_=31; _ga=GA1.3.290138937.1568617522; __da_ntes_utma=2525167.417648162.1568617522.1568617522.1568617522.1; davisit=1; __da_ntes_utmz=2525167.1568617522.1.1.; __da_ntes_utmfc=utmcsr%3D(direct)%7Cutmccn%3D(direct)%7Cutmcmd%3D(none); usertrack=CrGZAV2DFzgLhl54AwtSAg==; KAOLA_NEW_USER_COOKIE=yes; cna=MQj5FQMZD0sCAXxONRZeF0y0; WM_TID=beYPJ03r5ilFBUFUFEZo5jCUV1mKk4PC; t=cf5d799c2331f5cabed38ae64e05e79e; KAOLA_USER_ID=109999078912652422; [email protected]; JSESSIONID-WKL-8IO=0zc3WMz%2Bz0rQe5Jcv1xai4OAOScJJgZviUPXMI3RUo2IYlneCBZYhem2pXj85vvoJ8Z%2B2yMxkJZ%2BDbqGhohayCkj0RWfrbvXgwt00Wju%2BMWVg7WjBsfPPuM6Bq0yJI1vkeq%5C17ndJLsLrHGeY1Sf0k231zopBvGmtXomvGZ5J9TWLbPq%3A1586842936344; davisit=2; __da_ntes_utmb=2525167.1.10.1586756536; _samesite_flag_=true; cookie2=1f50b0bd27965ea6d4731440eb0ab6b2; _tb_token_=57e48eee49e7; csg=7c23ee4b; NTES_OSESS=REPpP5MMDS0ti.Kjs4kXCagwqwIe5DsWd2J6spGZnnoVWWhz6L9pI2HlXPVp_85PuZGCsnYofZ0FK56aZ.uX88iBgdi0zJZsRBB8fdi_YIZfYxQlVYg4kvmcVqVCqK9kxhu.Yzv4Avj3rW.UPrCYFGfnrd5TZovCzX0lNqe3j5rAEWHpYRLXj1PsCx_75evCuvl01iv5jej2sgH2yqYAm2a0p; kaola_csg=93dad892; kaola-user-beta-traffic=12217883524; firstLogin=0; hb_MA-AE38-1FCC6CD7201B_source=search.kaola.com; NTES_KAOLA_RV=1537539_1586756945560_0|2884042_1586756792280_0|5522516_1586513810003_0|5705591_1585881322711_0|8317307_1585880658885_0|5553701_1585880652352_0|8517421_1585879009306_0|1467929_1571291229258_0|5218698_1569811431977_0|5536790_1569811422334_0|5457794_1569811411408_0|5115159_1569811404628_0|2843760_1569566707083_0|5481268_1569489750583_0|2723610_1569488978899_0|2546067_1569485553114_0|1758828_1569485116618_0|1616628_1569482665961_0|5111078_1569482641632_0|2482224_1569482624326_0; isg=BHV1IQtJR6edB6MO8FzlgBdJhPHvWigPBiZuAfeb4ewRzpfAv0AP1GEMGNLdjkG8',
                    'origin':
                    'https://goods.kaola.com',
                    'pragma':
                    'no-cache',
                    'referer':
                    'https://goods.kaola.com/review/{}.html'.format(
                        str(goods_id)),
                    'user-agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
                    'x-requested-with':
                    'XMLHttpRequest'
                }
                form_data = {
                    'goodsId': '{}'.format(str(goods_id)),
                    'grade': '0',
                    'tagType': '0',
                    'hasContent': '0',
                    'paginationContext': 'null',
                    'pageNo': '1',
                    'pageSize': '20',
                }
                try:
                    # time.sleep(0.2)
                    response = requests.post(url=comment_url,
                                             headers=headers,
                                             data=form_data,
                                             proxies=proxies,
                                             allow_redirects=False,
                                             timeout=30)
                except:
                    try:
                        # time.sleep(0.2)
                        response = requests.post(url=comment_url,
                                                 headers=headers,
                                                 data=form_data,
                                                 proxies=proxies,
                                                 allow_redirects=False,
                                                 timeout=30)
                    except:
                        # time.sleep(0.2)
                        response = requests.post(url=comment_url,
                                                 headers=headers,
                                                 data=form_data,
                                                 proxies=proxies,
                                                 allow_redirects=False,
                                                 timeout=30)
                # print(response.text)
                data = json.loads(response.text)
                # 获取评论列表
                comments_list = data['data']['commentPage']['result']
                if int(len(comments_list)) == 0:
                    return
                else:
                    # 获取当前页数
                    page_data = data['data']['commentPage']['pageNo']
                    # 评价总页数
                    pages_num = data['data']['commentPage']['totalPage']
                    # logger.log(31, '*******************第1页评论****************')
                    for item in comments_list:
                        kao_la_dict = dict()
                        time_data = self.time_change(item['createTime'])
                        # print(data_time_data)
                        try:
                            content = item['commentContent'].replace('\n', ' ')
                        except:
                            content = ''
                        # 追加评论
                        try:
                            comments_2 = item['replyList'][0]['replyContent']
                        except:
                            comments_2 = ''
                        if self.start_time <= time_data:
                            kao_la_dict['platform'] = goods_dict['platform']
                            kao_la_dict['date'] = time_data.split(' ')[0]
                            kao_la_dict['time'] = time_data.split(' ')[1]
                            kao_la_dict['keyword'] = goods_dict['keyword']
                            kao_la_dict['name'] = goods_dict['name']
                            kao_la_dict['imageurl'] = goods_dict['商品图片']
                            kao_la_dict['audiourl'] = ''
                            kao_la_dict['url'] = goods_dict['url']
                            kao_la_dict['shop_name'] = goods_dict['shop_name']
                            kao_la_dict['user_name'] = ''
                            kao_la_dict['content'] = content + ';' + comments_2
                            kao_la_dict['content_id'] = str(
                                item['goodsCommentId'])
                            kao_la_dict['brand'] = goods_dict['brand']
                            kao_la_dict['price'] = goods_dict['price']
                            kao_la_dict['sales'] = goods_dict['sales']
                            kao_la_dict['focus_count'] = ''
                            kao_la_dict['comment_num'] = goods_dict[
                                'achieve_num']
                            kao_la_dict['views'] = ''
                            kao_la_dict['likes'] = item['zanCount']
                            kao_la_dict['comments_count'] = ''
                            kao_la_dict['author_id'] = ''
                            kao_la_dict['reposts_count'] = ''
                            kao_la_dict['topic_id'] = str(item['goodsId'])
                            try:
                                kao_la_dict['type'] = item['skuPropertyList'][
                                    1]['propertyValue']
                            except:
                                kao_la_dict['type'] = ''
                            try:
                                kao_la_dict['size'] = item['skuPropertyList'][
                                    0]['propertyValue']
                            except:
                                kao_la_dict['size'] = ''
                            kao_la_dict['file_code'] = '176'
                            # print(kao_la_dict)
                            item = json.dumps(dict(kao_la_dict),
                                              ensure_ascii=False) + '\n'
                            self.hdfsclient.new_write(
                                '/user/cspider_daily/nike_2h/ecommerce/{}/{}/176_{}_KaoLa_nike{}.json'
                                .format(self.date_time, self.h2_name,
                                        time.strftime('%Y%m%d'), self.pid),
                                item,
                                encoding='utf-8')
                        else:
                            pass
                    if int(page_data) < int(pages_num):
                        # 获取第一页评论最后一个的id以及下一页从哪页跳转参数
                        lastId = data['data']['paginationContext']['lastId']
                        lastPage = data['data']['paginationContext'][
                            'lastPage']
                        # print(lastId, lastPage)
                        self.goods_comments_2(lastId, lastPage, goods_id,
                                              goods_dict,
                                              int(page_data) + 1)
                    else:
                        pass
        except:
            print(22222222222222222, traceback.format_exc())

    # 获取第一页之后的所有页面评论
    def goods_comments_2(self, lastId, lastPage, goods_id, goods_dict, i):
        try:
            comment_url = 'https://goods.kaola.com/commentAjax/comment_list_new.json'
            # print(comment_url, goods_id, lastId, lastPage)
            headers = {
                'authority':
                'goods.kaola.com',
                'method':
                'POST',
                'path':
                '/commentAjax/comment_list_new.json',
                'scheme':
                'https',
                'accept':
                '*/*',
                'accept-encoding':
                'gzip, deflate, br',
                'accept-language':
                'zh-CN,zh;q=0.9',
                'cache-control':
                'no-cache',
                'content-length':
                '247',
                'content-type':
                'application/x-www-form-urlencoded',
                'cookie':
                'kaola_user_key=b87e28b9-e7fc-43ba-8ca7-42abae97a079; _ntes_nnid=116c0ca91001bfb53c23f45f9e55ac87,1568617522153; _ga=GA1.2.290138937.1568617522; _klhtxd_=31; _ga=GA1.3.290138937.1568617522; __da_ntes_utma=2525167.417648162.1568617522.1568617522.1568617522.1; davisit=1; __da_ntes_utmz=2525167.1568617522.1.1.; __da_ntes_utmfc=utmcsr%3D(direct)%7Cutmccn%3D(direct)%7Cutmcmd%3D(none); usertrack=CrGZAV2DFzgLhl54AwtSAg==; KAOLA_NEW_USER_COOKIE=yes; cna=MQj5FQMZD0sCAXxONRZeF0y0; WM_TID=beYPJ03r5ilFBUFUFEZo5jCUV1mKk4PC; t=cf5d799c2331f5cabed38ae64e05e79e; KAOLA_USER_ID=109999078912652422; [email protected]; JSESSIONID-WKL-8IO=0zc3WMz%2Bz0rQe5Jcv1xai4OAOScJJgZviUPXMI3RUo2IYlneCBZYhem2pXj85vvoJ8Z%2B2yMxkJZ%2BDbqGhohayCkj0RWfrbvXgwt00Wju%2BMWVg7WjBsfPPuM6Bq0yJI1vkeq%5C17ndJLsLrHGeY1Sf0k231zopBvGmtXomvGZ5J9TWLbPq%3A1586842936344; davisit=2; __da_ntes_utmb=2525167.1.10.1586756536; _samesite_flag_=true; cookie2=1f50b0bd27965ea6d4731440eb0ab6b2; _tb_token_=57e48eee49e7; csg=7c23ee4b; NTES_OSESS=REPpP5MMDS0ti.Kjs4kXCagwqwIe5DsWd2J6spGZnnoVWWhz6L9pI2HlXPVp_85PuZGCsnYofZ0FK56aZ.uX88iBgdi0zJZsRBB8fdi_YIZfYxQlVYg4kvmcVqVCqK9kxhu.Yzv4Avj3rW.UPrCYFGfnrd5TZovCzX0lNqe3j5rAEWHpYRLXj1PsCx_75evCuvl01iv5jej2sgH2yqYAm2a0p; kaola_csg=93dad892; kaola-user-beta-traffic=12217883524; firstLogin=0; hb_MA-AE38-1FCC6CD7201B_source=search.kaola.com; NTES_KAOLA_RV=1537539_1586756945560_0|2884042_1586756792280_0|5522516_1586513810003_0|5705591_1585881322711_0|8317307_1585880658885_0|5553701_1585880652352_0|8517421_1585879009306_0|1467929_1571291229258_0|5218698_1569811431977_0|5536790_1569811422334_0|5457794_1569811411408_0|5115159_1569811404628_0|2843760_1569566707083_0|5481268_1569489750583_0|2723610_1569488978899_0|2546067_1569485553114_0|1758828_1569485116618_0|1616628_1569482665961_0|5111078_1569482641632_0|2482224_1569482624326_0; isg=BHV1IQtJR6edB6MO8FzlgBdJhPHvWigPBiZuAfeb4ewRzpfAv0AP1GEMGNLdjkG8',
                'origin':
                'https://goods.kaola.com',
                'pragma':
                'no-cache',
                'referer':
                'https://goods.kaola.com/review/{}.html'.format(str(goods_id)),
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
                'x-requested-with':
                'XMLHttpRequest'
            }
            form_data = {
                'goodsId': '{}'.format(str(goods_id)),
                'grade': '0',
                'tagType': '0',
                'hasContent': '0',
                'showSelfGoodsComment': 'false',
                'paginationContext': {
                    "lastId": '{}'.format(lastId),
                    "lastPage": '{}'.format(lastPage)
                },
                'pageNo': '{}'.format(i),
                'pageSize': '20',
                'hasInitCommentTab': 'true'
            }
            try:
                # time.sleep(0.2)
                response = requests.post(url=comment_url,
                                         headers=headers,
                                         data=form_data,
                                         proxies=proxies,
                                         allow_redirects=False,
                                         timeout=30)
            except:
                try:
                    # time.sleep(0.2)
                    response = requests.post(url=comment_url,
                                             headers=headers,
                                             data=form_data,
                                             proxies=proxies,
                                             allow_redirects=False,
                                             timeout=30)
                except:
                    # time.sleep(0.2)
                    response = requests.post(url=comment_url,
                                             headers=headers,
                                             data=form_data,
                                             proxies=proxies,
                                             allow_redirects=False,
                                             timeout=30)
            data = json.loads(response.text)
            # print(data)
            # 获取评论列表
            comments_list = data['data']['commentPage']['result']
            # logger.log(31, '**********************第{}页评论**********************'.format(i))
            if int(len(comments_list)) == 0:
                return
            else:
                # 获取当前页数
                page_data = data['data']['commentPage']['pageNo']
                # 评价总页数
                pages_num = data['data']['commentPage']['totalPage']
                for item in comments_list:
                    kao_la_goods = dict()
                    time_data = self.time_change(item['createTime'])
                    try:
                        content = item['commentContent'].replace('\n', ' ')
                    except:
                        content = ''
                    # 追加评论
                    try:
                        comments_2 = item['replyList'][0]['replyContent']
                    except:
                        comments_2 = ''
                    if self.start_time <= time_data:
                        kao_la_goods['platform'] = goods_dict['platform']
                        kao_la_goods['date'] = time_data.split(' ')[0]
                        kao_la_goods['time'] = time_data.split(' ')[1]
                        kao_la_goods['keyword'] = goods_dict['keyword']
                        kao_la_goods['name'] = goods_dict['name']
                        kao_la_goods['imageurl'] = goods_dict['商品图片']
                        kao_la_goods['audiourl'] = ''
                        kao_la_goods['url'] = goods_dict['url']
                        kao_la_goods['shop_name'] = goods_dict['shop_name']
                        kao_la_goods['user_name'] = ''
                        kao_la_goods['content'] = content + ';' + comments_2
                        kao_la_goods['content_id'] = str(
                            item['goodsCommentId'])
                        kao_la_goods['brand'] = goods_dict['brand']
                        kao_la_goods['price'] = goods_dict['price']
                        kao_la_goods['sales'] = goods_dict['sales']
                        kao_la_goods['focus_count'] = ''
                        kao_la_goods['comment_num'] = goods_dict['achieve_num']
                        kao_la_goods['views'] = ''
                        kao_la_goods['likes'] = item['zanCount']
                        kao_la_goods['comments_count'] = ''
                        kao_la_goods['author_id'] = ''
                        kao_la_goods['reposts_count'] = ''
                        kao_la_goods['topic_id'] = str(item['goodsId'])
                        try:
                            kao_la_goods['type'] = item['skuPropertyList'][1][
                                'propertyValue']
                        except:
                            kao_la_goods['type'] = ''
                        try:
                            kao_la_goods['size'] = item['skuPropertyList'][0][
                                'propertyValue']
                        except:
                            kao_la_goods['size'] = ''
                        kao_la_goods['file_code'] = '176'
                        # print(kao_la_goods)
                        item = json.dumps(dict(kao_la_goods),
                                          ensure_ascii=False) + '\n'
                        self.hdfsclient.new_write(
                            '/user/cspider_daily/nike_2h/ecommerce/{}/{}/176_{}_KaoLa_nike{}.json'
                            .format(self.date_time, self.h2_name,
                                    time.strftime('%Y%m%d'), self.pid),
                            item,
                            encoding='utf-8')
                    else:
                        pass

                if int(page_data) < int(pages_num):
                    # 获取第2页评论最后一个的id以及下一页从哪页跳转参数
                    lastId = data['data']['paginationContext']['lastId']
                    lastPage = data['data']['paginationContext']['lastPage']
                    i += 1
                    self.goods_comments_2(lastId, lastPage, goods_id,
                                          goods_dict, i)
                else:
                    pass
        except:
            print(3333333333333333333, traceback.format_exc())

    # # 读取excel获取关键词
    # def parse_xlsx(self):
    #     # 设置路径
    #     path = './快消采集关键词_0916_v3-1.xlsx'
    #     # 打开execl
    #     workbook = xlrd.open_workbook(path)
    #
    #     # 根据sheet索引或者名称获取sheet内容
    #     Data_sheet = workbook.sheets()[0]  # 通过索引获取
    #
    #     rowNum = Data_sheet.nrows  # sheet行数
    #     colNum = Data_sheet.ncols  # sheet列数
    #
    #     # 获取所有单元格的内容
    #     list = []
    #     for i in range(rowNum):
    #         rowlist = []
    #         for j in range(colNum):
    #             rowlist.append(Data_sheet.cell_value(i, j))
    #         list.append(rowlist)
    #
    #     for data in list[1::]:
    #         brand = data[0]
    #         # print(brand)
    #         yield {
    #             '关键词': brand,
    #         }

    def run(self, lock):
        for num in range(1000000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('kaola_2h_url')
            if str(redis_url_num) == '0':
                print(
                    '**********Redis消息队列中url为空.....进程 {} 抓取结束......************'
                    .format(str(os.getpid())))
                return
            item = self.redis_example.brpop('kaola_2h_url', timeout=3600)[1]
            lock.release()
            item1 = json.loads(item.decode())
            # print(item)
            self.parse_brand(item1)

Exemple #18

0

Afficher le fichier

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, redis_example):
        # 时间部分
        # 爬虫开始抓取的日期
        date = datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]
        # 爬虫结束的抓取日期
        current_time = datetime.now()  # 当前日期
        current_day = str(current_time).split(' ')[0]
        print('爬取时间段：{}到{}'.format(news_start_time, current_day))
        logging.info('爬取时间段：{}到{}'.format(news_start_time, current_day))
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = current_day

        # 标记爬虫工作
        self.is_break = False
        self.redis_example = redis_example
        self.pid = os.getpid()
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/ecommerce/{}'.format(
                time.strftime('%Y%m%d')))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            ret4 = re.sub(r'品牌:', '', ret3)
            ret5 = re.sub(r'\xa0', '', ret4)
            ret6 = re.sub(r'&rarr;_&rarr;', '', ret5)
            ret7 = re.sub(r'&hellip;&hellip;', '', ret6)
            ret8 = re.sub(r'":', '', ret7)
            return ret8
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 过滤url里面的#detail
    def re_detail(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'#detail', '', message)
            return ret1
        except:
            pass

    # 解析请求得到的商品信息
    def parse_goods_url(self, data):
        goods_dict = dict()
        goods_dict['平台'] = '天猫'
        goods_dict['URL'] = data['URL']
        goods_dict['商品名'] = data['商品名']
        goods_dict['价格'] = data['价格']
        goods_dict['shop_name'] = data['shop_name']
        goods_dict['月销量'] = data['月销量'].replace('人付款', '')
        goods_dict['关键词'] = data['关键词']
        goods_dict['品牌'] = data['品牌']
        goods_dict['itemId'] = data['itemId']
        goods_dict['sellerId'] = data['sellerId']
        goods_dict['imageurl'] = data['商品图片']
        goods_dict['audiourl'] = ''
        # logger.log(31, '***************************正在抓取的商品是:%s.................' % goods_dict)
        self.parse_goods_details(goods_dict)

    # 解析商品品牌信息
    def parse_goods_details(self, goods_dict):
        try:
            url = goods_dict['URL']
            headers = {
                'content-type':
                'text/html;charset=GBK',
                # 'authority': 'detail.tmall.com',
                # 'method': 'GET',
                # 'path': path,
                # 'scheme': 'https',
                # 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                # 'accept-encoding': 'gzip, deflate, br',
                # 'accept-language': 'zh-CN,zh;q=0.9',
                # 'cache-control': 'no-cache',
                'cookie':
                'cq=ccp%3D1; cookie2=1f727f7b9a023d3336775fda77aa6c64; t=e0b1a5b3f801e8ad7974356321ff2384; _tb_token_=33173e5033eee; csg=0cc83e88; dnk=tb9619067_2012; tracknick=tb9619067_2012; lid=tb9619067_2012; enc=x%2FOcdI7JLsr6CpzvtdqWU16H2R3aBtna09TzEkBv9ziurQwe2F0J9TpnrtPlgFZW95bG4nGzcBhhVk6VKxhnTw%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; uc1=cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D&cookie21=UtASsssme%2BBq&cookie15=URm48syIIVrSKA%3D%3D&existShop=false&pas=0&cookie14=UoTaECbNEKyHyQ%3D%3D&tag=8&lng=zh_CN; UM_distinctid=16d142a6ebc31-00f1b1ad240fa2-37c143e-1fa400-16d142a6ebda91; cna=MQj5FQMZD0sCAXxONRZeF0y0; isg=BAgI5_cywVJRgi3av8Xnqb_92XYasWy7u5UjZsK5VAN2nagHasE8S56fETVItiST; l=cBQfGYbVq1scMYebBOCanurza77OSIRYYuPzaNbMi_5pY6TsXPQOkP9r_F96VjWd9vYB41hTyPJ9-etkqPrdT9dbHZ9R.; CNZZDATA1000427971=618518977-1568000449-https%253A%252F%252Fs.taobao.com%252F%7C1568254167; pnm_cku822=098%23E1hvuvvUvbpvUvCkvvvvvjiPRFLUljtVn2sUQjivPmPWljYWRF5vQjDvRLLyAjlhiQhvCvvvpZpEvpCW9HKm93zEn1m65kx%2FQjZJ%2Bu0XjobyACuXjBrQpWkQRqJ6WeCpqU0QKfUpwy2IAfUTKFyK2ixrV4TJVVQHYWpXei%2BffCuYiLUpVE6FpdyCvm9vvhCvvvvvvvvvBGwvvUjZvvCj1Qvvv3QvvhNjvvvmmvvvBGwvvvUUkphvC9hvpyPwl8yCvv9vvhh%2BFHx7CphCvvOv9hCvvvvPvpvhvv2MMsyCvvpvvhCv',
                'pragma':
                'no-cache',
                'upgrade-insecure-requests':
                '1',
                'user-agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            try:
                time.sleep(0.3)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.3)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            timeout=30)
                except:
                    time.sleep(0.3)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            timeout=30)
            # response.encoding = 'gbk'
            # print("正在抓取的链接是: %s" % url)
            # print(response.text)
            spuid = re.search(
                r'"spuId":".*","categoryId"|"spuId":.*?,|amp;spuId=.*?"|spuId=.*?"',
                response.text)
            re_spuid = self.re_not_number(spuid.group())
            # logger.log(31, 'spuid:', re_spuid)
            # 将响应转换成一个element对象
            # html = etree.HTML(response.text)
            # # print(html)
            # # 获取所有品牌节点列表
            # pin_pai_list = html.xpath('//li[@id="J_attrBrandName"]/text()|//ul[@class="attributes-list"]/li[1]/text()')[0]
            # if '品牌' not in pin_pai_list:
            #     pin_pai = html.xpath('//li[@id="J_attrBrandName"]/text()|//ul[@class="attributes-list"]/li[2]/text()')[0]
            # else:
            #     pin_pai = pin_pai_list
            # print(goods_url, pin_pai_list[0])
            # data_list = []
            # 遍历品牌节点列表
            goods_dict['spuid'] = re_spuid
            # print(goods_dict)
            response.close()
            self.goods_collection_num(goods_dict)
        except:
            print(33333333333333333333, traceback.format_exc())

    # 抓取商品收藏数（人气）
    def goods_collection_num(self, goods_dict):
        try:
            url = 'https://count.taobao.com/counter3?callback=jsonp243&keys=SM_368_dsr-{},ICCP_1_{}'.format(
                goods_dict['sellerId'], goods_dict['itemId'])
            headers = {
                'content-type': 'application/x-javascript',
                'cookie':
                't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _uab_collina=155540168306791903478476; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _m_h5_tk=f32553e159b195a4f17c00010f2bcd2e_1564547678304; _m_h5_tk_enc=3268f7bf49fd78b94768c96e3ef51817; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b227365617263686170703b32223a2266303666326434356635336264366335613639393662663834646632366531644349582b684f6f46454c47473571714e726f325955526f4c4f4459314d5467794d6a4d334f7a453d227d; l=cBaOcPD7qg21z_uyBOfZKurza779uIdf1sPzaNbMiICPO_fh5wONWZFb8t8MCnGVLsI2535t6zUaBXYaGyUIh2nk8b8CgsDd.; isg=BOzsOXiZnf59JomJ--wm9a9SvcreDZFEZ8nHSkYsbxe0UY9baraQ30WjcVnMWcin; JSESSIONID=A9F406FD84CDFD576728A12ECBD98A53',
                'upgrade-insecure-requests': '1',
                'user-agent': random.choice(user_agent_list)
            }

            try:
                time.sleep(0.2)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            re_collection_num = re.search(r'":\d{1,20}', response.text)
            # print(re_collection_num.group())
            goods_dict['人气数'] = self.re_html(re_collection_num.group())
            # print(goods_dict)
            response.close()
            self.parse_goods_comment_num(goods_dict)
        except:
            print(444444444444444444, traceback.format_exc())

    # 抓取商品评论数
    def parse_goods_comment_num(self, goods_dict):
        try:
            url = 'https://dsr-rate.tmall.com/list_dsr_info.htm?itemId={}&spuId={}&sellerId={}&groupId&_ksTS=1564105737969_212&callback=jsonp213'.format(
                goods_dict['itemId'], goods_dict['spuid'],
                goods_dict['sellerId'])
            headers = {
                'cookie':
                't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _uab_collina=155540168306791903478476; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _m_h5_tk=f32553e159b195a4f17c00010f2bcd2e_1564547678304; _m_h5_tk_enc=3268f7bf49fd78b94768c96e3ef51817; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b227365617263686170703b32223a2266303666326434356635336264366335613639393662663834646632366531644349582b684f6f46454c47473571714e726f325955526f4c4f4459314d5467794d6a4d334f7a453d227d; l=cBaOcPD7qg21z_uyBOfZKurza779uIdf1sPzaNbMiICPO_fh5wONWZFb8t8MCnGVLsI2535t6zUaBXYaGyUIh2nk8b8CgsDd.; isg=BOzsOXiZnf59JomJ--wm9a9SvcreDZFEZ8nHSkYsbxe0UY9baraQ30WjcVnMWcin; JSESSIONID=A9F406FD84CDFD576728A12ECBD98A53',
                'pragma': 'no-cache',
                'upgrade-insecure-requests': '1',
                'Referer':
                'https://detail.tmall.com/item.htm?spm=a230r.1.14.6.17c44e4c0Tr15d&id=572069385580&cm_id=140105335569ed55e27b&abbucket=1',
                'User-Agent': random.choice(user_agent_list)
            }
            try:
                time.sleep(0.2)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            re_comment_num = re.search(r'{"dsr":{.*}}', response.text)
            goods_dict['评价人数'] = json.loads(
                re_comment_num.group())['dsr']['rateTotal']
            if int(self.re_not_number(goods_dict['评价人数'])) == 0:
                # logger.log(31, '---------该商品没有评价数据-------')
                response.close()
                pass
            else:
                headers1 = {
                    'cookie':
                    't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB',
                    'pragma': 'no-cache',
                    'upgrade-insecure-requests': '1',
                    'Content-Type': 'application/x-www-form-urlencoded',
                    'Referer':
                    'https://detail.tmall.com/item.htm?spm=a230r.1.14.16.26804e4ck29eWS&id=597034992998&ns=1&abbucket=1',
                    'User-Agent': random.choice(user_agent_list)
                }
                comment_url = 'https://rate.tmall.com/list_detail_rate.htm?itemId={}&spuId={}&sellerId={}&order=1&currentPage=1'.format(
                    goods_dict['itemId'], goods_dict['spuid'],
                    goods_dict['sellerId'])
                try:
                    time.sleep(0.2)
                    response1 = requests.get(url=comment_url,
                                             headers=headers1,
                                             proxies=proxies,
                                             allow_redirects=False,
                                             timeout=30)
                except:
                    try:
                        time.sleep(0.2)
                        response1 = requests.get(url=comment_url,
                                                 headers=headers1,
                                                 proxies=proxies,
                                                 allow_redirects=False,
                                                 timeout=30)
                    except:
                        time.sleep(0.2)
                        response1 = requests.get(url=comment_url,
                                                 headers=headers1,
                                                 proxies=proxies,
                                                 allow_redirects=False,
                                                 timeout=30)
                comment_data = response1.text
                comment = re.search(
                    r'{"rateDetail":{"rateCount":{"total":.*"tags":\[]}}',
                    comment_data)
                pages_data = json.loads(
                    comment.group())['rateDetail']['paginator']['items']
                if int(pages_data) == 0:
                    pass
                else:
                    pages_num = int(math.ceil(float(int(pages_data) / 20)))
                    response1.close()
                    response.close()
                    self.goods_comments(goods_dict, pages_num)
        except:
            print(5555555555555555555555, traceback.format_exc())

    # 解析商品评论
    def goods_comments(self, goods_dict, pages_num):
        try:
            is_break = self.is_break
            # print(goods_dict)
            itemId = goods_dict['itemId']
            sellerId = goods_dict['sellerId']
            spuId = goods_dict['spuid']

            headers = {
                'cookie':
                't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB',
                'pragma': 'no-cache',
                'upgrade-insecure-requests': '1',
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer':
                'https://detail.tmall.com/item.htm?spm=a230r.1.14.16.26804e4ck29eWS&id=597034992998&ns=1&abbucket=1',
                'User-Agent': random.choice(user_agent_list)
            }
            if int(pages_num) >= 99:
                pages = 99
            else:
                pages = pages_num
            # logger.log(31, '-------------评论总页数是：%s --------------' % pages)
            # 抓取商品评论链接(总共99页,从1开始)
            for i in range(1, int(pages) + 1):
                comment_url = 'https://rate.tmall.com/list_detail_rate.htm?itemId={}&spuId={}&sellerId={}&order=1&currentPage={}'.format(
                    itemId, spuId, sellerId, i)
                # print(comment_url)
                # response = requests.get(url=comment_url, headers=headers, proxies=random.choice(proxies), timeout=10)
                try:
                    time.sleep(0.2)
                    response = requests.get(url=comment_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    try:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                    except:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                comment_data = response.text
                # logger.log(31, '开始抓取评论')
                # print(comment_data)
                comment = re.search(
                    r'{"rateDetail":{"rateCount":{"total":.*"tags":\[]}}',
                    comment_data)
                # print(comment.group())
                items = json.loads(comment.group())['rateDetail']['rateList']
                # print(items)
                goods_data = dict()
                for item in items:
                    date_data = item['rateDate'].split(' ', 1)[0]
                    time_data = item['rateDate'].split(' ', 1)[1]
                    # print('评论时间', date_data, time_data)
                    try:
                        content = item['appendComment']['content']
                    except:
                        content = ''

                    # 判断评论时间是否在规定的抓取时间内
                    if self.start_time <= date_data.strip():
                        goods_data['platform'] = goods_dict['平台']
                        goods_data['date'] = date_data.strip()
                        goods_data['time'] = time_data.strip()
                        goods_data['keyword'] = goods_dict['关键词']
                        goods_data['name'] = goods_dict['商品名']
                        goods_data['url'] = goods_dict['URL']
                        goods_data['shop_name'] = goods_dict['shop_name']
                        goods_data['user_name'] = item['displayUserNick']
                        goods_data['content'] = self.re_html(
                            item['rateContent']) + ';' + str(
                                self.re_html(content))
                        goods_data['content_id'] = str(item['id'])
                        goods_data['brand'] = goods_dict['品牌']
                        goods_data['price'] = goods_dict['价格']
                        goods_data['sales'] = goods_dict['月销量']
                        goods_data['focus_count'] = goods_dict['人气数']
                        goods_data['comment_num'] = goods_dict['评价人数']
                        goods_data['views'] = ''
                        goods_data['likes'] = ''
                        goods_data['comments_count'] = ''
                        goods_data['author_id'] = ''
                        goods_data['reposts_count'] = ''
                        goods_data['topic_id'] = str(goods_dict['itemId'])
                        # 判断size和type
                        test_data = item['auctionSku']
                        if '分类' in test_data:
                            goods_data['type'] = test_data.split(
                                ':')[1].replace(';尺码:',
                                                '').replace(';鞋码', '').replace(
                                                    ';尺码', '')
                            try:
                                goods_data['size'] = test_data.split(
                                    ':')[2].split(';')[0]
                            except:
                                try:
                                    goods_data['size'] = test_data.split(
                                        ':')[2]
                                except:
                                    goods_data['size'] = ''
                        else:
                            goods_data['type'] = ''
                            goods_data['size'] = ''
                        goods_data['imageurl'] = goods_dict['imageurl']
                        goods_data['audiourl'] = goods_dict['audiourl']
                        goods_data['file_code'] = '50'
                        # logger.log(31, '--------********开始写入商品数据********--------')
                        # print(goods_data)
                        item = json.dumps(dict(goods_data),
                                          ensure_ascii=False) + '\n'
                        self.hdfsclient.new_write(
                            '/user/cspider_daily/nike_daily/ecommerce/{}/50_{}_{}_Tmall_nike{}.json'
                            .format(time.strftime('%Y%m%d'),
                                    time.strftime('%Y%m%d'), self.time_data,
                                    self.pid),
                            item,
                            encoding='utf-8')
                    if date_data.strip() < self.start_time:
                        is_break = True
                if is_break:
                    break
        except:
            print(7777777777777777777, traceback.format_exc())

    def run(self, lock):
        # f = open('E:/chance/电商2/taobao/parse_keyword/pk2_Tmall_url_NIKE.json', 'r', encoding='utf-8')
        # goods_data_list = []
        # for line in f.readlines():
        #     dic_data = json.loads(line)
        #     goods_data_list.append(dic_data)
        # for data in goods_data_list:
        #     self.parse_goods_url(data)
        for num in range(1000000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('Tmall_day_url')
            if str(redis_url_num) == '0':
                print('*******Redis消息队列中url为空，程序等待中...进程 {} 等待中....******'.
                      format(str(os.getpid())))
            item = self.redis_example.brpop('Tmall_day_url', timeout=600)[1]
            lock.release()
            item1 = json.loads(item.decode())
            # print(item)
            self.parse_goods_url(item1)

Exemple #19

0

Afficher le fichier

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, redis_example):
        # 时间部分,按小时抓取
        date_time = str(datetime.now() - timedelta(days=1)).split('.')[0]
        start_time_test = time.strftime('%Y-%m-%d 00:00:00')

        end_time = time.strftime('%Y-%m-%d %H:%M:%S')
        a = end_time.split(' ')[1].split(':')[0]

        if a == '00':
            start_time_data = date_time
            hours_name = '22_24'
            wen_jian_jia_date = str(datetime.now() - timedelta(
                days=1)).split('.')[0].split(' ')[0].replace('-', '')
        else:
            two_hours_ago = int(a) - 2
            if len(str(two_hours_ago)) == 1:
                two_hour_ago = '0' + str(two_hours_ago)
            else:
                two_hour_ago = str(two_hours_ago)
            hours_name = str(two_hour_ago) + '_' + str(a)
            start_time_data = start_time_test
            wen_jian_jia_date = time.strftime('%Y%m%d')
        print('爬取时间段：{}到{}'.format(start_time_data, end_time))
        logging.info('爬取时间段：{}到{}'.format(start_time_data, end_time))
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = start_time_data
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = end_time

        # 标记爬虫工作
        self.is_break = False
        self.redis_example = redis_example
        self.pid = os.getpid()

        self.h2_name = hours_name
        self.date_time = wen_jian_jia_date
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format(
                wen_jian_jia_date, hours_name))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            ret4 = re.sub(r'品牌:', '', ret3)
            ret5 = re.sub(r'\xa0', '', ret4)
            ret6 = re.sub(r'&rarr;_&rarr;', '', ret5)
            ret7 = re.sub(r'&hellip;&hellip;', '', ret6)
            ret8 = re.sub(r'":', '', ret7)
            return ret8
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 过滤url里面的#detail
    def re_detail(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'#detail', '', message)
            return ret1
        except:
            pass

    # 过滤品牌
    def re_pin_pai(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'<li title.*?>', '', ret1)
            ret3 = re.sub(r'品牌:&nbsp;', '', ret2)
            return ret3
        except:
            pass

    # 解析请求得到的商品信息
    def parse_goods_url(self, data):
        goods_dict = dict()
        goods_dict['平台'] = '淘宝'
        goods_dict['URL'] = data['URL']
        goods_dict['商品名'] = data['商品名']
        try:
            goods_dict['品牌'] = data['品牌']
        except:
            goods_dict['品牌'] = ''
        goods_dict['价格'] = data['价格']
        goods_dict['shop_name'] = data['shop_name']
        goods_dict['月销量'] = data['月销量'].replace('人付款', '')
        goods_dict['关键词'] = data['关键词']
        goods_dict['itemId'] = data['itemId']
        goods_dict['sellerId'] = data['sellerId']
        goods_dict['imageurl'] = data['商品图片']
        goods_dict['audiourl'] = ''
        # logger.log(31, '************************正在抓取的商品是:%s................' % goods_dict)
        self.goods_collection_num(goods_dict)

    # 抓取商品收藏数（人气）
    def goods_collection_num(self, goods_dict):
        try:
            url = 'https://count.taobao.com/counter3?callback=jsonp235&keys=ICCP_1_{}'.format(
                goods_dict['itemId'])
            headers = {
                'content-type': 'application/x-javascript',
                'cookie':
                't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; _uab_collina=155540168306791903478476; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _m_h5_tk=f32553e159b195a4f17c00010f2bcd2e_1564547678304; _m_h5_tk_enc=3268f7bf49fd78b94768c96e3ef51817; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b227365617263686170703b32223a2266303666326434356635336264366335613639393662663834646632366531644349582b684f6f46454c47473571714e726f325955526f4c4f4459314d5467794d6a4d334f7a453d227d; l=cBaOcPD7qg21z_uyBOfZKurza779uIdf1sPzaNbMiICPO_fh5wONWZFb8t8MCnGVLsI2535t6zUaBXYaGyUIh2nk8b8CgsDd.; isg=BOzsOXiZnf59JomJ--wm9a9SvcreDZFEZ8nHSkYsbxe0UY9baraQ30WjcVnMWcin; JSESSIONID=A9F406FD84CDFD576728A12ECBD98A53',
                'upgrade-insecure-requests': '1',
                'user-agent': random.choice(user_agent_list)
            }

            try:
                time.sleep(0.3)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.3)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.3)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            re_collection_num = re.search(r'":\d{1,20}', response.text)
            # print(re_collection_num.group())
            goods_dict['人气数'] = self.re_html(re_collection_num.group())
            # print(goods_dict)
            response.close()
            self.parse_goods_comment_num(goods_dict)
        except:
            print(444444444444444444, traceback.format_exc())

    # 抓取商品评论数
    def parse_goods_comment_num(self, goods_dict):
        try:
            url = 'https://rate.taobao.com/detailCommon.htm?auctionNumId={}&userNumId={}'.format(
                goods_dict['itemId'], goods_dict['sellerId'])
            headers = {
                'cookie':
                't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB',
                'pragma': 'no-cache',
                'upgrade-insecure-requests': '1',
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer':
                'https://item.taobao.com/item.htm?spm=a230r.1.14.31.26804e4c03W4qw&id=563490255667&ns=1&abbucket=1',
                'User-Agent': random.choice(user_agent_list)
            }
            try:
                time.sleep(0.2)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            # print('11111')
            # print(response.text)
            if 'total' in response.text:
                re_comment_num = json.loads(
                    response.text.replace('(', '').replace(')', ''))
                # print(re_comment_num)
                goods_dict['评价人数'] = re_comment_num['data']['count']['total']
                # print(goods_dict['评价人数'])
                if int(self.re_not_number(goods_dict['评价人数'])) == 0:
                    # logger.log(31, '-----------该商品没有评论数据--------------')
                    pass
                else:
                    itemId = goods_dict['itemId']
                    sellerId = goods_dict['sellerId']
                    headers1 = {
                        'cookie':
                        't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB',
                        'pragma': 'no-cache',
                        'upgrade-insecure-requests': '1',
                        'Content-Type': 'application/x-www-form-urlencoded',
                        'Referer':
                        'https://item.taobao.com/item.htm?spm=a230r.1.14.31.26804e4c03W4qw&id=563490255667&ns=1&abbucket=1',
                        'User-Agent': random.choice(user_agent_list)
                    }
                    comment_url = 'https://rate.taobao.com/feedRateList.htm?auctionNumId={}&userNumId={}&currentPageNum=1&pageSize=20&rateType=&orderType=feedbackdate&attribute=&sku=&hasSku=false&folded=0'.format(
                        itemId, sellerId)
                    try:
                        time.sleep(0.3)
                        response1 = requests.get(url=comment_url,
                                                 headers=headers1,
                                                 proxies=proxies,
                                                 allow_redirects=False,
                                                 timeout=30)
                    except:
                        try:
                            time.sleep(0.3)
                            response1 = requests.get(url=comment_url,
                                                     headers=headers1,
                                                     proxies=proxies,
                                                     allow_redirects=False,
                                                     timeout=30)
                        except:
                            time.sleep(0.3)
                            response1 = requests.get(url=comment_url,
                                                     headers=headers1,
                                                     proxies=proxies,
                                                     allow_redirects=False,
                                                     timeout=30)
                    re_pages = re.search(
                        r'{"qnaDisabled":true,"watershed":.*"maxPage":.*}',
                        response1.text)
                    comment_nums = json.loads(re_pages.group())['total']
                    if int(comment_nums) == 0:
                        pass
                    else:
                        pages_num = int(
                            math.ceil(float(int(comment_nums) / 20)))
                        response.close()
                        response1.close()
                        self.goods_comments(goods_dict, pages_num)
        except:
            print(5555555555555555555555, traceback.format_exc())

    # 解析商品评论
    def goods_comments(self, goods_dict, pages_num):
        try:
            is_break = self.is_break
            # print(goods_dict)
            itemId = goods_dict['itemId']
            sellerId = goods_dict['sellerId']

            headers = {
                'cookie':
                't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB',
                'pragma': 'no-cache',
                'upgrade-insecure-requests': '1',
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer':
                'https://item.taobao.com/item.htm?spm=a230r.1.14.31.26804e4c03W4qw&id=563490255667&ns=1&abbucket=1',
                'User-Agent': random.choice(user_agent_list)
            }
            # print('----------------商品评论总页数是： %s -----------------------' % pages_num)
            # 抓取商品评论链接(总共99页,从1开始)
            for i in range(1, int(pages_num) + 1):
                comment_url = 'https://rate.taobao.com/feedRateList.htm?auctionNumId={}&userNumId={}&currentPageNum={}&pageSize=20&rateType=&orderType=feedbackdate&attribute=&sku=&hasSku=false&folded=0'.format(
                    itemId, sellerId, i)
                # print(comment_url)
                # response = requests.get(url=comment_url, headers=headers, proxies=random.choice(proxies), timeout=10)
                try:
                    time.sleep(0.3)
                    response = requests.get(url=comment_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    try:
                        time.sleep(0.3)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                    except:
                        time.sleep(0.3)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                comment_data = response.text
                # print('开始抓取评论')
                # print(comment_data)
                comment = re.search(
                    r'{"qnaDisabled":true,"watershed":.*"maxPage":.*}',
                    comment_data)
                # print(comment.group())
                items = json.loads(comment.group())['comments']
                # print(items)
                goods_data = dict()
                for item in items:
                    # if item['date'] != None:
                    # time_test = item['date'].split(' ')[0].replace('年', '-').replace('月', '-').replace('日', '') + ' ' + item['date'].split(' ')[1] + ':00'
                    date_data = item['date'].split(' ')[0].replace(
                        '年', '-').replace('月', '-').replace('日', '')
                    try:
                        time_data = item['date'].split(' ')[1] + ':00'
                    except:
                        time_data = '00:00:00'
                    # print('评论时间', date_data, time_data)
                    try:
                        content = item['content']
                    except:
                        content = ''
                    # 追加评论
                    try:
                        comments_2 = item['appendList'][0]['content']
                    except:
                        comments_2 = ''
                    time_test = date_data + ' ' + time_data
                    # 判断评论时间是否在规定的抓取时间内
                    if self.start_time <= time_test:
                        goods_data['platform'] = goods_dict['平台']
                        goods_data['date'] = date_data.strip()
                        goods_data['time'] = time_data.strip()
                        goods_data['keyword'] = goods_dict['关键词']
                        goods_data['name'] = goods_dict['商品名']
                        goods_data['url'] = goods_dict['URL']
                        goods_data['shop_name'] = goods_dict['shop_name']
                        goods_data['user_name'] = item['user']['nick']
                        goods_data['content'] = content + ';' + comments_2
                        goods_data['content_id'] = str(item['rateId'])
                        goods_data['brand'] = goods_dict['品牌']
                        goods_data['price'] = goods_dict['价格']
                        goods_data['sales'] = goods_dict['月销量']
                        goods_data['focus_count'] = goods_dict['人气数']
                        goods_data['comment_num'] = goods_dict['评价人数']
                        goods_data['views'] = ''
                        goods_data['likes'] = item['useful']
                        goods_data['comments_count'] = ''
                        goods_data['author_id'] = ''
                        goods_data['reposts_count'] = ''
                        goods_data['topic_id'] = str(goods_dict['itemId'])
                        # 判断size和type 颜色分类:黑色高帮&nbsp;&nbsp尺码:37
                        test_data = item['auction']['sku']
                        if '码' in test_data:
                            goods_data['type'] = test_data.split(
                                ':')[1].replace('尺码', '').replace(
                                    '&nbsp;&nbsp',
                                    '').replace('鞋码', '').replace(';尺码', '')
                            goods_data['size'] = test_data.split(':')[2]
                        else:
                            goods_data['type'] = ''
                            goods_data['size'] = ''
                        # print('--------********开始写入商品数据********--------')
                        # print(goods_data)
                        goods_data['imageurl'] = goods_dict['imageurl']
                        goods_data['audiourl'] = goods_dict['audiourl']
                        goods_data['file_code'] = '55'
                        # logger.log(31, '--------********开始写入商品数据********--------')
                        # print(goods_data)
                        item = json.dumps(dict(goods_data),
                                          ensure_ascii=False) + '\n'
                        self.hdfsclient.new_write(
                            '/user/cspider_daily/nike_2h/ecommerce/{}/{}/55_{}_TaoBao_nike{}.json'
                            .format(self.date_time, self.h2_name,
                                    time.strftime('%Y%m%d'), str(self.pid)),
                            item,
                            encoding='utf-8')
                    if date_data.strip() < self.start_time:
                        is_break = True
                if is_break:
                    break
        except:
            print(7777777777777777777, traceback.format_exc())

    def run(self, lock):
        for i in range(1000000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('tao_bao_2h')
            if str(redis_url_num) == '0':
                print(
                    '******Redis消息队列中url为空，程序等待中...进程{}等待中...*********'.format(
                        str(os.getpid())))
            item = self.redis_example.brpop('tao_bao_2h', timeout=3600)[1]
            lock.release()
            item1 = json.loads(item.decode())
            # print(item)
            self.parse_goods_url(item1)

Exemple #20

0

Afficher le fichier

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self):
        # 时间部分
        # 爬虫开始抓取的日期
        date = datetime.now() - timedelta(days=7)
        news_start_time = str(date).split(' ')[0]

        # 爬虫结束的抓取日期
        current_time = datetime.now()  # 当前日期
        current_day = str(current_time).split(' ')[0]

        print('爬取时间段：{}到{}'.format(news_start_time, current_day))
        logging.info('爬取时间段：{}到{}'.format(news_start_time, current_day))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = current_day
        # 标记爬虫工作1
        self.is_break = False
        # 标记爬虫工作2
        self.is_work = False
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/forum/{}'.format(
                time.strftime('%Y%m%d')))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'                                -', '', ret1)
            ret3 = re.sub(
                r'                                                            ',
                '', ret2)
            ret4 = re.sub(r"hot\(.*\d?','", '', ret3)
            ret5 = re.sub(r'\[', '', ret4)
            ret6 = re.sub(r'\]', '', ret5)
            ret7 = re.sub(r"',", "", ret6)
            ret8 = re.sub(r"'", "", ret7)
            return ret8
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 匹配具体时间
    def clean_date(self, x):
        now = datetime.now()
        if str(x).find('昨天') != -1:
            x = datetime.strftime(now + timedelta(days=-1),
                                  '%Y-%m-%d %H:%M:%S')
        elif str(x).find('前天') != -1:
            x = datetime.strftime(now + timedelta(days=-2),
                                  '%Y-%m-%d %H:%M:%S')
        elif str(x).find('天前') != -1:
            x = datetime.strftime(
                now + timedelta(days=-int(str(x).replace('天前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('小时前') != -1:
            x = datetime.strftime(
                now + timedelta(hours=-int(str(x).replace('小时前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('分钟前') != -1:
            x = datetime.strftime(
                now + timedelta(minutes=-int(str(x).replace('分钟前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('今天') != -1:
            x = str(x).replace('今天', now.strftime('%Y-%m-%d') + ' ')
        elif str(x).find('刚刚') != -1:
            x = now.strftime('%Y-%m-%d %H:%M:%S')
        elif str(x).find('秒前') != -1:
            x = now.strftime('%Y-%m-%d %H:%M:%S')
        elif str(x).find('月前') != -1:
            x = datetime.strftime(
                now + timedelta(weeks=-4 * int(str(x).replace('月前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('周前') != -1:
            x = datetime.strftime(
                now + timedelta(weeks=-int(str(x).replace('周前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('[') != -1:
            x = x.replace('[', '').replace(']', '')
        elif str(x).find('月') != -1:
            x = x.replace('月', '-').replace('日', '')
        return x

    # 根据关键词搜索请求得到帖子信息
    def parse_goods(self, key_word):
        try:
            print('正在抓取的关键词是：%s' % key_word)
            insert_time = time.strftime('%Y-%m-%d %H:%M:%S')
            key_word_data = urllib.parse.quote(key_word)
            is_break = self.is_break
            url = 'http://so.hualongxiang.com/?keyword={}&desc=time'
            # print(url)
            headers = {
                # 'Content-Type': 'text/html; charset=UTF-8',
                # 'Host': 'so.hualongxiang.com',
                # 'Pragma': 'no-cache',
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            proxies_list = [
                {
                    "http": "222.89.32.136:9999"
                },
                # {"http": "117.80.86.239:3128"}
            ]
            # print('调用代理是:%s' % random.choice(proxies_list))
            time.sleep(10)
            response = requests.get(url=url.format(key_word),
                                    headers=headers,
                                    allow_redirects=False)
            # 将响应转换成一个element对象
            html = etree.HTML(response.text)
            # 获取帖子总数
            topic_num = self.re_not_number(
                html.xpath('//div[@class="wapper"]/p/text()')[0].split('，')[0])
            # print(topic_num)
            if int(topic_num) == 0:
                logger.log(
                    31, '*******-------关键词:%s 搜索不到内容-------*******' % key_word)
            else:
                # 获取帖子页数
                pages_num = int(math.ceil(float(int(topic_num) / 20)))
                # logger.log(31, '关键词: %s , 搜索帖子总数是: %s , 帖子总页数是：%s' % (key_word, topic_num, pages_num))

                for i in range(1, int(pages_num) + 1):
                    topic_url = 'http://so.hualongxiang.com/search/index?keyword={}&desc=time&page={}'.format(
                        key_word, i)
                    # print(topic_url, '调用代理是:%s' % random.choice(proxies_list))
                    time.sleep(10)
                    response1 = requests.get(url=topic_url,
                                             headers=headers,
                                             allow_redirects=False)
                    # 将响应转换成一个element对象
                    html1 = etree.HTML(response1.text)
                    # 获取帖子列表
                    topic_list = html1.xpath(
                        '//div[@class="shopper-list-long"]/ul/li')
                    # print(len(topic_list))
                    hua_long_xiang = dict()
                    # 遍历帖子列表
                    for data in topic_list:
                        date_time_data = self.clean_date(
                            self.re_html(
                                data.xpath('./div[@class="time"]/span/text()')
                                [1]))
                        # print(date_time_data)
                        date_data = date_time_data.split(' ')[0].strip()
                        if self.start_time <= date_data:
                            hua_long_xiang['platform'] = '化龙巷'
                            hua_long_xiang['date'] = date_data
                            hua_long_xiang['insert_time'] = insert_time
                            hua_long_xiang['author'] = data.xpath(
                                './div[@class="time"]/a/text()')[0]
                            hua_long_xiang['author_url'] = data.xpath(
                                './div[@class="time"]/a/@href')[0]
                            hua_long_xiang['post_client'] = '化龙巷APP'
                            hua_long_xiang['title'] = self.re_html(
                                data.xpath('./div[@class="title"]/a/@onclick')
                                [0]).replace("'", '').replace(')', '')
                            hua_long_xiang['url'] = data.xpath(
                                './div[@class="title"]/a/@href')[0]
                            hua_long_xiang['content_id'] = self.re_not_number(
                                hua_long_xiang['url'])
                            hua_long_xiang['brand'] = ''
                            hua_long_xiang['carseries'] = ''
                            hua_long_xiang['series_url'] = ''
                            # print(hua_long_xiang)
                            response.close()
                            response1.close()
                            self.parse_topic_data(hua_long_xiang['url'],
                                                  hua_long_xiang)
                        if date_data < self.start_time:
                            is_break = True
                    if is_break:
                        logger.log(31, '没有符合时间的帖子')
                        break
        except:
            print(111111111111111111111, traceback.format_exc())

    # 解析帖子内容
    def parse_topic_data(self, url, hua_long_xiang):
        try:
            headers = {
                # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                # 'Accept-Encoding': 'gzip, deflate',
                # 'Accept-Language': 'zh-CN,zh;q=0.9',
                # 'Cache-Control': 'no-cache',
                # 'Connection': 'keep-alive',
                'Cookie':
                'srcurl=687474703a2f2f7777772e6875616c6f6e677869616e672e636f6d2f6368617a756f2f3135303537343135;f04e6_lastpos=T15057415;f04e6_ipstate=1573461495;security_session_verify=acc65a1e29d3f4b165840dab4d94db31;security_session_mid_verify=428b31ce793e13908b5c599759e876a4;f04e6_lastvisit=19992%091573462167%09%2Frewrite.php%3Fpychazuo%2F15057415;f04e6_ci=read%091573462167%0915057415%09103;Hm_lpvt_82d62f38b0397423b12572434961fe6c=1573462167',
                # 'Host': 'www.hualongxiang.com',
                # 'Pragma': 'no-cache',
                # 'Referer': 'http://www.hualongxiang.com/chazuo/14994153?security_verify_data=313932302c31303830',
                # 'Upgrade-Insecure-Requests': '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            proxies_list = [
                {
                    "http": "222.89.32.136:9999"
                },
                # {"http": "117.80.86.239:3128"}
            ]
            # print(url, '调用代理是:%s' % random.choice(proxies_list))
            try:
                time.sleep(10)
                response = requests.get(url=url,
                                        headers=headers,
                                        allow_redirects=False)
            except:
                try:
                    time.sleep(10)
                    response = requests.get(url=url,
                                            headers=headers,
                                            allow_redirects=False)
                except:
                    time.sleep(10)
                    response = requests.get(url=url,
                                            headers=headers,
                                            allow_redirects=False)
            response.encoding = 'gbk'
            if '发表于' not in response.text:
                logger.log(31, '主贴:' + url + '请求失败，重新发起请求')
                time.sleep(20)
                self.parse_topic_data(url, hua_long_xiang)
            else:
                # 将响应转换成一个element对象
                html = etree.HTML(response.text)
                # print(response.text)
                # 获取发帖时间
                time_data_test = re.search(r'发表于：.*?</p>', response.text)
                time_test = time_data_test.group().replace('发表于：', '').replace(
                    '</p>', '').split(' ')[1]
                lang = len(time_test.split(':'))
                if int(lang) == 3:
                    time_data = time_data_test
                else:
                    time_data = time_test.split(
                        ':')[0] + ':' + time_test.split(':')[1] + ':' + '00'
                hua_long_xiang['time'] = time_data
                hua_long_xiang['content'] = self.re_html(
                    html.xpath(
                        '//div[@class="fs16 mb10" and @id="read_tpc"]//text()')
                ).replace('\\r',
                          '').replace('\\n',
                                      '').replace('\\t',
                                                  '').replace('\\xa0', '')
                hua_long_xiang['imageurl'] = html.xpath(
                    '//div[@class="fs16 mb10" and @id="read_tpc"]//img/@src')
                hua_long_xiang['audiourl'] = ''
                hua_long_xiang['from'] = ''
                hua_long_xiang['is_topics'] = '是'
                hua_long_xiang['floor'] = html.xpath(
                    '//div[@class="fr gc6"]/a[2]/text()')[0]
                hua_long_xiang['author_id'] = self.re_not_number(
                    re.search(
                        r"uid=.*?'",
                        html.xpath('//div[@class="fr gc6"]/a[1]/@onclick')
                        [0]).group())
                hua_long_xiang['identification'] = ''
                hua_long_xiang['favorite'] = ''
                hua_long_xiang['signin_time'] = ''
                hua_long_xiang['reply_no'] = html.xpath(
                    '//ul[@class="data"]/li[2]/span/text()')[0]
                hua_long_xiang['views'] = html.xpath(
                    '//ul[@class="data"]/li[1]/span/text()')[0]
                hua_long_xiang['likes'] = ''
                hua_long_xiang['is_elite'] = ''
                hua_long_xiang['topic_count'] = html.xpath(
                    '//span[@class="user-info2" and @id="showface_0"]/ul/li[1]/a/text()'
                )[0]
                hua_long_xiang['reply_count'] = ''
                hua_long_xiang['pick_count'] = ''
                hua_long_xiang['follows'] = ''
                hua_long_xiang['topic_categroy'] = ''
                hua_long_xiang['topic_type'] = ''
                hua_long_xiang['reposts_count'] = ''
                hua_long_xiang['update_time'] = time.strftime(
                    '%Y-%m-%d %H:%M:%S')
                hua_long_xiang['topic_id'] = hua_long_xiang['content_id']
                hua_long_xiang['file_code'] = '187'

                # logger.log(31, '----------------正在写入主贴----------------')
                # print(hua_long_xiang)
                response.close()
                item = json.dumps(dict(hua_long_xiang),
                                  ensure_ascii=False) + '\n'
                self.hdfsclient.new_write(
                    '/user/cspider_daily/nike_daily/forum/{}/187_{}_{}_hualongxiang_Nike.json'
                    .format(time.strftime('%Y%m%d'), time.strftime('%Y%m%d'),
                            self.time_data),
                    item,
                    encoding='utf-8')
                if int(hua_long_xiang['reply_no']) == 0:
                    logger.log(31, '没有回帖')
                else:
                    # 获取回帖页数
                    pages_num = int(
                        math.ceil(float(int(hua_long_xiang['reply_no']) / 25)))
                    for i in range(pages_num, 0, -1):
                        url_topic = 'http://www.hualongxiang.com/read.php?tid={}&pd=0&page={}'.format(
                            hua_long_xiang['content_id'], i)
                        self.parse_reply(url_topic, hua_long_xiang)
        except:
            print(url, '请求主贴失败，重新发起请求')
            time.sleep(20)
            self.parse_topic_data(url, hua_long_xiang)
            print(222222222222222222222, traceback.format_exc())

    # 抓取回帖内容
    def parse_reply(self, url_topic, hua_long_xiang):
        try:
            is_work = self.is_work
            start_time = time.strftime('%Y-%m-%d %H:%M:%S')
            headers = {
                # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                # 'Accept-Encoding': 'gzip, deflate',
                # 'Accept-Language': 'zh-CN,zh;q=0.9',
                # 'Cache-Control': 'no-cache',
                # 'Connection': 'keep-alive',
                'Cookie':
                'srcurl=687474703a2f2f7777772e6875616c6f6e677869616e672e636f6d2f6368617a756f2f3135303537343135;f04e6_lastpos=T15057415;f04e6_ipstate=1573461495;security_session_verify=acc65a1e29d3f4b165840dab4d94db31;security_session_mid_verify=428b31ce793e13908b5c599759e876a4;f04e6_lastvisit=19992%091573462167%09%2Frewrite.php%3Fpychazuo%2F15057415;f04e6_ci=read%091573462167%0915057415%09103;Hm_lpvt_82d62f38b0397423b12572434961fe6c=1573462167',
                # 'Host': 'www.hualongxiang.com',
                # 'Pragma': 'no-cache',
                # 'Referer': 'http://www.hualongxiang.com/chazuo/14994153?security_verify_data=313932302c31303830',
                # 'Upgrade-Insecure-Requests': '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            proxies_list = [
                {
                    "http": "222.89.32.136:9999"
                },
                # {"http": "117.80.86.239:3128"}
            ]
            # print('调用代理是:%s' % random.choice(proxies_list))
            try:
                time.sleep(10)
                response = requests.get(url=url_topic,
                                        headers=headers,
                                        allow_redirects=False)
            except:
                try:
                    time.sleep(10)
                    response = requests.get(url=url_topic,
                                            headers=headers,
                                            allow_redirects=False)
                except:
                    time.sleep(10)
                    response = requests.get(url=url_topic,
                                            headers=headers,
                                            allow_redirects=False)
            response.encoding = 'gbk'
            if '发表于' not in response.text:
                logger.log(31, '回帖:' + url_topic + '请求失败，重新发起请求')
                time.sleep(20)
                self.parse_reply(url_topic, hua_long_xiang)
            else:
                # 将响应转换成一个element对象
                html = etree.HTML(response.text)

                reply_dict = dict()
                # 获取回帖列表
                reply_list = html.xpath(
                    '//div[@class="read_t"]/table[@class="floot"]')
                for item in reply_list[::-1]:
                    floor_data = item.xpath(
                        './tr[1]/td[2]/div[2]/div/a[2]/text()')[0]
                    print(floor_data)
                    if floor_data == '楼主' or floor_data == '置顶':
                        pass
                    else:
                        url_data = response.url
                        floor_test = floor_data
                        date_time_test = item.xpath(
                            './tr[1]/td[2]/div[2]/p/text()')[0].replace(
                                '发表于 ', '').strip()
                        print(date_time_test)
                        # 发表日期
                        date_data = date_time_test.split(' ')[0].strip()
                        # 发表时间
                        time_data_test = date_time_test.split(' ')[1]
                        lang = len(time_data_test.split(':'))
                        if int(lang) == 3:
                            time_data = time_data_test.strip()
                        else:
                            time_data = (time_data_test.split(':')[0] + ':' +
                                         time_data_test.split(':')[1] + ':' +
                                         '00').strip()
                        if self.start_time <= date_data:
                            reply_dict['platform'] = hua_long_xiang['platform']
                            reply_dict['date'] = date_data
                            reply_dict['time'] = time_data
                            reply_dict['author'] = item.xpath(
                                './tr[1]/td[1]/div/div[1]/span[3]/text()')[0]
                            reply_dict['author_url'] = item.xpath(
                                './tr[1]/td[1]/div/div[2]/a/@href')[0]
                            reply_dict['author_id'] = self.re_not_number(
                                re.search(
                                    r"uid=.*?'",
                                    item.xpath(
                                        './tr[1]/td[2]/div[2]/div/a[1]/@onclick'
                                    )[0]).group())
                            reply_dict['post_client'] = hua_long_xiang[
                                'post_client']
                            reply_dict['title'] = hua_long_xiang['title']
                            reply_dict['content'] = item.xpath(
                                './tr[1]/td[2]/div[4]/div/div[2]/text()')[0]
                            reply_dict['imageurl'] = ''
                            reply_dict['audiourl'] = ''
                            reply_dict['content_id'] = self.re_not_number(
                                item.xpath(
                                    './tr[1]/td[2]/div[4]/div/div[2]/@id')[0])
                            reply_dict['brand'] = ''
                            reply_dict['carseries'] = ''
                            reply_dict['from'] = ''
                            reply_dict['series_url'] = ''
                            reply_dict['url'] = url_data
                            reply_dict['is_topics'] = '否'
                            reply_dict['floor'] = floor_test
                            reply_dict['identification'] = ''
                            reply_dict['favorite'] = ''
                            reply_dict['signin_time'] = ''
                            reply_dict['reply_no'] = ''
                            reply_dict['views'] = ''
                            reply_dict['likes'] = ''
                            reply_dict['is_elite'] = ''
                            reply_dict['topic_count'] = item.xpath(
                                './tr[1]/td[1]/div/span/ul/li[1]/a/text()')[0]
                            reply_dict['reply_count'] = ''
                            reply_dict['pick_count'] = ''
                            reply_dict['follows'] = ''
                            reply_dict['topic_categroy'] = ''
                            reply_dict['topic_type'] = ''
                            reply_dict['reposts_count'] = ''
                            reply_dict['insert_time'] = start_time
                            reply_dict['update_time'] = time.strftime(
                                '%Y-%m-%d %H:%M:%S')
                            reply_dict['topic_id'] = hua_long_xiang['topic_id']
                            reply_dict['file_code'] = '187'
                            # logger.log(31, '******************开始写入回帖数据**********************')
                            # print(reply_dict)
                            response.close()
                            item = json.dumps(dict(reply_dict),
                                              ensure_ascii=False) + '\n'
                            self.hdfsclient.new_write(
                                '/user/cspider_daily/nike_daily/forum/{}/187_{}_{}_hualongxiang_Nike.json'
                                .format(time.strftime('%Y%m%d'),
                                        time.strftime('%Y%m%d'),
                                        self.time_data),
                                item,
                                encoding='utf-8')
                        if date_data < self.start_time:
                            is_work = True
                    if is_work:
                        break
        except:
            print(url_topic, '请求回贴失败，重新发起请求')
            time.sleep(20)
            self.parse_topic_data(url_topic, hua_long_xiang)
            print(333333333333333333333, traceback.format_exc())

    # 读取excel获取关键词
    def parse_xlsx(self):
        # 设置路径
        path = './快消采集关键词_v12_20200119.xlsx'
        # 打开execl
        workbook = xlrd.open_workbook(path)

        # 根据sheet索引或者名称获取sheet内容
        Data_sheet = workbook.sheets()[0]  # 通过索引获取

        rowNum = Data_sheet.nrows  # sheet行数
        colNum = Data_sheet.ncols  # sheet列数

        # 获取所有单元格的内容
        list = []
        for i in range(rowNum):
            rowlist = []
            for j in range(colNum):
                rowlist.append(Data_sheet.cell_value(i, j))
            list.append(rowlist)

        for data in list[1::]:
            brand = data[0]
            # print(brand)
            yield {
                '关键词': brand,
            }

    def run(self):
        key_word_list = []
        for item in self.parse_xlsx():
            key_word_list.append(item)
        for item_data in key_word_list:
            self.parse_goods(item_data['关键词'])

Exemple #21

0

Afficher le fichier

Fichier : wangyi_esport_hour.py Projet : yuzhujiutian/nike_spider

class Spider(object):
    """
    网易体育新闻
    """
    def __init__(self, file_path, comment_path, need_time):

        self.headers_one = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
        }
        # 评论接口模板
        self.comment_port_url = 'http://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/{}/comments/newList?ibc=newspc&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&offset={}&callback=jsonp_1542355418897&_=1542355418898'

        # # get_now_time = time.time() - 86400
        # get_now_time = time.time() - int(need_time)
        # time_local = time.localtime(float(get_now_time))
        # # 转换成新的时间格式(2016-05-05 20:28:54)
        # dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        # end_t = time.time()
        # time_local = time.localtime(float(end_t))
        # # 转换成新的时间格式(2016-05-05 20:28:54)
        # end_dt = time.strftime("%Y-%m-%d %H:%M", time_local)  # "%Y-%m-%d %H:%M:%S"
        # # end_time = str(end_time).split(' ')[0]
        # logging.log(31, '爬取时间段：{}到{}'.format(dt, str(datetime.now())))
        # # 定义开始时间 y-m-d  离现在时间远
        # self.start_time = dt
        # # self.start_time = '2019-09-09 00:01'
        # # 定义结束时间 y-m-d  离现在时间近
        # self.end_time = end_dt
        # # self.end_time = '2019-09-16 12:57'

        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:  # 对于凌晨 0 点的判断
            # 时间判断部分
            date = datetime.now() - timedelta(days=1)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=1)  # 昨天时间
            yesterday = str(yesterday).split(' ')[0]
        else:
            # 时间判断部分
            date = datetime.now() - timedelta(days=0)
            news_start_time = str(date).split(' ')[0]
            yesterday = datetime.now() - timedelta(days=0)  # 昨天时间
            yesterday = str(yesterday).split(' ')[0]
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time + ' 0:00'
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = yesterday + ' 23:59'
        # 标记爬虫工作
        self.is_work = True
        self.file_name_time = self.get_file_name_time()
        self.file_path = file_path
        self.comment_apth = comment_path
        self.hdfsclient = HdfsClient(url='http://*****:*****@class="articleList"]/li')
        for li in li_list:
            title = li.xpath('.//a/text()')[0]
            news_url = li.xpath('.//a/@href')[0]
            try:
                self.get_news_info_page(news_url, '', '')
            except:
                try:
                    self.get_news_info_page(news_url, '', '')
                except:
                    logger.error(traceback.format_exc())

    # 获取新闻详情页
    def get_news_info_page(self, news_url, comment_count, page_list):
        logger.log(31, '文章url:  ' + news_url)
        item = {}
        response = requests.get(news_url, headers=self.headers_one)
        status_code = response.status_code
        if status_code == 200:
            try:
                data = response.content.decode('gbk')
            except (UnicodeDecodeError, ):
                data = response.content.decode('utf-8')
            data = etree.HTML(data)
            news_id = news_url.split('/')[-1].split('.')[0]
            try:
                title = data.xpath('.//div[@id="epContentLeft"]/h1/text()')[0]
            except:
                title = data.xpath('.//h1/text()')[0]
            try:
                date_all = data.xpath(
                    './/div[@class="post_time_source"]/text()')[0]
                date_all = re.findall('\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
                                      date_all)[0]
            except:
                date_all = data.xpath(
                    './/div[@class="headline"]/span/text()')[0]

            # 获取评论数
            try:
                comment_response = requests.get('http://comment.tie.163.com/' +
                                                str(news_id) + '.html',
                                                headers=self.headers_one)
                # print('http://comment.tie.163.com/' + str(news_id) + '.html')
                # comment_data = comment_response.content.decode()
                count = re.search('"tcount":\d{0,10}',
                                  comment_response.text).group(0)
                count = count.split(":")[1]
                comment_id = news_id
            except AttributeError:
                headers = {
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
                    'Host': 'comment.tie.163.com',
                    'Accept':
                    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                    'Upgrade-Insecure-Requests': '1',
                    'Accept-Language': 'zh-CN,zh;q=0.9',
                }
                comment_id = re.search('docId" :  "(.*)?",',
                                       response.text).group(1)
                # print(comment_id)
                # print('http://comment.tie.163.com/' + str(comment_id) + '.html')
                comment_response = requests.get('http://comment.tie.163.com/' +
                                                str(comment_id) + '.html',
                                                headers=headers)
                count = re.search('"tcount":\d{0,10}',
                                  comment_response.text).group(0)
                count = count.split(":")[1]
            except:
                # print(traceback.format_exc())
                count = ''

            # 网站
            item['platform'] = '网易新闻'
            # 日期date
            #  评论部分做时间判断部分---------------
            get_news_time = time.mktime(
                time.strptime(str(date_all.split(' ')[0]).strip(), "%Y-%m-%d"))
            # end_time = time.mktime(time.strptime(self.end_time, "%Y-%m-%d %H:%M"))
            if self.start_time != '':
                start_time = time.mktime(
                    time.strptime(self.start_time.split(' ')[0], "%Y-%m-%d"))
            else:
                start_time = time.mktime(time.strptime('2010-1-1', "%Y-%m-%d"))
            if float(get_news_time) < float(start_time):
                print('时间不符合')
            elif float(start_time) <= float(get_news_time):

                date = date_all.strip().split(' ')[0]
                item['date'] = date
                news_time = date_all.strip().split(' ')[1]
                item['time'] = news_time
                item['title'] = title
                # 来源
                try:
                    source = data.xpath(
                        './/div[@class="post_time_source"]/a/text()')[0]
                    item['article_source'] = source  # 文章来源
                except:
                    item['article_source'] = ''
                try:
                    item['article_author'] = data.xpath(
                        './/span[@class="ep-editor"]/text()')[0]
                except:
                    item['article_author'] = ''
                # 正文内容
                content = data.xpath(
                    './/div[@id="endText"]/p/text() | .//div[@id="endText"]/p/a/text() |.//div[@class="overview"]//p/text()'
                )
                images_url = data.xpath('.//div[@id="endText"]//img/@src')

                content = ''.join(content)
                content = content.replace('\n', '')
                content = content.replace(' ', '')
                item['content'] = content
                item['keyword'] = ''
                item['views'] = ''
                item['comments_count'] = count
                item['likes'] = ''
                item['clicks'] = ''
                item['article_url'] = news_url  # 文章详情URL
                item['dislikes'] = ''  # 踩人数
                item['series_url'] = ''  # 车系首页
                item['list_url'] = page_list  # 文章列表URL
                if 'buy' in page_list:
                    news_type = '购车'
                elif 'nauto' in page_list:
                    news_type = '新车'
                elif 'drive' in page_list:
                    news_type = '试驾'
                elif 'buyers_guides' in page_list:
                    news_type = '导购'
                elif 'auto_newenergy' in page_list:
                    news_type = '新能源'
                elif 'news' in page_list:
                    news_type = '行业'
                else:
                    news_type = ''

                item['article_type_1st'] = news_type  # 文章类型
                item['article_type_2nd'] = ''  # 文章类型
                item['insert_time'] = str(
                    datetime.now()).split('.')[0]  # 初始爬取时间
                item['update_time'] = str(
                    datetime.now()).split('.')[0]  # 最后爬取时间
                content_id = news_url.split('/')[-1].split('.')[0]
                item['content_id'] = content_id
                item['topic_id'] = str(content_id)  # 主贴id
                item['author_id'] = ''  # 作者id
                item['content_id'] = str(content_id)
                item['file_code'] = '15'
                item['reposts_count'] = ''
                item['imageurl'] = images_url
                item['audiourl'] = []
                # print(item)
                self.__write_news_jsonfile(item)

                # 调用爬取评论的函数
                # http://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/E0IBEEA10008856S/comments/newList?ibc=newspc&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&offset=0&callback=jsonp_1542355418897&_=1542355418898
                self.is_get_comment = True
                self.comment_page_num = 30
                self.get_comment_info(
                    self.comment_port_url.format(comment_id, "0"), news_id,
                    date, news_time, title, news_url)
                # with open('./../wangyi/json_file/{}/{}_news_id.json'.format(self.file_name_time.split(' ')[0],self.file_name_time.split(' ')[0]),'a') as f:
                #     com_item = {}
                #     com_item['url'] = self.comment_port_url.format(comment_id, "0")
                #     com_item['news_id'] = news_id
                #     com_item['date'] = date
                #     com_item['news_time'] = news_time
                #     com_item['title'] = title
                #     com_item['news_url'] = news_url
                #     f.write(str(com_item) + '\n')

    # 获取评论信息
    def get_comment_info(self, url, news_id, source_date, source_time,
                         source_title, source_url):
        # time.sleep(1)

        s = requests.session()
        s.keep_alive = False
        respnse = requests.get(url, headers=self.headers_one)
        status_code = respnse.status_code
        if status_code == 200:
            data = respnse.content.decode()
            try:
                data = re.findall(r'{"commentIds.*newListSize":\d{0,10}}',
                                  data)[0]
                data = json.loads(data)
                comment_data = data['comments']
                comment_id = data['commentIds']
                if comment_id:
                    total_item = ''
                    for comment_info in comment_data.items():
                        # print(comment_info)
                        item = {}
                        comment_info = comment_info[1]
                        # 网站
                        item['platform'] = '网易新闻'
                        # 日期时间
                        date_all = comment_info['createTime']
                        get_date = date_all[:-3]
                        #  评论部分做时间判断部分---------------
                        logger.log(31, date_all)
                        logger.log(31, get_date)
                        get_news_time = time.mktime(
                            time.strptime(str(get_date), "%Y-%m-%d %H:%M"))
                        end_time = time.mktime(
                            time.strptime(self.end_time, "%Y-%m-%d %H:%M"))
                        if self.start_time != '':
                            start_time = time.mktime(
                                time.strptime(self.start_time,
                                              "%Y-%m-%d %H:%M"))
                        else:
                            start_time = time.mktime(
                                time.strptime('2010-1-1', "%Y-%m-%d %H:%M"))
                        if float(get_news_time) < float(start_time):
                            self.is_get_comment = False  # 返回的回答消息是按时间进行排序的，所以当时间小于指定时间时，就停止爬取，
                            break
                        elif float(start_time) <= float(
                                get_news_time) <= float(end_time):
                            item['date'] = get_date
                            comment_time = date_all.split(' ')[1]
                            item['time'] = comment_time
                            # 发帖作者
                            try:
                                author = comment_info['user']['nickname']
                            except KeyError:
                                author = comment_info['user']['location'] + '网友'
                            item['author'] = author

                            item['author_id'] = comment_info['user'][
                                'userId']  # 用户id
                            # 内容
                            content = comment_info['content']
                            item['content'] = content
                            # 点赞数
                            item['likes'] = comment_info['vote']
                            # 原文发布日期时间
                            item['source_date'] = source_date
                            item['source_time'] = source_time
                            # 原文标题
                            item['title'] = source_title
                            # 原文url
                            item['source_url'] = source_url
                            item['keyword'] = ''
                            item['floor'] = ''
                            item[
                                'comment_url'] = 'http://comment.tie.163.com/' + str(
                                    news_id) + '.html'
                            item['comments_count'] = ''
                            item['views'] = ''
                            item['dislikes'] = comment_info['against']  # 踩人数
                            item['insert_time'] = str(
                                datetime.now()).split('.')[0]  # 初始爬取时间
                            item['update_time'] = str(
                                datetime.now()).split('.')[0]  # 最后爬取时间
                            item['content_id'] = comment_info['commentId']
                            content_id = source_url.split('/')[-1].split(
                                '.')[0]
                            item['topic_id'] = str(content_id)  # 主贴id
                            item['content_id'] = comment_info[
                                'commentId']  # 主贴id
                            item['file_code'] = '29'
                            item['reposts_count'] = ''
                            item = json.dumps(dict(item),
                                              ensure_ascii=False) + '\n'
                            total_item += item
                            # print(item)
                    self.__write_comment_jsonfile(total_item)
                    if self.is_get_comment:
                        self.comment_page_num += 30
                        # print(self.comment_page_num, '111111111111111111111111')
                        self.get_comment_info(
                            self.comment_port_url.format(
                                news_id, str(self.comment_page_num)), news_id,
                            source_date, source_time, source_title, source_url)
                else:
                    logger.log(31, '评论爬取完毕')
                    self.comment_page_num = 30
            except:
                logger.error(traceback.format_exc())

    def get_file_name_time(self):
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:
            num = 24
            a = str(datetime.now() - timedelta(days=1))  # 昨天时间
        num = a.split(' ')[0] + ' ' + str(num)
        return num

    # 写入json文件
    def __write_news_jsonfile(self, item):
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # with open('./../wangyi/json_file/{}/{}_wangyi_news.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        self.hdfsclient.new_write('{}/{}/{}/15_{}_{}_wangyi_news.json'.format(
            self.file_path,
            self.file_name_time.split(' ')[0].replace('-', ''), self.hour_name,
            str(datetime.now()).split(' ')[0].replace('-', '_'),
            self.time_time),
                                  item,
                                  encoding='utf-8')

    def __write_comment_jsonfile(self, item):

        # with open('./../wangyi/json_file/{}/{}_wangyi_news_comment.json'.format(self.file_name_time.split(' ')[0], self.file_name_time), 'ab') as f:
        #     f.write(item.encode("utf-8"))
        self.hdfsclient.new_write(
            '{}/{}/{}/29_{}_{}_wangyi_news_comment.json'.format(
                self.comment_apth,
                self.file_name_time.split(' ')[0].replace('-',
                                                          ''), self.hour_name,
                str(datetime.now()).split(' ')[0].replace('-', '_'),
                self.time_time),
            item,
            encoding='utf-8')

    def run(self):
        # self.get_list_page('http://sports.163.com/special/0005rt/news_json.js?0.4744335570460496')
        #
        self.get_list_page_two(
            'http://sports.163.com/special/0005rt/sportsgd.html')
        for i in range(2, 5):
            if len(str(i)) == 1:
                i = '0' + str(i)
            self.get_list_page_two(
                'http://sports.163.com/special/0005rt/sportsgd_{}.html'.format(
                    str(i)))

Exemple #22

0

Afficher le fichier

Fichier : taobao-新版.py Projet : yuzhujiutian/nike_spider

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, redis_example):
        # 时间部分
        # 爬虫开始抓取的日期
        date = datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]
        # 爬虫结束的抓取日期
        current_time = datetime.now()  # 当前日期
        current_day = str(current_time).split(' ')[0]
        print('爬取时间段：{}到{}'.format(news_start_time, current_day))
        logging.info('爬取时间段：{}到{}'.format(news_start_time, current_day))
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = current_day

        # 标记爬虫工作
        self.is_break = False
        self.redis_example = redis_example
        self.pid = os.getpid()
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/ecommerce/{}'.format(
                time.strftime('%Y%m%d')))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            ret4 = re.sub(r'品牌:', '', ret3)
            ret5 = re.sub(r'\xa0', '', ret4)
            ret6 = re.sub(r'&rarr;_&rarr;', '', ret5)
            ret7 = re.sub(r'&hellip;&hellip;', '', ret6)
            ret8 = re.sub(r'":', '', ret7)
            return ret8
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 过滤url里面的#detail
    def re_detail(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'#detail', '', message)
            return ret1
        except:
            pass

    # 过滤品牌
    def re_pin_pai(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'<li title.*?>', '', ret1)
            ret3 = re.sub(r'品牌:&nbsp;', '', ret2)
            return ret3
        except:
            pass

    # 解析请求得到的商品信息
    def parse_goods_url(self, data):
        goods_dict = dict()
        goods_dict['平台'] = '淘宝'
        goods_dict['URL'] = data['URL']
        goods_dict['商品名'] = data['商品名']
        try:
            goods_dict['品牌'] = data['品牌']
        except:
            goods_dict['品牌'] = ''
        goods_dict['价格'] = data['价格']
        goods_dict['shop_name'] = data['shop_name']
        goods_dict['关键词'] = data['关键词']
        goods_dict['itemId'] = data['itemId']
        goods_dict['sellerId'] = data['sellerId']
        goods_dict['imageurl'] = data['商品图片']
        goods_dict['audiourl'] = ''
        # logger.log(31, '************************正在抓取的商品是:%s................' % goods_dict)
        self.goods_data(goods_dict)

    # 抓取商品详情
    def goods_data(self, goods_dict):
        try:
            id = goods_dict['itemId']
            url = 'https://h5api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?jsv=2.5.7&appKey=12574478&sign=fdd00ab68c3566e514d61534041592d7&api=mtop.taobao.detail.getdetail&v=6.0&isSec=0&ecode=0&AntiFlood=true&AntiCreep=true&H5Request=true&ttid=2018%40taobao_h5_9.9.9&type=jsonp&dataType=jsonp&data=%7B%22spm%22%3A%22a230r.1.14.48.6c1d4af9UmOqpx%22%2C%22id%22%3A%22{}%22%2C%22ns%22%3A%221%22%2C%22abbucket%22%3A%226%22%2C%22itemNumId%22%3A%22{}%22%2C%22itemId%22%3A%22{}%22%2C%22exParams%22%3A%22%7B%5C%22spm%5C%22%3A%5C%22a230r.1.14.48.6c1d4af9UmOqpx%5C%22%2C%5C%22id%5C%22%3A%5C%22{}%5C%22%2C%5C%22ns%5C%22%3A%5C%221%5C%22%2C%5C%22abbucket%5C%22%3A%5C%226%5C%22%7D%22%2C%22detail_v%22%3A%228.0.0%22%2C%22utdid%22%3A%221%22%7D'.format(
                id, id, id, id)

            headers = {'User-Agent': random.choice(user_agent_list)}

            try:
                time.sleep(0.3)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.3)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    time.sleep(0.3)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
            json_data = json.loads(
                json.loads(response.text, strict=False)['data']['apiStack'][0]
                ['value'])['item']  # 通过是否有apiStack判断商品是否下架

            commentCount = json.loads(
                response.text)['data']['item']['commentCount']
            favcount = json.loads(response.text)['data']['item']['favcount']
            SellCount = json_data['vagueSellCount']
            goods_dict['人气数'] = favcount
            goods_dict['评价人数'] = commentCount
            goods_dict['月销量'] = SellCount

            if int(self.re_not_number(goods_dict['评价人数'])) == 0:
                # logger.log(31, '-----------该商品没有评论数据--------------')
                pass
            else:
                pages_num = int(math.ceil(float(int(goods_dict['评价人数']) / 20)))
                response.close()
                self.goods_comments(goods_dict, pages_num)
        except:
            print(5555555555555555555555, traceback.format_exc())

    # 解析商品评论
    def goods_comments(self, goods_dict, pages_num):
        try:
            is_break = self.is_break
            # print(goods_dict)
            itemId = goods_dict['itemId']
            sellerId = goods_dict['sellerId']

            headers = {
                'cookie':
                't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB',
                'pragma': 'no-cache',
                'upgrade-insecure-requests': '1',
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer':
                'https://item.taobao.com/item.htm?spm=a230r.1.14.31.26804e4c03W4qw&id=563490255667&ns=1&abbucket=1',
                'User-Agent': random.choice(user_agent_list)
            }

            if int(pages_num) >= 99:
                pages = 99
            else:
                pages = pages_num
            # print('----------------商品评论总页数是： %s -----------------------' % pages_num)
            # 抓取商品评论链接(总共99页,从1开始)
            for i in range(1, int(pages) + 1):
                comment_url = 'https://rate.taobao.com/feedRateList.htm?auctionNumId={}&userNumId={}&currentPageNum={}&pageSize=20&rateType=&orderType=feedbackdate&attribute=&sku=&hasSku=false&folded=0'.format(
                    itemId, sellerId, i)
                # print(comment_url)
                # response = requests.get(url=comment_url, headers=headers, proxies=random.choice(proxies), timeout=10)
                try:
                    time.sleep(0.2)
                    response = requests.get(url=comment_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    try:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                    except:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                comment_data = response.text
                # print('开始抓取评论')
                # print(comment_data)
                comment = re.search(
                    r'{"qnaDisabled":true,"watershed":.*"maxPage":.*}',
                    comment_data)
                # print(comment.group())
                items = json.loads(comment.group())['comments']
                # print(items)
                goods_data = dict()
                # logger.log(31, '--------********开始写入商品数据********--------')
                for item in items:
                    # if item['date'] != None:
                    # time_test = item['date'].split(' ')[0].replace('年', '-').replace('月', '-').replace('日', '') + ' ' + item['date'].split(' ')[1] + ':00'
                    date_data = item['date'].split(' ')[0].replace(
                        '年', '-').replace('月', '-').replace('日', '')
                    try:
                        time_data = item['date'].split(' ')[1] + ':00'
                    except:
                        time_data = ''
                    # print('评论时间', date_data, time_data)
                    try:
                        content = item['content']
                    except:
                        content = ''
                    # 追加评论
                    try:
                        comments_2 = item['appendList'][0]['content']
                    except:
                        comments_2 = ''

                    # 判断评论时间是否在规定的抓取时间内
                    if self.start_time <= date_data.strip():
                        goods_data['platform'] = goods_dict['平台']
                        goods_data['date'] = date_data.strip()
                        goods_data['time'] = time_data.strip()
                        goods_data['keyword'] = goods_dict['关键词']
                        goods_data['name'] = goods_dict['商品名']
                        goods_data['url'] = goods_dict['URL']
                        goods_data['shop_name'] = goods_dict['shop_name']
                        goods_data['user_name'] = item['user']['nick']
                        goods_data['content'] = content + ';' + comments_2
                        goods_data['content_id'] = str(item['rateId'])
                        goods_data['brand'] = goods_dict['品牌']
                        goods_data['price'] = goods_dict['价格']
                        goods_data['sales'] = goods_dict['月销量']
                        goods_data['focus_count'] = goods_dict['人气数']
                        goods_data['comment_num'] = goods_dict['评价人数']
                        goods_data['views'] = ''
                        goods_data['likes'] = item['useful']
                        goods_data['comments_count'] = ''
                        goods_data['author_id'] = ''
                        goods_data['reposts_count'] = ''
                        goods_data['topic_id'] = str(goods_dict['itemId'])
                        # 判断size和type 颜色分类:黑色高帮&nbsp;&nbsp尺码:37
                        test_data = item['auction']['sku']
                        if '码' in test_data:
                            goods_data['type'] = test_data.split(
                                ':')[1].replace('尺码', '').replace(
                                    '&nbsp;&nbsp',
                                    '').replace('鞋码', '').replace(';尺码', '')
                            goods_data['size'] = test_data.split(':')[2]
                        else:
                            goods_data['type'] = ''
                            goods_data['size'] = ''
                        goods_data['imageurl'] = goods_dict['imageurl']
                        goods_data['audiourl'] = goods_dict['audiourl']
                        goods_data['file_code'] = '55'
                        # print(goods_data)
                        # items = json.dumps(dict(goods_data), ensure_ascii=False) + '\n'
                        # with open('./bu_cai/55_{}_taobao_nike_{}_1.json'.format(time.strftime('%Y%m%d'), self.pid), 'ab') as f:
                        #     f.write(items.encode("utf-8"))
                        item = json.dumps(dict(goods_data),
                                          ensure_ascii=False) + '\n'
                        self.hdfsclient.new_write(
                            '/user/cspider_daily/nike_daily/ecommerce/{}/55_{}_TaoBao_nike{}.json'
                            .format(time.strftime('%Y%m%d'),
                                    time.strftime('%Y%m%d'), self.pid),
                            item,
                            encoding='utf-8')
                    if date_data.strip() < self.start_time:
                        is_break = True
                if is_break:
                    break
        except:
            print(66666666666666666666, traceback.format_exc())

    def run(self, lock):
        for num in range(1000000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('taobao_day_url')
            if str(redis_url_num) == '0':
                print('*******Redis消息队列中url为空，程序等待中...进程 {} 等待中...*******'.
                      format(str(os.getpid())))
            item = self.redis_example.brpop('taobao_day_url', timeout=600)[1]
            lock.release()
            item1 = json.loads(item.decode())
            # print(item)
            self.parse_goods_url(item1)

Exemple #23

0

Afficher le fichier

class Spider(object):
    """
    这是一个爬虫模板
    """

    def __init__(self):

        # 时间部分,按小时抓取
        date_time = str(datetime.now() - timedelta(days=1)).split('.')[0]
        start_time_test = time.strftime('%Y-%m-%d 00:00:00')

        end_time = time.strftime('%Y-%m-%d %H:%M:%S')
        a = end_time.split(' ')[1].split(':')[0]

        if a == '00':
            start_time_data = date_time
            hours_name = '22_24'
            wen_jian_jia_date = str(datetime.now() - timedelta(days=1)).split('.')[0].split(' ')[0].replace('-', '')
        else:
            two_hours_ago = int(a) - 2
            if len(str(two_hours_ago)) == 1:
                two_hour_ago = '0' + str(two_hours_ago)
            else:
                two_hour_ago = str(two_hours_ago)
            hours_name = str(two_hour_ago) + '_' + str(a)
            start_time_data = start_time_test
            wen_jian_jia_date = time.strftime('%Y%m%d')
        print('爬取时间段：{}到{}'.format(start_time_data, end_time))
        logging.info('爬取时间段：{}到{}'.format(start_time_data, end_time))

        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = start_time_data
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = end_time
        self.pid = os.getpid()

        self.h2_name = hours_name
        self.date_time = wen_jian_jia_date
        # 标记爬虫工作
        self.is_break = False
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******')
        self.hdfsclient.makedirs('/user/cspider_daily/nike_2h/ecommerce/{}/{}'.format(wen_jian_jia_date, hours_name))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub('<a.*></a>', '', ret1)
            return ret2
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    def parse_url(self, data):
        # 创建一个字典接收数据
        goods_dict = dict()
        goods_dict['平台'] = data['平台']
        goods_dict['关键词'] = data['关键词']
        goods_dict['商品名'] = data['商品名']
        goods_dict['商品图片'] = data['商品图片']
        goods_dict['URL'] = data['URL']
        goods_dict['shop_name'] = data['shop_name']
        goods_dict['goods_id'] = data['goods_id']
        goods_dict['品牌'] = data['品牌']
        goods_dict['月销量'] = data['月销量']
        # logger.log(31, '--------********正在抓取的商品是：%s********--------' % goods_dict)
        self.parse_goods_price(goods_dict)

    # 解析商品价格信息
    def parse_goods_price(self, goods_dict):
        try:
            goods_url = 'https://p.3.cn/prices/mgets?callback=jQuery6465631&source=jshop&skuids=J_{}'.format(goods_dict['goods_id'])
            headers = {
                'content-type': 'application/json;charset=utf-8',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                'accept-encoding': 'gzip, deflate, br',
                'accept-language': 'zh-CN,zh;q=0.9',
                'cache-control': 'no-cache',
                'pragma': 'no-cache',
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            try:
                time.sleep(0.2)
                response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=goods_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
            price_data = re.search(r'\[{".*"}]', response.text)
            goods_dict['价格'] = json.loads(price_data.group())[0]['p']
            # print(goods_dict)
            self.parse_comment_num(goods_dict)
        except:
            print(22222222222222222222222, traceback.format_exc())

    # 抓取商品评论数
    def parse_comment_num(self, goods_dict):
        try:
            productId = goods_dict['goods_id']
            referer_url = goods_dict['URL']
            comment_url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv46&productId={}&score=0&sortType=6&page=0&pageSize=10&isShadowSku=0&rid=0&fold=1'.format(productId)
            headers = {
                'content-type': 'text/html;charset=GBK',
                'authority': 'sclub.jd.com',
                'method': 'GET',
                # 'cookie': 'shshshfpa=32a16413-dbf0-50ea-e5b3-fc0700600c82-1555380265; shshshfpb=xpfj85AdZf7nEIXa%2FfPnKQA%3D%3D; user-key=76e73b75-478f-450a-843d-e6bc97ab6f57; TrackID=1JkU9AvzDgHTRRBHhgHdYahMQFpg9HwywXxp4mumaDTg3wgCwgl-Om3llO2sZlBTQ7ojPYO3q3E7f1jiEFu3roH67lDo9yP-tEUKh5hPh0R0; pinId=0ng4x50EOTPaVd8k7Hb6MA; pin=t15239619067; unick=t152*****067; _tp=WXVubGec3KjciXDtJzPQhA%3D%3D; _pst=t15239619067; mt_xid=V2_52007VwMWVllaW1scThxaBGIDEFFYXlRbGEwdbARlBkJVVVBVRhwZHV4ZYgRGVEEIVgpMVRxbAWYEQlNfUFQPF3kaXQVvHxNXQVhaSx9JEl8NbAAbYl9oUmoWQRhYBGULEFRVWltTGkkcWgZiMxdb; unpl=V2_ZzNtbRBSRkd2CBFULxxcBmIBFV0SUxYRfFsTAHweWAdiChReclRCFX0UR1FnGVQUZwYZXktcQRRFCEdkeB5fA2AFEFlBZxVLK14bADlNDEY1WnwHBAJfFn0PTlJ7GFQFYwIabXJUQyV1CXZUfx1YB24CEVpHUUIQdQpFUX0fXQJiByJtRWdzJXEMQFF6GGwEVwIiHxYLSxV2CkdTNhlYAWMBG1xBUEYTdA1GVngcWgNmBBdZclZzFg%3d%3d; __jdv=122270672|google-search|t_262767352_googlesearch|cpc|kwd-296971091509_0_c44c21f1e4124361a5d58bde66534872|1555655309636; cn=1; _gcl_au=1.1.1967935789.1555659711; __jdc=122270672; areaId=2; __jdu=15553802647041324770645; __jda=122270672.15553802647041324770645.1555380265.1556415731.1556518168.15; ipLoc-djd=2-2830-51800-0; wlfstk_smdl=zn0664dqolt95jf7g1wjtft1hao7l0yl; 3AB9D23F7A4B3C9B=HPX726VSHMRMSR3STZRR7N5NRDNPYWVN43VETWWM5H7ZKTJNQRUDNAN3OFAJHRA4GMFUVMZ4HQPSNV63PBO6R5QDQI; shshshfp=4a332a1f062877da491a157dabe360b2; shshshsID=60254c5e3d13551f63eed3d934c61d6d_8_1556519503209; __jdb=122270672.11.15553802647041324770645|15.1556518168; JSESSIONID=831DC446C63444F227CAFCFFA4085E88.s1',
                'referer': referer_url,
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            }
            try:
                time.sleep(0.2)
                response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
            except:
                try:
                    time.sleep(0.2)
                    response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
                except:
                    time.sleep(0.2)
                    response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
            comment_data = re.search(r'{"productAttr":null.*]}', response.text)
            comment_number = json.loads(comment_data.group())['productCommentSummary']['commentCount']
            goods_dict['comment_num'] = json.loads(comment_data.group())['productCommentSummary']['commentCountStr']
            # print(comment_number)
            if int(comment_number) == 0:
                logger.log(31, '************该商品没有评论数据*********')
            else:
                pages = int(math.ceil(float(int(comment_number) / 10)))
                self.goods_comments(goods_dict, pages)
        except:
            print(33333333333333333333333, traceback.format_exc())

    # 解析商品评论
    def goods_comments(self, goods_dict, pages):
        try:
            is_break = self.is_break
            # print(goods_dict)
            productId = goods_dict['goods_id']
            referer_url = goods_dict['URL']
            headers = {
                'content-type': 'text/html;charset=GBK',
                'authority': 'sclub.jd.com',
                'method': 'GET',
                # 'cookie': 'shshshfpa=32a16413-dbf0-50ea-e5b3-fc0700600c82-1555380265; shshshfpb=xpfj85AdZf7nEIXa%2FfPnKQA%3D%3D; user-key=76e73b75-478f-450a-843d-e6bc97ab6f57; TrackID=1JkU9AvzDgHTRRBHhgHdYahMQFpg9HwywXxp4mumaDTg3wgCwgl-Om3llO2sZlBTQ7ojPYO3q3E7f1jiEFu3roH67lDo9yP-tEUKh5hPh0R0; pinId=0ng4x50EOTPaVd8k7Hb6MA; pin=t15239619067; unick=t152*****067; _tp=WXVubGec3KjciXDtJzPQhA%3D%3D; _pst=t15239619067; mt_xid=V2_52007VwMWVllaW1scThxaBGIDEFFYXlRbGEwdbARlBkJVVVBVRhwZHV4ZYgRGVEEIVgpMVRxbAWYEQlNfUFQPF3kaXQVvHxNXQVhaSx9JEl8NbAAbYl9oUmoWQRhYBGULEFRVWltTGkkcWgZiMxdb; unpl=V2_ZzNtbRBSRkd2CBFULxxcBmIBFV0SUxYRfFsTAHweWAdiChReclRCFX0UR1FnGVQUZwYZXktcQRRFCEdkeB5fA2AFEFlBZxVLK14bADlNDEY1WnwHBAJfFn0PTlJ7GFQFYwIabXJUQyV1CXZUfx1YB24CEVpHUUIQdQpFUX0fXQJiByJtRWdzJXEMQFF6GGwEVwIiHxYLSxV2CkdTNhlYAWMBG1xBUEYTdA1GVngcWgNmBBdZclZzFg%3d%3d; __jdv=122270672|google-search|t_262767352_googlesearch|cpc|kwd-296971091509_0_c44c21f1e4124361a5d58bde66534872|1555655309636; cn=1; _gcl_au=1.1.1967935789.1555659711; __jdc=122270672; areaId=2; __jdu=15553802647041324770645; __jda=122270672.15553802647041324770645.1555380265.1556415731.1556518168.15; ipLoc-djd=2-2830-51800-0; wlfstk_smdl=zn0664dqolt95jf7g1wjtft1hao7l0yl; 3AB9D23F7A4B3C9B=HPX726VSHMRMSR3STZRR7N5NRDNPYWVN43VETWWM5H7ZKTJNQRUDNAN3OFAJHRA4GMFUVMZ4HQPSNV63PBO6R5QDQI; shshshfp=4a332a1f062877da491a157dabe360b2; shshshsID=60254c5e3d13551f63eed3d934c61d6d_8_1556519503209; __jdb=122270672.11.15553802647041324770645|15.1556518168; JSESSIONID=831DC446C63444F227CAFCFFA4085E88.s1',
                'referer': referer_url,
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
            }
            if int(pages) >= 50:
                pages_num = 50
            else:
                pages_num = pages
            # 抓取商品评论链接(总共50页,第一页从0开始)
            for i in range(0, int(pages_num)):
                comment_url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv46&productId={}&score=0&sortType=6&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1'.format(productId, i)
                # print(comment_url)
                try:
                    time.sleep(0.2)
                    response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
                except:
                    try:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
                    except:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url, headers=headers, proxies=proxies, allow_redirects=False, timeout=30)
                # print("正在抓取的页面是: %s" % comment_url)
                comments = response.text
                # print(comments)  # {"productAttr":null,.*]}
                comment = re.search(r'{"productAttr":null.*]}', comments)
                # print(comment.group())
                items = json.loads(comment.group())['comments']
                if int(len(items)) == 0:
                    break
                else:
                    goods_comment_dict = dict()
                    for item in items:
                        date_data = item['creationTime'].split(' ', 1)[0]
                        time_data = item['creationTime'].split(' ', 1)[1]
                        # print(date, time)
                        try:
                            content = self.re_html(item['content'])
                        except:
                            content = ''
                        # 追加评论
                        try:
                            comments_2 = item['afterUserComment']['content']
                        except:
                            comments_2 = ''
                        # 判断评论时间是否在规定的抓取时间内
                        if self.start_time <= item['creationTime']:
                            goods_comment_dict['platform'] = goods_dict['平台']
                            goods_comment_dict['date'] = date_data.strip()
                            goods_comment_dict['time'] = time_data.strip()
                            goods_comment_dict['keyword'] = goods_dict['关键词']
                            goods_comment_dict['name'] = goods_dict['商品名']
                            goods_comment_dict['imageurl'] = goods_dict['商品图片']
                            goods_comment_dict['audiourl'] = ''
                            goods_comment_dict['url'] = goods_dict['URL']
                            goods_comment_dict['shop_name'] = goods_dict['shop_name']
                            goods_comment_dict['user_name'] = item['nickname']
                            goods_comment_dict['author_id'] = ''
                            goods_comment_dict['content'] = content + ';' + comments_2
                            goods_comment_dict['content_id'] = str(item['id'])
                            goods_comment_dict['brand'] = goods_dict['品牌']
                            goods_comment_dict['price'] = goods_dict['价格']
                            goods_comment_dict['sales'] = goods_dict['月销量']
                            goods_comment_dict['focus_count'] = ''
                            goods_comment_dict['comment_num'] = goods_dict['comment_num']
                            goods_comment_dict['views'] = ''
                            goods_comment_dict['likes'] = item['usefulVoteCount']
                            try:
                                goods_comment_dict['comments_count'] = item['replyCount']
                            except:
                                goods_comment_dict['comments_count'] = ''
                            goods_comment_dict['reposts_count'] = ''
                            goods_comment_dict['topic_id'] = str(goods_dict['goods_id'])
                            try:
                                goods_comment_dict['type'] = item['productColor']
                            except:
                                goods_comment_dict['type'] = ''
                            try:
                                goods_comment_dict['size'] = item['productSize']
                            except:
                                goods_comment_dict['size'] = ''
                            goods_comment_dict['file_code'] = '51'
                            # print('********--------开始写入商品数据--------********')
                            # print(goods_comment_dict)
                            item = json.dumps(dict(goods_comment_dict), ensure_ascii=False) + '\n'
                            self.hdfsclient.new_write('/user/cspider_daily/nike_2h/ecommerce/{}/{}/51_{}_jingdong_nike{}.json'.format(self.date_time, self.h2_name, time.strftime('%Y%m%d'), self.pid), item, encoding='utf-8')
                        if date_data.strip() < self.start_time:
                            is_break = True
                    if is_break:
                        break
        except:
            print(444444444444444444444444444, traceback.format_exc())

    def run(self, data):
        self.parse_url(data)

Exemple #24

0

Afficher le fichier

class TouTiaoSpider(object):
    """
    今日头条的爬虫，主要采集和汽车有关的新闻
    """
    def __init__(self):

        # 'cookie':'uuid="w:d0214807f672416fb7d3ee0431aa13a3"; UM_distinctid=1674ef3a9800-0bce565d4c8dc4-414f0120-15f900-1674ef3a981290; _ga=GA1.2.823209007.1543222670; _gid=GA1.2.547615301.1543222670; CNZZDATA1259612802=603836554-1543213069-%7C1543218469; __tasessionId=tpisw88851543281460530; csrftoken=d9a6dad7de6c1fbbf3ddd1a3de811481; tt_webid=6628070185327625741',
        # ':authority':'www.toutiao.com',
        # ':method':'GET',
        # ':path':'/api/pc/feed/?category=news_car&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A1E56B7F8CD9B35&cp=5BFC39BB43B5DE1&_signature=pMmtcAAA.0TvpJ9rFvhWIKTJrW',
        # ':scheme':'https',
        # 'cache-control': 'max-age=0',
        # 'cookie': 'tt_webid=6628733243796178436; tt_webid=6628733243796178436; csrftoken=3a6f2dc0f315bd1fe957319a75bba4ed; uuid="w:2203d39caf3249c0bcda19ee5839b850"; UM_distinctid=1675827673a27a-0dd556679b3f63-3a3a5d0c-15f900-1675827673b22c; __tasessionId=qb2c0x9mb1543386267822; CNZZDATA1259612802=992935523-1543369669-%7C1543385869',
        # 'referer': 'https://www.toutiao.com/ch/news_car/',
        self.headers_one = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
        }

        self.start_url = 'https://www.toutiao.com/api/pc/feed/'
        # 评论接口模板
        self.commnet_port_url = ''

        date = datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]
        yesterday = datetime.now() - timedelta(days=1)  # 昨天时间
        yesterday = str(yesterday).split(' ')[0]
        print('爬取时间段：{}到{}'.format(news_start_time, yesterday))

        logging.info('爬取时间段：{}到{}'.format(news_start_time, yesterday))
        # 定义开始时间 y-m-d  离现在时间远
        self.start_time = news_start_time
        # self.start_time = '2019-09-09'
        # 定义结束时间 y-m-d  离现在时间近
        self.end_time = yesterday
        # self.end_time = '2019-09-16'
        print('爬取时间段：{}到{}'.format(self.start_time, self.end_time))

        # 标记爬虫工作
        self.is_work = True
        # 评论页数
        self.comment_page_num = 1
        # 去重列表
        self.set_list = []
        # 代理ip
        self.proxies = [
            '112.245.235.249:4243',
            # '59.53.47.4:4249'
        ]
        # 搜集问答类网页的列表
        self.questions_list = []

        # 读取url列表
        # with open('./../toutiao/new_url_file.json', 'r') as f:
        #     self.url_list = f.readlines()
        self.file_name_time = self.get_file_name_time()
        try:
            os.mkdir('./../toutiao/json_file/{}'.format(self.file_name_time.split(' ')[0]))
        except:
            pass
        self.file_path = '/user/cspider_daily/nike_daily/article'
        self.comment_apth = '/user/cspider_daily/nike_daily/articlecomments'
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000', user='******')
        hour = str(datetime.now()).split(' ')[-1].split(':')[0]
        if str(hour) != '00':
            two_hour_ago = int(hour) - 2
            if len(str(two_hour_ago)) == 1:
                two_hour_ago = '0' + str(two_hour_ago)
            self.hour_name = str(two_hour_ago) + '_' + str(hour)
        else:
            self.hour_name = '22_24'
        self.hdfsclient.makedirs('{}/{}'.format(self.file_path, self.file_name_time.split(' ')[0].replace('-', '')))  # 创建每日文件夹
        self.hdfsclient.makedirs('{}/{}'.format(self.comment_apth, self.file_name_time.split(' ')[0].replace('-', '')))  # 创建每日文件夹
        self.time_time = str(time.time()).split('.')[0]
        # 代理服务器
        proxyHost = "http-dyn.abuyun.com"
        proxyPort = "9020"

        # 代理隧道验证信息
        # proxyUser = "******"
        # proxyUser = "******"
        proxyUser = "******"
        # proxyPass = "******"
        # proxyPass = "******"
        proxyPass = "******"

        proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
            "user": proxyUser,
            "pass": proxyPass,
        }

        self.proxies = {
            # "http": proxyMeta,
            "https": proxyMeta
        }

    def get_news_page(self, url):
        user_agent = [
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6',
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6',
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1',
        ]
        headers_one = {
            'accept': 'textml,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cookie': '__tasessionId=0k1ayrc511577344635809',
            'sec-fetch-user': '******',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': '{}'.format(random.choice(user_agent))
        }

        item = {}
        response = requests.get(url, headers=headers_one, proxies=self.proxies, timeout=60)  #, proxies={'https': ip}
        stutus_code = response.status_code
        if str(stutus_code) == '200':
            data_all = response.content.decode()
            try:
                data = re.search(r"articleInfo: {([\s\S]*time: '\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2})", data_all).group(1)
                data = '{' + data + "'}}"
                data = re.sub('\n', '', data)
                data = unescape(data)
                data = data.replace('&quot;', '"').replace('&#x3D;', '=')
                content = re.search('content: ([\s\S]*)groupId', data).group(1).strip()[1:][:-2]
                content = etree.HTML(content)
                text = content.xpath('.//p//text()')
                text_con = ''.join(text)
                # print(text_con)
                text_con = re.sub(r'class=.*?class', '', text_con)
                # print(text_con)
                text_con = re.sub(r'\\u003C.*?\\u003E', '', text_con).replace('.slice(6, -6', '')
                # print(text_con)
                date, create_time = re.search('(\d{4}-\d{1,2}-\d{1,2}) (\d{1,2}:\d{1,2}:\d{1,2})', data).group(1, 2)
                id_num = re.search("groupId: '(\d{1,50}).*itemId", data).group(1)  # 新闻的标识id
                source = re.search("source: '(.*)time", data).group(1).strip()[:-2]  # 来源
                comment_count = re.search("commentCount: '(\d{0,10})[\s\S]*ban_comment", data_all).group(1)
                title = re.search("title: '([\s\S])*content", data).group(0).split("'")[1]
                item['platform'] = '今日头条'
                item['date'] = date
                item['time'] = create_time
                item['title'] = title.replace('"', '')
                item['article_source'] = ''  # 文章来源
                item['article_author'] = source  # 文章作者
                item['content'] = text_con
                item['comments_count'] = comment_count
                item['clicks'] = ''
                item['views'] = ''
                item['likes'] = ''
                item['keyword'] = ''
                item['article_url'] = url  # 文章详情URL
                item['dislikes'] = ''  # 踩人数
                item['series_url'] = ''  # 车系首页
                item['list_url'] = 'https://www.toutiao.com/ch/news_car/'  # 文章列表URL
                item['article_type_1st'] = ''  # 文章类型
                item['article_type_2nd'] = ''  # 文章类型
                item['insert_time'] = str(datetime.now()).split('.')[0]  # 初始爬取时间
                item['update_time'] = str(datetime.now()).split('.')[0]  # 最后爬取时间
                content_id = url.split('/')[-1]
                item['content_id'] = str(content_id)  # 文章id
                item['topic_id'] = str(content_id)  # 主贴id
                item['author_id'] = ''  # 作者id
                item['file_code'] = '24'
                item['reposts_count'] = ''
                print(item)
                self.write_news_jsonfile(item)
                self.get_comment_info(url, title, date, create_time)

            except AttributeError:
                print('问答类网页', url)
                self.questions_list.append(url)
                print(self.questions_list)

        else:
            print('网页404错误', url)

    # 获取评论
    # http://lf.snssdk.com/article/v1/tab_comments/?count=50&item_id=6629460454148145678&group_id=6629460454148145678&offset=0
    def get_comment_info(self, source_url, source_title, source_date, source_time, page_id="0"):
        user_agent = [
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6',
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/536.6',
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.1',
        ]
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            # 'Connection': 'keep-alive',
            'Host': 'lf.snssdk.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': '{}'.format(random.choice(user_agent))
        }
        headers_two = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            # 'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'Cache-Control': 'no-cache',
            'Cookie': 'csrftoken=4be00616a67933bdef5696b162e70937; tt_webid=6762029080495375880',
            # 'Connection': 'keep-alive',
            # 'Host': 'lf.snssdk.com',
            # 'Upgrade-Insecure-Requests': '1',
            'Connection': 'keep-alive',
            'User-Agent': 'PostmanRuntime/7.20.1'
        }

        url_id = source_url.split('/')[-1][1:]
        news_comment_url = 'https://www.toutiao.com/article/v2/tab_comments/?aid=24&app_name=toutiao-web&group_id={}&item_id={}&offset={}&count=50'.format(url_id, url_id, page_id)
        comment_url = 'http://lf.snssdk.com/article/v1/tab_comments/?count=50&item_id={}&group_id={}&offset={}'.format(url_id, url_id, page_id)
        print('评论爬取中......')
        print(comment_url)
        # ip = random.choice(self.proxies)
        try:
            response = requests.get(news_comment_url, headers=headers_two, proxies=self.proxies)  # , proxies={'https': ip}
            datas = json.loads(response.content)
            print(datas)
            data_list = datas['data']
            if data_list:
                total_item = ''
                for comment in data_list:
                    print(1111111111111, comment)
                    item = dict()
                    item['platform'] = '今日头条'
                    item['source_date'] = source_date
                    item['source_time'] = source_time

                    content = comment['comment']['text']
                    date_all = comment['comment']['create_time']
                    # #转换成localtime
                    time_local = time.localtime(float(str(date_all)))
                    # 转换成新的时间格式(2016-05-05 20:28:54)
                    dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
                    date = dt.split(' ')[0]
                    comment_time = dt.split(' ')[1]
                    item['date'] = date
                    item['time'] = comment_time
                    item['title'] = source_title
                    author = comment['comment']['user_name']
                    item['author'] = author
                    item['content'] = content
                    item['source_url'] = source_url
                    item['comment_url'] = source_url
                    item['floor'] = ''
                    item['views'] = ''
                    item['comments_count'] = comment['comment']['reply_count']
                    item['keyword'] = ''
                    item['likes'] = comment['comment']['digg_count']
                    item['author_id'] = comment['comment']['user_id']  # 用户id
                    item['dislikes'] = ''  # 踩人数
                    item['insert_time'] = str(datetime.now()).split('.')[0]  # 初始爬取时间
                    item['update_time'] = str(datetime.now()).split('.')[0]  # 最后爬取时间
                    item['content_id'] = comment['comment']['id']
                    content_id = source_url.split('/')[-1]
                    item['topic_id'] = str(content_id)  # 主贴id
                    item['file_code'] = '38'
                    item['reposts_count'] = ''

                    item = json.dumps(dict(item), ensure_ascii=False) + '\n'

                    total_item += item
                print('写入评论中......')
                self.write_comment_jsonfile(total_item)
                if len(data_list) == 50:
                    page_id = int(page_id) + 50
                    print('爬取评论翻页信息.....')
                    time.sleep(2)
                    self.get_comment_info(source_url, source_title, source_date, source_time, page_id=str(page_id))
        except requests.exceptions.ConnectionError:
            print('获取评论时发生链接错误,程序暂停100s后爬取')
            time.sleep(100)
            self.get_comment_info(source_url, source_title, source_date, source_time, page_id=str(page_id))
            logging.error('获取评论时发生链接错误,程序暂停100s后爬取，get_comment error:{}'.format(traceback.format_exc()))


    # 写入json文件
    def write_news_jsonfile(self, item):
        item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.hdfsclient.new_write('{}/{}/24_{}_{}_toutiao_news.json'.format(self.file_path, str(datetime.now()).split(' ')[0].replace('-', ''), str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item,encoding='utf-8')

    def write_comment_jsonfile(self, item):
        # item = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.hdfsclient.new_write('{}/{}/38_{}_{}_toutiao_comment.json'.format(self.comment_apth, str(datetime.now()).split(' ')[0].replace('-', ''), str(datetime.now()).split(' ')[0].replace('-', '_'), self.time_time), item,encoding='utf-8')


    def get_file_name_time(self):
        a = str(datetime.now())
        hour = a.split(' ')[-1].split(':')[0]
        num = int(hour) / 3
        num = int(num) * 3
        if num == 0:
            num = 24
            a = str(datetime.now() - timedelta(days=1))  # 昨天时间
        num = a.split(' ')[0] + ' ' + str(num)
        return num

    def run(self):
        set_list = []
        logger.info('开始读取url文件,进行新闻爬取')
        for url in open('./../toutiao/json_file/{}/{}_comment_url.json'.format(self.file_name_time.split(' ')[0], self.file_name_time.split(' ')[0])):
        # for url in open('./../toutiao/json_file/{}/{}_comment_url.json'.format('2020-03-23', '2020-03-23')):
            if url in set_list:
                continue
            else:
                set_list.append(url)
            if url:
                logger.info('打开new_url_file.json文件，读取要爬取的url')
                url = url.strip()
                print('一个爬虫正在爬取网址{}'.format(url))
                logger.info('一个爬虫正在爬取网址{}'.format(url))
                try:
                    self.get_news_page(url)
                except:
                    print(traceback.format_exc())
                    try:
                        self.get_news_page(url)
                    except:
                        print('错误')
            print('一个网址爬虫结束.....')
        logger.info('爬取完毕......')

Exemple #25

0

Afficher le fichier

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self, redis_example):
        # 时间部分
        # 爬虫开始抓取的日期
        date = datetime.now() - timedelta(days=1)
        news_start_time = str(date).split(' ')[0]
        # 爬虫结束的抓取日期
        current_time = datetime.now()  # 当前日期
        current_day = str(current_time).split(' ')[0]
        print('爬取时间段：{}到{}'.format(news_start_time, current_day))
        logging.info('爬取时间段：{}到{}'.format(news_start_time, current_day))
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = news_start_time
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = current_day

        # 标记爬虫工作
        self.is_break = False
        self.redis_example = redis_example
        self.pid = os.getpid()
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.209:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_daily/ecommerce/{}'.format(
                time.strftime('%Y%m%d')))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)
            ret3 = re.sub(r'\u3000', '', ret2)
            ret4 = re.sub(r'品牌:', '', ret3)
            ret5 = re.sub(r'\xa0', '', ret4)
            ret6 = re.sub(r'&rarr;_&rarr;', '', ret5)
            ret7 = re.sub(r'&hellip;&hellip;', '', ret6)
            ret8 = re.sub(r'":', '', ret7)
            return ret8
        except:
            pass

    # 过滤月销量里面的非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 过滤url里面的#detail
    def re_detail(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'#detail', '', message)
            return ret1
        except:
            pass

    # 解析请求得到的商品信息
    def parse_goods_url(self, data):
        goods_dict = dict()
        goods_dict['平台'] = '天猫'
        goods_dict['URL'] = data['URL']
        goods_dict['商品名'] = data['商品名']
        goods_dict['价格'] = data['价格']
        goods_dict['shop_name'] = data['shop_name']
        goods_dict['关键词'] = data['关键词']
        goods_dict['品牌'] = data['品牌']
        goods_dict['itemId'] = data['itemId']
        goods_dict['sellerId'] = data['sellerId']
        goods_dict['imageurl'] = data['商品图片']
        goods_dict['audiourl'] = ''
        # logger.log(31, '***************************正在抓取的商品是:%s.................' % goods_dict)
        self.parse_goods_details(goods_dict)

    # 解析商品详情信息
    def parse_goods_details(self, goods_dict):
        try:
            url = 'https://h5api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?jsv=2.4.8&appKey=12574478&sign=88ccc8f2a1382304046c12960f2711c9&api=mtop.taobao.detail.getdetail&v=6.0&dataType=jsonp&ttid=2017%40taobao_h5_6.6.0&AntiCreep=true&type=jsonp&data=%7B%22itemNumId%22%3A%22{}%22%7D'.format(
                goods_dict['itemId'])
            headers = {'User-Agent': random.choice(user_agent_list)}
            try:
                time.sleep(0.3)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        timeout=30)
            except:
                try:
                    time.sleep(0.3)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            timeout=30)
                except:
                    time.sleep(0.3)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            timeout=30)
            if 'apiStack' not in response.text:
                # print('******该商品已下架******')
                pass
            else:
                json_data = json.loads(
                    json.loads(
                        response.text,
                        strict=False)['data']['apiStack'][0]['value'])['item']

                commentCount = json.loads(
                    response.text)['data']['item']['commentCount']  # 评价数
                favcount = json.loads(
                    response.text)['data']['item']['favcount']  # 收藏数
                sellCount = json_data['sellCount']  # 月销量
                spuId = json_data['spuId']
                goods_dict['spuid'] = spuId
                goods_dict['月销量'] = sellCount
                goods_dict['人气数'] = favcount
                goods_dict['评价人数'] = commentCount
                if int(self.re_not_number(goods_dict['评价人数'])) == 0:
                    # logger.log(31, '---------该商品没有评价数据-------')
                    response.close()
                    pass
                else:
                    pages_num = int(
                        math.ceil(float(int(goods_dict['评价人数']) / 20)))
                    response.close()
                    self.goods_comments(goods_dict, pages_num)
        except:
            print(5555555555555555555555, traceback.format_exc())

    # 解析商品评论
    def goods_comments(self, goods_dict, pages_num):
        try:
            is_break = self.is_break
            # print(goods_dict)
            itemId = goods_dict['itemId']
            sellerId = goods_dict['sellerId']
            spuId = goods_dict['spuid']

            headers = {
                'cookie':
                't=b5285c592f5c5d2760bbc606138d8cf0; UM_distinctid=16a1fadfa62540-0819221c6d91c7-47e1137-232800-16a1fadfa636f7; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=vn%2BuDgMTTmiEXbq1S%2Byw3qmgOc2O1Fw5PzezL1S7UyTFAqMoepiGRIdTY9msHIOrzffqeq9FLJt5WAGM7ENyvA%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; mt=ci=0_0; cna=At46FUkyQjACAWVWN1V9/Wdy; v=0; cookie2=1ae734c4e8a03d4591a230e3913026b6; _tb_token_=f46b387e3f77e; uc1=cookie14=UoTaHP3MzJiakA%3D%3D; x5sec=7b22726174656d616e616765723b32223a223438663436333231316138653834636332653635613664633664666437363037434b654168656f4645495062705a43337566765a6d51453d227d; _m_h5_tk=b2a5536512217126c542d930817469b0_1564567924778; _m_h5_tk_enc=9e3f2f1eca52726de7c74dd14a9869fa; l=cBaOcPD7qg21z1C9BOCwlurza77ORIRAguPzaNbMi_5dk6Ls857OkSG2UFp6cjWd9pTB41hTyPJ9-etkmI1E1Cmj2s7V.; isg=BC4udOYY_1h3OAv3hZZEU1m4f4Qwh_Mi8XPFVFj3ujHsO8-VwL_0ODk59-dy4-pB',
                'pragma': 'no-cache',
                'upgrade-insecure-requests': '1',
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer':
                'https://detail.tmall.com/item.htm?spm=a230r.1.14.16.26804e4ck29eWS&id=597034992998&ns=1&abbucket=1',
                'User-Agent': random.choice(user_agent_list)
            }
            if int(pages_num) >= 99:
                pages = 99
            else:
                pages = pages_num
            # logger.log(31, '-------------评论总页数是：%s --------------' % pages)
            # 抓取商品评论链接(总共99页,从1开始)
            for i in range(1, int(pages) + 1):
                comment_url = 'https://rate.tmall.com/list_detail_rate.htm?itemId={}&spuId={}&sellerId={}&order=1&currentPage={}'.format(
                    itemId, spuId, sellerId, i)
                # print(comment_url)
                # response = requests.get(url=comment_url, headers=headers, proxies=random.choice(proxies), timeout=10)
                try:
                    time.sleep(0.2)
                    response = requests.get(url=comment_url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    try:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                    except:
                        time.sleep(0.2)
                        response = requests.get(url=comment_url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                comment_data = response.text
                # logger.log(31, '开始抓取评论')
                # print(comment_data)
                comment = re.search(
                    r'{"rateDetail":{"rateCount":{"total":.*"tags":\[]}}',
                    comment_data)
                # print(comment.group())
                items = json.loads(comment.group())['rateDetail']['rateList']
                # print(items)
                goods_data = dict()
                logger.log(31, '--------********开始写入商品数据********--------')
                for item in items:
                    date_data = item['rateDate'].split(' ', 1)[0]
                    time_data = item['rateDate'].split(' ', 1)[1]
                    # print('评论时间', date_data, time_data)
                    try:
                        content = item['appendComment']['content']
                    except:
                        content = ''

                    # 判断评论时间是否在规定的抓取时间内
                    if self.start_time <= date_data.strip():
                        goods_data['platform'] = goods_dict['平台']
                        goods_data['date'] = date_data.strip()
                        goods_data['time'] = time_data.strip()
                        goods_data['keyword'] = goods_dict['关键词']
                        goods_data['name'] = goods_dict['商品名']
                        goods_data['url'] = goods_dict['URL']
                        goods_data['shop_name'] = goods_dict['shop_name']
                        goods_data['user_name'] = item['displayUserNick']
                        goods_data['content'] = self.re_html(
                            item['rateContent']) + ';' + str(
                                self.re_html(content))
                        goods_data['content_id'] = str(item['id'])
                        goods_data['brand'] = goods_dict['品牌']
                        goods_data['price'] = goods_dict['价格']
                        goods_data['sales'] = goods_dict['月销量']
                        goods_data['focus_count'] = goods_dict['人气数']
                        goods_data['comment_num'] = goods_dict['评价人数']
                        goods_data['views'] = ''
                        goods_data['likes'] = ''
                        goods_data['comments_count'] = ''
                        goods_data['author_id'] = ''
                        goods_data['reposts_count'] = ''
                        goods_data['topic_id'] = str(goods_dict['itemId'])
                        # 判断size和type
                        test_data = item['auctionSku']
                        if '分类' in test_data:
                            goods_data['type'] = test_data.split(
                                ':')[1].replace(';尺码:',
                                                '').replace(';鞋码', '').replace(
                                                    ';尺码', '')
                            try:
                                goods_data['size'] = test_data.split(
                                    ':')[2].split(';')[0]
                            except:
                                try:
                                    goods_data['size'] = test_data.split(
                                        ':')[2]
                                except:
                                    goods_data['size'] = ''
                        else:
                            goods_data['type'] = ''
                            goods_data['size'] = ''
                        goods_data['imageurl'] = goods_dict['imageurl']
                        goods_data['audiourl'] = goods_dict['audiourl']
                        goods_data['file_code'] = '50'
                        # print(goods_data)
                        # item = json.dumps(dict(goods_data), ensure_ascii=False) + '\n'
                        # with open('./json_data/{}_tmall_goods_data_{}.json'.format(time.strftime('%Y%m%d'), self.pid), 'ab') as f:
                        #     f.write(item.encode("utf-8"))
                        item = json.dumps(dict(goods_data),
                                          ensure_ascii=False) + '\n'
                        self.hdfsclient.new_write(
                            '/user/cspider_daily/nike_daily/ecommerce/{}/50_{}_{}_Tmall_nike{}.json'
                            .format(time.strftime('%Y%m%d'),
                                    time.strftime('%Y%m%d'), self.time_data,
                                    self.pid),
                            item,
                            encoding='utf-8')
                    if date_data.strip() < self.start_time:
                        is_break = True
                if is_break:
                    break
        except:
            print(7777777777777777777, traceback.format_exc())

    def run(self, lock):
        for num in range(1000000):
            lock.acquire()
            redis_url_num = self.redis_example.llen('Tmall_day_url')
            if str(redis_url_num) == '0':
                print('*******Redis消息队列中url为空，程序等待中...进程 {} 等待中....******'.
                      format(str(os.getpid())))
            item = self.redis_example.brpop('Tmall_day_url', timeout=600)[1]
            lock.release()
            item1 = json.loads(item.decode())
            # print('正在抓取商品：', item1)
            self.parse_goods_url(item1)

Exemple #26

0

Afficher le fichier

class Spider(object):
    """
    这是一个爬虫模板
    """
    def __init__(self):
        # 时间部分,按小时抓取
        date_time = str(datetime.now() - timedelta(days=1)).split('.')[0]
        start_time_test = time.strftime('%Y-%m-%d 00:00:00')

        end_time = time.strftime('%Y-%m-%d %H:%M:%S')
        a = end_time.split(' ')[1].split(':')[0]

        if a == '00':
            start_time_data = date_time
            hours_name = '22_24'
            wen_jian_jia_date = str(datetime.now() - timedelta(
                days=1)).split('.')[0].split(' ')[0].replace('-', '')
        else:
            two_hours_ago = int(a) - 2
            if len(str(two_hours_ago)) == 1:
                two_hour_ago = '0' + str(two_hours_ago)
            else:
                two_hour_ago = str(two_hours_ago)
            hours_name = str(two_hour_ago) + '_' + str(a)
            start_time_data = start_time_test
            wen_jian_jia_date = time.strftime('%Y%m%d')
        print('爬取时间段：{}到{}'.format(start_time_data, end_time))
        logging.info('爬取时间段：{}到{}'.format(start_time_data, end_time))
        # 定义开始时间 y-m-d  离现在时间远  news_start_time
        self.start_time = start_time_data
        # 定义结束时间 y-m-d  离现在时间近  yesterday
        self.end_time = end_time
        # 标记爬虫工作
        self.is_break = False
        self.h2_name = hours_name
        self.date_time = wen_jian_jia_date
        # 链接hdfs
        self.hdfsclient = HdfsClient(url='http://192.168.1.205:14000',
                                     user='******')
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_2h/video/{}/{}'.format(
                wen_jian_jia_date, hours_name))  # 创建每日文件夹
        self.hdfsclient.makedirs(
            '/user/cspider_daily/nike_2h/videocomments/{}/{}'.format(
                wen_jian_jia_date, hours_name))  # 创建每日文件夹
        self.time_data = str(time.time()).split('.')[0]

    # 替换所有的HTML标签
    def re_html(self, data):
        # 替换抓取数据中的html标签 .replace('[', '').replace(']', '').replace(',', '').replace("'", '')
        try:
            message = str(data)
            re_h = re.compile('</?\w+[^>]*>')  # html标签
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\[', '', ret1)
            ret3 = re.sub(r'\]', '', ret2)
            ret4 = re.sub(r',', '', ret3)
            ret5 = re.sub(r"'", '', ret4)
            ret6 = re.sub(r'                    ', '', ret5)
            return ret6
        except:
            pass

    # 过滤非数字
    def re_not_number(self, data):
        try:
            message = str(data)
            ret1 = re.sub(r'\D', '', message)
            return ret1
        except:
            pass

    # 匹配具体时间
    def clean_date(self, x):
        now = datetime.now()
        if str(x).find('昨天') != -1:
            x = datetime.strftime(now + timedelta(days=-1),
                                  '%Y-%m-%d %H:%M:%S')
        elif str(x).find('前天') != -1:
            x = datetime.strftime(now + timedelta(days=-2),
                                  '%Y-%m-%d %H:%M:%S')
        elif str(x).find('天前') != -1:
            x = datetime.strftime(
                now + timedelta(days=-int(str(x).replace('天前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('小时前') != -1:
            x = datetime.strftime(
                now + timedelta(hours=-int(str(x).replace('小时前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('分钟前') != -1:
            x = datetime.strftime(
                now + timedelta(minutes=-int(str(x).replace('分钟前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('今天') != -1:
            x = str(x).replace('今天', now.strftime('%Y-%m-%d') + ' ')
        elif str(x).find('刚刚') != -1:
            x = now.strftime('%Y-%m-%d %H:%M:%S')
        elif str(x).find('秒前') != -1:
            x = now.strftime('%Y-%m-%d %H:%M:%S')
        elif str(x).find('月前') != -1:
            x = datetime.strftime(
                now + timedelta(weeks=-4 * int(str(x).replace('月前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('周前') != -1:
            x = datetime.strftime(
                now + timedelta(weeks=-int(str(x).replace('周前', ''))),
                '%Y-%m-%d %H:%M:%S')
        elif str(x).find('[') != -1:
            x = x.replace('[', '').replace(']', '')
        elif str(x).find('月') != -1:
            x = x.replace('月', '-').replace('日', '')
        return x

    # 根据关键词搜索请求得到商品第一页信息
    def parse_page1(self, key_word):
        try:
            # 根据关键词,例如：洗发水,抓取商品信息
            url = 'https://www.meipai.com/search/all?q={}'.format(key_word)
            # print(url)
            headers = {
                'Content-Type':
                'text/html; charset=utf-8',
                'Cookie':
                'MUSID=ta4877m5ongth47s2n7kt0km13; virtual_device_id=d8afe1d9634ad1f6591e3486d4312976; pvid=imZ0YWzzY7TAFPWAQnp71Vl1bDpOXY91; sid=ta4877m5ongth47s2n7kt0km13; UM_distinctid=16c84237ad71af-096aa6932eb12f-37c143e-1fa400-16c84237ad870f; CNZZDATA1256786412=2077818170-1565584700-https%253A%252F%252Fwww.baidu.com%252F%7C1565584700; searchStr=AJ%7C361%E5%BA%A6%7C%E9%98%BF%E8%BF%AA%E8%BE%BE%E6%96%AF%7C%E8%80%90%E5%85%8B%7C',
                'Host':
                'www.meipai.com',
                'Pragma':
                'no-cache',
                # 'Referer': '{}'.format(url),
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            try:
                # time.sleep(0.1)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    # time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    # time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            allow_redirects=False,
                                            timeout=30)
            # print(response.text)
            # 将响应转换成一个element对象
            html = etree.HTML(response.text)
            # 获取视频列表信息
            video_data_list = html.xpath('//ul[@id="mediasList"]/li')
            video_dict = dict()
            # logger.log(31, '***********关键词%s的*****第1页数据***********' % (key_word))
            for node in video_data_list:
                video_dict['platform'] = '美拍'
                video_dict['title'] = node.xpath('./img/@alt')[0].replace(
                    '\n', ' ')
                video_dict['keyword'] = key_word
                video_dict['url'] = 'https://www.meipai.com' + node.xpath(
                    './div[1]/a/@href')[0]
                video_dict['imageurl'] = 'https:' + node.xpath('./img/@src')[0]
                video_dict['audiourl'] = video_dict['url']
                video_dict['topic_id'] = str(
                    self.re_not_number(node.xpath('./div[1]/a/@href')[0]))
                video_dict['source_author'] = node.xpath(
                    './div[2]/p/a/@title')[0]
                video_dict['上传者url'] = 'https://www.meipai.com' + node.xpath(
                    './div[2]/p/a/@href')[0]
                video_dict['author_id'] = str(
                    self.re_not_number(node.xpath('./div[2]/p/a/@href')[0]))
                video_dict['categroy'] = ''
                # print(video_dict)
                response.close()
                self.parse_video_data(video_dict, key_word)
        except:
            print(111111111111111111111111, traceback.format_exc())

    # 根据关键词搜索请求得到商品后1页信息
    def parse_page2(self, key_word):
        try:
            for i in range(2, 3):
                # 根据关键词,例如：洗发水,抓取商品信息
                url = 'https://www.meipai.com/search/mv?q={}&page={}&fromAll=1'.format(
                    key_word, i)
                # logger.log(31, '***********关键词%s的******第%s页数据*********** +  %s' % (key_word, i, url))
                # print(url)
                headers = {
                    'Content-Type':
                    'text/html; charset=utf-8',
                    'Cookie':
                    'MUSID=ta4877m5ongth47s2n7kt0km13; virtual_device_id=d8afe1d9634ad1f6591e3486d4312976; pvid=imZ0YWzzY7TAFPWAQnp71Vl1bDpOXY91; sid=ta4877m5ongth47s2n7kt0km13; UM_distinctid=16c84237ad71af-096aa6932eb12f-37c143e-1fa400-16c84237ad870f; CNZZDATA1256786412=2077818170-1565584700-https%253A%252F%252Fwww.baidu.com%252F%7C1565584700; searchStr=AJ%7C361%E5%BA%A6%7C%E9%98%BF%E8%BF%AA%E8%BE%BE%E6%96%AF%7C%E8%80%90%E5%85%8B%7C',
                    'Host':
                    'www.meipai.com',
                    'Pragma':
                    'no-cache',
                    # 'Referer': url,
                    'Upgrade-Insecure-Requests':
                    '1',
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
                }
                try:
                    # time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    try:
                        # time.sleep(0.1)
                        response = requests.get(url=url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                    except:
                        # time.sleep(0.1)
                        response = requests.get(url=url,
                                                headers=headers,
                                                allow_redirects=False,
                                                timeout=30)
                # 将响应转换成一个element对象
                html = etree.HTML(response.text)
                # 判断是否有视频数据
                test_data = html.xpath(
                    '//div[@class="search-result-null break"]/p//text()')
                if test_data != []:
                    break
                else:
                    # 获取视频列表信息
                    video_data_list = html.xpath('//ul[@id="mediasList"]/li')
                    video_dict = dict()

                    for node in video_data_list:
                        video_dict['platform'] = '美拍'
                        video_dict['title'] = node.xpath(
                            './img/@alt')[0].replace('\n', ' ').replace(
                                '\r', ' ').replace('\u200b', '').replace(
                                    '\u200e', '').replace('\u200c', '')
                        video_dict['keyword'] = key_word
                        video_dict[
                            'url'] = 'https://www.meipai.com' + node.xpath(
                                './div[1]/a/@href')[0]
                        video_dict['imageurl'] = 'https:' + node.xpath(
                            './img/@src')[0]
                        video_dict['audiourl'] = video_dict['url']
                        video_dict['topic_id'] = str(
                            self.re_not_number(
                                node.xpath('./div[1]/a/@href')[0]))
                        video_dict['source_author'] = node.xpath(
                            './div[2]/p/a/@title')[0]
                        video_dict[
                            '上传者url'] = 'https://www.meipai.com' + node.xpath(
                                './div[2]/p/a/@href')[0]
                        video_dict['author_id'] = str(
                            self.re_not_number(
                                node.xpath('./div[2]/p/a/@href')[0]))
                        video_dict['categroy'] = ''
                        # print(video_dict)
                        response.close()
                        self.parse_video_data(video_dict, key_word)
        except:
            print(222222222222222222222222, traceback.format_exc())

    # 进入视频页面，抓取数据信息
    def parse_video_data(self, video_dict, key_word):
        try:
            url = video_dict['url']
            headers = {
                'Content-Type':
                'text/html; charset=utf-8',
                'Cookie':
                'MUSID=ta4877m5ongth47s2n7kt0km13; virtual_device_id=d8afe1d9634ad1f6591e3486d4312976; pvid=imZ0YWzzY7TAFPWAQnp71Vl1bDpOXY91; sid=ta4877m5ongth47s2n7kt0km13; UM_distinctid=16c84237ad71af-096aa6932eb12f-37c143e-1fa400-16c84237ad870f; CNZZDATA1256786412=2077818170-1565584700-https%253A%252F%252Fwww.baidu.com%252F%7C1565584700; searchStr=AJ%7C%E9%98%BF%E8%BF%AA%E8%BE%BE%E6%96%AF%7C%E6%9D%8E%E5%AE%81%7C%E8%80%90%E5%85%8B%7C361%E5%BA%A6%7C',
                'Host':
                'www.meipai.com',
                'Pragma':
                'no-cache',
                # 'Referer': 'https://www.meipai.com/search/all?q={}'.format(key_word),
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            # logger.log(31,  '视频链接是: %s' % url)
            try:
                # time.sleep(0.1)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    # time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    # time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            allow_redirects=False,
                                            timeout=30)
            # 将响应转换成一个element对象
            html = etree.HTML(response.text)
            # 发布日期时间数据
            date_time_data = html.xpath(
                '//div[@class="detail-time pa"]/strong/text()')[0]
            data = self.clean_date(date_time_data)
            # print(data)
            date_data = data.split(' ')[0]
            time_data = data.split(' ')[1]
            if len(date_data.split('-')) == 3:
                video_dict['date'] = date_data.strip()
            else:
                video_dict['date'] = (time.strftime('%Y') + '-' +
                                      date_data).strip()

            if len(time_data.split(':')) == 3:
                video_dict['time'] = time_data.strip()
            else:
                video_dict['time'] = (time_data + ':00').strip()
            test_date_time = video_dict['date'] + ' ' + video_dict['time']
            # print(test_date_time)
            if self.start_time <= test_date_time <= self.end_time:
                # 视频描述
                content = html.xpath(
                    '//div[@class="detail-info pr"]/h1//text()')
                # print('99999999999999999999999999999999999999999999999999999')
                # print(content)
                try:
                    video_dict['description'] = self.re_html(content).replace(
                        '\\u200d', '').replace('\\n', '')
                except:
                    video_dict['description'] = ''
                video_dict['content_id'] = video_dict['topic_id']
                # print(video_dict['description'])
                video_dict['clicks'] = ''
                # 播放量
                try:
                    play_data = html.xpath(
                        '//div[@class="detail-location"]/text()')[1].replace(
                            '播放', '').replace('\n', '').replace(' ', '')
                except:
                    play_data = ''
                video_dict['play'] = play_data
                # 评论数
                comment_num = html.xpath(
                    '//span[@id="commentCount"]/text()')[0]
                if comment_num == '评论':
                    video_dict['comments_count'] = 0
                else:
                    video_dict['comments_count'] = comment_num
                # 点赞数
                likes_data = html.xpath(
                    '//span[@itemprop="ratingCount"]/text()')[0]
                video_dict['likes'] = likes_data
                video_dict['reposts_count'] = ''
                # print(video_dict)
                response.close()
                self.parse_followers_count(video_dict)
            else:
                pass
        except:
            print(333333333333333333333333333, traceback.format_exc())

    # 抓取作者粉丝数
    def parse_followers_count(self, video_dict):
        try:
            url = video_dict['上传者url']
            # print(url)
            headers = {
                'Content-Type':
                'text/html; charset=utf-8',
                'Cookie':
                'MUSID=ta4877m5ongth47s2n7kt0km13; virtual_device_id=d8afe1d9634ad1f6591e3486d4312976; pvid=imZ0YWzzY7TAFPWAQnp71Vl1bDpOXY91; sid=ta4877m5ongth47s2n7kt0km13; UM_distinctid=16c84237ad71af-096aa6932eb12f-37c143e-1fa400-16c84237ad870f; searchStr=AJ%7C%E9%98%BF%E8%BF%AA%E8%BE%BE%E6%96%AF%7C%E6%9D%8E%E5%AE%81%7C%E8%80%90%E5%85%8B%7C361%E5%BA%A6%7C; CNZZDATA1256786412=2077818170-1565584700-https%253A%252F%252Fwww.baidu.com%252F%7C1565590100',
                'Host':
                'www.meipai.com',
                'Pragma':
                'no-cache',
                # 'Referer': url,
                'Upgrade-Insecure-Requests':
                '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'
            }
            try:
                # time.sleep(0.1)
                response = requests.get(url=url,
                                        headers=headers,
                                        proxies=proxies,
                                        allow_redirects=False,
                                        timeout=30)
            except:
                try:
                    # time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    # time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            allow_redirects=False,
                                            timeout=30)
            # 将响应转换成一个element对象
            html = etree.HTML(response.text)
            # 粉丝数
            followers_count = html.xpath(
                '//div[@class="user-num"]/a[4]/span[1]//text()')
            video_dict['followers_count'] = self.re_html(
                followers_count).replace(' ', '')
            video_dict['file_code'] = '165'
            video_data = video_dict.pop('上传者url')
            # logger.log(31, '--------------------------------开始录入视频主贴数据------------------------------------')
            # print(video_dict)
            response.close()
            # self.write_topic_jsonfile(video_dict)
            item = json.dumps(dict(video_dict), ensure_ascii=False) + '\n'
            self.hdfsclient.new_write(
                '/user/cspider_daily/nike_2h/video/{}/{}/165_{}_{}_MeiPai_Nike.json'
                .format(self.date_time, self.h2_name, time.strftime('%Y%m%d'),
                        self.time_data),
                item,
                encoding='utf-8')

            if int(video_dict['comments_count']) == 0:
                logger.log(31, '此主贴没有视频评论回复。。。。。。')
            else:
                pages = int(
                    math.ceil(float(int(video_dict['comments_count']) / 10)))
                # logger.log(31, '~~~~~~~~~~~~~~~~视频回帖数：%s , 回帖总页数: %s ~~~~~~~~~~~~~' % (video_dict['comments_count'], pages))
                self.parse_comment_data(video_dict, pages)
        except:
            print(4444444444444444444, traceback.format_exc())

    # 抓取视频回复数据
    def parse_comment_data(self, video_dict, pages):
        try:
            is_break = self.is_break
            headers = {
                'Content-Type': 'application/json; charset=utf-8',
                'Cookie':
                'MUSID=ta4877m5ongth47s2n7kt0km13; virtual_device_id=d8afe1d9634ad1f6591e3486d4312976; pvid=imZ0YWzzY7TAFPWAQnp71Vl1bDpOXY91; sid=ta4877m5ongth47s2n7kt0km13; UM_distinctid=16c84237ad71af-096aa6932eb12f-37c143e-1fa400-16c84237ad870f; searchStr=AJ%7C%E9%98%BF%E8%BF%AA%E8%BE%BE%E6%96%AF%7C%E6%9D%8E%E5%AE%81%7C%E8%80%90%E5%85%8B%7C361%E5%BA%A6%7C; CNZZDATA1256786412=2077818170-1565584700-https%253A%252F%252Fwww.baidu.com%252F%7C1565590100',
                'Host': 'www.meipai.com',
                'Pragma': 'no-cache',
                # 'Referer': video_dict['url'],
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest'
            }
            for i in range(1, int(pages) + 1):
                url = 'https://www.meipai.com/medias/comments_timeline?page={}&count=10&id={}'.format(
                    i, video_dict['topic_id'])
                # print(url)
                try:
                    # time.sleep(0.1)
                    response = requests.get(url=url,
                                            headers=headers,
                                            proxies=proxies,
                                            allow_redirects=False,
                                            timeout=30)
                except:
                    try:
                        # time.sleep(0.1)
                        response = requests.get(url=url,
                                                headers=headers,
                                                proxies=proxies,
                                                allow_redirects=False,
                                                timeout=30)
                    except:
                        # time.sleep(0.1)
                        response = requests.get(url=url,
                                                headers=headers,
                                                allow_redirects=False,
                                                timeout=30)
                comments_data = json.loads(response.text)
                video_comment = dict()

                for item in comments_data:
                    date_time_data = item['created_at']
                    # print(date_time_data, self.clean_date(date_time_data))
                    date_data = self.clean_date(date_time_data).split(' ')[0]
                    time_data = self.clean_date(date_time_data).split(' ')[1]
                    if len(date_data.split('-')) == 3:
                        date_data_test = date_data.strip()
                    else:
                        date_data_test = (time.strftime('%Y') + '-' +
                                          date_data).strip()

                    if len(time_data.split(':')) == 3:
                        time_data_test = time_data.strip()
                    else:
                        time_data_test = (time_data + ':00').strip()

                    date_time_test = date_data_test + ' ' + time_data_test
                    if self.start_time <= date_time_test <= self.end_time:
                        video_comment['platform'] = video_dict['platform']
                        video_comment['source_date'] = video_dict['date']
                        video_comment['source_time'] = video_dict['time']
                        video_comment['date'] = date_data_test
                        video_comment['time'] = time_data_test
                        video_comment['title'] = video_dict['title']
                        video_comment['author'] = item['user']['screen_name']
                        video_comment['author_id'] = str(item['user']['id'])
                        video_comment['content'] = item['content_origin']
                        video_comment['content_id'] = str(item['id'])
                        video_comment['floor'] = ''
                        video_comment['keyword'] = video_dict['keyword']
                        video_comment['comment_url'] = url
                        video_comment['source_url'] = video_dict['url']
                        video_comment['comments_count'] = ''
                        video_comment['likes'] = ''
                        video_comment['views'] = ''
                        video_comment['reposts_count'] = ''
                        video_comment['topic_id'] = video_dict['topic_id']
                        video_comment['imageurl'] = ''
                        video_comment['audiourl'] = ''
                        video_comment['file_code'] = '166'
                        # logger.log(31, '--------------------------------开始录入视频回贴内容------------------------------------')
                        # print(video_comment)
                        # self.write_comment_jsonfile(video_comment)
                        response.close()
                        item = json.dumps(dict(video_comment),
                                          ensure_ascii=False) + '\n'
                        self.hdfsclient.new_write(
                            '/user/cspider_daily/nike_2h/videocomments/{}/{}/166_{}_{}_MeiPai_nike.json'
                            .format(self.date_time, self.h2_name,
                                    time.strftime('%Y%m%d'), self.time_data),
                            item,
                            encoding='utf-8')
                    if date_data_test < self.start_time:
                        is_break = True
                if is_break:
                    break

        except:
            print(5555555555555555555555, traceback.format_exc())

    # 读取excel获取关键词
    def parse_xlsx(self):
        # 设置路径
        path = './快消采集关键词_v12_20200119.xlsx'
        # 打开execl
        workbook = xlrd.open_workbook(path)

        # 根据sheet索引或者名称获取sheet内容
        Data_sheet = workbook.sheets()[0]  # 通过索引获取

        rowNum = Data_sheet.nrows  # sheet行数
        colNum = Data_sheet.ncols  # sheet列数

        # 获取所有单元格的内容
        list = []
        for i in range(rowNum):
            rowlist = []
            for j in range(colNum):
                rowlist.append(Data_sheet.cell_value(i, j))
            list.append(rowlist)

        for data in list[1::]:
            brand = data[0]
            # print(brand)
            yield {
                '关键词': brand,
            }

    def run(self):
        key_word_list = []
        for item in self.parse_xlsx():
            # print(item)
            key_word_list.append(item)
        for item_data1 in key_word_list:
            print(item_data1['关键词'])
            self.parse_page1(item_data1['关键词'])
            self.parse_page2(item_data1['关键词'])