Example #1

# Imports needed to run this snippet (the project-local names below are
# assumptions about the original codebase, which is not shown):
import json
import re

import scrapy
from lxml import etree
from scrapy import Request
# from travellersp.pipelines import TravellerspPipeline   # assumed location
# from travellersp.items import Yihaodian                 # assumed location
# from travellersp.utils import getUrlWithPars, get_locationtime  # assumed

class YiHaoDianSpider(scrapy.Spider):
    name = 'YiHaoDian'
    cur_page = None
    productPage = {
        '莫斯利安酸奶': 1,
        '莫斯利安酸牛奶': 1,
        '乳粉': 1,
        '光明奶粉': 1,
        '光明牛奶饮品': 1,
        '光明酸牛奶': 1,
        '光明酸奶': 1,
        '光明乳酸菌饮品': 1,
        '光明纯牛奶': 1,
        '光明优+': 1,
        '莫斯利安双发酵酸乳': 1,
        '莫斯利安两果三蔬': 1,
    }

    headers2 = {
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
        'user-agent':
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
        'accept':
        "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
        'referer': "https://item.yhd.com/{}.html",
        'authority': "item.yhd.com",
        'x-requested-with': "XMLHttpRequest",
        'Cache-Control': "no-cache"
    }

    headers = {
        'method': 'GET',
        'accept':
        'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'referer': None,
        'charset': 'UTF-8',
        'user-agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest'
    }

    pars = {
        'productId': None,
        'pagenationVO.currentPage': None,
        'pagenationVO.rownumperpage': 10,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)  # keep Scrapy's own Spider setup
        self.pipline = TravellerspPipeline()

    def start_requests(self):
        for keyword, pages in self.productPage.items():
            for p in range(1, pages + 1):
                page_urls = [
                    # 'https://search.yhd.com/c0-0/k{}'.format(keyword),
                    "https://search.yhd.com/c0-0/mbname{k1}-b/a-s2-v4-p{p}-price-d0-f0b-m1-rt0-pid-mid0-color-size-k{k2}/"
                    .format(k1=keyword, k2=keyword, p=p),

                    # "https://search.yhd.com/searchPage/c0-0/mbname{k1}-b/a-s2-v4-p{p}-price-d0-f0b-m1-rt0-pid-mid0-color-size-k{k2}/?isGetMoreProducts=1&moreProductsFashionCateType=2&fashionCateType=2".format(k1=keyword, k2=keyword, p=p)
                    'https://search.yhd.com/searchPage/c0-0/mbname{k1}-b/a-s1-v4-p1-price-d0-f0b-m1-rt0-pid-mid0-color-size-k{k2}/?isGetMoreProducts=1&moreProductsFashionCateType=2&fashionCateType=2'
                    .format(k1=keyword, k2=keyword)
                ]
                # Bug fix: the original left the two {} placeholders unfilled.
                self.headers['referer'] = (
                    "https://search.yhd.com/c0-0/mbname{}-b/a-s2-v4-p1-price-"
                    "d0-f0b-m1-rt0-pid-mid0-color-size-k{}/".format(keyword, keyword))
                for idx, url in enumerate(page_urls):
                    if idx == 1:
                        tmpheads = self.headers
                        req_type = 1
                    else:
                        tmpheads = None
                        req_type = 0
                    yield Request(url=url,
                                  callback=self.prodPage_parse,
                                  dont_filter=True,
                                  meta={
                                      'hd': tmpheads,
                                      'type': req_type,
                                      'a': keyword
                                  })

    def prodPage_parse(self, response):
        print('Response -- {a} - product search result page --- {l}\n'.format(
            l=len(response.text), a=response.meta['a']))
        if response.meta['type'] == 1:
            sele = etree.HTML(json.loads(response.text)['value'])
            items = sele.xpath('//div[@class="mod_search_pro"]/div')
            for each in items:
                comment_count = each.xpath(
                    './p[@class="proPrice"]/span[@class="comment"]/a/@experiencecount'
                )[0]
                if int(comment_count) > 0:
                    product_id = each.xpath('./@comproid')[0]
                    self.headers[
                        'referer'] = 'https://item.yhd.com/{}.html'.format(
                            product_id)
                    self.pars['productId'] = product_id
                    self.pars['pagenationVO.currentPage'] = 1
                    self.cur_page = 1
                    url = 'https://item.yhd.com/squ/comment/getCommentDetail.do?{}'.format(
                        getUrlWithPars(self.pars))
                    yield Request(url=url,
                                  callback=self.loop_ParsAndRequest,
                                  dont_filter=True,
                                  meta={
                                      'hd': self.headers,
                                      'pid': product_id
                                  })
        else:
            items = response.css('.mod_search_pro')
            for each in items:
                comment_count = each.css('.comment a').xpath(
                    '@experiencecount').extract_first()
                if int(comment_count) > 0:
                    product_id = each.css('.itemBox').xpath(
                        '@comproid').extract_first()
                    self.headers[
                        'referer'] = 'https://item.yhd.com/{}.html'.format(
                            product_id)
                    self.pars['productId'] = product_id
                    self.pars['pagenationVO.currentPage'] = 1
                    self.cur_page = 1
                    url = 'https://item.yhd.com/squ/comment/getCommentDetail.do?{}'.format(
                        getUrlWithPars(self.pars))
                    yield Request(url=url,
                                  callback=self.loop_ParsAndRequest,
                                  dont_filter=True,
                                  meta={
                                      'hd': self.headers,
                                      'pid': product_id
                                  })

    def loop_ParsAndRequest(self, response):
        # A short body means this product has no further comment pages.
        if len(response.text) < 5000:
            return
        else:
            self.analysis(response)
            # self.cur_page is shared across concurrently-crawled products,
            # so page counters can interleave (kept as in the original).
            self.cur_page = self.cur_page + 1
            # Dropped the original's `self.headers = response.headers`, which
            # overwrote the request headers with response headers.
            self.pars['productId'] = response.meta['pid']
            self.pars['pagenationVO.currentPage'] = self.cur_page
            url = 'https://item.yhd.com/squ/comment/getCommentDetail.do?{}'.format(
                getUrlWithPars(self.pars))
            yield Request(url=url,
                          callback=self.loop_ParsAndRequest,
                          dont_filter=True,
                          meta={
                              'hd': self.headers,
                              'pid': response.meta['pid']
                          })

    def analysis(self, response):
        print(
            'Response --- PID:{p} comment page {cp} --- {l}\n'.format(
                p=response.meta['pid'],
                l=len(response.text),
                cp=self.cur_page),
            response.text)
        html = json.loads(response.text)['value']
        selector = etree.HTML(html)
        items = selector.xpath('//div[@class="item good-comment"]')
        for each in items:
            piplineItem = Yihaodian()
            piplineItem['content'] = each.xpath(
                './dl/dd[@class="clearfix"]/span[@class="text comment_content_text"]/text()'
            )[0]
            piplineItem['name'] = each.xpath(
                './div[@class="nameBox"]/span[@class="name"]/@username')[0]
            tmp = each.xpath(
                './div[@class="nameBox"]/span[@class="name"]/@id')[0]
            piplineItem['userid'] = re.search(r'userName(\d+)', tmp).group(1)
            piplineItem['star'] = each.xpath(
                './dl/dt[@class="user_info"]/span[2]/@class')[0]  # star rating
            tmp2 = each.xpath(
                './dl/dd[@class="replyBtn_con clearfix"]/span[@class="date"]/text()'
            )[0]
            piplineItem['date'] = re.search(
                r'\d+-\d+-\d+\s+\d+:\d+:\d+', tmp2).group(0)
            piplineItem['crawlTime'] = get_locationtime()

            print(piplineItem)

            self.pipline.process_item(item=piplineItem, spider=None)
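
The spider calls a project-local helper getUrlWithPars that is not shown in the source. A minimal sketch, assuming it simply URL-encodes the parameter dict into a query string:

from urllib.parse import urlencode

def getUrlWithPars(pars):
    # Hypothetical reconstruction: {'productId': 1, 'pagenationVO.currentPage': 2}
    # -> 'productId=1&pagenationVO.currentPage=2'
    return urlencode(pars)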
Example #2
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)  # keep Scrapy's own Spider setup
    self.pipline = TravellerspPipeline()
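
Every spider in these examples instantiates TravellerspPipeline by hand and pushes items into it with process_item(), bypassing Scrapy's pipeline machinery. A minimal sketch of the interface this implies (the real class and its storage backend are not shown):

class TravellerspPipeline:
    # Hypothetical skeleton matching how the spiders call it.
    def process_item(self, item, spider):
        # persist the item here (DB insert, file append, ...), then return it
        return item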
Example #3

# Imports as in Example #1, plus (assumed): import requests, the
# project-local TravellerspItem, and a `help` module with get_locationtime().
class YilongSpider(scrapy.Spider):
    name = 'yilong'

    headers = {
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding':
        'gzip',  # accept only gzip-compressed responses
        'accept-language':
        'zh-CN,zh;q=0.9',
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36'
    }

    name_map = {
        'haohefengjingqu': '南通濠河景区',
        'nantonglangshan': '南通狼山风景名胜区',
        'shuihuiyuan': '南通如皋水绘园景区',
        'nantonghaidishijie': '南通海底世界旅游',
        'zhangjianjinianguan': '张謇纪念馆',
        'nantongbowuyuan': '南通博物苑景区',
        'qidong': '南通启东吕四渔港',
        'ntyybly': '南通园艺博览园',
        'seyuan': '南通啬园景区',
        'ntfttxwg': '南通方特探险王国',
    }

    def start_requests(self):
        for i in self.name_map.keys():
            url = 'http://trip.elong.com/{place}/tour/list-0-{index}.html'.format(
                place=i, index=1)
            yield Request(url=url,
                          callback=self.loop_request,
                          meta={
                              'curpage': 1,
                              'curname': i
                          })

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pipline = TravellerspPipeline()

    def loop_request(self, response):
        body = json.loads(response.text)
        print(body)
        if body['errno'] != 0:
            print('errno != 0')
            return
        else:
            print('turning page')
            for item in self.analysis_list(jsonBy=body, meta=response.meta):
                self.throw_request(item=item)
            page = response.meta['curpage'] + 1
            name = response.meta['curname']
            url = 'http://trip.elong.com/{place}/tour/list-0-{index}.html'.format(
                place=name, index=page)
            return Request(url=url,
                           callback=self.loop_request,
                           meta={
                               'curpage': page,
                               'curname': name
                           })

    def throw_request(self, item):
        # Blocking requests.get call that bypasses Scrapy's scheduler;
        # kept as in the original.
        source = requests.get(url=item['url'], headers=self.headers)
        self.analysis_article(source.text, item)

    def analysis_article(self, source, item):
        selector = etree.HTML(source)
        content = selector.xpath('string(//div[@class="article_center"])')
        item['content'] = content
        self.pipline.process_item(item, None)

    def analysis_list(self, jsonBy, meta):
        print('entering list analysis...')
        # One item instance is reused across iterations; this is safe only
        # because loop_request consumes each yielded item synchronously.
        itemspipline = TravellerspItem()

        for i in jsonBy['data']['notes_list']:
            itemspipline['id'] = i['nid']
            itemspipline['url'] = i['share_url']
            itemspipline['platform'] = '艺龙'
            itemspipline['viewType'] = '文章'
            itemspipline['searchWord'] = self.name_map[meta['curname']]
            itemspipline['title'] = i['title']
            itemspipline['crawlTime'] = help.get_locationtime()
            itemspipline['publishTime'] = i['create_time']
            itemspipline['level'] = 1
            itemspipline['like'] = ''
            itemspipline['authorName'] = i['author']['name']
            itemspipline['authorID'] = i['author']['url']
            itemspipline['content'] = ''
            yield itemspipline
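
Examples #3 through #5 stamp every item with help.get_locationtime(), and Example #1 calls the same helper directly. A plausible sketch, assuming it just returns the current local time as a formatted string:

import time

def get_locationtime():
    # Hypothetical reconstruction of the crawl-timestamp helper.
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())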
Example #4

# Imports as in Example #1, plus the legacy logging API used below
# (assumed): from scrapy import log  -- deprecated in modern Scrapy.
class XiechengSpider(scrapy.Spider):
    name = 'xiecheng_v2'
    allowed_domains = ['ctrip.com']
    # start_urls = ['http://ctrip.com/']
    param = {
        'poiID': None,
        'districtId': 85,
        'districtEName': 'Nantong',
        'pagenow': None,
        'order': '3.0',
        'star': '0.0',
        'tourist': '0.0',
        'resourcetype': 2
    }

    poiId_list = {
        '76177': '南通濠河景区',
        '96090': '南通狼山风景名胜区',
        '76178': '南通如皋水绘园景区',
        '87809': '张謇纪念馆',
        '76180': '南通博物苑景区',
        '87835': '南通启东吕四渔港',
        '92329': '南通园艺博览园',
        '76256': '南通啬园',
        '101266': '南通方特探险王国'
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pipline = TravellerspPipeline()

    def start_requests(self):
        for i in self.poiId_list.keys():
            self.param['poiID'] = i
            self.param['pagenow'] = 1
            url = 'http://you.ctrip.com/destinationsite/TTDSecond/SharedView/AsynCommentView'

            log.msg('Sending page-1 request for {name}'.format(
                name=self.poiId_list[i]), log.INFO)
            # Pass a copy: every request would otherwise share (and see the
            # final mutation of) the same self.param dict.
            yield Request(url=url,
                          callback=self.first_parse,
                          meta={
                              'data': dict(self.param),
                              'curid': i
                          },
                          dont_filter=True)

    def first_parse(self, response):
        curid = response.meta['curid']
        name = self.poiId_list[curid]
        self.analysis(response=response)

        ttd_pager = response.xpath('//div[@class="ttd_pager cf"]').extract()
        if ttd_pager:
            numpage = response.css('.numpage').xpath('text()').extract_first()
            log.msg('{n} has {p} pages'.format(n=name, p=numpage), log.INFO)

            for i in range(2, int(numpage) + 1):
                # self.param['poiID'] = curid
                # self.param['pagenow'] = i
                loop_url = 'http://you.ctrip.com/destinationsite/TTDSecond/SharedView/AsynCommentView'
                yield Request(url=loop_url,
                              callback=self.analysis,
                              meta={
                                  'data': {
                                      'poiID': curid,
                                      'districtId': 85,
                                      'districtEName': 'Nantong',
                                      'pagenow': i,
                                      'order': '3.0',
                                      'star': '0.0',
                                      'tourist': '0.0',
                                      'resourcetype': 2
                                  },
                                  'curpage': i,
                                  'curid': curid
                              },
                              dont_filter=True)

        else:
            log.msg('{name} has only one page; done'.format(name=name),
                    log.INFO)
            return


    def analysis(self, response):
        name = self.poiId_list[response.meta['curid']]
        status = response.xpath(
            '//div[@class="ttd_pager cf"]/p/text()').extract()
        # log.msg('目前分析 {name}:{status}'.format(name=name,status=status[0] if status else '无status'), log.INFO)

        itemspipline = TravellerspItem()
        items = response.css('.comment_ctrip .comment_single')
        for item in items:
            log.msg(
                '{name}:{status}'.format(
                    name=name, status=status[0] if status else 'no status'),
                log.INFO)

            id = item.css('.useful a').xpath('@data-id').extract()
            publishTime = item.css('.time_line').xpath('string(.)').extract()
            like = item.css('.useful em').xpath('string(.)').extract()
            authorName = item.css('.userimg .ellipsis a').xpath(
                'string(.)').extract()
            authorID = item.xpath(
                './div[@class="userimg"]/span[@class="ellipsis"]/a[@itemprop="author"]/@href'
            ).extract()
            content = item.css('.main_con .heightbox').xpath(
                'string(.)').extract()

            # itemspipline['id'] = '{name}:{status}'.format(name=name,status=status[0] if status else '无status')
            itemspipline['id'] = id[0] if id else ''
            itemspipline['url'] = response.url
            itemspipline['platform'] = '携程'
            itemspipline['viewType'] = '评论'
            itemspipline['searchWord'] = name
            itemspipline['title'] = name
            itemspipline['crawlTime'] = help.get_locationtime()
            itemspipline['publishTime'] = publishTime[0] if publishTime else ''
            itemspipline['level'] = 1
            itemspipline['like'] = like[0] if like else ''
            itemspipline['authorName'] = authorName[0] if authorName else ''
            itemspipline['authorID'] = authorID[0] if authorID else ''
            itemspipline['content'] = content[0] if content else ''
            print(itemspipline, '\n')
            self.pipline.process_item(item=itemspipline, spider=None)
        return
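
Nothing in stock Scrapy consumes meta['data'] or meta['hd'] on its own, so these spiders presumably rely on a custom downloader middleware that is not shown. A hedged sketch of what such a middleware could look like (entirely an assumption; the real project may POST the data instead):

from urllib.parse import urlencode

class MetaParamsMiddleware:
    # Hypothetical middleware: applies headers passed via meta['hd'] and
    # appends meta['data'] to the URL as query parameters.
    def process_request(self, request, spider):
        hd = request.meta.get('hd')
        if hd:
            for key, value in hd.items():
                request.headers[key] = value
        data = request.meta.pop('data', None)  # pop so the rebuilt request passes through
        if data:
            # Returning a new Request reschedules it through the middleware chain.
            return request.replace(
                url='{}?{}'.format(request.url, urlencode(data)))
        return None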
Example #5

# Imports as in Example #1, plus: from urllib.parse import urlencode
class XiechengSpider(scrapy.Spider):
    name = 'xiecheng'
    allowed_domains = ['ctrip.com']
    # start_urls = ['http://ctrip.com/']
    param = {
        'poiID': None,
        'districtId': 85,
        'districtEName': 'Nantong',
        'pagenow': None,
        'order': '3.0',
        'star': '0.0',
        'tourist': '0.0',
        'resourcetype': 2
    }

    poiId_list = {
        '76177': '南通濠河景区',
        '96090': '南通狼山风景名胜区',
        '76178': '南通如皋水绘园景区',
        '87809': '张謇纪念馆',
        '76180': '南通博物苑景区',
        '87835': '南通启东吕四渔港',
        '92329': '南通园艺博览园',
        '76256': '南通啬园',
        '101266': '南通方特探险王国'
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pipline = TravellerspPipeline()

    def start_requests(self):
        for i in self.poiId_list.keys():
            self.param['poiID'] = i
            self.param['pagenow'] = 1
            url = 'http://you.ctrip.com/destinationsite/TTDSecond/SharedView/AsynCommentView'
            # Pass a copy of the shared param dict so concurrent poiID
            # chains don't clobber each other's values.
            yield Request(url=url,
                          callback=self.loop_request,
                          meta={
                              'data': dict(self.param),
                              'curpage': 1,
                              'curid': i
                          },
                          dont_filter=True)

    def loop_request(self, response):
        if not response.css('.comment_single').extract():
            print('no content')
            return
        else:
            print('turning page')
            self.analysis(response=response)
            curid = response.meta['curid']
            curpage = response.meta['curpage'] + 1
            self.param['poiID'] = curid
            self.param['pagenow'] = curpage
            loop_url = 'http://you.ctrip.com/destinationsite/TTDSecond/SharedView/AsynCommentView'
            # Again, pass a copy of the shared dict.
            return Request(url=loop_url,
                           callback=self.loop_request,
                           meta={
                               'data': dict(self.param),
                               'curpage': curpage,
                               'curid': curid
                           },
                           dont_filter=True)

    def analysis(self, response):
        print('entering analysis...')
        itemspipline = TravellerspItem()
        items = response.css('.comment_ctrip .comment_single')
        for item in items:
            publishTime = item.css('.time_line').xpath('string(.)').extract()
            like = item.css('.useful em').xpath('string(.)').extract()
            authorName = item.css('.userimg .ellipsis a').xpath(
                'string(.)').extract()
            # authorID = item.css('.userimg .ellipsis a').xpath('/@href').extract()
            authorID = item.xpath(
                './div[@class="userimg"]/span[@class="ellipsis"]/a[@itemprop="author"]/@href'
            ).extract()
            content = item.css('.main_con .heightbox').xpath(
                'string(.)').extract()

            itemspipline['id'] = ''
            itemspipline['url'] = response.url
            itemspipline['platform'] = '携程'
            itemspipline['viewType'] = '评论'
            itemspipline['searchWord'] = self.poiId_list[
                response.meta['curid']]
            itemspipline['title'] = self.poiId_list[response.meta['curid']]
            itemspipline['crawlTime'] = help.get_locationtime()
            itemspipline['publishTime'] = publishTime[0] if publishTime else ''
            itemspipline['level'] = 1
            itemspipline['like'] = like[0] if like else ''
            itemspipline['authorName'] = authorName[0] if authorName else ''
            itemspipline['authorID'] = authorID[0] if authorID else ''
            itemspipline['content'] = content[0] if content else ''
            print(itemspipline)
            self.pipline.process_item(item=itemspipline, spider=None)
        return
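
The TravellerspItem fields assigned in Examples #3 through #5 are consistent, so the item definition they imply can be sketched (field names are taken from the code; the class definition itself is assumed):

import scrapy

class TravellerspItem(scrapy.Item):
    # Fields inferred from the assignments in the spiders above.
    id = scrapy.Field()
    url = scrapy.Field()
    platform = scrapy.Field()
    viewType = scrapy.Field()
    searchWord = scrapy.Field()
    title = scrapy.Field()
    crawlTime = scrapy.Field()
    publishTime = scrapy.Field()
    level = scrapy.Field()
    like = scrapy.Field()
    authorName = scrapy.Field()
    authorID = scrapy.Field()
    content = scrapy.Field()
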
class MafangwoSpider(scrapy.Spider):
    name = 'mafangwo'

    id_map = {
        '6327040': '南通濠河景区',
        '5430520': '南通狼山风景名胜区',
        '6325267': '南通如皋水绘园景区',
        '5426931': '南通海底世界旅游',
        '3721876': '张謇纪念馆',
        '5503589': '南通博物苑景区',
        # 'qidong': '南通启东吕四渔港',
        # NOTE: duplicate key -- this entry silently overwrites 南通博物苑景区
        # above; one of the two poi_ids is likely wrong in the original.
        '5503589': '南通园艺博览园',
        '5429244': '南通啬园景区',
        '7052809': '南通方特探险王国'
    }

    par = {
        'params': '{"poi_id":"6327040","page":2,"just_comment":1}',
    }

    def __init__(self):
        self.pipline = TravellerspPipeline()

    def start_requests(self):
        for i in self.id_map.keys():
            # Build the params JSON with json.dumps instead of string
            # concatenation (compact separators match the original format).
            self.par['params'] = json.dumps(
                {"poi_id": str(i), "page": 1, "just_comment": 1},
                separators=(',', ':'))
            url = 'http://pagelet.mafengwo.cn/poi/pagelet/poiCommentListApi?{}'.format(
                urlencode(self.par))
            yield Request(url=url,
                          callback=self.loop_request,
                          meta={
                              'curpage': 1,
                              'curid': i
                          })

    def loop_request(self, response):
        body = json.loads(response.text)

        # '暂无内容' ("no content yet") in the returned HTML marks the last page.
        if re.findall(r'暂无内容', body['data']['html']):
            print('no more content')
            return
        else:
            self.analysis(jsonBy=body, url=response.url, meta=response.meta)
            print('turning page')
            page = response.meta['curpage'] + 1
            curid = response.meta['curid']
            self.par['params'] = json.dumps(
                {"poi_id": str(curid), "page": page, "just_comment": 1},
                separators=(',', ':'))
            loop_url = 'http://pagelet.mafengwo.cn/poi/pagelet/poiCommentListApi?{}'.format(
                urlencode(self.par))
            return Request(url=loop_url,
                           callback=self.loop_request,
                           meta={
                               'curpage': page,
                               'curid': curid
                           })

    def analysis(self, jsonBy, url, meta):
        print('entering analysis...')
        itemspipline = TravellerspItem()
        selector = etree.HTML(jsonBy['data']['html'])
        rows = selector.xpath(
            '//div[@class="rev-list"]/ul/li[@class="rev-item comment-item clearfix"]'
        )
        for i in rows:
            id = i.xpath('./a[@class="useful"]/@data-id')
            name = i.xpath('./a[@class="name"]/text()')
            authorID = i.xpath('./div[@class="user"]/a[@class="avatar"]/@href')
            content = i.xpath('./p[@class="rev-txt"]/text()')
            pub_time = i.xpath(
                './div[@class="info clearfix"]/span[@class="time"]/text()')
            like = i.xpath(
                './a[@class="useful"]/span[@class="useful-num"]/text()')

            itemspipline['id'] = id[0] if id else ''
            itemspipline['url'] = url
            itemspipline['platform'] = '马蜂窝'
            itemspipline['viewType'] = '问答'
            itemspipline['searchWord'] = self.id_map[meta['curid']]
            itemspipline['title'] = self.id_map[meta['curid']]
            itemspipline['crawlTime'] = help.get_locationtime()
            itemspipline['publishTime'] = pub_time[0] if pub_time else ''
            itemspipline['level'] = 1
            itemspipline['like'] = like[0] if like else ''
            itemspipline['authorName'] = name[0] if name else ''
            itemspipline['authorID'] = authorID[0] if authorID else ''
            itemspipline['content'] = content[0] if content else ''
            print(itemspipline)
            self.pipline.process_item(item=itemspipline, spider=None)
        return
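
These snippets assume a full Scrapy project, but for quick experiments each spider can also be driven from a plain script using Scrapy's standard CrawlerProcess API, for example:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    'DOWNLOAD_DELAY': 1,  # be polite to the target sites
})
process.crawl(MafangwoSpider)
process.start()  # blocks until the crawl finishes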