Ejemplo n.º 1
0
    def analysis(self, response):
        """Parse one page of comment results and push each comment to the pipeline.

        ``response.text`` is decoded as JSON and its ``'value'`` entry is parsed
        as HTML. Every ``div`` whose class attribute is exactly
        ``"item good-comment"`` becomes one ``Yihaodian`` item with the fields
        ``content``, ``name``, ``userid``, ``star``, ``date`` and ``crawlTime``,
        which is then passed to ``self.pipline.process_item``.

        A comment that lacks one of the expected nodes (or whose id/date text
        does not match the expected pattern) gets ``''`` for that field instead
        of aborting the whole page.

        Args:
            response: Response object; ``response.text`` must be a JSON document
                with a ``'value'`` key, and ``response.meta['pid']`` is read
                for the debug output.
        """
        print(
            '响应内容---PID:{p}评论结果{cp}页---{l}\n'.format(p=response.meta['pid'],
                                                     l=len(response.text),
                                                     cp=self.cur_page),
            response.text)

        def first(nodes):
            """Return the first XPath hit, or '' when nothing matched."""
            return nodes[0] if nodes else ''

        def captured(pattern, text, group):
            """Return the requested regex group, or '' when the pattern misses."""
            match = pattern.search(text)
            return match.group(group) if match else ''

        # Raw strings: the regex escapes must reach ``re`` untouched instead of
        # being treated as (invalid) Python string escapes.
        userid_re = re.compile(r'userName(\d+)')
        date_re = re.compile(r'\d+\-\d+\-\d+\s+\d+\:\d+\:\d+')

        html = json.loads(response.text)['value']
        selector = etree.HTML(html)
        for each in selector.xpath('//div[@class="item good-comment"]'):
            piplineItem = Yihaodian()
            piplineItem['content'] = first(each.xpath(
                './dl/dd[@class="clearfix"]/span[@class="text comment_content_text"]/text()'
            ))
            piplineItem['name'] = first(each.xpath(
                './div[@class="nameBox"]/span[@class="name"]/@username'))
            # The numeric user id is embedded in the span's id attribute.
            raw_id = first(each.xpath(
                './div[@class="nameBox"]/span[@class="name"]/@id'))
            piplineItem['userid'] = captured(userid_re, raw_id, 1)
            # Star rating: stored as the class attribute of the second span.
            piplineItem['star'] = first(each.xpath(
                './dl/dt[@class="user_info"]/span[2]/@class'))
            raw_date = first(each.xpath(
                './dl/dd[@class="replyBtn_con clearfix"]/span[@class="date"]/text()'
            ))
            piplineItem['date'] = captured(date_re, raw_date, 0)
            piplineItem['crawlTime'] = get_locationtime()

            print(piplineItem)

            self.pipline.process_item(item=piplineItem, spider=None)
            # Printed again so any change made by the pipeline is visible.
            print(piplineItem)
Ejemplo n.º 2
0
    def analysis(self, response):
        """Extract every comment on the page and hand each one to the pipeline.

        Comment nodes are selected with ``.comment_ctrip .comment_single``. For
        each node a fresh ``TravellerspItem`` is filled in and passed to
        ``self.pipline.process_item``; a field whose selector matches nothing
        is stored as ``''``.

        Args:
            response: Page response; must provide ``css()``, ``url`` and
                ``meta['curid']`` (used to index ``self.poiId_list``).
        """
        print('进入分析。。。。。。。。。。。。。。。。。。。。。')

        def first(values):
            """Return the first extracted value, or '' when there is none."""
            return values[0] if values else ''

        for node in response.css('.comment_ctrip .comment_single'):
            publish_time = node.css('.time_line').xpath('string(.)').extract()
            like = node.css('.useful em').xpath('string(.)').extract()
            author_name = node.css('.userimg .ellipsis a').xpath(
                'string(.)').extract()
            author_id = node.xpath(
                './div[@class="userimg"]/span[@class="ellipsis"]/a[@itemprop="author"]/@href'
            ).extract()
            content = node.css('.main_con .heightbox').xpath(
                'string(.)').extract()

            # One item per comment: reusing a single instance would make every
            # reference kept by the pipeline point at the last comment's data.
            record = TravellerspItem()
            poi_name = self.poiId_list[response.meta['curid']]

            record['id'] = ''
            record['url'] = response.url
            record['platform'] = '携程'
            record['viewType'] = '评论'
            record['searchWord'] = poi_name
            record['title'] = poi_name
            record['crawlTime'] = help.get_locationtime()
            record['publishTime'] = first(publish_time)
            record['level'] = 1
            record['like'] = first(like)
            record['authorName'] = first(author_name)
            record['authorID'] = first(author_id)
            record['content'] = first(content)
            print(record)
            self.pipline.process_item(item=record, spider=None)
    def analysis(self, jsonBy, url, meta):
        """Parse the HTML fragment carried in a JSON reply and store each entry.

        ``jsonBy['data']['html']`` is parsed with ``etree.HTML``; every
        ``li`` under ``div.rev-list/ul`` whose class attribute is exactly
        ``"rev-item comment-item clearfix"`` becomes one ``TravellerspItem``
        that is passed to ``self.pipline.process_item``. A field whose XPath
        matches nothing is stored as ``''``.

        Args:
            jsonBy: Decoded JSON reply containing ``['data']['html']``.
            url: Value stored in each item's ``url`` field.
            meta: Mapping whose ``'curid'`` entry indexes ``self.id_map``.
        """
        print('进入分析。。。。。。。。。。。。。。。。。。。。。')

        def first(nodes):
            """Return the first XPath hit, or '' when nothing matched."""
            return nodes[0] if nodes else ''

        selector = etree.HTML(jsonBy['data']['html'])
        rows = selector.xpath(
            '//div[@class="rev-list"]/ul/li[@class="rev-item comment-item clearfix"]'
        )
        for row in rows:
            comment_id = row.xpath('./a[@class="useful"]/@data-id')
            author_name = row.xpath('./a[@class="name"]/text()')
            author_id = row.xpath(
                './div[@class="user"]/a[@class="avatar"]/@href')
            content = row.xpath('./p[@class="rev-txt"]/text()')
            published = row.xpath(
                './div[@class="info clearfix"]/span[@class="time"]/text()')
            like = row.xpath(
                './a[@class="useful"]/span[@class="useful-num"]/text()')

            # One item per entry: reusing a single instance would make every
            # reference kept by the pipeline point at the last entry's data.
            record = TravellerspItem()
            subject = self.id_map[meta['curid']]

            record['id'] = first(comment_id)
            record['url'] = url
            record['platform'] = '马蜂窝'
            record['viewType'] = '问答'
            record['searchWord'] = subject
            record['title'] = subject
            record['crawlTime'] = help.get_locationtime()
            record['publishTime'] = first(published)
            record['level'] = 1
            record['like'] = first(like)
            record['authorName'] = first(author_name)
            record['authorID'] = first(author_id)
            record['content'] = first(content)
            print(record)
            self.pipline.process_item(item=record, spider=None)
Ejemplo n.º 4
0
 def article_parse(self, response):
     """Build a ``WeixinItem`` from an article page.

     The title comes from ``h2#activity-name``, the content from the text of
     ``#js_content`` and the publish time from the ``var publish_time = "..."``
     assignment found in the page source. Any of those three that cannot be
     found is stored as ``''`` rather than raising.

     Args:
         response: Article page response (``xpath()``, ``css()`` and ``text``
             are used).

     Returns:
         The populated ``WeixinItem``.
     """
     piplineitem = WeixinItem()
     piplineitem['name'] = self.map[self.dict['__biz']]
     # default='' keeps .strip() safe when the node is missing.
     piplineitem['title'] = response.xpath(
         '//h2[@id="activity-name"]/text()').extract_first(default='').strip()
     piplineitem['content'] = response.css('#js_content').xpath(
         'string(.)').extract_first(default='').strip()
     # Raw string so the regex escapes are not read as Python string escapes.
     publish_time = re.search(
         r'var\s+publish_time\s+=\s*"(\S*)"', response.text)
     piplineitem['publish_time'] = publish_time.group(1) if publish_time else ''
     piplineitem['crawlTime'] = help.get_locationtime()
     return piplineitem
Ejemplo n.º 5
0
    def analysis_list(self, jsonBy, meta):
        """Yield one ``TravellerspItem`` per entry of ``jsonBy['data']['notes_list']``.

        Each note supplies ``nid``, ``share_url``, ``title``, ``create_time``
        and ``author`` (``name`` / ``url``); ``like`` and ``content`` are left
        as ``''``.

        Args:
            jsonBy: Decoded JSON reply containing ``['data']['notes_list']``.
            meta: Mapping whose ``'curname'`` entry indexes ``self.name_map``.

        Yields:
            A newly created ``TravellerspItem`` for every note.
        """
        print('进入分析。。。。。。。。。。。。。。。。。。。。。')

        for note in jsonBy['data']['notes_list']:
            # A new item per note: yielding one shared instance would make all
            # yielded references alias the same object, so consumers that keep
            # them would only ever see the last note's data.
            record = TravellerspItem()
            author = note['author']

            record['id'] = note['nid']
            record['url'] = note['share_url']
            record['platform'] = '艺龙'
            record['viewType'] = '文章'
            record['searchWord'] = self.name_map[meta['curname']]
            record['title'] = note['title']
            record['crawlTime'] = help.get_locationtime()
            record['publishTime'] = note['create_time']
            record['level'] = 1
            record['like'] = ''
            record['authorName'] = author['name']
            record['authorID'] = author['url']
            record['content'] = ''
            yield record
Ejemplo n.º 6
0
    def analysis(self, response):
        """Extract every comment on the page and hand each one to the pipeline.

        Comment nodes are selected with ``.comment_ctrip .comment_single``. For
        each node a progress line is logged, a fresh ``TravellerspItem`` is
        filled in and passed to ``self.pipline.process_item``; a field whose
        selector matches nothing is stored as ``''``.

        Args:
            response: Page response; must provide ``css()``, ``xpath()``,
                ``url`` and ``meta['curid']`` (used to index
                ``self.poiId_list``).
        """
        name = self.poiId_list[response.meta['curid']]
        # Text of the pager paragraph, used only for the log line below.
        status = response.xpath(
            '//div[@class="ttd_pager cf"]/p/text()').extract()

        def first(values):
            """Return the first extracted value, or '' when there is none."""
            return values[0] if values else ''

        for node in response.css('.comment_ctrip .comment_single'):
            log.msg(
                '{name}:{status}'.format(
                    name=name, status=status[0] if status else '无status'),
                log.INFO)

            comment_id = node.css('.useful a').xpath('@data-id').extract()
            publish_time = node.css('.time_line').xpath('string(.)').extract()
            like = node.css('.useful em').xpath('string(.)').extract()
            author_name = node.css('.userimg .ellipsis a').xpath(
                'string(.)').extract()
            author_id = node.xpath(
                './div[@class="userimg"]/span[@class="ellipsis"]/a[@itemprop="author"]/@href'
            ).extract()
            content = node.css('.main_con .heightbox').xpath(
                'string(.)').extract()

            # One item per comment: reusing a single instance would make every
            # reference kept by the pipeline point at the last comment's data.
            record = TravellerspItem()
            record['id'] = first(comment_id)
            record['url'] = response.url
            record['platform'] = '携程'
            record['viewType'] = '评论'
            record['searchWord'] = name
            record['title'] = name
            record['crawlTime'] = help.get_locationtime()
            record['publishTime'] = first(publish_time)
            record['level'] = 1
            record['like'] = first(like)
            record['authorName'] = first(author_name)
            record['authorID'] = first(author_id)
            record['content'] = first(content)
            print(record, '\n')
            self.pipline.process_item(item=record, spider=None)