Example #1
    def parse(self, response):
        res = response.xpath('//div[@class="result"]/div[@class="box-result clearfix"]')
        if len(res) == 0:
            self.end = True
            return

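        # one result box per article; seeing a title we've already stored
        # means the results have started repeating, so stop paginating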
        for each in res:
            title = each.xpath('div[@class="r-info r-info2"]/h2/a//text()').extract()
            title = ''.join(title)
            if title == '':
                title = each.xpath('h2/a//text()').extract()
                title = ''.join(title)
            if title in self.flag:
                self.end = True
                return
            self.flag.add(title)

            author = '新华网'

            excerpt = each.xpath('div[@class="r-info r-info2"]/p//text()').extract()
            excerpt = ''.join(excerpt)
            if excerpt == '':
                # retry with the doubled-space class variant ("r-info  r-info2")
                excerpt = each.xpath('div[@class="r-info  r-info2"]/p//text()').extract()
                excerpt = ''.join(excerpt)

            release_time = each.xpath('div[@class="r-info r-info2"]/h2//text()').extract()
            if len(release_time) == 0:
                release_time = each.xpath('h2//text()').extract()
            # the date sits at a fixed offset inside the last text node
            release_time = release_time[-1][-20:-9]

            url = each.xpath('div[@class="r-info r-info2"]/h2/a/@href').extract()
            url = ''.join(url)
            if url == '':
                url = each.xpath('h2/a/@href').extract()
                url = ''.join(url)
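            # fetch the article synchronously with requests and pull the text
            # of every <p> tag (this bypasses Scrapy's scheduler)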
            html = requests.get(url).content
            selector = lxml.html.document_fromstring(html)
            content = selector.xpath('//p//text()')
            content = ''.join(content).replace('\'', '')

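            # stock placeholder image; no thumbnail is scraped for Xinhua results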
            img_url = 'http://seopic.699pic.com/photo/50045/7863.jpg_wh1200.jpg'

            item = NewsspiderItem()
            item['title'] = title
            item['author'] = author
            item['release_time'] = release_time
            item['excerpt'] = excerpt
            item['content'] = content
            item['img_url'] = img_url
            yield item

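        # no repeats found on this page, so request the next one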
        if not self.end:
            self.page += 1
            yield Request(URL.format(q=keyword, stime=self.beginTime, etime=self.endTime, page=self.page), self.parse,
                          dont_filter=True)
Example #2
    def parse_photonews(self, response):
        item = NewsspiderItem()
        item['title'] = response.xpath("//h1/text()").extract()
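        # extract a "YYYY-MM-DD HH:MM:SS" timestamp from the time/source block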
        item['date'] = response.xpath("//div[@class='post_time_source']/text()").re(
            r'[0-9]*-[0-9]*-[0-9]* [0-9]*:[0-9]*:[0-9]*')
        item['source'] = response.xpath("//a[@id='ne_article_source']/text()").extract()
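        # photo-news bodies keep their text in <span> tags inside unclassed <p> tags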
        item['content'] = ''.join(response.xpath("//div[@class='picinfo-text']/p[not(@class)]/span/text()").extract()).replace('\n', '')
        item['url'] = response.url

        yield item
Example #3
    def page_parser(self, response):
        data = self.parser.extract_news(response.text)
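        # extract_news is expected to return a falsy value when parsing fails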
        if data:
            item = NewsspiderItem()
            item['keyword'] = self.keyword
            item['news_url'] = response.meta['url']
            item['news_time'] = data['news_pubtime']
            item['news_date'] = data['news_date']
            item['news_title'] = data['news_title']
            item['news_content'] = data['news_content']
            yield item
Example #4
    def parse(self, response):
        res = response.xpath('//div[@id="news_list"]/table')
        if len(res) == 0:
            self.end = True
            return

        for each in res:
            title = each.xpath('tr/td/ul/li[@class="news_title"]/a//text()').extract()
            title = ''.join(title)
            if title in self.flag:
                self.end = True
                return
            self.flag.add(title)

            author = '中新网'

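            # news_other holds "<url> <date> <time>", separated by whitespace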
            ans = each.xpath('tr/td/ul/li[@class="news_other"]/text()').extract_first().split()
            release_time = ans[1] + " " + ans[2]

            excerpt = each.xpath('tr/td/ul/li[@class="news_content"]//text()').extract()
            excerpt = ''.join(excerpt).replace('    ', '').replace('\r', '').replace('\t', '')
            excerpt = excerpt.lstrip().replace(' ', '')

            # workaround: the Chinanews search page and its article pages use different encodings
            url = ans[0]
            html = requests.get(url).content.decode('GBK')
            selector = lxml.html.document_fromstring(html)
            content = selector.xpath('//p//text()')
            content = ''.join(content).replace('\'', '')

            img_url = each.xpath('tr/td/a/img[@class="rsimg"]/@src').extract()
            img_url = ''.join(img_url)
            # fall back to a stock placeholder when the result has no thumbnail
            if img_url == "":
                img_url = 'http://seopic.699pic.com/photo/50045/7863.jpg_wh1200.jpg'


            item = NewsspiderItem()
            item['title'] = title
            item['author'] = author
            item['release_time'] = release_time
            item['excerpt'] = excerpt
            item['content'] = content
            item['img_url'] = img_url
            yield item

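        # no repeated titles yet: request the next batch of 20 results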
        if not self.end:
            self.page += 1
            yield Request(URL.format(q=keyword, day1=self.beginTime, day2=self.endTime, start=self.page * 20),
                          self.parse, dont_filter=True)
Example #5
    def parse_news_list(self, response):
        # follow every article URL in the list
        json_array = "".join(
            response.text[14:-1].split())  # strip the "data_callback(...)" wrapper and whitespace
        news_array = json.loads(json_array)
        category = response.meta['category']
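        # queue a request for each article; the item rides along in meta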
        for row_data in news_array:
            news_item = NewsspiderItem()
            news_item["url"] = row_data["tlink"]

            yield scrapy.Request(news_item["url"],
                                 meta={"news_item": news_item},
                                 callback=self.parse_news_content)
Example #6
    def parse_item(self, response):
        item = NewsspiderItem()
        if 'special' not in response.url:  # 'special' URLs are news lists, not single articles
            item['title'] = response.xpath("//h1/text()").extract()
            item['date'] = response.xpath(
                "//div[@class='post_time_source']/text()").re(
                    r'[0-9]*-[0-9]*-[0-9]* [0-9]*:[0-9]*:[0-9]*')
            item['source'] = response.xpath(
                "//a[@id='ne_article_source']/text()").extract()
            # item['content'] = ''.join(response.xpath("//div[@id='endText']/p[not(@class)]").xpath('string(.)').extract())
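            # join body paragraphs, skipping <p> tags that carry a class
            # (typically captions or other styled blocks)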
            item['content'] = ''.join(
                response.xpath("//div[@id='endText']/p[not(@class)]/text()").
                extract()).replace('\n', '')
            item['url'] = response.url

        yield item
Example #7
File: news.py  Project: scutzzy/NewsSpider
    def parse(self, response):
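        # response.text[9:-1] strips the JSONP callback wrapper around the JSON body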
        json_raw = response.text[9:-1]
        json_dic = json.loads(json_raw)
        for key in json_dic.keys():
            for i in range(0, 20):  # assumes each channel list has at least 20 entries
                newsUrl = json_dic[key][i]["url"]
                if re.match(r"http://3g.163.com", newsUrl):
                    item = NewsspiderItem()
                    item['digest'] = json_dic[key][i]["digest"]  # str
                    item['title'] = json_dic[key][i]["title"]  # str
                    item['time'] = json_dic[key][i]["ptime"]  # str
                    item['commentCount'] = json_dic[key][i]["commentCount"]  # int
                    item['source'] = json_dic[key][i]["source"]  # str
                    yield Request(newsUrl,
                                  callback=self.parse_content,
                                  meta={'item': item})
Example #8
    def parse(self, response):
        # parse the crawled page
        item = NewsspiderItem()
        # for data in response.body[15:-13].split('","'):
        #     tmp = data.split(',')
        #     item = NewsspiderItem()
        # news title
        #     item['news_Title'] = 9
        # news publish time
        #     item['news_PublishDate'] = tmp[1]
        # news link -- used internally by the system
        #     item['news_Url'] = tmp[2]
        # news source URL
        item['news_FromUrl'] = str(response.url)
        # time the record was written to the database
        #     item['news_CreateDate'] = ''

        #     items.append(item)

        return item
Example #9
    def parse(self, response):
        ret_data = json.loads(response.text)
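        # the feed responds with {"data": [...]}, one dict per article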
        for i in ret_data['data']:
            item = NewsspiderItem()
            item['author'] = i['author']
            item['fpTime'] = datetime.datetime.utcfromtimestamp(int(i['fpTime'])).strftime('%Y.%m.%d %H:%M:%S')
            item['title'] = i['title']
            item['tags'] = str({'tags': i['tags']})
            item['url'] = i['url_https']
            yield scrapy.Request(
                item['url'],
                callback=self.parse_content,
                meta={'item':item}
            )

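        # bump the offset and queue the next feed page (no explicit stop condition)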
        self.page += 1
        next_url = 'https://cre.mix.sina.com.cn/api/v3/get?cre=tianyi&mod=pctech&offset={page}'.format(page=str(self.page))
        yield scrapy.Request(
            next_url,
            callback=self.parse
        )
Example #10
    def parse(self, response):
        res = response.xpath('//div[@class="mainM"]/div[@class="searchResults"]')
        if len(res) == 0:
            self.end = True
            return

        for each in res:
            title = each.xpath('p[@class="fz16 line24"]//text()').extract()
            title = ''.join(title)
            # duplicate-title check disabled; pagination stops at page 12 instead
            # if title in self.flag:
            #     self.end = True
            #     return
            self.flag.add(title)

            author = '凤凰资讯'

            excerpt = each.xpath('p//text()').extract()
            release_time = excerpt[-1].replace('\r', '').replace('\t', '').replace('\n', '').split()
            release_time = release_time[1]
            excerpt = ''.join(excerpt[0:-1])
            # a valid date string starts with "2"; otherwise fall back to today
            if release_time[0] != '2':
                release_time = time.strftime("%Y-%m-%d", time.localtime())
            # convert to a Unix timestamp for the date-range check below
            rtime = int(time.mktime(time.strptime(release_time, "%Y-%m-%d")))


            url = each.xpath('p[@class="fz16 line24"]/a/@href').extract()
            url = ''.join(url)
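            # fetch the article page directly and try several content selectors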
            html = requests.get(url).content
            selector = lxml.html.document_fromstring(html)
            content = selector.xpath('//div[@id="main_content"]//text()')
            content = ''.join(content).replace('\'', '').replace('\r','').replace('\t','').replace('\n','')
            if content == '':
                content = ''.join(selector.xpath('//div[@class="article"]/p//text()'))
            if content == '':
                content = excerpt

            img_url = 'http://seopic.699pic.com/photo/50045/7863.jpg_wh1200.jpg'
            item = NewsspiderItem()
            # keep the item only if it falls inside the requested time window
            if self.bt <= rtime <= self.et:
                item['title'] = title
                item['author'] = author
                item['release_time'] = release_time
                item['excerpt'] = excerpt
                item['content'] = content
                item['img_url'] = img_url
                yield item

        # hard stop after page 12 of the search results
        if self.page != 12:
            self.page += 1
            yield Request(URL.format(q=keyword, p=self.page), self.parse, dont_filter=True)