def crawl_tieba(self, response):
    """Parse the Baidu Tieba trending page, yielding one ScrapyNewsItem per link."""
    for anchor in response.css('div.main > ul > li  a'):
        entry = ScrapyNewsItem()
        entry['source'] = 'tieba_spider'
        entry['title'] = anchor.css('a::text').extract_first()
        entry['url'] = anchor.css('a').attrib['href']
        entry['remark'] = ''
        yield entry
 def crawl_topbaidu(self, response):
     trends = response.css('td.keyword >a:nth-child(1) ')
     for trend in trends:
         item = ScrapyNewsItem()
         item['source'] = 'topbaidu_spider'
         item['title'] = trend.css('a::text').extract_first()
         item['url'] = trend.css('a').attrib['href']
         item['remark'] = ''
         yield item
 def crawl_douban(self, response):
     trends = response.css('ul.trend > li > a')
     for trend in trends:
         item = ScrapyNewsItem()
         item['source'] = 'douban_spider'
         item['title'] = trend.css('a::text').extract_first()
         item['url'] = trend.css('a').attrib['href']
         item['remark'] = ''
         yield item
# Example #4 (marker carried over from the code-example source page)
 def parse_item(self, response):
     item = ScrapyNewsItem()
     item['title'] = response.xpath('/html/body/div[3]/div[2]/div[2]/p/text()').extract()
     item['date'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[2]/text()').extract()
     state = response.xpath('normalize-space(/html/body/div[3]/div[2]/div[2]/div[1]/span[3]/text())').extract()
     item['state'] = state
     item['number'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract()
     item['content'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract()
     item['url'] = response.url
     yield item
 def crawl_hupu(self, response):
     trends = response.css('div.list> ul > li >span:nth-child(1) >a')
     for trend in trends:
         item = ScrapyNewsItem()
         item['source'] = 'hupu_spider'
         item['title'] = trend.css('a').attrib['title']
         item['url'] = "https://bbs.hupu.com" + trend.css(
             'a').attrib['href']
         item['remark'] = ''
         yield item
 def crawl_weibo(self, response):
     trends = response.css('td.td-02 > a')
     for trend in trends:
         item = ScrapyNewsItem()
         item['source'] = 'weibo_spider'
         item['title'] = trend.css('a::text').extract_first()
         href = self.get_weibo_href(trend)
         item['url'] = "https://s.weibo.com" + href
         item['remark'] = ''
         yield item
 def crawl_github(self, response):
     trends = response.css('div> article.Box-row ')
     for trend in trends:
         item = ScrapyNewsItem()
         item['source'] = 'github_spider'
         title = "".join(trend.css('p::text').extract())
         re.sub(r'[\\*|“<>:/()()0123456789]', '', title)
         title.replace('\n', '').replace('  ', '')
         item['title'] = title
         item['url'] = "https://github.com" + trend.css(
             'h1>a').attrib['href']
         item['remark'] = ''
         yield item
    def agricultural_detail(self, response):
        """Parse a China Agriculture News article page into a ScrapyNewsItem.

        Article body text may live directly in TRS_Editor divs, in <p> tags,
        or inside a Custom_UnionStyle wrapper; all three are concatenated.
        Pages with no extractable body are skipped (returns without yielding).

        Bug fix: the optional-image lookup used a bare ``except: pass``,
        which also hid programming errors; it now swallows only the
        ``IndexError`` raised when no image matches.
        """
        item = ScrapyNewsItem()
        base_contents_list = response.xpath(
            '//div[@class="content"]/div[@class="TRS_Editor"]/div/text()'
        ).extract()
        tag_p_contents_list = response.xpath(
            '//div[@class="content"]/div[@class="TRS_Editor"]/p/text()'
        ).extract()
        tag_image_contents_list = response.xpath(
            '//div[@class="content"]/div[@class="TRS_Editor"]/div[@class="Custom_UnionStyle"]/p/text()'
        ).extract()
        base_contents = "\n".join(base_contents_list).strip()
        tag_p_contents = "\n".join(tag_p_contents_list).strip()
        tag_image_contents = "\n".join(tag_image_contents_list).strip()
        contents = base_contents + tag_p_contents + tag_image_contents
        if not contents:
            # Nothing to index — drop the page silently.
            return
        # The lead image is optional; only "no match" is tolerated.
        try:
            item["image"] = response.xpath(
                '//div[@class="content"]/div[@class="TRS_Editor"]/div[@class="Custom_UnionStyle"]/p/img/@src'
            ).extract()[0]
        except IndexError:
            pass

        # The remaining fields are mandatory; a missing node raises
        # IndexError here, as in the original behavior.
        item["title"] = response.xpath(
            '//div[@class="zhengwen-left-container"]/h1[@class="wtitle"]/text()'
        ).extract()[0]
        item["contents"] = contents
        item["auth"] = response.xpath(
            u'//div[@class="content"]/div[contains(text(),"责任编辑")]/text()'
        ).extract()[0]
        item["occurd_time"] = response.xpath(
            '//div[@class="yui3-g"]/div[@class="yui3-u"]/p[@class="wlaiyuan"]/text()'
        ).extract()[0]
        item["source_url"] = response.url
        item["source"] = u"中国农业新闻网"

        yield item