コード例 #1
0
 def parse_item(self, response):
     """Yield one RSS item built from an article response.

     The title is read from the ``og:title`` meta tag and the publication
     date from the ``.title h6`` text node. The description is every
     ``.mr-content`` match joined together. The response URL is used for
     both ``link`` and ``guid``; the author is a fixed string.
     """
     page_url = response.url
     og_title = response.css(
         'meta[property="og:title"]::attr(content)').get()
     published = response.css(".title h6::text").get()
     content_blocks = response.css(".mr-content").extract()

     entry = RssItem()
     entry.title = og_title
     entry.link = page_url
     entry.guid = page_url
     entry.pubDate = published
     entry.author = "Liberal Victoria"
     entry.description = "".join(content_blocks)
     yield entry
コード例 #2
0
ファイル: wa_gov_spider.py プロジェクト: gov-rss/scrape
 def parse_item(self, response):
     """Yield one RSS item built from a media-statement response.

     The title is whatever follows the first ``" - "`` in the page
     ``<title>`` text; the author is the ``alt`` text of every
     ``img.ministersPic`` joined with ``" & "``. A missing title or
     ``div.newsCreatedDate`` text node makes ``.get()`` return ``None`` and
     the subsequent string call raise ``AttributeError``.
     """
     feed_item = RssItem()

     # Keep only the part after the first " - " separator.
     raw_title = response.css("title::text").get()
     feed_item.title = raw_title.split(" - ", 1)[-1].strip()

     feed_item.link = response.url
     feed_item.guid = response.url

     created_text = response.css("div.newsCreatedDate::text").get()
     feed_item.pubDate = created_text.strip()

     minister_names = response.css("img.ministersPic::attr(alt)").getall()
     feed_item.author = " & ".join(minister_names)

     feed_item.description = response.css("div.ms-rtestate-field").get()
     yield feed_item
コード例 #3
0
ファイル: sa_prem_spider.py プロジェクト: gov-rss/scrape
 def parse_item(self, response):
     """Return one RSS item built from a news-detail response.

     Title, publication date and author are read from the
     ``dcterms.title``, ``dcterms.issued`` and ``article.minister`` meta
     tags. The description is the summary paragraphs followed by the body
     paragraphs, concatenated without a separator.
     """

     def meta_content(meta_name):
         # Content attribute of the <meta> tag with the given name, or None.
         return response.css(
             'meta[name="%s"]::attr(content)' % meta_name).get()

     entry = RssItem()
     entry.title = meta_content("dcterms.title")
     entry.link = response.url
     entry.guid = response.url
     entry.pubDate = meta_content("dcterms.issued")
     entry.author = meta_content("article.minister")

     lead_paragraphs = response.css("div.news-detail__summary p").getall()
     body_paragraphs = response.css("div.news-detail__body p").getall()
     entry.description = "".join([*lead_paragraphs, *body_paragraphs])
     return entry
コード例 #4
0
ファイル: nsw_prem_spider.py プロジェクト: gov-rss/scrape
    def parse_item(self, response):
        """Yield one RSS item built from a media-release response.

        The title is the page ``<title>`` text up to the first ``" | "``.
        The author is the last text node found under
        ``div.standard-header__released_by div`` (stripped), falling back
        to ``"NSW Government"`` when no such node exists.
        """
        entry = RssItem()

        page_title = response.css("title::text").get()
        entry.title = page_title.split(" | ")[0]

        entry.link = response.url
        entry.guid = response.url
        entry.pubDate = response.css(
            'meta[name="dcterms.date"]::attr(content)').get()
        entry.description = response.css("div.nsw-wysiwyg-content").get()

        released_by = response.css(
            "div.standard-header__released_by div::text").getall()
        entry.author = (
            released_by[-1].strip() if released_by else "NSW Government"
        )

        yield entry
コード例 #5
0
 def parse_item(self, response):
     """Yield one RSS item built from a ministerial-statement response.

     The title comes from the ``DCTERMS.title`` meta tag and the
     publication date from the first ``"datePublished"`` value found in
     any ``<script>`` text. The author is every ``p.statement-ministers``
     text node joined with ``" & "``. The description is the ``div div p``
     paragraphs after a leading run of header paragraphs has been skipped.

     Raises:
         AttributeError: if the title meta tag or the ``"datePublished"``
             entry is absent, because the selector call returns ``None``.
     """
     item = RssItem()
     item.title = (
         response.css('meta[name="DCTERMS.title"]::attr(content)').get().strip()
     )
     item.link = response.url
     item.guid = response.url
     # Capture only the quoted value. The bounded ``[^"]*`` stops at the
     # value's closing quote, so JSON with several keys on one line does
     # not leak into the date (a greedy ``.*`` would run to the last quote
     # on the line). With a capture group, re_first returns the group.
     item.pubDate = (
         response.css("script::text")
         .re_first(r'"datePublished":\s*"([^"]*)"')
         .strip()
     )
     author = response.css("p.statement-ministers::text").getall()
     item.author = " & ".join(author)
     description = response.css("div div p").getall()
     cutoff = 2  # publish date & author
     # With more than one minister, skip that many further paragraphs.
     # NOTE(review): presumably the page adds extra leading paragraphs per
     # minister; confirm against the live page layout.
     if len(author) > 1:
         cutoff += len(author)
     item.description = "".join(description[cutoff:])
     yield item