コード例 #1
0
class ArtsciBiologyGooseSpider(CrawlSpider):
    """Crawl the Concordia biology pages and extract articles with Goose.

    Every page matched by the single crawl rule is handed to
    ``parse_item`` and the crawl keeps following links from it.
    """

    name = "artsci_biology_goose"
    allowed_domains = ["concordia.ca"]
    start_urls = ['http://www.concordia.ca/artsci/biology.html']

    # The allow pattern comes from ``allowed_links`` (defined elsewhere),
    # called with the first start URL.
    rules = (
        Rule(
            LinkExtractor(allow=allowed_links(start_urls[0])),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield one ``Article`` built from Goose's extraction of the page.

        The raw response body is passed to a fresh ``Goose`` instance; the
        item carries the extracted title and cleaned text, the response
        URL, and this spider's name as ``field``.
        """
        extracted = Goose().extract(raw_html=response.body)

        item = Article(
            title=extracted.title,
            text=extracted.cleaned_text,
            url=response.url,
            field=self.name,
        )
        yield item
コード例 #2
0
class ArtsciMysterySpider(CrawlSpider):
    """Crawl from the Science College "life at the college" page.

    Pages matched by the crawl rule are passed to ``parse_item``, which
    pairs the page's ``<title>`` text with the text returned by an
    ``Extractor`` configured as ``'ArticleExtractor'``.
    """

    name = "artsci_mystery"
    allowed_domains = ["concordia.ca"]
    start_urls = [
        'https://www.concordia.ca/artsci/science-college/about/life-at-the-college.html',
    ]

    # ``allowed_links`` (defined elsewhere) produces the allow pattern
    # from the first start URL.
    _link_extractor = LinkExtractor(allow=allowed_links(start_urls[0]))
    rules = (
        Rule(_link_extractor, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Yield one ``Article`` for the given response.

        The title is the first ``title::text`` match (as returned by
        ``extract_first``); the text comes from ``Extractor.getText()``
        run on the raw response body.
        """
        page_title = response.css('title::text').extract_first()

        article_extractor = Extractor(extractor='ArticleExtractor',
                                      html=response.body)
        page_text = article_extractor.getText()

        yield Article(
            title=page_title,
            text=page_text,
            url=response.url,
            field=self.name,
        )
コード例 #3
0
class ArtsciGeographySpider(CrawlSpider):
    """Crawl the Geography, Planning and Environment department pages.

    Each page matched by the crawl rule becomes one ``Article`` whose
    text is produced by an ``Extractor`` set to ``'ArticleExtractor'``.
    """

    name = "artsci_geography"
    allowed_domains = ["concordia.ca"]
    start_urls = [
        'http://www.concordia.ca/artsci/geography-planning-environment.html',
    ]

    rules = (
        # Allow pattern is derived from the first start URL by
        # ``allowed_links`` (defined elsewhere); matched pages are parsed
        # and their links followed.
        Rule(
            LinkExtractor(allow=allowed_links(start_urls[0])),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Build and yield the ``Article`` item for one crawled page."""
        # First text node of the <title> element, per ``extract_first``.
        heading = response.css('title::text').extract_first()

        # The extractor is built on the raw body before the item is assembled.
        content = Extractor(extractor='ArticleExtractor', html=response.body)

        result = Article(title=heading,
                         text=content.getText(),
                         url=response.url,
                         field=self.name)
        yield result
コード例 #4
0
class ArtsciBiologyXpathSpider(CrawlSpider):
    """Crawl the Concordia biology pages, extracting text via ``parse_body``.

    Shares its start URL with the Goose-based biology spider but
    delegates body extraction to the ``parse_body`` helper (defined
    elsewhere).
    """

    name = "artsci_biology_xpath"
    allowed_domains = ["concordia.ca"]
    start_urls = ['http://www.concordia.ca/artsci/biology.html']

    # Single rule: pages matching the pattern from ``allowed_links`` are
    # parsed by ``parse_item`` and their links are followed.
    rules = (
        Rule(
            LinkExtractor(allow=allowed_links(start_urls[0])),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield one ``Article`` holding the page title and parsed body.

        The title is the first ``title::text`` match; the text is
        whatever ``parse_body(response)`` returns.
        """
        page_title = response.css('title::text').extract_first()
        page_body = parse_body(response)

        item = Article(
            title=page_title,
            text=page_body,
            url=response.url,
            field=self.name,
        )
        yield item