class ArtsciBiologyGooseSpider(CrawlSpider):
    """Crawl the Concordia biology pages and extract content with Goose.

    Starting from the single entry in ``start_urls``, the spider follows
    every link accepted by ``allowed_links(start_urls[0])`` and hands each
    fetched page to :meth:`parse_item`.
    """

    name = "artsci_biology_goose"
    allowed_domains = ["concordia.ca"]
    start_urls = [
        "http://www.concordia.ca/artsci/biology.html",
    ]

    # One rule: follow the links derived from the start URL and parse each.
    rules = (
        Rule(
            LinkExtractor(allow=allowed_links(start_urls[0])),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield one ``Article`` built from Goose's extraction of the page.

        The raw response body is passed to Goose; the extracted title and
        cleaned text are stored along with the page URL, and ``field`` is
        set to this spider's name.
        """
        extracted = Goose().extract(raw_html=response.body)
        item = Article(
            title=extracted.title,
            text=extracted.cleaned_text,
            url=response.url,
            field=self.name,
        )
        yield item
class ArtsciMysterySpider(CrawlSpider):
    """Crawl the Science College "life at the college" pages.

    Starting from the single entry in ``start_urls``, the spider follows
    every link accepted by ``allowed_links(start_urls[0])`` and hands each
    fetched page to :meth:`parse_item`.
    """

    name = "artsci_mystery"
    allowed_domains = ["concordia.ca"]
    start_urls = [
        "https://www.concordia.ca/artsci/science-college/about/life-at-the-college.html",
    ]

    # One rule: follow the links derived from the start URL and parse each.
    rules = (
        Rule(
            LinkExtractor(allow=allowed_links(start_urls[0])),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield one ``Article`` for the page.

        The title is the first text node of the ``<title>`` element; the
        text comes from an ``Extractor`` configured with
        ``'ArticleExtractor'`` and fed the raw response body. ``field`` is
        set to this spider's name.
        """
        page_title = response.css("title::text").extract_first()
        article_extractor = Extractor(
            extractor="ArticleExtractor",
            html=response.body,
        )
        page_text = article_extractor.getText()
        yield Article(
            title=page_title,
            text=page_text,
            url=response.url,
            field=self.name,
        )
class ArtsciGeographySpider(CrawlSpider):
    """Crawl the Concordia geography, planning and environment pages.

    Starting from the single entry in ``start_urls``, the spider follows
    every link accepted by ``allowed_links(start_urls[0])`` and hands each
    fetched page to :meth:`parse_item`.
    """

    name = "artsci_geography"
    allowed_domains = ["concordia.ca"]
    start_urls = [
        "http://www.concordia.ca/artsci/geography-planning-environment.html",
    ]

    # One rule: follow the links derived from the start URL and parse each.
    rules = (
        Rule(
            LinkExtractor(allow=allowed_links(start_urls[0])),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield one ``Article`` for the page.

        The title is the first text node of the ``<title>`` element; the
        text comes from an ``Extractor`` configured with
        ``'ArticleExtractor'`` and fed the raw response body. ``field`` is
        set to this spider's name.
        """
        page_title = response.css("title::text").extract_first()
        article_extractor = Extractor(
            extractor="ArticleExtractor",
            html=response.body,
        )
        page_text = article_extractor.getText()
        yield Article(
            title=page_title,
            text=page_text,
            url=response.url,
            field=self.name,
        )
class ArtsciBiologyXpathSpider(CrawlSpider):
    """Crawl the Concordia biology pages, extracting text via ``parse_body``.

    Starting from the single entry in ``start_urls``, the spider follows
    every link accepted by ``allowed_links(start_urls[0])`` and hands each
    fetched page to :meth:`parse_item`.
    """

    name = "artsci_biology_xpath"
    allowed_domains = ["concordia.ca"]
    start_urls = [
        "http://www.concordia.ca/artsci/biology.html",
    ]

    # One rule: follow the links derived from the start URL and parse each.
    rules = (
        Rule(
            LinkExtractor(allow=allowed_links(start_urls[0])),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield one ``Article`` for the page.

        The title is the first text node of the ``<title>`` element and the
        text is whatever the ``parse_body`` helper returns for the response.
        ``field`` is set to this spider's name.
        """
        page_title = response.css("title::text").extract_first()
        page_text = parse_body(response)
        item = Article(
            title=page_title,
            text=page_text,
            url=response.url,
            field=self.name,
        )
        yield item