def parse_klexikon_article(self, response):
        """Emit an item for a Klexikon page and queue its sibling articles.

        Yields an ``ArticleItem`` tagged with datasource ``'klexikon'``,
        followed by two ``scrapy.Request`` objects whose URLs are derived
        from ``response.url`` by swapping the Klexikon prefix for the German
        Wikipedia prefix and the Stupidedia prefix respectively.

        Nothing is yielded when the URL contains
        ``https://klexikon.zum.de/wiki/Datei`` ("Datei" is German for
        "file"; presumably these are file description pages -- confirm).
        """
        page_url = str(response.url)
        if 'https://klexikon.zum.de/wiki/Datei' in page_url:
            return

        content_xpath = '//div[@class="mw-content-ltr"]/*'
        heading = processor.extract_title(response)
        raw_content = processor.extract_content(response, content_xpath)
        cleaned_content = processor.preprocess_content(raw_content)

        item = ArticleItem()
        field_values = (
            ('url', response.url),
            ('title', heading),
            ('content', cleaned_content),
            ('datasource', 'klexikon'),
        )
        for field_name, field_value in field_values:
            item[field_name] = field_value
        yield item

        # The whole host prefix is swapped, so the '/wiki/<title>' path is
        # carried over to Wikipedia unchanged.
        wikipedia_url = page_url.replace(
            'https://klexikon.zum.de/', 'https://de.wikipedia.org/')
        yield scrapy.Request(
            wikipedia_url, callback=self.parse_wikipedia_article)

        # Stupidedia uses '/stupi/' instead of '/wiki/', so the path prefix
        # is swapped along with the host.
        stupidedia_url = page_url.replace(
            'https://klexikon.zum.de/wiki/',
            'https://www.stupidedia.org/stupi/')
        yield scrapy.Request(
            stupidedia_url, callback=self.parse_stupipedia_article)
    def parse_stupipedia_article(self, response):
        """Yield a single ``ArticleItem`` built from a Stupidedia response.

        Title and content are obtained through the ``processor`` helpers
        (content is selected with the ``mw-content-ltr`` XPath and then
        preprocessed); the item is tagged with datasource ``'stupipedia'``.
        """
        content_xpath = '//div[@class="mw-content-ltr"]/*'
        heading = processor.extract_title(response)
        raw_content = processor.extract_content(response, content_xpath)
        cleaned_content = processor.preprocess_content(raw_content)

        item = ArticleItem()
        field_values = (
            ('url', response.url),
            ('title', heading),
            ('content', cleaned_content),
            ('datasource', 'stupipedia'),
        )
        for field_name, field_value in field_values:
            item[field_name] = field_value
        yield item
Beispiel #3
0
    def parse_klexikon_article(self, response):
        """Yield a single ``ArticleItem`` built from a Klexikon response.

        Nothing is yielded when the URL contains
        ``https://klexikon.zum.de/wiki/Datei`` ("Datei" is German for
        "file"; presumably these are file description pages -- confirm).
        Otherwise the title and preprocessed content are obtained through
        the ``processor`` helpers and the item is tagged with datasource
        ``'Klexikon'``.
        """
        skip_marker = 'https://klexikon.zum.de/wiki/Datei'
        if skip_marker in str(response.url):
            return

        content_xpath = '//div[@class="mw-content-ltr"]/*'
        heading = processor.extract_title(response)
        raw_content = processor.extract_content(response, content_xpath)
        cleaned_content = processor.preprocess_content(raw_content)

        item = ArticleItem()
        field_values = (
            ('url', response.url),
            ('title', heading),
            ('content', cleaned_content),
            ('datasource', 'Klexikon'),
        )
        for field_name, field_value in field_values:
            item[field_name] = field_value
        yield item