Code Example #1
    def parse_node(self, response, node):

        # Clean up namespace to allow for tags to be accessed
        node.remove_namespaces()
        links = node.xpath('//item//link/text()').extract()

        # Fetch actual article
        if len(links) > 0:
            url = links[0]
            article_ex = ArticleExtractor(url, response)
            article_item = article_ex.get_article_item()
            article_item['source'] = 'aljazeera'
            return article_item
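
These parse_node methods follow Scrapy's XMLFeedSpider callback signature. As a rough sketch of the surrounding spider class (the spider name, allowed domain, feed URL, and ArticleExtractor import path are all assumptions, since none of them appear in the snippet above):

    from scrapy.spiders import XMLFeedSpider

    # Import path for ArticleExtractor is assumed; adjust to the project layout.
    from extractors import ArticleExtractor

    class AlJazeeraSpider(XMLFeedSpider):
        name = 'aljazeera'                                           # assumed spider name
        allowed_domains = ['aljazeera.com']                          # assumed
        start_urls = ['https://www.aljazeera.com/xml/rss/all.xml']   # assumed feed URL
        itertag = 'item'   # parse_node is called once per <item> element in the feed

        # parse_node(self, response, node) as shown in Code Example #1 goes here.

With a layout like this, XMLFeedSpider handles fetching the feed and iterating its entries, so the callback only has to turn each node into an item; the spider would then be run with the usual scrapy crawl command.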
Code Example #2
    def parse_node(self, response, node):
        links = node.xpath('//item//link/text()').extract()

        # TODO: custom parsing might be required to extract a larger number of articles.

        if len(links) > 0:
            url = links[0].replace("\n", "").strip()
            article_ex = ArticleExtractor(url, response)
            article_item = article_ex.get_article_item()
            article_item['source'] = 'the_peoples_daily'
            # Drop html for this crawler.
            article_item['html_content'] = None
            return article_item
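
The snippets populate several fields on article_item (source and html_content here, plus text_content and date_published in Code Example #3). A plausible Scrapy Item definition consistent with those assignments, sketched here as an assumption rather than the project's actual code:

    import scrapy

    class ArticleItem(scrapy.Item):
        # Field names mirror the assignments in the snippets; the url field is assumed.
        url = scrapy.Field()
        source = scrapy.Field()
        date_published = scrapy.Field()
        text_content = scrapy.Field()
        html_content = scrapy.Field()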
Code Example #3
    def parse_node(self, response, node):
        # Clean up namespace to allow for tags to be accessed
        node.remove_namespaces()

        #titles = node.xpath('//title/text()').extract()
        #title = titles[2]
        #description = node.xpath('//*[name()="media:description"]/text()').extract()
        #description = node.xpath('//description/text()').extract()
        links = node.xpath('//link[@rel="standout"]/@href').extract()
        pub_date = node.xpath('//pubDate/text()').extract()[0]
        small_img = node.xpath('//*[name()="media:content"]/@url').extract()

        # TODO: Convert date

        # Fetch actual article
        if len(links) > 0:

            url = links[0]

            # Cookie workaround for NY Times.
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
            nyt_response = opener.open(url)
            raw_html = nyt_response.read()

            # Goose cannot extract clean text from the NY Times HTML,
            # so pull the story paragraphs out with BeautifulSoup instead.
            soup = BeautifulSoup(raw_html, "lxml")
            story_body = soup.findAll("p", class_="story-body-text")
            story_texts = []
            for story_text in story_body:
                story_texts.append(story_text.get_text())

            cleaned_text = ' '.join(story_texts)

            article_ex = ArticleExtractor(url, nyt_response, raw_html)
            article_item = article_ex.get_article_item()

            # Override since Goose was not able to extract correctly
            article_item['text_content'] = cleaned_text
            article_item['date_published'] = pub_date
            article_item['source'] = 'nytimes'
            return article_item
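
The NY Times snippet depends on Python 2's urllib2. A rough Python 3 equivalent of the cookie-enabled fetch and paragraph extraction, assuming bs4 and lxml are available and that the story-body-text class taken from the snippet still matches what nytimes.com serves:

    from http.cookiejar import CookieJar
    from urllib.request import HTTPCookieProcessor, build_opener

    from bs4 import BeautifulSoup

    def fetch_story_text(url):
        # Keep cookies across redirects, mirroring the urllib2 workaround above.
        opener = build_opener(HTTPCookieProcessor(CookieJar()))
        raw_html = opener.open(url).read()
        # Pull the article paragraphs out directly, since Goose does not
        # produce clean text for this markup.
        soup = BeautifulSoup(raw_html, "lxml")
        paragraphs = soup.find_all("p", class_="story-body-text")
        return ' '.join(p.get_text() for p in paragraphs)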