def parse_node(self, response, node):
    # Clean up namespace to allow for tags to be accessed
    node.remove_namespaces()
    links = node.xpath('//item//link/text()').extract()
    # Fetch actual article
    if len(links) > 0:
        url = links[0]
        article_ex = ArticleExtractor(url, response)
        article_item = article_ex.get_article_item()
        article_item['source'] = 'aljazeera'
        return article_item
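# These parse_node methods are Scrapy XMLFeedSpider callbacks, run once per
# <item> node of an RSS feed. A minimal sketch of a hosting spider class,
# assuming Scrapy's XMLFeedSpider API; the class name and feed URL are
# illustrative assumptions, not taken from this repo:
from scrapy.spiders import XMLFeedSpider

class AlJazeeraFeedSpider(XMLFeedSpider):
    """Hypothetical host for the parse_node above (sketch only)."""
    name = 'aljazeera'
    start_urls = ['http://www.aljazeera.com/xml/rss/all.xml']  # placeholder feed URL
    itertag = 'item'  # call parse_node once per <item> element
    # ...parse_node as defined above.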
def parse_node(self, response, node):
    # Clean up namespace to allow for tags to be accessed
    links = node.xpath('//item//link/text()').extract()
    #TODO custom parsing might be required to extract larger # of articles.
    if len(links) > 0:
        url = links[0].replace("\n", "").strip()
        article_ex = ArticleExtractor(url, response)
        article_item = article_ex.get_article_item()
        article_item['source'] = 'the_peoples_daily'
        # Drop html for this crawler.
        article_item['html_content'] = None
        return article_item
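# The feed's <link> text arrives padded with newlines and spaces, hence the
# replace/strip above. A standalone check of that xpath plus cleanup using
# Scrapy's Selector; the sample XML and URL below are made up for illustration:
from scrapy.selector import Selector

SAMPLE_FEED = """<rss><channel><item>
  <link>
    http://en.people.cn/n3/2016/0101/c90000-0000000.html
  </link>
</item></channel></rss>"""

node = Selector(text=SAMPLE_FEED, type='xml')
raw_link = node.xpath('//item//link/text()').extract()[0]
print(repr(raw_link.replace("\n", "").strip()))  # padding stripped, bare URL left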
# Needs: import urllib2; from bs4 import BeautifulSoup  (Python 2)
def parse_node(self, response, node):
    # Clean up namespace to allow for tags to be accessed
    node.remove_namespaces()
    #titles = node.xpath('//title/text()').extract()
    #title = titles[2]
    #description = node.xpath('//*[name()="media:description"]/text()').extract()
    #description = node.xpath('//description/text()').extract()
    links = node.xpath('//link[@rel="standout"]/@href').extract()
    pub_dates = node.xpath('//pubDate/text()').extract()
    pub_date = pub_dates[0] if pub_dates else None  # guard against items with no pubDate
    small_img = node.xpath('//*[name()="media:content"]/@url').extract()  # currently unused
    #TODO Convert date
    # Fetch actual article
    if len(links) > 0:
        url = links[0]
        # Cookie workaround: NY Times will not serve the article to a client
        # that does not handle cookies.
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        article_response = opener.open(url)  # renamed to avoid shadowing the feed `response`
        raw_html = article_response.read()
        # Goose cannot extract clean text from NY Times html, so pull the
        # story paragraphs out with BeautifulSoup instead.
        soup = BeautifulSoup(raw_html, "lxml")
        story_body = soup.findAll("p", class_="story-body-text")
        cleaned_text = ' '.join(p.get_text() for p in story_body)
        article_ex = ArticleExtractor(url, article_response, raw_html)
        article_item = article_ex.get_article_item()
        # Override since Goose was not able to extract correctly
        article_item['text_content'] = cleaned_text
        article_item['date_published'] = pub_date
        article_item['source'] = 'nytimes'
        return article_item
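# The "#TODO Convert date" above is still open: RSS pubDate values are
# RFC-822 strings. One stdlib way to convert them, sketched as a suggestion
# rather than what this repo actually does (parse_pub_date is a hypothetical
# helper name):
from datetime import datetime
from email.utils import mktime_tz, parsedate_tz

def parse_pub_date(pub_date):
    """Turn an RFC-822 pubDate string into a UTC datetime, or None."""
    parsed = parsedate_tz(pub_date)
    if parsed is None:
        return None  # leave unparseable dates to the caller
    return datetime.utcfromtimestamp(mktime_tz(parsed))

print(parse_pub_date('Mon, 04 Jan 2016 09:30:00 -0500'))  # -> 2016-01-04 14:30:00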