def extract_from_source(self, source):
    """Build a newspaper source and store every article that passes the filters.

    Cleans the memo cache, builds the source, then walks its article
    URLs; each URL that passes ``is_available_url`` is extracted and,
    if the extracted article passes ``is_available_article``, stored.
    """
    np_source = NpSource(source, verbose=True)
    np_source.clean_memo_cache()
    np_source.build()
    logging.info('...build done!')
    for article_url in np_source.article_urls():
        # guard clauses: skip anything that fails the availability checks
        if not self.is_available_url(article_url):
            continue
        extracted = self._extract_articles(article_url)
        if self.is_available_article(extracted):
            self._store_article(extracted)
def paper(self):
    """Download and parse articles from the configured provider.

    Builds a newspaper ``Source`` for ``self.provider``, downloads and
    parses each article, and collects per-article dicts (title, publish
    date, top image, body text, URL, plus NLP summary/keywords when
    ``self.nlp`` is enabled).

    Returns:
        dict: ``{'source': brand, 'domain': domain, 'news': [article dicts]}``

    Raises:
        Exception: if ``self.summary`` was requested but NLP support
            (the `nltk` module) is missing.

    Usage::

        import newspaperzw
        news = newspaperzw.news()
    """
    if self.summary and not self.nlp:
        # summaries require NLP support; `nltk` module missing
        raise Exception(self.error_msg)

    news_source = Providers().getUrl(self.provider).strip()
    name = Source(news_source, self.config)
    name.build()
    name.download()
    name.parse()
    name.download_articles()
    logging.debug(f"News Source build and downloaded. url: {news_source}")

    news_article = []
    counter = 0
    for _ in name.article_urls():
        try:
            name.articles[counter].download()
            name.articles[counter].parse()
            logging.debug(f"Article #{counter} downloaded and parsed successfuly")
        except Exception:
            # One broken article must not abort the whole run. Log the
            # failing index *before* incrementing (the original logged
            # the wrong, already-incremented index). Bare `except:` was
            # also narrowed so Ctrl-C / SystemExit still propagate.
            logging.error(f"Error download and parsing article #{counter}. continue..")
            counter += 1
            continue

        article = name.articles[counter]
        data = {
            "article_id": randint(555, 999),
            "title": article.title,
            "published": article.publish_date,
            "image": article.top_image,
            "news": article.text,
            "url": article.url,
        }
        if self.nlp:
            # NLP pass adds a summary and comma-separated keyword string
            article.nlp()
            data["summary"] = article.summary
            data["keywords"] = ','.join(str(word) for word in article.keywords)
            logging.debug("summary flag enabled. NLP summary obtained successfuly")
            logging.debug("article data with summary saved to news pool!")
        else:
            logging.debug("article data added to news pool")

        news_article.append(data)
        counter += 1

    # build main news storage
    news_data = {
        'source': name.brand,
        'domain': name.domain,
        'news': news_article,
    }
    logging.debug("News main data pool created on success")
    return news_data