def extract_from_source(self, source):
    """Build a newspaper source from `source` and persist every usable article.

    Each discovered article URL is screened with `is_available_url`; the
    extracted article is screened again with `is_available_article` before
    being handed to `_store_article`.
    """
    feed = NpSource(source, verbose=True)
    feed.clean_memo_cache()
    feed.build()
    logging.info('...build done!')
    for candidate_url in feed.article_urls():
        # guard clause: skip URLs the filter rejects
        if not self.is_available_url(candidate_url):
            continue
        parsed = self._extract_articles(candidate_url)
        if self.is_available_article(parsed):
            self._store_article(parsed)
# Example #2 (scraped-page artifact: "Beispiel #2" heading and vote count "0" leaked into the source)
    def paper(self):
        '''
            get newspaper articles, default source is `herald` newspaper
            defaults to articles of this month and year
            import newspaperzw

            news = newspaperzw.news()

            Returns:
                dict with keys 'source', 'domain', and 'news' (a list of
                per-article dicts: article_id, title, published, image,
                news, [summary, keywords when NLP is enabled], url).

            Raises:
                Exception: when a summary is requested but the `nltk`
                module (NLP support) is missing.
        '''
        # fail fast: summary requested but NLP support (nltk) is unavailable
        if self.summary and not self.nlp:
            raise Exception(self.error_msg)

        news_source = Providers().getUrl(self.provider).strip()

        name = Source(news_source, self.config)
        name.build()
        name.download()
        name.parse()
        name.download_articles()

        # do logging
        logging.debug(f"News Source build and downloaded. url: {news_source}")

        news_article = []

        # enumerate keeps `counter` in lockstep with the url iterator,
        # replacing the hand-rolled counter (which had to be bumped in
        # two places and was error-prone).
        for counter, _url in enumerate(name.article_urls()):
            article = name.articles[counter]

            try:
                article.download()
                article.parse()

                # log
                logging.debug(
                    f"Article #{counter} downloaded and parsed successfuly")

            except Exception:
                # BUG FIX: the original incremented `counter` before
                # logging, so the error message named the *next* article.
                # Also narrowed the bare `except:` so KeyboardInterrupt /
                # SystemExit are no longer swallowed.
                logging.error(
                    f"Error download and parsing article #{counter}. continue.."
                )
                continue

            # add to news pool, only add news of this year and month
            # data_pub format = 10-04-2018 21:28:09
            # NOTE(review): no date filtering is actually performed here —
            # the comment above describes intent the code does not implement.
            data = {
                "article_id": randint(555, 999),
                "title": article.title,
                "published": article.publish_date,
                "image": article.top_image,
                "news": article.text,
            }

            if self.nlp:
                # do nlp stuff
                article.nlp()
                data["summary"] = article.summary
                # ','.join replaces the quadratic `+=` loop and the
                # trailing-comma rstrip in one pass
                data["keywords"] = ','.join(str(w) for w in article.keywords)

                # log
                logging.debug(
                    f"summary flag enabled. NLP summary obtained successfuly")
                logging.debug("article data with summary saved to news pool!")
            else:
                # log
                logging.debug("article data added to news pool")

            # url last so the key order matches the original dicts exactly
            data["url"] = article.url
            news_article.append(data)

        # build main news storage
        news_data = {
            'source': name.brand,
            'domain': name.domain,
            'news': news_article
        }

        # log
        logging.debug("News main data pool created on success")

        return news_data