def extract_from_source(self, source):
    """Build a newspaper source and store every article that passes the filters.

    Cleans the memo cache, builds the source, then walks its article
    URLs; each URL that passes ``is_available_url`` is extracted and,
    if the extracted article passes ``is_available_article``, stored.
    """
    np_source = NpSource(source, verbose=True)
    np_source.clean_memo_cache()
    np_source.build()
    logging.info('...build done!')
    for article_url in np_source.article_urls():
        # guard clauses: skip anything that fails the availability checks
        if not self.is_available_url(article_url):
            continue
        extracted = self._extract_articles(article_url)
        if self.is_available_article(extracted):
            self._store_article(extracted)
def paper(self):
    """Download and parse articles from the configured provider.

    Builds a newspaper ``Source`` for ``self.provider``, downloads and
    parses each article, and collects per-article dicts (title, publish
    date, top image, body text, URL, plus NLP summary/keywords when
    ``self.nlp`` is enabled).

    Returns:
        dict: ``{'source': brand, 'domain': domain, 'news': [article dicts]}``

    Raises:
        Exception: if ``self.summary`` was requested but NLP support
            (the `nltk` module) is missing.

    Usage::

        import newspaperzw
        news = newspaperzw.news()
    """
    if self.summary and not self.nlp:
        # summaries require NLP support; `nltk` module missing
        raise Exception(self.error_msg)

    news_source = Providers().getUrl(self.provider).strip()
    name = Source(news_source, self.config)
    name.build()
    name.download()
    name.parse()
    name.download_articles()
    logging.debug(f"News Source build and downloaded. url: {news_source}")

    news_article = []
    counter = 0
    for _ in name.article_urls():
        try:
            name.articles[counter].download()
            name.articles[counter].parse()
            logging.debug(f"Article #{counter} downloaded and parsed successfuly")
        except Exception:
            # One broken article must not abort the whole run. Log the
            # failing index *before* incrementing (the original logged
            # the wrong, already-incremented index). Bare `except:` was
            # also narrowed so Ctrl-C / SystemExit still propagate.
            logging.error(f"Error download and parsing article #{counter}. continue..")
            counter += 1
            continue

        article = name.articles[counter]
        data = {
            "article_id": randint(555, 999),
            "title": article.title,
            "published": article.publish_date,
            "image": article.top_image,
            "news": article.text,
            "url": article.url,
        }
        if self.nlp:
            # NLP pass adds a summary and comma-separated keyword string
            article.nlp()
            data["summary"] = article.summary
            data["keywords"] = ','.join(str(word) for word in article.keywords)
            logging.debug("summary flag enabled. NLP summary obtained successfuly")
            logging.debug("article data with summary saved to news pool!")
        else:
            logging.debug("article data added to news pool")

        news_article.append(data)
        counter += 1

    # build main news storage
    news_data = {
        'source': name.brand,
        'domain': name.domain,
        'news': news_article,
    }
    logging.debug("News main data pool created on success")
    return news_data