Example #1
    def parse_source(self, existing_ids=None):
        article_urls = []
        feed_content = get_rss(self.VAL202_RSS_URL)
        for feed_entry in feed_content.entries:
            link = feed_entry["link"]
            guid = feed_entry["guid"]
            if existing_ids and get_sha_hash(guid) in existing_ids:
                logger.debug("Skipping %s", guid)
                continue

            published_date = time_to_datetime(feed_entry["published_parsed"])
            try:
                text = feed_entry["content"][0]["value"]
                # Strip HTML from the article body
                soup = bs4.BeautifulSoup(text, "html.parser")
                text = soup.text
            except KeyError:
                # Entries without article content are skipped rather than
                # returning and discarding the articles collected so far
                continue

            title = feed_entry["title"]
            author = feed_entry.get("author", None)

            article_urls.append((link, {
                "guid": guid,
                "published": published_date,
                "title": title,
                "text": text,
                "author": author
            }))

        return article_urls
Example #2
    def parse_source(self, existing_ids=None):
        news = []
        feed_content = get_rss(self.FINANCE_RSS_URL)

        for feed_entry in feed_content.entries:
            link = feed_entry["link"]

            if existing_ids and get_sha_hash(link) in existing_ids:
                logger.debug("Skipping %s", link)
                continue

            published_date = time_to_datetime(feed_entry["published_parsed"])
            news.append((link, {"published": published_date}))

        return news
Example #3
    def parse_source(self, existing_ids=None):
        feed_content = get_rss(self.DELO_RSS_URL)
        article_urls = []

        for feed_entry in feed_content.entries:
            link = feed_entry["link"]

            if existing_ids and (get_hash(link) in existing_ids
                                 or get_sha_hash(link) in existing_ids):
                logger.debug("Skipping %s", link)
                continue

            published_date = time_to_datetime(feed_entry["published_parsed"])
            article_urls.append((link, {"published": published_date}))

        return article_urls
Example #4
    def parse_source(self, existing_ids=None):
        news = []
        for rss_feed in self.RTV_RSS_URLS:
            logger.debug("Parsing %s", rss_feed)
            feed_content = get_rss(rss_feed)
            for feed_entry in feed_content.entries:
                # Collect the article link for later download
                link = feed_entry["link"]

                if existing_ids and (get_hash(link) in existing_ids
                                     or get_sha_hash(link) in existing_ids):
                    logger.debug("Skipping %s", link)
                    continue

                published_date = time_to_datetime(
                    feed_entry["published_parsed"])
                news.append((link, {"published": published_date}))

        return news
Example #5
    def parse_source(self, existing_ids=None):
        article_urls = []
        feed_content = get_rss(self.MONITOR_RSS_URL)
        for feed_entry in feed_content.entries:
            link = feed_entry["link"]
            guid = feed_entry["guid"]

            if existing_ids and get_sha_hash(guid) in existing_ids:
                logger.debug("Skipping %s", guid)
                # Skip already-seen entries; returning here would discard
                # the whole result list, so continue like the other parsers
                continue

            published_date = time_to_datetime(feed_entry["published_parsed"])
            title = feed_entry["title"]

            article_urls.append((link, {
                "guid": guid,
                "title": title,
                "published": published_date
            }))

        return article_urls
Example #6
    def parse_source(self, existing_ids=None):
        news = []
        feed_content = get_rss(self.DNEVNIK_RSS_URL)

        # Cap the number of new entries collected in a single run
        max_counter = 30
        for feed_entry in feed_content.entries:
            link = feed_entry["link"]

            if existing_ids and (get_hash(link) in existing_ids
                                 or get_sha_hash(link) in existing_ids):
                logger.debug("Skipping %s", link)
                continue

            published_date = time_to_datetime(feed_entry["published_parsed"])
            title = feed_entry["title"]
            news.append((link, {"published": published_date, "title": title}))

            max_counter -= 1
            if max_counter <= 0:
                break

        return news
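
The examples above all lean on the same small set of project helpers (get_rss, get_hash, get_sha_hash, time_to_datetime, logger) and class-level feed URL constants without showing them. The sketch below is a minimal stand-in written against those call sites, assuming feedparser is used to fetch the feeds; the names are taken from the examples, but the bodies are assumptions, not the project's actual implementations.

import datetime
import hashlib
import time

import feedparser


def get_rss(url):
    # Assumption: fetch and parse the feed; feedparser exposes .entries and
    # per-entry fields such as "link", "guid" and "published_parsed"
    return feedparser.parse(url)


def get_sha_hash(value):
    # Assumption: stable SHA-1 digest of a link or GUID, matching the
    # identifiers stored in existing_ids
    return hashlib.sha1(value.encode("utf-8")).hexdigest()


def get_hash(value):
    # Assumption: an older, non-SHA identifier (MD5 here) that some parsers
    # still check against existing_ids for backwards compatibility
    return hashlib.md5(value.encode("utf-8")).hexdigest()


def time_to_datetime(parsed_time):
    # Assumption: convert feedparser's time.struct_time to a datetime
    return datetime.datetime.fromtimestamp(time.mktime(parsed_time))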