Example #1
    def parse_source(self, existing_ids=None):
        article_urls = []
        feed_content = get_rss(self.VAL202_RSS_URL)
        for feed_entry in feed_content.entries:
            link = feed_entry["link"]
            guid = feed_entry["guid"]
            if existing_ids and get_sha_hash(guid) in existing_ids:
                logger.debug("Skipping %s", guid)
                continue

            published_date = time_to_datetime(feed_entry["published_parsed"])
            try:
                text = feed_entry["content"][0]["value"]
                # Strip HTML tags, keeping only the visible text
                soup = bs4.BeautifulSoup(text, "html.parser")
                text = soup.get_text()
            except KeyError:
                # Entry has no content payload; skip it rather than
                # aborting the whole feed with an early return
                continue

            title = feed_entry["title"]
            author = feed_entry.get("author", None)

            article_urls.append((link, {
                "guid": guid,
                "published": published_date,
                "title": title,
                "text": text,
                "author": author
            }))

        return article_urls
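
All of these examples share a handful of helpers that are never shown
(get_rss, get_hash, get_sha_hash, time_to_datetime). Below is a minimal
sketch of plausible implementations, assuming get_rss wraps feedparser and
the hash helpers return hex digests; the exact hash algorithms are
assumptions, not confirmed by the source.

    import hashlib
    import time
    from datetime import datetime

    import feedparser

    def get_rss(url):
        # feedparser exposes entries as dict-like objects with
        # "link", "guid", "title", "published_parsed", etc.
        return feedparser.parse(url)

    def get_hash(value):
        # Legacy ID scheme; MD5 is an assumption.
        return hashlib.md5(value.encode("utf-8")).hexdigest()

    def get_sha_hash(value):
        # Current ID scheme; SHA-1 is an assumption.
        return hashlib.sha1(value.encode("utf-8")).hexdigest()

    def time_to_datetime(time_struct):
        # published_parsed is a time.struct_time in feedparser.
        return datetime.fromtimestamp(time.mktime(time_struct))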
Example #2
    def parse_article(self, article_url):
        link, data = article_url
        article = self.get_article_text(link)

        if article is None:
            return
        published_date = data["published"]
        article["published"] = published_date
        article["source"] = "Finance"
        article["source_url"] = link
        article["language"] = "si"
        article["id"] = get_sha_hash(link)
        return article
Example #3
    def parse_article(self, article_url):
        link, data = article_url
        article_id = link[link.rfind("/") + 1:]

        news_item = self.get_article_text(article_id)
        # Bail out if the article body could not be fetched,
        # as the other parsers do
        if news_item is None:
            return

        published_date = data["published"]
        news_item["published"] = published_date
        news_item["source"] = "RTVSlo"
        news_item["source_url"] = link
        news_item["language"] = "si"
        news_item["author"] = None
        news_item["id"] = get_sha_hash(link)
        return news_item
Example #4
    def parse_article(self, article_url):
        link, data = article_url
        article_id = link[link.rfind("-") + 1:]
        article = self.get_article_text(article_id)
        # Bail out if the article body could not be fetched
        if article is None:
            return

        published_date = data["published"]
        article["published"] = published_date
        article["source"] = "Zurnal24"
        article["source_url"] = link
        article["language"] = "si"
        # Generate ID from link
        article["id"] = get_sha_hash(link)
        return article
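
Examples #3 and #4 derive the article ID from the URL itself, slicing off
everything after the last "/" or "-" respectively. A quick illustration with
hypothetical URL shapes (only the slicing behaviour is taken from the code):

    # Hypothetical URLs; the real link formats may differ.
    rtv_link = "https://www.rtvslo.si/slovenija/some-title/174123456"
    rtv_id = rtv_link[rtv_link.rfind("/") + 1:]           # -> "174123456"

    zurnal_link = "https://www.zurnal24.si/some-title-987654"
    zurnal_id = zurnal_link[zurnal_link.rfind("-") + 1:]  # -> "987654"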
Example #5
    def parse_article(self, article_url):
        link, data = article_url
        article = self.get_article(link)

        if article is None:
            return

        published_date = data["published"]
        article["title"] = data["title"]
        article["published"] = published_date
        article["source"] = "Vecer"
        article["source_url"] = link
        article["language"] = "si"
        # Generate ID from link
        article["id"] = get_sha_hash(link)
        return article
Example #6
    def parse_source(self, existing_ids=None):
        news = []
        feed_content = get_rss(self.FINANCE_RSS_URL)

        for feed_entry in feed_content.entries:
            link = feed_entry["link"]

            if existing_ids and get_sha_hash(link) in existing_ids:
                logger.debug("Skipping %s", link)
                continue

            published_date = time_to_datetime(feed_entry["published_parsed"])
            news.append((link, {"published": published_date}))

        return news
Example #7
    def parse_article(self, article_url):
        link, data = article_url
        guid = data["guid"]

        article = self.get_article(link)
        if article is None:
            return

        article["title"] = data["title"]
        article["published"] = data["published"]
        article["source"] = "Monitor"
        article["source_url"] = link
        article["language"] = "si"
        # Generate ID from the feed GUID
        article["id"] = get_sha_hash(guid)
        return article
Example #8
    def parse_source(self, existing_ids=None):
        feed_content = get_rss(self.DELO_RSS_URL)
        article_urls = []

        for feed_entry in feed_content.entries:
            link = feed_entry["link"]

            # An ID may exist under either hash scheme, so check both
            if existing_ids and (get_hash(link) in existing_ids
                                 or get_sha_hash(link) in existing_ids):
                logger.debug("Skipping %s", link)
                continue

            published_date = time_to_datetime(feed_entry["published_parsed"])
            article_urls.append((link, {"published": published_date}))

        return article_urls
Example #9
    def parse_source(self, existing_ids=None):
        news = []
        for rss_feed in self.RTV_RSS_URLS:
            logger.debug("Parsing %s", rss_feed)
            feed_content = get_rss(rss_feed)
            for feed_entry in feed_content.entries:
                # Download article
                link = feed_entry["link"]

                if existing_ids and (get_hash(link) in existing_ids
                                     or get_sha_hash(link) in existing_ids):
                    logger.debug("Skipping %s", link)
                    continue

                published_date = time_to_datetime(
                    feed_entry["published_parsed"])
                news.append((link, {"published": published_date}))

        return news
Example #10
    def parse_source(self, existing_ids=None):
        article_urls = []
        feed_content = get_rss(self.MONITOR_RSS_URL)
        for feed_entry in feed_content.entries:
            link = feed_entry["link"]
            guid = feed_entry["guid"]

            if existing_ids and get_sha_hash(guid) in existing_ids:
                logger.debug("Skipping %s", guid)
                continue

            published_date = time_to_datetime(feed_entry["published_parsed"])
            title = feed_entry["title"]

            article_urls.append((link, {
                "guid": guid,
                "title": title,
                "published": published_date
            }))

        return article_urls
Example #11
    def parse_article(self, article_url):
        link, data = article_url
        article = {}

        try:
            article_html = get_article(link)
            article["raw_html"] = article_html
        except Exception:
            logger.warning("Failed to fetch article %s", link, exc_info=True)
            return

        article["text"] = data["text"]
        article["title"] = data["title"]
        article["published"] = data["published"]
        article["source"] = "Val202"
        article["source_url"] = link
        article["language"] = "si"
        article["author"] = data["author"]

        # Generate ID from the feed GUID
        article["id"] = get_sha_hash(data["guid"])
        return article
Example #12
    def parse_source(self, existing_ids=None):
        news = []
        feed_content = get_rss(self.DNEVNIK_RSS_URL)

        # Cap the number of new articles collected in a single run
        max_counter = 30
        for feed_entry in feed_content.entries:
            link = feed_entry["link"]

            if existing_ids and (get_hash(link) in existing_ids
                                 or get_sha_hash(link) in existing_ids):
                logger.debug("Skipping %s", link)
                continue

            published_date = time_to_datetime(feed_entry["published_parsed"])
            title = feed_entry["title"]
            news.append((link, {"published": published_date, "title": title}))

            max_counter -= 1
            if max_counter <= 0:
                break

        return news
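
Taken together, each source implements the same two-stage contract:
parse_source enumerates (link, metadata) tuples from an RSS feed, and
parse_article fetches and enriches each one into a finished article dict.
A hypothetical driver loop, assuming a list of parser instances and a store
object with ids() and save() methods (both names are illustrative, not from
the source):

    def crawl(parsers, store):
        # IDs already in the store let parse_source skip known articles
        existing_ids = store.ids()
        for parser in parsers:
            for entry in parser.parse_source(existing_ids=existing_ids) or []:
                article = parser.parse_article(entry)
                if article is not None:
                    store.save(article)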