def parse_source(self, existing_ids=None):
    article_urls = []
    feed_content = get_rss(self.VAL202_RSS_URL)
    for feed_entry in feed_content.entries:
        link = feed_entry["link"]
        guid = feed_entry["guid"]
        if existing_ids and get_sha_hash(guid) in existing_ids:
            logger.debug("Skipping %s", guid)
            continue
        published_date = time_to_datetime(feed_entry["published_parsed"])
        try:
            text = feed_entry["content"][0]["value"]
            # Strip HTML
            soup = bs4.BeautifulSoup(text, "html.parser")
            text = soup.text
        except KeyError:
            # Skip entries without full content instead of aborting the whole feed
            continue
        title = feed_entry["title"]
        author = feed_entry.get("author", None)
        article_urls.append((link, {
            "guid": guid,
            "published": published_date,
            "title": title,
            "text": text,
            "author": author
        }))
    return article_urls
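# NOTE: a minimal sketch of the shared helpers the parsers rely on (get_rss,
# time_to_datetime, get_sha_hash). Their real implementations are not shown in
# this section; the versions below only illustrate the assumed behaviour:
# feedparser-based fetching, struct_time-to-datetime conversion, and hashing of
# the GUID/link that is used for deduplication against existing_ids.
import datetime
import hashlib
import time

import feedparser


def get_rss(url):
    # Assumed: fetch and parse the RSS feed; entries expose "link", "guid",
    # "title", "published_parsed", and optionally "content"/"author".
    return feedparser.parse(url)


def time_to_datetime(parsed_time):
    # Assumed: convert feedparser's time.struct_time into a datetime object.
    return datetime.datetime.fromtimestamp(time.mktime(parsed_time))


def get_sha_hash(value):
    # Assumed: stable hex digest of the link/GUID, used as the article ID.
    return hashlib.sha1(value.encode("utf-8")).hexdigest()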
def parse_article(self, article_url):
    link, data = article_url
    article = self.get_article_text(link)
    if article is None:
        return
    published_date = data["published"]
    article["published"] = published_date
    article["source"] = "Finance"
    article["source_url"] = link
    article["language"] = "si"
    article["id"] = get_sha_hash(link)
    return article
def parse_article(self, article_url): link, data = article_url article_id = link[link.rfind("/") + 1:] news_item = self.get_article_text(article_id) published_date = data["published"] news_item["published"] = published_date news_item["source"] = "RTVSlo" news_item["source_url"] = link news_item["language"] = "si" news_item["author"] = None news_item["id"] = get_sha_hash(link) return news_item
def parse_article(self, article_url): link, data = article_url article_id = link[link.rfind("-") + 1:] article = self.get_article_text(article_id) published_date = data["published"] article["published"] = published_date article["source"] = "Zurnal24" article["source_url"] = link article["language"] = "si" # Generate ID from link article["id"] = get_sha_hash(link) return article
def parse_article(self, article_url):
    link, data = article_url
    article = self.get_article(link)
    if article is None:
        return
    published_date = data["published"]
    article["title"] = data["title"]
    article["published"] = published_date
    article["source"] = "Vecer"
    article["source_url"] = link
    article["language"] = "si"
    # Generate ID from link
    article["id"] = get_sha_hash(link)
    return article
def parse_source(self, existing_ids=None):
    news = []
    feed_content = get_rss(self.FINANCE_RSS_URL)
    for feed_entry in feed_content.entries:
        link = feed_entry["link"]
        if existing_ids and get_sha_hash(link) in existing_ids:
            logger.debug("Skipping %s", link)
            continue
        published_date = time_to_datetime(feed_entry["published_parsed"])
        news.append((link, {"published": published_date}))
    return news
def parse_article(self, article_url): link, data = article_url guid = data["guid"] article = self.get_article(link) if article is None: return article["title"] = data["title"] article["published"] = data["published"] article["source"] = "Monitor" article["source_url"] = link article["language"] = "si" # Generate ID from link article["id"] = get_sha_hash(guid) return article
def parse_source(self, existing_ids=None):
    feed_content = get_rss(self.DELO_RSS_URL)
    article_urls = []
    for feed_entry in feed_content.entries:
        link = feed_entry["link"]
        if existing_ids and (get_hash(link) in existing_ids
                             or get_sha_hash(link) in existing_ids):
            logger.debug("Skipping %s", link)
            continue
        published_date = time_to_datetime(feed_entry["published_parsed"])
        article_urls.append((link, {"published": published_date}))
    return article_urls
def parse_source(self, existing_ids=None):
    news = []
    for rss_feed in self.RTV_RSS_URLS:
        logger.debug("Parsing %s", rss_feed)
        feed_content = get_rss(rss_feed)
        for feed_entry in feed_content.entries:
            # Download article
            link = feed_entry["link"]
            if existing_ids and (get_hash(link) in existing_ids
                                 or get_sha_hash(link) in existing_ids):
                logger.debug("Skipping %s", link)
                continue
            published_date = time_to_datetime(
                feed_entry["published_parsed"])
            news.append((link, {"published": published_date}))
    return news
def parse_source(self, existing_ids=None):
    article_urls = []
    feed_content = get_rss(self.MONITOR_RSS_URL)
    for feed_entry in feed_content.entries:
        link = feed_entry["link"]
        guid = feed_entry["guid"]
        if existing_ids and get_sha_hash(guid) in existing_ids:
            logger.debug("Skipping %s", guid)
            # Skip already-seen entries instead of cutting the whole feed short
            continue
        published_date = time_to_datetime(feed_entry["published_parsed"])
        title = feed_entry["title"]
        article_urls.append((link, {
            "guid": guid,
            "title": title,
            "published": published_date
        }))
    return article_urls
def parse_article(self, article_url):
    link, data = article_url
    article = {}
    try:
        article_html = get_article(link)
        article["raw_html"] = article_html
    except Exception:
        logger.warning("Failed to parse article %s", link, exc_info=True)
        return
    article["text"] = data["text"]
    article["title"] = data["title"]
    article["published"] = data["published"]
    article["source"] = "Val202"
    article["source_url"] = link
    article["language"] = "si"
    article["author"] = data["author"]
    # Generate ID from the feed GUID
    article["id"] = get_sha_hash(data["guid"])
    return article
def parse_source(self, existing_ids=None):
    news = []
    feed_content = get_rss(self.DNEVNIK_RSS_URL)
    # Cap the number of new entries collected per run
    max_counter = 30
    for feed_entry in feed_content.entries:
        link = feed_entry["link"]
        if existing_ids and (get_hash(link) in existing_ids
                             or get_sha_hash(link) in existing_ids):
            logger.debug("Skipping %s", link)
            continue
        published_date = time_to_datetime(feed_entry["published_parsed"])
        title = feed_entry["title"]
        news.append((link, {"published": published_date, "title": title}))
        max_counter -= 1
        if max_counter <= 0:
            break
    return news
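# NOTE: a hedged usage sketch, not part of the scrapers themselves. It shows
# how the parse_source/parse_article pair is assumed to be driven: parse_source
# lists (link, metadata) tuples for unseen feed entries, and parse_article
# turns each tuple into a full article dict with "id", "published", "source",
# etc. The `scraper` and `existing_ids` names here are placeholders.
def fetch_new_articles(scraper, existing_ids):
    articles = []
    for article_url in scraper.parse_source(existing_ids=existing_ids) or []:
        article = scraper.parse_article(article_url)
        # parse_article returns None when the article body could not be fetched.
        if article is None:
            continue
        articles.append(article)
        existing_ids.add(article["id"])
    return articles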