async def parse_feed(user, feed):
    """
    Fetch a feed.
    Update the feed and return the articles.
    """
    parsed_feed = None
    up_feed = {}
    articles = []
    resp = None
    # with (await sem):
    # Retrieve the feed document.
    try:
        logger.info("Retrieving feed {}".format(feed.link))
        resp = newspipe_get(feed.link, timeout=5)
    except Exception:
        logger.info("Problem when reading feed {}".format(feed.link))
        return
    finally:
        if resp is None:
            return

    # Parse the retrieved document.
    try:
        content = io.BytesIO(resp.content)
        parsed_feed = feedparser.parse(content)
    except Exception as e:
        up_feed["last_error"] = str(e)
        up_feed["error_count"] = feed.error_count + 1
        logger.exception("error when parsing feed: " + str(e))
    finally:
        up_feed["last_retrieved"] = datetime.now(dateutil.tz.tzlocal())
        if parsed_feed is None:
            try:
                FeedController().update({"id": feed.id}, up_feed)
            except Exception as e:
                logger.exception("something bad here: " + str(e))
            return

    if not is_parsing_ok(parsed_feed):
        up_feed["last_error"] = str(parsed_feed["bozo_exception"])
        up_feed["error_count"] = feed.error_count + 1
        FeedController().update({"id": feed.id}, up_feed)
        return

    if parsed_feed["entries"] != []:
        articles = parsed_feed["entries"]

    # Successful fetch and parse: reset the error counter.
    up_feed["error_count"] = 0
    up_feed["last_error"] = ""

    # Feed information
    try:
        up_feed.update(construct_feed_from(feed.link, parsed_feed))
    except Exception:
        logger.exception("error when constructing feed: {}".format(feed.link))
    if feed.title and "title" in up_feed:
        # do not override the title set by the user
        del up_feed["title"]
    try:
        FeedController().update({"id": feed.id}, up_feed)
    except Exception:
        logger.exception("error when updating feed: {}".format(feed.link))

    return articles
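# Illustrative usage sketch (assumption, not part of the module): parse_feed()
# is a coroutine meant to be awaited once per feed by the crawler. The
# gather-based driver below is only one way it could be scheduled; `user` and
# `feeds` are assumed to come from the surrounding Newspipe controllers.
#
#     import asyncio
#
#     async def crawl(user, feeds):
#         results = await asyncio.gather(
#             *(parse_feed(user, feed) for feed in feeds)
#         )
#         # parse_feed() returns None on failure and a list of entries otherwise.
#         return [articles for articles in results if articles]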
async def get_article_details(entry, fetch=True):
    """
    Return the resolved link and title of a feed entry.
    """
    article_link = entry.get("link")
    article_title = html.unescape(entry.get("title", ""))
    if (
        fetch
        and application.config["CRAWLER_RESOLV"]
        and article_link
        or not article_title
    ):
        try:
            # resolves URL behind proxies (like feedproxy.google.com)
            response = newspipe_get(article_link, timeout=5)
        except MissingSchema:
            # the link has no scheme: retry with https:// then http://
            split, failed = urlsplit(article_link), False
            for scheme in "https", "http":
                try:
                    new_link = urlunsplit(SplitResult(scheme, *split[1:]))
                    response = newspipe_get(new_link, timeout=5)
                except Exception:
                    failed = True
                    continue
                failed = False
                article_link = new_link
                break
            if failed:
                return article_link, article_title or "No title"
        except Exception as error:
            logger.info(
                "Unable to get the real URL of %s. Won't fix "
                "link or title. Error: %s",
                article_link,
                error,
            )
            return article_link, article_title or "No title"
        article_link = response.url
        if not article_title:
            # extract the title from the <head> of the resolved page
            bs_parsed = BeautifulSoup(
                response.content, "html.parser", parse_only=SoupStrainer("head")
            )
            try:
                article_title = bs_parsed.find_all("title")[0].text
            except IndexError:  # no title
                pass
    return article_link, article_title or "No title"
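# Illustrative usage sketch (assumption, not part of the module): each entry
# returned by parse_feed() can be passed through get_article_details() to get
# a resolved link and a guaranteed non-empty title.
#
#     link, title = await get_article_details(entry)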
def _build_from_url(self, attrs):
    # Download the resource only if a URL is given and no content is set yet.
    if "url" in attrs and "content" not in attrs:
        try:
            resp = newspipe_get(attrs["url"], timeout=5)
            attrs.update(
                {
                    "url": resp.url,
                    "mimetype": resp.headers.get("content-type", None),
                    "content": base64.b64encode(resp.content).decode("utf8"),
                }
            )
        except requests.exceptions.ConnectionError:
            # leave attrs unchanged if the URL cannot be reached
            pass
    return attrs
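# Illustrative usage sketch (assumption, not part of the module): the caller
# typically passes the attribute dict it is about to persist, e.g.:
#
#     attrs = self._build_from_url({"url": "https://example.org/favicon.ico"})
#     # On success, attrs also carries "mimetype" and the base64-encoded
#     # "content"; on a connection error it is returned unchanged.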