def fetch_rss(item, rss):
    """Parse one RSS feed and store its recent entries as ``Article`` rows.

    Args:
        item: Mapping describing the feed. The keys read here are
            ``"board_id"``, ``"id"``, ``"is_parsable"`` and the optional
            ``"conditions"``.
        rss: Feed location handed to ``feedparser.parse``.

    At most ``DEFAULT_ENTRIES_LIMIT`` entries are examined. An entry is
    skipped when it has no title or link, when it fails the feed's
    conditions, or when it is older than ``DELETE_OLD_ARTICLES_DELTA``.
    Only articles that did not exist yet get the expensive treatment
    (URL resolution and full-text summarisation) and are saved afterwards.
    """
    print(f"Parsing RSS: {rss}")
    parsed_feed = feedparser.parse(rss)
    print(f"Entries found: {len(parsed_feed.entries)}")

    for entry in parsed_feed.entries[:DEFAULT_ENTRIES_LIMIT]:
        title = parse_title(entry)
        link = parse_link(entry)
        if not (title and link):
            print("No entry title or link. Skipped")
            continue

        print(f"- article: '{title}' {link}")

        # Optional per-feed filter.
        conditions = item.get("conditions")
        if conditions and not check_conditions(conditions, entry):
            print(f"- condition {conditions} does not match. Skipped")
            continue

        published_at = parse_datetime(entry)
        if published_at <= datetime.utcnow() - DELETE_OLD_ARTICLES_DELTA:
            print("- article is too old. Skipped")
            continue

        # The entry id (or guid) identifies the article; the link is the
        # fallback when the feed provides neither.
        uniq_id = entry.get("id") or entry.get("guid") or link
        initial_fields = {
            "url": link[:2000],
            "domain": parse_domain(link)[:256],
            "created_at": published_at,
            "updated_at": datetime.utcnow(),
            "title": title[:256],
            "image": str(parse_rss_image(entry) or "")[:512],
            "description": entry.get("summary"),
        }
        article, is_new = Article.objects.get_or_create(
            board_id=item["board_id"],
            feed_id=item["id"],
            uniq_id=uniq_id,
            defaults=initial_fields,
        )
        if not is_new:
            # Already stored earlier: nothing to enrich.
            continue

        print("- article is new, parsing metadata...")

        # Richer description / lead image taken from the RSS entry itself.
        rss_text, rss_image = parse_rss_text_and_image(entry)
        if rss_text:
            article.description = rss_text[:1000]
        if rss_image:
            article.image = rss_image[:512]

        # Follow the link to learn the final URL and what kind of content
        # it serves.
        final_url, content_type, content_length = resolve_url(link)

        # Only reasonably small text pages are parsed (this keeps podcasts
        # and other binary payloads out).
        should_parse_page = (
            item["is_parsable"]
            and content_length <= MAX_PARSABLE_CONTENT_LENGTH
            and content_type
            and content_type.startswith("text/")
        )
        if should_parse_page:
            if final_url:
                article.url = final_url[:2000]
                article.domain = parse_domain(final_url)[:256]

            try:
                summary, summary_image = \
                    load_and_parse_full_article_text_and_image(article.url)
            except ArticleException:
                summary = summary_image = None

            if summary:
                article.summary = summary
            if summary_image:
                article.image = summary_image[:512]

        article.save()
def initialize(config, board_slug, upload_favicons, always_yes):
    """Create or update boards, blocks and feeds from a YAML config file.

    Args:
        config: Path of the YAML file, joined onto ``BASE_DIR``.
        board_slug: When truthy, only the board with this slug is processed.
        upload_favicons: When truthy, favicons discovered on feed pages are
            passed through ``upload_image_from_url`` before being stored.
        always_yes: When truthy, the interactive "Press Enter" confirmation
            is skipped.

    Raises:
        SystemExit: With exit code 1 when the file is not valid YAML.

    For every processed board, blocks whose slug is no longer listed in the
    config are deleted. For every block that lists at least one feed, feeds
    whose URL is no longer listed are deleted. Boards themselves are never
    deleted here.
    """
    yaml_file = os.path.join(BASE_DIR, config)

    # Read the config as UTF-8 explicitly instead of depending on the
    # platform's locale encoding.
    with open(yaml_file, encoding="utf-8") as f:
        try:
            # NOTE(review): FullLoader is kept for compatibility. If the
            # config only uses plain YAML types, yaml.safe_load is the safer
            # choice -- confirm and switch.
            # An empty file loads as None; treat it as an empty config.
            config = yaml.load(f.read(), Loader=yaml.FullLoader) or {}
        except yaml.YAMLError as ex:
            print(f"Bad YAML file '{yaml_file}': {ex}")
            # Same exception type as exit(1), but does not depend on the
            # `site` module having injected the `exit` helper.
            raise SystemExit(1)

    if not always_yes:
        input(
            f"Initializing feeds from {yaml_file}. Press Enter to continue...")

    for board_index, board_config in enumerate(config.get("boards") or []):
        if board_slug and board_config["slug"] != board_slug:
            continue

        board_name = board_config.get("name") or board_config["slug"]
        print(f"Creating board: {board_name}...")

        # "curator" is a required section: a missing key raises KeyError.
        curator = board_config["curator"]
        board, is_created = Board.objects.update_or_create(
            slug=board_config["slug"],
            defaults=dict(
                name=board_name,
                avatar=curator.get("avatar"),
                curator_name=curator.get("name"),
                curator_title=curator.get("title"),
                curator_footer=curator.get("footer"),
                curator_bio=curator.get("bio"),
                curator_url=curator.get("url"),
                is_private=board_config.get("is_private"),
                is_visible=board_config.get("is_visible"),
                index=board_index,
            ),
        )

        block_configs = board_config.get("blocks") or []
        for block_index, block_config in enumerate(block_configs):
            block_name = block_config.get("name") or ""
            print(f"\nCreating block: {block_name}...")
            block, is_created = BoardBlock.objects.update_or_create(
                board=board,
                slug=block_config["slug"],
                defaults=dict(
                    name=block_name,
                    index=block_index,
                    view=block_config.get("view") or BoardBlock.DEFAULT_VIEW,
                ),
            )

            feed_configs = block_config.get("feeds")
            if not feed_configs:
                # NOTE(review): this also skips the stale-feed cleanup below,
                # so feeds of a block whose config list became empty are
                # kept. Confirm this is intended.
                continue

            updated_feed_urls = set()

            for feed_index, feed_config in enumerate(feed_configs):
                feed_name = feed_config.get("name")
                feed_mix = feed_config.get("mix")
                if feed_mix:
                    # A mix has no page of its own: synthesise a URL from
                    # its parts unless one is given explicitly.
                    feed_url = feed_config.get(
                        "url") or f"mix:{'|'.join(feed_mix)}"
                    feed_rss = None
                else:
                    feed_url = feed_config["url"]
                    feed_rss = feed_config["rss"]

                updated_feed_urls.add(feed_url)

                print(f"Creating or updating feed {feed_name} ({feed_url})...")

                feed, is_created = BoardFeed.objects.update_or_create(
                    board=board,
                    block=block,
                    url=feed_url,
                    defaults=dict(
                        rss=feed_rss,
                        mix=feed_mix,
                        name=feed_name,
                        comment=feed_config.get("comment"),
                        icon=feed_config.get("icon"),
                        index=feed_index,
                        columns=feed_config.get("columns") or 1,
                        conditions=feed_config.get("conditions"),
                        is_parsable=feed_config.get("is_parsable", True),
                        view=feed_config.get("view") or BoardFeed.DEFAULT_VIEW,
                    ),
                )

                if not feed.mix:
                    # Icon resolution order: explicit config value (stored
                    # above), known per-domain favicon, then whatever can be
                    # found on the feed's page.
                    if not feed.icon:
                        feed.icon = DOMAIN_FAVICONS.get(parse_domain(feed_url))

                    if not feed.icon:
                        html = load_page_html(feed_url)
                        icon = feed_config.get("icon")
                        if not icon:
                            icon = find_favicon(feed_url, html)
                            print(f"- found favicon: {icon}")

                            if upload_favicons:
                                icon = upload_image_from_url(icon)
                                print(f"- uploaded favicon: {icon}")

                        feed.icon = icon

                    feed.save()

            # delete unused feeds
            BoardFeed.objects.filter(
                board=board,
                block=block,
            ).exclude(url__in=updated_feed_urls).delete()

        # delete unused blocks
        configured_block_slugs = {cfg["slug"] for cfg in block_configs}
        BoardBlock.objects.filter(
            board=board,
        ).exclude(slug__in=configured_block_slugs).delete()

    print("Done ✅")