コード例 #1
0
def fetch_rss(item, rss):
    """Parse one RSS feed and store its new entries as ``Article`` rows.

    At most ``DEFAULT_ENTRIES_LIMIT`` entries of the feed are examined. An
    entry is skipped when it has no title or link, when the feed's
    ``conditions`` do not match it, or when its creation time is not newer
    than ``now - DELETE_OLD_ARTICLES_DELTA``. Remaining entries go through
    ``Article.objects.get_or_create``; only articles that were just created
    are enriched (text, image, resolved URL, summary) and saved again.

    Args:
        item: Mapping describing the feed. ``"board_id"``, ``"id"`` and
            ``"is_parsable"`` are read with ``[]``; ``"conditions"`` is
            optional.
        rss: Feed location passed to ``feedparser.parse``.
    """
    print(f"Parsing RSS: {rss}")

    feed = feedparser.parse(rss)

    print(f"Entries found: {len(feed.entries)}")

    # The conditions belong to the feed, not to an entry: read them once.
    conditions = item.get("conditions")

    for entry in feed.entries[:DEFAULT_ENTRIES_LIMIT]:
        entry_title = parse_title(entry)
        entry_link = parse_link(entry)
        if not entry_title or not entry_link:
            print("No entry title or link. Skipped")
            continue

        print(f"- article: '{entry_title}' {entry_link}")

        if conditions and not check_conditions(conditions, entry):
            print(f"- condition {conditions} does not match. Skipped")
            continue

        created_at = parse_datetime(entry)
        if created_at <= datetime.utcnow() - DELETE_OLD_ARTICLES_DELTA:
            print("- article is too old. Skipped")
            continue

        article, is_created = Article.objects.get_or_create(
            board_id=item["board_id"],
            feed_id=item["id"],
            uniq_id=entry.get("id") or entry.get("guid") or entry_link,
            defaults=dict(
                url=entry_link[:2000],
                domain=parse_domain(entry_link)[:256],
                created_at=created_at,
                updated_at=datetime.utcnow(),
                title=entry_title[:256],
                image=str(parse_rss_image(entry) or "")[:512],
                description=entry.get("summary"),
            ))

        if not is_created:
            # Already stored on a previous run: nothing to enrich.
            continue

        print("- article is new, parsing metadata...")

        # parse heavy info
        text, lead_image = parse_rss_text_and_image(entry)

        if text:
            article.description = text[:1000]

        if lead_image:
            article.image = lead_image[:512]

        # get real url
        real_url, content_type, content_length = resolve_url(entry_link)

        # Only text/* responses of a bounded size are loaded and summarized
        # (original note: "to not try to parse podcasts"). The content type
        # is checked first and a missing length is treated as "do not parse",
        # so an unresolved link cannot raise TypeError on the size comparison.
        # NOTE(review): resolve_url's return contract is not visible here.
        is_parsable = (
            item["is_parsable"]
            and content_type
            and content_type.startswith("text/")
            and content_length is not None
            and content_length <= MAX_PARSABLE_CONTENT_LENGTH
        )

        if is_parsable:
            if real_url:
                article.url = real_url[:2000]
                article.domain = parse_domain(real_url)[:256]

            try:
                summary, summary_image = load_and_parse_full_article_text_and_image(
                    article.url)
            except ArticleException:
                # Best effort: keep the article without a summary.
                summary = None
                summary_image = None

            if summary:
                article.summary = summary

            if summary_image:
                # The image from the full article overrides the RSS one.
                article.image = summary_image[:512]

        article.save()
コード例 #2
0
def initialize(config, board_slug, upload_favicons, always_yes):
    """Create or update boards, blocks and feeds from a YAML config file.

    For every board in the config (or only the one matching ``board_slug``)
    the board, its blocks and their feeds are upserted with
    ``update_or_create``. Feeds of a processed block whose URL is no longer
    listed, and blocks of a processed board whose slug is no longer listed,
    are deleted.

    Args:
        config: Path of the YAML file, joined onto ``BASE_DIR``.
        board_slug: When truthy, only the board with this slug is processed.
        upload_favicons: When truthy, a favicon discovered by
            ``find_favicon`` is passed through ``upload_image_from_url``.
        always_yes: When truthy, the "Press Enter" confirmation is skipped.

    Raises:
        SystemExit: With status 1 when the file is not valid YAML.
    """
    yaml_file = os.path.join(BASE_DIR, config)
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with open(yaml_file, encoding="utf-8") as f:
        raw_config = f.read()

    try:
        # NOTE(review): FullLoader is kept as in the original; the config is
        # presumably a trusted local file. Do not feed untrusted YAML here.
        config = yaml.load(raw_config, Loader=yaml.FullLoader)
    except yaml.YAMLError as ex:
        print(f"Bad YAML file '{yaml_file}': {ex}")
        # SystemExit directly: the `exit` helper is injected by `site` and is
        # not guaranteed to exist in every interpreter setup.
        raise SystemExit(1) from ex

    # An empty YAML document loads as None; treat it as "no boards".
    config = config or {}

    if not always_yes:
        input(
            f"Initializing feeds from {yaml_file}. Press Enter to continue...")

    for board_index, board_config in enumerate(config.get("boards") or []):
        if board_slug and board_config["slug"] != board_slug:
            continue

        board_name = board_config.get("name") or board_config["slug"]
        print(f"Creating board: {board_name}...")

        curator = board_config["curator"]
        board, _ = Board.objects.update_or_create(
            slug=board_config["slug"],
            defaults=dict(
                name=board_name,
                avatar=curator.get("avatar"),
                curator_name=curator.get("name"),
                curator_title=curator.get("title"),
                curator_footer=curator.get("footer"),
                curator_bio=curator.get("bio"),
                curator_url=curator.get("url"),
                is_private=board_config.get("is_private"),
                is_visible=board_config.get("is_visible"),
                index=board_index,
            ))

        blocks_config = board_config.get("blocks") or []

        for block_index, block_config in enumerate(blocks_config):
            block_name = block_config.get("name") or ""
            print(f"\nCreating block: {block_name}...")
            block, _ = BoardBlock.objects.update_or_create(
                board=board,
                slug=block_config["slug"],
                defaults=dict(
                    name=block_name,
                    index=block_index,
                    view=block_config.get("view") or BoardBlock.DEFAULT_VIEW,
                ))

            feeds_config = block_config.get("feeds")
            if not feeds_config:
                # NOTE(review): this also skips the stale-feed cleanup below,
                # so feeds previously stored for a block that now lists none
                # are left in place. Confirm whether that is intended.
                continue

            updated_feed_urls = set()

            for feed_index, feed_config in enumerate(feeds_config):
                feed_name = feed_config.get("name")
                feed_mix = feed_config.get("mix")
                if feed_mix:
                    # A mix has no RSS of its own; without an explicit URL a
                    # synthetic "mix:a|b|c" one is built from its members.
                    feed_url = feed_config.get(
                        "url") or f"mix:{'|'.join(feed_mix)}"
                    feed_rss = None
                else:
                    feed_url = feed_config["url"]
                    feed_rss = feed_config["rss"]

                updated_feed_urls.add(feed_url)

                print(f"Creating or updating feed {feed_name} ({feed_url})...")

                feed, _ = BoardFeed.objects.update_or_create(
                    board=board,
                    block=block,
                    url=feed_url,
                    defaults=dict(
                        rss=feed_rss,
                        mix=feed_mix,
                        name=feed_name,
                        comment=feed_config.get("comment"),
                        icon=feed_config.get("icon"),
                        index=feed_index,
                        columns=feed_config.get("columns") or 1,
                        conditions=feed_config.get("conditions"),
                        is_parsable=feed_config.get("is_parsable", True),
                        view=feed_config.get("view") or BoardFeed.DEFAULT_VIEW,
                    ))

                if not feed.mix:
                    # Icon lookup order: configured icon (already stored by
                    # the upsert), then the known-domain table, then a
                    # favicon discovered from the page itself.
                    if not feed.icon:
                        feed.icon = DOMAIN_FAVICONS.get(parse_domain(feed_url))

                    if not feed.icon:
                        html = load_page_html(feed_url)
                        icon = feed_config.get("icon")
                        if not icon:
                            icon = find_favicon(feed_url, html)
                            print(f"- found favicon: {icon}")

                            if upload_favicons:
                                icon = upload_image_from_url(icon)
                                print(f"- uploaded favicon: {icon}")

                        feed.icon = icon

                feed.save()

            # delete unused feeds
            BoardFeed.objects.filter(
                board=board,
                block=block).exclude(url__in=updated_feed_urls).delete()

        # delete unused blocks
        configured_block_slugs = {
            block_cfg["slug"] for block_cfg in blocks_config
        }
        BoardBlock.objects.filter(board=board).exclude(
            slug__in=configured_block_slugs).delete()

    print("Done ✅")