Beispiel #1
# Test: parse the RSS document in `feed_text` with parse_feed(), require a truthy
# result, and check a per-item value against "" (assertion message: "URL is set.").
# The test name suggests the URL is meant to come from an undocumented
# "canonical URL" tag -- TODO confirm against the upstream test.
#
# NOTE(review): this copy looks truncated. The triple-quoted `feed_text` literal is
# never closed, the XML has no closing tags, and the final assert has no left-hand
# operand. Restore the missing lines from the upstream source before running it.
def test_undocumented_canonical_url_tag():
    feed_text = """
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="/css/rss20.xsl" type="text/xsl"?>
<rss version="2.0" xmlns:whatever="">
        <item xmlns:atom="">
            <guid isPermaLink="false"/>

    feed = parse_feed(feed_text)
    assert feed, "Feed was parsed."

    for item in feed.items():
        assert == "", "URL is set."
Beispiel #2
def _test_feed_contents(feed_contents: str) -> None:
    """Parse `feed_contents` and verify it is the expected two-item test feed.

    Checks the feed title, the number of items and, for each of the two items,
    its title, link, publish date, SQL publish date, GUID, valid GUID and
    description.

    :param feed_contents: Raw feed document passed to parse_feed().
    :raises AssertionError: If any of the expectations does not hold.
    """
    # Local import keeps this helper self-contained.
    import re

    # publish_date_sql() is dependent on machine's timezone (which shouldn't be the case, but it is),
    # so only the general shape of the SQL timestamp is verified.
    sql_date_pattern = re.compile(r'^2016-12-1\d \d\d:\d\d:\d\d$')

    feed = parse_feed(feed_contents)
    assert feed, "Feed was parsed."

    assert feed.title() == 'Test feed', "Feed title is set."
    assert len(feed.items()) == 2, "Feed has two items."

    # NOTE(review): the left-hand side of both "item link" assertions was missing; link() is
    # presumed to be the accessor -- confirm. The expected link / GUID literals are empty strings
    # in this copy and may need to be restored to the fixture's real URLs.
    first_item = feed.items()[0]

    assert first_item.title() == 'First item', "First item title."
    assert first_item.link() == '', "First item link."
    assert first_item.publish_date() == '2016-12-14T04:04:01Z', "First item publish date."
    assert sql_date_pattern.match(first_item.publish_date_sql()), "First item SQL publish date."
    assert first_item.guid() == '', "First item GUID."
    assert first_item.guid_if_valid() == '', "First item valid GUID."
    assert first_item.description() == 'This is a first item.', "First item description."

    second_item = feed.items()[1]

    assert second_item.title() == 'ɯǝʇı puoɔǝS', "Second item title."
    assert second_item.link() == '', "Second item link."
    assert second_item.publish_date() == '2016-12-14T04:05:01Z', "Second item publish date."
    assert sql_date_pattern.match(second_item.publish_date_sql()), "Second item SQL publish date."
    assert second_item.guid() == '', "Second item GUID."
    assert second_item.guid_if_valid() == '', "Second item valid GUID."
    assert second_item.description() == '<strong>This is a second item.</strong>', "Second item description with HTML."
Beispiel #3
def test_empty_feed():
    """Verify that an RSS item without any usable fields yields empty / None values.

    The feed holds a single item whose only child is an empty, non-permalink
    <guid/>. After parsing, the item's title and GUID are expected to be empty
    strings, while its URL, valid GUID and both publish-date forms are
    expected to be None.
    """
    # NOTE(review): <channel> and the closing tags were missing from this copy and have been
    # restored as RSS 2.0 requires; the empty xmlns:atom value is kept exactly as found -- confirm
    # both against the upstream fixture.
    feed_text = """
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="/css/rss20.xsl" type="text/xsl"?>
<rss version="2.0">
    <channel>
        <item xmlns:atom="">
            <guid isPermaLink="false"/>
        </item>
    </channel>
</rss>
    """

    feed = parse_feed(feed_text)
    assert feed, "Feed was parsed."

    for item in feed.items():
        assert item.title() == "", "Title is an empty string."
        # NOTE(review): the left-hand side of this assertion was missing; link() is presumed -- confirm.
        assert item.link() is None, "URL is None."  # due to fallback on GUID which is also unset
        assert item.guid() == "", "GUID is an empty string."
        assert item.guid_if_valid() is None, "Valid GUID is None."
        assert item.publish_date() is None, "Publish date is None."
        assert item.publish_date_sql() is None, "Publish date SQL is None."
Beispiel #4
    def _get_stories_from_syndicated_feed(cls,
                                          content: str,
                                          media_id: int,
                                          download_time: str) -> List[Dict[str, Any]]:
        """Parse the feed. Return a list of (non-database-backed) story dicts for each story found in the feed.

        :param content: Raw feed document passed to parse_feed().
        :param media_id: Value stored under the 'media_id' key of every returned story.
        :param download_time: Fallback 'publish_date' for items that have no publish date of their own.
        :return: One dict per feed item with the keys 'url', 'guid', 'media_id', 'publish_date',
            'title', 'description' and 'enclosures' (a list of dicts with 'url', 'mime_type', 'length').
        :raises McCrawlerFetcherSoftError: If parse_feed() returns a falsy value.
        """
        feed = parse_feed(content)
        if not feed:
            raise McCrawlerFetcherSoftError("Unable to parse feed.")

        stories = []

        for item in feed.items():

            # NOTE(review): the right-hand side of this assignment was missing; link() is presumed
            # to be the item's URL accessor -- confirm against the feed item class.
            url = item.link()
            if not url:
                log.warning("URL for feed item is empty, skipping")
                # The warning announces a skip, so do not build a story without a URL.
                continue

            # Items without a valid GUID are identified by their URL instead.
            guid = item.guid_if_valid() or url

            title = item.title() or '(no title)'

            description = item.description()

            # Items without a publish date are stamped with the download time.
            publish_date = item.publish_date_sql() or download_time

            enclosures = [
                {
                    'url': enclosure.url(),
                    'mime_type': enclosure.mime_type(),
                    'length': enclosure.length(),
                }
                for enclosure in item.enclosures()
            ]

            stories.append({
                'url': url,
                'guid': guid,
                'media_id': media_id,
                'publish_date': publish_date,
                'title': title,
                'description': description,
                'enclosures': enclosures,
            })

        return stories
Beispiel #5
# Test: parse the Atom document in `atom_feed` with parse_feed(), require exactly one
# item carrying exactly two enclosures, and check each enclosure's url(), length()
# and mime_type() (expected: 123456789 / "audio/mpeg" and 234567890 / "audio/mp4").
#
# NOTE(review): this copy looks truncated. The triple-quoted `atom_feed` literal is
# never closed, no entry / author wrapper elements or closing tags are visible, and
# the two <link rel="enclosure"> elements show only `title` and `length` attributes
# even though the assertions expect MIME types -- presumably `href` / `type`
# attributes were lost. Restore the fixture from the upstream source before running.
def test_atom_enclosures():
    atom_feed = """
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="">
    <title>Channel title</title>
    <link href="" />
        <name>Item author</name>
        <title>Item title</title>
        <link href="" />
        <summary>Item description</summary>
        <link rel="enclosure"
            title="MP3 file"
            length="123456789" />
        <link rel="enclosure"
            title="M4A file"
            length="234567890" />



    feed = parse_feed(atom_feed)
    assert feed, "Feed was parsed."

    assert len(feed.items()) == 1, "Exactly one item has to be found."
    item = feed.items()[0]

    assert len(item.enclosures()) == 2, "Two enclosures have to be found."

    enclosure_1 = item.enclosures()[0]
    assert enclosure_1.url() == ""
    assert enclosure_1.length() == 123456789
    assert enclosure_1.mime_type() == "audio/mpeg"

    enclosure_2 = item.enclosures()[1]
    assert enclosure_2.url() == ""
    assert enclosure_2.length() == 234567890
    assert enclosure_2.mime_type() == "audio/mp4"
Beispiel #6
def test_invalid_feeds():
    """parse_feed() must return None for input that is not a feed at all."""
    # Markup that looks like a web page rather than an RSS / Atom document.
    html_page = """
            <title>Acme News</title>
            <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
            <h1>Acme News</h1>
                Blah blah yada yada.
            <hr />
                This page is totally not a valid RSS / Atom feed.
    """

    # noinspection PyTypeChecker
    rejected_inputs = (
        (None, "Parsing None should have returned None."),
        ('', "Parsing empty string should have returned None."),
        ('   ', "Parsing whitespace should have returned None."),
        (html_page, "Parsing HTML should have returned None."),
    )

    for raw_input, failure_message in rejected_inputs:
        assert parse_feed(raw_input) is None, failure_message
Beispiel #7
# Test: for every entry of `weird_dates`, build an RSS document in the `rss_feed`
# f-string, parse it with parse_feed() and require a truthy result. For each item that
# reports a publish_date(), record that at least one date was parsed and require
# publish_date_sql() to match "YYYY-MM-DD HH:MM:SS". The test finally requires that
# at least one date was parsed.
#
# NOTE(review): this copy looks truncated. The `weird_dates` list and the `rss_feed`
# f-string are never closed, the visible XML never interpolates the loop's `date`,
# and the `except` clause has no matching `try`. Restore the missing lines from the
# upstream source before running it.
def test_rss_weird_dates():
    weird_dates = [
        'Mon, 01 Jan 0001 00:00:00 +0100',

    at_least_one_valid_date_parsed = False

    for date in weird_dates:

        rss_feed = f"""
            <rss version="2.0">
                    <title>Weird dates</title>
                    <description>Weird dates</description>
                        <title>Weird date</title>
                        <description>Weird date</description>

        feed = parse_feed(rss_feed)
        assert feed, "Feed was parsed."

        for item in feed.items():

            if item.publish_date():

                at_least_one_valid_date_parsed = True

                # Try parsing the date
                except Exception as ex:
                    assert False, f"Unable to parse date {item.publish_date()}: {ex}"

                assert re.match(r'^\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d$', item.publish_date_sql())

    assert at_least_one_valid_date_parsed
Beispiel #8
# Test: parse the RSS document in `rss_feed` with parse_feed(), require exactly one
# item carrying exactly one enclosure, and check that enclosure's url(), length()
# (123456789) and mime_type() ("audio/mpeg").
#
# NOTE(review): this copy looks truncated. The triple-quoted `rss_feed` literal is
# never closed, no <channel> / <item> wrapper elements or closing tags are visible,
# URL-valued attributes are empty, the `content:` prefix is used without a visible
# namespace declaration, and the e-mail text reads "[email protected]" (looks like
# an obfuscation artifact). Restore the fixture from the upstream source before running.
def test_rss_enclosure():
    rss_feed = """
<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:itunes=""
xmlns:atom="" version="2.0">
        <title>Channel title</title>
        <lastBuildDate>Tue, 04 Feb 2020 16:01:20 -0500</lastBuildDate>
        <copyright>&#x2117; &amp; &#xA9; 2020 Channel copyright</copyright>
        <itunes:subtitle><![CDATA[Channel description]]></itunes:subtitle>
        <itunes:author>Channel author</itunes:author>
        <itunes:summary><![CDATA[Channel summary]]></itunes:summary>
        <description><![CDATA[Channel description]]></description>
            <itunes:name>Channel author</itunes:name>
            <itunes:email>[email protected]</itunes:email>
            <title>Channel title</title>
        <itunes:image href="" />
        <itunes:category text="Comedy" />
            <itunes:title>Item iTunes title</itunes:title>
            <title>Item title</title>
            <description><![CDATA[<p>Item description</p>]]></description>
            <content:encoded><![CDATA[<p>Item description</p>]]></content:encoded>
            <itunes:author>Item author</itunes:author>
            <enclosure url="" length="123456789" type="audio/mpeg" />
            <guid isPermaLink="false"></guid>
            <pubDate>Sat, 01 Feb 2020 10:00:00 -0500</pubDate>

    feed = parse_feed(rss_feed)
    assert feed, "Feed was parsed."

    assert len(feed.items()) == 1, "Exactly one item has to be found."
    item = feed.items()[0]

    assert len(item.enclosures()) == 1, "Exactly one enclosure has to be found."
    enclosure = item.enclosures()[0]

    assert enclosure.url() == ""
    assert enclosure.length() == 123456789
    assert enclosure.mime_type() == "audio/mpeg"