# Example #1
def test_undocumented_canonical_url_tag():
    """Non-standard <whatever:canonical_url> element should populate the item's link."""
    xml = """
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="/css/rss20.xsl" type="text/xsl"?>
<rss version="2.0" xmlns:whatever="http://example.com/whatever/">
    <channel>
        <title>foo</title>
        <link>http://example.com</link>
        <description/>
        <language>en</language>
        <item xmlns:atom="http://www.w3.org/2005/Atom">
            <title><![CDATA[]]></title>
            <link/>
            <guid isPermaLink="false"/>
            <pubDate></pubDate>
            <whatever:canonical_url>http://www.example.com/item.html</whatever:canonical_url>
        </item>
    </channel>
</rss>
    """

    parsed = parse_feed(xml)
    assert parsed, "Feed was parsed."

    # The item's <link/> is empty, so the link should come from the canonical URL tag.
    for entry in parsed.items():
        assert entry.link() == "http://www.example.com/item.html", "URL is set."
# Example #2
def _test_feed_contents(feed_contents: str) -> None:
    """Assert the standard expectations against a parsed two-item test feed.

    :param feed_contents: Raw feed document expected to contain the canonical
        'Test feed' with two known items.
    """
    parsed = parse_feed(feed_contents)
    assert parsed, "Feed was parsed."

    assert parsed.title() == 'Test feed', "Feed title is set."

    items = parsed.items()
    assert len(items) == 2, "Feed has two items."

    # publish_date_sql() is dependent on machine's timezone (which shouldn't be the case, but it is)
    sql_date = re.compile(r'^2016-12-1\d \d\d:\d\d:\d\d$')

    item_1 = items[0]

    assert item_1.title() == 'First item', "First item title."
    assert item_1.link() == 'http://www.example.com/first_item.html', "First item link."
    assert item_1.publish_date() == '2016-12-14T04:04:01Z', "First item publish date."
    assert sql_date.search(item_1.publish_date_sql()), "First item SQL publish date."
    assert item_1.guid() == 'http://www.example.com/first_item.html', "First item GUID."
    assert item_1.guid_if_valid() == 'http://www.example.com/first_item.html', "First item valid GUID."
    assert item_1.description() == 'This is a first item.', "First item description."

    item_2 = items[1]

    assert item_2.title() == 'ɯǝʇı puoɔǝS', "Second item title."
    assert item_2.link() == 'http://www.example.com/second_item.html', "Second item link."
    assert item_2.publish_date() == '2016-12-14T04:05:01Z', "Second item publish date."
    assert sql_date.search(item_2.publish_date_sql()), "Second item SQL publish date."
    assert item_2.guid() == 'http://www.example.com/second_item.html', "Second item GUID."
    assert item_2.guid_if_valid() == 'http://www.example.com/second_item.html', "Second item valid GUID."
    assert item_2.description() == '<strong>This is a second item.</strong>', "Second item description with HTML."
# Example #3
def test_empty_feed():
    """Items with empty / missing fields should yield empty strings or None, not crash."""
    xml = """
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="/css/rss20.xsl" type="text/xsl"?>
<rss version="2.0">
    <channel>
        <title>foo</title>
        <link>http://example.com</link>
        <description/>
        <language>en</language>
        <item xmlns:atom="http://www.w3.org/2005/Atom">
            <title><![CDATA[]]></title>
            <link/>
            <guid isPermaLink="false"/>
            <pubDate></pubDate>
        </item>
    </channel>
</rss>
    """

    parsed = parse_feed(xml)
    assert parsed, "Feed was parsed."

    for entry in parsed.items():
        assert entry.title() == "", "Title is an empty string."
        # The link falls back on the GUID, which is also unset here.
        assert entry.link() is None, "URL is None."
        assert entry.guid() == "", "GUID is an empty string."
        assert entry.guid_if_valid() is None, "Valid GUID is None."
        assert entry.publish_date() is None, "Publish date is None."
        assert entry.publish_date_sql() is None, "Publish date SQL is None."
# Example #4
    def _get_stories_from_syndicated_feed(cls,
                                          content: str,
                                          media_id: int,
                                          download_time: str) -> List[Dict[str, Any]]:
        """Parse the feed. Return a list of (non-database-backed) story dicts for each story found in the feed.

        :param content: Raw feed (RSS / Atom) document to parse.
        :param media_id: Media source ID attached to every returned story dict.
        :param download_time: SQL-formatted timestamp used as a fallback publish date.
        :return: List of story dicts with 'url', 'guid', 'media_id', 'publish_date',
            'title', 'description' and 'enclosures' keys.
        :raises McCrawlerFetcherSoftError: If the content can't be parsed as a feed.
        """
        feed = parse_feed(content)
        if not feed:
            raise McCrawlerFetcherSoftError("Unable to parse feed.")

        stories = []

        for item in feed.items():

            url = item.link()
            if not url:
                # A story can't exist without a URL, so skip the item entirely.
                # (Plain string, not an f-string -- there are no placeholders.)
                log.warning("URL for feed item is empty, skipping")
                continue

            # Fall back on the URL when the item carries no usable GUID.
            guid = item.guid_if_valid() or url

            title = item.title() or '(no title)'

            description = item.description()

            # Fall back on the download time when the item has no parseable date.
            publish_date = item.publish_date_sql() or download_time

            enclosures = [
                {
                    'url': enclosure.url(),
                    'mime_type': enclosure.mime_type(),
                    'length': enclosure.length(),
                }
                for enclosure in item.enclosures()
            ]

            stories.append({
                'url': url,
                'guid': guid,
                'media_id': media_id,
                'publish_date': publish_date,
                'title': title,
                'description': description,
                'enclosures': enclosures,
            })

        return stories
# Example #5
def test_atom_enclosures():
    """Atom <link rel="enclosure"> elements should be exposed as item enclosures."""
    atom_xml = """
<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <title>Channel title</title>
    <updated>2003-12-13T18:30:02Z</updated>
    <link href="https://www.example.com/" />
    <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
    <author>
        <name>Item author</name>
    </author>

    <entry>
        <title>Item title</title>
        <link href="http://www.example.com/item" />
        <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
        <summary>Item description</summary>
        <updated>2003-12-13T18:30:02Z</updated>

        <link rel="enclosure"
            type="audio/mpeg"
            title="MP3 file"
            href="https://www.example.com/item.mp3"
            length="123456789" />
        <link rel="enclosure"
            type="audio/mp4"
            title="M4A file"
            href="https://www.example.com/item.m4a"
            length="234567890" />

    </entry>

</feed>
    """

    parsed = parse_feed(atom_xml)
    assert parsed, "Feed was parsed."

    entries = parsed.items()
    assert len(entries) == 1, "Exactly one item has to be found."

    enclosures = entries[0].enclosures()
    assert len(enclosures) == 2, "Two enclosures have to be found."

    # Enclosures should come back in document order.
    mp3_enclosure, m4a_enclosure = enclosures

    assert mp3_enclosure.url() == "https://www.example.com/item.mp3"
    assert mp3_enclosure.length() == 123456789
    assert mp3_enclosure.mime_type() == "audio/mpeg"

    assert m4a_enclosure.url() == "https://www.example.com/item.m4a"
    assert m4a_enclosure.length() == 234567890
    assert m4a_enclosure.mime_type() == "audio/mp4"
# Example #6
def test_invalid_feeds():
    """Degenerate or non-feed input should make parse_feed() return None."""
    # noinspection PyTypeChecker
    degenerate_inputs = [
        (None, "Parsing None should have returned None."),
        ('', "Parsing empty string should have returned None."),
        ('   ', "Parsing whitespace should have returned None."),
    ]
    for bad_input, message in degenerate_inputs:
        assert parse_feed(bad_input) is None, message

    html_page = """
        <html>
        <head>
            <title>Acme News</title>
            <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
        </head>
        <body>
            <h1>Acme News</h1>
            <p>
                Blah blah yada yada.
            </p>
            <hr />
            <p>
                This page is totally not a valid RSS / Atom feed.
            </p>
        </body>
        </html>
    """
    assert parse_feed(html_page) is None, "Parsing HTML should have returned None."
# Example #7
def test_rss_weird_dates():
    """Dates far outside the usual range should either parse cleanly or be skipped."""
    parsed_any_date = False

    for weird_date in ('Mon, 01 Jan 0001 00:00:00 +0100', '1875-09-17T00:00:00Z'):

        feed_xml = f"""
            <rss version="2.0">
                <channel>
                    <title>Weird dates</title>
                    <link>https://www.example.com/</link>
                    <description>Weird dates</description>
                    <item>
                        <title>Weird date</title>
                        <link>https://www.example.com/weird-date</link>
                        <description>Weird date</description>
                        <pubDate>{weird_date}</pubDate>
                    </item>
                </channel>
            </rss>
        """

        parsed = parse_feed(feed_xml)
        assert parsed, "Feed was parsed."

        for entry in parsed.items():

            publish_date = entry.publish_date()
            if not publish_date:
                continue

            parsed_any_date = True

            # Try parsing the date
            try:
                parse_date(publish_date)
            except Exception as ex:
                assert False, f"Unable to parse date {publish_date}: {ex}"

            assert re.match(r'^\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d$', entry.publish_date_sql())

    # At least one of the weird dates must have made it through the parser.
    assert parsed_any_date
# Example #8
def test_rss_enclosure():
    """A single RSS <enclosure> element should be exposed with its URL, length and MIME type."""
    podcast_xml = """
<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
    <channel>
        <title>Channel title</title>
        <lastBuildDate>Tue, 04 Feb 2020 16:01:20 -0500</lastBuildDate>
        <link>https://www.example.com/</link>
        <language>en</language>
        <copyright>&#x2117; &amp; &#xA9; 2020 Channel copyright</copyright>
        <itunes:subtitle><![CDATA[Channel description]]></itunes:subtitle>
        <itunes:author>Channel author</itunes:author>
        <itunes:summary><![CDATA[Channel summary]]></itunes:summary>
        <itunes:type>episodic</itunes:type>
        <itunes:explicit>false</itunes:explicit>
        <description><![CDATA[Channel description]]></description>
        <itunes:owner>
            <itunes:name>Channel author</itunes:name>
            <itunes:email>[email protected]</itunes:email>
        </itunes:owner>
        <image>
            <url>https://www.example.com/image.jpg</url>
            <title>Channel title</title>
            <link>https://www.example.com/</link>
        </image>
        <itunes:image href="https://www.example.com/image.jpg" />
        <itunes:category text="Comedy" />

        <item>
            <itunes:title>Item iTunes title</itunes:title>
            <title>Item title</title>
            <description><![CDATA[<p>Item description</p>]]></description>
            <link><![CDATA[http://www.example.com/item]]></link>
            <content:encoded><![CDATA[<p>Item description</p>]]></content:encoded>
            <itunes:author>Item author</itunes:author>
            <itunes:summary></itunes:summary>
            <enclosure url="https://www.example.com/item.mp3" length="123456789" type="audio/mpeg" />
            <guid isPermaLink="false">example.com-item</guid>
            <pubDate>Sat, 01 Feb 2020 10:00:00 -0500</pubDate>
            <itunes:duration>4479</itunes:duration>
            <itunes:keywords></itunes:keywords>
            <itunes:season></itunes:season>
            <itunes:episode></itunes:episode>
            <itunes:episodeType>full</itunes:episodeType>
            <itunes:explicit>false</itunes:explicit>
        </item>
    </channel>
</rss>
    """

    parsed = parse_feed(podcast_xml)
    assert parsed, "Feed was parsed."

    episodes = parsed.items()
    assert len(episodes) == 1, "Exactly one item has to be found."

    enclosures = episodes[0].enclosures()
    assert len(enclosures) == 1, "Exactly one enclosure has to be found."

    mp3_enclosure = enclosures[0]
    assert mp3_enclosure.url() == "https://www.example.com/item.mp3"
    assert mp3_enclosure.length() == 123456789
    assert mp3_enclosure.mime_type() == "audio/mpeg"