Example #1
    def test_sitemap_tree_for_homepage_robots_txt_no_content_type(self):
        """Test sitemap_tree_for_homepage() with no Content-Type in robots.txt."""

        pages = {
            '/': 'This is a homepage.',

            '/robots.txt': {
                'header': 'Content-Type: ',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever
                    """.format(base_url=self.__test_url)).strip(),
            },
        }

        # noinspection PyArgumentList
        expected_sitemap_tree = IndexRobotsTxtSitemap(
            url='{}/robots.txt'.format(self.__test_url),
            sub_sitemaps=[],
        )

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()

        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

        hs.stop()

        assert expected_sitemap_tree == actual_sitemap_tree
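
For reference, every example in this collection hands HashServer a pages dict mapping a path to either a plain string/bytes body or a dict of options. A sketch of the page forms these examples exercise; all of the option keys ('content', 'header', 'http_status_code', 'redirect', 'auth', 'callback') appear in Example #31, while the paths and bodies here are purely illustrative:

def greeting_callback(request: 'HashServer.Request') -> bytes:
    # Callbacks return a full raw HTTP response: status line, headers, blank line, body.
    return b"HTTP/1.0 200 OK\r\nContent-Type: text/plain\r\n\r\nhello"

pages = {
    '/plain': 'served as-is with default headers',
    '/with-header': {
        'header': 'Content-Type: text/plain',
        'content': 'body with an explicit header',
    },
    '/not-found': {'content': 'gone', 'http_status_code': 404},
    '/redirect': {'redirect': '/plain'},
    '/protected': {'auth': 'user:password', 'content': 'secret'},
    '/dynamic': {'callback': greeting_callback},
}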
Example #2
    def setUp(self) -> None:
        super().setUp()

        self.__mock_data = os.urandom(1024 * 1024)

        # noinspection PyUnusedLocal
        def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
            response = "".encode('utf-8')
            response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
            response += "Content-Type: audio/mpeg\r\n".encode('utf-8')
            response += f"Content-Length: {len(self.__mock_data)}\r\n".encode(
                'utf-8')
            response += "\r\n".encode('utf-8')
            response += self.__mock_data
            return response

        port = random_unused_port()
        pages = {
            '/test.mp3': {
                'callback': __mp3_callback,
            }
        }

        self.__hs = HashServer(port=port, pages=pages)
        self.__hs.start()

        self.__url = f"http://127.0.0.1:{port}/test.mp3"

        self.__temp_dir = tempfile.mkdtemp('test')
        self.__dest_file = os.path.join(self.__temp_dir, 'test.mp3')
Example #3
class MockAPIServer(object):
    __slots__ = [
        '__hs',
        '__port',
    ]

    def __init__(self, pages: Dict[str, Any]):
        self.__port = random_unused_port()
        self.__hs = HashServer(port=self.__port, pages=pages)
        self.__hs.start()

    def __del__(self):
        self.__hs.stop()

    def config(self) -> FacebookConfig:
        port = self.__port

        class MockFacebookConfig(FacebookConfig):
            @staticmethod
            def api_endpoint() -> str:
                return f'http://localhost:{port}/'

            @staticmethod
            def seconds_to_wait_between_retries() -> int:
                # Don't wait between retries
                return 0

        return MockFacebookConfig()
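
MockAPIServer relies on __del__ for cleanup, and most tests in this collection call hs.stop() only after their assertions, so a failing assert can leak a running server. A context-manager sketch that guarantees the stop; this is not part of HashServer's API, and it assumes random_unused_port() as imported in the surrounding examples:

from contextlib import contextmanager
from typing import Any, Dict

@contextmanager
def running_hash_server(pages: Dict[str, Any]):
    """Start a HashServer on a random unused port and always stop it,
    even when the block body raises."""
    hs = HashServer(port=random_unused_port(), pages=pages)
    hs.start()
    try:
        yield hs
    finally:
        hs.stop()

# Usage sketch:
#
#     with running_hash_server({'/foo': 'bar'}) as hs:
#         assert requests.get(hs.page_url('/foo')).text == 'bar'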
Example #4
def test_fetch_url():
    db = connect_to_db()

    def _meta_redirect(r):
        resp = ""
        resp += 'HTTP/1.0 200 OK\r\n'
        resp += 'Content-Type: text/html\r\n\r\n'
        resp += '<meta http-equiv="refresh" content="0; url=%s-foo">\n' % r.url()
        return resp

    hs = HashServer(port=0,
                    pages={
                        '/foo': 'bar',
                        '/400': {
                            'http_status_code': 400
                        },
                        '/404': {
                            'http_status_code': 404
                        },
                        '/500': {
                            'http_status_code': 500
                        },
                        '/mr-foo': 'meta redirect target',
                        '/mr': {
                            'callback': _meta_redirect
                        },
                    })

    hs.start(delay=2)

    port = hs.port()

    timeout_args = {
        'network_down_host': 'localhost',
        'network_down_port': port,
        'network_down_timeout': 1,
        'domain_timeout': 0
    }

    # before the delayed start, 404s and 500s should fail
    assert not _fetch_url(db, hs.page_url('/404'), **timeout_args).is_success
    assert not _fetch_url(db, hs.page_url('/500'), **timeout_args).is_success

    # request for a valid page should make the call wait until the hs comes up
    assert _fetch_url(db, hs.page_url('/foo'), **timeout_args).content == 'bar'

    # and now a 400 should fail as well
    assert not _fetch_url(db, hs.page_url('/400'), **timeout_args).is_success

    # make sure an invalid URL does not raise an exception
    assert _fetch_url(db, 'this is not a url', **timeout_args) is not None

    # make sure that requests follow meta redirects
    response = _fetch_url(db, hs.page_url('/mr'), **timeout_args)

    assert response.content == 'meta redirect target'
    assert response.last_requested_url == hs.page_url('/mr-foo')
Example #5
    def setUp(self) -> None:
        self.db = connect_to_db()

        self.port = random_unused_port()

        self.__hs = HashServer(port=self.port, pages=self.hashserver_pages())
        self.__hs.start()

        self.media = create_test_story_stack(db=self.db, data={'A': {'B': [1]}})
        self.feed = self.media['A']['feeds']['B']
Example #6
    def test_all_url_variants_redirect_to_homepage(self):
        """Redirect to a homepage"""
        pages = {
            '/first': '<meta http-equiv="refresh" content="0; URL=/second%s" />' % self.CRUFT,
            '/second': '<meta http-equiv="refresh" content="0; URL=/',
        }
        hs = HashServer(port=self.TEST_HTTP_SERVER_PORT, pages=pages)
        hs.start()
        actual_url_variants = all_url_variants(db=self.db(), url=self.STARTING_URL)
        hs.stop()

        assert set(actual_url_variants) == {
            self.STARTING_URL,
            self.STARTING_URL_WITHOUT_CRUFT,
            '%s/second' % self.TEST_HTTP_SERVER_URL,
            '%s/second%s' % (self.TEST_HTTP_SERVER_URL, self.CRUFT,),
        }
Example #7
    def test_all_url_variants_link_canonical(self):
        """<link rel="canonical" />"""
        pages = {
            '/first': '<meta http-equiv="refresh" content="0; URL=/second%s" />' % self.CRUFT,
            '/second': '<meta http-equiv="refresh" content="0; URL=/third%s" />' % self.CRUFT,
            '/third': '<link rel="canonical" href="%s/fourth" />' % self.TEST_HTTP_SERVER_URL,
        }
        hs = HashServer(port=self.TEST_HTTP_SERVER_PORT, pages=pages)
        hs.start()
        actual_url_variants = all_url_variants(db=self.db(), url=self.STARTING_URL)
        hs.stop()

        assert set(actual_url_variants) == {
            self.STARTING_URL,
            self.STARTING_URL_WITHOUT_CRUFT,
            '%s/third' % self.TEST_HTTP_SERVER_URL,
            '%s/third%s' % (self.TEST_HTTP_SERVER_URL, self.CRUFT,),
            '%s/fourth' % self.TEST_HTTP_SERVER_URL,
        }
Example #8
    def test_request(self) -> None:
        """Test requests with throttling."""
        pages = {'/test': 'Hello!', }
        port = 8888
        hs = HashServer(port=port, pages=pages)
        hs.start()

        ua = ThrottledUserAgent(self.db(), domain_timeout=2)
        test_url = hs.page_url('/test')

        # first request should work
        response = ua.get(test_url)
        assert response.decoded_content() == 'Hello!'

        # fail because we're in the timeout
        ua = ThrottledUserAgent(self.db(), domain_timeout=2)
        self.assertRaises(McThrottledDomainException, ua.get, test_url)

        # succeed because it's a different domain
        ua = ThrottledUserAgent(self.db(), domain_timeout=2)
        response = ua.get('http://127.0.0.1:%d/test' % port)
        assert response.decoded_content() == 'Hello!'

        # still fail within the timeout
        ua = ThrottledUserAgent(self.db(), domain_timeout=2)
        self.assertRaises(McThrottledDomainException, ua.get, test_url)

        time.sleep(2)

        # now we're outside the timeout, so it should work
        ua = ThrottledUserAgent(self.db(), domain_timeout=2)
        response = ua.get(test_url)
        assert response.decoded_content() == 'Hello!'

        # and follow up request on the same ua object should work
        response = ua.get(test_url)
        assert response.decoded_content() == 'Hello!'

        # but then fail within the new timeout period with a new object
        ua = ThrottledUserAgent(self.db(), domain_timeout=2)
        self.assertRaises(McThrottledDomainException, ua.get, test_url)

        hs.stop()

        # test domain_timeout assignment logic
        ua = ThrottledUserAgent(self.db(), domain_timeout=100)
        assert ua.domain_timeout == 100

        config = mediawords.util.config.get_config()

        config['mediawords']['throttled_user_agent_domain_timeout'] = 200
        ua = ThrottledUserAgent(self.db())
        assert ua.domain_timeout == 200

        del config['mediawords']['throttled_user_agent_domain_timeout']
        ua = ThrottledUserAgent(self.db())
        assert ua.domain_timeout == mediawords.util.web.user_agent.throttled._DEFAULT_DOMAIN_TIMEOUT
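
The assertions above pin down ThrottledUserAgent's contract: at most one acquisition per domain every domain_timeout seconds, enforced across instances, while the instance that acquired the domain may keep requesting it. A minimal in-memory sketch of the core check; the real agent records domains in the database, and the store here is purely illustrative:

import time
from urllib.parse import urlparse

# Illustrative in-memory store; the real agent persists this in the database.
_last_acquired_by_domain = {}

def check_throttle(url: str, domain_timeout: int) -> None:
    """Raise McThrottledDomainException (as in the test above) if the URL's
    domain was acquired less than domain_timeout seconds ago."""
    domain = urlparse(url).hostname  # 'localhost' and '127.0.0.1' count as different domains
    now = time.monotonic()
    last = _last_acquired_by_domain.get(domain)
    if last is not None and now - last < domain_timeout:
        raise McThrottledDomainException(f"Domain {domain} is still being throttled.")
    _last_acquired_by_domain[domain] = now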
Example #9
    def test_all_url_variants_basic(self):
        """Basic"""

        pages = {
            '/first': '<meta http-equiv="refresh" content="0; URL=/second%s" />' % self.CRUFT,
            '/second': '<meta http-equiv="refresh" content="0; URL=/third%s" />' % self.CRUFT,
            '/third': 'This is where the redirect chain should end.',
        }

        hs = HashServer(port=self.TEST_HTTP_SERVER_PORT, pages=pages)
        hs.start()
        actual_url_variants = all_url_variants(db=self.db(), url=self.STARTING_URL)
        hs.stop()

        assert set(actual_url_variants) == {
            self.STARTING_URL,
            self.STARTING_URL_WITHOUT_CRUFT,
            '%s/third' % self.TEST_HTTP_SERVER_URL,
            '%s/third%s' % (self.TEST_HTTP_SERVER_URL, self.CRUFT,)
        }
Example #10
    def test_extract_article_html_from_page_html_connection_errors(self):
        """Try extracting with connection errors."""

        # Use multiprocessing.Value() because request might be handled in a fork
        self.is_first_response = multiprocessing.Value('i', 1)

        pages = {
            '/extract': {
                'callback': self.__extract_but_initially_fail,
            }
        }
        port = random_unused_port()

        hs = HashServer(port=port, pages=pages)
        hs.start()

        class MockExtractorCommonConfig(CommonConfig):
            """Mock configuration which points to our unstable extractor."""
            def extractor_api_url(self) -> str:
                return f'http://localhost:{port}/extract'

        extractor_response = extract_article_html_from_page_html(
            content='whatever', config=MockExtractorCommonConfig())

        hs.stop()

        assert extractor_response
        assert 'extracted_html' in extractor_response
        assert 'extractor_version' in extractor_response

        assert extractor_response['extracted_html'] == self.expected_extracted_text

        assert not self.is_first_response.value, "Make sure the initial extractor call failed."
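
__extract_but_initially_fail is not shown in this excerpt. A sketch of what such a callback might look like, reusing the multiprocessing.Value set up above so that the flag survives a forked request handler; the 500 response and the JSON field values are assumptions inferred from the assertions, and encode_json is assumed to be imported as in the neighboring examples:

    def __extract_but_initially_fail(self, request: HashServer.Request) -> Union[str, bytes]:
        """Simulate an extractor that fails the first request and succeeds afterwards."""
        with self.is_first_response.get_lock():
            first = self.is_first_response.value
            if first:
                self.is_first_response.value = 0

        if first:
            # The first request fails; a client that retries should then succeed.
            return "HTTP/1.0 500 Internal Server Error\r\n\r\nTry again later."

        response = "HTTP/1.0 200 OK\r\n"
        response += "Content-Type: application/json; charset=UTF-8\r\n"
        response += "\r\n"
        response += encode_json({
            'extracted_html': self.expected_extracted_text,
            'extractor_version': 'mock-extractor-sketch',
        })
        return response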
Example #11
    def test_all_url_variants_basic(self):
        """Basic"""

        pages = {
            '/first': '<meta http-equiv="refresh" content="0; URL=/second%s" />' % self.CRUFT,
            '/second': '<meta http-equiv="refresh" content="0; URL=/third%s" />' % self.CRUFT,
            '/third': 'This is where the redirect chain should end.',
        }

        hs = HashServer(port=self.TEST_HTTP_SERVER_PORT, pages=pages)
        hs.start()
        actual_url_variants = all_url_variants(db=self.db(), url=self.STARTING_URL)
        hs.stop()

        assert set(actual_url_variants) == {
            self.STARTING_URL,
            self.STARTING_URL_WITHOUT_CRUFT,
            '%s/third' % self.TEST_HTTP_SERVER_URL,
            '%s/third%s' % (self.TEST_HTTP_SERVER_URL, self.CRUFT,),
        }
Example #12
    def test_all_url_variants_link_canonical(self):
        """<link rel="canonical" />"""
        pages = {
            '/first': '<meta http-equiv="refresh" content="0; URL=/second%s" />' % self.CRUFT,
            '/second': '<meta http-equiv="refresh" content="0; URL=/third%s" />' % self.CRUFT,
            '/third': '<link rel="canonical" href="%s/fourth" />' % self.TEST_HTTP_SERVER_URL,
        }
        hs = HashServer(port=self.TEST_HTTP_SERVER_PORT, pages=pages)
        hs.start()
        actual_url_variants = all_url_variants(db=self.db(), url=self.STARTING_URL)
        hs.stop()

        assert set(actual_url_variants) == {
            self.STARTING_URL,
            self.STARTING_URL_WITHOUT_CRUFT,
            '%s/third' % self.TEST_HTTP_SERVER_URL,
            '%s/third%s' % (self.TEST_HTTP_SERVER_URL, self.CRUFT,),
            '%s/fourth' % self.TEST_HTTP_SERVER_URL,
        }
Example #13
    def test_sitemap_tree_for_homepage_robots_txt_no_content_type(self):
        """Test sitemap_tree_for_homepage() with no Content-Type in robots.txt."""

        pages = {
            '/': 'This is a homepage.',

            '/robots.txt': {
                'header': 'Content-Type: ',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever
                    """).strip(),
            },
        }

        # noinspection PyArgumentList
        expected_sitemap_tree = IndexRobotsTxtSitemap(
            url='{}/robots.txt'.format(self.__test_url),
            sub_sitemaps=[],
        )

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()

        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

        hs.stop()

        assert expected_sitemap_tree == actual_sitemap_tree
Example #14
def test_run_fetcher():
    db = connect_to_db()

    medium = create_test_medium(db=db, label='foo')
    feed = create_test_feed(db=db, label='foo', medium=medium)
    story = create_test_story(db=db, label='foo', feed=feed)

    port = random_unused_port()
    pages = {
        '/foo': 'foo',
        '/bar': 'bar',
    }

    hs = HashServer(port=port, pages=pages)
    hs.start()

    download = db.create(table='downloads',
                         insert_hash={
                             'state': 'pending',
                             'feeds_id': feed['feeds_id'],
                             'stories_id': story['stories_id'],
                             'type': 'content',
                             'sequence': 1,
                             'priority': 1,
                             'url': f"http://localhost:{port}/foo",
                             'host': 'localhost',
                         })

    db.query("""
        INSERT INTO queued_downloads (downloads_id)
        SELECT downloads_id FROM downloads
    """)

    run_fetcher(no_daemon=True)

    test_download = db.find_by_id(table='downloads',
                                  object_id=download['downloads_id'])
    assert test_download['state'] == 'success'
Example #15
def testRandomPort() -> None:
    """Test assigning a random port where port = 0."""

    hss = []
    for i in range(3):
        hs = HashServer(port=0, pages={'/foo': 'bar'})
        assert hs is not None

        hs.start()

        assert hs.port() >= START_RANDOM_PORT
        assert tcp_port_is_open(hs.port())
        assert str(requests.get(hs.page_url('/foo')).text) == 'bar'
        hss.append(hs)

    for hs in hss:
        hs.stop()
Example #16
    def test_sitemap_tree_for_homepage_no_robots_txt(self):
        """Test sitemap_tree_for_homepage() with no robots.txt."""

        pages = {
            '/': 'This is a homepage.',
        }

        # noinspection PyArgumentList
        expected_sitemap_tree = InvalidSitemap(
            url='{}/robots.txt'.format(self.__test_url),
            reason=(
                'Unable to fetch sitemap from {base_url}/robots.txt: 404 Not Found'
            ).format(base_url=self.__test_url),
        )

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()

        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

        hs.stop()

        assert expected_sitemap_tree == actual_sitemap_tree
Example #17
def testDelay() -> None:
    """Test the delay= parameter to hs.start."""
    hs = HashServer(port=0, pages={'/foo': 'bar'})

    hs.start(delay=1)
    caught_exception = False
    try:
        requests.get(hs.page_url('/foo'))
    except requests.exceptions.ConnectionError:
        caught_exception = True

    assert caught_exception

    time.sleep(2)
    assert str(requests.get(hs.page_url('/foo')).text) == 'bar'

    hs.stop()
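
The fixed time.sleep(2) above works but is timing-sensitive; polling the port is more robust. A sketch using tcp_port_is_open() as imported in the neighboring examples:

import time

def wait_until_up(hs: HashServer, timeout: float = 5.0, poll_interval: float = 0.1) -> None:
    """Poll the server's port until it accepts connections or the timeout expires."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if tcp_port_is_open(hs.port()):
            return
        time.sleep(poll_interval)
    raise TimeoutError(f"Server on port {hs.port()} did not come up within {timeout} seconds.")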
Example #18
    def test_all_url_variants_redirect_to_homepage(self):
        """Redirect to a homepage"""
        pages = {
            '/first': '<meta http-equiv="refresh" content="0; URL=/second%s" />' % self.CRUFT,
            '/second': '<meta http-equiv="refresh" content="0; URL=/',
        }
        hs = HashServer(port=self.TEST_HTTP_SERVER_PORT, pages=pages)
        hs.start()
        actual_url_variants = all_url_variants(db=self.db(), url=self.STARTING_URL)
        hs.stop()

        assert set(actual_url_variants) == {
            self.STARTING_URL,
            self.STARTING_URL_WITHOUT_CRUFT,
            '%s/second' % self.TEST_HTTP_SERVER_URL,
            '%s/second%s' % (self.TEST_HTTP_SERVER_URL, self.CRUFT,),
        }
Example #19
    def test_sitemap_tree_for_homepage_no_robots_txt(self):
        """Test sitemap_tree_for_homepage() with no robots.txt."""

        pages = {
            '/': 'This is a homepage.',
        }

        # noinspection PyArgumentList
        expected_sitemap_tree = InvalidSitemap(
            url='{}/robots.txt'.format(self.__test_url),
            reason=(
                'Unable to fetch sitemap from {base_url}/robots.txt: 404 Not Found'
            ).format(base_url=self.__test_url),
        )

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()

        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

        hs.stop()

        assert expected_sitemap_tree == actual_sitemap_tree
Example #20
    def test_sitemap_tree_for_homepage_gzip(self):
        """Test sitemap_tree_for_homepage() with gzipped sitemaps."""

        pages = {
            '/': 'This is a homepage.',

            '/robots.txt': {
                'header': 'Content-Type: text/plain',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap_1.gz
                        Sitemap: {base_url}/sitemap_2.dat
                    """.format(base_url=self.__test_url)).strip(),
            },

            # Gzipped sitemap without correct HTTP header but with .gz extension
            '/sitemap_1.gz': {
                'content': gzip(textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                        <url>
                            <loc>{base_url}/news/foo.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title>Foo &lt;foo&gt;</news:title>    <!-- HTML entity decoding -->
                            </news:news>
                        </url>
                    </urlset>
                """.format(
                    base_url=self.__test_url,
                    publication_name=self.TEST_PUBLICATION_NAME,
                    publication_language=self.TEST_PUBLICATION_LANGUAGE,
                    publication_date=self.TEST_DATE_STR,
                )).strip()),
            },

            # Gzipped sitemap with correct HTTP header but without .gz extension
            '/sitemap_2.dat': {
                'header': 'Content-Type: application/x-gzip',
                'content': gzip(textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                        <url>
                            <loc>{base_url}/news/baz.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title><![CDATA[BΔ…ΕΎ]]></news:title>    <!-- CDATA and UTF-8 -->
                            </news:news>
                        </url>
                    </urlset>
                """.format(
                    base_url=self.__test_url,
                    publication_name=self.TEST_PUBLICATION_NAME,
                    publication_language=self.TEST_PUBLICATION_LANGUAGE,
                    publication_date=self.TEST_DATE_STR,
                )).strip()),
            },
        }

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()

        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

        hs.stop()

        # Don't do an in-depth check, we just need to make sure that gunzip works
        assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
        assert len(actual_sitemap_tree.sub_sitemaps) == 2

        sitemap_1 = actual_sitemap_tree.sub_sitemaps[0]
        assert isinstance(sitemap_1, PagesXMLSitemap)
        assert len(sitemap_1.pages) == 1

        sitemap_2 = actual_sitemap_tree.sub_sitemaps[1]
        assert isinstance(sitemap_2, PagesXMLSitemap)
        assert len(sitemap_2.pages) == 1
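
The gzip() called above is a test helper that compresses a payload (and shadows the standard-library module name); a minimal equivalent sketch:

import gzip as _gzip
from typing import Union

def gzip(data: Union[str, bytes]) -> bytes:
    """Gzip a string or bytes payload for serving as compressed sitemap content."""
    if isinstance(data, str):
        data = data.encode('utf-8')
    return _gzip.compress(data)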
Example #21
    def test_sitemap_tree_for_homepage_plain_text(self):
        """Test sitemap_tree_for_homepage() with plain text sitemaps."""

        pages = {
            '/': 'This is a homepage.',

            '/robots.txt': {
                'header': 'Content-Type: text/plain',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap_1.txt
                        Sitemap: {base_url}/sitemap_2.txt.dat
                    """.format(base_url=self.__test_url)).strip(),
            },

            # Plain text uncompressed sitemap
            '/sitemap_1.txt': {
                'content': textwrap.dedent("""

                    {base_url}/news/foo.html


                    {base_url}/news/bar.html

                    Some other stuff which totally doesn't look like an URL
                """.format(base_url=self.__test_url)).strip(),
            },

            # Plain text compressed sitemap without .gz extension
            '/sitemap_2.txt.dat': {
                'header': 'Content-Type: application/x-gzip',
                'content': gzip(textwrap.dedent("""
                    {base_url}/news/bar.html
                        {base_url}/news/baz.html
                """.format(base_url=self.__test_url)).strip()),
            },
        }

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()

        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

        hs.stop()

        assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
        assert len(actual_sitemap_tree.sub_sitemaps) == 2

        sitemap_1 = actual_sitemap_tree.sub_sitemaps[0]
        assert isinstance(sitemap_1, PagesTextSitemap)
        assert len(sitemap_1.pages) == 2

        sitemap_2 = actual_sitemap_tree.sub_sitemaps[1]
        assert isinstance(sitemap_2, PagesTextSitemap)
        assert len(sitemap_2.pages) == 2

        pages = actual_sitemap_tree.all_pages()
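        # Four URLs are listed across the two sitemaps, but /news/bar.html appears
        # in both, so only three unique pages remain.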
        assert len(pages) == 3
        assert SitemapPage(url='{}/news/foo.html'.format(self.__test_url)) in pages
        assert SitemapPage(url='{}/news/bar.html'.format(self.__test_url)) in pages
        assert SitemapPage(url='{}/news/baz.html'.format(self.__test_url)) in pages
Example #22
    def setUp(self) -> None:
        super().setUp()

        pages = {
            '/feed': encode_json({
                'status': 'success',
                'data': {
                    'title': 'Sample Univision feed',
                    'totalItems': 2,
                    'items': [
                        {
                            'type': 'article',
                            'uid': '00000156-ba02-d374-ab77-feab13e20000',
                            'url': f"{self.URL}/first_article",
                            'publishDate': '2016-08-23T23:32:11-04:00',
                            'updateDate': '2016-08-24T10:09:26-04:00',
                            'title': 'First article: πŸ•',  # UTF-8 in the title
                            'description': 'This is the first Univision sample article.',
                        },
                        {
                            'type': 'article',
                            'uid': '00000156-ba73-d5b6-affe-faf77f890000',
                            'url': f"{self.URL}/second_article",
                            'publishDate': '2016-08-23T23:20:13-04:00',
                            'updateDate': '2016-08-24T09:55:40-04:00',
                            'title': 'Second article: πŸ”',  # UTF-8 in the title
                            'description': 'This is the second Univision sample article.',
                        },
                    ],
                },
            }),
            '/first_article': """
                <h1>First article</h1>
                <p>This is the first Univision sample article.</p>
            """,
            '/second_article': """
                <h1>Second article</h1>
                <p>This is the second Univision sample article.</p>
            """,
        }

        self.__hs = HashServer(port=self.PORT, pages=pages)
        self.__hs.start()
Example #23
class TestUnivisionLocal(AbstractUnivisionTest, TestCase):
    URL = None
    PORT = None

    __slots__ = [
        '__hs',
    ]

    @classmethod
    def setUpClass(cls) -> None:
        super().setUpClass()

        cls.PORT = random_unused_port()
        cls.URL = f'http://localhost:{cls.PORT}'

    @classmethod
    def univision_credentials(cls) -> Optional[UnivisionTestCredentials]:
        return UnivisionTestCredentials(
            url=f"{cls.URL}/feed",
            client_id='foo',
            client_secret='bar',
        )

    @classmethod
    def expect_to_find_some_stories(cls) -> bool:
        # Test feed always has stories
        return True

    def setUp(self) -> None:
        super().setUp()

        pages = {
            '/feed': encode_json({
                'status': 'success',
                'data': {
                    'title': 'Sample Univision feed',
                    'totalItems': 2,
                    'items': [
                        {
                            'type': 'article',
                            'uid': '00000156-ba02-d374-ab77-feab13e20000',
                            'url': f"{self.URL}/first_article",
                            'publishDate': '2016-08-23T23:32:11-04:00',
                            'updateDate': '2016-08-24T10:09:26-04:00',
                            'title': 'First article: πŸ•',  # UTF-8 in the title
                            'description': 'This is the first Univision sample article.',
                        },
                        {
                            'type': 'article',
                            'uid': '00000156-ba73-d5b6-affe-faf77f890000',
                            'url': f"{self.URL}/second_article",
                            'publishDate': '2016-08-23T23:20:13-04:00',
                            'updateDate': '2016-08-24T09:55:40-04:00',
                            'title': 'Second article: πŸ”',  # UTF-8 in the title
                            'description': 'This is the second Univision sample article.',
                        },
                    ],
                },
            }),
            '/first_article': """
                <h1>First article</h1>
                <p>This is the first Univision sample article.</p>
            """,
            '/second_article': """
                <h1>Second article</h1>
                <p>This is the second Univision sample article.</p>
            """,
        }

        self.__hs = HashServer(port=self.PORT, pages=pages)
        self.__hs.start()

    def tearDown(self) -> None:
        self.__hs.stop()
Example #24
    def test_nyt_labels_annotator(self):
        media = self.db().create(table='media',
                                 insert_hash={
                                     'name': "test medium",
                                     'url': "url://test/medium",
                                 })

        story = self.db().create(table='stories',
                                 insert_hash={
                                     'media_id': media['media_id'],
                                     'url': 'url://story/a',
                                     'guid': 'guid://story/a',
                                     'title': 'story a',
                                     'description': 'description a',
                                     'publish_date': sql_now(),
                                     'collect_date': sql_now(),
                                     'full_text_rss': True,
                                 })
        stories_id = story['stories_id']

        self.db().create(table='story_sentences',
                         insert_hash={
                             'stories_id': stories_id,
                             'sentence_number': 1,
                             'sentence': 'I hope that the CLIFF annotator is working.',
                             'media_id': media['media_id'],
                             'publish_date': sql_now(),
                             'language': 'en',
                         })

        def __nyt_labels_sample_response(
                _: HashServer.Request) -> Union[str, bytes]:
            """Mock annotator."""
            response = ""
            response += "HTTP/1.0 200 OK\r\n"
            response += "Content-Type: application/json; charset=UTF-8\r\n"
            response += "\r\n"
            response += encode_json(self.__sample_nyt_labels_response())
            return response

        pages = {
            '/predict.json': {
                'callback': __nyt_labels_sample_response,
            }
        }

        port = random_unused_port()
        annotator_url = 'http://localhost:%d/predict.json' % port

        hs = HashServer(port=port, pages=pages)
        hs.start()

        # Inject NYTLabels credentials into configuration
        config = py_get_config()
        new_config = copy.deepcopy(config)
        new_config['nytlabels'] = {
            'enabled': True,
            'annotator_url': annotator_url,
        }
        py_set_config(new_config)

        nytlabels = NYTLabelsAnnotator()
        nytlabels.annotate_and_store_for_story(db=self.db(),
                                               stories_id=stories_id)
        nytlabels.update_tags_for_story(db=self.db(), stories_id=stories_id)

        hs.stop()

        # Reset configuration
        py_set_config(config)

        annotation_exists = self.db().query(
            """
            SELECT 1
            FROM nytlabels_annotations
            WHERE object_id = %(object_id)s
        """, {
                'object_id': stories_id
            }).hash()
        assert annotation_exists is not None

        story_tags = self.db().query(
            """
            SELECT
                tags.tag AS tags_name,
                tags.label AS tags_label,
                tags.description AS tags_description,
                tag_sets.name AS tag_sets_name,
                tag_sets.label AS tag_sets_label,
                tag_sets.description AS tag_sets_description
            FROM stories_tags_map
                INNER JOIN tags
                    ON stories_tags_map.tags_id = tags.tags_id
                INNER JOIN tag_sets
                    ON tags.tag_sets_id = tag_sets.tag_sets_id
            WHERE stories_tags_map.stories_id = %(stories_id)s
            ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
        """, {
                'stories_id': stories_id
            }).hashes()

        expected_tags = self.__expected_tags()

        assert story_tags == expected_tags
Example #25
class TestDownloadHandler(TestCase, metaclass=abc.ABCMeta):
    __slots__ = [
        'db',
        'port',
        'media',
        'feed',
        '__hs',
    ]

    @abc.abstractmethod
    def hashserver_pages(self) -> Dict[str, Any]:
        """Return HashServer pages to serve."""
        raise NotImplemented("Abstract method")

    def _fetch_and_handle_response(
            self,
            path: str,
            downloads_id: Optional[int] = None) -> Dict[str, Any]:
        """Call the fetcher and handler on the given URL. Return the download passed to the fetcher and handler."""

        if downloads_id:
            download = self.db.find_by_id(table='downloads',
                                          object_id=downloads_id)
        else:
            download = self.db.create(
                table='downloads',
                insert_hash={
                    'url': f"http://localhost:{self.port}{path}",
                    'host': 'localhost',
                    'type': 'feed',
                    'state': 'pending',
                    'priority': 0,
                    'sequence': 1,
                    'feeds_id': self.feed['feeds_id'],
                })
            downloads_id = download['downloads_id']

        handler = handler_for_download(db=self.db, download=download)

        response = handler.fetch_download(db=self.db, download=download)
        assert response

        handler.store_response(db=self.db,
                               download=download,
                               response=response)

        download = self.db.find_by_id(table='downloads',
                                      object_id=downloads_id)

        return download

    def setUp(self) -> None:
        self.db = connect_to_db()

        self.port = random_unused_port()

        self.__hs = HashServer(port=self.port, pages=self.hashserver_pages())
        self.__hs.start()

        self.media = create_test_story_stack(db=self.db, data={'A': {'B': [1]}})
        self.feed = self.media['A']['feeds']['B']

    def tearDown(self) -> None:
        self.__hs.stop()
Example #26
    def test_sitemap_tree_for_homepage_prematurely_ending_xml(self):
        """Test sitemap_tree_for_homepage() with clipped XML.

        Some web servers are misconfigured to cap the request time at a certain number of seconds, within which the
        server is unable to generate and compress a 50 MB sitemap XML, so the response gets cut off mid-document.
        Google News doesn't seem to have a problem with such responses, so we have to support them too.
        """

        pages = {
            '/': 'This is a homepage.',
            '/robots.txt': {
                'header': 'Content-Type: text/plain',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap.xml
                    """.format(base_url=self.__test_url)).strip(),
            },
            '/sitemap.xml': {
                'content': textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                        <url>
                            <loc>{base_url}/news/first.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title>First story</news:title>
                            </news:news>
                        </url>
                        <url>
                            <loc>{base_url}/news/second.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title>Second story</news:title>
                            </news:news>
                        </url>

                        <!-- The following story shouldn't get added as the XML ends prematurely -->
                        <url>
                            <loc>{base_url}/news/third.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publicat
                """.format(
                    base_url=self.__test_url,
                    publication_name=self.TEST_PUBLICATION_NAME,
                    publication_language=self.TEST_PUBLICATION_LANGUAGE,
                    publication_date=self.TEST_DATE_STR,
                )).strip(),
            },
        }

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()

        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

        hs.stop()

        assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
        assert len(actual_sitemap_tree.sub_sitemaps) == 1

        sitemap = actual_sitemap_tree.sub_sitemaps[0]
        assert isinstance(sitemap, PagesXMLSitemap)
        assert len(sitemap.pages) == 2
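
Surviving clipped XML generally means salvaging everything the parser delivered before the point of failure. A sketch of the idea with a SAX handler that collects <loc> URLs and swallows the premature-end exception; this is not the sitemap library's actual implementation:

import xml.sax
from typing import List

class _LocCollector(xml.sax.handler.ContentHandler):
    """Collect <loc> URLs, tolerating XML that ends mid-document."""

    def __init__(self) -> None:
        super().__init__()
        self.urls: List[str] = []
        self._in_loc = False
        self._buf = ''

    def startElement(self, name, attrs):
        if name == 'loc':
            self._in_loc = True
            self._buf = ''

    def characters(self, content):
        if self._in_loc:
            self._buf += content

    def endElement(self, name):
        if name == 'loc':
            self.urls.append(self._buf.strip())
            self._in_loc = False

def loc_urls_from_clipped_xml(xml_text: str) -> List[str]:
    """Parse as far as the document goes; keep everything seen before the error."""
    handler = _LocCollector()
    try:
        xml.sax.parseString(xml_text.encode('utf-8'), handler)
    except xml.sax.SAXParseException:
        pass  # premature end of document: salvage what was already parsed
    return handler.urls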
Example #27
    def test_sitemap_tree_for_homepage_plain_text(self):
        """Test sitemap_tree_for_homepage() with plain text sitemaps."""

        pages = {
            '/': 'This is a homepage.',
            '/robots.txt': {
                'header': 'Content-Type: text/plain',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap_1.txt
                        Sitemap: {base_url}/sitemap_2.txt.dat
                    """.format(base_url=self.__test_url)).strip(),
            },

            # Plain text uncompressed sitemap
            '/sitemap_1.txt': {
                'content': textwrap.dedent("""

                    {base_url}/news/foo.html


                    {base_url}/news/bar.html

                    Some other stuff which totally doesn't look like an URL
                """.format(base_url=self.__test_url)).strip(),
            },

            # Plain text compressed sitemap without .gz extension
            '/sitemap_2.txt.dat': {
                'header': 'Content-Type: application/x-gzip',
                'content': gzip(textwrap.dedent("""
                    {base_url}/news/bar.html
                        {base_url}/news/baz.html
                """.format(base_url=self.__test_url)).strip()),
            },
        }

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()

        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

        hs.stop()

        assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
        assert len(actual_sitemap_tree.sub_sitemaps) == 2

        sitemap_1 = actual_sitemap_tree.sub_sitemaps[0]
        assert isinstance(sitemap_1, PagesTextSitemap)
        assert len(sitemap_1.pages) == 2

        sitemap_2 = actual_sitemap_tree.sub_sitemaps[1]
        assert isinstance(sitemap_2, PagesTextSitemap)
        assert len(sitemap_2.pages) == 2

        pages = actual_sitemap_tree.all_pages()
        assert len(pages) == 3
        assert SitemapPage(url='{}/news/foo.html'.format(self.__test_url)) in pages
        assert SitemapPage(url='{}/news/bar.html'.format(self.__test_url)) in pages
        assert SitemapPage(url='{}/news/baz.html'.format(self.__test_url)) in pages
Example #28
    def test_tagging(self):
        db = connect_to_db()

        media = db.create(table='media',
                          insert_hash={
                              'name': "test medium",
                              'url': "url://test/medium",
                          })

        story = db.create(table='stories',
                          insert_hash={
                              'media_id': media['media_id'],
                              'url': 'url://story/a',
                              'guid': 'guid://story/a',
                              'title': 'story a',
                              'description': 'description a',
                              'publish_date': sql_now(),
                              'collect_date': sql_now(),
                              'full_text_rss': True,
                          })
        stories_id = story['stories_id']

        db.create(table='story_sentences',
                  insert_hash={
                      'stories_id': stories_id,
                      'sentence_number': 1,
                      'sentence': 'I hope that the CLIFF annotator is working.',
                      'media_id': media['media_id'],
                      'publish_date': sql_now(),
                      'language': 'en',
                  })

        def __cliff_sample_response(
                _: HashServer.Request) -> Union[str, bytes]:
            """Mock annotator."""
            response = ""
            response += "HTTP/1.0 200 OK\r\n"
            response += "Content-Type: application/json; charset=UTF-8\r\n"
            response += "\r\n"
            response += encode_json(sample_cliff_response())
            return response

        pages = {
            '/cliff/parse/text': {
                'callback': __cliff_sample_response,
            }
        }

        port = random_unused_port()
        annotator_url = 'http://localhost:%d/cliff/parse/text' % port

        hs = HashServer(port=port, pages=pages)
        hs.start()

        class TestCLIFFFetcherConfig(CLIFFTagsFromAnnotationConfig):
            @staticmethod
            def annotator_url() -> str:
                return annotator_url

        cliff = CLIFFTagsFromAnnotation(tagger_config=TestCLIFFFetcherConfig())
        cliff.update_tags_for_story(db=db, stories_id=stories_id)

        hs.stop()

        story_tags = db.query(
            """
            SELECT
                tags.tag AS tags_name,
                tags.label AS tags_label,
                tags.description AS tags_description,
                tag_sets.name AS tag_sets_name,
                tag_sets.label AS tag_sets_label,
                tag_sets.description AS tag_sets_description
            FROM stories_tags_map
                INNER JOIN tags
                    ON stories_tags_map.tags_id = tags.tags_id
                INNER JOIN tag_sets
                    ON tags.tag_sets_id = tag_sets.tag_sets_id
            WHERE stories_tags_map.stories_id = %(stories_id)s
            ORDER BY
                lower(tag_sets.name),
                lower(tags.tag)
        """, {
                'stories_id': stories_id
            }).hashes()

        expected_tags = expected_cliff_tags()

        assert story_tags == expected_tags
Example #29
    def test_sitemap_tree_for_homepage_huge_sitemap(self):
        """Test sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling)."""

        page_count = 1000

        sitemap_xml = """<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                    xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
                    xmlns:xhtml="http://www.w3.org/1999/xhtml">
        """
        for x in range(page_count):
            sitemap_xml += """
                <url>
                    <loc>{base_url}/news/page_{x}.html</loc>

                    <!-- Element present but empty -->
                    <lastmod />

                    <!-- Some other XML namespace -->
                    <xhtml:link rel="alternate"
                                media="only screen and (max-width: 640px)"
                                href="{base_url}/news/page_{x}.html?mobile=1" />

                    <news:news>
                        <news:publication>
                            <news:name>{publication_name}</news:name>
                            <news:language>{publication_language}</news:language>
                        </news:publication>
                        <news:publication_date>{publication_date}</news:publication_date>
                        <news:title>Foo &lt;foo&gt;</news:title>    <!-- HTML entity decoding -->
                    </news:news>
                </url>
            """.format(
                x=x,
                base_url=self.__test_url,
                publication_name=self.TEST_PUBLICATION_NAME,
                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                publication_date=self.TEST_DATE_STR,
            )

        sitemap_xml += "</urlset>"

        pages = {
            '/': 'This is a homepage.',

            '/robots.txt': {
                'header': 'Content-Type: text/plain',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap.xml.gz
                    """.format(base_url=self.__test_url)).strip(),
            },

            '/sitemap.xml.gz': {
                'header': 'Content-Type: application/x-gzip',
                'content': gzip(sitemap_xml),
            },
        }

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()

        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

        hs.stop()

        assert len(actual_sitemap_tree.all_pages()) == page_count
Example #30
    def setUp(self) -> None:
        super().setUp()

        self.db = connect_to_db()

        test_medium = create_test_medium(db=self.db, label='test')
        test_feed = create_test_feed(db=self.db,
                                     label='test',
                                     medium=test_medium)

        # Add a story with a random ID to decrease the chance that the object in GCS
        # will collide with one from another test running at the same time
        self.stories_id = random.randint(1, 2147483647 - 1)

        self.db.query(
            """
            INSERT INTO stories (
                stories_id,
                media_id,
                url,
                guid,
                title,
                description,
                publish_date,
                collect_date,
                full_text_rss
            ) VALUES (
                %(stories_id)s,
                %(media_id)s,
                'http://story.test/',
                'guid://story.test/',
                'story',
                'description',
                '2016-10-15 08:00:00',
                '2016-10-15 10:00:00',
                true
            )
        """, {
                'stories_id': self.stories_id,
                'media_id': test_feed['media_id'],
            })

        # Create missing partitions for "feeds_stories_map"
        self.db.query('SELECT create_missing_partitions()')

        self.db.create(table='feeds_stories_map',
                       insert_hash={
                           'feeds_id': int(test_feed['feeds_id']),
                           'stories_id': self.stories_id,
                       })

        assert os.path.isfile(self.input_media_path()), \
            f"Test media file '{self.input_media_path()}' should exist."

        with open(self.input_media_path(), mode='rb') as f:
            test_data = f.read()

        # noinspection PyUnusedLocal
        def __media_callback(request: HashServer.Request) -> Union[str, bytes]:
            response = "".encode('utf-8')
            response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
            response += f"Content-Type: {self.input_media_mime_type()}\r\n".encode(
                'utf-8')
            response += f"Content-Length: {len(test_data)}\r\n".encode('utf-8')
            response += "\r\n".encode('utf-8')
            response += test_data
            return response

        port = 8080  # Port exposed on docker-compose.tests.yml
        media_path = '/test_media_file'
        pages = {
            media_path: {
                'callback': __media_callback,
            }
        }

        self.hs = HashServer(port=port, pages=pages)
        self.hs.start()

        # Using our hostname as it will be another container that will be connecting to us
        media_url = f'http://{socket.gethostname()}:{port}{media_path}'

        self.db.insert(table='story_enclosures',
                       insert_hash={
                           'stories_id': self.stories_id,
                           'url': media_url,
                           'mime_type': self.input_media_mime_type(),
                           'length': len(test_data),
                       })

        # Add a "podcast-fetch-episode" job
        JobBroker(queue_name='MediaWords::Job::Podcast::FetchEpisode').add_to_queue(
            stories_id=self.stories_id,
        )

        total_time = int(self.retries_per_step() * self.seconds_between_retries())

        # Wait for "podcast-fetch-episode" to transcode, upload to Google Storage, and write it to "podcast_episodes"
        episodes = None
        for x in range(1, self.retries_per_step() + 1):
            log.info(f"Waiting for episode to appear (#{x})...")

            episodes = self.db.select(table='podcast_episodes',
                                      what_to_select='*').hashes()
            if episodes:
                log.info(f"Episode is here!")
                break

            time.sleep(self.seconds_between_retries())

        assert episodes, f"Episode didn't show up in {total_time} seconds."

        # Wait for "podcast-submit-operation" to submit Speech API operation
        self.transcript_fetches = None
        for x in range(1, self.retries_per_step() + 1):
            log.info(f"Waiting for transcript fetch to appear (#{x})...")

            self.transcript_fetches = self.db.select(
                table='podcast_episode_transcript_fetches',
                what_to_select='*').hashes()

            if self.transcript_fetches:
                log.info(f"Transcript fetch is here!")
                break

            time.sleep(self.seconds_between_retries())

        assert self.transcript_fetches, f"Operation didn't show up in {total_time} seconds."
Example #31
def test_http_hash_server():
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    def __simple_callback(request: HashServer.Request) -> Union[str, bytes]:
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback',
            'method': request.method(),
            'url': request.url(),
            'content-type': request.content_type(),
            'params': request.query_params(),
            'cookies': request.cookies(),
        })
        return str.encode(r)

    # noinspection PyUnusedLocal
    def __callback_cookie_redirect(request: HashServer.Request) -> str:
        r = ""
        r += "HTTP/1.0 302 Moved Temporarily\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "Location: /check_cookie\r\n"
        r += "Set-Cookie: test_cookie=I'm a cookie and I know it!\r\n"
        r += "\r\n"
        r += "Redirecting to the cookie check page..."
        return r

    def __callback_post(request: HashServer.Request) -> Union[str, bytes]:
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback_post',
            'post_data': request.content(),
        })
        return str.encode(r)

    pages = {
        '/': 'home',
        '/foo': b'foo',
        '/bar': 'bar Δ…Δ…',
        '/foo-bar': {b'redirect': b'/bar'},
        '/localhost': {'redirect': "http://localhost:%d/" % port},
        b'/127-foo': {b'redirect': "http://127.0.0.1:%d/foo" % port},
        '/auth': {b'auth': b'foo:bar', b'content': b"foo bar \xf0\x90\x28\xbc"},
        '/404': {b'content': b'not found', b'http_status_code': 404},
        '/callback': {b'callback': __simple_callback},

        # Test setting cookies, redirects
        '/callback_cookie_redirect': {'callback': __callback_cookie_redirect},

        # POST data
        '/callback_post': {'callback': __callback_post},
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()

    assert tcp_port_is_open(port=port)

    assert str(requests.get('%s/' % base_url).text) == 'home'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar' % base_url).text) == 'bar Δ…Δ…'
    assert str(requests.get('%s/foo-bar' % base_url).text) == 'bar Δ…Δ…'
    assert str(requests.get('%s/localhost' % base_url).text) == 'home'
    assert str(requests.get('%s/127-foo' % base_url).text) == 'foo'

    # Path normalization
    assert str(requests.get('%s//' % base_url).text) == 'home'
    assert str(requests.get('%s///' % base_url).text) == 'home'
    assert str(requests.get('%s/something/../' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..//' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..///' % base_url).text) == 'home'
    assert str(requests.get('%s/foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo///' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo///' % base_url).text) == 'foo'

    response_json = requests.get('%s/callback?a=b&c=d' % base_url, cookies={'cookie_name': 'cookie_value'}).json()
    assert response_json == {
        'name': 'callback',
        'method': 'GET',
        'url': 'http://localhost:%d/callback?a=b&c=d' % port,
        'content-type': None,
        'params': {
            'a': 'b',
            'c': 'd',
        },
        'cookies': {
            'cookie_name': 'cookie_value',
        },
    }

    response = requests.get('%s/callback_cookie_redirect' % base_url, allow_redirects=False)
    assert response.status_code == 302
    assert response.headers['Location'] == '/check_cookie'

    response = requests.get("%s/404" % base_url)
    assert response.status_code == HTTPStatus.NOT_FOUND.value
    assert 'Not Found' in response.reason

    auth_url = "%s/auth" % base_url

    assert requests.get(auth_url).status_code == HTTPStatus.UNAUTHORIZED
    assert requests.get(auth_url, auth=('foo', 'foo')).status_code == HTTPStatus.UNAUTHORIZED

    response = requests.get(auth_url, auth=('foo', 'bar'))
    assert response.status_code == HTTPStatus.OK
    assert response.content == b"foo bar \xf0\x90\x28\xbc"

    assert urls_are_equal(url1=hs.page_url('/callback?a=b&c=d'), url2='http://localhost:%d/callback' % port)
    with pytest.raises(McHashServerException):
        hs.page_url('/does-not-exist')

    response_json = requests.post('%s/callback_post' % base_url, data='abc=def').json()
    assert response_json == {
        'name': 'callback_post',
        'post_data': 'abc=def',
    }

    hs.stop()
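
The raw-response callbacks above assemble the HTTP framing by hand: a status line, headers, a blank line, then the body. If many such callbacks are needed, a small helper along these lines (hypothetical, not part of HashServer) keeps them declarative:

from typing import Dict, Optional, Union

def raw_response(body: Union[str, bytes],
                 status: str = '200 OK',
                 headers: Optional[Dict[str, str]] = None) -> bytes:
    """Assemble a raw HTTP/1.0 response: status line, headers, blank line, body."""
    if headers is None:
        headers = {'Content-Type': 'text/html; charset=UTF-8'}
    head = f"HTTP/1.0 {status}\r\n"
    head += ''.join(f"{name}: {value}\r\n" for name, value in headers.items())
    head += "\r\n"
    body_bytes = body.encode('utf-8') if isinstance(body, str) else body
    return head.encode('utf-8') + body_bytes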
Example #32
    def test_sitemap_tree_for_homepage_gzip(self):
        """Test sitemap_tree_for_homepage() with gzipped sitemaps."""

        pages = {
            '/': 'This is a homepage.',
            '/robots.txt': {
                'header': 'Content-Type: text/plain',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap_1.gz
                        Sitemap: {base_url}/sitemap_2.dat
                    """.format(base_url=self.__test_url)).strip(),
            },

            # Gzipped sitemap without correct HTTP header but with .gz extension
            '/sitemap_1.gz': {
                'content': gzip(textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                        <url>
                            <loc>{base_url}/news/foo.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title>Foo &lt;foo&gt;</news:title>    <!-- HTML entity decoding -->
                            </news:news>
                        </url>
                    </urlset>
                """.format(
                        base_url=self.__test_url,
                        publication_name=self.TEST_PUBLICATION_NAME,
                        publication_language=self.TEST_PUBLICATION_LANGUAGE,
                        publication_date=self.TEST_DATE_STR,
                    )).strip()),
            },

            # Gzipped sitemap with correct HTTP header but without .gz extension
            '/sitemap_2.dat': {
                'header': 'Content-Type: application/x-gzip',
                'content': gzip(textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                        <url>
                            <loc>{base_url}/news/baz.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title><![CDATA[Bąž]]></news:title>    <!-- CDATA and UTF-8 -->
                            </news:news>
                        </url>
                    </urlset>
                """.format(
                        base_url=self.__test_url,
                        publication_name=self.TEST_PUBLICATION_NAME,
                        publication_language=self.TEST_PUBLICATION_LANGUAGE,
                        publication_date=self.TEST_DATE_STR,
                    )).strip()),
            },
        }

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()

        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

        hs.stop()

        # Don't do an in-depth check; we just need to make sure that gunzip works
        assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
        assert len(actual_sitemap_tree.sub_sitemaps) == 2

        sitemap_1 = actual_sitemap_tree.sub_sitemaps[0]
        assert isinstance(sitemap_1, PagesXMLSitemap)
        assert len(sitemap_1.pages) == 1

        sitemap_2 = actual_sitemap_tree.sub_sitemaps[1]
        assert isinstance(sitemap_2, PagesXMLSitemap)
        assert len(sitemap_2.pages) == 1
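
The two fixtures above cover complementary detection paths: a gzipped body served without the right Content-Type, and a gzip Content-Type on a URL without the .gz extension. Either way, sniffing the payload itself is the reliable signal. A minimal sketch using the stdlib gzip module (not the test's gzip() compression helper):

import gzip

GZIP_MAGIC = b'\x1f\x8b'

def maybe_gunzip(raw: bytes) -> bytes:
    """Decompress raw if it starts with the gzip magic number; return it unchanged otherwise."""
    if raw[:2] == GZIP_MAGIC:
        return gzip.decompress(raw)
    return raw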
Example #33
    def test_sitemap_tree_for_homepage_prematurely_ending_xml(self):
        """Test sitemap_tree_for_homepage() with clipped XML.

        Some webservers are misconfigured to cap request execution time at a certain number of seconds, within which
        the server is unable to generate and compress a 50 MB sitemap XML, so the response gets clipped mid-document.
        Google News doesn't seem to have a problem with this behavior, so we have to support it too.
        """

        pages = {
            '/': 'This is a homepage.',

            '/robots.txt': {
                'header': 'Content-Type: text/plain',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap.xml
                    """.format(base_url=self.__test_url)).strip(),
            },

            '/sitemap.xml': {
                'content': textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                        <url>
                            <loc>{base_url}/news/first.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title>First story</news:title>
                            </news:news>
                        </url>
                        <url>
                            <loc>{base_url}/news/second.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title>Second story</news:title>
                            </news:news>
                        </url>

                        <!-- The following story shouldn't get added as the XML ends prematurely -->
                        <url>
                            <loc>{base_url}/news/third.html</loc>
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publicat
                """.format(
                    base_url=self.__test_url,
                    publication_name=self.TEST_PUBLICATION_NAME,
                    publication_language=self.TEST_PUBLICATION_LANGUAGE,
                    publication_date=self.TEST_DATE_STR,
                )).strip(),
            },
        }

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()

        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

        hs.stop()

        assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
        assert len(actual_sitemap_tree.sub_sitemaps) == 1

        sitemap = actual_sitemap_tree.sub_sitemaps[0]
        assert isinstance(sitemap, PagesXMLSitemap)
        assert len(sitemap.pages) == 2
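
Tolerating such truncation boils down to streaming the XML through an event-based parser and swallowing the final parse error, keeping every record that completed before the cutoff. A minimal sketch of the technique with the stdlib expat parser (an illustration, not the parser this test exercises):

import xml.parsers.expat
from typing import List

def locs_from_clipped_xml(xml_text: str) -> List[str]:
    """Collect <loc> values from a possibly truncated sitemap XML document."""
    locs: List[str] = []
    chars: List[str] = []
    in_loc = False

    def start(name, _attrs):
        nonlocal in_loc
        if name.split(':')[-1] == 'loc':
            in_loc = True
            chars.clear()

    def end(name):
        nonlocal in_loc
        if name.split(':')[-1] == 'loc':
            locs.append(''.join(chars).strip())
            in_loc = False

    def cdata(data):
        if in_loc:
            chars.append(data)

    parser = xml.parsers.expat.ParserCreate()
    parser.StartElementHandler = start
    parser.EndElementHandler = end
    parser.CharacterDataHandler = cdata
    try:
        parser.Parse(xml_text, True)
    except xml.parsers.expat.ExpatError:
        pass  # the XML ended prematurely; keep what was parsed so far
    return locs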
Example #34
    def test_sitemap_tree_for_homepage(self):
        """Test sitemap_tree_for_homepage()."""

        pages = {
            '/': 'This is a homepage.',
            '/robots.txt': {
                'header': 'Content-Type: text/plain',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap_pages.xml
                        Sitemap: {base_url}/sitemap_news_index_1.xml
                    """.format(base_url=self.__test_url)).strip(),
            },

            # One sitemap for random static pages
            '/sitemap_pages.xml': {
                'header': 'Content-Type: application/xml',
                'content': textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                        <url>
                            <loc>{base_url}/about.html</loc>
                            <lastmod>{last_modified_date}</lastmod>
                            <changefreq>monthly</changefreq>
                            <priority>0.8</priority>
                        </url>
                        <url>
                            <loc>{base_url}/contact.html</loc>
                            <lastmod>{last_modified_date}</lastmod>

                            <!-- Invalid change frequency -->
                            <changefreq>when we feel like it</changefreq>

                            <!-- Invalid priority -->
                            <priority>1.1</priority>

                        </url>
                    </urlset>
                """.format(base_url=self.__test_url,
                           last_modified_date=self.TEST_DATE_STR)).strip(),
            },

            # Index sitemap pointing to sitemaps with stories
            '/sitemap_news_index_1.xml': {
                'header': 'Content-Type: application/xml',
                'content': textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                        <sitemap>
                            <loc>{base_url}/sitemap_news_1.xml</loc>
                            <lastmod>{last_modified}</lastmod>
                        </sitemap>
                        <sitemap>
                            <loc>{base_url}/sitemap_news_index_2.xml</loc>
                            <lastmod>{last_modified}</lastmod>
                        </sitemap>
                    </sitemapindex>
                """.format(base_url=self.__test_url,
                           last_modified=self.TEST_DATE_STR)).strip(),
            },

            # First sitemap with actual stories
            '/sitemap_news_1.xml': {
                'header': 'Content-Type: application/xml',
                'content': textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
                            xmlns:xhtml="http://www.w3.org/1999/xhtml">

                        <url>
                            <loc>{base_url}/news/foo.html</loc>

                            <!-- Element present but empty -->
                            <lastmod />

                            <!-- Some other XML namespace -->
                            <xhtml:link rel="alternate"
                                        media="only screen and (max-width: 640px)"
                                        href="{base_url}/news/foo.html?mobile=1" />

                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title>Foo &lt;foo&gt;</news:title>    <!-- HTML entity decoding -->
                            </news:news>
                        </url>

                        <!-- Has a duplicate story in /sitemap_news_2.xml -->
                        <url>
                            <loc>{base_url}/news/bar.html</loc>
                            <xhtml:link rel="alternate"
                                        media="only screen and (max-width: 640px)"
                                        href="{base_url}/news/bar.html?mobile=1" />
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title>Bar &amp; bar</news:title>
                            </news:news>
                        </url>

                    </urlset>
                """.format(
                    base_url=self.__test_url,
                    publication_name=self.TEST_PUBLICATION_NAME,
                    publication_language=self.TEST_PUBLICATION_LANGUAGE,
                    publication_date=self.TEST_DATE_STR,
                )).strip(),
            },

            # Another index sitemap pointing to a second sitemap with stories
            '/sitemap_news_index_2.xml': {
                'header': 'Content-Type: application/xml',
                'content': textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">

                        <sitemap>
                            <!-- Extra whitespace added around URL -->
                            <loc>  {base_url}/sitemap_news_2.xml  </loc>
                            <lastmod>{last_modified}</lastmod>
                        </sitemap>

                        <!-- Nonexistent sitemap -->
                        <sitemap>
                            <loc>{base_url}/sitemap_news_nonexistent.xml</loc>
                            <lastmod>{last_modified}</lastmod>
                        </sitemap>

                    </sitemapindex>
                """.format(base_url=self.__test_url,
                           last_modified=self.TEST_DATE_STR)).strip(),
            },

            # Second sitemap with actual stories
            '/sitemap_news_2.xml': {
                'header': 'Content-Type: application/xml',
                'content': textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
                            xmlns:xhtml="http://www.w3.org/1999/xhtml">

                        <!-- Has a duplicate story in /sitemap_news_1.xml -->
                        <url>
                            <!-- Extra whitespace added around URL -->
                            <loc>  {base_url}/news/bar.html#fragment_is_to_be_removed  </loc>
                            <xhtml:link rel="alternate"
                                        media="only screen and (max-width: 640px)"
                                        href="{base_url}/news/bar.html?mobile=1#fragment_is_to_be_removed" />
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>

                                <tag_without_inner_character_data name="value" />

                                <news:title>Bar &amp; bar</news:title>
                            </news:news>
                        </url>

                        <url>
                            <loc>{base_url}/news/baz.html</loc>
                            <xhtml:link rel="alternate"
                                        media="only screen and (max-width: 640px)"
                                        href="{base_url}/news/baz.html?mobile=1" />
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title><![CDATA[Bąž]]></news:title>    <!-- CDATA and UTF-8 -->
                            </news:news>
                        </url>

                    </urlset>
                """.format(
                    base_url=self.__test_url,
                    publication_name=self.TEST_PUBLICATION_NAME,
                    publication_language=self.TEST_PUBLICATION_LANGUAGE,
                    publication_date=self.TEST_DATE_STR,
                )).strip(),
            },
        }

        # noinspection PyArgumentList
        expected_sitemap_tree = IndexRobotsTxtSitemap(
            url='{}/robots.txt'.format(self.__test_url),
            sub_sitemaps=[
                PagesXMLSitemap(
                    url='{}/sitemap_pages.xml'.format(self.__test_url),
                    pages=[
                        SitemapPage(
                            url='{}/about.html'.format(self.__test_url),
                            last_modified=self.TEST_DATE_DATETIME,
                            news_story=None,
                            change_frequency=SitemapPageChangeFrequency.MONTHLY,
                            priority=Decimal('0.8'),
                        ),
                        SitemapPage(
                            url='{}/contact.html'.format(self.__test_url),
                            last_modified=self.TEST_DATE_DATETIME,
                            news_story=None,

                            # Invalid input -- should be reset to "always"
                            change_frequency=SitemapPageChangeFrequency.ALWAYS,

                            # Invalid input -- should be reset to 0.5 (the default as per the spec)
                            priority=Decimal('0.5'),
                        )
                    ],
                ),
                IndexXMLSitemap(
                    url='{}/sitemap_news_index_1.xml'.format(self.__test_url),
                    sub_sitemaps=[
                        PagesXMLSitemap(
                            url='{}/sitemap_news_1.xml'.format(self.__test_url),
                            pages=[
                                SitemapPage(
                                    url='{}/news/foo.html'.format(self.__test_url),
                                    news_story=SitemapNewsStory(
                                        title='Foo <foo>',
                                        publish_date=self.TEST_DATE_DATETIME,
                                        publication_name=self.TEST_PUBLICATION_NAME,
                                        publication_language=self.TEST_PUBLICATION_LANGUAGE,
                                    ),
                                ),
                                SitemapPage(
                                    url='{}/news/bar.html'.format(self.__test_url),
                                    news_story=SitemapNewsStory(
                                        title='Bar & bar',
                                        publish_date=self.TEST_DATE_DATETIME,
                                        publication_name=self.TEST_PUBLICATION_NAME,
                                        publication_language=self.TEST_PUBLICATION_LANGUAGE,
                                    ),
                                ),
                            ],
                        ),
                        IndexXMLSitemap(
                            url='{}/sitemap_news_index_2.xml'.format(self.__test_url),
                            sub_sitemaps=[
                                PagesXMLSitemap(
                                    url='{}/sitemap_news_2.xml'.format(self.__test_url),
                                    pages=[
                                        SitemapPage(
                                            url='{}/news/bar.html'.format(self.__test_url),
                                            news_story=SitemapNewsStory(
                                                title='Bar & bar',
                                                publish_date=self.TEST_DATE_DATETIME,
                                                publication_name=self.TEST_PUBLICATION_NAME,
                                                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                                            ),
                                        ),
                                        SitemapPage(
                                            url='{}/news/baz.html'.format(self.__test_url),
                                            news_story=SitemapNewsStory(
                                                title='Bąž',
                                                publish_date=self.TEST_DATE_DATETIME,
                                                publication_name=self.TEST_PUBLICATION_NAME,
                                                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                                            ),
                                        ),
                                    ],
                                ),
                                InvalidSitemap(
                                    url='{}/sitemap_news_nonexistent.xml'.format(self.__test_url),
                                    reason=(
                                        'Unable to fetch sitemap from {base_url}/sitemap_news_nonexistent.xml: '
                                        '404 Not Found'
                                    ).format(base_url=self.__test_url),
                                ),
                            ],
                        ),
                    ],
                ),
            ],
        )

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()

        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

        hs.stop()

        # PyCharm is not that great at formatting object diffs, so uncomment the following and set a breakpoint:
        #
        # expected_lines = str(expected_sitemap_tree).split()
        # actual_lines = str(actual_sitemap_tree).split()
        # diff = difflib.ndiff(expected_lines, actual_lines)
        # diff_str = '\n'.join(diff)
        # assert expected_lines == actual_lines

        assert expected_sitemap_tree == actual_sitemap_tree

        assert len(actual_sitemap_tree.all_pages()) == 5
Example #35
    def test_sitemap_tree_for_homepage(self):
        """Test sitemap_tree_for_homepage()."""

        pages = {
            '/': 'This is a homepage.',

            '/robots.txt': {
                'header': 'Content-Type: text/plain',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap_pages.xml
                        Sitemap: {base_url}/sitemap_news_index_1.xml
                    """.format(base_url=self.__test_url)).strip(),
            },

            # One sitemap for random static pages
            '/sitemap_pages.xml': {
                'header': 'Content-Type: application/xml',
                'content': textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                        <url>
                            <loc>{base_url}/about.html</loc>
                            <lastmod>{last_modified_date}</lastmod>
                            <changefreq>monthly</changefreq>
                            <priority>0.8</priority>
                        </url>
                        <url>
                            <loc>{base_url}/contact.html</loc>
                            <lastmod>{last_modified_date}</lastmod>

                            <!-- Invalid change frequency -->
                            <changefreq>when we feel like it</changefreq>

                            <!-- Invalid priority -->
                            <priority>1.1</priority>

                        </url>
                    </urlset>
                """.format(base_url=self.__test_url, last_modified_date=self.TEST_DATE_STR)).strip(),
            },

            # Index sitemap pointing to sitemaps with stories
            '/sitemap_news_index_1.xml': {
                'header': 'Content-Type: application/xml',
                'content': textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                        <sitemap>
                            <loc>{base_url}/sitemap_news_1.xml</loc>
                            <lastmod>{last_modified}</lastmod>
                        </sitemap>
                        <sitemap>
                            <loc>{base_url}/sitemap_news_index_2.xml</loc>
                            <lastmod>{last_modified}</lastmod>
                        </sitemap>
                    </sitemapindex>
                """.format(base_url=self.__test_url, last_modified=self.TEST_DATE_STR)).strip(),
            },

            # First sitemap with actual stories
            '/sitemap_news_1.xml': {
                'header': 'Content-Type: application/xml',
                'content': textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
                            xmlns:xhtml="http://www.w3.org/1999/xhtml">

                        <url>
                            <loc>{base_url}/news/foo.html</loc>

                            <!-- Element present but empty -->
                            <lastmod />

                            <!-- Some other XML namespace -->
                            <xhtml:link rel="alternate"
                                        media="only screen and (max-width: 640px)"
                                        href="{base_url}/news/foo.html?mobile=1" />

                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title>Foo &lt;foo&gt;</news:title>    <!-- HTML entity decoding -->
                            </news:news>
                        </url>

                        <!-- Has a duplicate story in /sitemap_news_2.xml -->
                        <url>
                            <loc>{base_url}/news/bar.html</loc>
                            <xhtml:link rel="alternate"
                                        media="only screen and (max-width: 640px)"
                                        href="{base_url}/news/bar.html?mobile=1" />
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title>Bar &amp; bar</news:title>
                            </news:news>
                        </url>

                    </urlset>
                """.format(
                    base_url=self.__test_url,
                    publication_name=self.TEST_PUBLICATION_NAME,
                    publication_language=self.TEST_PUBLICATION_LANGUAGE,
                    publication_date=self.TEST_DATE_STR,
                )).strip(),
            },

            # Another index sitemap pointing to a second sitemap with stories
            '/sitemap_news_index_2.xml': {
                'header': 'Content-Type: application/xml',
                'content': textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">

                        <sitemap>
                            <!-- Extra whitespace added around URL -->
                            <loc>  {base_url}/sitemap_news_2.xml  </loc>
                            <lastmod>{last_modified}</lastmod>
                        </sitemap>

                        <!-- Nonexistent sitemap -->
                        <sitemap>
                            <loc>{base_url}/sitemap_news_nonexistent.xml</loc>
                            <lastmod>{last_modified}</lastmod>
                        </sitemap>

                    </sitemapindex>
                """.format(base_url=self.__test_url, last_modified=self.TEST_DATE_STR)).strip(),
            },

            # Second sitemap with actual stories
            '/sitemap_news_2.xml': {
                'header': 'Content-Type: application/xml',
                'content': textwrap.dedent("""
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                            xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
                            xmlns:xhtml="http://www.w3.org/1999/xhtml">

                        <!-- Has a duplicate story in /sitemap_news_1.xml -->
                        <url>
                            <!-- Extra whitespace added around URL -->
                            <loc>  {base_url}/news/bar.html#fragment_is_to_be_removed  </loc>
                            <xhtml:link rel="alternate"
                                        media="only screen and (max-width: 640px)"
                                        href="{base_url}/news/bar.html?mobile=1#fragment_is_to_be_removed" />
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>

                                <tag_without_inner_character_data name="value" />

                                <news:title>Bar &amp; bar</news:title>
                            </news:news>
                        </url>

                        <url>
                            <loc>{base_url}/news/baz.html</loc>
                            <xhtml:link rel="alternate"
                                        media="only screen and (max-width: 640px)"
                                        href="{base_url}/news/baz.html?mobile=1" />
                            <news:news>
                                <news:publication>
                                    <news:name>{publication_name}</news:name>
                                    <news:language>{publication_language}</news:language>
                                </news:publication>
                                <news:publication_date>{publication_date}</news:publication_date>
                                <news:title><![CDATA[Bąž]]></news:title>    <!-- CDATA and UTF-8 -->
                            </news:news>
                        </url>

                    </urlset>
                """.format(
                    base_url=self.__test_url,
                    publication_name=self.TEST_PUBLICATION_NAME,
                    publication_language=self.TEST_PUBLICATION_LANGUAGE,
                    publication_date=self.TEST_DATE_STR,
                )).strip(),
            },
        }

        # noinspection PyArgumentList
        expected_sitemap_tree = IndexRobotsTxtSitemap(
            url='{}/robots.txt'.format(self.__test_url),
            sub_sitemaps=[
                PagesXMLSitemap(
                    url='{}/sitemap_pages.xml'.format(self.__test_url),
                    pages=[
                        SitemapPage(
                            url='{}/about.html'.format(self.__test_url),
                            last_modified=self.TEST_DATE_DATETIME,
                            news_story=None,
                            change_frequency=SitemapPageChangeFrequency.MONTHLY,
                            priority=Decimal('0.8'),
                        ),
                        SitemapPage(
                            url='{}/contact.html'.format(self.__test_url),
                            last_modified=self.TEST_DATE_DATETIME,
                            news_story=None,

                            # Invalid input -- should be reset to "always"
                            change_frequency=SitemapPageChangeFrequency.ALWAYS,

                            # Invalid input -- should be reset to 0.5 (the default as per the spec)
                            priority=Decimal('0.5'),

                        )
                    ],
                ),
                IndexXMLSitemap(
                    url='{}/sitemap_news_index_1.xml'.format(self.__test_url),
                    sub_sitemaps=[
                        PagesXMLSitemap(
                            url='{}/sitemap_news_1.xml'.format(self.__test_url),
                            pages=[
                                SitemapPage(
                                    url='{}/news/foo.html'.format(self.__test_url),
                                    news_story=SitemapNewsStory(
                                        title='Foo <foo>',
                                        publish_date=self.TEST_DATE_DATETIME,
                                        publication_name=self.TEST_PUBLICATION_NAME,
                                        publication_language=self.TEST_PUBLICATION_LANGUAGE,
                                    ),
                                ),
                                SitemapPage(
                                    url='{}/news/bar.html'.format(self.__test_url),
                                    news_story=SitemapNewsStory(
                                        title='Bar & bar',
                                        publish_date=self.TEST_DATE_DATETIME,
                                        publication_name=self.TEST_PUBLICATION_NAME,
                                        publication_language=self.TEST_PUBLICATION_LANGUAGE,
                                    ),
                                ),
                            ]
                        ),
                        IndexXMLSitemap(
                            url='{}/sitemap_news_index_2.xml'.format(self.__test_url),
                            sub_sitemaps=[
                                PagesXMLSitemap(
                                    url='{}/sitemap_news_2.xml'.format(self.__test_url),
                                    pages=[
                                        SitemapPage(
                                            url='{}/news/bar.html'.format(self.__test_url),
                                            news_story=SitemapNewsStory(
                                                title='Bar & bar',
                                                publish_date=self.TEST_DATE_DATETIME,
                                                publication_name=self.TEST_PUBLICATION_NAME,
                                                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                                            ),
                                        ),
                                        SitemapPage(
                                            url='{}/news/baz.html'.format(self.__test_url),
                                            news_story=SitemapNewsStory(
                                                title='Bąž',
                                                publish_date=self.TEST_DATE_DATETIME,
                                                publication_name=self.TEST_PUBLICATION_NAME,
                                                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                                            ),
                                        ),
                                    ],
                                ),
                                InvalidSitemap(
                                    url='{}/sitemap_news_nonexistent.xml'.format(self.__test_url),
                                    reason=(
                                        'Unable to fetch sitemap from {base_url}/sitemap_news_nonexistent.xml: '
                                        '404 Not Found'
                                    ).format(base_url=self.__test_url),
                                ),
                            ],
                        ),
                    ],
                ),
            ],
        )

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()

        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

        hs.stop()

        # PyCharm is not that great at formatting object diffs, so uncomment the following and set a breakpoint:
        #
        # expected_lines = str(expected_sitemap_tree).split()
        # actual_lines = str(actual_sitemap_tree).split()
        # diff = difflib.ndiff(expected_lines, actual_lines)
        # diff_str = '\n'.join(diff)
        # assert expected_lines == actual_lines

        assert expected_sitemap_tree == actual_sitemap_tree

        assert len(actual_sitemap_tree.all_pages()) == 5
Example #36
def test_http_hash_server_stop():
    """Test if HTTP hash server gets stopped properly (including children)."""
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    # noinspection PyTypeChecker,PyUnusedLocal
    def __callback_sleep_forever(request: HashServer.Request) -> Union[str, bytes]:
        time.sleep(9999)

    pages = {
        '/simple-page': 'Works!',
        '/sleep-forever': {'callback': __callback_sleep_forever},
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()

    assert tcp_port_is_open(port=port)

    request_timed_out = False
    try:
        requests.get('%s/sleep-forever' % base_url, timeout=1)
    except requests.exceptions.Timeout:
        request_timed_out = True
    assert request_timed_out is True

    assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    # Restart the server on the same port and make sure it works again, i.e. that the server got stopped properly,
    # killed all of its children, and released the port
    hs.stop()

    assert tcp_port_is_open(port=port) is False

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()

    assert tcp_port_is_open(port=port) is True

    assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    hs.stop()
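
The tcp_port_is_open() checks above sample a single moment in time; if start() didn't guarantee the socket was ready, a small wait-for-port helper (an assumption, not part of this test suite) would avoid flaky assertions:

import socket
import time

def wait_for_port(port: int, host: str = 'localhost', timeout: float = 5.0) -> bool:
    """Poll until a TCP connect to (host, port) succeeds or the timeout elapses."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(0.2)
            if s.connect_ex((host, port)) == 0:
                return True
        time.sleep(0.05)
    return False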
Example #37
class TestFetchBigFile(TestCase):
    __slots__ = [
        '__mock_data',
        '__hs',
        '__url',
        '__temp_dir',
        '__dest_file',
    ]

    def setUp(self) -> None:
        super().setUp()

        self.__mock_data = os.urandom(1024 * 1024)

        # noinspection PyUnusedLocal
        def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
            response = "".encode('utf-8')
            response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
            response += "Content-Type: audio/mpeg\r\n".encode('utf-8')
            response += f"Content-Length: {len(self.__mock_data)}\r\n".encode(
                'utf-8')
            response += "\r\n".encode('utf-8')
            response += self.__mock_data
            return response

        port = random_unused_port()
        pages = {
            '/test.mp3': {
                'callback': __mp3_callback,
            }
        }

        self.__hs = HashServer(port=port, pages=pages)
        self.__hs.start()

        self.__url = f"http://127.0.0.1:{port}/test.mp3"

        self.__temp_dir = tempfile.mkdtemp('test')
        self.__dest_file = os.path.join(self.__temp_dir, 'test.mp3')

    def tearDown(self) -> None:
        self.__hs.stop()
        shutil.rmtree(self.__temp_dir)

    def test_simple(self):
        """Simple fetch."""
        assert not os.path.isfile(
            self.__dest_file
        ), f"File '{self.__dest_file}' shouldn't exist before downloading."
        fetch_big_file(url=self.__url, dest_file=self.__dest_file)
        assert os.path.isfile(
            self.__dest_file
        ), f"File '{self.__dest_file}' should exist after downloading."
        assert os.stat(self.__dest_file).st_size == len(
            self.__mock_data
        ), f"File '{self.__dest_file}' should be of {len(self.__mock_data)} bytes."

        with open(self.__dest_file, mode='rb') as f:
            downloaded_data = f.read()
            assert self.__mock_data == downloaded_data, f"File '{self.__dest_file}' should contain the same data as the mock response."

    def test_max_size(self):
        """Fetch with max. size."""

        max_size = len(self.__mock_data) - 1000

        # pytest.raises() lost its "message" argument in pytest 4.0, so assert
        # the refusal via the exception type alone.
        with pytest.raises(McPodcastFileFetchFailureException):
            # fetch_big_file() should refuse to fetch more than max_size bytes
            fetch_big_file(url=self.__url, dest_file=self.__dest_file, max_size=max_size)

        assert not os.path.isfile(
            self.__dest_file
        ), f"File '{self.__dest_file}' shouldn't exist after a failed download."
Example #38
    def test_sitemap_tree_for_homepage_huge_sitemap(self):
        """Test sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling)."""

        page_count = 1000

        sitemap_xml = """<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                    xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
                    xmlns:xhtml="http://www.w3.org/1999/xhtml">
        """
        for x in range(page_count):
            sitemap_xml += """
                <url>
                    <loc>{base_url}/news/page_{x}.html</loc>

                    <!-- Element present but empty -->
                    <lastmod />

                    <!-- Some other XML namespace -->
                    <xhtml:link rel="alternate"
                                media="only screen and (max-width: 640px)"
                                href="{base_url}/news/page_{x}.html?mobile=1" />

                    <news:news>
                        <news:publication>
                            <news:name>{publication_name}</news:name>
                            <news:language>{publication_language}</news:language>
                        </news:publication>
                        <news:publication_date>{publication_date}</news:publication_date>
                        <news:title>Foo &lt;foo&gt;</news:title>    <!-- HTML entity decoding -->
                    </news:news>
                </url>
            """.format(
                x=x,
                base_url=self.__test_url,
                publication_name=self.TEST_PUBLICATION_NAME,
                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                publication_date=self.TEST_DATE_STR,
            )

        sitemap_xml += "</urlset>"

        pages = {
            '/': 'This is a homepage.',
            '/robots.txt': {
                'header': 'Content-Type: text/plain',
                'content': textwrap.dedent("""
                        User-agent: *
                        Disallow: /whatever

                        Sitemap: {base_url}/sitemap.xml.gz
                    """.format(base_url=self.__test_url)).strip(),
            },
            '/sitemap.xml.gz': {
                'header': 'Content-Type: application/x-gzip',
                'content': gzip(sitemap_xml),
            },
        }

        hs = HashServer(port=self.__test_port, pages=pages)
        hs.start()

        actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

        hs.stop()

        assert len(actual_sitemap_tree.all_pages()) == page_count
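
Since this test exists mostly for profiling, a generic wrapper like the following (a sketch, not part of the suite) can capture where the time actually goes when parsing the 1000-page sitemap:

import cProfile
import pstats

def profile_call(fn, *args, **kwargs):
    """Run fn under cProfile and print the 20 most expensive cumulative entries."""
    profiler = cProfile.Profile()
    result = profiler.runcall(fn, *args, **kwargs)
    pstats.Stats(profiler).sort_stats('cumulative').print_stats(20)
    return result

For example, profile_call(sitemap_tree_for_homepage, homepage_url=self.__test_url) in place of the direct call above.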
Example #39
def test_http_hash_server_multiple_servers():
    """Test running multiple hash servers at the same time."""

    port_1 = random_unused_port()
    port_2 = random_unused_port()

    base_url_1 = 'http://localhost:%d' % port_1
    base_url_2 = 'http://localhost:%d' % port_2

    # noinspection PyTypeChecker,PyUnusedLocal
    def __callback_sleep_forever(request: HashServer.Request) -> Union[str, bytes]:
        time.sleep(9999)

    pages = {
        '/simple-page': 'Works!',
        '/sleep-forever': {'callback': __callback_sleep_forever},
    }

    hs_1 = HashServer(port=port_1, pages=pages)
    hs_2 = HashServer(port=port_2, pages=pages)

    assert hs_1
    assert hs_2

    hs_1.start()
    hs_2.start()

    assert tcp_port_is_open(port=port_1)
    assert tcp_port_is_open(port=port_2)

    for base_url in [base_url_1, base_url_2]:
        request_timed_out = False
        try:
            requests.get('%s/sleep-forever' % base_url, timeout=1)
        except requests.exceptions.Timeout:
            request_timed_out = True
        assert request_timed_out is True

        assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    hs_1.stop()
    hs_2.stop()

    assert tcp_port_is_open(port=port_1) is False
    assert tcp_port_is_open(port=port_2) is False
Example #40
    def test_nyt_labels_annotator(self):
        media = self.db().create(table='media', insert_hash={
            'name': "test medium",
            'url': "url://test/medium",
        })

        story = self.db().create(table='stories', insert_hash={
            'media_id': media['media_id'],
            'url': 'url://story/a',
            'guid': 'guid://story/a',
            'title': 'story a',
            'description': 'description a',
            'publish_date': sql_now(),
            'collect_date': sql_now(),
            'full_text_rss': True,
        })
        stories_id = story['stories_id']

        self.db().create(table='story_sentences', insert_hash={
            'stories_id': stories_id,
            'sentence_number': 1,
            'sentence': 'I hope that the CLIFF annotator is working.',
            'media_id': media['media_id'],
            'publish_date': sql_now(),
            'language': 'en'
        })

        def __nyt_labels_sample_response(_: HashServer.Request) -> Union[str, bytes]:
            """Mock annotator."""
            response = ""
            response += "HTTP/1.0 200 OK\r\n"
            response += "Content-Type: application/json; charset=UTF-8\r\n"
            response += "\r\n"
            response += encode_json(self.__sample_nyt_labels_response())
            return response

        pages = {
            '/predict.json': {
                'callback': __nyt_labels_sample_response,
            }
        }

        port = random_unused_port()
        annotator_url = 'http://localhost:%d/predict.json' % port

        hs = HashServer(port=port, pages=pages)
        hs.start()

        # Inject NYTLabels credentials into configuration
        config = py_get_config()
        new_config = copy.deepcopy(config)
        new_config['nytlabels'] = {
            'enabled': True,
            'annotator_url': annotator_url,
        }
        py_set_config(new_config)

        nytlabels = NYTLabelsAnnotator()
        nytlabels.annotate_and_store_for_story(db=self.db(), stories_id=stories_id)
        nytlabels.update_tags_for_story(db=self.db(), stories_id=stories_id)

        hs.stop()

        # Reset configuration
        py_set_config(config)

        annotation_exists = self.db().query("""
            SELECT 1
            FROM nytlabels_annotations
            WHERE object_id = %(object_id)s
        """, {'object_id': stories_id}).hash()
        assert annotation_exists is not None

        story_tags = self.db().query("""
            SELECT
                tags.tag AS tags_name,
                tags.label AS tags_label,
                tags.description AS tags_description,
                tag_sets.name AS tag_sets_name,
                tag_sets.label AS tag_sets_label,
                tag_sets.description AS tag_sets_description
            FROM stories_tags_map
                INNER JOIN tags
                    ON stories_tags_map.tags_id = tags.tags_id
                INNER JOIN tag_sets
                    ON tags.tag_sets_id = tag_sets.tag_sets_id
            WHERE stories_tags_map.stories_id = %(stories_id)s
            ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
        """, {'stories_id': stories_id}).hashes()

        expected_tags = self.__expected_tags()

        assert story_tags == expected_tags
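
The inject-then-restore dance around py_set_config() is easy to get wrong: if an assertion fails midway, the original config is never restored. A try/finally context manager (a hypothetical helper that takes whichever getter and setter the test imports) makes the restore unconditional:

import copy
from contextlib import contextmanager
from typing import Any, Callable, Dict, Iterator

@contextmanager
def overridden_config(get_config: Callable[[], Dict[str, Any]],
                      set_config: Callable[[Dict[str, Any]], None],
                      section: str,
                      values: Dict[str, Any]) -> Iterator[None]:
    """Temporarily replace one config section, restoring the original on exit."""
    original = get_config()
    patched = copy.deepcopy(original)
    patched[section] = values
    set_config(patched)
    try:
        yield
    finally:
        set_config(original)

Usage would look like: with overridden_config(py_get_config, py_set_config, 'nytlabels', {'enabled': True, 'annotator_url': annotator_url}): ...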
Example #41
def test_http_hash_server_multiple_clients():
    """Test running hash server with multiple clients."""

    port = random_unused_port()

    # noinspection PyTypeChecker,PyUnusedLocal
    def __callback_timeout(request: HashServer.Request) -> Union[str, bytes]:
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "\r\n"
        r += "And now we wait"
        time.sleep(10)
        return str.encode(r)

    pages = {
        '/a': '𝘛𝘩𝘪𝘴 𝘪𝘴 𝘱𝘢𝘨𝘦 𝘈.',
        '/timeout': {'callback': __callback_timeout},
        # '/does-not-exist': '404',
        '/b': '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖕𝖆𝖌𝖊 𝕭.',
        '/c': '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕡𝕒𝕘𝕖 ℂ.',
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()

    assert tcp_port_is_open(port=port)

    base_url = 'http://localhost:%d' % port

    session = FuturesSession(max_workers=10)

    future_a = session.get('%s/a' % base_url, timeout=2)
    future_timeout = session.get('%s/timeout' % base_url, timeout=2)
    future_404 = session.get('%s/does-not-exist' % base_url, timeout=2)
    future_b = session.get('%s/b' % base_url, timeout=2)
    future_c = session.get('%s/c' % base_url, timeout=2)

    response_a = future_a.result()

    with pytest.raises(requests.Timeout):
        future_timeout.result()

    response_404 = future_404.result()
    response_b = future_b.result()
    response_c = future_c.result()

    assert response_b.status_code == 200
    assert response_b.text == '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖕𝖆𝖌𝖊 𝕭.'

    assert response_c.status_code == 200
    assert response_c.text == '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕡𝕒𝕘𝕖 ℂ.'

    assert response_404.status_code == 404

    assert response_a.status_code == 200
    assert response_a.text == '𝘛𝘩𝘪𝘴 𝘪𝘴 𝘱𝘢𝘨𝘦 𝘈.'

    hs.stop()