def test_all_url_variants_basic(self):
    """Basic"""
    pages = {
        '/first': '<meta http-equiv="refresh" content="0; URL=/second%s" />' % self.CRUFT,
        '/second': '<meta http-equiv="refresh" content="0; URL=/third%s" />' % self.CRUFT,
        '/third': 'This is where the redirect chain should end.',
    }

    hs = HashServer(port=self.TEST_HTTP_SERVER_PORT, pages=pages)
    hs.start()
    actual_url_variants = all_url_variants(db=self.db(), url=self.STARTING_URL)
    hs.stop()

    assert set(actual_url_variants) == {
        self.STARTING_URL,
        self.STARTING_URL_WITHOUT_CRUFT,
        '%s/third' % self.TEST_HTTP_SERVER_URL,
        '%s/third%s' % (self.TEST_HTTP_SERVER_URL, self.CRUFT,),
    }
class MockAPIServer(object):
    __slots__ = [
        '__hs',
        '__port',
    ]

    def __init__(self, pages: Dict[str, Any]):
        self.__port = random_unused_port()
        self.__hs = HashServer(port=self.__port, pages=pages)
        self.__hs.start()

    def __del__(self):
        self.__hs.stop()

    def config(self) -> FacebookConfig:
        port = self.__port

        class MockFacebookConfig(FacebookConfig):
            @staticmethod
            def api_endpoint() -> str:
                return f'http://localhost:{port}/'

            @staticmethod
            def seconds_to_wait_between_retries() -> int:
                # Don't wait between retries
                return 0

        return MockFacebookConfig()
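Not part of the original excerpt: a minimal usage sketch of how MockAPIServer above could stand in for the real Graph API endpoint in a test. The page path, JSON payload, and test name are illustrative assumptions only.

def __sample_graph_api_response(_: HashServer.Request) -> str:
    # Hypothetical canned response; a real test would return whatever payload
    # the code under test expects from the Facebook Graph API.
    response = ""
    response += "HTTP/1.0 200 OK\r\n"
    response += "Content-Type: application/json; charset=UTF-8\r\n"
    response += "\r\n"
    response += '{"id": "12345", "name": "Sample page"}'
    return response


def test_mock_api_server_sketch():
    api = MockAPIServer(pages={'/12345': {'callback': __sample_graph_api_response}})
    config = api.config()

    # The mocked config points clients at the local HashServer and disables the
    # wait between retries.
    assert config.api_endpoint().startswith('http://localhost:')
    assert config.seconds_to_wait_between_retries() == 0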
def test_sitemap_tree_for_homepage_robots_txt_no_content_type(self):
    """Test sitemap_tree_for_homepage() with no Content-Type in robots.txt."""
    pages = {
        '/': 'This is a homepage.',
        '/robots.txt': {
            'header': 'Content-Type: ',
            'content': textwrap.dedent("""
                User-agent: *
                Disallow: /whatever
            """.format(base_url=self.__test_url)).strip(),
        },
    }

    # noinspection PyArgumentList
    expected_sitemap_tree = IndexRobotsTxtSitemap(
        url='{}/robots.txt'.format(self.__test_url),
        sub_sitemaps=[],
    )

    hs = HashServer(port=self.__test_port, pages=pages)
    hs.start()
    actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)
    hs.stop()

    assert expected_sitemap_tree == actual_sitemap_tree
def test_all_url_variants_link_canonical(self):
    """<link rel="canonical" />"""
    pages = {
        '/first': '<meta http-equiv="refresh" content="0; URL=/second%s" />' % self.CRUFT,
        '/second': '<meta http-equiv="refresh" content="0; URL=/third%s" />' % self.CRUFT,
        '/third': '<link rel="canonical" href="%s/fourth" />' % self.TEST_HTTP_SERVER_URL,
    }

    hs = HashServer(port=self.TEST_HTTP_SERVER_PORT, pages=pages)
    hs.start()
    actual_url_variants = all_url_variants(db=self.db(), url=self.STARTING_URL)
    hs.stop()

    assert set(actual_url_variants) == {
        self.STARTING_URL,
        self.STARTING_URL_WITHOUT_CRUFT,
        '%s/third' % self.TEST_HTTP_SERVER_URL,
        '%s/third%s' % (self.TEST_HTTP_SERVER_URL, self.CRUFT,),
        '%s/fourth' % self.TEST_HTTP_SERVER_URL,
    }
def test_extract_article_html_from_page_html_connection_errors(self):
    """Try extracting with connection errors."""

    # Use multiprocessing.Value() because request might be handled in a fork
    self.is_first_response = multiprocessing.Value('i', 1)

    pages = {
        '/extract': {
            'callback': self.__extract_but_initially_fail,
        }
    }
    port = random_unused_port()

    hs = HashServer(port=port, pages=pages)
    hs.start()

    class MockExtractorCommonConfig(CommonConfig):
        """Mock configuration which points to our unstable extractor."""

        def extractor_api_url(self) -> str:
            return f'http://localhost:{port}/extract'

    extractor_response = extract_article_html_from_page_html(content='whatever', config=MockExtractorCommonConfig())

    hs.stop()

    assert extractor_response
    assert 'extracted_html' in extractor_response
    assert 'extractor_version' in extractor_response
    assert extractor_response['extracted_html'] == self.expected_extracted_text

    assert not self.is_first_response.value, "Make sure the initial extractor call failed."
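The test above references self.__extract_but_initially_fail without showing it. Below is a plausible sketch of such a callback, assuming json is imported and that expected_extracted_text is set elsewhere on the test class; the version string is made up.

def __extract_but_initially_fail(self, request: HashServer.Request) -> Union[str, bytes]:
    """Mock extractor endpoint: fail the first request, succeed on subsequent ones."""
    with self.is_first_response.get_lock():
        if self.is_first_response.value:
            self.is_first_response.value = 0
            return "HTTP/1.0 500 Internal Server Error\r\n\r\nSimulated extractor outage"

    response = ""
    response += "HTTP/1.0 200 OK\r\n"
    response += "Content-Type: application/json; charset=UTF-8\r\n"
    response += "\r\n"
    response += json.dumps({
        'extracted_html': self.expected_extracted_text,
        'extractor_version': 'mock-extractor',  # hypothetical version label
    })
    return response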
def test_fetch_url():
    db = connect_to_db()

    def _meta_redirect(r):
        resp = ""
        resp += 'HTTP/1.0 200 OK\r\n'
        resp += 'Content-Type: text/html\r\n\r\n'
        resp += '<meta http-equiv="refresh" content="0; url=%s-foo">\n' % r.url()
        return resp

    hs = HashServer(
        port=0,
        pages={
            '/foo': 'bar',
            '/400': {'http_status_code': 400},
            '/404': {'http_status_code': 404},
            '/500': {'http_status_code': 500},
            '/mr-foo': 'meta redirect target',
            '/mr': {'callback': _meta_redirect},
        })

    hs.start(delay=2)

    port = hs.port()

    timeout_args = {
        'network_down_host': 'localhost',
        'network_down_port': port,
        'network_down_timeout': 1,
        'domain_timeout': 0,
    }

    # before delayed start, 404s and 500s should still return None
    assert not _fetch_url(db, hs.page_url('/404'), **timeout_args).is_success
    assert not _fetch_url(db, hs.page_url('/500'), **timeout_args).is_success

    # request for a valid page should make the call wait until the hs comes up
    assert _fetch_url(db, hs.page_url('/foo'), **timeout_args).content == 'bar'

    # and now a 400 should return a None
    assert not _fetch_url(db, hs.page_url('/400'), **timeout_args).is_success

    # make sure invalid url does not raise an exception
    assert _fetch_url(db, 'this is not a url', **timeout_args) is not None

    # make sure that requests follow meta redirects
    response = _fetch_url(db, hs.page_url('/mr'), **timeout_args)
    assert response.content == 'meta redirect target'
    assert response.last_requested_url == hs.page_url('/mr-foo')
def test_request(self) -> None:
    """Test requests with throttling."""
    pages = {'/test': 'Hello!', }
    port = 8888
    hs = HashServer(port=port, pages=pages)
    hs.start()

    ua = ThrottledUserAgent(self.db(), domain_timeout=2)

    test_url = hs.page_url('/test')

    # first request should work
    response = ua.get(test_url)
    assert response.decoded_content() == 'Hello!'

    # fail because we're in the timeout
    ua = ThrottledUserAgent(self.db(), domain_timeout=2)
    self.assertRaises(McThrottledDomainException, ua.get, test_url)

    # succeed because it's a different domain
    ua = ThrottledUserAgent(self.db(), domain_timeout=2)
    response = ua.get('http://127.0.0.1:8888/test')
    assert response.decoded_content() == 'Hello!'

    # still fail within the timeout
    ua = ThrottledUserAgent(self.db(), domain_timeout=2)
    self.assertRaises(McThrottledDomainException, ua.get, test_url)

    time.sleep(2)

    # now we're outside the timeout, so it should work
    ua = ThrottledUserAgent(self.db(), domain_timeout=2)
    response = ua.get(test_url)
    assert response.decoded_content() == 'Hello!'

    # and follow up request on the same ua object should work
    response = ua.get(test_url)
    assert response.decoded_content() == 'Hello!'

    # but then fail within the new timeout period with a new object
    ua = ThrottledUserAgent(self.db(), domain_timeout=2)
    self.assertRaises(McThrottledDomainException, ua.get, test_url)

    hs.stop()

    # test domain_timeout assignment logic
    ua = ThrottledUserAgent(self.db(), domain_timeout=100)
    assert ua.domain_timeout == 100

    config = mediawords.util.config.get_config()
    config['mediawords']['throttled_user_agent_domain_timeout'] = 200
    ua = ThrottledUserAgent(self.db())
    assert ua.domain_timeout == 200

    del config['mediawords']['throttled_user_agent_domain_timeout']
    ua = ThrottledUserAgent(self.db())
    assert ua.domain_timeout == mediawords.util.web.user_agent.throttled._DEFAULT_DOMAIN_TIMEOUT
def test_http_hash_server_stop():
    """Test if HTTP hash server gets stopped properly (including children)."""

    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    # noinspection PyTypeChecker,PyUnusedLocal
    def __callback_sleep_forever(request: HashServer.Request) -> Union[str, bytes]:
        time.sleep(9999)

    pages = {
        '/simple-page': 'Works!',
        '/sleep-forever': {'callback': __callback_sleep_forever},
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()
    assert tcp_port_is_open(port=port)

    request_timed_out = False
    try:
        requests.get('%s/sleep-forever' % base_url, timeout=1)
    except requests.exceptions.Timeout:
        request_timed_out = True
    assert request_timed_out is True

    assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    # Restart the server with the same port, make sure it works again, i.e. the server gets stopped properly,
    # kills all its children and releases the port
    hs.stop()
    assert tcp_port_is_open(port=port) is False

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()
    assert tcp_port_is_open(port=port) is True
    assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    hs.stop()
def test_http_hash_server_multiple_servers():
    """Test running multiple hash servers at the same time."""

    port_1 = random_unused_port()
    port_2 = random_unused_port()
    base_url_1 = 'http://localhost:%d' % port_1
    base_url_2 = 'http://localhost:%d' % port_2

    # noinspection PyTypeChecker,PyUnusedLocal
    def __callback_sleep_forever(request: HashServer.Request) -> Union[str, bytes]:
        time.sleep(9999)

    pages = {
        '/simple-page': 'Works!',
        '/sleep-forever': {'callback': __callback_sleep_forever},
    }

    hs_1 = HashServer(port=port_1, pages=pages)
    hs_2 = HashServer(port=port_2, pages=pages)
    assert hs_1
    assert hs_2

    hs_1.start()
    hs_2.start()

    assert tcp_port_is_open(port=port_1)
    assert tcp_port_is_open(port=port_2)

    for base_url in [base_url_1, base_url_2]:
        request_timed_out = False
        try:
            requests.get('%s/sleep-forever' % base_url, timeout=1)
        except requests.exceptions.Timeout:
            request_timed_out = True
        assert request_timed_out is True

        assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    hs_1.stop()
    hs_2.stop()

    assert tcp_port_is_open(port=port_1) is False
    assert tcp_port_is_open(port=port_2) is False
def testRandomPort() -> None:
    """Test assigning a random port where port = 0."""
    hss = []
    for i in range(3):
        hs = HashServer(port=0, pages={'/foo': 'bar'})
        assert hs is not None

        hs.start()

        assert hs.port() >= START_RANDOM_PORT
        assert tcp_port_is_open(hs.port())
        assert str(requests.get(hs.page_url('/foo')).text) == 'bar'

        hss.append(hs)

    [hs.stop() for hs in hss]
def testDelay() -> None:
    """Test the delay= parameter to hs.start."""
    hs = HashServer(port=0, pages={'/foo': 'bar'})

    hs.start(delay=1)

    caught_exception = False
    try:
        requests.get(hs.page_url('/foo'))
    except requests.exceptions.ConnectionError:
        caught_exception = True
    assert caught_exception

    time.sleep(2)
    assert str(requests.get(hs.page_url('/foo')).text) == 'bar'

    hs.stop()
def test_all_url_variants_redirect_to_homepage(self):
    """Redirect to a homepage"""
    pages = {
        '/first': '<meta http-equiv="refresh" content="0; URL=/second%s" />' % self.CRUFT,
        '/second': '<meta http-equiv="refresh" content="0; URL=/" />',
    }

    hs = HashServer(port=self.TEST_HTTP_SERVER_PORT, pages=pages)
    hs.start()
    actual_url_variants = all_url_variants(db=self.db(), url=self.STARTING_URL)
    hs.stop()

    assert set(actual_url_variants) == {
        self.STARTING_URL,
        self.STARTING_URL_WITHOUT_CRUFT,
        '%s/second' % self.TEST_HTTP_SERVER_URL,
        '%s/second%s' % (self.TEST_HTTP_SERVER_URL, self.CRUFT,),
    }
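The all_url_variants tests above rely on a few class-level fixtures that this excerpt does not define. A rough sketch of what they might look like; the base class name, port, and CRUFT value are illustrative guesses, not the project's actual values.

class TestAllURLVariants(TestDatabaseTestCase):  # hypothetical base class
    TEST_HTTP_SERVER_PORT = 9998  # made-up fixed port
    TEST_HTTP_SERVER_URL = 'http://localhost:%d' % TEST_HTTP_SERVER_PORT

    # Tracking "cruft" that all_url_variants() is expected to strip
    CRUFT = '?utm_source=A&utm_medium=B&utm_campaign=C'

    STARTING_URL_WITHOUT_CRUFT = '%s/first' % TEST_HTTP_SERVER_URL
    STARTING_URL = STARTING_URL_WITHOUT_CRUFT + CRUFT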
def test_run_fetcher():
    db = connect_to_db()

    medium = create_test_medium(db=db, label='foo')
    feed = create_test_feed(db=db, label='foo', medium=medium)
    story = create_test_story(db=db, label='foo', feed=feed)

    port = random_unused_port()
    pages = {
        '/foo': 'foo',
        '/bar': 'bar',
    }

    hs = HashServer(port=port, pages=pages)
    hs.start()

    download = db.create(table='downloads', insert_hash={
        'state': 'pending',
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id'],
        'type': 'content',
        'sequence': 1,
        'priority': 1,
        'url': f"http://localhost:{port}/foo",
        'host': 'localhost',
    })

    db.query("""
        INSERT INTO queued_downloads (downloads_id)
        SELECT downloads_id
        FROM downloads
    """)

    run_fetcher(no_daemon=True)

    test_download = db.find_by_id(table='downloads', object_id=download['downloads_id'])
    assert test_download['state'] == 'success'
def test_sitemap_tree_for_homepage_no_robots_txt(self):
    """Test sitemap_tree_for_homepage() with no robots.txt."""
    pages = {
        '/': 'This is a homepage.',
    }

    # noinspection PyArgumentList
    expected_sitemap_tree = InvalidSitemap(
        url='{}/robots.txt'.format(self.__test_url),
        reason=(
            'Unable to fetch sitemap from {base_url}/robots.txt: 404 Not Found'
        ).format(base_url=self.__test_url),
    )

    hs = HashServer(port=self.__test_port, pages=pages)
    hs.start()
    actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)
    hs.stop()

    assert expected_sitemap_tree == actual_sitemap_tree
def test_sitemap_tree_for_homepage_plain_text(self): """Test sitemap_tree_for_homepage() with plain text sitemaps.""" pages = { '/': 'This is a homepage.', '/robots.txt': { 'header': 'Content-Type: text/plain', 'content': textwrap.dedent(""" User-agent: * Disallow: /whatever Sitemap: {base_url}/sitemap_1.txt Sitemap: {base_url}/sitemap_2.txt.dat """.format(base_url=self.__test_url)).strip(), }, # Plain text uncompressed sitemap '/sitemap_1.txt': { 'content': textwrap.dedent(""" {base_url}/news/foo.html {base_url}/news/bar.html Some other stuff which totally doesn't look like an URL """.format(base_url=self.__test_url)).strip(), }, # Plain text compressed sitemap without .gz extension '/sitemap_2.txt.dat': { 'header': 'Content-Type: application/x-gzip', 'content': gzip(textwrap.dedent(""" {base_url}/news/bar.html {base_url}/news/baz.html """.format(base_url=self.__test_url)).strip()), }, } hs = HashServer(port=self.__test_port, pages=pages) hs.start() actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url) hs.stop() assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap) assert len(actual_sitemap_tree.sub_sitemaps) == 2 sitemap_1 = actual_sitemap_tree.sub_sitemaps[0] assert isinstance(sitemap_1, PagesTextSitemap) assert len(sitemap_1.pages) == 2 sitemap_2 = actual_sitemap_tree.sub_sitemaps[1] assert isinstance(sitemap_2, PagesTextSitemap) assert len(sitemap_2.pages) == 2 pages = actual_sitemap_tree.all_pages() assert len(pages) == 3 print(pages) assert SitemapPage(url='{}/news/foo.html'.format(self.__test_url)) in pages assert SitemapPage(url='{}/news/bar.html'.format(self.__test_url)) in pages assert SitemapPage(url='{}/news/baz.html'.format(self.__test_url)) in pages
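The sitemap tests call a gzip() helper on plain strings, which is not the standard library module itself. Presumably it is a small wrapper along these lines (a sketch, not the project's actual helper):

import gzip as gzip_module


def gzip(data: str) -> bytes:
    """Gzip a string and return the compressed bytes."""
    if isinstance(data, str):
        data = data.encode('utf-8')
    return gzip_module.compress(data)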
def test_sitemap_tree_for_homepage(self): """Test sitemap_tree_for_homepage().""" pages = { '/': 'This is a homepage.', '/robots.txt': { 'header': 'Content-Type: text/plain', 'content': textwrap.dedent(""" User-agent: * Disallow: /whatever Sitemap: {base_url}/sitemap_pages.xml Sitemap: {base_url}/sitemap_news_index_1.xml """.format(base_url=self.__test_url)).strip(), }, # One sitemap for random static pages '/sitemap_pages.xml': { 'header': 'Content-Type: application/xml', 'content': textwrap.dedent(""" <?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url> <loc>{base_url}/about.html</loc> <lastmod>{last_modified_date}</lastmod> <changefreq>monthly</changefreq> <priority>0.8</priority> </url> <url> <loc>{base_url}/contact.html</loc> <lastmod>{last_modified_date}</lastmod> <!-- Invalid change frequency --> <changefreq>when we feel like it</changefreq> <!-- Invalid priority --> <priority>1.1</priority> </url> </urlset> """.format(base_url=self.__test_url, last_modified_date=self.TEST_DATE_STR)).strip(), }, # Index sitemap pointing to sitemaps with stories '/sitemap_news_index_1.xml': { 'header': 'Content-Type: application/xml', 'content': textwrap.dedent(""" <?xml version="1.0" encoding="UTF-8"?> <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <sitemap> <loc>{base_url}/sitemap_news_1.xml</loc> <lastmod>{last_modified}</lastmod> </sitemap> <sitemap> <loc>{base_url}/sitemap_news_index_2.xml</loc> <lastmod>{last_modified}</lastmod> </sitemap> </sitemapindex> """.format(base_url=self.__test_url, last_modified=self.TEST_DATE_STR)).strip(), }, # First sitemap with actual stories '/sitemap_news_1.xml': { 'header': 'Content-Type: application/xml', 'content': textwrap.dedent(""" <?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml"> <url> <loc>{base_url}/news/foo.html</loc> <!-- Element present but empty --> <lastmod /> <!-- Some other XML namespace --> <xhtml:link rel="alternate" media="only screen and (max-width: 640px)" href="{base_url}/news/foo.html?mobile=1" /> <news:news> <news:publication> <news:name>{publication_name}</news:name> <news:language>{publication_language}</news:language> </news:publication> <news:publication_date>{publication_date}</news:publication_date> <news:title>Foo <foo></news:title> <!-- HTML entity decoding --> </news:news> </url> <!-- Has a duplicate story in /sitemap_news_2.xml --> <url> <loc>{base_url}/news/bar.html</loc> <xhtml:link rel="alternate" media="only screen and (max-width: 640px)" href="{base_url}/news/bar.html?mobile=1" /> <news:news> <news:publication> <news:name>{publication_name}</news:name> <news:language>{publication_language}</news:language> </news:publication> <news:publication_date>{publication_date}</news:publication_date> <news:title>Bar & bar</news:title> </news:news> </url> </urlset> """.format( base_url=self.__test_url, publication_name=self.TEST_PUBLICATION_NAME, publication_language=self.TEST_PUBLICATION_LANGUAGE, publication_date=self.TEST_DATE_STR, )).strip(), }, # Another index sitemap pointing to a second sitemaps with stories '/sitemap_news_index_2.xml': { 'header': 'Content-Type: application/xml', 'content': textwrap.dedent(""" <?xml version="1.0" encoding="UTF-8"?> <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <sitemap> <!-- Extra whitespace added around URL --> <loc> {base_url}/sitemap_news_2.xml 
</loc> <lastmod>{last_modified}</lastmod> </sitemap> <!-- Nonexistent sitemap --> <sitemap> <loc>{base_url}/sitemap_news_nonexistent.xml</loc> <lastmod>{last_modified}</lastmod> </sitemap> </sitemapindex> """.format(base_url=self.__test_url, last_modified=self.TEST_DATE_STR)).strip(), }, # First sitemap with actual stories '/sitemap_news_2.xml': { 'header': 'Content-Type: application/xml', 'content': textwrap.dedent(""" <?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml"> <!-- Has a duplicate story in /sitemap_news_1.xml --> <url> <!-- Extra whitespace added around URL --> <loc> {base_url}/news/bar.html#fragment_is_to_be_removed </loc> <xhtml:link rel="alternate" media="only screen and (max-width: 640px)" href="{base_url}/news/bar.html?mobile=1#fragment_is_to_be_removed" /> <news:news> <news:publication> <news:name>{publication_name}</news:name> <news:language>{publication_language}</news:language> </news:publication> <news:publication_date>{publication_date}</news:publication_date> <tag_without_inner_character_data name="value" /> <news:title>Bar & bar</news:title> </news:news> </url> <url> <loc>{base_url}/news/baz.html</loc> <xhtml:link rel="alternate" media="only screen and (max-width: 640px)" href="{base_url}/news/baz.html?mobile=1" /> <news:news> <news:publication> <news:name>{publication_name}</news:name> <news:language>{publication_language}</news:language> </news:publication> <news:publication_date>{publication_date}</news:publication_date> <news:title><![CDATA[Bąž]]></news:title> <!-- CDATA and UTF-8 --> </news:news> </url> </urlset> """.format( base_url=self.__test_url, publication_name=self.TEST_PUBLICATION_NAME, publication_language=self.TEST_PUBLICATION_LANGUAGE, publication_date=self.TEST_DATE_STR, )).strip(), }, } # noinspection PyArgumentList expected_sitemap_tree = IndexRobotsTxtSitemap( url='{}/robots.txt'.format(self.__test_url), sub_sitemaps=[ PagesXMLSitemap( url='{}/sitemap_pages.xml'.format(self.__test_url), pages=[ SitemapPage( url='{}/about.html'.format(self.__test_url), last_modified=self.TEST_DATE_DATETIME, news_story=None, change_frequency=SitemapPageChangeFrequency.MONTHLY, priority=Decimal('0.8'), ), SitemapPage( url='{}/contact.html'.format(self.__test_url), last_modified=self.TEST_DATE_DATETIME, news_story=None, # Invalid input -- should be reset to "always" change_frequency=SitemapPageChangeFrequency.ALWAYS, # Invalid input -- should be reset to 0.5 (the default as per the spec) priority=Decimal('0.5'), ) ], ), IndexXMLSitemap( url='{}/sitemap_news_index_1.xml'.format(self.__test_url), sub_sitemaps=[ PagesXMLSitemap( url='{}/sitemap_news_1.xml'.format(self.__test_url), pages=[ SitemapPage( url='{}/news/foo.html'.format(self.__test_url), news_story=SitemapNewsStory( title='Foo <foo>', publish_date=self.TEST_DATE_DATETIME, publication_name=self.TEST_PUBLICATION_NAME, publication_language=self.TEST_PUBLICATION_LANGUAGE, ), ), SitemapPage( url='{}/news/bar.html'.format(self.__test_url), news_story=SitemapNewsStory( title='Bar & bar', publish_date=self.TEST_DATE_DATETIME, publication_name=self.TEST_PUBLICATION_NAME, publication_language=self.TEST_PUBLICATION_LANGUAGE, ), ), ] ), IndexXMLSitemap( url='{}/sitemap_news_index_2.xml'.format(self.__test_url), sub_sitemaps=[ PagesXMLSitemap( url='{}/sitemap_news_2.xml'.format(self.__test_url), pages=[ SitemapPage( url='{}/news/bar.html'.format(self.__test_url), 
news_story=SitemapNewsStory( title='Bar & bar', publish_date=self.TEST_DATE_DATETIME, publication_name=self.TEST_PUBLICATION_NAME, publication_language=self.TEST_PUBLICATION_LANGUAGE, ), ), SitemapPage( url='{}/news/baz.html'.format(self.__test_url), news_story=SitemapNewsStory( title='Bąž', publish_date=self.TEST_DATE_DATETIME, publication_name=self.TEST_PUBLICATION_NAME, publication_language=self.TEST_PUBLICATION_LANGUAGE, ), ), ], ), InvalidSitemap( url='{}/sitemap_news_nonexistent.xml'.format(self.__test_url), reason=( 'Unable to fetch sitemap from {base_url}/sitemap_news_nonexistent.xml: ' '404 Not Found' ).format(base_url=self.__test_url), ), ], ), ], ), ], ) hs = HashServer(port=self.__test_port, pages=pages) hs.start() actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url) hs.stop() # PyCharm is not that great at formatting object diffs, so uncomment the following and set a breakpoint: # # expected_lines = str(expected_sitemap_tree).split() # actual_lines = str(actual_sitemap_tree).split() # diff = difflib.ndiff(expected_lines, actual_lines) # diff_str = '\n'.join(diff) # assert expected_lines == actual_lines assert expected_sitemap_tree == actual_sitemap_tree assert len(actual_sitemap_tree.all_pages()) == 5
def test_tagging(self): db = connect_to_db() media = db.create(table='media', insert_hash={ 'name': "test medium", 'url': "url://test/medium", }) story = db.create(table='stories', insert_hash={ 'media_id': media['media_id'], 'url': 'url://story/a', 'guid': 'guid://story/a', 'title': 'story a', 'description': 'description a', 'publish_date': sql_now(), 'collect_date': sql_now(), 'full_text_rss': True, }) stories_id = story['stories_id'] db.create(table='story_sentences', insert_hash={ 'stories_id': stories_id, 'sentence_number': 1, 'sentence': 'I hope that the CLIFF annotator is working.', 'media_id': media['media_id'], 'publish_date': sql_now(), 'language': 'en' }) def __cliff_sample_response( _: HashServer.Request) -> Union[str, bytes]: """Mock annotator.""" response = "" response += "HTTP/1.0 200 OK\r\n" response += "Content-Type: application/json; charset=UTF-8\r\n" response += "\r\n" response += encode_json(sample_cliff_response()) return response pages = { '/cliff/parse/text': { 'callback': __cliff_sample_response, } } port = random_unused_port() annotator_url = 'http://localhost:%d/cliff/parse/text' % port hs = HashServer(port=port, pages=pages) hs.start() class TestCLIFFFetcherConfig(CLIFFTagsFromAnnotationConfig): @staticmethod def annotator_url() -> str: return annotator_url cliff = CLIFFTagsFromAnnotation(tagger_config=TestCLIFFFetcherConfig()) cliff.update_tags_for_story(db=db, stories_id=stories_id) hs.stop() story_tags = db.query( """ SELECT tags.tag AS tags_name, tags.label AS tags_label, tags.description AS tags_description, tag_sets.name AS tag_sets_name, tag_sets.label AS tag_sets_label, tag_sets.description AS tag_sets_description FROM stories_tags_map INNER JOIN tags ON stories_tags_map.tags_id = tags.tags_id INNER JOIN tag_sets ON tags.tag_sets_id = tag_sets.tag_sets_id WHERE stories_tags_map.stories_id = %(stories_id)s ORDER BY lower(tag_sets.name), lower(tags.tag) """, { 'stories_id': stories_id }).hashes() expected_tags = expected_cliff_tags() assert story_tags == expected_tags
class AbstractFetchTranscriptTestCase(TestCase, metaclass=abc.ABCMeta): __slots__ = [ 'db', 'hs', 'stories_id', 'transcript_fetches', ] @classmethod @abc.abstractmethod def input_media_path(cls) -> str: """Return full path to input media file.""" raise NotImplemented("Abstract method") @classmethod @abc.abstractmethod def input_media_mime_type(cls) -> str: """Return input media file's MIME type.""" raise NotImplemented("Abstract method") @classmethod @abc.abstractmethod def story_title_description(cls) -> str: """Return a string to store as both story title and description.""" raise NotImplemented("Abstract method") @classmethod @abc.abstractmethod def retries_per_step(cls) -> int: """How many retries to do per each local step.""" raise NotImplemented("Abstract method") @classmethod @abc.abstractmethod def seconds_between_retries(cls) -> float: """How many seconds to wait between retries.""" raise NotImplemented("Abstract method") def setUp(self) -> None: super().setUp() self.db = connect_to_db() test_medium = create_test_medium(db=self.db, label='test') test_feed = create_test_feed(db=self.db, label='test', medium=test_medium) # Add a story with a random ID to decrease the chance that object in GCS will collide with another test running # at the same time self.stories_id = random.randint(1, 2147483647 - 1) self.db.query( """ INSERT INTO stories ( stories_id, media_id, url, guid, title, description, publish_date, collect_date, full_text_rss ) VALUES ( %(stories_id)s, %(media_id)s, 'http://story.test/', 'guid://story.test/', 'story', 'description', '2016-10-15 08:00:00', '2016-10-15 10:00:00', true ) """, { 'stories_id': self.stories_id, 'media_id': test_feed['media_id'], }) # Create missing partitions for "feeds_stories_map" self.db.query('SELECT create_missing_partitions()') self.db.create(table='feeds_stories_map', insert_hash={ 'feeds_id': int(test_feed['feeds_id']), 'stories_id': self.stories_id, }) assert os.path.isfile(self.input_media_path( )), f"Test media file '{self.input_media_path()}' should exist." 
with open(self.input_media_path(), mode='rb') as f: test_data = f.read() # noinspection PyUnusedLocal def __media_callback(request: HashServer.Request) -> Union[str, bytes]: response = "".encode('utf-8') response += "HTTP/1.0 200 OK\r\n".encode('utf-8') response += f"Content-Type: {self.input_media_mime_type()}\r\n".encode( 'utf-8') response += f"Content-Length: {len(test_data)}\r\n".encode('utf-8') response += "\r\n".encode('utf-8') response += test_data return response port = 8080 # Port exposed on docker-compose.tests.yml media_path = '/test_media_file' pages = { media_path: { 'callback': __media_callback, } } self.hs = HashServer(port=port, pages=pages) self.hs.start() # Using our hostname as it will be another container that will be connecting to us media_url = f'http://{socket.gethostname()}:{port}{media_path}' self.db.insert(table='story_enclosures', insert_hash={ 'stories_id': self.stories_id, 'url': media_url, 'mime_type': self.input_media_mime_type(), 'length': len(test_data), }) # Add a "podcast-fetch-episode" job JobBroker( queue_name='MediaWords::Job::Podcast::FetchEpisode').add_to_queue( stories_id=self.stories_id) total_time = int(self.retries_per_step() * self.seconds_between_retries()) # Wait for "podcast-fetch-episode" to transcode, upload to Google Storage, and write it to "podcast_episodes" episodes = None for x in range(1, self.retries_per_step() + 1): log.info(f"Waiting for episode to appear (#{x})...") episodes = self.db.select(table='podcast_episodes', what_to_select='*').hashes() if episodes: log.info(f"Episode is here!") break time.sleep(self.seconds_between_retries()) assert episodes, f"Episode didn't show up in {total_time} seconds." # Wait for "podcast-submit-operation" to submit Speech API operation self.transcript_fetches = None for x in range(1, self.retries_per_step() + 1): log.info(f"Waiting for transcript fetch to appear (#{x})...") self.transcript_fetches = self.db.select( table='podcast_episode_transcript_fetches', what_to_select='*').hashes() if self.transcript_fetches: log.info(f"Transcript fetch is here!") break time.sleep(self.seconds_between_retries()) assert self.transcript_fetches, f"Operation didn't show up in {total_time} seconds." def tearDown(self) -> None: super().tearDown() self.hs.stop()
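A sketch of how a concrete test case might fill in the abstract hooks of AbstractFetchTranscriptTestCase above; the fixture path, retry budget, and test assertion are illustrative assumptions only.

class TestFetchTranscriptFromMP3(AbstractFetchTranscriptTestCase):

    @classmethod
    def input_media_path(cls) -> str:
        # Hypothetical fixture location
        return os.path.join(os.path.dirname(__file__), 'data', 'sample_episode.mp3')

    @classmethod
    def input_media_mime_type(cls) -> str:
        return 'audio/mpeg'

    @classmethod
    def story_title_description(cls) -> str:
        # Stored as both story title and description
        return 'keeping up with Kardashians'

    @classmethod
    def retries_per_step(cls) -> int:
        return 60

    @classmethod
    def seconds_between_retries(cls) -> float:
        return 1.0

    def test_transcript_fetch_was_scheduled(self):
        # setUp() already waited for the transcript fetch row to appear
        assert self.transcript_fetches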
def test_sitemap_tree_for_homepage_prematurely_ending_xml(self): """Test sitemap_tree_for_homepage() with clipped XML. Some webservers are misconfigured to limit the request length to a certain number of seconds, in which time the server is unable to generate and compress a 50 MB sitemap XML. Google News doesn't seem to have a problem with this behavior, so we have to support this too. """ pages = { '/': 'This is a homepage.', '/robots.txt': { 'header': 'Content-Type: text/plain', 'content': textwrap.dedent(""" User-agent: * Disallow: /whatever Sitemap: {base_url}/sitemap.xml """.format(base_url=self.__test_url)).strip(), }, '/sitemap.xml': { 'content': textwrap.dedent(""" <?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"> <url> <loc>{base_url}/news/first.html</loc> <news:news> <news:publication> <news:name>{publication_name}</news:name> <news:language>{publication_language}</news:language> </news:publication> <news:publication_date>{publication_date}</news:publication_date> <news:title>First story</news:title> </news:news> </url> <url> <loc>{base_url}/news/second.html</loc> <news:news> <news:publication> <news:name>{publication_name}</news:name> <news:language>{publication_language}</news:language> </news:publication> <news:publication_date>{publication_date}</news:publication_date> <news:title>Second story</news:title> </news:news> </url> <!-- The following story shouldn't get added as the XML ends prematurely --> <url> <loc>{base_url}/news/third.html</loc> <news:news> <news:publication> <news:name>{publication_name}</news:name> <news:language>{publication_language}</news:language> </news:publication> <news:publicat """.format( base_url=self.__test_url, publication_name=self.TEST_PUBLICATION_NAME, publication_language=self.TEST_PUBLICATION_LANGUAGE, publication_date=self.TEST_DATE_STR, )).strip(), }, } hs = HashServer(port=self.__test_port, pages=pages) hs.start() actual_sitemap_tree = sitemap_tree_for_homepage( homepage_url=self.__test_url) hs.stop() assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap) assert len(actual_sitemap_tree.sub_sitemaps) == 1 sitemap = actual_sitemap_tree.sub_sitemaps[0] assert isinstance(sitemap, PagesXMLSitemap) assert len(sitemap.pages) == 2
def test_fetch_link_job_update_state(): db = connect_to_db() # In testing environment, RabbitMQ or workers might be slow to start up so # we increase the delay ("timeout") between domain fetches to be able to # test out throttling domain_timeout = 360 hs = HashServer(port=0, pages={ '/foo': '<title>foo</title>', '/throttle': '<title>throttle</title>' }) hs.start() topic = create_test_topic(db, 'foo') topic['pattern'] = '.' topic = db.update_by_id('topics', topic['topics_id'], topic) fetch_url = hs.page_url('/foo') # basic sanity test for link fetching tfu = db.create( 'topic_fetch_urls', { 'topics_id': topic['topics_id'], 'url': hs.page_url('/foo'), 'state': FETCH_STATE_PENDING }) fetch_topic_url_update_state( db=db, topics_id=topic['topics_id'], topic_fetch_urls_id=tfu['topic_fetch_urls_id'], domain_timeout=domain_timeout, ) tfu = db.require_by_id('topic_fetch_urls', tfu['topic_fetch_urls_id']) assert tfu['state'] == FETCH_STATE_STORY_ADDED assert tfu['url'] == fetch_url assert tfu['code'] == 200 assert tfu['stories_id'] is not None new_story = db.require_by_id('stories', tfu['stories_id']) assert new_story['url'] == fetch_url assert new_story['title'] == 'foo' # now make sure that the domain throttling sets tfu = db.create( 'topic_fetch_urls', { 'topics_id': topic['topics_id'], 'url': hs.page_url('/throttle'), 'state': FETCH_STATE_PENDING }) fetch_topic_url_update_state( db=db, topics_id=topic['topics_id'], topic_fetch_urls_id=tfu['topic_fetch_urls_id'], domain_timeout=domain_timeout, ) tfu = db.require_by_id('topic_fetch_urls', tfu['topic_fetch_urls_id']) assert tfu['state'] == FETCH_STATE_REQUEUED
def test_cliff_annotator(): db = connect_to_db() media = db.create(table='media', insert_hash={ 'name': "test medium", 'url': "url://test/medium", }) story = db.create(table='stories', insert_hash={ 'media_id': media['media_id'], 'url': 'url://story/a', 'guid': 'guid://story/a', 'title': 'story a', 'description': 'description a', 'publish_date': sql_now(), 'collect_date': sql_now(), 'full_text_rss': True, }) stories_id = story['stories_id'] db.create(table='story_sentences', insert_hash={ 'stories_id': stories_id, 'sentence_number': 1, 'sentence': 'I hope that the CLIFF annotator is working.', 'media_id': media['media_id'], 'publish_date': sql_now(), 'language': 'en' }) def __cliff_sample_response(_: HashServer.Request) -> Union[str, bytes]: """Mock annotator.""" response = "" response += "HTTP/1.0 200 OK\r\n" response += "Content-Type: application/json; charset=UTF-8\r\n" response += "\r\n" response += encode_json(sample_cliff_response()) return response pages = { '/cliff/parse/text': { 'callback': __cliff_sample_response, } } port = random_unused_port() annotator_url = 'http://localhost:%d/cliff/parse/text' % port hs = HashServer(port=port, pages=pages) hs.start() class TestCLIFFFetcherConfig(CLIFFFetcherConfig): @staticmethod def annotator_url() -> str: return annotator_url cliff = CLIFFAnnotatorFetcher(fetcher_config=TestCLIFFFetcherConfig()) cliff.annotate_and_store_for_story(db=db, stories_id=stories_id) hs.stop() annotation_exists = db.query(""" SELECT 1 FROM cliff_annotations WHERE object_id = %(object_id)s """, {'object_id': stories_id}).hash() assert annotation_exists is not None
def test_http_hash_server_multiple_clients(): """Test running hash server with multiple clients.""" port = random_unused_port() # noinspection PyTypeChecker,PyUnusedLocal def __callback_timeout(request: HashServer.Request) -> Union[str, bytes]: r = "" r += "HTTP/1.0 200 OK\r\n" r += "Content-Type: text/html; charset=UTF-8\r\n" r += "\r\n" r += "And now we wait" time.sleep(10) return str.encode(r) pages = { '/a': '𝘛𝘩𝘪𝘴 𝘪𝘴 𝘱𝘢𝘨𝘦 𝘈.', '/timeout': {'callback': __callback_timeout}, # '/does-not-exist': '404', '/b': '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖕𝖆𝖌𝖊 𝕭.', '/c': '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕡𝕒𝕘𝕖 ℂ.', } hs = HashServer(port=port, pages=pages) assert hs hs.start() assert tcp_port_is_open(port=port) base_url = 'http://localhost:%d' % port session = FuturesSession(max_workers=10) future_a = session.get('%s/a' % base_url, timeout=2) future_timeout = session.get('%s/timeout' % base_url, timeout=2) future_404 = session.get('%s/does-not-exist' % base_url, timeout=2) future_b = session.get('%s/b' % base_url, timeout=2) future_c = session.get('%s/c' % base_url, timeout=2) response_a = future_a.result() with pytest.raises(requests.Timeout): future_timeout.result() response_404 = future_404.result() response_b = future_b.result() response_c = future_c.result() assert response_b.status_code == 200 assert response_b.text == '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖕𝖆𝖌𝖊 𝕭.' assert response_c.status_code == 200 assert response_c.text == '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕡𝕒𝕘𝕖 ℂ.' assert response_404.status_code == 404 assert response_a.status_code == 200 assert response_a.text == '𝘛𝘩𝘪𝘴 𝘪𝘴 𝘱𝘢𝘨𝘦 𝘈.' hs.stop()
def test_fetch_and_store_episode(): db = connect_to_db() test_medium = create_test_medium(db=db, label='test') test_feed = create_test_feed(db=db, label='test', medium=test_medium) # 'label' is important as it will be stored in both stories.title and stories.description, which in turn will be # used to guess the probable language of the podcast episode test_story = create_test_story(db=db, label='keeping up with Kardashians', feed=test_feed) stories_id = test_story['stories_id'] with open(TEST_MP3_PATH, mode='rb') as f: test_mp3_data = f.read() # noinspection PyUnusedLocal def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]: response = "".encode('utf-8') response += "HTTP/1.0 200 OK\r\n".encode('utf-8') response += "Content-Type: audio/mpeg\r\n".encode('utf-8') response += f"Content-Length: {len(test_mp3_data)}\r\n".encode('utf-8') response += "\r\n".encode('utf-8') response += test_mp3_data return response port = random_unused_port() pages = { '/test.mp3': { 'callback': __mp3_callback, } } hs = HashServer(port=port, pages=pages) hs.start() mp3_url = f'http://127.0.0.1:{port}/test.mp3' story_enclosure = db.insert(table='story_enclosures', insert_hash={ 'stories_id': stories_id, 'url': mp3_url, 'mime_type': 'audio/mpeg', 'length': len(test_mp3_data), }) conf = RandomPathPrefixConfig() fetch_and_store_episode(db=db, stories_id=stories_id, config=conf) episodes = db.select(table='podcast_episodes', what_to_select='*').hashes() assert len(episodes), f"Only one episode is expected." episode = episodes[0] assert episode['stories_id'] == stories_id assert episode['story_enclosures_id'] == story_enclosure[ 'story_enclosures_id'] assert episode[ 'gcs_uri'] == f"gs://{conf.gc_storage_bucket_name()}/{conf.gc_storage_path_prefix()}/{stories_id}" assert episode['duration'] > 0 assert episode['codec'] == 'MP3' assert episode['sample_rate'] == 44100 assert episode['bcp47_language_code'] == 'en-US' # Try removing test object gcs = GCSStore(config=conf) gcs.delete_object(object_id=str(stories_id))
class TestDownloadHandler(TestCase, metaclass=abc.ABCMeta):
    __slots__ = [
        'db',
        'port',
        'media',
        'feed',
        '__hs',
    ]

    @abc.abstractmethod
    def hashserver_pages(self) -> Dict[str, Any]:
        """Return HashServer pages to serve."""
        raise NotImplementedError("Abstract method")

    def _fetch_and_handle_response(self, path: str, downloads_id: Optional[int] = None) -> Dict[str, Any]:
        """Call the fetcher and handler on the given URL. Return the download passed to the fetcher and handler."""

        if downloads_id:
            download = self.db.find_by_id(table='downloads', object_id=downloads_id)
        else:
            download = self.db.create(table='downloads', insert_hash={
                'url': f"http://localhost:{self.port}{path}",
                'host': 'localhost',
                'type': 'feed',
                'state': 'pending',
                'priority': 0,
                'sequence': 1,
                'feeds_id': self.feed['feeds_id'],
            })
            downloads_id = download['downloads_id']

        handler = handler_for_download(db=self.db, download=download)

        response = handler.fetch_download(db=self.db, download=download)
        assert response

        handler.store_response(db=self.db, download=download, response=response)

        download = self.db.find_by_id(table='downloads', object_id=downloads_id)

        return download

    def setUp(self) -> None:
        self.db = connect_to_db()

        self.port = random_unused_port()

        self.__hs = HashServer(port=self.port, pages=self.hashserver_pages())
        self.__hs.start()

        self.media = create_test_story_stack(db=self.db, data={'A': {'B': [1]}})
        self.feed = self.media['A']['feeds']['B']

    def tearDown(self) -> None:
        self.__hs.stop()
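A sketch of a concrete subclass of TestDownloadHandler, using only the helpers defined above; the served feed body, page path, and expected end state are illustrative assumptions.

class TestFeedDownloadHandler(TestDownloadHandler):

    def hashserver_pages(self) -> Dict[str, Any]:
        # Hypothetical static page served by the embedded HashServer
        return {
            '/feed': '<rss version="2.0"><channel><title>Test feed</title></channel></rss>',
        }

    def test_fetch_and_store(self):
        download = self._fetch_and_handle_response(path='/feed')

        # The handler should have fetched the page and finished handling the download
        assert download['downloads_id']
        assert download['state'] == 'success'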
def test_http_hash_server(): port = random_unused_port() base_url = 'http://localhost:%d' % port def __simple_callback(request: HashServer.Request) -> Union[str, bytes]: r = "" r += "HTTP/1.0 200 OK\r\n" r += "Content-Type: application/json; charset=UTF-8\r\n" r += "\r\n" r += json.dumps({ 'name': 'callback', 'method': request.method(), 'url': request.url(), 'content-type': request.content_type(), 'params': request.query_params(), 'cookies': request.cookies(), }) return str.encode(r) # noinspection PyUnusedLocal def __callback_cookie_redirect(request: HashServer.Request) -> str: r = "" r += "HTTP/1.0 302 Moved Temporarily\r\n" r += "Content-Type: text/html; charset=UTF-8\r\n" r += "Location: /check_cookie\r\n" r += "Set-Cookie: test_cookie=I'm a cookie and I know it!\r\n" r += "\r\n" r += "Redirecting to the cookie check page..." return r def __callback_post(request: HashServer.Request) -> Union[str, bytes]: r = "" r += "HTTP/1.0 200 OK\r\n" r += "Content-Type: application/json; charset=UTF-8\r\n" r += "\r\n" r += json.dumps({ 'name': 'callback_post', 'post_data': request.content(), }) return str.encode(r) pages = { '/': 'home', '/foo': b'foo', '/bar': 'bar ąą', '/foo-bar': {b'redirect': b'/bar'}, '/localhost': {'redirect': "http://localhost:%d/" % port}, b'/127-foo': {b'redirect': "http://127.0.0.1:%d/foo" % port}, '/auth': {b'auth': b'foo:bar', b'content': b"foo bar \xf0\x90\x28\xbc"}, '/404': {b'content': b'not found', b'http_status_code': 404}, '/callback': {b'callback': __simple_callback}, # Test setting cookies, redirects '/callback_cookie_redirect': {'callback': __callback_cookie_redirect}, # POST data '/callback_post': {'callback': __callback_post}, } hs = HashServer(port=port, pages=pages) assert hs hs.start() assert tcp_port_is_open(port=port) assert str(requests.get('%s/' % base_url).text) == 'home' assert str(requests.get('%s/foo' % base_url).text) == 'foo' assert str(requests.get('%s/bar' % base_url).text) == 'bar ąą' assert str(requests.get('%s/foo-bar' % base_url).text) == 'bar ąą' assert str(requests.get('%s/localhost' % base_url).text) == 'home' assert str(requests.get('%s/127-foo' % base_url).text) == 'foo' # Path normalization assert str(requests.get('%s//' % base_url).text) == 'home' assert str(requests.get('%s///' % base_url).text) == 'home' assert str(requests.get('%s/something/../' % base_url).text) == 'home' assert str(requests.get('%s/something/..//' % base_url).text) == 'home' assert str(requests.get('%s/something/..///' % base_url).text) == 'home' assert str(requests.get('%s/foo/' % base_url).text) == 'foo' assert str(requests.get('%s/foo//' % base_url).text) == 'foo' assert str(requests.get('%s/foo///' % base_url).text) == 'foo' assert str(requests.get('%s/foo' % base_url).text) == 'foo' assert str(requests.get('%s/bar/../foo' % base_url).text) == 'foo' assert str(requests.get('%s/bar/../foo/' % base_url).text) == 'foo' assert str(requests.get('%s/bar/../foo//' % base_url).text) == 'foo' assert str(requests.get('%s/bar/../foo///' % base_url).text) == 'foo' response_json = requests.get('%s/callback?a=b&c=d' % base_url, cookies={'cookie_name': 'cookie_value'}).json() assert response_json == { 'name': 'callback', 'method': 'GET', 'url': 'http://localhost:%d/callback?a=b&c=d' % port, 'content-type': None, 'params': { 'a': 'b', 'c': 'd', }, 'cookies': { 'cookie_name': 'cookie_value', }, } response = requests.get('%s/callback_cookie_redirect' % base_url, allow_redirects=False) assert response.status_code == 302 assert response.headers['Location'] == '/check_cookie' 
response = requests.get("%s/404" % base_url) assert response.status_code == HTTPStatus.NOT_FOUND.value assert 'Not Found' in response.reason auth_url = "%s/auth" % base_url assert requests.get(auth_url).status_code == HTTPStatus.UNAUTHORIZED assert requests.get(auth_url, auth=('foo', 'foo')).status_code == HTTPStatus.UNAUTHORIZED response = requests.get(auth_url, auth=('foo', 'bar')) assert response.status_code == HTTPStatus.OK assert response.content == b"foo bar \xf0\x90\x28\xbc" assert urls_are_equal(url1=hs.page_url('/callback?a=b&c=d'), url2='http://localhost:%d/callback' % port) with pytest.raises(McHashServerException): hs.page_url('/does-not-exist') response_json = requests.post('%s/callback_post' % base_url, data='abc=def').json() assert response_json == { 'name': 'callback_post', 'post_data': 'abc=def', } hs.stop()
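A condensed reference of the page-spec forms exercised by the test above, distilled here for convenience; the paths and values are arbitrary.

reference_pages = {
    '/plain': 'static string body',
    '/bytes': b'static bytes body',
    '/redirect': {'redirect': '/plain'},
    '/protected': {'auth': 'user:secret', 'content': 'authenticated content'},
    '/missing': {'content': 'not found', 'http_status_code': 404},
    '/dynamic': {'callback': lambda request: "HTTP/1.0 200 OK\r\n\r\nHello from a callback"},
}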
def test_sitemap_tree_for_homepage_gzip(self): """Test sitemap_tree_for_homepage() with gzipped sitemaps.""" pages = { '/': 'This is a homepage.', '/robots.txt': { 'header': 'Content-Type: text/plain', 'content': textwrap.dedent(""" User-agent: * Disallow: /whatever Sitemap: {base_url}/sitemap_1.gz Sitemap: {base_url}/sitemap_2.dat """.format(base_url=self.__test_url)).strip(), }, # Gzipped sitemap without correct HTTP header but with .gz extension '/sitemap_1.gz': { 'content': gzip( textwrap.dedent(""" <?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"> <url> <loc>{base_url}/news/foo.html</loc> <news:news> <news:publication> <news:name>{publication_name}</news:name> <news:language>{publication_language}</news:language> </news:publication> <news:publication_date>{publication_date}</news:publication_date> <news:title>Foo <foo></news:title> <!-- HTML entity decoding --> </news:news> </url> </urlset> """.format( base_url=self.__test_url, publication_name=self.TEST_PUBLICATION_NAME, publication_language=self.TEST_PUBLICATION_LANGUAGE, publication_date=self.TEST_DATE_STR, )).strip()), }, # Gzipped sitemap with correct HTTP header but without .gz extension '/sitemap_2.dat': { 'header': 'Content-Type: application/x-gzip', 'content': gzip( textwrap.dedent(""" <?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"> <url> <loc>{base_url}/news/baz.html</loc> <news:news> <news:publication> <news:name>{publication_name}</news:name> <news:language>{publication_language}</news:language> </news:publication> <news:publication_date>{publication_date}</news:publication_date> <news:title><![CDATA[Bąž]]></news:title> <!-- CDATA and UTF-8 --> </news:news> </url> </urlset> """.format( base_url=self.__test_url, publication_name=self.TEST_PUBLICATION_NAME, publication_language=self.TEST_PUBLICATION_LANGUAGE, publication_date=self.TEST_DATE_STR, )).strip()), }, } hs = HashServer(port=self.__test_port, pages=pages) hs.start() actual_sitemap_tree = sitemap_tree_for_homepage( homepage_url=self.__test_url) hs.stop() # Don't do an in-depth check, we just need to make sure that gunzip works assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap) assert len(actual_sitemap_tree.sub_sitemaps) == 2 sitemap_1 = actual_sitemap_tree.sub_sitemaps[0] assert isinstance(sitemap_1, PagesXMLSitemap) assert len(sitemap_1.pages) == 1 sitemap_2 = actual_sitemap_tree.sub_sitemaps[1] assert isinstance(sitemap_2, PagesXMLSitemap) assert len(sitemap_2.pages) == 1
def test_sitemap_tree_for_homepage_huge_sitemap(self): """Test sitemap_tree_for_homepage() with a huge sitemap (mostly for profiling).""" page_count = 1000 sitemap_xml = """<?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml"> """ for x in range(page_count): sitemap_xml += """ <url> <loc>{base_url}/news/page_{x}.html</loc> <!-- Element present but empty --> <lastmod /> <!-- Some other XML namespace --> <xhtml:link rel="alternate" media="only screen and (max-width: 640px)" href="{base_url}/news/page_{x}.html?mobile=1" /> <news:news> <news:publication> <news:name>{publication_name}</news:name> <news:language>{publication_language}</news:language> </news:publication> <news:publication_date>{publication_date}</news:publication_date> <news:title>Foo <foo></news:title> <!-- HTML entity decoding --> </news:news> </url> """.format( x=x, base_url=self.__test_url, publication_name=self.TEST_PUBLICATION_NAME, publication_language=self.TEST_PUBLICATION_LANGUAGE, publication_date=self.TEST_DATE_STR, ) sitemap_xml += "</urlset>" pages = { '/': 'This is a homepage.', '/robots.txt': { 'header': 'Content-Type: text/plain', 'content': textwrap.dedent(""" User-agent: * Disallow: /whatever Sitemap: {base_url}/sitemap.xml.gz """.format(base_url=self.__test_url)).strip(), }, '/sitemap.xml.gz': { 'header': 'Content-Type: application/x-gzip', 'content': gzip(sitemap_xml), }, } hs = HashServer(port=self.__test_port, pages=pages) hs.start() actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url) hs.stop() assert len(actual_sitemap_tree.all_pages()) == page_count
def test_sitemap_tree_for_homepage(self):
    """Test sitemap_tree_for_homepage()."""

    pages = {
        '/': 'This is a homepage.',

        '/robots.txt': {
            'header': 'Content-Type: text/plain',
            'content': textwrap.dedent("""
                User-agent: *
                Disallow: /whatever

                Sitemap: {base_url}/sitemap_pages.xml
                Sitemap: {base_url}/sitemap_news_index_1.xml
            """.format(base_url=self.__test_url)).strip(),
        },

        # One sitemap for random static pages
        '/sitemap_pages.xml': {
            'header': 'Content-Type: application/xml',
            'content': textwrap.dedent("""
                <?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                    <url>
                        <loc>{base_url}/about.html</loc>
                        <lastmod>{last_modified_date}</lastmod>
                        <changefreq>monthly</changefreq>
                        <priority>0.8</priority>
                    </url>
                    <url>
                        <loc>{base_url}/contact.html</loc>
                        <lastmod>{last_modified_date}</lastmod>

                        <!-- Invalid change frequency -->
                        <changefreq>when we feel like it</changefreq>

                        <!-- Invalid priority -->
                        <priority>1.1</priority>
                    </url>
                </urlset>
            """.format(base_url=self.__test_url, last_modified_date=self.TEST_DATE_STR)).strip(),
        },

        # Index sitemap pointing to sitemaps with stories
        '/sitemap_news_index_1.xml': {
            'header': 'Content-Type: application/xml',
            'content': textwrap.dedent("""
                <?xml version="1.0" encoding="UTF-8"?>
                <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                    <sitemap>
                        <loc>{base_url}/sitemap_news_1.xml</loc>
                        <lastmod>{last_modified}</lastmod>
                    </sitemap>
                    <sitemap>
                        <loc>{base_url}/sitemap_news_index_2.xml</loc>
                        <lastmod>{last_modified}</lastmod>
                    </sitemap>
                </sitemapindex>
            """.format(base_url=self.__test_url, last_modified=self.TEST_DATE_STR)).strip(),
        },

        # First sitemap with actual stories
        '/sitemap_news_1.xml': {
            'header': 'Content-Type: application/xml',
            'content': textwrap.dedent("""
                <?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                        xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
                        xmlns:xhtml="http://www.w3.org/1999/xhtml">

                    <url>
                        <loc>{base_url}/news/foo.html</loc>

                        <!-- Element present but empty -->
                        <lastmod />

                        <!-- Some other XML namespace -->
                        <xhtml:link rel="alternate"
                                    media="only screen and (max-width: 640px)"
                                    href="{base_url}/news/foo.html?mobile=1" />

                        <news:news>
                            <news:publication>
                                <news:name>{publication_name}</news:name>
                                <news:language>{publication_language}</news:language>
                            </news:publication>
                            <news:publication_date>{publication_date}</news:publication_date>
                            <news:title>Foo &lt;foo&gt;</news:title>    <!-- HTML entity decoding -->
                        </news:news>
                    </url>

                    <!-- Has a duplicate story in /sitemap_news_2.xml -->
                    <url>
                        <loc>{base_url}/news/bar.html</loc>

                        <xhtml:link rel="alternate"
                                    media="only screen and (max-width: 640px)"
                                    href="{base_url}/news/bar.html?mobile=1" />

                        <news:news>
                            <news:publication>
                                <news:name>{publication_name}</news:name>
                                <news:language>{publication_language}</news:language>
                            </news:publication>
                            <news:publication_date>{publication_date}</news:publication_date>
                            <news:title>Bar &amp; bar</news:title>
                        </news:news>
                    </url>

                </urlset>
            """.format(
                base_url=self.__test_url,
                publication_name=self.TEST_PUBLICATION_NAME,
                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                publication_date=self.TEST_DATE_STR,
            )).strip(),
        },

        # Another index sitemap pointing to a second sitemap with stories
        '/sitemap_news_index_2.xml': {
            'header': 'Content-Type: application/xml',
            'content': textwrap.dedent("""
                <?xml version="1.0" encoding="UTF-8"?>
                <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">

                    <sitemap>
                        <!-- Extra whitespace added around URL -->
                        <loc>
                            {base_url}/sitemap_news_2.xml
                        </loc>
                        <lastmod>{last_modified}</lastmod>
                    </sitemap>

                    <!-- Nonexistent sitemap -->
                    <sitemap>
                        <loc>{base_url}/sitemap_news_nonexistent.xml</loc>
                        <lastmod>{last_modified}</lastmod>
                    </sitemap>

                </sitemapindex>
            """.format(base_url=self.__test_url, last_modified=self.TEST_DATE_STR)).strip(),
        },

        # Second sitemap with actual stories
        '/sitemap_news_2.xml': {
            'header': 'Content-Type: application/xml',
            'content': textwrap.dedent("""
                <?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                        xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
                        xmlns:xhtml="http://www.w3.org/1999/xhtml">

                    <!-- Has a duplicate story in /sitemap_news_1.xml -->
                    <url>
                        <!-- Extra whitespace added around URL -->
                        <loc>
                            {base_url}/news/bar.html#fragment_is_to_be_removed
                        </loc>

                        <xhtml:link rel="alternate"
                                    media="only screen and (max-width: 640px)"
                                    href="{base_url}/news/bar.html?mobile=1#fragment_is_to_be_removed" />

                        <news:news>
                            <news:publication>
                                <news:name>{publication_name}</news:name>
                                <news:language>{publication_language}</news:language>
                            </news:publication>
                            <news:publication_date>{publication_date}</news:publication_date>

                            <tag_without_inner_character_data name="value" />

                            <news:title>Bar &amp; bar</news:title>
                        </news:news>
                    </url>

                    <url>
                        <loc>{base_url}/news/baz.html</loc>

                        <xhtml:link rel="alternate"
                                    media="only screen and (max-width: 640px)"
                                    href="{base_url}/news/baz.html?mobile=1" />

                        <news:news>
                            <news:publication>
                                <news:name>{publication_name}</news:name>
                                <news:language>{publication_language}</news:language>
                            </news:publication>
                            <news:publication_date>{publication_date}</news:publication_date>
                            <news:title><![CDATA[Bąž]]></news:title>    <!-- CDATA and UTF-8 -->
                        </news:news>
                    </url>

                </urlset>
            """.format(
                base_url=self.__test_url,
                publication_name=self.TEST_PUBLICATION_NAME,
                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                publication_date=self.TEST_DATE_STR,
            )).strip(),
        },
    }

    # noinspection PyArgumentList
    expected_sitemap_tree = IndexRobotsTxtSitemap(
        url='{}/robots.txt'.format(self.__test_url),
        sub_sitemaps=[
            PagesXMLSitemap(
                url='{}/sitemap_pages.xml'.format(self.__test_url),
                pages=[
                    SitemapPage(
                        url='{}/about.html'.format(self.__test_url),
                        last_modified=self.TEST_DATE_DATETIME,
                        news_story=None,
                        change_frequency=SitemapPageChangeFrequency.MONTHLY,
                        priority=Decimal('0.8'),
                    ),
                    SitemapPage(
                        url='{}/contact.html'.format(self.__test_url),
                        last_modified=self.TEST_DATE_DATETIME,
                        news_story=None,

                        # Invalid input -- should be reset to "always"
                        change_frequency=SitemapPageChangeFrequency.ALWAYS,

                        # Invalid input -- should be reset to 0.5 (the default as per the spec)
                        priority=Decimal('0.5'),
                    ),
                ],
            ),
            IndexXMLSitemap(
                url='{}/sitemap_news_index_1.xml'.format(self.__test_url),
                sub_sitemaps=[
                    PagesXMLSitemap(
                        url='{}/sitemap_news_1.xml'.format(self.__test_url),
                        pages=[
                            SitemapPage(
                                url='{}/news/foo.html'.format(self.__test_url),
                                news_story=SitemapNewsStory(
                                    title='Foo <foo>',
                                    publish_date=self.TEST_DATE_DATETIME,
                                    publication_name=self.TEST_PUBLICATION_NAME,
                                    publication_language=self.TEST_PUBLICATION_LANGUAGE,
                                ),
                            ),
                            SitemapPage(
                                url='{}/news/bar.html'.format(self.__test_url),
                                news_story=SitemapNewsStory(
                                    title='Bar & bar',
                                    publish_date=self.TEST_DATE_DATETIME,
                                    publication_name=self.TEST_PUBLICATION_NAME,
                                    publication_language=self.TEST_PUBLICATION_LANGUAGE,
                                ),
                            ),
                        ],
                    ),
                    IndexXMLSitemap(
                        url='{}/sitemap_news_index_2.xml'.format(self.__test_url),
                        sub_sitemaps=[
                            PagesXMLSitemap(
                                url='{}/sitemap_news_2.xml'.format(self.__test_url),
                                pages=[
                                    SitemapPage(
                                        url='{}/news/bar.html'.format(self.__test_url),
                                        news_story=SitemapNewsStory(
                                            title='Bar & bar',
                                            publish_date=self.TEST_DATE_DATETIME,
                                            publication_name=self.TEST_PUBLICATION_NAME,
                                            publication_language=self.TEST_PUBLICATION_LANGUAGE,
                                        ),
                                    ),
                                    SitemapPage(
                                        url='{}/news/baz.html'.format(self.__test_url),
                                        news_story=SitemapNewsStory(
                                            title='Bąž',
                                            publish_date=self.TEST_DATE_DATETIME,
                                            publication_name=self.TEST_PUBLICATION_NAME,
                                            publication_language=self.TEST_PUBLICATION_LANGUAGE,
                                        ),
                                    ),
                                ],
                            ),
                            InvalidSitemap(
                                url='{}/sitemap_news_nonexistent.xml'.format(self.__test_url),
                                reason=(
                                    'Unable to fetch sitemap from {base_url}/sitemap_news_nonexistent.xml: '
                                    '404 Not Found'
                                ).format(base_url=self.__test_url),
                            ),
                        ],
                    ),
                ],
            ),
        ],
    )

    hs = HashServer(port=self.__test_port, pages=pages)
    hs.start()

    actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

    hs.stop()

    # PyCharm is not that great at formatting object diffs, so uncomment the following and set a breakpoint:
    #
    # expected_lines = str(expected_sitemap_tree).split()
    # actual_lines = str(actual_sitemap_tree).split()
    # diff = difflib.ndiff(expected_lines, actual_lines)
    # diff_str = '\n'.join(diff)
    # assert expected_lines == actual_lines

    assert expected_sitemap_tree == actual_sitemap_tree

    assert len(actual_sitemap_tree.all_pages()) == 5
def test_sitemap_tree_for_homepage_prematurely_ending_xml(self):
    """Test sitemap_tree_for_homepage() with clipped XML.

    Some webservers are misconfigured to limit a request's execution time to a certain number of seconds, and
    within that time the server can't always generate and compress a 50 MB sitemap XML, so the response gets cut
    off mid-document. Google News doesn't seem to have a problem with such truncated sitemaps, so we have to
    support them too.
    """
    pages = {
        '/': 'This is a homepage.',
        '/robots.txt': {
            'header': 'Content-Type: text/plain',
            'content': textwrap.dedent("""
                User-agent: *
                Disallow: /whatever

                Sitemap: {base_url}/sitemap.xml
            """.format(base_url=self.__test_url)).strip(),
        },
        '/sitemap.xml': {
            'content': textwrap.dedent("""
                <?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                        xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                    <url>
                        <loc>{base_url}/news/first.html</loc>
                        <news:news>
                            <news:publication>
                                <news:name>{publication_name}</news:name>
                                <news:language>{publication_language}</news:language>
                            </news:publication>
                            <news:publication_date>{publication_date}</news:publication_date>
                            <news:title>First story</news:title>
                        </news:news>
                    </url>
                    <url>
                        <loc>{base_url}/news/second.html</loc>
                        <news:news>
                            <news:publication>
                                <news:name>{publication_name}</news:name>
                                <news:language>{publication_language}</news:language>
                            </news:publication>
                            <news:publication_date>{publication_date}</news:publication_date>
                            <news:title>Second story</news:title>
                        </news:news>
                    </url>

                    <!-- The following story shouldn't get added as the XML ends prematurely -->
                    <url>
                        <loc>{base_url}/news/third.html</loc>
                        <news:news>
                            <news:publication>
                                <news:name>{publication_name}</news:name>
                                <news:language>{publication_language}</news:language>
                            </news:publication>
                            <news:publicat
            """.format(
                base_url=self.__test_url,
                publication_name=self.TEST_PUBLICATION_NAME,
                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                publication_date=self.TEST_DATE_STR,
            )).strip(),
        },
    }

    hs = HashServer(port=self.__test_port, pages=pages)
    hs.start()

    actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

    hs.stop()

    assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
    assert len(actual_sitemap_tree.sub_sitemaps) == 1

    sitemap = actual_sitemap_tree.sub_sitemaps[0]
    assert isinstance(sitemap, PagesXMLSitemap)
    assert len(sitemap.pages) == 2
def test_sitemap_tree_for_homepage_gzip(self):
    """Test sitemap_tree_for_homepage() with gzipped sitemaps."""
    pages = {
        '/': 'This is a homepage.',
        '/robots.txt': {
            'header': 'Content-Type: text/plain',
            'content': textwrap.dedent("""
                User-agent: *
                Disallow: /whatever

                Sitemap: {base_url}/sitemap_1.gz
                Sitemap: {base_url}/sitemap_2.dat
            """.format(base_url=self.__test_url)).strip(),
        },

        # Gzipped sitemap without correct HTTP header but with .gz extension
        '/sitemap_1.gz': {
            'content': gzip(textwrap.dedent("""
                <?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                        xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                    <url>
                        <loc>{base_url}/news/foo.html</loc>
                        <news:news>
                            <news:publication>
                                <news:name>{publication_name}</news:name>
                                <news:language>{publication_language}</news:language>
                            </news:publication>
                            <news:publication_date>{publication_date}</news:publication_date>
                            <news:title>Foo &lt;foo&gt;</news:title>    <!-- HTML entity decoding -->
                        </news:news>
                    </url>
                </urlset>
            """.format(
                base_url=self.__test_url,
                publication_name=self.TEST_PUBLICATION_NAME,
                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                publication_date=self.TEST_DATE_STR,
            )).strip()),
        },

        # Gzipped sitemap with correct HTTP header but without .gz extension
        '/sitemap_2.dat': {
            'header': 'Content-Type: application/x-gzip',
            'content': gzip(textwrap.dedent("""
                <?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
                        xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
                    <url>
                        <loc>{base_url}/news/baz.html</loc>
                        <news:news>
                            <news:publication>
                                <news:name>{publication_name}</news:name>
                                <news:language>{publication_language}</news:language>
                            </news:publication>
                            <news:publication_date>{publication_date}</news:publication_date>
                            <news:title><![CDATA[Bąž]]></news:title>    <!-- CDATA and UTF-8 -->
                        </news:news>
                    </url>
                </urlset>
            """.format(
                base_url=self.__test_url,
                publication_name=self.TEST_PUBLICATION_NAME,
                publication_language=self.TEST_PUBLICATION_LANGUAGE,
                publication_date=self.TEST_DATE_STR,
            )).strip()),
        },
    }

    hs = HashServer(port=self.__test_port, pages=pages)
    hs.start()

    actual_sitemap_tree = sitemap_tree_for_homepage(homepage_url=self.__test_url)

    hs.stop()

    # Don't do an in-depth check, we just need to make sure that gunzip works
    assert isinstance(actual_sitemap_tree, IndexRobotsTxtSitemap)
    assert len(actual_sitemap_tree.sub_sitemaps) == 2

    sitemap_1 = actual_sitemap_tree.sub_sitemaps[0]
    assert isinstance(sitemap_1, PagesXMLSitemap)
    assert len(sitemap_1.pages) == 1

    sitemap_2 = actual_sitemap_tree.sub_sitemaps[1]
    assert isinstance(sitemap_2, PagesXMLSitemap)
    assert len(sitemap_2.pages) == 1
def test_nyt_labels_annotator(self):
    media = self.db().create(table='media', insert_hash={
        'name': "test medium",
        'url': "url://test/medium",
    })

    story = self.db().create(table='stories', insert_hash={
        'media_id': media['media_id'],
        'url': 'url://story/a',
        'guid': 'guid://story/a',
        'title': 'story a',
        'description': 'description a',
        'publish_date': sql_now(),
        'collect_date': sql_now(),
        'full_text_rss': True,
    })
    stories_id = story['stories_id']

    self.db().create(table='story_sentences', insert_hash={
        'stories_id': stories_id,
        'sentence_number': 1,
        'sentence': 'I hope that the CLIFF annotator is working.',
        'media_id': media['media_id'],
        'publish_date': sql_now(),
        'language': 'en',
    })

    def __nyt_labels_sample_response(_: HashServer.Request) -> Union[str, bytes]:
        """Mock annotator."""
        response = ""
        response += "HTTP/1.0 200 OK\r\n"
        response += "Content-Type: application/json; charset=UTF-8\r\n"
        response += "\r\n"
        response += encode_json(self.__sample_nyt_labels_response())
        return response

    pages = {
        '/predict.json': {
            'callback': __nyt_labels_sample_response,
        }
    }

    port = random_unused_port()
    annotator_url = 'http://localhost:%d/predict.json' % port

    hs = HashServer(port=port, pages=pages)
    hs.start()

    # Inject NYTLabels credentials into configuration
    config = py_get_config()
    new_config = copy.deepcopy(config)
    new_config['nytlabels'] = {
        'enabled': True,
        'annotator_url': annotator_url,
    }
    py_set_config(new_config)

    nytlabels = NYTLabelsAnnotator()
    nytlabels.annotate_and_store_for_story(db=self.db(), stories_id=stories_id)
    nytlabels.update_tags_for_story(db=self.db(), stories_id=stories_id)

    hs.stop()

    # Reset configuration
    py_set_config(config)

    annotation_exists = self.db().query("""
        SELECT 1
        FROM nytlabels_annotations
        WHERE object_id = %(object_id)s
    """, {'object_id': stories_id}).hash()
    assert annotation_exists is not None

    story_tags = self.db().query("""
        SELECT
            tags.tag AS tags_name,
            tags.label AS tags_label,
            tags.description AS tags_description,
            tag_sets.name AS tag_sets_name,
            tag_sets.label AS tag_sets_label,
            tag_sets.description AS tag_sets_description
        FROM stories_tags_map
            INNER JOIN tags
                ON stories_tags_map.tags_id = tags.tags_id
            INNER JOIN tag_sets
                ON tags.tag_sets_id = tag_sets.tag_sets_id
        WHERE stories_tags_map.stories_id = %(stories_id)s
        ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
    """, {'stories_id': stories_id}).hashes()

    expected_tags = self.__expected_tags()
    assert story_tags == expected_tags
class TestUnivisionLocal(AbstractUnivisionTest, TestCase):
    URL = None
    PORT = None

    __slots__ = [
        '__hs',
    ]

    @classmethod
    def setUpClass(cls) -> None:
        super().setUpClass()

        cls.PORT = random_unused_port()
        cls.URL = f'http://localhost:{cls.PORT}'

    @classmethod
    def univision_credentials(cls) -> Optional[UnivisionTestCredentials]:
        return UnivisionTestCredentials(
            url=f"{cls.URL}/feed",
            client_id='foo',
            client_secret='bar',
        )

    @classmethod
    def expect_to_find_some_stories(cls) -> bool:
        # Test feed always has stories
        return True

    def setUp(self) -> None:
        super().setUp()

        pages = {
            '/feed': encode_json({
                'status': 'success',
                'data': {
                    'title': 'Sample Univision feed',
                    'totalItems': 2,
                    'items': [
                        {
                            'type': 'article',
                            'uid': '00000156-ba02-d374-ab77-feab13e20000',
                            'url': f"{self.URL}/first_article",
                            'publishDate': '2016-08-23T23:32:11-04:00',
                            'updateDate': '2016-08-24T10:09:26-04:00',
                            'title': 'First article: 🍕',  # UTF-8 in the title
                            'description': 'This is the first Univision sample article.',
                        },
                        {
                            'type': 'article',
                            'uid': '00000156-ba73-d5b6-affe-faf77f890000',
                            'url': f"{self.URL}/second_article",
                            'publishDate': '2016-08-23T23:20:13-04:00',
                            'updateDate': '2016-08-24T09:55:40-04:00',
                            'title': 'Second article: 🍔',  # UTF-8 in the title
                            'description': 'This is the second Univision sample article.',
                        },
                    ],
                },
            }),
            '/first_article': """
                <h1>First article</h1>
                <p>This is the first Univision sample article.</p>
            """,
            '/second_article': """
                <h1>Second article</h1>
                <p>This is the second Univision sample article.</p>
            """,
        }

        self.__hs = HashServer(port=self.PORT, pages=pages)
        self.__hs.start()

    def tearDown(self) -> None:
        self.__hs.stop()
class TestFetchBigFile(TestCase):
    __slots__ = [
        '__mock_data',
        '__hs',
        '__url',
        '__temp_dir',
        '__dest_file',
    ]

    def setUp(self) -> None:
        super().setUp()

        self.__mock_data = os.urandom(1024 * 1024)

        # noinspection PyUnusedLocal
        def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
            response = "".encode('utf-8')
            response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
            response += "Content-Type: audio/mpeg\r\n".encode('utf-8')
            response += f"Content-Length: {len(self.__mock_data)}\r\n".encode('utf-8')
            response += "\r\n".encode('utf-8')
            response += self.__mock_data
            return response

        port = random_unused_port()
        pages = {
            '/test.mp3': {
                'callback': __mp3_callback,
            }
        }

        self.__hs = HashServer(port=port, pages=pages)
        self.__hs.start()

        self.__url = f"http://127.0.0.1:{port}/test.mp3"

        self.__temp_dir = tempfile.mkdtemp('test')
        self.__dest_file = os.path.join(self.__temp_dir, 'test.mp3')

    def tearDown(self) -> None:
        self.__hs.stop()
        shutil.rmtree(self.__temp_dir)

    def test_simple(self):
        """Simple fetch."""
        assert not os.path.isfile(self.__dest_file), \
            f"File '{self.__dest_file}' shouldn't exist before downloading."

        fetch_big_file(url=self.__url, dest_file=self.__dest_file)

        assert os.path.isfile(self.__dest_file), \
            f"File '{self.__dest_file}' should exist after downloading."
        assert os.stat(self.__dest_file).st_size == len(self.__mock_data), \
            f"File '{self.__dest_file}' should be of {len(self.__mock_data)} bytes."

        with open(self.__dest_file, mode='rb') as f:
            downloaded_data = f.read()
            assert self.__mock_data == downloaded_data, \
                f"File's '{self.__dest_file}' data should be same as mock data."

    def test_max_size(self):
        """Fetch with max. size."""
        max_size = len(self.__mock_data) - 1000

        # Function should refuse to fetch more than max_size bytes
        with pytest.raises(McPodcastFileFetchFailureException):
            fetch_big_file(url=self.__url, dest_file=self.__dest_file, max_size=max_size)

        assert not os.path.isfile(self.__dest_file), \
            f"File '{self.__dest_file}' shouldn't exist after a failed download."