def test_http_hash_server_multiple_servers():
    """Test running multiple hash servers at the same time."""
    port_1 = random_unused_port()
    port_2 = random_unused_port()
    base_url_1 = 'http://localhost:%d' % port_1
    base_url_2 = 'http://localhost:%d' % port_2

    # noinspection PyTypeChecker,PyUnusedLocal
    def __callback_sleep_forever(request: HashServer.Request) -> Union[str, bytes]:
        time.sleep(9999)

    pages = {
        '/simple-page': 'Works!',
        '/sleep-forever': {'callback': __callback_sleep_forever},
    }

    hs_1 = HashServer(port=port_1, pages=pages)
    hs_2 = HashServer(port=port_2, pages=pages)
    assert hs_1
    assert hs_2

    hs_1.start()
    hs_2.start()

    assert tcp_port_is_open(port=port_1)
    assert tcp_port_is_open(port=port_2)

    for base_url in [base_url_1, base_url_2]:
        request_timed_out = False
        try:
            requests.get('%s/sleep-forever' % base_url, timeout=1)
        except requests.exceptions.Timeout:
            request_timed_out = True
        assert request_timed_out is True

        assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    hs_1.stop()
    hs_2.stop()

    assert tcp_port_is_open(port=port_1) is False
    assert tcp_port_is_open(port=port_2) is False

def setUp(self) -> None:
    super().setUp()

    self.__mock_data = os.urandom(1024 * 1024)

    # noinspection PyUnusedLocal
    def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
        response = "".encode('utf-8')
        response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
        response += "Content-Type: audio/mpeg\r\n".encode('utf-8')
        response += f"Content-Length: {len(self.__mock_data)}\r\n".encode('utf-8')
        response += "\r\n".encode('utf-8')
        response += self.__mock_data
        return response

    port = random_unused_port()
    pages = {
        '/test.mp3': {
            'callback': __mp3_callback,
        }
    }

    self.__hs = HashServer(port=port, pages=pages)
    self.__hs.start()

    self.__url = f"http://127.0.0.1:{port}/test.mp3"

    self.__temp_dir = tempfile.mkdtemp('test')
    self.__dest_file = os.path.join(self.__temp_dir, 'test.mp3')

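# A minimal sketch of a matching tearDown() for the setUp() above (an assumption -- the
# original class is not shown here): stop the mock MP3 server and remove the temporary
# directory created for the downloaded file.
def tearDown(self) -> None:
    import shutil

    # HashServer.stop() is used throughout these tests to release the port
    self.__hs.stop()

    # Remove the temp dir holding the destination file
    shutil.rmtree(self.__temp_dir, ignore_errors=True)

    super().tearDown()
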
def test_extract_article_html_from_page_html_connection_errors(self):
    """Try extracting with connection errors."""

    # Use multiprocessing.Value() because request might be handled in a fork
    self.is_first_response = multiprocessing.Value('i', 1)

    pages = {
        '/extract': {
            'callback': self.__extract_but_initially_fail,
        }
    }
    port = random_unused_port()

    hs = HashServer(port=port, pages=pages)
    hs.start()

    class MockExtractorCommonConfig(CommonConfig):
        """Mock configuration which points to our unstable extractor."""

        def extractor_api_url(self) -> str:
            return f'http://localhost:{port}/extract'

    extractor_response = extract_article_html_from_page_html(content='whatever', config=MockExtractorCommonConfig())

    hs.stop()

    assert extractor_response
    assert 'extracted_html' in extractor_response
    assert 'extractor_version' in extractor_response
    assert extractor_response['extracted_html'] == self.expected_extracted_text

    assert not self.is_first_response.value, "Make sure the initial extractor call failed."

def setUp(self):
    super().setUp()

    self.TEST_HTTP_SERVER_PORT = random_unused_port()
    self.TEST_HTTP_SERVER_URL = 'http://localhost:%d' % self.TEST_HTTP_SERVER_PORT

    self.STARTING_URL_WITHOUT_CRUFT = '%s/first' % self.TEST_HTTP_SERVER_URL
    self.STARTING_URL = self.STARTING_URL_WITHOUT_CRUFT + self.CRUFT

def test_wait_for_tcp_port_to_close():
    random_port = random_unused_port()
    assert wait_for_tcp_port_to_close(port=random_port, retries=2) is True

    # Open port
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(('localhost', random_port))
    s.listen()
    assert wait_for_tcp_port_to_close(port=random_port, retries=2) is False

    # Close port
    s.close()
    assert wait_for_tcp_port_to_close(port=random_port, retries=2) is True

def setUp(self) -> None:
    self.db = connect_to_db()

    self.port = random_unused_port()
    self.__hs = HashServer(port=self.port, pages=self.hashserver_pages())
    self.__hs.start()

    self.media = create_test_story_stack(db=self.db, data={'A': {'B': [1]}})
    self.feed = self.media['A']['feeds']['B']

def test_tcp_port_is_open():
    random_port = random_unused_port()
    assert tcp_port_is_open(random_port) is False

    # Open port
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(('localhost', random_port))
    s.listen()
    assert tcp_port_is_open(random_port) is True

    # Close port
    s.close()
    assert tcp_port_is_open(random_port) is False

def test_http_hash_server_stop():
    """Test if HTTP hash server gets stopped properly (including children)."""
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    # noinspection PyTypeChecker,PyUnusedLocal
    def __callback_sleep_forever(request: HashServer.Request) -> Union[str, bytes]:
        time.sleep(9999)

    pages = {
        '/simple-page': 'Works!',
        '/sleep-forever': {'callback': __callback_sleep_forever},
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()
    assert tcp_port_is_open(port=port)

    request_timed_out = False
    try:
        requests.get('%s/sleep-forever' % base_url, timeout=1)
    except requests.exceptions.Timeout:
        request_timed_out = True
    assert request_timed_out is True

    assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    # Restart the server with the same port, make sure it works again, i.e. the server gets stopped properly,
    # kills all its children and releases the port
    hs.stop()
    assert tcp_port_is_open(port=port) is False

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()
    assert tcp_port_is_open(port=port) is True

    assert str(requests.get('%s/simple-page' % base_url).text) == 'Works!'

    hs.stop()

def test_run_fetcher():
    db = connect_to_db()

    medium = create_test_medium(db=db, label='foo')
    feed = create_test_feed(db=db, label='foo', medium=medium)
    story = create_test_story(db=db, label='foo', feed=feed)

    port = random_unused_port()
    pages = {
        '/foo': 'foo',
        '/bar': 'bar',
    }

    hs = HashServer(port=port, pages=pages)
    hs.start()

    download = db.create(table='downloads', insert_hash={
        'state': 'pending',
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id'],
        'type': 'content',
        'sequence': 1,
        'priority': 1,
        'url': f"http://localhost:{port}/foo",
        'host': 'localhost',
    })

    db.query("""
        INSERT INTO queued_downloads (downloads_id)
        SELECT downloads_id
        FROM downloads
    """)

    run_fetcher(no_daemon=True)

    test_download = db.find_by_id(table='downloads', object_id=download['downloads_id'])
    assert test_download['state'] == 'success'

def setUp(self):
    super().setUp()

    self.__test_port = random_unused_port()
    self.__test_url = 'http://localhost:%d' % self.__test_port

async def test_workflow():
    db = connect_to_db()

    test_medium = create_test_medium(db=db, label='test')
    test_feed = create_test_feed(db=db, label='test', medium=test_medium)

    # 'label' is important as it will be stored in both stories.title and stories.description, which in turn will be
    # used to guess the probable language of the podcast episode
    test_story = create_test_story(db=db, label='keeping up with Kardashians', feed=test_feed)

    stories_id = test_story['stories_id']

    with open(TEST_MP3_PATH, mode='rb') as f:
        test_mp3_data = f.read()

    # noinspection PyUnusedLocal
    def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
        response = "".encode('utf-8')
        response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
        response += "Content-Type: audio/mpeg\r\n".encode('utf-8')
        response += f"Content-Length: {len(test_mp3_data)}\r\n".encode('utf-8')
        response += "\r\n".encode('utf-8')
        response += test_mp3_data
        return response

    port = random_unused_port()
    pages = {
        '/test.mp3': {
            'callback': __mp3_callback,
        }
    }

    hs = HashServer(port=port, pages=pages)
    hs.start()

    # Not localhost as this might get fetched from a remote worker
    mp3_url = hs.page_url('/test.mp3')

    db.insert(table='story_enclosures', insert_hash={
        'stories_id': stories_id,
        'url': mp3_url,
        'mime_type': 'audio/mpeg',
        'length': len(test_mp3_data),
    })

    client = workflow_client()

    # Start worker
    factory = WorkerFactory(client=client, namespace=client.namespace)
    worker = factory.new_worker(task_queue=TASK_QUEUE)

    # Use an activities implementation with random GCS prefixes set
    activities = _RandomPrefixesPodcastTranscribeActivities()

    worker.register_activities_implementation(
        activities_instance=activities,
        activities_cls_name=PodcastTranscribeActivities.__name__,
    )
    worker.register_workflow_implementation_type(impl_cls=PodcastTranscribeWorkflowImpl)
    factory.start()

    # Initialize workflow instance
    workflow: PodcastTranscribeWorkflow = client.new_workflow_stub(
        cls=PodcastTranscribeWorkflow,
        workflow_options=WorkflowOptions(
            workflow_id=str(stories_id),

            # By default, if individual activities of the workflow fail, they will get restarted pretty much
            # indefinitely, and so this test might run for days (or rather just timeout on the CI). So we cap the
            # workflow so that if it doesn't manage to complete in X minutes, we consider it as failed.
            workflow_run_timeout=timedelta(minutes=5),
        ),
    )

    # Wait for the workflow to complete
    await workflow.transcribe_episode(stories_id)

    downloads = db.select(table='downloads', what_to_select='*').hashes()
    assert len(downloads) == 1
    first_download = downloads[0]
    assert first_download['stories_id'] == stories_id
    assert first_download['type'] == 'content'
    assert first_download['state'] == 'success'

    download_content = fetch_content(db=db, download=first_download)

    # It's what gets said in the sample MP3 file
    assert 'Kim Kardashian' in download_content

    # Initiate the worker shutdown in the background while we do the GCS cleanup so that the stop_workers_faster()
    # doesn't have to wait that long
    await worker.stop(background=True)

    log.info("Cleaning up GCS...")
    GCSStore(bucket_config=activities.config.raw_enclosures()).delete_object(object_id=str(stories_id))
    GCSStore(bucket_config=activities.config.transcoded_episodes()).delete_object(object_id=str(stories_id))
    GCSStore(bucket_config=activities.config.transcripts()).delete_object(object_id=str(stories_id))
    log.info("Cleaned up GCS")

    log.info("Stopping workers...")
    await stop_worker_faster(worker)
    log.info("Stopped workers")

def test_random_unused_port():
    random_port = random_unused_port()
    assert tcp_port_is_open(random_port) is False

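# The port helpers exercised above (random_unused_port(), tcp_port_is_open(),
# wait_for_tcp_port_to_close()) come from the project's network utilities. A minimal
# sketch of what a tcp_port_is_open()-style check might look like -- an assumption for
# illustration, not the project's actual implementation:
import socket


def tcp_port_is_open_sketch(port: int, hostname: str = 'localhost') -> bool:
    """Return True if something is listening on hostname:port."""
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.settimeout(2)
    try:
        # connect_ex() returns 0 when the TCP connection succeeds
        return s.connect_ex((hostname, port)) == 0
    finally:
        s.close()
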
def test_http_hash_server_multiple_clients():
    """Test running hash server with multiple clients."""
    port = random_unused_port()

    # noinspection PyTypeChecker,PyUnusedLocal
    def __callback_timeout(request: HashServer.Request) -> Union[str, bytes]:
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "\r\n"
        r += "And now we wait"
        time.sleep(10)
        return str.encode(r)

    pages = {
        '/a': '𝘛𝘩𝘪𝘴 𝘪𝘴 𝘱𝘢𝘨𝘦 𝘈.',
        '/timeout': {'callback': __callback_timeout},
        # '/does-not-exist': '404',
        '/b': '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖕𝖆𝖌𝖊 𝕭.',
        '/c': '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕡𝕒𝕘𝕖 ℂ.',
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()
    assert tcp_port_is_open(port=port)

    base_url = 'http://localhost:%d' % port

    session = FuturesSession(max_workers=10)

    future_a = session.get('%s/a' % base_url, timeout=2)
    future_timeout = session.get('%s/timeout' % base_url, timeout=2)
    future_404 = session.get('%s/does-not-exist' % base_url, timeout=2)
    future_b = session.get('%s/b' % base_url, timeout=2)
    future_c = session.get('%s/c' % base_url, timeout=2)

    response_a = future_a.result()

    with pytest.raises(requests.Timeout):
        future_timeout.result()

    response_404 = future_404.result()
    response_b = future_b.result()
    response_c = future_c.result()

    assert response_b.status_code == 200
    assert response_b.text == '𝕿𝖍𝖎𝖘 𝖎𝖘 𝖕𝖆𝖌𝖊 𝕭.'

    assert response_c.status_code == 200
    assert response_c.text == '𝕋𝕙𝕚𝕤 𝕚𝕤 𝕡𝕒𝕘𝕖 ℂ.'

    assert response_404.status_code == 404

    assert response_a.status_code == 200
    assert response_a.text == '𝘛𝘩𝘪𝘴 𝘪𝘴 𝘱𝘢𝘨𝘦 𝘈.'

    hs.stop()

def test_cliff_annotator():
    db = connect_to_db()

    media = db.create(table='media', insert_hash={
        'name': "test medium",
        'url': "url://test/medium",
    })

    story = db.create(table='stories', insert_hash={
        'media_id': media['media_id'],
        'url': 'url://story/a',
        'guid': 'guid://story/a',
        'title': 'story a',
        'description': 'description a',
        'publish_date': sql_now(),
        'collect_date': sql_now(),
        'full_text_rss': True,
    })
    stories_id = story['stories_id']

    db.create(table='story_sentences', insert_hash={
        'stories_id': stories_id,
        'sentence_number': 1,
        'sentence': 'I hope that the CLIFF annotator is working.',
        'media_id': media['media_id'],
        'publish_date': sql_now(),
        'language': 'en'
    })

    def __cliff_sample_response(_: HashServer.Request) -> Union[str, bytes]:
        """Mock annotator."""
        response = ""
        response += "HTTP/1.0 200 OK\r\n"
        response += "Content-Type: application/json; charset=UTF-8\r\n"
        response += "\r\n"
        response += encode_json(sample_cliff_response())
        return response

    pages = {
        '/cliff/parse/text': {
            'callback': __cliff_sample_response,
        }
    }

    port = random_unused_port()
    annotator_url = 'http://localhost:%d/cliff/parse/text' % port

    hs = HashServer(port=port, pages=pages)
    hs.start()

    class TestCLIFFFetcherConfig(CLIFFFetcherConfig):
        @staticmethod
        def annotator_url() -> str:
            return annotator_url

    cliff = CLIFFAnnotatorFetcher(fetcher_config=TestCLIFFFetcherConfig())
    cliff.annotate_and_store_for_story(db=db, stories_id=stories_id)

    hs.stop()

    annotation_exists = db.query("""
        SELECT 1
        FROM cliff_annotations
        WHERE object_id = %(object_id)s
    """, {'object_id': stories_id}).hash()
    assert annotation_exists is not None

def test_tagging(self):
    db = connect_to_db()

    media = db.create(table='media', insert_hash={
        'name': "test medium",
        'url': "url://test/medium",
    })

    story = db.create(table='stories', insert_hash={
        'media_id': media['media_id'],
        'url': 'url://story/a',
        'guid': 'guid://story/a',
        'title': 'story a',
        'description': 'description a',
        'publish_date': sql_now(),
        'collect_date': sql_now(),
        'full_text_rss': True,
    })
    stories_id = story['stories_id']

    db.create(table='story_sentences', insert_hash={
        'stories_id': stories_id,
        'sentence_number': 1,
        'sentence': 'I hope that the CLIFF annotator is working.',
        'media_id': media['media_id'],
        'publish_date': sql_now(),
        'language': 'en'
    })

    def __cliff_sample_response(_: HashServer.Request) -> Union[str, bytes]:
        """Mock annotator."""
        response = ""
        response += "HTTP/1.0 200 OK\r\n"
        response += "Content-Type: application/json; charset=UTF-8\r\n"
        response += "\r\n"
        response += encode_json(sample_cliff_response())
        return response

    pages = {
        '/cliff/parse/text': {
            'callback': __cliff_sample_response,
        }
    }

    port = random_unused_port()
    annotator_url = 'http://localhost:%d/cliff/parse/text' % port

    hs = HashServer(port=port, pages=pages)
    hs.start()

    class TestCLIFFFetcherConfig(CLIFFTagsFromAnnotationConfig):
        @staticmethod
        def annotator_url() -> str:
            return annotator_url

    cliff = CLIFFTagsFromAnnotation(tagger_config=TestCLIFFFetcherConfig())
    cliff.update_tags_for_story(db=db, stories_id=stories_id)

    hs.stop()

    story_tags = db.query("""
        SELECT
            tags.tag AS tags_name,
            tags.label AS tags_label,
            tags.description AS tags_description,
            tag_sets.name AS tag_sets_name,
            tag_sets.label AS tag_sets_label,
            tag_sets.description AS tag_sets_description
        FROM stories_tags_map
            INNER JOIN tags
                ON stories_tags_map.tags_id = tags.tags_id
            INNER JOIN tag_sets
                ON tags.tag_sets_id = tag_sets.tag_sets_id
        WHERE stories_tags_map.stories_id = %(stories_id)s
        ORDER BY lower(tag_sets.name), lower(tags.tag)
    """, {'stories_id': stories_id}).hashes()

    expected_tags = expected_cliff_tags()

    assert story_tags == expected_tags

def test_fetch_and_store_episode():
    db = connect_to_db()

    test_medium = create_test_medium(db=db, label='test')
    test_feed = create_test_feed(db=db, label='test', medium=test_medium)

    # 'label' is important as it will be stored in both stories.title and stories.description, which in turn will be
    # used to guess the probable language of the podcast episode
    test_story = create_test_story(db=db, label='keeping up with Kardashians', feed=test_feed)

    stories_id = test_story['stories_id']

    with open(TEST_MP3_PATH, mode='rb') as f:
        test_mp3_data = f.read()

    # noinspection PyUnusedLocal
    def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
        response = "".encode('utf-8')
        response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
        response += "Content-Type: audio/mpeg\r\n".encode('utf-8')
        response += f"Content-Length: {len(test_mp3_data)}\r\n".encode('utf-8')
        response += "\r\n".encode('utf-8')
        response += test_mp3_data
        return response

    port = random_unused_port()
    pages = {
        '/test.mp3': {
            'callback': __mp3_callback,
        }
    }

    hs = HashServer(port=port, pages=pages)
    hs.start()

    mp3_url = f'http://127.0.0.1:{port}/test.mp3'

    story_enclosure = db.insert(table='story_enclosures', insert_hash={
        'stories_id': stories_id,
        'url': mp3_url,
        'mime_type': 'audio/mpeg',
        'length': len(test_mp3_data),
    })

    conf = RandomPathPrefixConfig()
    fetch_and_store_episode(db=db, stories_id=stories_id, config=conf)

    episodes = db.select(table='podcast_episodes', what_to_select='*').hashes()
    assert len(episodes) == 1, "Only one episode is expected."

    episode = episodes[0]
    assert episode['stories_id'] == stories_id
    assert episode['story_enclosures_id'] == story_enclosure['story_enclosures_id']
    assert episode['gcs_uri'] == f"gs://{conf.gc_storage_bucket_name()}/{conf.gc_storage_path_prefix()}/{stories_id}"
    assert episode['duration'] > 0
    assert episode['codec'] == 'MP3'
    assert episode['sample_rate'] == 44100
    assert episode['bcp47_language_code'] == 'en-US'

    # Try removing test object
    gcs = GCSStore(config=conf)
    gcs.delete_object(object_id=str(stories_id))

def test_http_hash_server():
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    def __simple_callback(request: HashServer.Request) -> Union[str, bytes]:
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback',
            'method': request.method(),
            'url': request.url(),
            'content-type': request.content_type(),
            'params': request.query_params(),
            'cookies': request.cookies(),
        })
        return str.encode(r)

    # noinspection PyUnusedLocal
    def __callback_cookie_redirect(request: HashServer.Request) -> str:
        r = ""
        r += "HTTP/1.0 302 Moved Temporarily\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "Location: /check_cookie\r\n"
        r += "Set-Cookie: test_cookie=I'm a cookie and I know it!\r\n"
        r += "\r\n"
        r += "Redirecting to the cookie check page..."
        return r

    def __callback_post(request: HashServer.Request) -> Union[str, bytes]:
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback_post',
            'post_data': request.content(),
        })
        return str.encode(r)

    pages = {
        '/': 'home',
        '/foo': b'foo',
        '/bar': 'bar ąą',
        '/foo-bar': {b'redirect': b'/bar'},
        '/localhost': {'redirect': "http://localhost:%d/" % port},
        b'/127-foo': {b'redirect': "http://127.0.0.1:%d/foo" % port},
        '/auth': {b'auth': b'foo:bar', b'content': b"foo bar \xf0\x90\x28\xbc"},
        '/404': {b'content': b'not found', b'http_status_code': 404},
        '/callback': {b'callback': __simple_callback},

        # Test setting cookies, redirects
        '/callback_cookie_redirect': {'callback': __callback_cookie_redirect},

        # POST data
        '/callback_post': {'callback': __callback_post},
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()
    assert tcp_port_is_open(port=port)

    assert str(requests.get('%s/' % base_url).text) == 'home'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/foo-bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/localhost' % base_url).text) == 'home'
    assert str(requests.get('%s/127-foo' % base_url).text) == 'foo'

    # Path normalization
    assert str(requests.get('%s//' % base_url).text) == 'home'
    assert str(requests.get('%s///' % base_url).text) == 'home'
    assert str(requests.get('%s/something/../' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..//' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..///' % base_url).text) == 'home'
    assert str(requests.get('%s/foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo///' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo///' % base_url).text) == 'foo'

    response_json = requests.get('%s/callback?a=b&c=d' % base_url, cookies={'cookie_name': 'cookie_value'}).json()
    assert response_json == {
        'name': 'callback',
        'method': 'GET',
        'url': 'http://localhost:%d/callback?a=b&c=d' % port,
        'content-type': None,
        'params': {
            'a': 'b',
            'c': 'd',
        },
        'cookies': {
            'cookie_name': 'cookie_value',
        },
    }

    response = requests.get('%s/callback_cookie_redirect' % base_url, allow_redirects=False)
    assert response.status_code == 302
    assert response.headers['Location'] == '/check_cookie'

    response = requests.get("%s/404" % base_url)
    assert response.status_code == HTTPStatus.NOT_FOUND.value
    assert 'Not Found' in response.reason

    auth_url = "%s/auth" % base_url
    assert requests.get(auth_url).status_code == HTTPStatus.UNAUTHORIZED
    assert requests.get(auth_url, auth=('foo', 'foo')).status_code == HTTPStatus.UNAUTHORIZED

    response = requests.get(auth_url, auth=('foo', 'bar'))
    assert response.status_code == HTTPStatus.OK
    assert response.content == b"foo bar \xf0\x90\x28\xbc"

    assert urls_are_equal(url1=hs.page_url('/callback?a=b&c=d'), url2='http://localhost:%d/callback' % port)
    with pytest.raises(McHashServerException):
        hs.page_url('/does-not-exist')

    response_json = requests.post('%s/callback_post' % base_url, data='abc=def').json()
    assert response_json == {
        'name': 'callback_post',
        'post_data': 'abc=def',
    }

    hs.stop()

def setUpClass(cls) -> None:
    super().setUpClass()

    cls.PORT = random_unused_port()
    cls.URL = f'http://localhost:{cls.PORT}'

def __init__(self, pages: Dict[str, Any]):
    self.__port = random_unused_port()
    self.__hs = HashServer(port=self.__port, pages=pages)
    self.__hs.start()

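# A hedged sketch of how such a HashServer wrapper might round itself out (assumed helper
# methods, not part of the original snippet): expose absolute URLs for registered pages and
# a stop() that releases the port, delegating to HashServer.page_url() / HashServer.stop()
# as used elsewhere in these tests.
def page_url(self, path: str) -> str:
    # HashServer.page_url() raises McHashServerException for unregistered paths
    return self.__hs.page_url(path)

def stop(self) -> None:
    self.__hs.stop()
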
def test_nyt_labels_annotator(self):
    media = self.db().create(table='media', insert_hash={
        'name': "test medium",
        'url': "url://test/medium",
    })

    story = self.db().create(table='stories', insert_hash={
        'media_id': media['media_id'],
        'url': 'url://story/a',
        'guid': 'guid://story/a',
        'title': 'story a',
        'description': 'description a',
        'publish_date': sql_now(),
        'collect_date': sql_now(),
        'full_text_rss': True,
    })
    stories_id = story['stories_id']

    self.db().create(table='story_sentences', insert_hash={
        'stories_id': stories_id,
        'sentence_number': 1,
        'sentence': 'I hope that the CLIFF annotator is working.',
        'media_id': media['media_id'],
        'publish_date': sql_now(),
        'language': 'en'
    })

    def __nyt_labels_sample_response(_: HashServer.Request) -> Union[str, bytes]:
        """Mock annotator."""
        response = ""
        response += "HTTP/1.0 200 OK\r\n"
        response += "Content-Type: application/json; charset=UTF-8\r\n"
        response += "\r\n"
        response += encode_json(self.__sample_nyt_labels_response())
        return response

    pages = {
        '/predict.json': {
            'callback': __nyt_labels_sample_response,
        }
    }

    port = random_unused_port()
    annotator_url = 'http://localhost:%d/predict.json' % port

    hs = HashServer(port=port, pages=pages)
    hs.start()

    # Inject NYTLabels credentials into configuration
    config = py_get_config()
    new_config = copy.deepcopy(config)
    new_config['nytlabels'] = {
        'enabled': True,
        'annotator_url': annotator_url,
    }
    py_set_config(new_config)

    nytlabels = NYTLabelsAnnotator()
    nytlabels.annotate_and_store_for_story(db=self.db(), stories_id=stories_id)
    nytlabels.update_tags_for_story(db=self.db(), stories_id=stories_id)

    hs.stop()

    # Reset configuration
    py_set_config(config)

    annotation_exists = self.db().query("""
        SELECT 1
        FROM nytlabels_annotations
        WHERE object_id = %(object_id)s
    """, {'object_id': stories_id}).hash()
    assert annotation_exists is not None

    story_tags = self.db().query("""
        SELECT
            tags.tag AS tags_name,
            tags.label AS tags_label,
            tags.description AS tags_description,
            tag_sets.name AS tag_sets_name,
            tag_sets.label AS tag_sets_label,
            tag_sets.description AS tag_sets_description
        FROM stories_tags_map
            INNER JOIN tags
                ON stories_tags_map.tags_id = tags.tags_id
            INNER JOIN tag_sets
                ON tags.tag_sets_id = tag_sets.tag_sets_id
        WHERE stories_tags_map.stories_id = %(stories_id)s
        ORDER BY tags.tag COLLATE "C", tag_sets.name COLLATE "C"
    """, {'stories_id': stories_id}).hashes()

    expected_tags = self.__expected_tags()
    assert story_tags == expected_tags

def test_http_hash_server():
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    def __simple_callback(params: dict, cookies: dict) -> str:
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback',
            'params': params,
            'cookies': cookies,
        })
        return r

    # noinspection PyUnusedLocal
    def __callback_cookie_redirect(params: dict, cookies: dict) -> str:
        r = ""
        r += "HTTP/1.0 302 Moved Temporarily\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "Location: /check_cookie\r\n"
        r += "Set-Cookie: test_cookie=I'm a cookie and I know it!\r\n"
        r += "\r\n"
        r += "Redirecting to the cookie check page..."
        return r

    pages = {
        '/': 'home',
        '/foo': 'foo',
        '/bar': 'bar',
        '/foo-bar': {'redirect': '/bar'},
        '/localhost': {'redirect': "http://localhost:%d/" % port},
        '/127-foo': {'redirect': "http://127.0.0.1:%d/foo" % port},
        '/auth': {'auth': 'foo:bar', 'content': 'foo bar'},
        '/404': {'content': 'not found', 'http_status_code': 404},
        '/callback': {'callback': __simple_callback},

        # Test setting cookies, redirects
        '/callback_cookie_redirect': {'callback': __callback_cookie_redirect},
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()
    assert tcp_port_is_open(port=port)

    assert str(requests.get('%s/' % base_url).text) == 'home'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar' % base_url).text) == 'bar'
    assert str(requests.get('%s/foo-bar' % base_url).text) == 'bar'
    assert str(requests.get('%s/localhost' % base_url).text) == 'home'
    assert str(requests.get('%s/127-foo' % base_url).text) == 'foo'

    response_json = requests.get('%s/callback?a=b&c=d' % base_url, cookies={'cookie_name': 'cookie_value'}).json()
    assert response_json == {
        'name': 'callback',
        'params': {
            'a': 'b',
            'c': 'd',
        },
        'cookies': {
            'cookie_name': 'cookie_value',
        },
    }

    response = requests.get('%s/callback_cookie_redirect' % base_url, allow_redirects=False)
    assert response.status_code == 302
    assert response.headers['Location'] == '/check_cookie'

    response = requests.get("%s/404" % base_url)
    assert response.status_code == HTTPStatus.NOT_FOUND.value
    assert 'Not Found' in response.reason

    auth_url = "%s/auth" % base_url
    assert requests.get(auth_url).status_code == HTTPStatus.UNAUTHORIZED
    assert requests.get(auth_url, auth=('foo', 'foo')).status_code == HTTPStatus.UNAUTHORIZED

    response = requests.get(auth_url, auth=('foo', 'bar'))
    assert response.status_code == HTTPStatus.OK
    assert response.text == 'foo bar'

    assert hs.page_url('/callback?a=b&c=d') == 'http://localhost:%d/callback' % port
    assert_raises(McHashServerException, hs.page_url, '/does-not-exist')

    hs.stop()
