def test_fetch_url():
    db = connect_to_db()

    def _meta_redirect(r):
        resp = ""
        resp += 'HTTP/1.0 200 OK\r\n'
        resp += 'Content-Type: text/html\r\n\r\n'
        resp += '<meta http-equiv="refresh" content="0; url=%s-foo">\n' % r.url()
        return resp

    hs = HashServer(
        port=0,
        pages={
            '/foo': 'bar',
            '/400': {'http_status_code': 400},
            '/404': {'http_status_code': 404},
            '/500': {'http_status_code': 500},
            '/mr-foo': 'meta redirect target',
            '/mr': {'callback': _meta_redirect},
        })

    hs.start(delay=2)

    port = hs.port()

    timeout_args = {
        'network_down_host': 'localhost',
        'network_down_port': port,
        'network_down_timeout': 1,
        'domain_timeout': 0,
    }

    # before the delayed start, 404s and 500s should still return failed responses
    assert not _fetch_url(db, hs.page_url('/404'), **timeout_args).is_success
    assert not _fetch_url(db, hs.page_url('/500'), **timeout_args).is_success

    # a request for a valid page should make the call wait until the hs comes up
    assert _fetch_url(db, hs.page_url('/foo'), **timeout_args).content == 'bar'

    # and now a 400 should return a failed response
    assert not _fetch_url(db, hs.page_url('/400'), **timeout_args).is_success

    # make sure an invalid URL does not raise an exception
    assert _fetch_url(db, 'this is not a url', **timeout_args) is not None

    # make sure that requests follow meta redirects
    response = _fetch_url(db, hs.page_url('/mr'), **timeout_args)
    assert response.content == 'meta redirect target'
    assert response.last_requested_url == hs.page_url('/mr-foo')
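
# A minimal sketch, assuming a hypothetical fetch_once() callable, of the retry-until-up pattern
# that the network_down_* arguments above appear to exercise: a failed fetch is only treated as
# final once a sentinel host:port is reachable (i.e. the network itself is up); otherwise the
# fetch is retried. This is an illustration, not Media Cloud's actual _fetch_url internals.
import socket
import time


def _fetch_with_network_check(fetch_once, network_down_host: str, network_down_port: int,
                              network_down_timeout: int = 1):
    def _port_is_open(host: str, port: int) -> bool:
        try:
            with socket.create_connection((host, port), timeout=1):
                return True
        except OSError:
            return False

    while True:
        response = fetch_once()
        if response.is_success:
            return response
        if _port_is_open(network_down_host, network_down_port):
            # The network is up, so this failure (400/404/500, ...) is genuine
            return response
        # Sentinel port unreachable: assume a transient outage and retry after a short wait
        time.sleep(network_down_timeout)
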
def test_delay() -> None:
    """Test the delay= parameter to hs.start()."""
    hs = HashServer(port=0, pages={'/foo': 'bar'})

    hs.start(delay=1)

    caught_exception = False
    try:
        requests.get(hs.page_url('/foo'))
    except requests.exceptions.ConnectionError:
        caught_exception = True
    assert caught_exception

    time.sleep(2)
    assert str(requests.get(hs.page_url('/foo')).text) == 'bar'

    hs.stop()
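
# time.sleep(2) above assumes the delayed server is up within a fixed window, which can be flaky
# on slow CI machines. A more deterministic wait, assuming the tcp_port_is_open() helper used by
# the other tests here, polls until the server actually accepts connections (hypothetical helper):
def _wait_until_up(hs: HashServer, deadline_s: float = 10.0, poll_s: float = 0.1) -> None:
    """Block until hs accepts connections; raise TimeoutError if the deadline passes."""
    deadline = time.time() + deadline_s
    while not tcp_port_is_open(hs.port()):
        if time.time() > deadline:
            raise TimeoutError('HashServer did not come up within %.1f seconds' % deadline_s)
        time.sleep(poll_s)
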
def test_request(self) -> None:
    """Test requests with throttling."""
    pages = {'/test': 'Hello!'}
    port = 8888
    hs = HashServer(port=port, pages=pages)
    hs.start()

    ua = ThrottledUserAgent(self.db(), domain_timeout=2)

    test_url = hs.page_url('/test')

    # the first request should work
    response = ua.get(test_url)
    assert response.decoded_content() == 'Hello!'

    # fail because we're still within the timeout
    ua = ThrottledUserAgent(self.db(), domain_timeout=2)
    self.assertRaises(McThrottledDomainException, ua.get, test_url)

    # succeed because it's a different domain
    ua = ThrottledUserAgent(self.db(), domain_timeout=2)
    response = ua.get('http://127.0.0.1:%d/test' % port)
    assert response.decoded_content() == 'Hello!'

    # still fail within the timeout
    ua = ThrottledUserAgent(self.db(), domain_timeout=2)
    self.assertRaises(McThrottledDomainException, ua.get, test_url)

    time.sleep(2)

    # now we're outside the timeout, so the request should work
    ua = ThrottledUserAgent(self.db(), domain_timeout=2)
    response = ua.get(test_url)
    assert response.decoded_content() == 'Hello!'

    # and a follow-up request on the same ua object should work
    response = ua.get(test_url)
    assert response.decoded_content() == 'Hello!'

    # but then fail within the new timeout period with a new object
    ua = ThrottledUserAgent(self.db(), domain_timeout=2)
    self.assertRaises(McThrottledDomainException, ua.get, test_url)

    hs.stop()

    # test the domain_timeout assignment logic
    ua = ThrottledUserAgent(self.db(), domain_timeout=100)
    assert ua.domain_timeout == 100

    config = mediawords.util.config.get_config()
    config['mediawords']['throttled_user_agent_domain_timeout'] = 200
    ua = ThrottledUserAgent(self.db())
    assert ua.domain_timeout == 200

    del config['mediawords']['throttled_user_agent_domain_timeout']
    ua = ThrottledUserAgent(self.db())
    assert ua.domain_timeout == mediawords.util.web.user_agent.throttled._DEFAULT_DOMAIN_TIMEOUT
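
# For callers that would rather back off than crash on throttling, a retry wrapper might look like
# the sketch below. The helper name and retry policy are illustrative, not part of the
# ThrottledUserAgent API; a fresh agent is created per attempt because each ThrottledUserAgent
# above performs its own throttling check.
def _get_with_throttle_retry(db, url: str, domain_timeout: int = 2, max_attempts: int = 5):
    for _ in range(max_attempts):
        ua = ThrottledUserAgent(db, domain_timeout=domain_timeout)
        try:
            return ua.get(url)
        except McThrottledDomainException:
            # Another request hit this domain within domain_timeout seconds; wait it out
            time.sleep(domain_timeout)
    raise McThrottledDomainException('still throttled after %d attempts' % max_attempts)
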
def test_random_port() -> None:
    """Test assigning a random port when port=0."""
    hss = []
    for _ in range(3):
        hs = HashServer(port=0, pages={'/foo': 'bar'})
        assert hs is not None

        hs.start()

        assert hs.port() >= START_RANDOM_PORT
        assert tcp_port_is_open(hs.port())
        assert str(requests.get(hs.page_url('/foo')).text) == 'bar'

        hss.append(hs)

    for hs in hss:
        hs.stop()
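
# start()/stop() bookkeeping like the above is easy to leak when an assertion fails midway.
# A small context manager (illustrative; not part of the HashServer API) makes cleanup automatic:
from contextlib import contextmanager


@contextmanager
def _running_hash_server(pages: dict):
    hs = HashServer(port=0, pages=pages)
    hs.start()
    try:
        yield hs
    finally:
        hs.stop()

# Usage:
#
#     with _running_hash_server({'/foo': 'bar'}) as hs:
#         assert requests.get(hs.page_url('/foo')).text == 'bar'
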
def test_http_hash_server():
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    def __simple_callback(request: HashServer.Request) -> Union[str, bytes]:
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback',
            'method': request.method(),
            'url': request.url(),
            'content-type': request.content_type(),
            'params': request.query_params(),
            'cookies': request.cookies(),
        })
        return str.encode(r)

    # noinspection PyUnusedLocal
    def __callback_cookie_redirect(request: HashServer.Request) -> str:
        r = ""
        r += "HTTP/1.0 302 Moved Temporarily\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "Location: /check_cookie\r\n"
        r += "Set-Cookie: test_cookie=I'm a cookie and I know it!\r\n"
        r += "\r\n"
        r += "Redirecting to the cookie check page..."
        return r

    def __callback_post(request: HashServer.Request) -> Union[str, bytes]:
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback_post',
            'post_data': request.content(),
        })
        return str.encode(r)

    pages = {
        '/': 'home',
        '/foo': b'foo',
        '/bar': 'bar ąą',
        '/foo-bar': {b'redirect': b'/bar'},
        '/localhost': {'redirect': "http://localhost:%d/" % port},
        b'/127-foo': {b'redirect': "http://127.0.0.1:%d/foo" % port},
        '/auth': {b'auth': b'foo:bar', b'content': b"foo bar \xf0\x90\x28\xbc"},
        '/404': {b'content': b'not found', b'http_status_code': 404},
        '/callback': {b'callback': __simple_callback},

        # Test setting cookies, redirects
        '/callback_cookie_redirect': {'callback': __callback_cookie_redirect},

        # POST data
        '/callback_post': {'callback': __callback_post},
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()

    assert tcp_port_is_open(port=port)

    assert str(requests.get('%s/' % base_url).text) == 'home'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/foo-bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/localhost' % base_url).text) == 'home'
    assert str(requests.get('%s/127-foo' % base_url).text) == 'foo'

    # Path normalization
    assert str(requests.get('%s//' % base_url).text) == 'home'
    assert str(requests.get('%s///' % base_url).text) == 'home'
    assert str(requests.get('%s/something/../' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..//' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..///' % base_url).text) == 'home'
    assert str(requests.get('%s/foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo///' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo///' % base_url).text) == 'foo'

    response_json = requests.get(
        '%s/callback?a=b&c=d' % base_url,
        cookies={'cookie_name': 'cookie_value'},
    ).json()
    assert response_json == {
        'name': 'callback',
        'method': 'GET',
        'url': 'http://localhost:%d/callback?a=b&c=d' % port,
        'content-type': None,
        'params': {
            'a': 'b',
            'c': 'd',
        },
        'cookies': {
            'cookie_name': 'cookie_value',
        },
    }

    response = requests.get('%s/callback_cookie_redirect' % base_url, allow_redirects=False)
    assert response.status_code == 302
    assert response.headers['Location'] == '/check_cookie'

    response = requests.get("%s/404" % base_url)
    assert response.status_code == HTTPStatus.NOT_FOUND.value
    assert 'Not Found' in response.reason

    auth_url = "%s/auth" % base_url

    assert requests.get(auth_url).status_code == HTTPStatus.UNAUTHORIZED
    assert requests.get(auth_url, auth=('foo', 'foo')).status_code == HTTPStatus.UNAUTHORIZED

    response = requests.get(auth_url, auth=('foo', 'bar'))
    assert response.status_code == HTTPStatus.OK
    assert response.content == b"foo bar \xf0\x90\x28\xbc"

    assert urls_are_equal(url1=hs.page_url('/callback?a=b&c=d'), url2='http://localhost:%d/callback' % port)
    with pytest.raises(McHashServerException):
        hs.page_url('/does-not-exist')

    response_json = requests.post('%s/callback_post' % base_url, data='abc=def').json()
    assert response_json == {
        'name': 'callback_post',
        'post_data': 'abc=def',
    }

    hs.stop()
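
# Taken together, the assertions above exercise every page definition form that this test shows
# HashServer accepting: a plain str or bytes body, plus dicts keyed (by str or bytes alike) with
# 'redirect', 'auth' + 'content', 'http_status_code' + 'content', or 'callback'. A compact
# reference sketch of those forms:
_EXAMPLE_PAGES = {
    '/plain': 'string body',                        # served as-is
    '/bytes': b'bytes body',                        # raw bytes, served without decoding
    '/moved': {'redirect': '/plain'},               # redirect to a path or absolute URL
    '/secret': {'auth': 'user:pass',                # HTTP basic auth credentials...
                'content': 'for your eyes only'},   # ...guarding this body
    '/gone': {'http_status_code': 410,              # custom status code...
              'content': 'gone for good'},          # ...with a custom body
    '/dynamic': {'callback': lambda request: (      # a callback returns a raw HTTP response:
        'HTTP/1.0 200 OK\r\n'                       # status line,
        'Content-Type: text/plain\r\n'              # headers,
        '\r\n'                                      # a blank line,
        'hello from a callback'                     # then the body
    )},
}
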
def test_fetch_link_job_update_state():
    db = connect_to_db()

    # In the testing environment, RabbitMQ or workers might be slow to start up, so we increase
    # the delay ("timeout") between domain fetches to be able to test throttling
    domain_timeout = 360

    hs = HashServer(
        port=0,
        pages={
            '/foo': '<title>foo</title>',
            '/throttle': '<title>throttle</title>',
        })
    hs.start()

    topic = create_test_topic(db, 'foo')
    topic['pattern'] = '.'
    topic = db.update_by_id('topics', topic['topics_id'], topic)

    fetch_url = hs.page_url('/foo')

    # basic sanity test for link fetching
    tfu = db.create(
        'topic_fetch_urls', {
            'topics_id': topic['topics_id'],
            'url': fetch_url,
            'state': FETCH_STATE_PENDING,
        })

    fetch_topic_url_update_state(
        db=db,
        topics_id=topic['topics_id'],
        topic_fetch_urls_id=tfu['topic_fetch_urls_id'],
        domain_timeout=domain_timeout,
    )

    tfu = db.require_by_id('topic_fetch_urls', tfu['topic_fetch_urls_id'])

    assert tfu['state'] == FETCH_STATE_STORY_ADDED
    assert tfu['url'] == fetch_url
    assert tfu['code'] == 200
    assert tfu['stories_id'] is not None

    new_story = db.require_by_id('stories', tfu['stories_id'])

    assert new_story['url'] == fetch_url
    assert new_story['title'] == 'foo'

    # now make sure that domain throttling kicks in
    tfu = db.create(
        'topic_fetch_urls', {
            'topics_id': topic['topics_id'],
            'url': hs.page_url('/throttle'),
            'state': FETCH_STATE_PENDING,
        })

    fetch_topic_url_update_state(
        db=db,
        topics_id=topic['topics_id'],
        topic_fetch_urls_id=tfu['topic_fetch_urls_id'],
        domain_timeout=domain_timeout,
    )

    tfu = db.require_by_id('topic_fetch_urls', tfu['topic_fetch_urls_id'])
    assert tfu['state'] == FETCH_STATE_REQUEUED
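
# States exercised above: FETCH_STATE_PENDING becomes FETCH_STATE_STORY_ADDED when the fetch
# succeeds and a story is created, and FETCH_STATE_REQUEUED when the domain is throttled. A
# hypothetical retry helper (not part of the fetch_link API, and assuming a requeued URL may
# simply be fetched again after the timeout) could settle a URL like this:
def _fetch_until_settled(db, topics_id: int, topic_fetch_urls_id: int,
                         domain_timeout: int, max_attempts: int = 10) -> dict:
    tfu = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)
    for _ in range(max_attempts):
        fetch_topic_url_update_state(
            db=db,
            topics_id=topics_id,
            topic_fetch_urls_id=topic_fetch_urls_id,
            domain_timeout=domain_timeout,
        )
        tfu = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)
        if tfu['state'] != FETCH_STATE_REQUEUED:
            return tfu
        # Throttled: wait out the domain timeout before trying again
        time.sleep(domain_timeout)
    return tfu
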
async def test_workflow():
    db = connect_to_db()

    test_medium = create_test_medium(db=db, label='test')
    test_feed = create_test_feed(db=db, label='test', medium=test_medium)

    # 'label' is important as it will be stored in both stories.title and stories.description,
    # which in turn will be used to guess the probable language of the podcast episode
    test_story = create_test_story(db=db, label='keeping up with Kardashians', feed=test_feed)

    stories_id = test_story['stories_id']

    with open(TEST_MP3_PATH, mode='rb') as f:
        test_mp3_data = f.read()

    # noinspection PyUnusedLocal
    def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
        response = "".encode('utf-8')
        response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
        response += "Content-Type: audio/mpeg\r\n".encode('utf-8')
        response += f"Content-Length: {len(test_mp3_data)}\r\n".encode('utf-8')
        response += "\r\n".encode('utf-8')
        response += test_mp3_data
        return response

    port = random_unused_port()
    pages = {
        '/test.mp3': {
            'callback': __mp3_callback,
        }
    }

    hs = HashServer(port=port, pages=pages)
    hs.start()

    # Not localhost as this might get fetched from a remote worker
    mp3_url = hs.page_url('/test.mp3')

    db.insert(table='story_enclosures', insert_hash={
        'stories_id': stories_id,
        'url': mp3_url,
        'mime_type': 'audio/mpeg',
        'length': len(test_mp3_data),
    })

    client = workflow_client()

    # Start the worker
    factory = WorkerFactory(client=client, namespace=client.namespace)
    worker = factory.new_worker(task_queue=TASK_QUEUE)

    # Use an activities implementation with random GCS prefixes set
    activities = _RandomPrefixesPodcastTranscribeActivities()

    worker.register_activities_implementation(
        activities_instance=activities,
        activities_cls_name=PodcastTranscribeActivities.__name__,
    )
    worker.register_workflow_implementation_type(impl_cls=PodcastTranscribeWorkflowImpl)
    factory.start()

    # Initialize the workflow instance
    workflow: PodcastTranscribeWorkflow = client.new_workflow_stub(
        cls=PodcastTranscribeWorkflow,
        workflow_options=WorkflowOptions(
            workflow_id=str(stories_id),

            # By default, if individual activities of the workflow fail, they will get restarted pretty much
            # indefinitely, and so this test might run for days (or rather just time out on CI). So we cap the
            # workflow: if it doesn't manage to complete in X minutes, we consider it failed.
            workflow_run_timeout=timedelta(minutes=5),
        ),
    )

    # Wait for the workflow to complete
    await workflow.transcribe_episode(stories_id)

    downloads = db.select(table='downloads', what_to_select='*').hashes()
    assert len(downloads) == 1
    first_download = downloads[0]
    assert first_download['stories_id'] == stories_id
    assert first_download['type'] == 'content'
    assert first_download['state'] == 'success'

    download_content = fetch_content(db=db, download=first_download)

    # It's what gets said in the sample MP3 file
    assert 'Kim Kardashian' in download_content

    # Initiate the worker shutdown in the background while we do the GCS cleanup so that
    # stop_worker_faster() doesn't have to wait that long
    await worker.stop(background=True)

    log.info("Cleaning up GCS...")
    GCSStore(bucket_config=activities.config.raw_enclosures()).delete_object(object_id=str(stories_id))
    GCSStore(bucket_config=activities.config.transcoded_episodes()).delete_object(object_id=str(stories_id))
    GCSStore(bucket_config=activities.config.transcripts()).delete_object(object_id=str(stories_id))
    log.info("Cleaned up GCS")

    log.info("Stopping workers...")
    await stop_worker_faster(worker)
    log.info("Stopped workers")
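
# The three GCS cleanup calls above repeat one pattern; a small loop (illustrative only) makes it
# harder to forget a bucket when a new one is added:
def _cleanup_gcs(activities, stories_id: int) -> None:
    bucket_configs = (
        activities.config.raw_enclosures(),
        activities.config.transcoded_episodes(),
        activities.config.transcripts(),
    )
    for bucket_config in bucket_configs:
        GCSStore(bucket_config=bucket_config).delete_object(object_id=str(stories_id))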