def test_fetch_url():
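    """Test _fetch_url() behavior against a HashServer that starts listening after a delay."""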
    db = connect_to_db()

    def _meta_redirect(r):
        resp = ""
        resp += 'HTTP/1.0 200 OK\r\n'
        resp += 'Content-Type: text/html\r\n\r\n'
        resp += '<meta http-equiv="refresh" content="0; url=%s-foo">\n' % r.url()
        return resp

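    # port=0 makes HashServer pick a random unused port (see testRandomPort())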
    hs = HashServer(port=0,
                    pages={
                        '/foo': 'bar',
                        '/400': {
                            'http_status_code': 400
                        },
                        '/404': {
                            'http_status_code': 404
                        },
                        '/500': {
                            'http_status_code': 500
                        },
                        '/mr-foo': 'meta redirect target',
                        '/mr': {
                            'callback': _meta_redirect
                        },
                    })

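    # delay=2: the server only starts accepting connections two seconds from now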
    hs.start(delay=2)

    port = hs.port()

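    # The not-yet-listening server doubles as the "network down" probe: while
    # localhost:<port> is unreachable, _fetch_url() is expected to keep retrying
    # instead of failing outright; domain_timeout=0 disables per-domain throttling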
    timeout_args = {
        'network_down_host': 'localhost',
        'network_down_port': port,
        'network_down_timeout': 1,
        'domain_timeout': 0
    }

    # before the delayed start, 404s and 500s should still come back as failed responses
    assert not _fetch_url(db, hs.page_url('/404'), **timeout_args).is_success
    assert not _fetch_url(db, hs.page_url('/500'), **timeout_args).is_success

    # request for a valid page should make the call wait until the hs comes up
    assert _fetch_url(db, hs.page_url('/foo'), **timeout_args).content == 'bar'

    # and now a 400 should come back as a failed response
    assert not _fetch_url(db, hs.page_url('/400'), **timeout_args).is_success

    # make sure an invalid URL returns a response instead of raising an exception
    assert _fetch_url(db, 'this is not a url', **timeout_args) is not None

    # make sure that requests follow meta redirects
    response = _fetch_url(db, hs.page_url('/mr'), **timeout_args)

    assert response.content == 'meta redirect target'
    assert response.last_requested_url == hs.page_url('/mr-foo')

def testDelay() -> None:
    """Test the delay= parameter to hs.start."""
    hs = HashServer(port=0, pages={'/foo': 'bar'})

    hs.start(delay=1)
    caught_exception = False
    try:
        requests.get(hs.page_url('/foo'))
    except requests.exceptions.ConnectionError:
        caught_exception = True

    assert caught_exception

    time.sleep(2)
    assert str(requests.get(hs.page_url('/foo')).text) == 'bar'

    hs.stop()

    # NOTE: this is a TestCase method (it relies on self.db() and
    # self.assertRaises()), unlike the module-level tests above
    def test_request(self) -> None:
        """Test requests with throttling."""
        pages = {'/test': 'Hello!', }
        port = 8888
        hs = HashServer(port=port, pages=pages)
        hs.start()

        ua = ThrottledUserAgent(self.db(), domain_timeout=2)
        test_url = hs.page_url('/test')

        # first request should work
        response = ua.get(test_url)
        assert response.decoded_content() == 'Hello!'

        # fail because we're in the timeout
        ua = ThrottledUserAgent(self.db(), domain_timeout=2)
        self.assertRaises(McThrottledDomainException, ua.get, test_url)

        # succeed because 127.0.0.1 counts as a different domain than localhost
        ua = ThrottledUserAgent(self.db(), domain_timeout=2)
        response = ua.get('http://127.0.0.1:%d/test' % port)
        assert response.decoded_content() == 'Hello!'

        # still fail within the timeout
        ua = ThrottledUserAgent(self.db(), domain_timeout=2)
        self.assertRaises(McThrottledDomainException, ua.get, test_url)

        time.sleep(2)

        # now we're outside the timeout, so it should work
        ua = ThrottledUserAgent(self.db(), domain_timeout=2)
        response = ua.get(test_url)
        assert response.decoded_content() == 'Hello!'

        # and follow up request on the same ua object should work
        response = ua.get(test_url)
        assert response.decoded_content() == 'Hello!'

        # but then fail within the new timeout period with a new object
        ua = ThrottledUserAgent(self.db(), domain_timeout=2)
        self.assertRaises(McThrottledDomainException, ua.get, test_url)

        hs.stop()

        # test domain_timeout assignment logic
        ua = ThrottledUserAgent(self.db(), domain_timeout=100)
        assert ua.domain_timeout == 100

        config = mediawords.util.config.get_config()

        config['mediawords']['throttled_user_agent_domain_timeout'] = 200
        ua = ThrottledUserAgent(self.db())
        assert ua.domain_timeout == 200

        del config['mediawords']['throttled_user_agent_domain_timeout']
        ua = ThrottledUserAgent(self.db())
        assert ua.domain_timeout == mediawords.util.web.user_agent.throttled._DEFAULT_DOMAIN_TIMEOUT

def testRandomPort() -> None:
    """Test assigning a random port where port = 0."""

    hss = []
    for _ in range(3):
        hs = HashServer(port=0, pages={'/foo': 'bar'})
        assert hs is not None

        hs.start()

        assert hs.port() >= START_RANDOM_PORT
        assert tcp_port_is_open(hs.port())
        assert str(requests.get(hs.page_url('/foo')).text) == 'bar'
        hss.append(hs)

    for hs in hss:
        hs.stop()

def test_http_hash_server():
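    """Exercise HashServer's static pages, redirects, callbacks, basic auth, 404s and path normalization."""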
    port = random_unused_port()
    base_url = 'http://localhost:%d' % port

    def __simple_callback(request: HashServer.Request) -> Union[str, bytes]:
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback',
            'method': request.method(),
            'url': request.url(),
            'content-type': request.content_type(),
            'params': request.query_params(),
            'cookies': request.cookies(),
        })
        return str.encode(r)

    # noinspection PyUnusedLocal
    def __callback_cookie_redirect(request: HashServer.Request) -> str:
        r = ""
        r += "HTTP/1.0 302 Moved Temporarily\r\n"
        r += "Content-Type: text/html; charset=UTF-8\r\n"
        r += "Location: /check_cookie\r\n"
        r += "Set-Cookie: test_cookie=I'm a cookie and I know it!\r\n"
        r += "\r\n"
        r += "Redirecting to the cookie check page..."
        return r

    def __callback_post(request: HashServer.Request) -> Union[str, bytes]:
        r = ""
        r += "HTTP/1.0 200 OK\r\n"
        r += "Content-Type: application/json; charset=UTF-8\r\n"
        r += "\r\n"
        r += json.dumps({
            'name': 'callback_post',
            'post_data': request.content(),
        })
        return str.encode(r)

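    # str and bytes keys/values are mixed on purpose: HashServer should accept both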
    pages = {
        '/': 'home',
        '/foo': b'foo',
        '/bar': 'bar ąą',
        '/foo-bar': {b'redirect': b'/bar'},
        '/localhost': {'redirect': "http://localhost:%d/" % port},
        b'/127-foo': {b'redirect': "http://127.0.0.1:%d/foo" % port},
        '/auth': {b'auth': b'foo:bar', b'content': b"foo bar \xf0\x90\x28\xbc"},
        '/404': {b'content': b'not found', b'http_status_code': 404},
        '/callback': {b'callback': __simple_callback},

        # Test setting cookies, redirects
        '/callback_cookie_redirect': {'callback': __callback_cookie_redirect},

        # POST data
        '/callback_post': {'callback': __callback_post},
    }

    hs = HashServer(port=port, pages=pages)
    assert hs

    hs.start()

    assert tcp_port_is_open(port=port)

    assert str(requests.get('%s/' % base_url).text) == 'home'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/foo-bar' % base_url).text) == 'bar ąą'
    assert str(requests.get('%s/localhost' % base_url).text) == 'home'
    assert str(requests.get('%s/127-foo' % base_url).text) == 'foo'

    # Path normalization
    assert str(requests.get('%s//' % base_url).text) == 'home'
    assert str(requests.get('%s///' % base_url).text) == 'home'
    assert str(requests.get('%s/something/../' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..//' % base_url).text) == 'home'
    assert str(requests.get('%s/something/..///' % base_url).text) == 'home'
    assert str(requests.get('%s/foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo///' % base_url).text) == 'foo'
    assert str(requests.get('%s/foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo/' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo//' % base_url).text) == 'foo'
    assert str(requests.get('%s/bar/../foo///' % base_url).text) == 'foo'

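    # The /callback handler echoes the request's method, URL, params and cookies back as JSON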
    response_json = requests.get('%s/callback?a=b&c=d' % base_url, cookies={'cookie_name': 'cookie_value'}).json()
    assert response_json == {
        'name': 'callback',
        'method': 'GET',
        'url': 'http://localhost:%d/callback?a=b&c=d' % port,
        'content-type': None,
        'params': {
            'a': 'b',
            'c': 'd',
        },
        'cookies': {
            'cookie_name': 'cookie_value',
        },
    }

    response = requests.get('%s/callback_cookie_redirect' % base_url, allow_redirects=False)
    assert response.status_code == 302
    assert response.headers['Location'] == '/check_cookie'

    response = requests.get("%s/404" % base_url)
    assert response.status_code == HTTPStatus.NOT_FOUND.value
    assert 'Not Found' in response.reason

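    # /auth requires HTTP basic auth (foo:bar); its content is deliberately
    # invalid UTF-8, so compare raw bytes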
    auth_url = "%s/auth" % base_url

    assert requests.get(auth_url).status_code == HTTPStatus.UNAUTHORIZED
    assert requests.get(auth_url, auth=('foo', 'foo')).status_code == HTTPStatus.UNAUTHORIZED

    response = requests.get(auth_url, auth=('foo', 'bar'))
    assert response.status_code == HTTPStatus.OK
    assert response.content == b"foo bar \xf0\x90\x28\xbc"

    assert urls_are_equal(url1=hs.page_url('/callback?a=b&c=d'), url2='http://localhost:%d/callback' % port)
    with pytest.raises(McHashServerException):
        hs.page_url('/does-not-exist')

    response_json = requests.post('%s/callback_post' % base_url, data='abc=def').json()
    assert response_json == {
        'name': 'callback_post',
        'post_data': 'abc=def',
    }

    hs.stop()

def test_fetch_link_job_update_state():
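    """Test fetch_topic_url_update_state(): a successful fetch, then a domain-throttled requeue."""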
    db = connect_to_db()

    # In the testing environment, RabbitMQ or the workers might be slow to
    # start up, so we increase the delay ("timeout") between domain fetches
    # enough to reliably test throttling
    domain_timeout = 360

    hs = HashServer(port=0,
                    pages={
                        '/foo': '<title>foo</title>',
                        '/throttle': '<title>throttle</title>'
                    })
    hs.start()

    topic = create_test_topic(db, 'foo')
    topic['pattern'] = '.'
    topic = db.update_by_id('topics', topic['topics_id'], topic)

    fetch_url = hs.page_url('/foo')

    # basic sanity test for link fetching
    tfu = db.create(
        'topic_fetch_urls', {
            'topics_id': topic['topics_id'],
            'url': fetch_url,
            'state': FETCH_STATE_PENDING
        })

    fetch_topic_url_update_state(
        db=db,
        topics_id=topic['topics_id'],
        topic_fetch_urls_id=tfu['topic_fetch_urls_id'],
        domain_timeout=domain_timeout,
    )

    tfu = db.require_by_id('topic_fetch_urls', tfu['topic_fetch_urls_id'])

    assert tfu['state'] == FETCH_STATE_STORY_ADDED
    assert tfu['url'] == fetch_url
    assert tfu['code'] == 200
    assert tfu['stories_id'] is not None

    new_story = db.require_by_id('stories', tfu['stories_id'])

    assert new_story['url'] == fetch_url
    assert new_story['title'] == 'foo'

    # now make sure that domain throttling kicks in
    tfu = db.create(
        'topic_fetch_urls', {
            'topics_id': topic['topics_id'],
            'url': hs.page_url('/throttle'),
            'state': FETCH_STATE_PENDING
        })

    fetch_topic_url_update_state(
        db=db,
        topics_id=topic['topics_id'],
        topic_fetch_urls_id=tfu['topic_fetch_urls_id'],
        domain_timeout=domain_timeout,
    )

    tfu = db.require_by_id('topic_fetch_urls', tfu['topic_fetch_urls_id'])
    assert tfu['state'] == FETCH_STATE_REQUEUED

async def test_workflow():
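    """End-to-end test of PodcastTranscribeWorkflow: serve an MP3 enclosure, run the workflow, verify the transcript."""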
    db = connect_to_db()

    test_medium = create_test_medium(db=db, label='test')
    test_feed = create_test_feed(db=db, label='test', medium=test_medium)

    # 'label' is important as it will be stored in both stories.title and stories.description, which in turn will be
    # used to guess the probable language of the podcast episode
    test_story = create_test_story(db=db,
                                   label='keeping up with Kardashians',
                                   feed=test_feed)

    stories_id = test_story['stories_id']

    with open(TEST_MP3_PATH, mode='rb') as f:
        test_mp3_data = f.read()

    # noinspection PyUnusedLocal
    def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
        response = "".encode('utf-8')
        response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
        response += "Content-Type: audio/mpeg\r\n".encode('utf-8')
        response += f"Content-Length: {len(test_mp3_data)}\r\n".encode('utf-8')
        response += "\r\n".encode('utf-8')
        response += test_mp3_data
        return response

    port = random_unused_port()
    pages = {
        '/test.mp3': {
            'callback': __mp3_callback,
        }
    }

    hs = HashServer(port=port, pages=pages)
    hs.start()

    # URL of the test MP3 as served by the hash server (a localhost URL)
    mp3_url = hs.page_url('/test.mp3')

    db.insert(table='story_enclosures',
              insert_hash={
                  'stories_id': stories_id,
                  'url': mp3_url,
                  'mime_type': 'audio/mpeg',
                  'length': len(test_mp3_data),
              })

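    # Get a workflow client; used both to run a worker and to start the workflow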
    client = workflow_client()

    # Start worker
    factory = WorkerFactory(client=client, namespace=client.namespace)
    worker = factory.new_worker(task_queue=TASK_QUEUE)

    # Use an activities implementation with random GCS prefixes set
    activities = _RandomPrefixesPodcastTranscribeActivities()

    worker.register_activities_implementation(
        activities_instance=activities,
        activities_cls_name=PodcastTranscribeActivities.__name__,
    )
    worker.register_workflow_implementation_type(
        impl_cls=PodcastTranscribeWorkflowImpl)
    factory.start()

    # Initialize workflow instance
    workflow: PodcastTranscribeWorkflow = client.new_workflow_stub(
        cls=PodcastTranscribeWorkflow,
        workflow_options=WorkflowOptions(
            workflow_id=str(stories_id),

            # By default, if individual activities of the workflow fail, they will get restarted pretty much
            # indefinitely, and so this test might run for days (or rather just timeout on the CI). So we cap the
            # workflow so that if it doesn't manage to complete in X minutes, we consider it as failed.
            workflow_run_timeout=timedelta(minutes=5),
        ),
    )

    # Wait for the workflow to complete
    await workflow.transcribe_episode(stories_id)

    downloads = db.select(table='downloads', what_to_select='*').hashes()
    assert len(downloads) == 1
    first_download = downloads[0]
    assert first_download['stories_id'] == stories_id
    assert first_download['type'] == 'content'
    assert first_download['state'] == 'success'

    download_content = fetch_content(db=db, download=first_download)

    # It's what gets said in the sample MP3 file
    assert 'Kim Kardashian' in download_content

    # Initiate the worker shutdown in the background while we do the GCS cleanup
    # so that stop_worker_faster() below doesn't have to wait as long
    await worker.stop(background=True)

    log.info("Cleaning up GCS...")
    GCSStore(bucket_config=activities.config.raw_enclosures()).delete_object(
        object_id=str(stories_id))
    GCSStore(bucket_config=activities.config.transcoded_episodes()).delete_object(
        object_id=str(stories_id))
    GCSStore(bucket_config=activities.config.transcripts()).delete_object(
        object_id=str(stories_id))
    log.info("Cleaned up GCS")

    log.info("Stopping workers...")
    await stop_worker_faster(worker)
    log.info("Stopped workers")