Beispiel #1
0
    def test_object_uri(self):
        """
        Check the "gs://" URIs built by GCSStore.object_uri().

        Covers three cases: an empty object ID (must raise), an empty path prefix, and a path prefix with redundant
        leading / trailing slashes.
        """
        gcs = GCSStore()

        # An empty object ID must be rejected.
        #
        # No "message=" keyword is passed to pytest.raises(): it never matched the exception text (it was only a
        # custom failure message), it was deprecated in pytest 4.1 and removed in pytest 5.0, where passing it raises
        # a TypeError.
        with pytest.raises(McPodcastMisconfiguredGCSException):
            gcs.object_uri(object_id='')

        class NoPathPrefixConfig(PodcastFetchEpisodeConfig):
            """Configuration with an empty path prefix."""

            @staticmethod
            def gc_storage_path_prefix() -> str:
                return ''

        config = NoPathPrefixConfig()
        gcs = GCSStore(config=config)

        # With no prefix, the object ID goes directly under the bucket
        expected_uri = f'gs://{config.gc_storage_bucket_name()}/a'
        assert gcs.object_uri(object_id='a') == expected_uri

        class MultiPathPrefixConfig(PodcastFetchEpisodeConfig):
            """Configuration with a multi-segment path prefix surrounded by redundant slashes."""

            @staticmethod
            def gc_storage_path_prefix() -> str:
                return '//foo/bar//'

        config = MultiPathPrefixConfig()
        gcs = GCSStore(config=config)

        # Redundant slashes around the prefix must not end up in the URI
        expected_uri = f'gs://{config.gc_storage_bucket_name()}/foo/bar/a'
        assert gcs.object_uri(object_id='a') == expected_uri
Beispiel #2
0
def fetch_and_store_episode(
        db: DatabaseHandler,
        stories_id: int,
        config: Optional[PodcastFetchEpisodeConfig] = None) -> None:
    """
    Choose a viable story enclosure for podcast, fetch it, transcode if needed, store to GCS, and record to DB.

    1) Determines the episode's likely language by looking into its title and description, converts the language code to
       BCP 47;
    2) Using enclosures from "story_enclosures", chooses the one that looks like a podcast episode the most;
    3) Fetches the chosen enclosure;
    4) Transcodes the file (if needed) by:
        a) converting it to an audio format that the Speech API can support, and / or
        b) discarding video stream from the media file, and / or
        c) discarding other audio streams from the media file;
    5) Reads the various parameters, e.g. sample rate, of the episode audio file;
    6) Uploads the episode audio file to Google Cloud Storage;
    7) Adds a row to "podcast_episodes".

    Adding a job to submit the newly created episode to Speech API (by adding a RabbitMQ job) is up to the caller.

    The temporary directory with the downloaded file is removed if fetching or transcoding fails; the directory of the
    file that gets uploaded is handed to _cleanup_temp_dir() after the upload attempt, whether or not it succeeded.

    :param db: Database handler.
    :param stories_id: Story ID for the story to operate on.
    :param config: (optional) Podcast fetcher configuration object (useful for testing).
    :raises McStoryNotFoundException: if the story does not exist.
    :raises McPodcastNoViableStoryEnclosuresException: if the story has no viable enclosure.
    :raises McPodcastEnclosureTooBigException: if the chosen enclosure's declared length exceeds MAX_ENCLOSURE_SIZE.
    :raises McPodcastFileStoreFailureException: if the temporary directory can't be created.
    :raises McPodcastFileFetchFailureException: if the fetched file is empty.
    :raises McPodcastGCSStoreFailureException: if the uploaded object can't be removed after a failed database insert.
    :raises McPodcastPostgreSQLException: if the database insert fails.
    """

    # Imported locally because it is needed only for cleaning up after failures within this function
    import shutil

    if not config:
        config = PodcastFetchEpisodeConfig()

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        raise McStoryNotFoundException(f"Story {stories_id} was not found.")

    # Try to determine language of the story
    story_title = story['title']
    story_description = html_strip(story['description'])
    sample_text = f"{story_title}\n{story_description}"

    iso_639_1_language_code = None
    if identification_would_be_reliable(text=sample_text):
        iso_639_1_language_code = language_code_for_text(text=sample_text)

    # Fall back to English if the language could not be identified
    if not iso_639_1_language_code:
        iso_639_1_language_code = 'en'

    # Convert to BCP 47 identifier
    bcp_47_language_code = iso_639_1_code_to_bcp_47_identifier(
        iso_639_1_code=iso_639_1_language_code,
        url_hint=story['url'],
    )

    # Find the enclosure that might work the best
    best_enclosure = podcast_viable_enclosure_for_story(db=db,
                                                        stories_id=stories_id)
    if not best_enclosure:
        raise McPodcastNoViableStoryEnclosuresException(
            f"There were no viable enclosures found for story {stories_id}")

    # The declared length might be unset, in which case only the limit imposed while fetching applies
    if best_enclosure.length and best_enclosure.length > MAX_ENCLOSURE_SIZE:
        raise McPodcastEnclosureTooBigException(
            f"Chosen enclosure {best_enclosure} is too big.")

    try:
        temp_dir = tempfile.mkdtemp('fetch_and_store')
    except Exception as ex:
        raise McPodcastFileStoreFailureException(
            f"Unable to create temporary directory: {ex}") from ex

    input_filename = 'input_file'
    input_file_path = os.path.join(temp_dir, input_filename)

    try:
        # Fetch enclosure
        log.info(f"Fetching enclosure {best_enclosure} to {input_file_path}...")
        fetch_big_file(url=best_enclosure.url,
                       dest_file=input_file_path,
                       max_size=MAX_ENCLOSURE_SIZE)
        log.info(f"Done fetching enclosure {best_enclosure} to {input_file_path}")

        if os.stat(input_file_path).st_size == 0:
            # Might happen with misconfigured webservers
            raise McPodcastFileFetchFailureException(
                f"Fetched file {input_file_path} is empty.")

        # Transcode if needed
        input_file_obj = TranscodeTempDirAndFile(temp_dir=temp_dir,
                                                 filename=input_filename)
        transcoded_file_obj = transcode_media_file_if_needed(
            input_media_file=input_file_obj)

    except Exception:
        # Don't leave the (potentially big) downloaded file behind; removal is best effort as the original error
        # is the one worth reporting
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise

    # Unset the variable so that we don't accidentally use it later
    del input_filename, temp_dir

    if input_file_obj != transcoded_file_obj:
        # Function did some transcoding and stored everything in yet another file

        # Remove the input file
        _cleanup_temp_dir(temp=input_file_obj)

        # Consider the transcoded file the new input file
        input_file_obj = transcoded_file_obj

    try:
        # (Re)read the properties of either the original or the transcoded file
        media_info = media_file_info(media_file_path=input_file_obj.temp_full_path)
        best_audio_stream = media_info.best_supported_audio_stream()

        # Store input file to GCS
        try:
            gcs = GCSStore(config=config)
            gcs_uri = gcs.store_object(
                local_file_path=input_file_obj.temp_full_path,
                object_id=str(stories_id),
                mime_type=best_audio_stream.audio_codec_class.mime_type(),
            )

        except Exception as ex:

            log.error(
                f"Unable to store episode file '{input_file_obj.temp_full_path}' for story {stories_id}: {ex}"
            )

            # Raise further with the original traceback; cleanup happens in "finally" below
            raise

    finally:
        # Clean up the locally stored file as we don't need it anymore, no matter whether the upload succeeded
        _cleanup_temp_dir(temp=input_file_obj)

    episode_row = {
        'stories_id': stories_id,
        'story_enclosures_id': best_enclosure.story_enclosures_id,
        'gcs_uri': gcs_uri,
        'duration': best_audio_stream.duration,
        'codec': best_audio_stream.audio_codec_class.postgresql_enum_value(),
        'sample_rate': best_audio_stream.sample_rate,
        'bcp47_language_code': bcp_47_language_code,
    }

    # Insert everything to the database (or update the story's existing episode row)
    try:
        db.query(
            """
            INSERT INTO podcast_episodes (
                stories_id,
                story_enclosures_id,
                gcs_uri,
                duration,
                codec,
                sample_rate,
                bcp47_language_code
            ) VALUES (
                %(stories_id)s,
                %(story_enclosures_id)s,
                %(gcs_uri)s,
                %(duration)s,
                %(codec)s,
                %(sample_rate)s,
                %(bcp47_language_code)s            
            ) ON CONFLICT (stories_id) DO UPDATE SET
                story_enclosures_id = %(story_enclosures_id)s,
                gcs_uri = %(gcs_uri)s,
                duration = %(duration)s,
                codec = %(codec)s,
                sample_rate = %(sample_rate)s,
                bcp47_language_code = %(bcp47_language_code)s
        """, episode_row)

    except Exception as ex_db:

        # Try to delete object on GCS first
        try:
            gcs.delete_object(object_id=str(stories_id))
        except Exception as ex_gcs:
            # We should be able to delete it as we've just uploaded it
            raise McPodcastGCSStoreFailureException((
                f"Unable to clean up story's {stories_id} audio file from GCS after database insert failure; "
                f"database insert exception: {ex_db}; "
                f"GCS exception: {ex_gcs}")) from ex_gcs

        raise McPodcastPostgreSQLException(
            f"Failed inserting episode for story {stories_id}: {ex_db}") from ex_db
def test_fetch_and_store_episode():
    """
    End-to-end test of fetch_and_store_episode().

    Serves a test MP3 file from a local HashServer, registers it as the enclosure of a test story, runs
    fetch_and_store_episode() on that story, verifies the resulting "podcast_episodes" row, and finally removes the
    uploaded object from GCS.
    """
    db = connect_to_db()

    test_medium = create_test_medium(db=db, label='test')
    test_feed = create_test_feed(db=db, label='test', medium=test_medium)

    # 'label' is important as it will be stored in both stories.title and stories.description, which in turn will be
    # used to guess the probable language of the podcast episode
    test_story = create_test_story(db=db,
                                   label='keeping up with Kardashians',
                                   feed=test_feed)

    stories_id = test_story['stories_id']

    with open(TEST_MP3_PATH, mode='rb') as f:
        test_mp3_data = f.read()

    # noinspection PyUnusedLocal
    def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
        """Return a raw HTTP response (status line, headers, body) carrying the test MP3."""
        return b"".join([
            "HTTP/1.0 200 OK\r\n".encode('utf-8'),
            "Content-Type: audio/mpeg\r\n".encode('utf-8'),
            f"Content-Length: {len(test_mp3_data)}\r\n".encode('utf-8'),
            "\r\n".encode('utf-8'),
            test_mp3_data,
        ])

    port = random_unused_port()
    pages = {
        '/test.mp3': {
            'callback': __mp3_callback,
        }
    }

    hs = HashServer(port=port, pages=pages)
    hs.start()

    mp3_url = f'http://127.0.0.1:{port}/test.mp3'

    story_enclosure = db.insert(table='story_enclosures',
                                insert_hash={
                                    'stories_id': stories_id,
                                    'url': mp3_url,
                                    'mime_type': 'audio/mpeg',
                                    'length': len(test_mp3_data),
                                })

    conf = RandomPathPrefixConfig()
    fetch_and_store_episode(db=db, stories_id=stories_id, config=conf)

    episodes = db.select(table='podcast_episodes', what_to_select='*').hashes()

    # Exactly one row is expected; a mere truthiness check on the length would let duplicates through
    assert len(episodes) == 1, "Only one episode is expected."

    episode = episodes[0]
    expected_gcs_uri = f"gs://{conf.gc_storage_bucket_name()}/{conf.gc_storage_path_prefix()}/{stories_id}"

    assert episode['stories_id'] == stories_id
    assert episode['story_enclosures_id'] == story_enclosure['story_enclosures_id']
    assert episode['gcs_uri'] == expected_gcs_uri
    assert episode['duration'] > 0
    assert episode['codec'] == 'MP3'
    assert episode['sample_rate'] == 44100
    assert episode['bcp47_language_code'] == 'en-US'

    # Try removing test object
    gcs = GCSStore(config=conf)
    gcs.delete_object(object_id=str(stories_id))
Beispiel #4
0
    def test_store_exists_delete(self):
        """
        Exercise GCSStore.store_object(), object_exists() and delete_object() on a single test object.

        Stores the same local file twice, checks that the object is reported as existing after each upload and as
        missing after deletion, and finally deletes an object ID that was never stored.
        """
        config = RandomPathPrefixConfig()
        gcs = GCSStore(config=config)

        object_id = 'test'
        assert gcs.object_exists(object_id=object_id) is False

        mock_data = os.urandom(1024 * 10)

        # The context manager removes the temporary directory (and the file in it) once the uploads are done, so
        # that the test doesn't leave anything behind on the local disk
        with tempfile.TemporaryDirectory(suffix='test') as temp_dir:
            temp_file = os.path.join(temp_dir, 'test')
            with open(temp_file, mode='wb') as f:
                f.write(mock_data)

            gcs.store_object(local_file_path=temp_file, object_id=object_id)
            assert gcs.object_exists(object_id=object_id) is True

            # Try storing twice
            gcs.store_object(local_file_path=temp_file, object_id=object_id)
            assert gcs.object_exists(object_id=object_id) is True

        gcs.delete_object(object_id=object_id)
        assert gcs.object_exists(object_id=object_id) is False

        # Try deleting nonexistent object
        gcs.delete_object(object_id='does_not_exist')
Beispiel #5
0
    def test_remote_path(self):
        """
        Check how GCSStore._remote_path() combines a path prefix and an object ID.

        An empty object ID must raise; otherwise the expected result has no leading slash, no repeated slashes and
        no ".." segments.
        """

        # An empty object ID must be rejected.
        #
        # No "message=" keyword is passed to pytest.raises(): it never matched the exception text (it was only a
        # custom failure message), it was deprecated in pytest 4.1 and removed in pytest 5.0, where passing it raises
        # a TypeError.
        with pytest.raises(McPodcastMisconfiguredGCSException):
            GCSStore._remote_path(path_prefix='', object_id='')

        # (path prefix, object ID, expected remote path)
        cases = (
            ('', 'a', 'a'),
            ('', '/a', 'a'),
            ('/', 'a', 'a'),
            ('/', '/a', 'a'),

            # GCS doesn't like double slashes
            ('//', 'a', 'a'),
            ('//', '/a', 'a'),
            ('//', '//a', 'a'),
            ('//', '//a///b//c', 'a/b/c'),

            # Parent directory references are expected to be resolved
            ('//', '//a///b//../b/c', 'a/b/c'),
        )

        for path_prefix, object_id, expected_path in cases:
            actual_path = GCSStore._remote_path(path_prefix=path_prefix, object_id=object_id)
            assert actual_path == expected_path, (
                f"Remote path for prefix '{path_prefix}' and object ID '{object_id}'"
            )