def test_object_uri(self):
    """object_uri() rejects empty object IDs and collapses redundant slashes in the path prefix."""
    gcs = GCSStore()

    # "message" was removed from pytest.raises() in pytest 4.0 (it raised TypeError
    # when passed); asserting the exception type alone keeps the intent.
    with pytest.raises(McPodcastMisconfiguredGCSException):
        gcs.object_uri(object_id='')

    class NoPathPrefixConfig(PodcastFetchEpisodeConfig):
        @staticmethod
        def gc_storage_path_prefix() -> str:
            return ''

    config = NoPathPrefixConfig()
    gcs = GCSStore(config=config)

    # With no path prefix, the object goes straight under the bucket
    assert gcs.object_uri(object_id='a') == f'gs://{config.gc_storage_bucket_name()}/a'

    class MultiPathPrefixConfig(PodcastFetchEpisodeConfig):
        @staticmethod
        def gc_storage_path_prefix() -> str:
            return '//foo/bar//'

    config = MultiPathPrefixConfig()
    gcs = GCSStore(config=config)

    # Repeated slashes in the configured prefix must be normalized away
    assert gcs.object_uri(object_id='a') == f'gs://{config.gc_storage_bucket_name()}/foo/bar/a'
def fetch_and_store_episode(db: DatabaseHandler, stories_id: int, config: Optional[PodcastFetchEpisodeConfig] = None) -> None:
    """
    Choose a viable story enclosure for podcast, fetch it, transcode if needed, store to GCS, and record to DB.

    1) Determines the episode's likely language by looking into its title and description, and converts the language
       code to BCP 47;
    2) Using enclosures from "story_enclosures", chooses the one that looks like a podcast episode the most;
    3) Fetches the chosen enclosure;
    4) Transcodes the file (if needed) by:

        a) converting it to an audio format that the Speech API can support, and / or
        b) discarding video stream from the media file, and / or
        c) discarding other audio streams from the media file;

    5) Reads the various parameters, e.g. sample rate, of the episode audio file;
    6) Uploads the episode audio file to Google Cloud Storage;
    7) Adds a row to "podcast_episodes".

    Adding a job to submit the newly created episode to Speech API (by adding a RabbitMQ job) is up to the caller.

    :param db: Database handler.
    :param stories_id: Story ID for the story to operate on.
    :param config: (optional) Podcast fetcher configuration object (useful for testing).
    """
    if not config:
        config = PodcastFetchEpisodeConfig()

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        raise McStoryNotFoundException(f"Story {stories_id} was not found.")

    # Try to determine the language of the story from its title + description
    story_title = story['title']
    story_description = html_strip(story['description'])
    sample_text = f"{story_title}\n{story_description}"

    iso_639_1_language_code = None
    if identification_would_be_reliable(text=sample_text):
        iso_639_1_language_code = language_code_for_text(text=sample_text)

    if not iso_639_1_language_code:
        # Fall back to English when language identification isn't reliable
        iso_639_1_language_code = 'en'

    # Convert to BCP 47 identifier (the story's URL may hint at a regional variant)
    bcp_47_language_code = iso_639_1_code_to_bcp_47_identifier(
        iso_639_1_code=iso_639_1_language_code,
        url_hint=story['url'],
    )

    # Find the enclosure that might work the best
    best_enclosure = podcast_viable_enclosure_for_story(db=db, stories_id=stories_id)
    if not best_enclosure:
        raise McPodcastNoViableStoryEnclosuresException(f"There were no viable enclosures found for story {stories_id}")

    # "length" may be unset in the feed; only enforce the limit when it was reported
    if best_enclosure.length and best_enclosure.length > MAX_ENCLOSURE_SIZE:
        raise McPodcastEnclosureTooBigException(f"Chosen enclosure {best_enclosure} is too big.")

    try:
        temp_dir = tempfile.mkdtemp('fetch_and_store')
    except Exception as ex:
        raise McPodcastFileStoreFailureException(f"Unable to create temporary directory: {ex}")

    # Fetch enclosure
    input_filename = 'input_file'
    input_file_path = os.path.join(temp_dir, input_filename)
    log.info(f"Fetching enclosure {best_enclosure} to {input_file_path}...")
    fetch_big_file(url=best_enclosure.url, dest_file=input_file_path, max_size=MAX_ENCLOSURE_SIZE)
    log.info(f"Done fetching enclosure {best_enclosure} to {input_file_path}")

    if os.stat(input_file_path).st_size == 0:
        # Might happen with misconfigured webservers
        raise McPodcastFileFetchFailureException(f"Fetched file {input_file_path} is empty.")

    # Transcode if needed
    input_file_obj = TranscodeTempDirAndFile(temp_dir=temp_dir, filename=input_filename)
    transcoded_file_obj = transcode_media_file_if_needed(input_media_file=input_file_obj)

    # Unset the variables so that we don't accidentally use them later
    del input_filename, temp_dir

    if input_file_obj != transcoded_file_obj:
        # Function did some transcoding and stored everything in yet another file

        # Remove the input file
        _cleanup_temp_dir(temp=input_file_obj)

        # Consider the transcoded file the new input file
        input_file_obj = transcoded_file_obj

    # (Re)read the properties of either the original or the transcoded file
    media_info = media_file_info(media_file_path=input_file_obj.temp_full_path)
    best_audio_stream = media_info.best_supported_audio_stream()

    # Store input file to GCS
    try:
        gcs = GCSStore(config=config)
        gcs_uri = gcs.store_object(
            local_file_path=input_file_obj.temp_full_path,
            object_id=str(stories_id),
            mime_type=best_audio_stream.audio_codec_class.mime_type(),
        )
    except Exception as ex:
        log.error(f"Unable to store episode file '{input_file_obj.temp_full_path}' for story {stories_id}: {ex}")

        # Clean up, then re-raise; bare "raise" keeps the original traceback intact
        # (the original "raise ex" re-raised from this frame)
        _cleanup_temp_dir(temp=input_file_obj)

        raise

    # Clean up the locally stored file as we don't need it anymore
    _cleanup_temp_dir(temp=input_file_obj)

    # Insert everything to the database
    try:
        db.query("""
            INSERT INTO podcast_episodes (
                stories_id,
                story_enclosures_id,
                gcs_uri,
                duration,
                codec,
                sample_rate,
                bcp47_language_code
            ) VALUES (
                %(stories_id)s,
                %(story_enclosures_id)s,
                %(gcs_uri)s,
                %(duration)s,
                %(codec)s,
                %(sample_rate)s,
                %(bcp47_language_code)s
            ) ON CONFLICT (stories_id) DO UPDATE SET
                story_enclosures_id = %(story_enclosures_id)s,
                gcs_uri = %(gcs_uri)s,
                duration = %(duration)s,
                codec = %(codec)s,
                sample_rate = %(sample_rate)s,
                bcp47_language_code = %(bcp47_language_code)s
        """, {
            'stories_id': stories_id,
            'story_enclosures_id': best_enclosure.story_enclosures_id,
            'gcs_uri': gcs_uri,
            'duration': best_audio_stream.duration,
            'codec': best_audio_stream.audio_codec_class.postgresql_enum_value(),
            'sample_rate': best_audio_stream.sample_rate,
            'bcp47_language_code': bcp_47_language_code,
        })
    except Exception as ex_db:

        # Try to delete object on GCS first so that a failed insert doesn't leave an orphan behind
        try:
            gcs.delete_object(object_id=str(stories_id))
        except Exception as ex_gcs:
            # We should be able to delete it as we've just uploaded it
            raise McPodcastGCSStoreFailureException((
                f"Unable to clean up story's {stories_id} audio file from GCS after database insert failure; "
                f"database insert exception: {ex_db}; "
                f"GCS exception: {ex_gcs}"
            ))

        raise McPodcastPostgreSQLException(f"Failed inserting episode for story {stories_id}: {ex_db}")
def test_fetch_and_store_episode():
    """End-to-end test: serve a test MP3 over HTTP, fetch + store the episode, verify the "podcast_episodes" row."""
    db = connect_to_db()

    test_medium = create_test_medium(db=db, label='test')
    test_feed = create_test_feed(db=db, label='test', medium=test_medium)

    # 'label' is important as it will be stored in both stories.title and stories.description, which in turn will be
    # used to guess the probable language of the podcast episode
    test_story = create_test_story(db=db, label='keeping up with Kardashians', feed=test_feed)

    stories_id = test_story['stories_id']

    with open(TEST_MP3_PATH, mode='rb') as f:
        test_mp3_data = f.read()

    # noinspection PyUnusedLocal
    def __mp3_callback(request: HashServer.Request) -> Union[str, bytes]:
        response = "".encode('utf-8')
        response += "HTTP/1.0 200 OK\r\n".encode('utf-8')
        response += "Content-Type: audio/mpeg\r\n".encode('utf-8')
        response += f"Content-Length: {len(test_mp3_data)}\r\n".encode('utf-8')
        response += "\r\n".encode('utf-8')
        response += test_mp3_data
        return response

    port = random_unused_port()
    pages = {
        '/test.mp3': {
            'callback': __mp3_callback,
        }
    }

    hs = HashServer(port=port, pages=pages)
    hs.start()

    mp3_url = f'http://127.0.0.1:{port}/test.mp3'

    story_enclosure = db.insert(table='story_enclosures', insert_hash={
        'stories_id': stories_id,
        'url': mp3_url,
        'mime_type': 'audio/mpeg',
        'length': len(test_mp3_data),
    })

    conf = RandomPathPrefixConfig()
    fetch_and_store_episode(db=db, stories_id=stories_id, config=conf)

    # The enclosure has been fetched by now, so the test server is no longer needed
    hs.stop()

    episodes = db.select(table='podcast_episodes', what_to_select='*').hashes()

    # The original "assert len(episodes)" passed for ANY non-zero row count,
    # contradicting its own message -- exactly one episode row is expected
    assert len(episodes) == 1, "Only one episode is expected."

    episode = episodes[0]
    assert episode['stories_id'] == stories_id
    assert episode['story_enclosures_id'] == story_enclosure['story_enclosures_id']
    assert episode['gcs_uri'] == f"gs://{conf.gc_storage_bucket_name()}/{conf.gc_storage_path_prefix()}/{stories_id}"
    assert episode['duration'] > 0
    assert episode['codec'] == 'MP3'
    assert episode['sample_rate'] == 44100
    assert episode['bcp47_language_code'] == 'en-US'

    # Try removing test object
    gcs = GCSStore(config=conf)
    gcs.delete_object(object_id=str(stories_id))
def test_store_exists_delete(self):
    """Store, overwrite, and delete a GCS object, verifying object_exists() at every step."""
    config = RandomPathPrefixConfig()
    gcs = GCSStore(config=config)

    object_id = 'test'

    assert gcs.object_exists(object_id=object_id) is False

    mock_data = os.urandom(1024 * 10)

    # Use a context manager so the local scratch directory gets removed afterwards
    # (the original mkdtemp() directory was never cleaned up)
    with tempfile.TemporaryDirectory(suffix='test') as temp_dir:
        temp_file = os.path.join(temp_dir, 'test')
        with open(temp_file, mode='wb') as f:
            f.write(mock_data)

        gcs.store_object(local_file_path=temp_file, object_id=object_id)
        assert gcs.object_exists(object_id=object_id) is True

        # Try storing twice (should overwrite, not fail)
        gcs.store_object(local_file_path=temp_file, object_id=object_id)
        assert gcs.object_exists(object_id=object_id) is True

    gcs.delete_object(object_id=object_id)
    assert gcs.object_exists(object_id=object_id) is False

    # Try deleting nonexistent object (should not raise)
    gcs.delete_object(object_id='does_not_exist')
def test_remote_path(self):
    """_remote_path() rejects empty object IDs and normalizes slashes and ".." components."""
    # "message" was removed from pytest.raises() in pytest 4.0 (it raised TypeError
    # when passed); asserting the exception type alone keeps the intent.
    with pytest.raises(McPodcastMisconfiguredGCSException):
        GCSStore._remote_path(path_prefix='', object_id='')

    assert GCSStore._remote_path(path_prefix='', object_id='a') == 'a'
    assert GCSStore._remote_path(path_prefix='', object_id='/a') == 'a'
    assert GCSStore._remote_path(path_prefix='/', object_id='a') == 'a'
    assert GCSStore._remote_path(path_prefix='/', object_id='/a') == 'a'

    # GCS doesn't like double slashes
    assert GCSStore._remote_path(path_prefix='//', object_id='a') == 'a'
    assert GCSStore._remote_path(path_prefix='//', object_id='/a') == 'a'
    assert GCSStore._remote_path(path_prefix='//', object_id='//a') == 'a'

    assert GCSStore._remote_path(path_prefix='//', object_id='//a///b//c') == 'a/b/c'

    # ".." path components get resolved
    assert GCSStore._remote_path(path_prefix='//', object_id='//a///b//../b/c') == 'a/b/c'