async def identify_story_bcp47_language_code(self, stories_id: int) -> Optional[str]:
    """
    Guess the language of a story and return it as a BCP 47 identifier.

    Uses the story's title plus its HTML-stripped description as the language
    identification sample; returns None when identification would not be reliable.

    :param stories_id: ID of the story to identify the language for.
    :return: BCP 47 language code, or None if the language could not be reliably guessed.
    :raise McPermanentError: If the story does not exist.
    """
    log.info(f"Identifying story language for story {stories_id}...")

    db = connect_to_db_or_raise()

    story = db.find_by_id(table='stories', object_id=stories_id)
    if not story:
        raise McPermanentError(f"Story {stories_id} was not found.")

    # Podcast episodes typically come with title and description set so try guessing from that
    title = story['title']
    description = html_strip(story['description'])
    sample_text = '\n'.join((title, description))

    bcp_47_language_code = None

    if identification_would_be_reliable(text=sample_text):
        iso_639_1_language_code = language_code_for_text(text=sample_text)

        # Convert to BCP 47 identifier
        bcp_47_language_code = iso_639_1_code_to_bcp_47_identifier(
            iso_639_1_code=iso_639_1_language_code,
            url_hint=story['url'],
        )

    log.info(f"Language code for story {stories_id} is {bcp_47_language_code}")

    return bcp_47_language_code
async def fetch_enclosure_to_gcs(self, stories_id: int, enclosure: StoryEnclosureDict) -> None:
    """
    Download a story's enclosure and upload it to Google Cloud Storage.

    The enclosure is fetched into a temporary directory (cleaned up automatically),
    validated to be non-empty, and stored in the raw-enclosures bucket keyed by story ID.

    :param stories_id: ID of the story whose enclosure is being fetched.
    :param enclosure: Enclosure to fetch, as a dict produced by StoryEnclosure.to_dict().
    :raise McPermanentError: If the fetched file turns out to be empty.
    """
    log.info(f"Fetching enclosure to GCS for story {stories_id}")
    log.debug(f"Best enclosure for story {stories_id}: {enclosure}")

    enclosure_obj = StoryEnclosure.from_dict(enclosure)

    with tempfile.TemporaryDirectory(prefix='fetch_enclosure_to_gcs') as temp_dir:
        raw_enclosure_path = os.path.join(temp_dir, 'raw_enclosure')

        fetch_big_file(
            url=enclosure_obj.url,
            dest_file=raw_enclosure_path,
            max_size=self.config.max_enclosure_size(),
        )

        if os.stat(raw_enclosure_path).st_size == 0:
            # Might happen with misconfigured webservers
            raise McPermanentError(f"Fetched file {raw_enclosure_path} is empty.")

        gcs = GCSStore(bucket_config=self.config.raw_enclosures())
        gcs.upload_object(local_file_path=raw_enclosure_path, object_id=str(stories_id))

    log.info(f"Done fetching enclosure to GCS for story {stories_id}")
def download_object(self, object_id: str, local_file_path: str) -> None:
    """
    Download a GCS object to a local file.

    :param object_id: Object ID of an object that should be downloaded.
    :param local_file_path: Local file that the object should be stored to.
    :raise McProgrammingError: If the local file already exists or the object ID is unset.
    :raise McPermanentError: If the object with such an ID does not exist in the bucket.
    :raise McTransientError: If the download itself fails (e.g. a network error).
    """
    if os.path.isfile(local_file_path):
        raise McProgrammingError(f"Local file '{local_file_path}' already exists.")

    if not object_id:
        raise McProgrammingError("Object ID is unset.")

    log.debug(f"Downloading object ID {object_id} to '{local_file_path}'...")

    if not self.object_exists(object_id=object_id):
        raise McPermanentError(f"Object ID {object_id} was not found.")

    blob = self._blob_from_object_id(object_id=object_id)

    try:
        blob.download_to_filename(filename=local_file_path)
    except Exception as ex:
        # FIX: a failed download can leave a partial file behind, which would
        # make any subsequent retry fail the "already exists" guard above, so
        # clean it up (best-effort) before re-raising.
        try:
            if os.path.isfile(local_file_path):
                os.unlink(local_file_path)
        except OSError as cleanup_ex:
            log.warning(f"Unable to clean up partial file '{local_file_path}': {cleanup_ex}")
        # Chain the cause so that the original traceback is preserved
        raise McTransientError(
            f"Unable to download object ID {object_id} to '{local_file_path}': {ex}"
        ) from ex
def fetch_big_file(url: str, dest_file: str, max_size: int = 0) -> None:
    """
    Fetch a huge file from an URL to a local file.

    Raises one of the _AbstractFetchBigFileException exceptions.

    :param url: URL that points to a huge file.
    :param dest_file: Destination path to write the fetched file to.
    :param max_size: If >0, limit the file size to a defined number of bytes.
    :raise: ProgrammingError on unexpected fatal conditions.
    """
    if os.path.exists(dest_file):
        # Something's wrong with the code
        raise McProgrammingError(f"Destination file '{dest_file}' already exists.")

    try:
        # Using "requests" as our UserAgent doesn't support writing directly to files
        with requests.get(url, stream=True) as r:
            r.raise_for_status()

            bytes_read = 0

            with open(dest_file, 'wb') as f:
                for chunk in r.iter_content(chunk_size=65536):
                    # Filter out keep-alive new chunks
                    if chunk:
                        bytes_read += len(chunk)
                        if max_size:
                            if bytes_read > max_size:
                                raise McPermanentError(f"The file is bigger than the max. size of {max_size}")
                        f.write(chunk)
                # FIX: flush once after the loop instead of after every 64 KiB
                # chunk — per-chunk flushing defeated the write buffering.
                f.flush()

    except McPermanentError:
        __cleanup_dest_file(dest_file=dest_file)
        # FIX: bare "raise" preserves the original traceback
        raise

    except requests.exceptions.RequestException as ex:
        __cleanup_dest_file(dest_file=dest_file)
        # FIX: chain the cause so the underlying "requests" traceback isn't lost
        raise McTransientError(f"'requests' exception while fetching {url}: {ex}") from ex

    except Exception as ex:
        __cleanup_dest_file(dest_file=dest_file)
        raise McTransientError(f"Unable to fetch and store {url}: {ex}") from ex

    if not os.path.isfile(dest_file):
        __cleanup_dest_file(dest_file=dest_file)
        # There should be something here so in some way it is us that have messed up
        raise McProgrammingError(f"Fetched file {dest_file} is not here after fetching it.")
async def transcribe_episode(self, stories_id: int) -> None:
    """
    Run the full transcription pipeline for a single story.

    Steps: guess the language, pick the best enclosure, fetch it to GCS,
    transcode + store the episode, submit a Speech API operation, wait out
    roughly the episode's duration, then fetch / store the transcript and
    queue the story for extraction.

    :param stories_id: ID of the story to transcribe.
    :raise McPermanentError: If no viable enclosure exists or the episode is too long.
    """
    language_code = await self.activities.identify_story_bcp47_language_code(stories_id)
    # Default to English in case there wasn't enough sizable text in title / description to make a good guess
    language_code = 'en' if language_code is None else language_code

    enclosure = await self.activities.determine_best_enclosure(stories_id)
    if not enclosure:
        raise McPermanentError(f"No viable enclosure found for story {stories_id}")

    await self.activities.fetch_enclosure_to_gcs(stories_id, enclosure)

    metadata_dict = await self.activities.fetch_transcode_store_episode(stories_id)
    metadata = MediaFileInfoAudioStream.from_dict(metadata_dict)

    duration_limit = PodcastTranscribeEpisodeConfig().max_duration()
    if metadata.duration > duration_limit:
        raise McPermanentError(
            f"Episode's duration ({metadata.duration} s) exceeds max. duration ({duration_limit} s)"
        )

    operation_id = await self.activities.submit_transcribe_operation(
        stories_id,
        metadata_dict,
        language_code,
    )

    # Wait for Google Speech API to finish up transcribing
    await Workflow.sleep(int(metadata.duration * 1.1))

    await self.activities.fetch_store_raw_transcript_json(stories_id, operation_id)
    await self.activities.fetch_store_transcript(stories_id)
    await self.activities.add_to_extraction_queue(stories_id)
async def determine_best_enclosure(self, stories_id: int) -> Optional[StoryEnclosureDict]:
    """
    Pick the most suitable enclosure for a story and return it as a dict.

    :param stories_id: ID of the story to pick an enclosure for.
    :return: Chosen enclosure serialized with StoryEnclosure.to_dict().
    :raise McPermanentError: If no viable enclosure exists or the chosen one
        exceeds the configured maximum size.
    """
    log.info(f"Determining best enclosure for story {stories_id}...")

    db = connect_to_db_or_raise()

    # Find the enclosure that might work the best
    chosen_enclosure = viable_story_enclosure(db=db, stories_id=stories_id)
    if not chosen_enclosure:
        raise McPermanentError(f"There were no viable enclosures found for story {stories_id}")

    # A missing / zero "length" means the size is unknown — let the fetcher enforce the cap then
    if chosen_enclosure.length and chosen_enclosure.length > self.config.max_enclosure_size():
        raise McPermanentError(f"Chosen enclosure {chosen_enclosure} is too big.")

    log.info(f"Done determining best enclosure for story {stories_id}")
    log.debug(f"Best enclosure for story {stories_id}: {chosen_enclosure}")

    return chosen_enclosure.to_dict()
def transcode_file_if_needed(input_file: str, output_file: str) -> bool:
    """
    Transcode file (if needed) to something that Speech API will support.

    * If input has a video stream, it will be discarded;
    * If input has more than one audio stream, others will be discarded leaving only one (preferably the one that
      Speech API can support);
    * If input doesn't have an audio stream in Speech API-supported codec, it will be transcoded to lossless
      FLAC 16 bit in order to preserve quality;
    * If the chosen audio stream has multiple channels (e.g. stereo or 5.1), it will be mixed into a single (mono)
      channel as Speech API supports multi-channel recognition only when different voices speak into each of the
      channels.

    :param input_file: Input media file to consider for transcoding.
    :param output_file: If we decide to transcode, output media file to transcode to.
    :return: True if file had to be transcoded into "output_file", or False if input file can be used as it is.
    """
    if not os.path.isfile(input_file):
        raise McProgrammingError(f"File '{input_file}' does not exist.")

    # Independently from what <enclosure /> has told us, identify the file type again ourselves
    media_info = media_file_info(media_file_path=input_file)

    if not media_info.audio_streams:
        raise McPermanentError("Downloaded file doesn't appear to have any audio streams.")

    # Accumulates ffmpeg output arguments; stays empty when no transcoding is needed
    ffmpeg_args = []

    supported_audio_stream = media_info.best_supported_audio_stream()
    if supported_audio_stream:
        log.info(f"Found a supported audio stream")

        # Test if there is more than one audio stream
        if len(media_info.audio_streams) > 1:
            log.info(f"Found other audio streams besides the supported one, will discard those")

            # Keep the container format of the supported stream's codec for the output
            ffmpeg_args.extend(['-f', supported_audio_stream.audio_codec_class.ffmpeg_container_format()])

            # Select all audio streams
            ffmpeg_args.extend(['-map', '0:a'])

            for stream in media_info.audio_streams:
                # Deselect the unsupported streams
                if stream != supported_audio_stream:
                    ffmpeg_args.extend(['-map', f'-0:a:{stream.ffmpeg_stream_index}'])

    # If a stream of a supported codec was not found, transcode it to FLAC 16 bit in order to not lose any quality
    else:
        log.info(f"None of the audio streams are supported by the Speech API, will transcode to FLAC")

        # Map first audio stream to input 0
        ffmpeg_args.extend(['-map', '0:a:0'])

        # Transcode to FLAC (16 bit) in order to not lose any quality
        ffmpeg_args.extend(['-acodec', 'flac'])
        ffmpeg_args.extend(['-f', 'flac'])
        ffmpeg_args.extend(['-sample_fmt', 's16'])

        # Ensure that we end up with mono audio
        ffmpeg_args.extend(['-ac', '1'])

    # If there's video in the file (e.g. video), remove it
    if media_info.has_video_streams:
        # Discard all video streams
        ffmpeg_args.extend(['-map', '-0:v'])

    if not ffmpeg_args:
        # No need to transcode -- caller should use the input file as-is
        return False

    log.info(f"Transcoding '{input_file}' to '{output_file}'...")

    # I wasn't sure how to map outputs in "ffmpeg-python" library so here we call ffmpeg directly
    ffmpeg_command = ['ffmpeg', '-nostdin', '-hide_banner', '-i', input_file] + ffmpeg_args + [output_file]
    log.debug(f"FFmpeg command: {ffmpeg_command}")
    subprocess.check_call(ffmpeg_command)

    log.info(f"Done transcoding '{input_file}' to '{output_file}'")

    return True
def media_file_info(media_file_path: str) -> MediaFileInfo:
    """
    Read audio / video media file information, or raise if it can't be read.

    :param media_file_path: Full path to media file.
    :return: MediaFileInfo object.
    """
    if not os.path.isfile(media_file_path):
        # Input file should exist at this point; if it doesn't, we have probably messed up something in the code
        raise McProgrammingError(f"Input file {media_file_path} does not exist.")

    try:
        file_info = ffmpeg.probe(media_file_path)
        if not file_info:
            raise Exception("Returned metadata is empty.")
    except Exception as ex:
        # Unreadable / corrupt files are a permanent condition — retrying won't help
        raise McPermanentError(f"Unable to read metadata from file {media_file_path}: {ex}")

    if 'streams' not in file_info:
        # FFmpeg should come up with some sort of a stream in any case
        raise McProgrammingError("Returned probe doesn't have 'streams' key.")

    # Test if one of the audio streams is of one of the supported codecs
    audio_streams = []
    has_video_streams = False
    for stream in file_info['streams']:
        if stream['codec_type'] == 'audio':
            try:
                audio_channel_count = int(stream['channels'])
                if audio_channel_count == 0:
                    raise Exception("Audio channel count is 0")
            except Exception as ex:
                log.warning(f"Unable to read audio channel count from stream {stream}: {ex}")
                # Just skip this stream if we can't figure it out
                continue

            # Stays None when no supported codec matches (or the stream isn't mono)
            audio_codec_class = None

            # We'll need to transcode audio files with more than one channel count anyway
            if audio_channel_count == 1:
                for codec in _SUPPORTED_CODEC_CLASSES:
                    if codec.ffmpeg_stream_is_this_codec(ffmpeg_stream=stream):
                        audio_codec_class = codec
                        break

            try:
                if 'duration' in stream:
                    # 'duration': '3.766621'
                    # Rounded *down* to whole seconds here
                    duration = math.floor(float(stream['duration']))
                elif 'DURATION' in stream.get('tags', {}):
                    # 'DURATION': '00:00:03.824000000'
                    duration_parts = stream['tags']['DURATION'].split(':')
                    if len(duration_parts) != 3:
                        raise McPermanentError(f"Unable to parse 'DURATION': {duration_parts}")
                    hh = int(duration_parts[0])
                    mm = int(duration_parts[1])
                    ss_ms = duration_parts[2].split('.')
                    if len(ss_ms) == 1:
                        ss = int(ss_ms[0])
                        ms = 0
                    elif len(ss_ms) == 2:
                        ss = int(ss_ms[0])
                        ms = int(ss_ms[1])
                    else:
                        raise McPermanentError(f"Unable to parse 'DURATION': {duration_parts}")

                    # NOTE(review): unlike the 'duration' branch above, a fractional
                    # second here rounds the duration *up* by one second
                    duration = hh * 3600 + mm * 60 + ss + (1 if ms > 0 else 0)
                else:
                    raise McPermanentError(f"Stream doesn't have duration: {stream}")

                audio_stream = MediaFileInfoAudioStream(
                    ffmpeg_stream_index=stream['index'],
                    audio_codec_class=audio_codec_class,
                    duration=duration,
                    audio_channel_count=audio_channel_count,
                    sample_rate=int(stream['sample_rate']),
                )
                audio_streams.append(audio_stream)

            except Exception as ex:
                # Just skip this stream if we can't figure it out
                log.warning(f"Unable to read audio stream data for stream {stream}: {ex}")

        elif stream['codec_type'] == 'video':
            has_video_streams = True

    return MediaFileInfo(
        audio_streams=audio_streams,
        has_video_streams=has_video_streams,
    )