Example 1
    def download_object(self, object_id: str, local_file_path: str) -> None:
        """
        Download a GCS object to a local file.

        :param object_id: Object ID of an object that should be downloaded.
        :param local_file_path: Local file that the object should be stored to.
        """

        if os.path.isfile(local_file_path):
            raise McProgrammingError(
                f"Local file '{local_file_path}' already exists.")

        if not object_id:
            raise McProgrammingError("Object ID is unset.")

        log.debug(
            f"Downloading object ID {object_id} to '{local_file_path}'...")

        if not self.object_exists(object_id=object_id):
            raise McPermanentError(f"Object ID {object_id} was not found.")

        blob = self._blob_from_object_id(object_id=object_id)

        try:
            blob.download_to_filename(filename=local_file_path)
        except Exception as ex:
            raise McTransientError(
                f"Unable to download object ID {object_id} to '{local_file_path}': {ex}"
            )
Example 2
    def object_exists(self, object_id: str) -> bool:
        """
        Test if object exists at remote location.

        :param object_id: Object ID that should be tested.
        :return: True if object already exists under a given object ID.
        """

        if not object_id:
            raise McProgrammingError("Object ID is unset.")

        log.debug(f"Testing if object ID {object_id} exists...")

        blob = self._blob_from_object_id(object_id=object_id)

        log.debug(f"Testing blob for existence: {blob}")

        try:
            # blob.reload() fetches the object's metadata, raising NotFound if the object doesn't exist
            blob.reload(retry=_GCS_API_RETRIES)

        except NotFound as ex:
            log.debug(f"Object '{object_id}' was not found: {ex}")
            exists = False

        except Exception as ex:
            raise McProgrammingError(
                f"Unable to test whether GCS object {object_id} exists: {ex}")

        else:
            exists = True

        return exists
Example 3
    def delete_object(self, object_id: str) -> None:
        """
        Delete object from remote location.

        Doesn't raise if object doesn't exist.

        Used mostly for running tests, e.g. to find out what happens if the object to be fetched doesn't exist anymore.

        :param object_id: Object ID that should be deleted.
        """

        if not object_id:
            raise McProgrammingError("Object ID is unset.")

        log.debug(f"Deleting object ID {object_id}...")

        blob = self._blob_from_object_id(object_id=object_id)

        try:
            blob.delete(retry=_GCS_API_RETRIES)

        except NotFound:
            log.warning(f"Object {object_id} doesn't exist.")

        except Exception as ex:
            raise McProgrammingError(
                f"Unable to delete GCS object {object_id}: {ex}")
Example 4
    def upload_object(self, local_file_path: str, object_id: str) -> None:
        """
        Upload a local file to a GCS object.

        Will overwrite existing objects with a warning.

        :param local_file_path: Local file that should be stored.
        :param object_id: Object ID under which the object should be stored.
        """

        if not os.path.isfile(local_file_path):
            raise McProgrammingError(
                f"Local file '{local_file_path}' does not exist.")

        if not object_id:
            raise McProgrammingError("Object ID is unset.")

        log.debug(f"Uploading '{local_file_path}' as object ID {object_id}...")

        if self.object_exists(object_id=object_id):
            log.warning(f"Object {object_id} already exists, will overwrite.")

        blob = self._blob_from_object_id(object_id=object_id)

        try:
            blob.upload_from_filename(filename=local_file_path,
                                      content_type='application/octet-stream')
        except Exception as ex:
            raise McTransientError(
                f"Unable to upload '{local_file_path}' as object ID {object_id}: {ex}"
            )
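
Examples 1-4 together make up the GCSStore object API. Below is a minimal usage sketch of that API; the import path, the bucket config accessor and the object ID are assumptions for illustration, not taken from the repository.

    # Minimal usage sketch of the GCSStore API (Examples 1-4).
    # The import path and the bucket config accessor are assumptions for illustration.
    import os
    import tempfile

    from podcast_transcribe_episode.gcs_store import GCSStore  # hypothetical import path

    gcs = GCSStore(bucket_config=config.raw_enclosures())  # "config" as used in Example 6

    # upload_object() overwrites an existing object with a warning (Example 4)
    gcs.upload_object(local_file_path='/var/tmp/enclosure.mp3', object_id='12345')

    with tempfile.TemporaryDirectory() as temp_dir:
        local_path = os.path.join(temp_dir, 'enclosure')

        if gcs.object_exists(object_id='12345'):
            # download_object() refuses to overwrite an existing local file (Example 1)
            gcs.download_object(object_id='12345', local_file_path=local_path)

    # delete_object() doesn't raise if the object is already gone (Example 3)
    gcs.delete_object(object_id='12345')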
Example 5
def fetch_big_file(url: str, dest_file: str, max_size: int = 0) -> None:
    """
    Fetch a huge file from a URL to a local file.

    Raises one of the _AbstractFetchBigFileException exceptions.

    :param url: URL that points to a huge file.
    :param dest_file: Destination path to write the fetched file to.
    :param max_size: If >0, limit the file size to a defined number of bytes.
    :raise: McProgrammingError on unexpected fatal conditions.
    """

    if os.path.exists(dest_file):
        # Something's wrong with the code
        raise McProgrammingError(f"Destination file '{dest_file}' already exists.")

    try:

        # Using "requests" as our UserAgent doesn't support writing directly to files
        with requests.get(url, stream=True) as r:
            r.raise_for_status()

            bytes_read = 0

            with open(dest_file, 'wb') as f:
                for chunk in r.iter_content(chunk_size=65536):
                    # Filter out keep-alive new chunks
                    if chunk:

                        bytes_read += len(chunk)
                        if max_size:
                            if bytes_read > max_size:
                                raise McPermanentError(f"The file is bigger than the max. size of {max_size}")

                        f.write(chunk)
                        f.flush()

    except McPermanentError as ex:

        __cleanup_dest_file(dest_file=dest_file)

        raise ex

    except requests.exceptions.RequestException as ex:

        __cleanup_dest_file(dest_file=dest_file)

        raise McTransientError(f"'requests' exception while fetching {url}: {ex}")

    except Exception as ex:

        __cleanup_dest_file(dest_file=dest_file)

        raise McTransientError(f"Unable to fetch and store {url}: {ex}")

    if not os.path.isfile(dest_file):
        __cleanup_dest_file(dest_file=dest_file)

        # The file should exist at this point, so if it doesn't, we have messed something up in the code
        raise McProgrammingError(f"Fetched file {dest_file} is not here after fetching it.")
Example 6
    async def fetch_transcode_store_episode(
            self, stories_id: int) -> MediaFileInfoAudioStreamDict:

        log.info(
            f"Fetching, transcoding, storing episode for story {stories_id}..."
        )

        with tempfile.TemporaryDirectory(
                prefix='fetch_transcode_store_episode') as temp_dir:
            raw_enclosure_path = os.path.join(temp_dir, 'raw_enclosure')

            gcs_raw_enclosures = GCSStore(
                bucket_config=self.config.raw_enclosures())
            gcs_raw_enclosures.download_object(
                object_id=str(stories_id),
                local_file_path=raw_enclosure_path,
            )
            del gcs_raw_enclosures

            if os.stat(raw_enclosure_path).st_size == 0:
                # If somehow the file from GCS ended up being of zero length, then this is very much unexpected
                raise McProgrammingError(
                    f"Fetched file {raw_enclosure_path} is empty.")

            transcoded_episode_path = os.path.join(temp_dir,
                                                   'transcoded_episode')

            raw_enclosure_transcoded = transcode_file_if_needed(
                input_file=raw_enclosure_path,
                output_file=transcoded_episode_path,
            )
            if not raw_enclosure_transcoded:
                transcoded_episode_path = raw_enclosure_path

            del raw_enclosure_path

            gcs_transcoded_episodes = GCSStore(
                bucket_config=self.config.transcoded_episodes())
            gcs_transcoded_episodes.upload_object(
                local_file_path=transcoded_episode_path,
                object_id=str(stories_id))

            # (Re)read the properties of either the original or the transcoded file
            media_info = media_file_info(
                media_file_path=transcoded_episode_path)
            best_audio_stream = media_info.best_supported_audio_stream()

            if not best_audio_stream.audio_codec_class:
                raise McProgrammingError(
                    "Best audio stream doesn't have audio class set")

        log.info(
            f"Done fetching, transcoding, storing episode for story {stories_id}"
        )
        log.debug(
            f"Best audio stream for story {stories_id}: {best_audio_stream}")

        return best_audio_stream.to_dict()
Example 7
    async def submit_transcribe_operation(
            self, stories_id: int,
            episode_metadata: MediaFileInfoAudioStreamDict,
            bcp47_language_code: str) -> str:

        log.info(f"Submitting transcribe operation for story {stories_id}...")
        log.debug(
            f"Episode metadata for story {stories_id}: {episode_metadata}")
        log.debug(
            f"Language code for story {stories_id}: {bcp47_language_code}")

        episode_metadata = MediaFileInfoAudioStream.from_dict(episode_metadata)

        if not episode_metadata.audio_codec_class:
            raise McProgrammingError(
                "Best audio stream doesn't have audio class set")

        gcs_transcoded_episodes = GCSStore(
            bucket_config=self.config.transcoded_episodes())
        gs_uri = gcs_transcoded_episodes.object_uri(object_id=str(stories_id))

        speech_operation_id = submit_transcribe_operation(
            gs_uri=gs_uri,
            episode_metadata=episode_metadata,
            bcp47_language_code=bcp47_language_code,
            gc_auth_config=self.config.gc_auth(),
        )

        log.info(
            f"Done submitting transcribe operation for story {stories_id}")
        log.debug(
            f"Speech operation ID for story {stories_id}: {speech_operation_id}"
        )

        return speech_operation_id
Example 8
    def _blob_from_object_id(self, object_id: str) -> Blob:
        if not object_id:
            raise McProgrammingError("Object ID is unset.")

        remote_path = self._remote_path(
            path_prefix=self.__bucket_config.path_prefix(),
            object_id=object_id)
        blob = self._bucket.blob(remote_path)
        return blob
Example 9
    @classmethod
    def _remote_path(cls, path_prefix: str, object_id: str) -> str:
        if not object_id:
            raise McProgrammingError("Object ID is unset.")

        path = os.path.join(path_prefix, object_id)

        # GCS doesn't like double slashes...
        path = os.path.normpath(path)

        # ...nor is a fan of slashes at the start of path
        while path.startswith('/'):
            path = path[1:]

        return path
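
A worked example of the normalization above, with illustrative values (and assuming _remote_path() is a classmethod of GCSStore, as its cls parameter suggests):

    # Worked example of _remote_path(), with illustrative values:
    #
    #   os.path.join('/podcasts//raw/', '12345')  -> '/podcasts//raw/12345'
    #   os.path.normpath(...)                     -> '/podcasts/raw/12345'
    #   stripping leading slashes                 -> 'podcasts/raw/12345'
    assert GCSStore._remote_path(path_prefix='/podcasts//raw/', object_id='12345') == 'podcasts/raw/12345'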
Example 10
    def object_uri(self, object_id: str) -> str:
        """
        Generate Google Cloud Storage URI for the object.

        :param object_id: Object ID to return the URI for.
        :return: Full Google Cloud Storage URI of the object, e.g. "gs://<bucket_name>/<path>/<object_id>".
        """

        if not object_id:
            raise McProgrammingError("Object ID is unset.")

        uri = "gs://{host}/{remote_path}".format(
            host=self.__bucket_config.bucket_name(),
            remote_path=self._remote_path(
                path_prefix=self.__bucket_config.path_prefix(),
                object_id=object_id),
        )

        return uri
Example 11
def transcode_file_if_needed(input_file: str, output_file: str) -> bool:
    """
    Transcode file (if needed) to something that Speech API will support.

    * If input has a video stream, it will be discarded;
    * If input has more than one audio stream, others will be discarded leaving only one (preferably the one that Speech
      API can support);
    * If input doesn't have an audio stream in Speech API-supported codec, it will be transcoded to lossless
      FLAC 16 bit in order to preserve quality;
    * If the chosen audio stream has multiple channels (e.g. stereo or 5.1), it will be mixed into a single (mono)
      channel as Speech API supports multi-channel recognition only when different voices speak into each of the
      channels.

    :param input_file: Input media file to consider for transcoding.
    :param output_file: If we decide to transcode, output media file to transcode to.
    :return: True if file had to be transcoded into "output_file", or False if input file can be used as it is.
    """

    if not os.path.isfile(input_file):
        raise McProgrammingError(f"File '{input_file}' does not exist.")

    # Independently of what <enclosure /> has told us, identify the file type again ourselves
    media_info = media_file_info(media_file_path=input_file)

    if not media_info.audio_streams:
        raise McPermanentError(
            "Downloaded file doesn't appear to have any audio streams.")

    ffmpeg_args = []

    supported_audio_stream = media_info.best_supported_audio_stream()
    if supported_audio_stream:
        log.info(f"Found a supported audio stream")

        # Test if there is more than one audio stream
        if len(media_info.audio_streams) > 1:
            log.info(
                f"Found other audio streams besides the supported one, will discard those"
            )

            container_format = supported_audio_stream.audio_codec_class.ffmpeg_container_format()
            ffmpeg_args.extend(['-f', container_format])

            # Select all audio streams
            ffmpeg_args.extend(['-map', '0:a'])

            for stream in media_info.audio_streams:
                # Deselect the unsupported streams
                if stream != supported_audio_stream:
                    ffmpeg_args.extend(
                        ['-map', f'-0:a:{stream.ffmpeg_stream_index}'])

    # If a stream of a supported codec was not found, transcode it to FLAC 16 bit in order to not lose any quality
    else:
        log.info(
            f"None of the audio streams are supported by the Speech API, will transcode to FLAC"
        )

        # Map first audio stream to input 0
        ffmpeg_args.extend(['-map', '0:a:0'])

        # Transcode to FLAC (16 bit) in order to not lose any quality
        ffmpeg_args.extend(['-acodec', 'flac'])
        ffmpeg_args.extend(['-f', 'flac'])
        ffmpeg_args.extend(['-sample_fmt', 's16'])

        # Ensure that we end up with mono audio
        ffmpeg_args.extend(['-ac', '1'])

    # If the file has any video streams, remove them
    if media_info.has_video_streams:
        # Discard all video streams
        ffmpeg_args.extend(['-map', '-0:v'])

    if not ffmpeg_args:
        # No need to transcode -- caller should use the input file as-is
        return False

    log.info(f"Transcoding '{input_file}' to '{output_file}'...")

    # I wasn't sure how to map outputs with the "ffmpeg-python" library, so we call ffmpeg directly here
    ffmpeg_command = ['ffmpeg', '-nostdin', '-hide_banner', '-i', input_file] + ffmpeg_args + [output_file]
    log.debug(f"FFmpeg command: {ffmpeg_command}")
    subprocess.check_call(ffmpeg_command)

    log.info(f"Done transcoding '{input_file}' to '{output_file}'")

    return True
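
For reference, when none of the audio streams are in a Speech API-supported codec (and the input has no video streams), the branch above ends up building a command equivalent to the following; the file names are placeholders.

    # Equivalent of the ffmpeg command built by the FLAC branch above (placeholder file names);
    # a '-map -0:v' pair would be appended if the input also had video streams.
    ffmpeg_command = [
        'ffmpeg', '-nostdin', '-hide_banner', '-i', 'input.mp3',
        '-map', '0:a:0',
        '-acodec', 'flac', '-f', 'flac', '-sample_fmt', 's16',
        '-ac', '1',
        'output.flac',
    ]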
Example 12
def submit_transcribe_operation(gs_uri: str,
                                episode_metadata: MediaFileInfoAudioStream,
                                bcp47_language_code: str,
                                gc_auth_config: Optional[GCAuthConfig] = None) -> str:
    """
    Submit a Speech API long running operation to transcribe a podcast episode.

    :param gs_uri: Google Cloud Storage URI to a transcoded episode.
    :param episode_metadata: Metadata derived from the episode while transcoding it.
    :param bcp47_language_code: Episode's BCP 47 language code guessed from story's title + description.
    :param gc_auth_config: Google Cloud authentication configuration instance.
    :return: Google Speech API operation ID by which the transcription operation can be referred to.
    """

    if not gc_auth_config:
        gc_auth_config = GCAuthConfig()

    try:
        client = SpeechClient.from_service_account_json(gc_auth_config.json_file())
    except Exception as ex:
        raise McProgrammingError(f"Unable to create Speech API client: {ex}")

    try:
        # noinspection PyTypeChecker
        config = RecognitionConfig(
            encoding=getattr(RecognitionConfig.AudioEncoding, episode_metadata.audio_codec_class.speech_api_codec()),
            sample_rate_hertz=episode_metadata.sample_rate,
            # We always set the channel count to 1 and disable separate recognition per channel as our inputs are all
            # mono audio files and do not have separate speakers per audio channel.
            audio_channel_count=1,
            enable_separate_recognition_per_channel=False,
            language_code=bcp47_language_code,
            alternative_language_codes=[],
            speech_contexts=[
                # Speech API works pretty well without custom contexts
            ],
            # Don't care that much about word confidence
            enable_word_confidence=False,
            # Punctuation doesn't work that well but we still enable it here
            enable_automatic_punctuation=True,
            # Not setting 'model' as 'use_enhanced' will then choose the best model for us
            # Using enhanced (more expensive) model, where available
            use_enhanced=True,
        )
    except Exception as ex:
        raise McProgrammingError(f"Unable to initialize Speech API configuration: {ex}")

    log.info(f"Submitting a Speech API operation for URI {gs_uri}...")

    try:

        # noinspection PyTypeChecker
        audio = RecognitionAudio(uri=gs_uri)

        speech_operation = client.long_running_recognize(config=config, audio=audio, retry=_GOOGLE_API_RETRIES)

    except Exception as ex:
        # If client's own retry mechanism doesn't work, then it's probably a programming error, e.g. outdated API client
        raise McProgrammingError(f"Unable to submit a Speech API operation: {ex}")

    try:
        # We get the operation name in a try-except block because accessing it is not that well documented, so Google
        # might change the property names whenever they please and we wouldn't necessarily notice otherwise
        operation_id = speech_operation.operation.name
        if not operation_id:
            raise McProgrammingError(f"Operation name is empty.")
    except Exception as ex:
        raise McProgrammingError(f"Unable to get operation name: {ex}")

    log.info(f"Submitted Speech API operation for URI {gs_uri}")

    return operation_id
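
A short usage sketch tying Examples 6, 10 and 12 together; the gs:// URI and language code are placeholders, and best_audio_stream is the stream returned by media_file_info().best_supported_audio_stream() as in Example 6.

    # Usage sketch; the URI and language code are placeholders.
    operation_id = submit_transcribe_operation(
        gs_uri='gs://transcoded-episodes/12345',  # as returned by GCSStore.object_uri() in Example 10
        episode_metadata=best_audio_stream,       # as obtained in Example 6
        bcp47_language_code='en-US',
        gc_auth_config=GCAuthConfig(),
    )
    # The returned operation ID is what fetch_transcript() (Example 13) later polls.
    log.info(f"Submitted operation: {operation_id}")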
Example 13
def fetch_transcript(speech_operation_id: str, gc_auth_config: Optional[GCAuthConfig] = None) -> Optional[Transcript]:
    """
    Try to fetch a transcript for a given speech operation ID.

    :param speech_operation_id: Speech operation ID.
    :param gc_auth_config: Google Cloud authentication configuration instance.
    :return: Transcript, or None if the transcript hasn't been prepared yet.
    """
    if not speech_operation_id:
        raise McProgrammingError(f"Speech operation ID is unset.")

    if not gc_auth_config:
        gc_auth_config = GCAuthConfig()

    try:
        client = SpeechClient.from_service_account_json(gc_auth_config.json_file())
    except Exception as ex:
        raise McProgrammingError(f"Unable to initialize Speech API operations client: {ex}")

    try:
        operation = client.transport.operations_client.get_operation(
            name=speech_operation_id,
            retry=_GOOGLE_API_RETRIES,
        )
    except InvalidArgument as ex:
        raise McProgrammingError(f"Invalid operation ID '{speech_operation_id}': {ex}")
    except NotFound as ex:
        raise McProgrammingError(f"Operation ID '{speech_operation_id}' was not found: {ex}")
    except Exception as ex:
        # On any other errors, raise a hard exception
        raise McProgrammingError(f"Error while fetching operation ID '{speech_operation_id}': {ex}")

    if not operation:
        raise McProgrammingError(f"Operation is unset.")

    try:
        gapic_operation: Operation = from_gapic(
            operation=operation,
            operations_client=client.transport.operations_client,
            result_type=LongRunningRecognizeResponse,
            metadata_type=LongRunningRecognizeMetadata,
            retry=_GOOGLE_API_RETRIES,
        )
    except Exception as ex:
        raise McProgrammingError(f"Unable to create GAPIC operation: {ex}")

    log.debug(f"GAPIC operation: {gapic_operation}")
    log.debug(f"Operation metadata: {gapic_operation.metadata}")
    log.debug(f"Operation is done: {gapic_operation.done()}")
    log.debug(f"Operation error: {gapic_operation.done()}")

    try:
        operation_is_done = gapic_operation.done(retry=_GOOGLE_API_RETRIES)
    except Exception as ex:
        # 'done' attribute might be gone in a newer version of the Speech API client
        raise McProgrammingError(
            f"Unable to test whether operation '{speech_operation_id}' is done: {ex}"
        )

    if not operation_is_done:
        log.info(f"Operation '{speech_operation_id}' is still not done.")
        return None

    utterances = []

    try:
        for result in gapic_operation.result(retry=_GOOGLE_API_RETRIES).results:

            alternatives = []
            for alternative in result.alternatives:
                alternatives.append(
                    UtteranceAlternative(
                        text=alternative.transcript.strip(),
                        confidence=alternative.confidence,
                    )
                )

            utterances.append(
                Utterance(
                    alternatives=alternatives,
                    bcp47_language_code=result.language_code,
                )
            )

    except Exception as ex:
        raise McProgrammingError(
            f"Unable to read transcript for operation '{speech_operation_id}' due to other error: {ex}"
        )

    return Transcript(utterances=utterances)
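
Since fetch_transcript() returns None while the operation is still in progress, callers are expected to poll it. A minimal polling sketch follows (the operation ID and sleep interval are placeholders; attribute access mirrors the Transcript / Utterance / UtteranceAlternative constructors above):

    # Polling sketch for fetch_transcript(); operation ID and interval are placeholders.
    import time

    transcript = None
    while transcript is None:
        transcript = fetch_transcript(speech_operation_id=operation_id)
        if transcript is None:
            time.sleep(60)  # operation not done yet, check again in a minute

    for utterance in transcript.utterances:
        best_alternative = utterance.alternatives[0]
        log.info(f"({utterance.bcp47_language_code}) {best_alternative.text}")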
Example 14
def media_file_info(media_file_path: str) -> MediaFileInfo:
    """
    Read audio / video media file information, or raise if it can't be read.

    :param media_file_path: Full path to media file.
    :return: MediaFileInfo object.
    """
    if not os.path.isfile(media_file_path):
        # Input file should exist at this point; if it doesn't, we have probably messed up something in the code
        raise McProgrammingError(
            f"Input file {media_file_path} does not exist.")

    try:
        file_info = ffmpeg.probe(media_file_path)
        if not file_info:
            raise Exception("Returned metadata is empty.")
    except Exception as ex:
        raise McPermanentError(
            f"Unable to read metadata from file {media_file_path}: {ex}")

    if 'streams' not in file_info:
        # FFmpeg should come up with some sort of a stream in any case
        raise McProgrammingError("Returned probe doesn't have 'streams' key.")

    # Test if one of the audio streams is of one of the supported codecs
    audio_streams = []
    has_video_streams = False
    for stream in file_info['streams']:
        if stream['codec_type'] == 'audio':

            try:
                audio_channel_count = int(stream['channels'])
                if audio_channel_count == 0:
                    raise Exception("Audio channel count is 0")
            except Exception as ex:
                log.warning(
                    f"Unable to read audio channel count from stream {stream}: {ex}"
                )
                # Just skip this stream if we can't figure it out
                continue

            audio_codec_class = None

            # We'll need to transcode audio files with more than one channel anyway
            if audio_channel_count == 1:
                for codec in _SUPPORTED_CODEC_CLASSES:
                    if codec.ffmpeg_stream_is_this_codec(ffmpeg_stream=stream):
                        audio_codec_class = codec
                        break

            try:

                if 'duration' in stream:
                    # 'duration': '3.766621'
                    duration = math.floor(float(stream['duration']))

                elif 'DURATION' in stream.get('tags', {}):
                    # 'DURATION': '00:00:03.824000000'
                    duration_parts = stream['tags']['DURATION'].split(':')
                    if len(duration_parts) != 3:
                        raise McPermanentError(
                            f"Unable to parse 'DURATION': {duration_parts}")

                    hh = int(duration_parts[0])
                    mm = int(duration_parts[1])
                    ss_ms = duration_parts[2].split('.')

                    if len(ss_ms) == 1:
                        ss = int(ss_ms[0])
                        ms = 0
                    elif len(ss_ms) == 2:
                        ss = int(ss_ms[0])
                        ms = int(ss_ms[1])
                    else:
                        raise McPermanentError(
                            f"Unable to parse 'DURATION': {duration_parts}")

                    duration = hh * 3600 + mm * 60 + ss + (1 if ms > 0 else 0)

                else:
                    raise McPermanentError(
                        f"Stream doesn't have duration: {stream}")

                audio_stream = MediaFileInfoAudioStream(
                    ffmpeg_stream_index=stream['index'],
                    audio_codec_class=audio_codec_class,
                    duration=duration,
                    audio_channel_count=audio_channel_count,
                    sample_rate=int(stream['sample_rate']),
                )
                audio_streams.append(audio_stream)

            except Exception as ex:
                # Just skip this stream if we can't figure it out
                log.warning(
                    f"Unable to read audio stream data for stream {stream}: {ex}"
                )

        elif stream['codec_type'] == 'video':
            has_video_streams = True

    return MediaFileInfo(
        audio_streams=audio_streams,
        has_video_streams=has_video_streams,
    )
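
A worked example of the 'DURATION' tag parsing above, with an illustrative value:

    # Worked example of the 'DURATION' tag parsing (illustrative value):
    #
    #   'DURATION': '00:01:03.824000000'
    #   hh = 0, mm = 1, ss = 3, ms = 824000000
    #   ms > 0, so the duration is rounded up: 0 * 3600 + 1 * 60 + 3 + 1 = 64 seconds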