Exemple #1
0
    def generate_transcription_and_sanitize(
        self,
        audio_id,
        local_clean_path,
        local_rejected_path,
        remote_file_path,
        stt_language,
        transcription_client,
        utterance_metadata,
    ):
        """Transcribe one remote audio file, sanitize the text and save it.

        The remote file is downloaded to ``local_clean_path``, transcribed
        with ``transcription_client`` and passed through the sanitizer
        registered for the language configured under ``AUDIO_LANGUAGE``
        (falling back to the ``"default"`` sanitizer). The sanitized text is
        saved next to the audio with a ``.txt`` name; when sanitization
        changed the text, the raw transcript is also saved under an
        ``original_``-prefixed file name.

        Any exception raised while transcribing, sanitizing or saving is
        logged and reported through ``self.handle_error`` with a string
        reason instead of being propagated. A failure of the download
        itself is not caught here.

        Args:
            audio_id: Identifier forwarded to ``handle_error`` on failure.
            local_clean_path: Local path the audio is downloaded to; the
                transcript path is derived from it.
            local_rejected_path: Forwarded to ``handle_error`` on failure.
            remote_file_path: Remote location of the audio. Paths that do
                not contain ``".wav"`` are skipped silently.
            stt_language: Language argument passed to the STT client.
            transcription_client: Object exposing
                ``generate_transcription(language, path)``.
            utterance_metadata: Forwarded to ``handle_error`` on failure.

        Returns:
            None.
        """
        if ".wav" not in remote_file_path:
            return

        def _replace_last(text, old, new):
            # Swap only the right-most occurrence of `old` so that directory
            # components that happen to contain it are left untouched.
            if not old:
                return text
            head, found, tail = text.rpartition(old)
            return head + new + tail if found else text

        # NOTE(review): if local_clean_path contains no ".wav" the transcript
        # path equals the audio path (as in the original code) — confirm
        # callers always pass a ".wav" path here.
        transcription_file_name = _replace_last(local_clean_path, ".wav",
                                                ".txt")
        self.fs_interface.download_file_to_location(remote_file_path,
                                                    local_clean_path)

        reason = None

        try:
            transcript = transcription_client.generate_transcription(
                stt_language, local_clean_path)
            original_transcript = transcript

            curr_language = self.audio_transcription_config.get(AUDIO_LANGUAGE)

            LOGGER.info(
                f"Getting transacription sanitizer for the language {curr_language}"
            )

            all_transcription_sanitizers = get_transcription_sanitizers()
            transcription_sanitizer = all_transcription_sanitizers.get(
                curr_language)

            if not transcription_sanitizer:
                LOGGER.info(
                    f"No transacription sanitizer found for the language {curr_language}, hence falling back to the default sanitizer."
                )
                transcription_sanitizer = all_transcription_sanitizers.get(
                    "default")

            transcript = transcription_sanitizer.sanitize(transcript)

            if original_transcript != transcript:
                # Keep the unsanitized text alongside the sanitized one.
                old_file_name = get_file_name(transcription_file_name)
                new_file_name = "original_" + old_file_name
                file_name_with_original_prefix = _replace_last(
                    transcription_file_name, old_file_name, new_file_name)
                LOGGER.info("saving original transcription to:" +
                            file_name_with_original_prefix)
                self.save_transcription(original_transcript,
                                        file_name_with_original_prefix)

            self.save_transcription(transcript, transcription_file_name)

        except TranscriptionSanitizationError as tse:
            LOGGER.error("Transcription not valid: " + str(tse))
            reason = "sanitization error:" + str(tse.args)

        except (AzureTranscriptionClientError,
                GoogleTranscriptionClientError) as e:
            LOGGER.error("STT API call failed: " + str(e))
            reason = "STT API error:" + str(e.args)

        except Exception as ex:
            LOGGER.error("Error: " + str(ex))
            # Stringify so handle_error receives the same type of reason as
            # from the other handlers (previously a raw args tuple).
            reason = str(ex.args)

        if reason is not None:
            self.handle_error(
                audio_id,
                local_clean_path,
                local_rejected_path,
                utterance_metadata,
                reason,
            )
Exemple #2
0
 def setUp(self):
     """Store the sanitizer looked up under the "punjabi" key on the instance."""
     self.punjabi_transcription_sanitizers = (
         get_transcription_sanitizers().get("punjabi"))
 def setUp(self):
     """Store the sanitizer looked up under the "indian_english" key on the instance."""
     self.indian_english_transcription_sanitizers = (
         get_transcription_sanitizers().get("indian_english"))
Exemple #4
0
 def setUp(self):
     """Store the sanitizer looked up under the "hindi" key on the instance."""
     self.hindi_transcription_sanitizers = (
         get_transcription_sanitizers().get("hindi"))
 def setUp(self):
     """Store the sanitizer looked up under the "kannada" key on the instance."""
     self.kannada_transcription_sanitizers = (
         get_transcription_sanitizers().get("kannada"))