Example #1
0
def summary(input_path, lang):
    lang = datasets.Language(lang)
    transcript = json.load(open(input_path))
    for t in transcript:
        hyp, ref = map(lang.normalize_text, [t['hyp'], t['ref']])
        t['cer'] = t.get('cer', metrics.cer(hyp, ref))
        t['wer'] = t.get('wer', metrics.wer(hyp, ref))

    cer_, wer_ = [
        torch.tensor([t[k] for t in transcript]) for k in ['cer', 'wer']
    ]
    cer_avg, wer_avg = float(cer_.mean()), float(wer_.mean())
    print(f'CER: {cer_avg:.02f} | WER: {wer_avg:.02f}')

    loss_ = torch.tensor([t.get('loss', 0) for t in transcript])
    loss_ = loss_[~(torch.isnan(loss_) | torch.isinf(loss_))]
    #min, max, steps = 0.0, 2.0, 20
    #bins = torch.linspace(min, max, steps = steps)
    #hist = torch.histc(loss_, bins = steps, min = min, max = max)
    #for b, h in zip(bins.tolist(), hist.tolist()):
    #	print(f'{b:.02f}\t{h:.0f}')

    plt.figure(figsize=(8, 4))
    plt.suptitle(os.path.basename(input_path))
    plt.subplot(211)
    plt.title('cer PDF')
    #plt.hist(cer_, range = (0.0, 1.2), bins = 20, density = True)
    seaborn.distplot(cer_, bins=20, hist=True)
    plt.xlim(0, 1)
    plt.subplot(212)
    plt.title('cer CDF')
    plt.hist(cer_, bins=20, density=True, cumulative=True)
    plt.xlim(0, 1)
    plt.xticks(torch.arange(0, 1.01, 0.1))
    plt.grid(True)

    #plt.subplot(223)
    #plt.title('loss PDF')
    #plt.hist(loss_, range = (0.0, 2.0), bins = 20, density = True)
    #seaborn.distplot(loss_, bins = 20, hist = True)
    #plt.xlim(0, 3)
    #plt.subplot(224)
    #plt.title('loss CDF')
    #plt.hist(loss_, bins = 20, density = True, cumulative = True)
    #plt.grid(True)
    plt.subplots_adjust(hspace=0.4)
    plt.savefig(input_path + '.png', dpi=150)
Example #2
0
def normalize(input_path, lang, dry = True):
	lang = datasets.Language(lang)
	labels = datasets.Labels(lang)
	for transcript_path in input_path:
		with open(transcript_path) as f:
			transcript = json.load(f)
		for t in transcript:
			if 'ref' in t:
				t['ref'] = labels.postprocess_transcript(lang.normalize_text(t['ref']))
			if 'hyp' in t:
				t['hyp'] = labels.postprocess_transcript(lang.normalize_text(t['hyp']))

			if 'ref' in t and 'hyp' in t:
				t['cer'] = t['cer'] if 'cer' in t else metrics.cer(t['hyp'], t['ref'])
				t['wer'] = t['wer'] if 'wer' in t else metrics.wer(t['hyp'], t['ref'])

		if not dry:
			json.dump(transcript, open(transcript_path, 'w'), ensure_ascii = False, indent = 2, sort_keys = True)
		else:
			return transcript
Example #3
0
def main():

    # Load setting file
    settings = configparser.ConfigParser()
    settings_filepath = 'settings.ini'
    settings.read(settings_filepath)

    asr_systems = settings.get('general', 'asr_systems').split(',')
    data_folders = settings.get('general', 'data_folders').split(',')
    supported_speech_file_types = sorted(['flac', 'mp3', 'ogg', 'wav'])

    print('asr_systems: {0}'.format(asr_systems))
    print('data_folders: {0}'.format(data_folders))

    for data_folder in data_folders:
        print('\nWorking on data folder "{0}"'.format(data_folder))
        speech_file_type = settings.get('general', 'speech_file_type')

        # Automatically detect the speech file type.
        # Heuristic: the detected speech file type is the one that has the more speech files in data_folder
        #            e.g., if in data_folder there are 10 mp3s and 25 flacs, then choose flac
        if speech_file_type == 'auto':
            maximum_number_of_speech_files = 0
            detected_speech_file_type = None
            for supported_speech_file_type in supported_speech_file_types:
                potential_speech_filepaths = sorted(
                    glob.glob(
                        os.path.join(
                            data_folder,
                            '*.{0}'.format(supported_speech_file_type))))
                if maximum_number_of_speech_files < len(
                        potential_speech_filepaths):
                    maximum_number_of_speech_files = len(
                        potential_speech_filepaths)
                    detected_speech_file_type = supported_speech_file_type
            speech_file_type = detected_speech_file_type
            print('Detected speech file type: {0}'.format(speech_file_type))
            if detected_speech_file_type is None:
                raise ValueError(
                    'You have set speech_file_type to be "auto" in {1}. We couldn\'t detect any speech file. Speech file extensions should be {2}.'
                    .format(speech_file_type, settings_filepath,
                            supported_speech_file_types))

        if speech_file_type not in supported_speech_file_types:
            raise ValueError(
                'You have set speech_file_type to be "{0}" in {1}. This is invalid. speech_file_type should be flac, ogg, mp3, or wav.'
                .format(speech_file_type, settings_filepath))

        speech_filepaths = sorted(
            glob.glob(
                os.path.join(data_folder, '*.{0}'.format(speech_file_type))))

        if settings.getboolean('general', 'transcribe'):

            # Make sure there are files to transcribe
            if len(speech_filepaths) <= 0:
                raise ValueError(
                    'There is no file with the extension "{0}"  in the folder "{1}"'
                    .format(speech_file_type, data_folder))

            # Transcribe
            print(
                '\n### Call the ASR engines to compute predicted transcriptions'
            )
            for speech_file_number, speech_filepath in enumerate(
                    speech_filepaths):
                # Convert the speech file from FLAC/MP3/Ogg to WAV
                if speech_file_type in ['flac', 'mp3', 'ogg']:
                    from pydub import AudioSegment
                    print('speech_filepath: {0}'.format(speech_filepath))
                    sound = AudioSegment.from_file(speech_filepath,
                                                   format=speech_file_type)
                    new_speech_filepath = speech_filepath[:-len(
                        speech_file_type) - 1] + '.wav'
                    sound.export(new_speech_filepath, format="wav")
                    speech_filepath = new_speech_filepath

                # Transcribe the speech file
                all_transcription_skipped = True
                for asr_system in asr_systems:
                    transcription, transcription_skipped = transcribe.transcribe(
                        speech_filepath,
                        asr_system,
                        settings,
                        save_transcription=True)
                    all_transcription_skipped = all_transcription_skipped and transcription_skipped

                # If the speech file was converted from FLAC/MP3/Ogg to WAV, remove the WAV file
                if speech_file_type in ['flac', 'mp3', 'ogg']:
                    os.remove(new_speech_filepath)

                if not all_transcription_skipped:
                    time.sleep(
                        settings.getint(
                            'general',
                            'delay_in_seconds_between_transcriptions'))

        if settings.getboolean('general', 'evaluate_transcriptions'):
            # Evaluate transcriptions
            all_texts = {}
            print(
                '\n### Final evaluation of all the ASR engines based on their predicted jurisdictions'
            )

            for asr_system in asr_systems:
                all_texts[asr_system] = {}

                all_predicted_transcription_filepath = 'all_predicted_transcriptions_' + asr_system + '.txt'
                all_gold_transcription_filepath = 'all_gold_transcriptions.txt'
                all_predicted_transcription_file = codecs.open(
                    all_predicted_transcription_filepath, 'w',
                    settings.get('general',
                                 'predicted_transcription_encoding'))
                all_gold_transcription_filepath = codecs.open(
                    all_gold_transcription_filepath, 'w',
                    settings.get('general', 'gold_transcription_encoding'))

                number_of_tokens_in_gold = 0
                number_of_empty_predicted_transcription_txt_files = 0
                number_of_missing_predicted_transcription_txt_files = 0
                edit_types = [
                    'corrects', 'deletions', 'insertions', 'substitutions',
                    'changes'
                ]
                number_of_edits = {}

                for edit_type in edit_types:
                    number_of_edits[edit_type] = 0

                for speech_filepath in speech_filepaths:
                    predicted_transcription_filepath_base = '.'.join(
                        speech_filepath.split('.')[:-1]) + '_' + asr_system
                    predicted_transcription_txt_filepath = predicted_transcription_filepath_base + '.txt'

                    if not os.path.isfile(
                            predicted_transcription_txt_filepath):
                        number_of_missing_predicted_transcription_txt_files += 1
                        predicted_transcription = ''
                    else:
                        predicted_transcription = codecs.open(
                            predicted_transcription_txt_filepath, 'r',
                            settings.get('general',
                                         'predicted_transcription_encoding')
                        ).read().strip()
                        if len(predicted_transcription) == 0:
                            #print('predicted_transcription_txt_filepath {0} is empty'.format(predicted_transcription_txt_filepath))
                            number_of_empty_predicted_transcription_txt_files += 1

                    gold_transcription_filepath_base = '.'.join(
                        speech_filepath.split('.')[:-1]) + '_' + 'gold'
                    gold_transcription_filepath_text = gold_transcription_filepath_base + '.txt'
                    gold_transcription = codecs.open(
                        gold_transcription_filepath_text, 'r',
                        settings.get('general',
                                     'gold_transcription_encoding')).read()
                    gold_transcription = metrics.normalize_text(
                        gold_transcription,
                        lower_case=True,
                        remove_punctuation=True,
                        write_numbers_in_letters=True)
                    predicted_transcription = metrics.normalize_text(
                        predicted_transcription,
                        lower_case=True,
                        remove_punctuation=True,
                        write_numbers_in_letters=True)

                    all_predicted_transcription_file.write(
                        '{0}\n'.format(predicted_transcription))
                    all_gold_transcription_filepath.write(
                        '{0}\n'.format(gold_transcription))
                    #print('\npredicted_transcription\t: {0}'.format(predicted_transcription))
                    #print('gold_transcription\t: {0}'.format(gold_transcription))
                    wer = metrics.wer(gold_transcription.split(' '),
                                      predicted_transcription.split(' '),
                                      False)
                    #print('wer: {0}'.format(wer))

                    #if len(predicted_transcription) == 0: continue

                    number_of_tokens_in_gold += len(
                        gold_transcription.split(' '))
                    for edit_type in edit_types:
                        number_of_edits[edit_type] += wer[edit_type]

                all_predicted_transcription_file.close()
                all_gold_transcription_filepath.close()

                wer = number_of_edits['changes'] / number_of_tokens_in_gold
                #print('\nGlobal WER based on the all predicted transcriptions:')
                #print('{3}\twer: {0:.5f}% ({1}; number_of_tokens_in_gold = {2})'.format(wer*100, number_of_edits, number_of_tokens_in_gold,asr_system))
                print(
                    '{5}\twer: {0:.5f}% \t(deletions: {1}\t; insertions: {2}\t; substitutions: {3}\t; number_of_tokens_in_gold = {4})'
                    .format(wer * 100, number_of_edits['deletions'],
                            number_of_edits['insertions'],
                            number_of_edits['substitutions'],
                            number_of_tokens_in_gold, asr_system))
                print('Number of speech files: {0}'.format(
                    len(speech_filepaths)))
                print('Number of missing predicted prescription files: {0}'.
                      format(
                          number_of_missing_predicted_transcription_txt_files))
                print(
                    'Number of empty predicted prescription files: {0}'.format(
                        number_of_empty_predicted_transcription_txt_files))