# Example no. 1
# 0
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    """Write one sample's synthesized audio and alignment plot to log_dir.

    args is (idx, (sequence, spectrogram, alignment)) for a single sample;
    output filenames are derived from prefix, step and idx.
    """
    idx, (seq, spec, align) = args

    audio_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    align_path = os.path.join(
        log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    # Reconstruct the waveform from the (transposed) spectrogram and save it.
    save_audio(inv_spectrogram(spec.T), audio_path)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)

    # Korean text is tokenized as jamo; recombine the jamo when rendering
    # the alignment labels.
    cleaners = [c.strip() for c in hparams.cleaners.split(',')]
    is_korean = 'korean_cleaners' in cleaners

    if is_korean:
        log('Training korean : Use jamo')
    else:
        log('Training non-korean : X use jamo')

    plot.plot_alignment(
        align,
        align_path,
        info=info_text,
        text=sequence_to_text(seq,
                              skip_eos_and_pad=True,
                              combine_jamo=is_korean),
        isKorean=is_korean)
def split_on_silence_with_librosa(audio_path,
                                  top_db=40,
                                  frame_length=1024,
                                  hop_length=256,
                                  skip_idx=0,
                                  out_ext="wav",
                                  min_segment_length=3,
                                  max_segment_length=8,
                                  pre_silence_length=0,
                                  post_silence_length=0):
    """Split an audio file on silence and save each kept segment.

    The audio is first de-breathed interval-by-interval (a "no_breath" copy
    is saved next to the original), then re-split on silence; segments whose
    duration lies strictly between min_segment_length and max_segment_length
    (seconds) are padded with pre/post silence and written next to the input
    as "<name>.<idx:04d>.<out_ext>".

    Returns the list of written segment paths.
    """
    filename = os.path.basename(audio_path).split('.', 1)[0]
    # NOTE(fix): removed the unused `in_ext = audio_path.rsplit(".")[1]`
    # line — it was dead code, and `rsplit` without a maxsplit returned the
    # wrong element for names containing more than one dot anyway.

    audio = load_audio(audio_path)

    # First pass: find non-silent intervals in the raw audio.
    edges = librosa.effects.split(audio,
                                  top_db=top_db,
                                  frame_length=frame_length,
                                  hop_length=hop_length)

    # Rebuild the signal with breath noise removed inside each interval;
    # everything outside the intervals stays silent (zeros).
    new_audio = np.zeros_like(audio)
    for start, end in edges[skip_idx:]:
        new_audio[start:end] = remove_breath(audio[start:end])

    save_audio(new_audio, add_postfix(audio_path, "no_breath"))
    audio = new_audio

    # Second pass: re-split the cleaned audio on silence.
    edges = librosa.effects.split(audio,
                                  top_db=top_db,
                                  frame_length=frame_length,
                                  hop_length=hop_length)

    audio_paths = []
    for idx, (start, end) in enumerate(edges[skip_idx:]):
        segment = audio[start:end]
        duration = get_duration(segment)

        # Keep only segments strictly inside (min, max) seconds.
        if duration <= min_segment_length or duration >= max_segment_length:
            continue

        output_path = "{}/{}.{:04d}.{}".format(os.path.dirname(audio_path),
                                               filename, idx, out_ext)

        padded_segment = np.concatenate([
            get_silence(pre_silence_length),
            segment,
            get_silence(post_silence_length),
        ])

        save_audio(padded_segment, output_path)
        audio_paths.append(output_path)

    return audio_paths
# Example no. 3
# 0
def save_and_plot_fn(args, log_dir, step, loss, prefix):
    """Write one sample's synthesized audio and alignment plot to log_dir.

    args is (idx, (sequence, spectrogram, alignment)) for a single sample;
    output filenames are derived from prefix, step and idx.
    """
    idx, (seq, spec, align) = args

    audio_path = os.path.join(
        log_dir, '{}-step-{:09d}-audio{:03d}.wav'.format(prefix, step, idx))
    # Fixed: the alignment image was previously written as
    # "...-audioNNN.png", mislabeling it as audio; name it "align" instead.
    align_path = os.path.join(
        log_dir, '{}-step-{:09d}-align{:03d}.png'.format(prefix, step, idx))

    waveform = inv_spectrogram(spec.T)
    save_audio(waveform, audio_path)

    info_text = 'step={:d}, loss={:.5f}'.format(step, loss)
    plot.plot_alignment(
        align, align_path, info=info_text,
        text=sequence_to_text(seq,
                              skip_eos_and_pad=True, combine_jamo=True))
# Example no. 4
# 0
def text_recognition(path, config):
    """Transcribe an audio file with Google Cloud Speech (ko-KR).

    Returns a dict {path: transcript}; an empty dict when the clip exceeds
    the configured maximum duration. The result is cached as JSON in a
    sibling ".txt" file and returned directly on subsequent calls.

    Raises:
        Exception: wrapping any error raised while recognizing.
    """
    root, _ = os.path.splitext(path)
    txt_path = root + ".txt"

    # Reuse a cached transcription when one exists.
    if os.path.exists(txt_path):
        with open(txt_path) as f:
            # Fixed: read via the already-open handle instead of opening the
            # file a second time and leaking that handle.
            return json.load(f)

    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    out = {}

    tmp_path = root + ".tmp.wav"

    while True:
        try:
            client = speech.SpeechClient()

            content = load_audio(
                path,
                pre_silence_length=config.pre_silence_length,
                post_silence_length=config.post_silence_length)

            max_duration = config.max_duration - \
                    config.pre_silence_length - config.post_silence_length
            audio_duration = get_duration(content)

            if audio_duration >= max_duration:
                print(" [!] Skip {} because of duration: {} > {}". \
                        format(path, audio_duration, max_duration))
                return {}

            content = resample_audio(content, config.sample_rate)
            save_audio(content, tmp_path, config.sample_rate)

            with io.open(tmp_path, 'rb') as f:
                audio = types.RecognitionAudio(content=f.read())

            # Fixed: use a distinct name — the original rebound `config`,
            # shadowing the function parameter.
            recognition_config = types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=config.sample_rate,
                language_code='ko-KR')

            response = client.recognize(recognition_config, audio)
            if len(response.results) > 0:
                alternatives = response.results[0].alternatives

                results = [
                    alternative.transcript for alternative in alternatives
                ]
                assert len(results) == 1, "More than 1 results: {}".format(
                    results)

                out = {path: "" if len(results) == 0 else results[0]}
                print(path, results[0])
                break
            break
        except Exception as err:
            # The retry bookkeeping that used to follow this raise was
            # unreachable dead code and has been removed; chain the cause
            # so the original traceback is preserved.
            raise Exception("OS error: {0}".format(err)) from err

    remove_file(tmp_path)
    with open(txt_path, 'w') as f:
        json.dump(out, f, indent=2, ensure_ascii=False)

    return out
def plot_graph_and_save_audio(args,
                              base_path=None,
                              start_of_sentence=None,
                              end_of_sentence=None,
                              pre_word_num=0,
                              post_word_num=0,
                              pre_surplus_idx=0,
                              post_surplus_idx=1,
                              use_short_concat=False,
                              use_manual_attention=False,
                              save_alignment=False,
                              librosa_trim=False,
                              attention_trim=False,
                              time_str=None,
                              isKorean=True):
    """Render one synthesized sample: plot its alignment and save its audio.

    args is (idx, (wav, alignment, path, text, sequence)) for one sample;
    `wav` here is a spectrogram (it is passed to inv_spectrogram below).
    Returns True when the audio was written to disk (path or base_path
    given), otherwise the raw audio bytes from an in-memory buffer.
    """
    idx, (wav, alignment, path, text, sequence) = args

    # Choose where to write the alignment plot: a timestamped name under
    # base_path, next to an explicit output path, or nowhere.
    if base_path:
        plot_path = "{}/{}.png".format(base_path, get_time())
    elif path:
        plot_path = path.rsplit('.', 1)[0] + ".png"
    else:
        plot_path = None

    #plot_path = add_prefix(plot_path, time_str)
    if use_manual_attention:
        plot_path = add_postfix(plot_path, "manual")

    if plot_path:
        plot.plot_alignment(alignment, plot_path, text=text, isKorean=isKorean)

    if use_short_concat:
        wav = short_concat(wav, alignment, text, start_of_sentence,
                           end_of_sentence, pre_word_num, post_word_num,
                           pre_surplus_idx, post_surplus_idx)

    if attention_trim and end_of_sentence:
        # Trim trailing frames once attention has settled on the final
        # input token: walk the per-frame argmax of the alignment and stop
        # after it has dwelt on the last attended index (capped at 5 hits)
        # or moved past it.
        end_idx_counter = 0
        attention_argmax = alignment.argmax(0)
        end_idx = min(len(sequence) - 1, max(attention_argmax))
        max_counter = min((attention_argmax == end_idx).sum(), 5)

        for jdx, attend_idx in enumerate(attention_argmax):
            if len(attention_argmax) > jdx + 1:
                if attend_idx == end_idx:
                    end_idx_counter += 1

                if attend_idx == end_idx and attention_argmax[jdx +
                                                              1] > end_idx:
                    break

                if end_idx_counter >= max_counter:
                    break
            else:
                break

        # jdx is the loop's final value; +3 frames of slack, scaled by the
        # decoder's reduction factor, converts it to a spectrogram index.
        spec_end_idx = hparams.reduction_factor * jdx + 3
        wav = wav[:spec_end_idx]

    audio_out = inv_spectrogram(wav.T)

    if librosa_trim and end_of_sentence:
        # Cut trailing silence; only the end index of the trim interval is
        # used, so leading audio is kept.
        yt, index = librosa.effects.trim(audio_out,
                                         frame_length=5120,
                                         hop_length=256,
                                         top_db=50)
        audio_out = audio_out[:index[-1]]

    if save_alignment:
        alignment_path = "{}/{}.npy".format(base_path, idx)
        np.save(alignment_path, alignment, allow_pickle=False)

    if path or base_path:
        if path:
            current_path = add_postfix(path, idx)
        elif base_path:
            current_path = plot_path.replace(".png", ".wav")

        save_audio(audio_out, current_path)
        return True
    else:
        # No destination: return the encoded audio as bytes.
        io_out = io.BytesIO()
        save_audio(audio_out, io_out)
        result = io_out.getvalue()
        return result