Example 1
 def synthesize(self, text):
   cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
   seq = text_to_sequence(text, cleaner_names)
   feed_dict = {
     self.model.inputs: [np.asarray(seq, dtype=np.int32)],
     self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
   }
   wav = self.session.run(self.wav_output, feed_dict=feed_dict)
   wav = audio.inv_preemphasis(wav)
   wav = wav[:audio.find_endpoint(wav)]
   out = io.BytesIO()
   audio.save_wav(wav, out)
   return out.getvalue()
Example 2
  def _get_next_example(self):
    '''Loads a single example (input, mel_target, linear_target, cost) from disk'''
    if self._offset >= len(self._metadata):
      self._offset = 0
      random.shuffle(self._metadata)
    meta = self._metadata[self._offset]
    self._offset += 1

    text = meta[3]
    if self._cmudict and random.random() < _p_cmudict:
      text = ' '.join([self._maybe_get_arpabet(word) for word in text.split(' ')])

    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)
    linear_target = np.load(os.path.join(self._datadir, meta[0]))
    mel_target = np.load(os.path.join(self._datadir, meta[1]))
    return (input_data, mel_target, linear_target, len(linear_target))
Example 3
 def get_text(self, text, lang_code):
     text_norm = torch.IntTensor(
         text_to_sequence(text, self.text_cleaners, lang_code,
                          self.cmudict))
     return text_norm
Example 4
def get_text(text):
    return torch.IntTensor(text_to_sequence(text, hps.text_cleaners))
Example 5
def transform_text(text, text_cleaners):
    return text_to_sequence(text, text_cleaners)
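Example 6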
    def generate(self):
        for message in self.messages:
            if message.voice in self.models_22khz:
                self.hparams.sampling_rate = self.default_sampling_rate
                waveglow_path = ""
                if message.voice == "vader:" or message.voice == "duke:":
                    waveglow_path = self.models_path + \
                        self.waveglow_22khz["vader:"]
                elif message.voice == "keanu:" or message.voice == "hal:":
                    waveglow_path = self.models_path + \
                        self.waveglow_22khz["david:"]
                elif message.voice == "johnny:":
                    waveglow_path = self.models_path + \
                        self.waveglow_22khz["johnny:"]
                else:
                    waveglow_path = self.models_path + \
                        self.waveglow_22khz["default"]

                waveglow = torch.load(waveglow_path)["model"]
                waveglow.cuda().eval().half()
                for k in waveglow.convinv:
                    k.float()
                denoiser = Denoiser(waveglow)

                if len(message.text) > 127:
                    self.hparams.max_decoder_steps = 100000
                else:
                    self.hparams.max_decoder_steps = 10000

                trimmed_message_length = len("".join(c for c in message.text
                                                     if c.isalnum()))
                if trimmed_message_length < 4:
                    if message.voice == "vader:" or message.voice == "carlson:":
                        self.hparams.max_decoder_steps = 1000
                        self.hparams.gate_threshold = 0.001
                        if any(char.isdigit() for char in message.text):
                            self.hparams.max_decoder_steps = 10000
                            self.hparams.gate_threshold = 0.5
                elif trimmed_message_length >= 4 and trimmed_message_length < 7:
                    self.hparams.gate_threshold = 0.01
                    if message.voice == "vader:" or message.voice == "carlson:":
                        self.hparams.gate_threshold = 0.01
                        if any(char.isdigit() for char in message.text):
                            self.hparams.gate_threshold = 0.5
                    else:
                        self.hparams.gate_threshold = 0.01
                        if any(char.isdigit() for char in message.text):
                            self.hparams.gate_threshold = 0.1
                elif trimmed_message_length >= 7 and trimmed_message_length < 15:
                    self.hparams.gate_threshold = 0.1
                    if message.voice == "vader:" or message.voice == "carlson:":
                        self.hparams.gate_threshold = 0.01
                        if any(char.isdigit() for char in message.text):
                            self.hparams.gate_threshold = 0.5
                    else:
                        self.hparams.gate_threshold = 0.1
                        if any(char.isdigit() for char in message.text):
                            self.hparams.gate_threshold = 0.2
                else:
                    self.hparams.gate_threshold = 0.5

                message_extended = False
                if trimmed_message_length < 11:
                    if message.voice == "vader:":
                        message.text = "{} -. -------. -------.".format(
                            message.text)
                    else:
                        message.text = "{} -------. -------.".format(
                            message.text)
                    message_extended = True

                model = load_model(self.hparams)
                model.load_state_dict(
                    torch.load(self.models_path +
                               self.models_22khz[message.voice])["state_dict"])
                _ = model.cuda().eval().half()

                sequence = np.array(
                    text_to_sequence(message.text,
                                     ["english_cleaners"]))[None, :]
                sequence = torch.autograd.Variable(
                    torch.from_numpy(sequence)).cuda().long()

                mel_outputs_postnet, requires_cutting = model.inference(
                    sequence)

                with torch.no_grad():
                    audio = waveglow.infer(mel_outputs_postnet, sigma=1)
                # audio_denoised = denoiser(audio, strength=0.001)[:, 0]
                # if np.isnan(audio_denoised.cpu().numpy()[0][0]):
                #     audio_data = audio.cpu().numpy()[0]
                # else:
                #     audio_data = audio_denoised.cpu().numpy()[0]
                audio_data = audio.cpu().numpy()[0]

                scaled_audio = np.int16(audio_data /
                                        np.max(np.abs(audio_data)) *
                                        self.audio_length_parameter)
                if message_extended or requires_cutting:
                    cut_index = 0
                    silence_length = 0
                    for i, val in enumerate(scaled_audio):
                        if val == 0:
                            silence_length += 1
                        if silence_length > 500:
                            cut_index = i
                            break
                    scaled_audio = scaled_audio[:cut_index]
                if message.voice == "vader:":
                    _, effect = read("extras/breathing.wav")
                    scaled_audio = np.concatenate((effect, scaled_audio))

                scaled_audio = np.concatenate((scaled_audio, self.silence))
                self.joined_audio = np.concatenate(
                    (self.joined_audio, scaled_audio))
                if requires_cutting:
                    torch.cuda.empty_cache()
            else:
                engine = pyttsx3.init()
                if self.current_os == "Windows":
                    engine.setProperty(
                        "voice", self.synth_voices_windows[message.voice])
                else:
                    engine.setProperty("voice",
                                       self.synth_voices_linux[message.voice])
                engine.setProperty("rate", 120)
                engine.save_to_file(message.text, self.temp_file)
                engine.runAndWait()

                while not os.path.isfile(self.temp_file):
                    time.sleep(1.5)

                if os.path.isfile(self.temp_file):
                    del engine
                    file = read(
                        os.path.join(os.path.abspath("."), self.temp_file))
                    audio = np.array(file[1], dtype=np.int16)
                    audio = np.concatenate((audio, self.silence))
                    self.joined_audio = np.concatenate(
                        (self.joined_audio, audio))
                    os.remove(self.temp_file)

        scaled_audio = np.int16(self.joined_audio /
                                np.max(np.abs(self.joined_audio)) *
                                self.audio_length_parameter)
        if scaled_audio[0] == self.audio_length_parameter:
            scaled_audio = scaled_audio[1:]

        return scaled_audio, self.hparams.sampling_rate
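Example 7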
    def synthesize(self,
                   texts=None,
                   tokens=None,
                   base_path=None,
                   paths=None,
                   speaker_ids=None,
                   start_of_sentence=None,
                   end_of_sentence=True,
                   pre_word_num=0,
                   post_word_num=0,
                   pre_surplus_idx=0,
                   post_surplus_idx=1,
                   use_short_concat=False,
                   manual_attention_mode=0,
                   base_alignment_path=None,
                   librosa_trim=False,
                   attention_trim=True,
                   isKorean=True):
        # When manual_attention_mode is on, two outputs are produced: one without manual attention applied and one with it applied.
        # Possible inputs (see the usage sketch after this function):
        # 1) text=text
        # 2) text=texts
        # 3) tokens=tokens, texts=texts # use texts as guide

        if type(texts) == str:
            texts = [texts]

        if texts is not None and tokens is None:
            sequences = np.array([text_to_sequence(text) for text in texts])
            sequences = _prepare_inputs(sequences)
        elif tokens is not None:
            sequences = tokens

        #sequences = np.pad(sequences,[(0,0),(0,5)],'constant',constant_values=(0))  # case by case ---> overfitting?

        if paths is None:
            paths = [None] * len(sequences)
        if texts is None:
            texts = [None] * len(sequences)

        time_str = get_time()

        def plot_and_save_parallel(wavs, alignments, use_manual_attention,
                                   mels):

            items = list(
                enumerate(zip(wavs, alignments, paths, texts, sequences,
                              mels)))

            fn = partial(plot_graph_and_save_audio,
                         base_path=base_path,
                         start_of_sentence=start_of_sentence,
                         end_of_sentence=end_of_sentence,
                         pre_word_num=pre_word_num,
                         post_word_num=post_word_num,
                         pre_surplus_idx=pre_surplus_idx,
                         post_surplus_idx=post_surplus_idx,
                         use_short_concat=use_short_concat,
                         use_manual_attention=use_manual_attention,
                         librosa_trim=librosa_trim,
                         attention_trim=attention_trim,
                         time_str=time_str,
                         isKorean=isKorean)
            return parallel_run(fn,
                                items,
                                desc="plot_graph_and_save_audio",
                                parallel=False)

        #input_lengths = np.argmax(np.array(sequences) == 1, 1)+1
        input_lengths = [np.argmax(a == 1) + 1 for a in sequences]

        fetches = [
            #self.wav_output,
            self.model.linear_outputs,
            self.model.alignments,  # batch_size, text length (encoder), target length (decoder)
            self.model.mel_outputs,
        ]

        feed_dict = {
            self.model.inputs: sequences,
            self.model.input_lengths: input_lengths,
        }
        if base_alignment_path is None:
            feed_dict.update({
                self.model.manual_alignments: np.zeros([1, 1, 1]),
                self.model.is_manual_attention: False,
            })
        else:
            manual_alignments = []
            #alignment_path = os.path.join(base_alignment_path,os.path.basename(base_path))
            alignment_path = os.path.join(os.path.basename(base_path),
                                          base_alignment_path)

            for idx in range(len(sequences)):
                numpy_path = "{}{}.npy".format(alignment_path, idx)
                manual_alignments.append(np.load(numpy_path))

            alignments_T = np.transpose(manual_alignments, [0, 2, 1])
            feed_dict.update({
                self.model.manual_alignments: alignments_T,
                self.model.is_manual_attention: True
            })

        if speaker_ids is not None:
            if type(speaker_ids) == dict:
                speaker_embed_table = self.sess.run(self.model.speaker_embed_table)

                speaker_embed = [
                    speaker_ids[speaker_id] * speaker_embed_table[speaker_id]
                    for speaker_id in speaker_ids
                ]
                feed_dict.update({self.model.speaker_embed_table: np.tile()})
            else:
                feed_dict[self.model.speaker_id] = speaker_ids

        wavs, alignments, mels = self.sess.run(fetches, feed_dict=feed_dict)
        results = plot_and_save_parallel(
            wavs, alignments, use_manual_attention=False, mels=mels
        )  # use_manual_attention=True/False only controls whether 'manual' appears in the output filename.

        if manual_attention_mode > 0:
            # argmax one hot
            if manual_attention_mode == 1:
                alignments_T = np.transpose(
                    alignments, [0, 2, 1]
                )  # [batch_size, encoder length, decoder length] ==> [N, D, E], e.g. (1, 50, 200) --> (1, 200, 50)
                new_alignments = np.zeros_like(
                    alignments_T)  # the model expects attention as (N, D, E)

                for idx in range(len(alignments)):  # loop over the batch
                    argmax = alignments[idx].argmax(
                        1)  # where in the audio each input position has the most influence, i.e. where it is pronounced
                    new_alignments[idx][(argmax, range(len(
                        argmax)))] = 1  # set only the argmax positions to 1; everything else stays 0
            # sharpening
            elif manual_attention_mode == 2:
                new_alignments = np.transpose(
                    alignments, [0, 2, 1])  # [N, E, D]  ==> [N,D,E]

                for idx in range(len(alignments)):  # loop over the batch
                    # the variance and mean below are computed but never used
                    var = np.var(
                        new_alignments[idx], 1
                    )  # variance [N, D]: attention variance at each decoder timestep
                    mean_var = var[:input_lengths[idx]].mean()

                    new_alignments[idx] = np.power(new_alignments[idx], 2)
            # pruning
            elif manual_attention_mode == 3:
                new_alignments = np.transpose(alignments,
                                              [0, 2, 1])  # [N, E, D]

                for idx in range(len(alignments)):
                    argmax = alignments[idx].argmax(1)
                    new_alignments[idx][(argmax, range(len(
                        argmax)))] = 1  # set the argmax positions to 1; everything else is kept

            feed_dict.update({
                self.model.manual_alignments: new_alignments,
                self.model.is_manual_attention: True,
            })

            new_wavs, new_alignments, new_mels = self.sess.run(
                fetches, feed_dict=feed_dict)
            results = plot_and_save_parallel(
                new_wavs, new_alignments, use_manual_attention=True, mels=new_mels)

        return results
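
A hypothetical usage sketch for the three input modes listed at the top of this function; the `synth` object, the sample texts, and the output paths are assumptions for illustration only:

# Hypothetical calls; `synth` is assumed to be an instance of the class that
# defines synthesize() above, with its TensorFlow session already restored.

# 1) a single text
synth.synthesize(texts="first sentence", base_path="samples/single")

# 2) a batch of texts
texts = ["first sentence", "second sentence"]
synth.synthesize(texts=texts, base_path="samples/batch")

# 3) pre-padded token sequences, with texts kept only as a guide for plots and filenames
tokens = _prepare_inputs([text_to_sequence(t) for t in texts])
synth.synthesize(tokens=tokens, texts=texts, base_path="samples/tokens")
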
Example 8
    # print(mel_spec.shape)
    # plt.imsave("note/test.jpg",mel_spec.numpy(),cmap='hot')
    checkpoint = './news_output_22k/checkpoint_50000'
    model = load_model(hp)
    model.load_state_dict(torch.load(checkpoint)['state_dict'])
    _ = model.cuda().eval().half()
    waveglow = torch.load(waveglow_path, map_location="cpu")['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    denoiser = Denoiser(waveglow).cuda()


    # Custom input text
    text="各位老师,大家早上好,这是我目前取得的初步结果,生成的嘴唇和声音,可以保持一定的唇音同步,感谢各位老师的指导。"
    text, _ = get_pyin(text)
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(torch.from_numpy(sequence).cuda().long())
    mel_output, mel_output_posnet, _, alignment = model.inference(sequence)
    mel_output = mel_output.float().data.cpu()[0]
    mel_output_posnet = mel_output_posnet.float().data.cpu()[0]
    mel = mel_output_posnet.unsqueeze(0)
    #mel=mel_spec.unsqueeze(0)
    denoiser_strength = 0.1
    output_dir = "note/"
    sampling_rate = 22050
    sigma = 0.66
    i=1
    with torch.no_grad():
        audio = waveglow.infer(mel.cuda(), sigma=sigma)

        audio = denoiser(audio, 0.1)
Example 9
# where the clip file will be written:
save_path = 'audio_test.wav'
# where the pre-trained model is located:
# Inputs for the synthesis:
test_text = "the recommended book for natural language interaction is neural network methods from goldberg"

#GST scores
gst_head_scores = np.array([0.4, 0.2, 0.4])
gst_scores = torch.from_numpy(gst_head_scores).cuda().float()
print('Input sequence and GST weights loaded...')

# TEXT2MEL:
torch.manual_seed(1234)
from text import text_to_sequence
#preprocessing:
sequence = np.array(text_to_sequence(test_text, ['english_cleaners']))[None, :]
sequence = torch.from_numpy(sequence).to(device='cuda', dtype=torch.int64)
print("Input text sequence pre-processed successfully...")
#text to mel inference:
t1 = time.time()
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
        sequence, gst_scores)

# MEL2WAV :
from audio_processing import griffin_lim
from nn_layers import TacotronSTFT
torch.manual_seed(1234)

# Griffin Lim vocoder synthesis:
# griffin_iters = 60
Example 10
 def collect_features(self, text):
     return np.asarray(text_to_sequence(text, self._cleaner_names),
                       dtype=np.int32)
Example 11
def infer(text, model):
	sequence = text_to_sequence(text, hps.text_cleaners)
	sequence = mode(torch.IntTensor(sequence)[None, :]).long()
	mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
	return (mel_outputs, mel_outputs_postnet, alignments)
Example 12
 def get_text(self, text):
     if self.add_space:
       text = " " + text.strip() + " "
     text_norm = torch.IntTensor(
         text_to_sequence(text, self.text_cleaners, getattr(self, "cmudict", None)))
     return text_norm
Example 13
# "models/mellotron_libritts.pt"
mellotron = load_model(hparams).cuda().eval()
mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict'])
waveglow_path = 'models/waveglow_256channels_v4.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
denoiser = Denoiser(waveglow).cuda().eval()
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
audio_paths = 'data/examples_filelist_korean.txt'
dataloader = TextMelLoader(audio_paths, hparams)
datacollate = TextMelCollate(1)
file_idx = 0
audio_path, text, sid, lang_code = dataloader.audiopaths_and_text[file_idx]

# get audio path, encoded text, pitch contour and mel for gst
text_encoded = torch.LongTensor(
    text_to_sequence(text, hparams.text_cleaners, int(lang_code),
                     arpabet_dict))[None, :].cuda()
pitch_contour = dataloader[file_idx][3][None].cuda()
mel = load_mel(audio_path)
print(audio_path, text)

# load source data to obtain rhythm using tacotron 2 as a forced aligner
x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]]))
ipd.Audio(audio_path, rate=hparams.sampling_rate)
speaker_ids = TextMelLoader(
    "filelists/libritts_train_clean_100_audiopath_text_sid_atleast5min_val_filelist.txt",
    hparams).speaker_ids
speakers = pd.read_csv('filelists/libritts_speakerinfo.txt',
                       engine='python',
                       header=None,
                       comment=';',
                       sep=' *\| *',
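Example 14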
    gst_scores = torch.from_numpy(gst_head_scores[j])
    gst_scores = torch.autograd.Variable(gst_scores).cuda().float()
    gst_name = gst_head_names[j]  # is a string

    for i in range(3):

        test_short = test_text_short[i]
        test_medium = test_text_medium[i]
        test_large = test_text_large[i]
        tests_aux = (test_short, test_medium, test_large)

        for k in range(3):

            sequence = np.array(
                text_to_sequence(tests_aux[k], ['english_cleaners']))[None, :]
            sequence = torch.autograd.Variable(
                torch.from_numpy(sequence)).cuda().long()
            # text2mel:
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
                sequence, gst_scores)

            # save the predicted outputs from tacotron2:
            mel_outputs_path = predicted_melspec_folder + "output.pt"
            mel_outputs_postnet_path = predicted_melspec_folder + "output_postnet.pt"
            alignments_path = predicted_melspec_folder + "alignment.pt"
            torch.save(mel_outputs, mel_outputs_path)
            torch.save(mel_outputs_postnet, mel_outputs_postnet_path)
            torch.save(alignments, alignments_path)

            print("text2mel prediction successfully performed...")
Example 15
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
_ = model.cuda().eval().half()

"""**`Step 7: Load WaveGlow for mel2audio synthesis and denoiser`**"""

waveglow_path = '/content/drive/MyDrive/SSMT/waveglow_256channels_universal_v5.pt'
waveglow = torch.load(waveglow_path)['model']
waveglow.cuda().eval().half()
for k in waveglow.convinv:
    k.float()
denoiser = Denoiser(waveglow)

"""**Step 8: Prepare text input**"""

text = "मैं बाज़ार जाता हूँ "
sequence = np.array(text_to_sequence(text, ['transliteration_cleaners']))[None, :]
sequence = torch.autograd.Variable(
    torch.from_numpy(sequence)).cuda().long()

"""**Step 9: Decode text input and plot results**"""

mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
           alignments.float().data.cpu().numpy()[0].T))

"""***A short summary about melspectrograms:***

Sound is heard as a result of the variation of pressure with time. However, speech signals are complex entities, and a raw pressure-versus-time signal does not capture enough structure for the deep learning model to be trained on. In short, a melspectrogram is a graph that plots three quantities: time on the X axis, frequency on the Y axis, and color representing the loudness of the sound.

The alignment graph seen above is a simple representation of the trajectory of the final output compared to its initial text input, i.e. which part of the text the decoder attends to at each output frame.
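
As a minimal illustration of that description, a mel spectrogram can be computed and displayed with librosa roughly as follows (a sketch only; the file name and the FFT/mel parameters are assumptions, not the values used in this notebook):

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# pressure variation over time
y, sr = librosa.load("example.wav", sr=22050)
# mel-scale power spectrogram: time frames x 80 mel frequency bands
mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024,
                                     hop_length=256, n_mels=80)
# convert power to decibels; this is the loudness shown as colour
mel_db = librosa.power_to_db(mel, ref=np.max)
# time on the X axis, mel frequency on the Y axis
librosa.display.specshow(mel_db, sr=sr, hop_length=256,
                         x_axis="time", y_axis="mel")
plt.colorbar(format="%+2.0f dB")
plt.show()
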
Example 16
 def get_text(self, text):
     sequence = text_to_sequence(text, self.text_cleaners)
     text_norm = torch.IntTensor(sequence)
     ctc_text_norm = torch.IntTensor(sequence_to_ctc_sequence(sequence))
     return text_norm, ctc_text_norm
Example 17
def _process_utterance(out_dir, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #rescale wav
    if hparams.rescaling:  # hparams.rescale = True
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #M-AILABS extra silence specific
    if hparams.trim_silence:  # hparams.trim_silence = True
        wav = audio.trim_silence(wav,
                                 hparams)  # Trim leading and trailing silence

    #Mu-law quantize; the default input_type is 'raw'
    if hparams.input_type == 'mulaw-quantize':
        #[0, quantize_channels)
        out = audio.mulaw_quantize(wav, hparams.quantize_channels)

        #Trim silences
        start, end = audio.start_and_end_indices(out,
                                                 hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = audio.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif hparams.input_type == 'mulaw':
        #[-1, 1]
        out = audio.mulaw(wav, hparams.quantize_channels)
        constant_values = audio.mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:  # raw
        #[-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:  # hparams.max_mel_frames = 1000, hparams.clip_mels_length = True
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:  # hparams.use_lws = False
        #Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.fft_size if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        #Zero pad audio signal
        out = np.pad(out, (l, r),
                     mode='constant',
                     constant_values=constant_values)
    else:
        #Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.fft_size,
                                   audio.get_hop_size(hparams))

        #Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    #time resolution adjustment
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    wav_id = os.path.splitext(os.path.basename(wav_path))[0]

    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    linear_filename = '{}-linear.npy'.format(wav_id)
    npz_filename = '{}.npz'.format(wav_id)
    npz_flag = True
    if npz_flag:
        # Use the same keys as the Tacotron code, for consistency.
        data = {
            'audio': out.astype(out_dtype),
            'mel': mel_spectrogram.T,
            'linear': linear_spectrogram.T,
            'time_steps': time_steps,
            'mel_frames': mel_frames,
            'text': text,
            'tokens': text_to_sequence(text),  # a trailing "1" corresponding to eos (~) is appended
            'loss_coeff': 1  # For Tacotron
        }

        np.savez(os.path.join(out_dir, npz_filename),
                 **data,
                 allow_pickle=False)
    else:
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype),
                allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.T,
                allow_pickle=False)
        np.save(os.path.join(out_dir, linear_filename),
                linear_spectrogram.T,
                allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text, npz_filename)
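
A hypothetical driver for the function above, showing how the returned tuples could be collected into the train.txt file mentioned in the docstring (the metadata.csv layout, directory names and worker count are assumptions, not taken from the example):

import os
from concurrent.futures import ProcessPoolExecutor
from functools import partial


def build_training_index(in_dir, out_dir, hparams, num_workers=4):
    # metadata.csv is assumed to hold "wav_name|text" lines (LJSpeech-style)
    os.makedirs(out_dir, exist_ok=True)
    futures = []
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
            for line in f:
                wav_name, text = line.strip().split('|')[:2]
                wav_path = os.path.join(in_dir, 'wavs', wav_name + '.wav')
                futures.append(executor.submit(
                    partial(_process_utterance, out_dir, wav_path, text, hparams)))
        results = [future.result() for future in futures]

    # one pipe-separated line per successfully processed utterance
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for row in results:
            if row is not None:
                f.write('|'.join(map(str, row)) + '\n')
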
Example 18
 def execute_this_fn(self, TOKEN, min_donation, channel, se_opts, use_cuda,
                     model, waveglow, offset, prev_time, startup_time,
                     progress_callback, elapsed_callback, text_ready,
                     fn_callback):
     # TODO: refactor this messy block
     fn_callback.emit(('GUI: start of polling loop', None))
     text_ready.emit("Sta2:Connecting to StreamElements")
     url = "https://api.streamelements.com/kappa/v2/tips/" + self.channel_id
     headers = {
         'accept': 'application/json',
         "Authorization": "Bearer " + TOKEN
     }
     text_ready.emit('Log2:Initializing')
     text_ready.emit('Log2:Minimum amount for TTS: ' + str(min_donation))
     while True:
         _mutex2.lock()
         if _running2 == False:
             _mutex2.unlock()
             break
         else:
             _mutex2.unlock()
         if not channel.get_busy():
             #print('Polling', datetime.datetime.utcnow().isoformat())
             text_ready.emit("Sta2:Waiting for incoming donations . . .")
             current_time = datetime.datetime.utcnow().isoformat()
              # TODO: possible bug: missed donations once the time passes midnight
             querystring = {
                 "offset": offset,
                 "limit": "1",
                 "sort": "createdAt",
                 "after": startup_time,
                 "before": current_time
             }
             response = requests.request("GET",
                                         url,
                                         headers=headers,
                                         params=querystring)
             data = json.loads(response.text)
             for dono in data['docs']:
                 text_ready.emit("Sta2:Processing donations")
                 dono_time = dono['createdAt']
                 offset += 1
                 if dono_time > prev_time:  # Str comparison
                     amount = dono['donation']['amount']  # Int
                     if float(amount) >= min_donation and dono[
                             'approved'] == 'allowed':
                         name = dono['donation']['user']['username']
                         msg = dono['donation']['message']
                         if msg.isspace(): break  # Check for empty line
                         ## TODO Allow multiple speaker in msg
                         currency = dono['donation']['currency']
                         dono_id = dono['_id']
                         text_ready.emit(
                             "Log2:\n###########################")
                         text_ready.emit("Log2:" + name + ' donated ' +
                                         currency + str(amount))
                         text_ready.emit("Log2:" + msg)
                         lines = preprocess_text(msg)
                         if se_opts[
                                 'read dono amount'] == 1:  # reads dono name and amount
                             msg = '{} donated {} {}.'.format(
                                 name, str(amount),
                                 cleaners.expand_currency(currency))
                             lines.insert(0, msg)  # Add to head to list
                         output = []
                         for count, line in enumerate(lines):
                             fn_callback.emit(
                                 ('GUI: progress bar 2 text', (count,
                                                               len(lines))))
                             sequence = np.array(
                                 text_to_sequence(
                                     line, ['english_cleaners']))[None, :]
                             # Inference
                             device = torch.device(
                                 'cuda' if use_cuda else 'cpu')
                             sequence = torch.autograd.Variable(
                                 torch.from_numpy(sequence)).to(
                                     device).long()
                             # Decode text input
                             mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
                                 sequence)
                             with torch.no_grad():
                                 audio = waveglow.infer(
                                     mel_outputs_postnet,
                                     sigma=0.666,
                                     progress_callback=progress_callback,
                                     elapsed_callback=None,
                                      get_interruptflag=self.get_interruptflag2)
                                 if type(audio) != torch.Tensor:
                                     # Catches when waveglow is interrupted and returns none
                                     break
                                 fn_callback.emit(
                                     ('GUI: progress bar 2 text',
                                      (count + 1, len(lines))))
                                 wav = audio[0].data.cpu().numpy()
                             output.append(wav)
                         _mutex3.lock()
                         if _running3 == True:
                             _mutex3.unlock()
                             outwav = np.concatenate(output)
                             # Playback
                             fn_callback.emit(('Wav: playback', outwav))
                         else:
                             _mutex3.unlock()
                         prev_time = dono_time  # Increment time
         time.sleep(0.5)
     fn_callback.emit(('GUI: end of polling loop', None))
     text_ready.emit('Log2:\nDisconnected')
     text_ready.emit('Sta2:Ready')
     fn_callback.emit(('Var: offset', offset))
     fn_callback.emit(('Var: prev_time', prev_time))
     return  #'Return value of execute_this_fn'
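Example 19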
    def synthesize(self,
                   texts=None,
                   tokens=None,
                   base_path=None,
                   paths=None,
                   speaker_ids=None,
                   start_of_sentence=None,
                   end_of_sentence=True,
                   pre_word_num=0,
                   post_word_num=0,
                   pre_surplus_idx=0,
                   post_surplus_idx=1,
                   use_short_concat=False,
                   base_alignment_path=None,
                   librosa_trim=False,
                   attention_trim=True,
                   isKorean=True):
        # Possible inputs:
        # 1) text=text
        # 2) text=texts
        # 3) tokens=tokens, texts=texts # use texts as guide

        if type(texts) == str:
            texts = [texts]

        if texts is not None and tokens is None:
            sequences = np.array([text_to_sequence(text) for text in texts])
            sequences = _prepare_inputs(sequences)
        elif tokens is not None:
            sequences = tokens

        #sequences = np.pad(sequences,[(0,0),(0,5)],'constant',constant_values=(0))  # case by case ---> overfitting?

        if paths is None:
            paths = [None] * len(sequences)
        if texts is None:
            texts = [None] * len(sequences)

        time_str = get_time()

        def plot_and_save_parallel(wavs, alignments, mels):

            items = list(
                enumerate(zip(wavs, alignments, paths, texts, sequences,
                              mels)))

            fn = partial(plot_graph_and_save_audio,
                         base_path=base_path,
                         start_of_sentence=start_of_sentence,
                         end_of_sentence=end_of_sentence,
                         pre_word_num=pre_word_num,
                         post_word_num=post_word_num,
                         pre_surplus_idx=pre_surplus_idx,
                         post_surplus_idx=post_surplus_idx,
                         use_short_concat=use_short_concat,
                         librosa_trim=librosa_trim,
                         attention_trim=attention_trim,
                         time_str=time_str,
                         isKorean=isKorean)
            return parallel_run(fn,
                                items,
                                desc="plot_graph_and_save_audio",
                                parallel=False)

        #input_lengths = np.argmax(np.array(sequences) == 1, 1)+1
        input_lengths = [np.argmax(a == 1) + 1 for a in sequences]

        fetches = [
            #self.wav_output,
            self.model.linear_outputs,
            self.model.alignments,  # batch_size, text length (encoder), target length (decoder)
            self.model.mel_outputs,
        ]

        feed_dict = {
            self.model.inputs: sequences,
            self.model.input_lengths: input_lengths,
        }

        if speaker_ids is not None:
            if type(speaker_ids) == dict:
                speaker_embed_table = self.sess.run(self.model.speaker_embed_table)

                speaker_embed = [
                    speaker_ids[speaker_id] * speaker_embed_table[speaker_id]
                    for speaker_id in speaker_ids
                ]
                feed_dict.update({self.model.speaker_embed_table: np.tile()})
            else:
                feed_dict[self.model.speaker_id] = speaker_ids

        wavs, alignments, mels = self.sess.run(fetches, feed_dict=feed_dict)
        results = plot_and_save_parallel(wavs, alignments, mels=mels)

        return results
Example 20
denoiser = Denoiser(waveglow).cuda().eval()

# ## Setup dataloaders

arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
audio_paths = 'data/examples_filelist.txt'
dataloader = TextMelLoader(audio_paths, hparams)
datacollate = TextMelCollate(1)

# ## Load data
file_idx = 0
audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

# get audio path, encoded text, pitch contour and mel for gst
text_encoded = torch.LongTensor(
    text_to_sequence(text, hparams.text_cleaners, arpabet_dict,
                     0.0))[None, :].cuda()
pitch_contour = dataloader[file_idx][3][None].cuda()
mel = load_mel(audio_path)
# print(audio_path, text)

# load source data to obtain rhythm using tacotron 2 as a forced aligner
x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]]))

ipd.Audio(audio_path, rate=hparams.sampling_rate)

## Define Speakers Set

speaker_ids = TextMelLoader(
    "filelists/libritts_train_clean_100_audiopath_text_sid_atleast5min_val_filelist.txt",
    hparams).speaker_ids
speakers = pd.read_csv('filelists/aidatatang_speakerinfo.txt',
Example 21
import torch
from torch import nn
import numpy as np

from text import text_to_sequence

a = '안녕하세요'
b = '요안'
print(text_to_sequence(a))
print(text_to_sequence(b))
Example 22
    if hparams.bert:
        bert, tokenizer = load_bert(args.bert_folder)

    # Extract phonemic features
    with open(args.text, 'r') as f:
        texts = []
        for line in f.readlines():
            name, sen = line.strip().split(' ')
            if sen[-1] not in ['。', '?', '!']:
                texts.append((name, sen + '。'))
            else:
                texts.append((name, sen))

    for i, (name, text) in tqdm(enumerate(texts)):

        phone_seq = np.array(text_to_sequence(text,
                                              ['chinese_cleaners']))[None, :]
        phones = torch.autograd.Variable(
            torch.from_numpy(phone_seq)).cuda().long()
        if hparams.bert == False:
            sequence = phones
        # Extract BERT embeddings
        else:
            features = extract_embeddings(bert, tokenizer, text)
            sequence = (phones, features)

        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
            sequence)

        if args.alignment:
            plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
                       alignments.float().data.cpu().numpy()[0].T),
Example 23
    audiopath, test_text, speaker = dataloader.audiopaths_and_text[batch_idx[i]]
    #copyfile(audiopath, os.path.join(output_dir, 'ref_true.wav'))
    fname_wav = os.path.join(output_dir, 'ref_true_{}.wav'.format(i))
    mel_outputs_postnet = batch['support']['mel_padded'][ref_idx:ref_idx+1]
    # remove pad
    #mel_len = int(batch['support']['f0_padded'][ref_idx].sum().item())
    mel_len = (mel_outputs_postnet.mean(1) != 0).sum()
    mel_outputs_postnet = mel_outputs_postnet[:,:,:mel_len]
    audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:,0]
    write(fname_wav, hparams.sampling_rate, audio[0].data.cpu().numpy())
    save_figure(mel_outputs_postnet[0].data.cpu().numpy(),
            np.zeros((10,10)), fname_wav.replace('.wav', '.png'),
            description=test_text)
    text_encoded = torch.LongTensor(
            text_to_sequence(test_text,
                hparams.text_cleaners,
                arpabet_dict)
            )[None,:].cuda()
    text_lengths = torch.LongTensor(
            [len(text_encoded)]).cuda()

    input_dict = {'query': {'text_padded': text_encoded, 'input_lengths': text_lengths},
            'support': batch['support']}

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, alignments = model.inference(input_dict)
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:,0]

    fname_wav = os.path.join(output_dir, 'ref_pred_{}.wav'.format(i))
    write(fname_wav, hparams.sampling_rate, audio[0].data.cpu().numpy())
    save_figure(mel_outputs_postnet[0].data.cpu().numpy(), 
Example 24
    copyfile(audio_path, fname_wav)

    # save waveglow original mel
    mel = load_mel(audio_path)

    fname_wav = os.path.join(output_dir, 'ref_recon_{}.wav'.format(idx))
    with torch.no_grad():
        audio = denoiser(waveglow.infer(mel, sigma=0.8), 0.01)[:, 0]
    write(fname_wav, hparams.sampling_rate, audio[0].data.cpu().numpy())
    fname_fig = os.path.join(output_dir, 'true_mel_{}.png'.format(idx))
    save_figure(mel[0].data.cpu().numpy(), np.zeros((10, 10)), fname_fig, text)

    # save waveglow prediction mel
    fname_wav = os.path.join(output_dir, 'pred_{}.wav'.format(idx))
    text_encoded = torch.LongTensor(\
            text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None,:].cuda().long()
    with torch.no_grad():
        _, mel_post, _, attn = model.inference((text_encoded, mel))
        audio = denoiser(waveglow.infer(mel_post, sigma=0.8), 0.01)[:, 0]
    write(fname_wav, hparams.sampling_rate, audio[0].data.cpu().numpy())

    fname_fig = os.path.join(output_dir, 'pred_mel_{}.png'.format(idx))
    save_figure(mel_post[0].data.cpu().numpy(), attn[0].data.cpu().numpy(),
                fname_fig, text)

    print(idx, text)

    # non-parallel predictions
    for text in test_text_list:
        text_encoded = torch.LongTensor(
            text_to_sequence(text, hparams.text_cleaners,
Example 25
    def get_text(self, text):
        text_norm = torch.IntTensor(text_to_sequence(
            text, self.text_cleaners))  # self.cmudict, self.p_arpabet))

        return text_norm
Example 26
 def get_text(self, transcript):
     text = text_to_sequence(transcript, cleaner_names=hps.cleaner_names)
     text = torch.IntTensor(text)
     return text
Example 27
melgan_path = 'models/multi_speaker.pt'
load_vocoder_melgan(melgan_path)

## Setup dataloaders
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
audio_paths = 'data/examples_filelist.txt'
dataloader = TextMelLoader(audio_paths, hparams)
datacollate = TextMelCollate(1)

## Load data
file_idx = 0
audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

# get audio path, encoded text, pitch contour and mel for gst
text_encoded = torch.LongTensor(text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :].cuda()
pitch_contour = dataloader[file_idx][3][None].cuda()
mel = load_mel(audio_path)
print(audio_path, text)

## Define Speakers Set
speaker_ids = TextMelLoader("filelists/libritts_train_clean_100_audiopath_text_sid_atleast5min_val_filelist.txt",
                            hparams).speaker_ids
speakers = pd.read_csv('filelists/libritts_speakerinfo.txt', engine='python', header=None, comment=';', sep=' *\| *',
                       names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])
speakers['MELLOTRON_ID'] = speakers['ID'].apply(lambda x: speaker_ids[x] if x in speaker_ids else -1)
female_speakers = cycle(
    speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())
male_speakers = cycle(
    speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")['MELLOTRON_ID'].sample(frac=1).tolist())
Example 28
 def get_text(self, text):
     #print(text)
     pyin, txt = get_pyin(text)
     #print(pyin)
     text_norm = torch.IntTensor(text_to_sequence(pyin, self.text_cleaners))
     return text_norm
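Example 29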
    def synthesize(self,
                   texts=None,
                   tokens=None,
                   base_path=None,
                   paths=None,
                   speaker_ids=None,
                   start_of_sentence=None,
                   end_of_sentence=True,
                   pre_word_num=0,
                   post_word_num=0,
                   pre_surplus_idx=0,
                   post_surplus_idx=1,
                   use_short_concat=False,
                   manual_attention_mode=0,
                   base_alignment_path=None,
                   librosa_trim=False,
                   attention_trim=True,
                   isKorean=True):

        # Possible inputs:
        # 1) text=text
        # 2) text=texts
        # 3) tokens=tokens, texts=texts # use texts as guide

        if type(texts) == str:
            texts = [texts]

        if texts is not None and tokens is None:
            sequences = [text_to_sequence(text) for text in texts]
        elif tokens is not None:
            sequences = tokens

        if paths is None:
            paths = [None] * len(sequences)
        if texts is None:
            texts = [None] * len(sequences)

        time_str = get_time()

        def plot_and_save_parallel(wavs, alignments, use_manual_attention):

            items = list(
                enumerate(zip(wavs, alignments, paths, texts, sequences)))

            fn = partial(plot_graph_and_save_audio,
                         base_path=base_path,
                         start_of_sentence=start_of_sentence,
                         end_of_sentence=end_of_sentence,
                         pre_word_num=pre_word_num,
                         post_word_num=post_word_num,
                         pre_surplus_idx=pre_surplus_idx,
                         post_surplus_idx=post_surplus_idx,
                         use_short_concat=use_short_concat,
                         use_manual_attention=use_manual_attention,
                         librosa_trim=librosa_trim,
                         attention_trim=attention_trim,
                         time_str=time_str,
                         isKorean=isKorean)
            return parallel_run(fn,
                                items,
                                desc="plot_graph_and_save_audio",
                                parallel=False)

        input_lengths = np.argmax(np.array(sequences) == 1, 1)

        fetches = [
            #self.wav_output,
            self.model.linear_outputs,
            self.model.alignments,
        ]

        feed_dict = {
            self.model.inputs: sequences,
            self.model.input_lengths: input_lengths,
        }
        if base_alignment_path is None:
            feed_dict.update({
                self.model.manual_alignments: np.zeros([1, 1, 1]),
                self.model.is_manual_attention: False,
            })
        else:
            manual_alignments = []
            alignment_path = os.path.join(base_alignment_path,
                                          os.path.basename(base_path))

            for idx in range(len(sequences)):
                numpy_path = "{}.{}.npy".format(alignment_path, idx)
                manual_alignments.append(np.load(numpy_path))

            alignments_T = np.transpose(manual_alignments, [0, 2, 1])
            feed_dict.update({
                self.model.manual_alignments: alignments_T,
                self.model.is_manual_attention: True,
            })

        if speaker_ids is not None:
            if type(speaker_ids) == dict:
                speaker_embed_table = self.sess.run(self.model.speaker_embed_table)

                speaker_embed =  [speaker_ids[speaker_id] * \
                        speaker_embed_table[speaker_id] for speaker_id in speaker_ids]
                feed_dict.update({self.model.speaker_embed_table: np.tile()})
            else:
                feed_dict[self.model.speaker_id] = speaker_ids

        wavs, alignments = \
                self.sess.run(fetches, feed_dict=feed_dict)
        results = plot_and_save_parallel(wavs, alignments, True)

        if manual_attention_mode > 0:
            # argmax one hot
            if manual_attention_mode == 1:
                alignments_T = np.transpose(alignments, [0, 2, 1])  # [N, E, D]
                new_alignments = np.zeros_like(alignments_T)

                for idx in range(len(alignments)):
                    argmax = alignments[idx].argmax(1)
                    new_alignments[idx][(argmax, range(len(argmax)))] = 1
            # sharpening
            elif manual_attention_mode == 2:
                new_alignments = np.transpose(alignments,
                                              [0, 2, 1])  # [N, E, D]

                for idx in range(len(alignments)):
                    var = np.var(new_alignments[idx], 1)
                    mean_var = var[:input_lengths[idx]].mean()

                    new_alignments[idx] = np.power(new_alignments[idx], 2)
            # pruning
            elif manual_attention_mode == 3:
                new_alignments = np.transpose(alignments,
                                              [0, 2, 1])  # [N, E, D]

                for idx in range(len(alignments)):
                    argmax = alignments[idx].argmax(1)
                    new_alignments[idx][(argmax, range(len(argmax)))] = 1

            feed_dict.update({
                self.model.manual_alignments: new_alignments,
                self.model.is_manual_attention: True,
            })

            new_wavs, new_alignments = \
                    self.sess.run(fetches, feed_dict=feed_dict)
            results = plot_and_save_parallel(new_wavs, new_alignments, True)

        return results
Example 30
 def get_text(self, text):
     text_norm = torch.LongTensor(text_to_sequence(text, [self.text_cleaners]))
     return text_norm
Example 31
 def get_text(self, text):
     text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
     return text_norm
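Example 32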
def build_from_path(config):
    warning("Sampling rate: {}".format(hparams.sample_rate))

    executor = ProcessPoolExecutor(max_workers=config.num_workers)
    futures = []
    index = 1

    base_dir = os.path.dirname(config.metadata_path)
    data_dir = os.path.join(base_dir, config.data_dirname)
    makedirs(data_dir)

    loss_coeff = defaultdict(one)
    if config.metadata_path.endswith("json"):
        with open(config.metadata_path) as f:
            content = f.read()
        info = json.loads(content)
    elif config.metadata_path.endswith("csv"):
        with open(config.metadata_path) as f:
            info = {}
            for line in f:
                path, text = line.strip().split('|')
                info[path] = text
    else:
        raise Exception(" [!] Unkown metadata format: {}".format(config.metadata_path))

    new_info = {}
    for path in info.keys():
        if not os.path.exists(path):
            new_path = os.path.join(base_dir, path)
            if not os.path.exists(new_path):
                print(" [!] Audio not found: {}".format([path, new_path]))
                continue
        else:
            new_path = path

        new_info[new_path] = info[path]

    info = new_info

    for path in info.keys():
        if type(info[path]) == list:
            if hparams.ignore_recognition_level == 1 and len(info[path]) == 1 or \
                    hparams.ignore_recognition_level == 2:
                loss_coeff[path] = hparams.recognition_loss_coeff

            info[path] = info[path][0]

    ignore_description = {
        0: "use all",
        1: "ignore only unmatched_alignment",
        2: "fully ignore recognitio",
    }

    print(" [!] Skip recognition level: {} ({})". \
            format(hparams.ignore_recognition_level,
                   ignore_description[hparams.ignore_recognition_level]))

    for audio_path, text in info.items():
        if hparams.ignore_recognition_level > 0 and loss_coeff[audio_path] != 1:
            continue

        if base_dir not in audio_path:
            audio_path = os.path.join(base_dir, audio_path)

        try:
            tokens = text_to_sequence(text)
        except:
            continue

        fn = partial(
                _process_utterance,
                audio_path, data_dir, tokens, loss_coeff[audio_path])
        futures.append(executor.submit(fn))

    n_frames = [future.result() for future in tqdm(futures)]
    n_frames = [n_frame for n_frame in n_frames if n_frame is not None]

    hours = frames_to_hours(n_frames)

    print(' [*] Loaded metadata for {} examples ({:.2f} hours)'.format(len(n_frames), hours))
    print(' [*] Max length: {}'.format(max(n_frames)))
    print(' [*] Min length: {}'.format(min(n_frames)))

    plot_n_frames(n_frames, os.path.join(
            base_dir, "n_frames_before_filter.png"))

    min_n_frame = hparams.reduction_factor * hparams.min_iters
    max_n_frame = hparams.reduction_factor * hparams.max_iters - hparams.reduction_factor

    n_frames = [n for n in n_frames if min_n_frame <= n <= max_n_frame]
    hours = frames_to_hours(n_frames)

    print(' [*] After filtered: {} examples ({:.2f} hours)'.format(len(n_frames), hours))
    print(' [*] Max length: {}'.format(max(n_frames)))
    print(' [*] Min length: {}'.format(min(n_frames)))

    plot_n_frames(n_frames, os.path.join(
            base_dir, "n_frames_after_filter.png"))