def create_batch_inputs_from_texts(texts):
    sequences = [text_to_sequence(text) for text in texts]

    inputs = _prepare_inputs(sequences)
    input_lengths = np.asarray([len(x) for x in inputs], dtype=np.int32)

    # Sanity check: decoding each padded sequence should reproduce the
    # jamo-decomposed input text. Log any mismatch for inspection.
    for idx, (seq, text) in enumerate(zip(inputs, texts)):
        recovered_text = sequence_to_text(seq, skip_eos_and_pad=True)
        if recovered_text != h2j(text):
            log(" [{}] {}".format(idx, text))
            log(" [{}] {}".format(idx, recovered_text))
            log("=" * 30)

    return inputs, input_lengths
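
# Usage sketch (illustrative only; not part of the original module). Assumes the
# text utilities used above (text_to_sequence, sequence_to_text, h2j) are
# importable in this scope; the sample sentences are arbitrary placeholders:
#
#   inputs, input_lengths = create_batch_inputs_from_texts(
#       ["안녕하세요.", "오늘 날씨가 좋네요."])
#   # inputs        -> int32 array of shape [batch, padded_sequence_length]
#   # input_lengths -> int32 array with one entry per batch item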
def synthesize(self,
               texts=None, tokens=None,
               base_path=None, paths=None, speaker_ids=None,
               start_of_sentence=None, end_of_sentence=True,
               pre_word_num=0, post_word_num=0,
               pre_surplus_idx=0, post_surplus_idx=1,
               use_short_concat=False,
               manual_attention_mode=0,
               base_alignment_path=None,
               librosa_trim=False, attention_trim=True, isKorean=True):
    # When manual_attention_mode is on, two outputs are produced: one without
    # manual attention applied and one with it applied.

    # Possible inputs:
    # 1) texts=text                   (a single string)
    # 2) texts=texts                  (a list of strings)
    # 3) tokens=tokens, texts=texts   # texts used only as a guide

    if type(texts) == str:
        texts = [texts]

    if texts is not None and tokens is None:
        sequences = np.array([text_to_sequence(text) for text in texts])
        sequences = _prepare_inputs(sequences)
    elif tokens is not None:
        sequences = tokens

    #sequences = np.pad(sequences, [(0, 0), (0, 5)], 'constant', constant_values=(0))  # case by case ---> overfitting?

    if paths is None:
        paths = [None] * len(sequences)
    if texts is None:
        texts = [None] * len(sequences)

    time_str = get_time()

    def plot_and_save_parallel(wavs, alignments, use_manual_attention, mels):
        items = list(enumerate(zip(wavs, alignments, paths, texts, sequences, mels)))

        fn = partial(
            plot_graph_and_save_audio,
            base_path=base_path,
            start_of_sentence=start_of_sentence, end_of_sentence=end_of_sentence,
            pre_word_num=pre_word_num, post_word_num=post_word_num,
            pre_surplus_idx=pre_surplus_idx, post_surplus_idx=post_surplus_idx,
            use_short_concat=use_short_concat,
            use_manual_attention=use_manual_attention,
            librosa_trim=librosa_trim,
            attention_trim=attention_trim,
            time_str=time_str,
            isKorean=isKorean)
        return parallel_run(fn, items, desc="plot_graph_and_save_audio", parallel=False)

    # Per-example length: position of the first token with id 1 (EOS), plus one.
    #input_lengths = np.argmax(np.array(sequences) == 1, 1) + 1
    input_lengths = [np.argmax(a == 1) + 1 for a in sequences]

    fetches = [
        #self.wav_output,
        self.model.linear_outputs,
        self.model.alignments,  # [batch_size, text length (encoder), target length (decoder)]
        self.model.mel_outputs,
    ]

    feed_dict = {
        self.model.inputs: sequences,
        self.model.input_lengths: input_lengths,
    }

    if base_alignment_path is None:
        feed_dict.update({
            self.model.manual_alignments: np.zeros([1, 1, 1]),
            self.model.is_manual_attention: False,
        })
    else:
        # Load precomputed alignments from disk and feed them as manual attention.
        manual_alignments = []

        #alignment_path = os.path.join(base_alignment_path, os.path.basename(base_path))
        alignment_path = os.path.join(os.path.basename(base_path), base_alignment_path)

        for idx in range(len(sequences)):
            numpy_path = "{}{}.npy".format(alignment_path, idx)
            manual_alignments.append(np.load(numpy_path))

        alignments_T = np.transpose(manual_alignments, [0, 2, 1])

        feed_dict.update({
            self.model.manual_alignments: alignments_T,
            self.model.is_manual_attention: True,
        })

    if speaker_ids is not None:
        if type(speaker_ids) == dict:
            # speaker_ids given as {speaker_id: weight}: mix speaker embeddings.
            speaker_embed_table = self.sess.run(self.model.speaker_embed_table)

            speaker_embed = [speaker_ids[speaker_id] * speaker_embed_table[speaker_id]
                             for speaker_id in speaker_ids]
            feed_dict.update({
                # NOTE: np.tile() is called without arguments in the original source;
                # the intended embedding-table override is left unspecified here.
                self.model.speaker_embed_table: np.tile()
            })
        else:
            feed_dict[self.model.speaker_id] = speaker_ids

    wavs, alignments, mels = self.sess.run(fetches, feed_dict=feed_dict)

    # use_manual_attention=True/False only controls whether 'manual' appears in
    # the output file name.
    results = plot_and_save_parallel(wavs, alignments, use_manual_attention=False, mels=mels)

    if manual_attention_mode > 0:
        # argmax one-hot
        if manual_attention_mode == 1:
            alignments_T = np.transpose(alignments, [0, 2, 1])  # [N, E, D] (batch, encoder, decoder) ==> [N, D, E], e.g.
            # (1, 50, 200) --> (1, 200, 50)
            new_alignments = np.zeros_like(alignments_T)  # attention in the model has shape (N, D, E)
            for idx in range(len(alignments)):  # loop over the batch
                # For each encoder (text) position, find the decoder step where its
                # attention is strongest, i.e. where that character is pronounced.
                argmax = alignments[idx].argmax(1)
                # Set only the argmax positions to 1; everything else stays 0.
                new_alignments[idx][(argmax, range(len(argmax)))] = 1

        # sharpening
        elif manual_attention_mode == 2:
            new_alignments = np.transpose(alignments, [0, 2, 1])  # [N, E, D] ==> [N, D, E]
            for idx in range(len(alignments)):  # loop over the batch
                # NOTE: var and mean_var are computed but never used below.
                var = np.var(new_alignments[idx], 1)  # attention variance per decoder timestep
                mean_var = var[:input_lengths[idx]].mean()
                new_alignments[idx] = np.power(new_alignments[idx], 2)

        # pruning
        elif manual_attention_mode == 3:
            new_alignments = np.transpose(alignments, [0, 2, 1])  # [N, E, D] ==> [N, D, E]
            for idx in range(len(alignments)):
                argmax = alignments[idx].argmax(1)
                # Set the argmax positions to 1; all other attention values are kept.
                new_alignments[idx][(argmax, range(len(argmax)))] = 1

        feed_dict.update({
            self.model.manual_alignments: new_alignments,
            self.model.is_manual_attention: True,
        })

        # fetches also includes mel_outputs, so unpack three values and pass the
        # mels through to the plotting helper.
        new_wavs, new_alignments, new_mels = self.sess.run(fetches, feed_dict=feed_dict)
        results = plot_and_save_parallel(new_wavs, new_alignments, True, mels=new_mels)

    return results
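
# Usage sketch (illustrative only; not part of the original module). Assumes
# `synthesizer` is an instance of the surrounding class with its TensorFlow
# session and model already restored; the path and ids below are placeholders:
#
#   results = synthesizer.synthesize(
#       texts=["안녕하세요."],          # a single Korean sentence
#       base_path="samples/demo",       # hypothetical output prefix
#       speaker_ids=[0],                # one integer speaker id per batch item
#       manual_attention_mode=1,        # also render the argmax one-hot attention version
#       attention_trim=True,
#       isKorean=True)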