def test_single(trainer, speaker2id_path, result_dir, enc_only, s_speaker, t_speaker):
    with open(speaker2id_path, 'r') as f_json:
        speaker2id = json.load(f_json)

    if s_speaker == 'S015':
        filename = './data/english/train/unit/S015_0361841101.wav'
    elif s_speaker == 'S119':
        filename = './data/english/train/unit/S119_1561145062.wav'
    else:
        raise NotImplementedError('Please modify path manually!')

    # load the source utterance and run a single conversion step
    _, spec = get_spectrograms(filename)
    spec_expand = np.expand_dims(spec, axis=0)
    spec_tensor = torch.from_numpy(spec_expand).type(torch.FloatTensor)
    c = torch.from_numpy(np.array([speaker2id[t_speaker]])).cuda()  # target speaker condition
    result = trainer.test_step(spec_tensor, c, enc_only=enc_only)
    result = result.squeeze(axis=0).transpose((1, 0))

    # vocode the converted spectrogram and save the waveform
    wav_data = spectrogram2wav(result)
    write(os.path.join(result_dir, 'result.wav'), rate=16000, data=wav_data)
    print('Testing on source speaker {} and target speaker {}, output shape: {}'.format(
        s_speaker, t_speaker, result.shape))

def target_classify(trainer, seg_len, synthesis_list, result_dir, flag='test'):
    dir_path = os.path.join(result_dir, f'{flag}/')
    with open(synthesis_list, 'r') as f:
        file = f.readlines()

    acc = []
    for line in file:
        # get wav path
        line = line.split('\n')[0].split(' ')
        utt_id = line[0].split('/')[1].split('_')[1]
        tar_speaker = line[1]
        wav_path = os.path.join(dir_path, f'{tar_speaker}_{utt_id}.wav')

        # get spectrogram
        _, spec = get_spectrograms(wav_path)

        # pad spectrogram so that it spans at least one full segment
        if len(spec) < seg_len:
            padding = np.zeros((seg_len - spec.shape[0], spec.shape[1]))
            spec = np.concatenate((spec, padding), axis=0)

        # classify every full-length segment; fragments shorter than seg_len at the tail are dropped
        logits = []
        for idx in range(0, len(spec), seg_len):
            spec_frag = spec[idx:idx + seg_len]
            if len(spec_frag) >= seg_len:
                x = torch.from_numpy(
                    np.expand_dims(spec_frag[:seg_len, :], axis=0)).type(torch.FloatTensor)
                logit = trainer.classify(x)
                logits.append(logit)
            elif idx == 0:
                raise RuntimeError('Please check if input is too short!')
        logits = np.concatenate(logits, axis=0)

        # a segment counts as correct if its argmax matches the target speaker
        for logit in logits:
            am = logit.argmax()
            if am == 0:
                clf_speaker = 'V001'
            elif am == 1:
                clf_speaker = 'V002'
            else:
                clf_speaker = 'None'
            acc.append(1 if clf_speaker == tar_speaker else 0)

    print('Classification Acc: {:.3f}'.format(np.sum(acc) / float(len(acc))))

def encode_for_tacotron(target, trainer, seg_len, multi2idx_path, wav_path, result_path):
    wavs = sorted(glob.glob(os.path.join(wav_path, '*.wav')))
    print('[Converter] - Number of wav files to be encoded: ', len(wavs))

    # encode every utterance of the target speaker into discrete units
    names = []
    enc_outputs = []
    for wav_file in tqdm(wavs):
        name = wav_file.split('/')[-1].split('.')[0]
        s_id = name.split('_')[0]
        u_id = name.split('_')[1]
        if s_id != target:
            continue
        y, sr = librosa.load(wav_file)
        d = librosa.get_duration(y=y, sr=sr)
        if d > 25:
            continue  # filters out utterances that are too long; keeps 3523/3533 for V001 and V002 together in the english dataset
        _, spec = get_spectrograms(wav_file)
        encodings = encode(spec, trainer, seg_len, save=False)
        encodings = parse_encodings(encodings)
        enc_outputs.append(encodings)
        names.append((s_id, u_id))

    # build encoding-to-character mapping
    idx = 0
    multi2idx = {}
    print('[Converter] - Building encoding to symbol mapping...')
    for encodings in tqdm(enc_outputs):
        for encoding in encodings:
            if str(encoding) not in multi2idx:
                multi2idx[str(encoding)] = symbols[idx]
                idx += 1

    print('[Converter] - Number of unique discrete units: ', len(multi2idx))
    with open(multi2idx_path, 'w') as file:
        file.write(json.dumps(multi2idx))

    # write the Tacotron meta file: one '<speaker>_<utt>|<symbol sequence>' entry per line
    result_path = result_path.replace('target', target)
    print('[Converter] - Writing to meta file...')
    with open(result_path, 'w') as file:
        for i, encodings in enumerate(enc_outputs):
            file.write(str(names[i][0]) + '_' + str(names[i][1]) + '|')
            for encoding in encodings:
                file.write(multi2idx[str(encoding)])
            file.write('\n')

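# Example invocation of encode_for_tacotron (a minimal sketch; the paths below are
# hypothetical placeholders, not files guaranteed to ship with the repo). Note that
# result_path should contain the substring 'target', which the function replaces
# with the actual target speaker id before writing the meta file.
#
#   encode_for_tacotron(target='V001',
#                       trainer=trainer,
#                       seg_len=hps.seg_len,
#                       multi2idx_path='./data/english/multi2idx.json',
#                       wav_path='./data/english/train/voice/',
#                       result_path='./result/meta_text_target.txt')
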
def test_single(trainer, seg_len, speaker2id_path, result_dir, enc_only, s_speaker, t_speaker):
    with open(speaker2id_path, 'r') as f_json:
        speaker2id = json.load(f_json)

    if s_speaker == 'S015':
        filename = './data/english/train/unit/S015_0361841101.wav'
    elif s_speaker == 'S119':
        filename = './data/english/train/unit/S119_1561145062.wav'
    elif s_speaker == 'S130':
        filename = './data/english/test/S130_3516588097.wav'
    elif s_speaker == 'S089':
        filename = './data/english/test/S089_1810826781.wav'
    elif s_speaker == 'S378':
        filename = './data/surprise/test/S378_117437.wav'
    else:
        raise NotImplementedError('Please modify path manually!')

    _, spec = get_spectrograms(filename)
    wav_data, encodings = convert(trainer,
                                  seg_len,
                                  src_speaker_spec=spec,
                                  src_speaker=s_speaker,
                                  tar_speaker=t_speaker,
                                  utt_id='',
                                  speaker2id=speaker2id,
                                  result_dir=result_dir,
                                  enc_only=enc_only,
                                  save=[])
    sf.write(os.path.join(result_dir, 'result.wav'), wav_data, hp.sr, 'PCM_16')
    write_encodings(os.path.join(result_dir, 'result.txt'), encodings)

    err_result = compare_asr(filename, os.path.join(result_dir, 'result.wav'))
    print('Testing on source speaker {} and target speaker {}, output shape: {}'.format(
        s_speaker, t_speaker, wav_data.shape))
    print('Comparing ASR result - WERR: {:.3f} CERR: {:.3f}'.format(
        err_result[0], err_result[1]))

def main():
    #---initialize---#
    args = get_test_args()
    HPS = Hps(args.hps_path)
    hps = HPS.get_tuple()
    trainer = get_trainer(args.hps_path, args.encoder_path, hps.g_mode, hps.enc_mode)

    if args.eval_t == 'None':
        print('[Tacotron] - None is not a valid evaluation target! Please specify the target manually; it must be either V001 or V002.')
        return

    # Tacotron implementation: https://github.com/andi611/TTS-Tacotron-Pytorch
    model = Tacotron(n_vocab=len(symbols),
                     embedding_dim=config.embedding_dim,
                     mel_dim=config.num_mels,
                     linear_dim=config.num_freq,
                     r=config.outputs_per_step,
                     padding_idx=config.padding_idx,
                     attention=config.attention,
                     use_mask=config.use_mask)

    #---handle path---#
    result_dir = os.path.join(args.result_dir, args.sub_result_dir)
    os.makedirs(result_dir, exist_ok=True)
    checkpoint_path = os.path.join(args.ckpt_dir, args.model_name)
    if args.dataset == 'english' and not os.path.isdir('./ckpt_tacotron_english'):
        print('[Tacotron] - Recommend using the following name for ckpt_dir: ./ckpt_tacotron_english/')
    elif args.dataset == 'surprise' and not os.path.isdir('./ckpt_tacotron_surprise'):
        print('[Tacotron] - Recommend using the following name for ckpt_dir: ./ckpt_tacotron_surprise/')

    #---load and set model---#
    print('[Tacotron] - Testing on the {} set.'.format(args.dataset))
    print('[Tacotron] - Loading model: ', checkpoint_path)
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint["state_dict"])

    #---load and set mappings---#
    print('[Tacotron] - Loading mapping files: ', args.speaker2id_path)
    valid_arguments(valid_target=args.dataset, arg=args.speaker2id_path)
    with open(args.speaker2id_path, 'r') as f_json:
        speaker2id = json.load(f_json)
    print('[Tacotron] - Loading mapping files: ', args.multi2idx_path)
    with open(args.multi2idx_path, 'r') as f_json:
        multi2idx = json.load(f_json)

    if not args.test_single:
        #---parse testing list---#
        print('[Tacotron] - Testing from list: ', args.synthesis_list)
        valid_arguments(valid_target=args.dataset, arg=args.synthesis_list)
        feeds = []
        with open(args.synthesis_list, 'r') as f:
            file = f.readlines()
            for line in file:
                line = line.split('\n')[0].split(' ')
                feeds.append({
                    's_id': line[0].split('/')[1].split('_')[0],
                    'utt_id': line[0].split('/')[1].split('_')[1],
                    't_id': line[1],
                })
        print('[Tester] - Number of files to be resynthesized: ', len(feeds))

        # encode each source utterance into symbols, then synthesize with Tacotron
        for feed in tqdm(feeds):
            if feed['t_id'] == args.eval_t:
                wav_path = os.path.join(args.testing_dir, feed['s_id'] + '_' + feed['utt_id'] + '.wav')
                _, spec = get_spectrograms(wav_path)
                encodings = encode(spec, trainer, hps.seg_len, save=False)
                encodings = parse_encodings(encodings)
                line = ''.join([multi2idx[encoding] for encoding in encodings])
                print(line)
                out_path = os.path.join(result_dir, feed['t_id'] + '_' + feed['utt_id'] + '.wav')
                synthesis_speech(model, text=line, path=out_path)
    else:
        wav_path = './data/english/train/voice/V002_0674932509.wav'
        # wav_path = './data/english/train/voice/V002_2252538703.wav'
        # wav_path = './data/english/train/voice/V002_1665800749.wav'
        _, spec = get_spectrograms(wav_path)
        encodings = encode(spec, trainer, hps.seg_len, save=False)
        write_encodings(path='./result/result.txt', encodings=encodings)  # save the encodings next to the synthesized wav
        parsed_encodings = parse_encodings(encodings)
        line = ''.join([multi2idx[encoding] for encoding in parsed_encodings])
        print(line)
        synthesis_speech(model, text=line, path='./result/result.wav')

    # model.decoder.max_decoder_steps = config.max_decoder_steps  # Set a large max_decoder_steps to handle long sentence outputs
    sys.exit(0)
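
# Entry point guard (assumed here; the original script may invoke main() elsewhere):
if __name__ == '__main__':
    main()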