def synthesis(text, args):
    """Decode a mel spectrogram for ``text`` frame by frame and save the results.

    The text is turned into its UTF-8 byte values framed by the ids 1 and 2
    (presumably start/end markers -- confirm against the training pipeline).
    Decoding stops after ``args.max_len`` steps or as soon as the last stop
    token is positive.

    Reads from ``args``: ``restore_path``, ``lang``, ``spk``, ``data_path``,
    ``max_len`` and ``out_path``.

    Writes ``<out_path>_mel.npy``, ``<out_path>.wav``, ``<out_path>_mel.png``
    and ``<out_path>_align.png``.

    Raises:
        ValueError: if ``args.max_len`` is smaller than 1, because no frame
            would be decoded and there would be nothing to save.
    """
    if args.max_len < 1:
        raise ValueError("args.max_len must be at least 1")

    m = Model()
    m.load_state_dict(load_checkpoint(args.restore_path))
    print("[%s][%s] Synthesizing:" % (args.lang, args.spk), text)

    text = np.asarray([1] + list(text.encode('utf-8')) + [2])
    text = t.LongTensor(text).unsqueeze(0)
    # Single all-zero frame used as the first decoder input.
    mel_input = t.zeros([1, 1, 80])
    # Position ids start at 1.
    pos_text = t.arange(1, text.size(1) + 1).unsqueeze(0)

    # Context managers make sure both lookup files are closed again.
    with open(os.path.join(args.data_path, 'lang_id.json')) as f:
        lang_to_id = json.load(f)
    with open(os.path.join(args.data_path, 'spk_id.json')) as f:
        spk_to_id = json.load(f)
    lang_id = t.LongTensor([lang_to_id[args.lang]])
    spk_id = t.LongTensor([spk_to_id[args.spk]])

    m.train(False)
    pbar = tqdm(range(args.max_len))
    with t.no_grad():
        for _ in pbar:
            pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0)
            mel_pred, postnet_pred, attn, stop_token, _, attn_dec = \
                m.forward(text, mel_input, pos_text, pos_mel, lang_id, spk_id)
            # Feed the newest predicted frame back in as the next input.
            mel_input = t.cat([mel_input, mel_pred[:, -1:, :]], dim=1)
            if stop_token[:, -1].item() > 0:
                break

    mel = postnet_pred.squeeze(0).cpu().numpy()
    wav = mel2wav(mel)
    np.save(args.out_path + "_mel.npy", mel)
    write(args.out_path + ".wav", hp.sr, wav)
    plot_mel(args.out_path + "_mel.png", mel)
    plot_attn(attn, args.out_path + '_align.png')
def synthesis(text, args):
    """Synthesize ``text`` with the transformer and post-net and write a wav.

    Both networks are restored from ``args.restore_step1`` /
    ``args.restore_step2``, moved to the GPU and switched out of training
    mode. The transformer is run ``args.max_len`` times; after every step the
    whole post-net prediction, prefixed by an all-zero frame, becomes the next
    decoder input. The result is written to ``hp.sample_path + "/test.wav"``.
    """
    transformer = Model()
    postnet = ModelPostNet()
    transformer.load_state_dict(
        load_checkpoint(args.restore_step1, "transformer"))
    postnet.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    token_ids = np.asarray(text_to_sequence(text, [hp.cleaners]))
    token_ids = t.LongTensor(token_ids).unsqueeze(0).cuda()
    decoder_in = t.zeros([1, 1, 80]).cuda()
    # Position ids start at 1.
    text_positions = t.arange(1, token_ids.size(1) + 1).unsqueeze(0).cuda()

    transformer = transformer.cuda()
    postnet = postnet.cuda()
    transformer.train(False)
    postnet.train(False)

    with t.no_grad():
        for _ in tqdm(range(args.max_len)):
            mel_positions = t.arange(1, decoder_in.size(1) + 1)
            mel_positions = mel_positions.unsqueeze(0).cuda()
            _, postnet_pred, _, _, _, _ = transformer.forward(
                token_ids, decoder_in, text_positions, mel_positions)
            go_frame = t.zeros([1, 1, 80]).cuda()
            decoder_in = t.cat([go_frame, postnet_pred], dim=1)

        mag_pred = postnet.forward(postnet_pred)

    magnitude = mag_pred.squeeze(0).cpu().numpy()
    wav = spectrogram2wav(magnitude)
    write(hp.sample_path + "/test.wav", hp.sr, wav)
def synthesis(text, num, max_len=1000, out_path="result.wav"):
    """Synthesize ``text`` with the transformer only and save the audio.

    Args:
        text: input sentence; converted with ``text_to_sequence`` using
            ``hp.cleaners``.
        num: checkpoint identifier forwarded to
            ``load_checkpoint(num, "transformer")``.
        max_len: number of decoding steps to run (default 1000, the value
            that used to be hard-coded).
        out_path: where the waveform is saved (default ``"result.wav"``, the
            previously hard-coded path).

    The last post-net prediction is plotted with ``plot_data``, inverted to a
    waveform with ``audio.inv_mel_spectrogram`` and trimmed at
    ``audio.find_endpoint`` before saving.

    Raises:
        ValueError: if ``max_len`` is smaller than 1 (nothing would be decoded).
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    m = Model()
    m.load_state_dict(load_checkpoint(num, "transformer"))

    text = np.asarray(text_to_sequence(text, [hp.cleaners]))
    text = t.LongTensor(text).unsqueeze(0).cuda()
    # Single all-zero frame used as the first decoder input.
    mel_input = t.zeros([1, 1, 80]).cuda()
    # Position ids start at 1.
    pos_text = t.arange(1, text.size(1) + 1).unsqueeze(0).cuda()

    m = m.cuda()
    m.train(False)

    with t.no_grad():
        for _ in range(max_len):
            pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0).cuda()
            mel_pred, postnet_pred, attn, stop_token, _, attn_dec = m.forward(
                text, mel_input, pos_text, pos_mel)
            # Append the newest post-net frame to the decoder input.
            mel_input = t.cat([mel_input, postnet_pred[:, -1:, :]], dim=1)

    mel_postnet = postnet_pred[0].cpu().numpy().T
    # The same spectrogram is passed twice, exactly as before.
    plot_data([mel_postnet, mel_postnet])
    wav = audio.inv_mel_spectrogram(mel_postnet)
    wav = wav[0:audio.find_endpoint(wav)]
    audio.save_wav(wav, out_path)
def synthesis(text, args):
    """Run the post-net on the mel stored in ``3_0.pt.npy`` and write a wav.

    NOTE(review): the transformer model, the encoded ``text`` and its position
    ids are all prepared here but never fed into anything; only the post-net
    output reaches ``hp.sample_path + "/test.wav"``. Confirm whether that
    setup is still wanted.
    """
    transformer = Model()
    postnet = ModelPostNet()
    transformer.load_state_dict(load_checkpoint(args.step1, "transformer"))
    postnet.load_state_dict(load_checkpoint(args.step2, "postnet"))

    sequence = np.asarray(text_to_sequence(text, [hp.cleaners]))
    sequence = torch.LongTensor(sequence).unsqueeze(0).cuda()
    stored_mel = np.load('3_0.pt.npy')
    # Position ids start at 1 (not used further down).
    positions = torch.arange(1, sequence.size(1) + 1).unsqueeze(0).cuda()

    transformer = transformer.cuda()
    postnet = postnet.cuda()
    transformer.train(False)
    postnet.train(False)

    with torch.no_grad():
        mel_batch = torch.from_numpy(stored_mel).unsqueeze(0).cuda()
        mag_pred = postnet.forward(mel_batch)

    magnitude = mag_pred.squeeze(0).cpu().numpy()
    wav = spectrogram2wav(magnitude)
    write(hp.sample_path + "/test.wav", hp.sr, wav)
def synthesis(text, args, num):
    """Synthesize sentence number ``num`` and save its audio and one alignment.

    The transformer (``args.restore_step1``) is run for ``args.max_len``
    steps, appending the newest post-net frame to the decoder input each
    time. The post-net (``args.restore_step2``) then turns the final
    prediction into a magnitude spectrogram. ``attn[0][1]`` is plotted to
    ``./training_loss/`` and the waveform is written below ``hp.sample_path``.
    """
    transformer = Model()
    postnet = ModelPostNet()
    transformer.load_state_dict(
        load_checkpoint(args.restore_step1, "transformer"))
    postnet.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    token_ids = np.asarray(text_to_sequence(text, [hp.cleaners]))
    token_ids = t.LongTensor(token_ids).unsqueeze(0).cuda()
    decoder_in = t.zeros([1, 1, 80]).cuda()
    # Position ids start at 1.
    text_positions = t.arange(1, token_ids.size(1) + 1).unsqueeze(0).cuda()

    transformer = transformer.cuda()
    postnet = postnet.cuda()
    transformer.train(False)
    postnet.train(False)

    with t.no_grad():
        for _ in tqdm(range(args.max_len)):
            mel_positions = t.arange(1, decoder_in.size(1) + 1)
            mel_positions = mel_positions.unsqueeze(0).cuda()
            _, postnet_pred, attn, _, _, _ = transformer.forward(
                token_ids, decoder_in, text_positions, mel_positions)
            newest_frame = postnet_pred[:, -1:, :]
            decoder_in = t.cat([decoder_in, newest_frame], dim=1)

        mag_pred = postnet.forward(postnet_pred)

    # Only this one attention map is plotted.
    alignment = attn[0][1].cpu()
    figure_path = f"./training_loss/{args.restore_step1!s}_S{num!s}.png"
    plot_alignment(alignment, path=figure_path, title=f"sentence{num!s}")

    magnitude = mag_pred.squeeze(0).cpu().detach().numpy()
    wav = spectrogram2wav(magnitude)
    wav_path = hp.sample_path + f"/{args.restore_step1!s}-test{num!s}.wav"
    write(wav_path, hp.sr, wav)
def ini_model_train(opt):
    """Train the initial model on the data from ``ini_model`` and checkpoint it.

    Trains for ``opt.ini_epoch`` epochs with plain SGD (``opt.ini_lr``) on
    consecutive slices of ``opt.ini_batch_size`` samples, printing the mean
    training loss and the test accuracy after every epoch. After the last
    epoch the checkpoint is saved once per device directory
    (``<opt.ini_model_path>/device<d>/ini.model.pth.tar``) and once more to
    ``opt.ini_model_path`` itself.

    Returns:
        ``(X_test, y_test, X_train_All, y_train_All)`` as produced by
        ``ini_model(opt)``.
    """
    X_ini, y_ini, X_test, y_test, X_train_All, y_train_All = ini_model(opt)
    mod = Model().to(device)
    optimizer = optim.SGD(mod.parameters(), lr=opt.ini_lr)
    criterion = nn.CrossEntropyLoss()
    num_batches_train = X_ini.shape[0] // opt.ini_batch_size

    for i in range(opt.ini_epoch):
        # The evaluation below switches to eval mode, so training mode has to
        # be restored at the start of every epoch, not just once.
        mod.train()
        loss = 0.0
        for j in range(num_batches_train):
            slce = get_slice(j, opt.ini_batch_size)
            X_tra = torch.from_numpy(X_ini[slce]).float().to(device)
            Y_tra = torch.from_numpy(y_ini[slce]).long().to(device)
            optimizer.zero_grad()
            out = mod(X_tra)
            batch_loss = criterion(out, Y_tra)
            batch_loss.backward()
            optimizer.step()
            # Accumulate a plain float so no autograd history is kept alive.
            loss += batch_loss.item()

        mod.eval()
        acc = test_without_dropout(X_test, y_test, mod, device)
        # Fewer samples than one batch means there is no loss to average.
        mean_loss = (loss / num_batches_train
                     if num_batches_train else float('nan'))
        print('\n[{}/{} epoch], training loss:{:.4f}, test accuracy is:{} \n'.
              format(i, opt.ini_epoch, mean_loss, acc))

        if i + 1 == opt.ini_epoch:
            checkpoint = {
                'epoch': i,
                'model_state_dict': mod.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss
            }
            for d in range(opt.num_dev):
                torch.save(
                    checkpoint,
                    os.path.join(opt.ini_model_path, 'device' + str(d),
                                 "ini.model.pth.tar"))
            # NOTE(review): opt.ini_model_path is used as a directory above
            # and as a file path here -- confirm that this is intended.
            torch.save(checkpoint, opt.ini_model_path)

    return X_test, y_test, X_train_All, y_train_All
def ini_train(X_ini, y_ini, X_te, y_te, epochs, paths, device, batch_size, lr,
              momentum, arr_drop):
    """Train a fresh ``Model(arr_drop)`` and save it under every path in ``paths``.

    Args:
        X_ini, y_ini: NumPy training inputs and labels, consumed in
            consecutive slices of ``batch_size`` samples.
        X_te, y_te: NumPy evaluation inputs and labels, scored after every
            epoch in a single forward pass.
        epochs: number of passes over the training data.
        paths: directories that each receive ``ini.model.pth.tar`` after the
            last epoch.
        device: torch device for the model and all tensors.
        batch_size, lr, momentum: SGD batch size, learning rate and momentum.
        arr_drop: forwarded unchanged to ``Model``.

    Returns:
        The trained model (left in eval mode by the final evaluation).
    """
    mod = Model(arr_drop).to(device)
    optimizer = optim.SGD(mod.parameters(), lr=lr, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    num_batches_train = X_ini.shape[0] // batch_size
    print("number of batch ", num_batches_train)

    for i in range(epochs):
        # The evaluation below switches to eval mode, so training mode has to
        # be restored at the start of every epoch, not just once.
        mod.train()
        loss = 0.0
        for j in range(num_batches_train):
            slce = get_slice(j, batch_size)
            X_tra = torch.from_numpy(X_ini[slce]).float().to(device)
            Y_tra = torch.from_numpy(y_ini[slce]).long().to(device)
            optimizer.zero_grad()
            out = mod(X_tra)
            batch_loss = criterion(out, Y_tra)
            batch_loss.backward()
            optimizer.step()
            # Accumulate a plain float so no autograd history is kept alive.
            loss += batch_loss.item()

        mod.eval()
        with torch.no_grad():
            X_va = torch.from_numpy(X_te).float().to(device)
            Y_va = torch.from_numpy(y_te).long().to(device)
            output = mod(X_va)
            preds = torch.max(output, 1)[1]
            # Hand host arrays to accuracy_score; tensors living on a GPU
            # cannot be converted to NumPy implicitly.
            acc = accuracy_score(Y_va.cpu().numpy(), preds.cpu().numpy())

        # Fewer samples than one batch means there is no loss to average.
        mean_loss = (loss / num_batches_train
                     if num_batches_train else float('nan'))
        print('\n[{}/{} epoch], training loss:{:.4f}, test accuracy is:{} \n'.
              format(i, epochs, mean_loss, acc))

        if i + 1 == epochs:
            checkpoint = {
                'epoch': i,
                'model_state_dict': mod.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss
            }
            for path in paths:
                torch.save(checkpoint,
                           os.path.join(path, "ini.model.pth.tar"))

    return mod
def model(dataset, model_name=None, device=None, train=True):
    """Build the network, optionally restore saved weights, and set its mode.

    Args:
        dataset: supplies ``vocab_size``, ``target_vocab_size`` and
            ``vector_weights`` for the constructor.
        model_name: file name of a saved state dict below
            ``FILE_PATH + config.model_path``; nothing is loaded when falsy.
        device: target device; CUDA when available, otherwise CPU, if omitted.
        train: put the network in training mode when truthy, eval otherwise.

    Returns:
        The configured ``Model`` instance.
    """
    if not device:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    hyper_params = dict(
        vocab_size=dataset.vocab_size,
        embedding_dim=config.embedding_dim,
        output_size=dataset.target_vocab_size,
        encoder_hidden_size=config.encoder_hidden_size,
        decoder_hidden_size=config.decoder_hidden_size,
        encoder_layers=config.encoder_layers,
        decoder_layers=config.decoder_layers,
        dropout=config.dropout,
        embedding_weights=dataset.vector_weights,
        device=device,
    )
    net = Model(**hyper_params)

    # When a model name is given, load the corresponding saved weights.
    if model_name:
        checkpoint_file = FILE_PATH + config.model_path + model_name
        saved_weights = torch.load(checkpoint_file, map_location=device)
        # Start from the fresh weights so entries missing from the file keep
        # their initial values.
        merged = net.state_dict()
        merged.update(saved_weights)
        net.load_state_dict(merged)

    if train:
        net.train()
    else:
        net.eval()
    return net
def synthesis(args):
    """Synthesize every test utterance, each paired with a shuffled reference.

    Restores the transformer, stop-token and post-net models, then walks the
    test set (batch size 1). For every item a reference item is drawn from a
    shuffled loader over the same CSV. Decoding runs for at most
    ``args.max_len`` steps and ends early once the sigmoid of the newest stop
    token exceeds 0.5. The waveform goes to ``<hp.sample_path>/wav`` and the
    attention maps are handed to ``writer.add_alignments``.
    """

    def _make_loader(shuffle):
        # One dataset + loader per call; both loaders read hp.test_data_csv.
        dataset = get_dataset(hp.test_data_csv)
        return DataLoader(dataset, batch_size=1, shuffle=shuffle,
                          collate_fn=collate_fn_transformer, drop_last=True,
                          num_workers=1)

    def _per_head(attn, n_layers):
        # Stack the first n_layers entries, split out hp.n_heads and move the
        # singleton axis to the front.
        stacked = t.cat([attn[k].unsqueeze(0) for k in range(n_layers)], 0)
        stacked = stacked.contiguous().view(
            stacked.size(0), 1, hp.n_heads, stacked.size(2), stacked.size(3))
        return stacked.permute(1, 0, 2, 3, 4)

    m = Model()
    m_post = ModelPostNet()
    m_stop = ModelStopToken()
    m.load_state_dict(load_checkpoint(args.restore_step1, "transformer"))
    m_stop.load_state_dict(load_checkpoint(args.restore_step3, "stop_token"))
    m_post.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    m, m_post, m_stop = m.cuda(), m_post.cuda(), m_stop.cuda()
    for net in (m, m_post, m_stop):
        net.train(False)

    test_dataloader = _make_loader(False)
    ref_dataloader = _make_loader(True)
    writer = get_writer(hp.checkpoint_path, hp.log_directory)
    ref_dataloader_iter = iter(ref_dataloader)

    for data in test_dataloader:
        (character, mel, mel_input, pos_text, pos_mel, text_length,
         mel_length, fname) = data
        (ref_character, ref_mel, ref_mel_input, ref_pos_text, ref_pos_mel,
         ref_text_length, ref_mel_length, ref_fname) = next(ref_dataloader_iter)

        # Computed but not used further down.
        stop_tokens = t.abs(pos_mel.ne(0).type(t.float) - 1)

        # Decoding starts from a single all-zero frame instead of the
        # loader's mel_input.
        mel_input = t.zeros([1, 1, 80]).cuda()
        stop = []

        character, mel = character.cuda(), mel.cuda()
        mel_input = mel_input.cuda()
        pos_text, pos_mel = pos_text.cuda(), pos_mel.cuda()
        ref_character, ref_mel, ref_mel_input, ref_pos_text, ref_pos_mel = (
            item.cuda() for item in (ref_character, ref_mel, ref_mel_input,
                                     ref_pos_text, ref_pos_mel))

        with t.no_grad():
            start = time.time()
            for step in range(args.max_len):
                pos_mel = t.arange(1, mel_input.size(1) + 1)
                pos_mel = pos_mel.unsqueeze(0).cuda()
                (mel_pred, postnet_pred, attn_probs, decoder_output,
                 attns_enc, attns_dec, attns_style) = m.forward(
                     character, mel_input, pos_text, pos_mel, ref_mel,
                     ref_pos_mel)
                stop_token = m_stop.forward(decoder_output)
                mel_input = t.cat([mel_input, postnet_pred[:, -1:, :]], dim=1)

                stop.append(t.sigmoid(stop_token).squeeze(-1)[0, -1])
                if stop[-1] > 0.5:
                    print("stop token at " + str(step) + " is :", stop[-1])
                    print("model inference time: ", time.time() - start)
                    break
                if stop[-1] == 0:
                    continue

            mag_pred = m_post.forward(postnet_pred)
            inf_time = time.time() - start
            print("inference time: ", inf_time)
            wav = spectrogram2wav(mag_pred.squeeze(0).cpu().numpy())

        print("rtx : ", (len(wav) / hp.sr) / inf_time)

        wav_path = os.path.join(hp.sample_path, 'wav')
        if not os.path.exists(wav_path):
            os.makedirs(wav_path)
        wav_name = "text_{}_ref_{}_synth.wav".format(fname, ref_fname)
        write(os.path.join(wav_path, wav_name), hp.sr, wav)
        print("written as text{}_ref_{}_synth.wav".format(fname, ref_fname))

        n_layers = len(attns_enc)
        attns_enc = _per_head(attns_enc, n_layers)
        attns_dec = _per_head(attns_dec, n_layers)
        attn_probs = _per_head(attn_probs, n_layers)
        attns_style = _per_head(attns_style, n_layers)

        save_dir = os.path.join(
            hp.sample_path, 'figure',
            "text_{}_ref_{}_synth.wav".format(fname, ref_fname))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        writer.add_alignments(attns_enc.detach().cpu(),
                              attns_dec.detach().cpu(),
                              attn_probs.detach().cpu(),
                              attns_style.detach().cpu(), mel_length,
                              text_length, args.restore_step1, 'Validation',
                              save_dir)
betas=(0.9, 0.999), eps=1e-08, weight_decay=0.001, amsgrad=False) # training # print(model.state_dict()) model = model.float() loss_train = [] loss_val = [] latent = np.empty((1, 16, 16, 128)) for epoch in range(num_epochs): # Train model.train() # Sum of losses from this epoch epoch_loss_train = 0 for i, data in enumerate(train_loader): # Zeros the gradients of all optimized torch.Tensors optimizer.zero_grad() # Load data to tensors img = data['image'] position_target = data['point_map'] img = img.to(device=device, dtype=torch.float32) position_target = position_target.to(device=device)
def synthesis(args):
    """Synthesize every test utterance in the style of one reference utterance.

    The first item of ``hp.test_data_csv_shuf`` supplies the reference mel for
    all test items. For each test item the transformer decodes up to 1200
    frames; once ``update_kv_mask`` raises its stop flag, ten more frames are
    decoded before the loop ends. The result is rescaled in time by
    ``args.rhythm_scale`` and rendered three ways: flow vocoder on the mel,
    flow vocoder on the mel derived from the linear spectrogram, and HiFi-GAN
    on that same derived mel.

    Outputs go to ``<hp.sample_path>_<rhythm_scale>/`` in the sub-directories
    ``mel``, ``linear``, ``wav``, ``wav_linear``, ``wav_hifi`` and ``figure``.
    """

    def _ensure_dir(*parts):
        # Create (if needed) and return an output directory for this scale.
        path = os.path.join(hp.sample_path + '_' + str(args.rhythm_scale),
                            *parts)
        os.makedirs(path, exist_ok=True)
        return path

    def _per_head(attn, n_layers):
        # Stack the first n_layers entries, split out hp.n_heads and move the
        # singleton axis to the front.
        stacked = t.cat([attn[k].unsqueeze(0) for k in range(n_layers)], 0)
        stacked = stacked.contiguous().view(
            stacked.size(0), 1, hp.n_heads, stacked.size(2), stacked.size(3))
        return stacked.permute(1, 0, 2, 3, 4)

    def _vocode(features):
        # Run the flow vocoder on `features` (frames on the last axis) and
        # return the zero-padded waveform plus the elapsed seconds.
        n_frames = features.shape[-1]
        z = t.randn(1, 1, n_frames * hp.hop_length).cuda()
        z = z * 0.6  # sampling temperature
        t.cuda.synchronize()
        started = time.time()
        with t.no_grad():
            y_gen = vocoder.reverse(z, features).squeeze()
        t.cuda.synchronize()
        elapsed = time.time() - started
        audio = y_gen.to(t.device("cpu")).data.numpy()
        # Append 4800 zero samples of trailing silence.
        audio = np.pad(audio, [0, 4800], mode='constant', constant_values=0)
        return audio, elapsed

    m = Model()
    m.load_state_dict(load_checkpoint(args.restore_step1, "transformer"))
    m = m.cuda()
    m.train(False)

    vocoder = SmartVocoder(Hyperparameters(parse_args()))
    vocoder.load_state_dict(
        t.load('./mel2audio/merged_STFT_checkpoint.pth')["state_dict"])
    vocoder = vocoder.cuda()
    vocoder.eval()

    with open('./hifi_gan/config.json') as f:
        json_config = json.load(f)
    h = AttrDict(json_config)
    hifi_gan = Generator(h).cuda()
    state_dict_g = t.load('./hifi_gan/g_00334000', map_location='cuda')
    hifi_gan.load_state_dict(state_dict_g['generator'])
    hifi_gan.eval()
    hifi_gan.remove_weight_norm()

    test_dataset = get_dataset(hp.test_data_csv)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=1,
                                 shuffle=False,
                                 collate_fn=collate_fn_transformer,
                                 drop_last=True,
                                 num_workers=1)
    ref_dataset = get_dataset(hp.test_data_csv_shuf)
    ref_dataloader = DataLoader(ref_dataset,
                                batch_size=1,
                                shuffle=False,
                                collate_fn=collate_fn_transformer,
                                drop_last=True,
                                num_workers=1)
    writer = get_writer(hp.checkpoint_path, hp.log_directory)

    # Keyword arguments work on both old and new librosa releases; newer
    # ones no longer accept these parameters positionally.
    mel_basis = t.from_numpy(
        librosa.filters.mel(sr=hp.sr, n_fft=hp.n_fft, n_mels=hp.n_mels,
                            fmin=50, fmax=11000)).unsqueeze(0)

    # A single reference utterance is reused for every test item.
    ref_dataloader_iter = iter(ref_dataloader)
    _, ref_mel, _, _, _, ref_pos_mel, _, _, ref_fname = next(
        ref_dataloader_iter)
    ref_mel = ref_mel.cuda()

    for i, data in enumerate(test_dataloader):
        character, _, _, _, pos_text, _, text_length, _, fname = data

        # Single all-zero frame used as the first decoder input.
        mel_input = t.zeros([1, 1, 80]).cuda()
        character = character.cuda()
        pos_text = pos_text.cuda()

        with t.no_grad():
            start = time.time()
            memory, c_mask, attns_enc, duration_mask = m.encoder(
                character, pos=pos_text)
            style, coarse_emb = m.ref_encoder(ref_mel)
            memory = t.cat(
                (memory, coarse_emb.expand(-1, memory.size(1), -1)), -1)
            memory = m.memory_coarse_layer(memory)

            duration_predictor_output = m.duration_predictor(
                memory, duration_mask)
            duration = t.ceil(duration_predictor_output)
            duration = duration * duration_mask
            monotonic_interpolation, pos_mel_, weights = m.length_regulator(
                memory, duration, duration_mask)

            # Mask is True everywhere except the first three text positions.
            kv_mask = t.zeros(
                [1, mel_input.size(1), character.size(1)]).cuda()  # B, t', N
            kv_mask[:, :, :3] = 1
            kv_mask = kv_mask.eq(0)

            stop_flag = False
            ctr = 0
            for _ in range(1200):
                pos_mel = t.arange(1, mel_input.size(1) + 1)
                pos_mel = pos_mel.unsqueeze(0).cuda()
                (mel_pred, postnet_pred, attn_probs, decoder_output,
                 attns_dec, attns_style) = m.decoder(
                     memory,
                     style,
                     mel_input,
                     c_mask,
                     pos=pos_mel,
                     ref_pos=ref_pos_mel,
                     mono_inter=monotonic_interpolation[:, :mel_input.shape[1]],
                     kv_mask=kv_mask)
                mel_input = t.cat([mel_input, postnet_pred[:, -1:, :]], dim=1)

                # After the stop flag is raised, decode ten more frames.
                if stop_flag and ctr == 10:
                    break
                elif stop_flag:
                    ctr += 1
                kv_mask, stop_flag = update_kv_mask(
                    kv_mask, attn_probs)  # B, t', N --> B, t'+1, N

            # Append five zero frames before running the post-net.
            postnet_pred = t.cat(
                (postnet_pred,
                 t.zeros(postnet_pred.size(0), 5,
                         postnet_pred.size(-1)).cuda()), 1)
            gen_length = mel_input.size(1)
            post_linear = m.postnet(postnet_pred)
            post_linear = resample(post_linear,
                                   seq_len=mel_input.size(1),
                                   scale=args.rhythm_scale)
            postnet_pred = resample(mel_input,
                                    seq_len=mel_input.size(1),
                                    scale=args.rhythm_scale)
            inf_time = time.time() - start
            print("inference time: ", inf_time)

        # Flow vocoder on the predicted mel.
        postnet_pred_v = postnet_pred.transpose(2, 1)
        postnet_pred_v = (postnet_pred_v * 100 + 20 - 100) / 20
        wav, vocoder_seconds = _vocode(postnet_pred_v)
        print('{} seconds'.format(vocoder_seconds))

        # Project the linear spectrogram onto the mel basis. A local copy is
        # used so the shared basis is not repeated again on every item.
        post_linear_v = post_linear.transpose(1, 2)
        post_linear_v = 10**((post_linear_v * 100 + 20 - 100) / 20)
        batch_basis = mel_basis.repeat(post_linear_v.shape[0], 1, 1)
        post_linear_mel_v = t.log10(t.bmm(batch_basis.cuda(), post_linear_v))
        wav_linear, _ = _vocode(post_linear_mel_v)

        # HiFi-GAN on the same derived mel, with the same trailing silence.
        wav_hifi = hifi_gan(post_linear_mel_v).squeeze().clamp(
            -1, 1).detach().cpu().numpy()
        wav_hifi = np.pad(wav_hifi, [0, 4800],
                          mode='constant',
                          constant_values=0)

        stem = 'text_{}_ref_{}_synth_{}'.format(i, ref_fname,
                                                str(args.rhythm_scale))
        np.save(os.path.join(_ensure_dir('mel'), stem + '.mel'),
                postnet_pred.cpu())
        np.save(os.path.join(_ensure_dir('linear'), stem + '.linear'),
                post_linear.cpu())
        write(os.path.join(_ensure_dir('wav'), stem + '.wav'), hp.sr, wav)
        print("rtx : ", (len(wav) / hp.sr) / inf_time)
        write(os.path.join(_ensure_dir('wav_linear'), stem + '.wav'), hp.sr,
              wav_linear)
        write(os.path.join(_ensure_dir('wav_hifi'), stem + '.wav'), hp.sr,
              wav_hifi)

        show_weights = weights.contiguous().view(weights.size(0), 1, 1,
                                                 weights.size(1),
                                                 weights.size(2))
        n_layers = len(attns_enc)
        attns_enc = _per_head(attns_enc, n_layers)
        attns_dec = _per_head(attns_dec, n_layers)
        attn_probs = _per_head(attn_probs, n_layers)
        attns_style = _per_head(attns_style, n_layers)

        # The figure directory is named after the dataset file name, not the
        # loop index used for the audio files.
        save_dir = _ensure_dir(
            'figure',
            "text_{}_ref_{}_synth_{}.wav".format(fname, ref_fname,
                                                 str(args.rhythm_scale)))
        writer.add_alignments(attns_enc.detach().cpu(),
                              attns_dec.detach().cpu(),
                              attn_probs.detach().cpu(),
                              attns_style.detach().cpu(),
                              show_weights.detach().cpu(),
                              [t.tensor(gen_length).type(t.LongTensor)],
                              text_length, args.restore_step1, 'Inference',
                              save_dir)
def random_run(acquisition_iterations, X_Pool, y_Pool, pool_subset,
               dropout_iterations, nb_classes, Queries, X_test, y_test, rep,
               X_old, y_old, device, itr, cuda, g):
    """Run the random-acquisition baseline starting from checkpoint ``rep``.

    In each of ``acquisition_iterations`` rounds a random subset of
    ``pool_subset`` pool points is drawn, ``Queries`` of them are picked at
    random and removed from the pool, and the model is trained for ``itr``
    passes on ``X_old`` plus the newly picked points. Test accuracy is
    recorded before the first round and after every round. The final model
    and optimizer state are saved to ``g``.

    ``dropout_iterations`` and ``nb_classes`` are accepted but not used here.

    Returns:
        ``(AA, mod, X_train, y_train, losses_train, optimizer)`` where ``AA``
        is the list of accuracies, ``X_train`` / ``y_train`` hold all points
        taken from the pool, and ``losses_train`` has one mean loss per
        training pass.
    """
    mod = Model().to(device)
    if cuda:
        cp = torch.load(rep)
        print("\n ********load gpu version******* \n")
    else:
        cp = torch.load(rep, map_location='cpu')
    mod.load_state_dict(cp['model_state_dict'])
    optimizer = optim.Adam(mod.parameters(), lr=0.001, weight_decay=0.5)
    optimizer.load_state_dict(cp['optimizer_state_dict'])
    criterion = nn.CrossEntropyLoss()

    X_train = np.empty([0, 1, 28, 28])
    y_train = np.empty([
        0,
    ])
    AA = []
    losses_train = []

    acc = test(X_test, y_test, mod, device, cuda)
    AA.append(acc)
    print('initial test accuracy: ', acc)

    for i in range(acquisition_iterations):
        # Draw a random candidate subset, then pick the queries from it.
        pool_subset_dropout = np.asarray(
            random.sample(range(0, X_Pool.shape[0]), pool_subset))
        X_Pool_Dropout = X_Pool[pool_subset_dropout, :, :, :]
        y_Pool_Dropout = y_Pool[pool_subset_dropout]
        x_pool_index = np.random.choice(X_Pool_Dropout.shape[0],
                                        Queries,
                                        replace=False)
        Pooled_X = X_Pool_Dropout[x_pool_index, :, :, :]
        Pooled_Y = y_Pool_Dropout[x_pool_index]

        # Rebuild the pool: everything outside the subset plus the subset
        # members that were not queried.
        delete_Pool_X = np.delete(X_Pool, (pool_subset_dropout), axis=0)
        delete_Pool_Y = np.delete(y_Pool, (pool_subset_dropout), axis=0)
        delete_Pool_X_Dropout = np.delete(X_Pool_Dropout, (x_pool_index),
                                          axis=0)
        delete_Pool_Y_Dropout = np.delete(y_Pool_Dropout, (x_pool_index),
                                          axis=0)
        X_Pool = np.concatenate((delete_Pool_X, delete_Pool_X_Dropout),
                                axis=0)
        y_Pool = np.concatenate((delete_Pool_Y, delete_Pool_Y_Dropout),
                                axis=0)
        print('updated pool size is ', X_Pool.shape[0])

        X_train = np.concatenate((X_train, Pooled_X), axis=0)
        y_train = np.concatenate((y_train, Pooled_Y), axis=0)
        print('number of data points from pool', X_train.shape[0])

        batch_size = 100
        X = np.vstack((X_old, Pooled_X))
        y = np.hstack((y_old, Pooled_Y))
        X, y = shuffle(X, y)
        num_batch = X.shape[0] // batch_size
        print("number of batch: ", num_batch)

        mod.train()
        for h in range(itr):
            losses = 0.0
            for j in range(num_batch):
                slce = get_slice(j, batch_size)
                X_fog_ = torch.from_numpy(X[slce]).float().to(device)
                y_fog_ = torch.from_numpy(y[slce]).long().to(device)
                optimizer.zero_grad()
                out = mod(X_fog_)
                train_loss = criterion(out, y_fog_)
                train_loss.backward()
                optimizer.step()
                # Accumulate a plain float so no autograd history is kept
                # alive across the pass.
                losses += train_loss.item()
            # Fewer samples than one batch means there is no loss to average.
            losses_train.append(
                losses / num_batch if num_batch else float('nan'))

        acc = test(X_test, y_test, mod, device, cuda)
        print('test accuracy: ', acc)
        AA.append(acc)

    torch.save(
        {
            'model_state_dict': mod.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
        }, g)
    return AA, mod, X_train, y_train, losses_train, optimizer
def main(args: argparse.Namespace):
    """Train and evaluate the classifier, log metrics, and save a checkpoint.

    Reads the train/val post metadata and the three label files named in
    ``args``, builds the word dictionary from the training captions, and
    creates data loaders for ``args.type`` (``image_only``, ``image_text`` or
    ``text_only``). Every epoch runs one training pass and one evaluation
    pass; AUC (one-vs-rest and one-vs-one) and accuracy are printed and, when
    ``args.tensorboard`` is set, logged. The final model, optimizer and
    scheduler states are written to ``<args.output_dir>/<args.name>.pt``.

    Raises:
        ValueError: if ``args.type`` or ``args.classification`` is not one of
            the supported values.
    """

    def _load_label_index(path):
        # Map every label in the JSON file to the size of the mapping at the
        # time it is inserted.
        with open(path, 'r') as f:
            names = json.load(f)
        index = {}
        for name in names:
            index[name] = len(index)
        return index

    # Load input data
    with open(args.train_metadata, 'r') as f:
        train_posts = json.load(f)
    with open(args.val_metadata, 'r') as f:
        val_posts = json.load(f)

    # Load labels
    labels = {
        'intent': _load_label_index(args.label_intent),
        'semiotic': _load_label_index(args.label_semiotic),
        'contextual': _load_label_index(args.label_contextual),
    }

    # Build dictionary from training set
    train_captions = [post['orig_caption'] for post in train_posts]
    dictionary = Dictionary(tokenizer_method="TreebankWordTokenizer")
    dictionary.build_dictionary_from_captions(train_captions)

    # Set up torch device
    if 'cuda' in args.device and torch.cuda.is_available():
        device = torch.device(args.device)
        kwargs = {'pin_memory': True}
    else:
        device = torch.device('cpu')
        kwargs = {}

    # Set up number of workers
    num_workers = min(multiprocessing.cpu_count(), args.num_workers)

    # Set up datasets differently based on the task
    # TODO: Extend to ELMo + word2vec etc.
    if args.type == 'image_only':
        train_dataset = ImageOnlyDataset(train_posts, labels)
        val_dataset = ImageOnlyDataset(val_posts, labels)
        collate_fn = collate_fn_pad_image_only
    elif args.type == 'image_text':
        train_dataset = ImageTextDataset(train_posts, labels, dictionary)
        val_dataset = ImageTextDataset(val_posts, labels, dictionary)
        collate_fn = collate_fn_pad_image_text
    elif args.type == 'text_only':
        train_dataset = TextOnlyDataset(train_posts, labels, dictionary)
        val_dataset = TextOnlyDataset(val_posts, labels, dictionary)
        collate_fn = collate_fn_pad_text_only
    else:
        # Fail here rather than with a NameError once the loaders are used.
        raise ValueError("args.type doesn't exist.")

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=args.shuffle,
        num_workers=num_workers,
        collate_fn=collate_fn,
        **kwargs)
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=1,
                                                  num_workers=num_workers,
                                                  collate_fn=collate_fn,
                                                  **kwargs)

    # Set up the model
    model = Model(vocab_size=dictionary.size()).to(device)

    # Set up an optimizer and a step-wise learning-rate decay
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=args.lr_scheduler_step_size,
        gamma=args.lr_scheduler_gamma)

    # Set up loss function
    loss_fn = torch.nn.CrossEntropyLoss()

    # Setup tensorboard
    if args.tensorboard:
        writer = tensorboard.SummaryWriter(log_dir=args.log_dir + "/" +
                                           args.name,
                                           flush_secs=1)
    else:
        writer = None

    # Which label sets are trained and evaluated
    keys_by_task = {
        'intent': ['intent'],
        'semiotic': ['semiotic'],
        'contextual': ['contextual'],
        'all': ['intent', 'semiotic', 'contextual'],
    }
    if args.classification not in keys_by_task:
        raise ValueError("args.classification doesn't exist.")
    keys = keys_by_task[args.classification]

    best_auc_ovr = 0.0
    best_auc_ovo = 0.0
    best_acc = 0.0

    # Training loop
    for epoch in range(args.epochs):
        for mode in ["train", "eval"]:
            # Set up a progress bar
            if mode == "train":
                pbar = tqdm.tqdm(enumerate(train_data_loader),
                                 total=len(train_data_loader))
                model.train()
            else:
                pbar = tqdm.tqdm(enumerate(val_data_loader),
                                 total=len(val_data_loader))
                model.eval()

            total_loss = 0
            # The builtin int replaces the np.int alias, which current NumPy
            # releases no longer provide.
            label = dict.fromkeys(keys, np.array([], dtype=int))
            pred = dict.fromkeys(keys, None)

            for _, batch in pbar:
                if 'caption' not in batch:
                    caption_data = None
                else:
                    caption_data = batch['caption'].to(device)
                if 'image' not in batch:
                    image_data = None
                else:
                    image_data = batch['image'].to(device)
                label_batch = {}
                for key in keys:
                    label_batch[key] = batch['label'][key].to(device)

                if mode == "train":
                    model.zero_grad()

                pred_batch = model(image_data, caption_data)

                for key in keys:
                    label[key] = np.concatenate(
                        (label[key], batch['label'][key].cpu().numpy()))
                    # Softmax with the row maximum subtracted first.
                    x = pred_batch[key].detach().cpu().numpy()
                    x_max = np.max(x, axis=1).reshape(-1, 1)
                    z = np.exp(x - x_max)
                    prediction_scores = z / np.sum(z, axis=1).reshape(-1, 1)
                    if pred[key] is not None:
                        pred[key] = np.vstack((pred[key], prediction_scores))
                    else:
                        pred[key] = prediction_scores

                # Total loss is the sum of the per-key losses.
                loss_batch = {}
                loss = None
                for key in keys:
                    loss_batch[key] = loss_fn(pred_batch[key],
                                              label_batch[key])
                    if loss is None:
                        loss = loss_batch[key]
                    else:
                        # Out-of-place add keeps loss_batch[key] untouched.
                        loss = loss + loss_batch[key]

                total_loss += loss.item()

                if mode == "train":
                    loss.backward()
                    optimizer.step()

            # Terminate the progress bar
            pbar.close()

            # Update lr scheduler
            if mode == "train":
                scheduler.step()

            for key in keys:
                auc_score_ovr = roc_auc_score(
                    label[key], pred[key],
                    multi_class='ovr')  # pylint: disable-all
                auc_score_ovo = roc_auc_score(
                    label[key], pred[key],
                    multi_class='ovo')  # pylint: disable-all
                accuracy = accuracy_score(label[key],
                                          np.argmax(pred[key], axis=1))
                print("[{} - {}] [AUC-OVR={:.3f}, AUC-OVO={:.3f}, ACC={:.3f}]".
                      format(mode, key, auc_score_ovr, auc_score_ovo,
                             accuracy))

                if mode == "eval":
                    best_auc_ovr = max(best_auc_ovr, auc_score_ovr)
                    best_auc_ovo = max(best_auc_ovo, auc_score_ovo)
                    best_acc = max(best_acc, accuracy)

                if writer:
                    writer.add_scalar('AUC-OVR/{}-{}'.format(mode, key),
                                      auc_score_ovr, epoch)
                    writer.add_scalar('AUC-OVO/{}-{}'.format(mode, key),
                                      auc_score_ovo, epoch)
                    writer.add_scalar('ACC/{}-{}'.format(mode, key), accuracy,
                                      epoch)
                    writer.flush()

            if writer:
                writer.add_scalar('Loss/{}'.format(mode), total_loss, epoch)
                writer.flush()

            print("[{}] Epoch {}: Loss = {}".format(mode, epoch, total_loss))

    hparam_dict = {
        'train_split': args.train_metadata,
        'val_split': args.val_metadata,
        'lr': args.lr,
        'epochs': args.epochs,
        'batch_size': args.batch_size,
        'num_workers': args.num_workers,
        'shuffle': args.shuffle,
        'lr_scheduler_gamma': args.lr_scheduler_gamma,
        'lr_scheduler_step_size': args.lr_scheduler_step_size,
    }

    metric_dict = {
        'AUC-OVR': best_auc_ovr,
        'AUC-OVO': best_auc_ovo,
        'ACC': best_acc
    }

    if writer:
        writer.add_hparams(hparam_dict=hparam_dict, metric_dict=metric_dict)
        writer.flush()

    # parents=True lets a nested output directory be created in one go.
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    # The saved weights are those of the last epoch; the metrics above are
    # the best values seen across all evaluation passes.
    torch.save(
        {
            'hparam_dict': hparam_dict,
            'metric_dict': metric_dict,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
        },
        Path(args.output_dir) / '{}.pt'.format(args.name))