def init_model():
    """Load the Mellotron synthesis stack on CPU.

    Builds the tacotron acoustic model, the WaveGlow vocoder and its
    denoiser, restores their checkpoints, and puts everything in eval mode.

    Returns:
        tuple: ``(tacotron, waveglow, denoiser)``, all on CPU in eval mode.
    """
    hparams = create_hparams()

    # Tacotron: build, then restore weights from the LibriTTS checkpoint.
    tacotron_ckpt = "checkpoints/mellotron_libritts.pt"
    tacotron = load_model(hparams).cpu().eval()
    state = torch.load(tacotron_ckpt, map_location=torch.device('cpu'))
    tacotron.load_state_dict(state['state_dict'])

    # WaveGlow checkpoint stores the whole model object under 'model'.
    waveglow_ckpt = 'checkpoints/waveglow_256channels_v4.pt'
    bundle = torch.load(waveglow_ckpt, map_location=torch.device('cpu'))
    waveglow = bundle['model'].cpu().eval()
    denoiser = Denoiser(waveglow).cpu().eval()

    return (tacotron, waveglow, denoiser)
def run_test(model_dir, data_dir, mode, config_path='345M/', beam_width=10):
    """Build a GPT-2 model from a checkpoint and, in 'test' mode, load the test set.

    Args:
        model_dir: directory holding 'GPT_model.pkl'; also receives the log file.
        data_dir: path to the test data file.
        mode: when 'test', a GPT2DataLoader over data_dir is created.
        config_path: model-size subdirectory under ./configs/ (e.g. '345M/').
        beam_width: beam size for decoding (not used in the visible portion).
    """
    # BUG FIX: the original reassigned config_path to '<size>/config.json'
    # *before* deriving vocab_path/merge_path from it, producing broken paths
    # like '345M/config.jsonvocab.json'. Derive all three from the size
    # directory instead, and (consistent with the other scripts in this file)
    # resolve vocab/merges under ./configs/.
    config_dir = config_path
    config_path = config_dir + 'config.json'
    vocab_path = os.path.join('./configs/', config_dir + 'vocab.json')
    merge_path = os.path.join('./configs/', config_dir + 'merges.txt')
    checkpoint_path = model_dir + '/GPT_model.pkl'
    log_filename = model_dir + '/test_data.log'
    config = GPT2Config.from_json_file(os.path.join('./configs/', config_path))
    create_log(log_filename)
    print("Building model")
    model = load_model(GPT2LMHeadModel(config), checkpoint_path, test=True).cuda()
    model.eval()
    tokenizer = GPT2Tokenizer(vocab_path, merge_path)
    if mode == 'test':
        print('Loading test dataset...')
        test_data_loader = GPT2DataLoader(data_path=data_dir,
                                          vocab_file=vocab_path,
                                          bpe_merges=merge_path,
                                          bucket=2,
                                          batch_size=1,
                                          max_seq_len=512)
from data_loader import GPT2DataLoader
from train import run
import os
import torch

# NOTE(review): GPT2Config, GPT2LMHeadModel and load_model are used below but
# are not imported in this chunk — confirm they are imported elsewhere.
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Map each supported model size to its config file under ./configs/.
    size_to_config = {
        'small': '117M/config.json',
        'middle': '345M/config.json',
        'big': '762M/config.json',
    }
    model_size = 'small'
    if model_size in size_to_config:
        config_path = size_to_config[model_size]

    config = GPT2Config.from_json_file(os.path.join('./configs/', config_path))
    model = load_model(GPT2LMHeadModel(config), "checkpoints/small_fs.pkl")
    model = model.to(device)

    # Train and validation loaders share every setting except the data path.
    def _make_loader(data_path):
        return GPT2DataLoader(data_path=data_path,
                              vocab_file='./vocab_file/encoder.json',
                              bpe_merges='vocab_file/merges.txt',
                              bucket=2,
                              batch_size=5,
                              max_seq_len=512)

    train_data_loader = _make_loader('DailyDialog/train_text.txt')
    valid_data_loader = _make_loader('DailyDialog/test_text.txt')
    # NOTE(review): fragment — `return logits` is the tail of a function whose
    # `def` line lies outside the visible chunk; indentation reconstructed.
    return logits


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Pick the GPT-2 config file for the chosen model size.
    model_size = 'middle'
    if model_size == 'small':
        config_path = '117M/config.json'
    elif model_size == 'middle':
        config_path = '345M/config.json'
    elif model_size == 'big':
        config_path = '762M/config.json'
    config = GPT2Config.from_json_file(os.path.join('./configs/', config_path))
    print(config)
    # Alternate checkpoints kept for reference:
    # 'Cornell_models/GPT_Cornell_models.pkl'  # "checkpoints/medium_fs.pkl"
    checkpoint_path = "checkpoints/medium_ft.pkl"
    model = load_model(GPT2LMHeadModel(config), checkpoint_path, test=False)
    model = model.to(device)
    # train_data_loader = GPT2DataLoader(data_path='DailyDialog/train_text.txt',
    #                                    vocab_file='./vocab_file/encoder.json',
    #                                    bpe_merges='vocab_file/merges.txt',
    #                                    bucket=2,
    #                                    batch_size=5,
    #                                    max_seq_len=512)
    vocab_file = './configs/345M/vocab.json'
    bpe_merges = './configs/345M/merges.txt'
    # valid_data_loader = GPT2DataLoader(data_path='DailyDialog/test_text.txt',
    #                                    vocab_file=vocab_file,
    #                                    bpe_merges=bpe_merges,
    #                                    bucket=2,
    #                                    batch_size=1,
def agumentation(arpabet_dict, audio_paths, target_spk_id_list, output_path, ljs=False):
    """Mellotron speaker-transfer augmentation.

    For every utterance in ``audio_paths``, extracts text/pitch/rhythm and
    re-synthesizes the utterance once per speaker id in ``target_spk_id_list``,
    writing ``<file_idx>-<spk_id>.wav`` files plus a ``source.scp`` index into
    ``output_path``.

    Args:
        arpabet_dict: path to a CMU pronunciation dictionary file.
        audio_paths: manifest consumed by TextMelLoader (audio path, text, sid).
        target_spk_id_list: iterable of integer target speaker ids.
        output_path: output directory (created if missing).
        ljs: if True use the LJ Speech checkpoint, else LibriTTS.
    """
    # NOTE(review): `hparams` is read here but never defined in this function —
    # presumably a module-level global; confirm.
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Step1: Basic Setups
    if not ljs:  # whether to use the LJ Speech checkpoint
        checkpoint_path = "mellotron_libritts.pt"
    else:
        # BUG FIX: was misspelled 'checkpoit_path', which left checkpoint_path
        # undefined (NameError) whenever ljs=True.
        checkpoint_path = "mellotron_ljs.pt"
    if torch.cuda.is_available():
        tacotron = load_model(hparams).cuda().eval()
    else:
        tacotron = load_model(hparams).eval()
    tacotron.load_state_dict(
        torch.load(checkpoint_path, map_location="cpu")['state_dict'])

    waveglow_path = 'waveglow_256channels_v4.pt'
    if torch.cuda.is_available():
        waveglow = torch.load(waveglow_path)['model'].cuda().eval()
        denoiser = Denoiser(waveglow).cuda().eval()
    else:
        waveglow = torch.load(waveglow_path, map_location="cpu")['model'].eval().cpu()
        denoiser = Denoiser(waveglow).eval()

    arpabet_dict = cmudict.CMUDict(arpabet_dict)
    dataloader = TextMelLoader(audio_paths, hparams)
    datacollate = TextMelCollate(1)

    # Step2: Load
    # BUG FIX: the original re-opened source.scp in "w" mode inside the loop,
    # truncating it on every iteration (only the last entry survived) and
    # leaking file handles. Open it once; the context manager closes it.
    with open(os.path.join(output_path, "source.scp"), "w", encoding="utf-8") as source_scp:
        for file_idx in range(len(dataloader)):
            audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]
            source_scp.write("{} {}\n".format(file_idx, audio_path))

            # get audio path, encoded text, pitch contour and mel for gst
            text_encoded = torch.LongTensor(
                text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :]
            pitch_contour = dataloader[file_idx][3][None]
            if torch.cuda.is_available():
                text_encoded = text_encoded.cuda()
                pitch_contour = pitch_contour.cuda()
            mel = load_mel(audio_path)

            # load source data to obtain rhythm using tacotron 2 as a forced aligner
            x, y = tacotron.parse_batch(datacollate([dataloader[file_idx]]))

            # Step3: Perform speaker transfer
            with torch.no_grad():
                # get rhythm (alignment map) using tacotron 2
                mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = tacotron.forward(x)
                rhythm = rhythm.permute(1, 0, 2)

            for spk_id in target_spk_id_list:
                speaker_id = torch.LongTensor([spk_id])
                if torch.cuda.is_available():
                    speaker_id = speaker_id.cuda()
                with torch.no_grad():
                    mel_outputs, mel_outputs_postnet, gate_outputs, _ = tacotron.inference_noattention(
                        (text_encoded, mel, speaker_id, pitch_contour * 0.4, rhythm))
                with torch.no_grad():
                    audio = denoiser(
                        waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]
                sf.write(
                    os.path.join(output_path, "{}-{}.wav".format(file_idx, spk_id)),
                    audio.detach().cpu().numpy().T,
                    hparams.sampling_rate)
# Select the GPT-2 model size and the matching config/vocab/merges files.
model_size = 'medium'
if model_size == 'small':
    config_path = '117M/config.json'
    vocab_path = '117M/vocab.json'
    merges_path = '117M/merges.txt'
elif model_size == 'medium':
    config_path = '345M/config.json'
    vocab_path = '345M/vocab.json'
    merges_path = '345M/merges.txt'
elif model_size == 'large':
    config_path = '762M/config.json'
    vocab_path = '762M/vocab.json'
    merges_path = '762M/merges.txt'
config = GPT2Config.from_json_file(os.path.join('./configs/', config_path))
model = load_model(GPT2LMHeadModel(config), "checkpoints/medium_ft.pkl")
# Wrap the model in DataParallel across all visible GPUs.
device = range(torch.cuda.device_count())
model = torch.nn.DataParallel(model, device_ids=device).cuda()
vocab_file = os.path.join('./configs/', vocab_path)
bpe_merges = os.path.join('./configs/', merges_path)
train_data_loader = GPT2DataLoader(
    data_path='Data/Cornell_movie_dialogs/dd_train.txt',
    vocab_file=vocab_file,
    bpe_merges=bpe_merges,
    bucket=2,
    batch_size=2,
    max_seq_len=512)
# NOTE(review): this chunk is cut off here — the call below continues past the
# visible source.
valid_data_loader = GPT2DataLoader(
def main(argv):
    """Two-phase TF1 training loop.

    Trains with the 'fix' optimizer (interleaving 'add'-optimizer epochs every
    FLAGS.iteration epochs) up to FLAGS.stop_point, then switches to the
    'stop' optimizer for the remaining epochs, validating both the original
    ('fix') and proposed ('add') heads after each epoch and plotting the
    accuracy curves at the end.
    """
    del argv  # unused (absl passes positional args)
    save, logdir, figname, logHandler = utils.configuration(FLAGS)
    train_ds, test_ds, placeholder = get_dataset(FLAGS)
    loss, correct_prediction, var_list = utils.load_model(FLAGS, placeholder)
    # NOTE(review): both iterators stay None in the visible code; the utils
    # helpers are presumably expected to handle that — confirm.
    train_iterator = None
    test_iterator = None
    fix_opt, add_opt, stop_opt = utils.make_optimizer(placeholder, loss, var_list)
    fix_accuracy, add_accuracy = correct_prediction
    save_dir, save_file = save
    var_all, var_m1, _ = var_list
    epoch_list, original, proposed = [], [], []
    with tf.Session() as sess:
        with tf.device('/cpu:0'):
            merged_summary = tf.summary.merge_all()
            writer = tf.summary.FileWriter(logdir)
            writer.add_graph(sess.graph)
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver(var_all)
            print('Learning started. It takes sometimes...')
            print()
            for i in range(1, FLAGS.epochs + 1):
                logHandler.print_epoch()
                # At the switch-over epoch, restore the var_m1 subset from the
                # latest checkpoint before proposed training begins.
                if i == (FLAGS.stop_point + 1):
                    logHandler._print('Proposed training...')
                    loader = tf.train.Saver(var_m1)
                    loader.restore(sess, tf.train.latest_checkpoint(save_dir))
                if i <= FLAGS.stop_point:
                    if i % FLAGS.iteration == 0:
                        # Every FLAGS.iteration-th epoch: restore all vars and
                        # run one epoch with the 'add' optimizer.
                        loader = tf.train.Saver(var_all)
                        loader.restore(sess, tf.train.latest_checkpoint(save_dir))
                        utils.fit_model(sess, add_opt, placeholder, train_iterator,
                                        train_ds, i, FLAGS, logHandler,
                                        merged_summary, writer)
                        origin_test_accuracy = utils.test_validate(
                            sess, fix_accuracy, test_iterator, placeholder,
                            test_ds, FLAGS, logHandler)
                        proposed_test_accuracy = utils.test_validate(
                            sess, add_accuracy, test_iterator, placeholder,
                            test_ds, FLAGS, logHandler)
                    else:
                        # Regular epoch: 'fix' optimizer, then checkpoint.
                        utils.fit_model(sess, fix_opt, placeholder, train_iterator,
                                        train_ds, i, FLAGS, logHandler,
                                        merged_summary, writer)
                        utils.train_validate(sess, fix_accuracy, train_iterator,
                                             placeholder, train_ds, FLAGS, logHandler)
                        origin_test_accuracy = utils.test_validate(
                            sess, fix_accuracy, test_iterator, placeholder,
                            test_ds, FLAGS, logHandler)
                        proposed_test_accuracy = utils.test_validate(
                            sess, add_accuracy, test_iterator, placeholder,
                            test_ds, FLAGS, logHandler)
                        saver.save(sess, save_file)
                else:
                    # Past the stop point: proposed training with 'stop' optimizer.
                    # loader = tf.train.Saver(var_m1)
                    # loader.restore(sess, tf.train.latest_checkpoint(save_dir))
                    utils.fit_model(sess, stop_opt, placeholder, train_iterator,
                                    train_ds, i, FLAGS, logHandler,
                                    merged_summary, writer)
                    if train_iterator is not None:
                        sess.run(train_iterator.initializer)
                    utils.train_validate(sess, add_accuracy, train_iterator,
                                         placeholder, train_ds, FLAGS, logHandler)
                    proposed_test_accuracy = utils.test_validate(
                        sess, add_accuracy, test_iterator, placeholder,
                        test_ds, FLAGS, logHandler)
                    origin_test_accuracy = utils.test_validate(
                        sess, fix_accuracy, test_iterator, placeholder,
                        test_ds, FLAGS, logHandler)
                # Record per-epoch accuracies for the final plot.
                epoch_list.append(i)
                proposed.append(proposed_test_accuracy)
                original.append(origin_test_accuracy)
            # Add_final_train_accuracy = tu.train_validate(sess, add_accuracy, train_iterator,
            #                                              X, Y, dropout_rate, train_ds, FLAGS)
            logHandler._print('Original Accuracy: ')
            origin_test_accuracy = utils.test_validate(sess, fix_accuracy,
                                                       test_iterator, placeholder,
                                                       test_ds, FLAGS, logHandler)
            logHandler._print('Proposed Accuracy: ')
            utils.test_validate(sess, add_accuracy, test_iterator, placeholder,
                                test_ds, FLAGS, logHandler)
            plot_acc(epoch_list, original, proposed, figname)
            saver.save(sess, save_file)
            logHandler._print('Training done successfully')
#Parameters experiment: POPULATION_SIZE = 24 NUMBER_ROLLS = 5 GENERATION_LIMIT = 32 SCORE_LIMIT = 100 MAX_STEPS = 200 #each run should actually has 1000 steps, but this can give us time vae = convVAE.ConvVAE() lstm = lstm_mdn.LSTM_MDN() #line 37 train_lstm? #mdn = need from datasets.generate_lstm_training import LSTMDataset #line 63 train_lstm sys.path.insert(1, '/users/alberto/projects/WorldModels/utils') import train_utils train_utils.load_model() #Using this, as recommended by paper #http://blog.otoro.net/2017/11/12/evolving-stable-strategies/ def rollout(k, env): # k is a controller instance # env is the car racing environment obs = env.reset() done = False total_reward = 0 #while not done: while step_counter < MAX_STEPS:
    # NOTE(review): fragment — the enclosing function's `def` line lies above
    # the visible chunk (tail of a mel-extraction helper); indentation reconstructed.
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    if torch.cuda.is_available():
        melspec = melspec.cuda()
    return melspec


# Step1: Basic Setups
hparams = create_hparams()
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                    hparams.win_length, hparams.n_mel_channels,
                    hparams.sampling_rate, hparams.mel_fmin,
                    hparams.mel_fmax)
# Build tacotron on GPU when available, then restore the LibriTTS checkpoint
# (weights always loaded to CPU first via map_location).
checkpoint_path = "mellotron_libritts.pt"
if torch.cuda.is_available():
    tacotron = load_model(hparams).cuda().eval()
else:
    tacotron = load_model(hparams).eval()
tacotron.load_state_dict(torch.load(checkpoint_path,
                                    map_location="cpu")['state_dict'])
# WaveGlow vocoder + denoiser, GPU when available.
waveglow_path = 'waveglow_256channels_v4.pt'
if torch.cuda.is_available():
    waveglow = torch.load(waveglow_path)['model'].cuda().eval()
    denoiser = Denoiser(waveglow).cuda().eval()
else:
    waveglow = torch.load(waveglow_path, map_location="cpu")['model'].eval().cpu()
    denoiser = Denoiser(waveglow).eval()
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')