def encode_dataset(cfg):
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    root_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    with open(root_path / "test.json") as file:
        metadata = json.load(file)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    encoder.to(device)

    print("Load checkpoint from: {}".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    encoder.eval()

    if cfg.save_auxiliary:
        auxiliary = []

        def hook(module, input, output):
            auxiliary.append(output.clone())

        encoder.encoder[-1].register_forward_hook(hook)

    for _, _, _, path in tqdm(metadata):
        path = root_path.parent / path
        mel = torch.from_numpy(np.load(path.with_suffix(".mel.npy"))).unsqueeze(0).to(device)
        with torch.no_grad():
            z, c, indices = encoder.encode(mel)

        z = z.squeeze().cpu().numpy()

        out_path = out_dir / path.stem
        with open(out_path.with_suffix(".txt"), "w") as file:
            np.savetxt(file, z, fmt="%.16f")

        if cfg.save_auxiliary:
            aux_path = out_dir.parent / "auxiliary_embedding1"
            aux_path.mkdir(exist_ok=True, parents=True)
            out_path = aux_path / path.stem
            c = c.squeeze().cpu().numpy()
            with open(out_path.with_suffix(".txt"), "w") as file:
                np.savetxt(file, c, fmt="%.16f")

            aux_path = out_dir.parent / "auxiliary_embedding2"
            aux_path.mkdir(exist_ok=True, parents=True)
            out_path = aux_path / path.stem
            aux = auxiliary.pop().squeeze().cpu().numpy()
            with open(out_path.with_suffix(".txt"), "w") as file:
                np.savetxt(file, aux, fmt="%.16f")
def convert(cfg):
    dataset_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    with open(dataset_path / "speakers.json") as file:
        speakers = sorted(json.load(file))

    synthesis_list_path = Path(utils.to_absolute_path(cfg.synthesis_list))
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(file)

    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])
    encoder.eval()
    decoder.eval()

    for wav_path, speaker_id, out_filename in tqdm(synthesis_list):
        wav_path = in_dir / wav_path
        wav, _ = librosa.load(wav_path.with_suffix(".wav"), sr=cfg.preprocessing.sr)
        wav = wav / np.abs(wav).max() * 0.999

        mel = librosa.feature.melspectrogram(
            preemphasis(wav, cfg.preprocessing.preemph),
            sr=cfg.preprocessing.sr,
            n_fft=cfg.preprocessing.n_fft,
            n_mels=cfg.preprocessing.n_mels,
            hop_length=cfg.preprocessing.hop_length,
            win_length=cfg.preprocessing.win_length,
            fmin=cfg.preprocessing.fmin,
            power=1)
        logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
        logmel = logmel / cfg.preprocessing.top_db + 1

        mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
        speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
        with torch.no_grad():
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)

        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"),
                                 output.astype(np.float32),
                                 sr=cfg.preprocessing.sr)
def get_encoder(latent_dim, fckpt=''):
    E = Encoder(latent_dim)
    if fckpt and os.path.exists(fckpt):
        ckpt = torch.load(fckpt)
        loaded_sd = ckpt['E']
        try:
            E.load_state_dict(loaded_sd)
        except RuntimeError:
            # Key/shape mismatch: remap renamed layers and keep only the
            # parameters whose shapes still match the current model.
            curr_params = E.state_dict()
            curr_keys = list(curr_params.keys())
            updated_params = {}
            for k, v in loaded_sd.items():
                newk = k.replace('bn7', 'conv7') if 'bn7' in k else k
                if newk in curr_keys and loaded_sd[k].shape == curr_params[newk].shape:
                    updated_params[newk] = v
                else:
                    print('Failed to load:', k)
            curr_params.update(updated_params)
            E.load_state_dict(curr_params)
    return E.to(device)
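# A minimal usage sketch for get_encoder (the latent size and checkpoint path
# here are hypothetical, not from the original code). If the checkpoint file
# is missing, the encoder simply keeps its random initialization.
enc = get_encoder(latent_dim=128, fckpt='checkpoints/encoder.pt')
enc.eval()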
class Model:
    def __init__(self, chpt_enc_path, chpt_dec_path, chpt_stat_path):
        historyLength = 10
        encoder_dim = hiddenDimension
        lstm_input_dim = historyLength + 1
        decoder_dim = hiddenDimension
        attention_dim = hiddenDimension
        output_dim = 1
        self.decodeLength = 20

        self.encoder = Encoder()
        self.decoder = DecoderWithAttention(encoder_dim, lstm_input_dim,
                                            decoder_dim, attention_dim,
                                            output_dim)
        self.encoder.load_state_dict(torch.load(chpt_enc_path))
        self.decoder.load_state_dict(torch.load(chpt_dec_path))
        self.encoder = self.encoder.to(device)
        self.decoder = self.decoder.to(device)
        self.encoder.eval()
        self.decoder.eval()

        # Normalization statistics (curvature, velocity, acceleration)
        # computed on the training set.
        with open(chpt_stat_path, 'rb') as f:
            chpt_stat = pickle.load(f)
        self.cMean = chpt_stat['cMean_tr']
        self.cStd = chpt_stat['cStd_tr']
        self.vMean = chpt_stat['vMean_tr']
        self.vStd = chpt_stat['vStd_tr']
        self.aMean = chpt_stat['aMean_tr']
        self.aStd = chpt_stat['aStd_tr']
        self.mean = torch.Tensor([self.vMean, self.aMean]).to(device)
        self.std = torch.Tensor([self.vStd, self.aStd]).to(device)

    def predict(self, curvatures, currentSpeed, histSpeeds, currentAccelX, histAccelXs):
        curvatures = torch.FloatTensor(curvatures).to(device)
        currentSpeed = torch.FloatTensor([currentSpeed]).to(device)
        histSpeeds = torch.FloatTensor(histSpeeds).to(device)
        currentAccelX = torch.FloatTensor([currentAccelX]).to(device)
        histAccelXs = torch.FloatTensor(histAccelXs).to(device)

        # Standardize every input with its training-set statistics.
        curvatures = (curvatures - self.cMean) / self.cStd
        currentSpeed = (currentSpeed - self.vMean) / self.vStd
        histSpeeds = (histSpeeds - self.vMean) / self.vStd
        currentAccelX = (currentAccelX - self.aMean) / self.aStd
        histAccelXs = (histAccelXs - self.aMean) / self.aStd

        curvatures = self.encoder(curvatures.unsqueeze(dim=0).unsqueeze(dim=0))
        predictions, alphas, alphas_target = self.decoder(
            curvatures, currentSpeed, histSpeeds.unsqueeze(dim=0),
            currentAccelX, histAccelXs.unsqueeze(dim=0), self.decodeLength,
            self.vMean, self.vStd, self.aMean, self.aStd)
        # Un-normalize the predicted accelerations before returning.
        return ((predictions.squeeze() * self.aStd + self.aMean).cpu().detach().numpy(),
                alphas.squeeze().cpu().detach().numpy())
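# A hypothetical call sketch (paths, curvature horizon, and values are
# illustrative only; historyLength is 10 in the constructor, so the history
# lists are assumed to hold 10 samples each):
model = Model('enc.pth', 'dec.pth', 'stats.pkl')
accel_pred, attention = model.predict(
    curvatures=[0.01] * 30,
    currentSpeed=12.0, histSpeeds=[11.5] * 10,
    currentAccelX=0.1, histAccelXs=[0.08] * 10)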
def initialize_for_test(params):
    data_loader = get_loader(params, mode='test')
    encoder_file = os.path.join(params.encoder_save,
                                'epoch-%d.pkl' % params.num_epochs)
    decoder_file = os.path.join(params.decoder_save,
                                'epoch-%d.pkl' % params.num_epochs)
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = Encoder(params)
    decoder = Decoder(params, vocab_size)
    encoder.eval()
    decoder.eval()

    # Load the trained weights.
    encoder.load_state_dict(torch.load(encoder_file))
    decoder.load_state_dict(torch.load(decoder_file))
    encoder.to(params.device)
    decoder.to(params.device)
    return data_loader, encoder, decoder
def load_encoder(data_root, weight_path, device):
    encoder = Encoder()
    if weight_path:
        weight = torch.load(weight_path)
    else:
        weight = torch.load(get_best_weight(data_root))
    encoder.load_state_dict(weight)
    if device >= 0:
        encoder = encoder.to(f"cuda:{device}")
    encoder.eval()
    return encoder
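# A usage sketch under assumed paths: an empty weight_path falls back to
# get_best_weight(data_root), and device=-1 keeps the encoder on the CPU.
encoder = load_encoder('data/', weight_path='', device=-1)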
def main(data_name):
    dataset = MyDataSet(data_name=data_name, reset=False)
    vocab_size = dataset.vocab_size
    corpus = dataset.corpus
    id2word = {v: k for k, v in corpus.items()}
    train_loader, val_loader = _get_data_loader(dataset, 0.5, batch_size)
    embedding, embed_dim = load_embedding(basic_settings['word2vec'], corpus)

    encoder = Encoder(dataset.feature_dim, output_dim=100)
    decoder = DecoderWithAttention(encoder.get_output_dim(),
                                   decoder_dim=100,
                                   attn_dim=100,
                                   embed_dim=embed_dim,
                                   vocab_size=vocab_size)
    decoder_optimizer = torch.optim.Adam(
        params=filter(lambda p: p.requires_grad, decoder.parameters()), lr=lr)
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    criterion = torch.nn.CrossEntropyLoss().to(device)

    best_bleu4 = 0
    best_hypos = []
    best_refs = []
    for epoch in range(1, epoches + 1):
        # One epoch's training
        train_epoch(train_loader=train_loader,
                    encoder=encoder,
                    decoder=decoder,
                    criterion=criterion,
                    optimizer=decoder_optimizer,
                    epoch=epoch)
        # One epoch's validation
        bleu4_score, refs, hypos = validate(val_loader=val_loader,
                                            encoder=encoder,
                                            decoder=decoder,
                                            criterion=criterion,
                                            word2id=corpus)
        if bleu4_score > best_bleu4:
            best_bleu4 = bleu4_score
            best_refs = refs
            best_hypos = hypos

    name = data_name + '_' + str(best_bleu4) + '.xlsx'
    save_result(name, best_refs, best_hypos, id2word)
def main(args):
    # Image preprocessing. At generation time we should not random-crop,
    # only resize and normalize.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = Encoder(embed_size=args.embed_size).eval()
    decoder = Decoder(stateful=False,
                      embed_size=args.embed_size,
                      hidden_size=args.hidden_size,
                      vocab_size=len(vocab),
                      num_layers=args.num_layers).to(device)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(args.decoder_path, map_location=device))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Convert word ids to words, stopping at the end token.
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<<end>>':
            break
    sentence = ' '.join(sampled_caption)
    print(sentence)
def load_model(
    encoder_path,
    decoder_path,
    vocab_size,
    layer_type='gru',
    embed_size=256,
    hidden_size=512,
    num_layers=2,
):
    if layer_type == 'lstm':
        from model import Encoder, Decoder
    else:
        from model_gru import Encoder, Decoder

    # eval mode (batchnorm uses moving mean/variance)
    encoder = Encoder(embed_size).eval()
    decoder = Decoder(embed_size, hidden_size, vocab_size, num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))
    return encoder, decoder
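# Example call (hypothetical paths and vocabulary size). Note that only the
# encoder is switched to eval mode inside load_model, so callers that want
# deterministic decoding should also call decoder.eval():
encoder, decoder = load_model('enc.ckpt', 'dec.ckpt', vocab_size=10000,
                              layer_type='gru')
decoder.eval()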
def train_model(cfg):
    tensorboard_path = Path(utils.to_absolute_path("tensorboard")) / cfg.checkpoint_dir
    checkpoint_dir = Path(utils.to_absolute_path(cfg.checkpoint_dir))
    writer = SummaryWriter(tensorboard_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    cpc = CPCLoss(**cfg.model.cpc)
    encoder.to(device)
    cpc.to(device)

    optimizer = optim.Adam(chain(encoder.parameters(), cpc.parameters()),
                           lr=cfg.training.scheduler.initial_lr)
    scheduler = WarmupScheduler(
        optimizer,
        warmup_epochs=cfg.training.scheduler.warmup_epochs,
        initial_lr=cfg.training.scheduler.initial_lr,
        max_lr=cfg.training.scheduler.max_lr,
        milestones=cfg.training.scheduler.milestones,
        gamma=cfg.training.scheduler.gamma)

    if cfg.resume:
        print("Resume checkpoint from: {}".format(cfg.resume))
        resume_path = utils.to_absolute_path(cfg.resume)
        checkpoint = torch.load(resume_path, map_location=lambda storage, loc: storage)
        encoder.load_state_dict(checkpoint["encoder"])
        cpc.load_state_dict(checkpoint["cpc"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        start_epoch = checkpoint["epoch"]
    else:
        start_epoch = 1

    root_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    dataset = CPCDataset(
        root=root_path,
        n_sample_frames=cfg.training.sample_frames + cfg.training.n_prediction_steps,
        n_utterances_per_speaker=cfg.training.n_utterances_per_speaker,
        hop_length=cfg.preprocessing.hop_length,
        sr=cfg.preprocessing.sr)
    dataloader = DataLoader(dataset,
                            batch_size=cfg.training.n_speakers_per_batch,
                            shuffle=True,
                            num_workers=cfg.training.n_workers,
                            pin_memory=True,
                            drop_last=True)

    for epoch in range(start_epoch, cfg.training.n_epochs + 1):
        if epoch % cfg.training.log_interval == 0 or epoch == start_epoch:
            average_cpc_loss = average_vq_loss = average_perplexity = 0
            average_accuracies = np.zeros(cfg.training.n_prediction_steps // 2)

        for i, (mels, _) in enumerate(tqdm(dataloader), 1):
            mels = mels.to(device)
            mels = mels.view(
                cfg.training.n_speakers_per_batch * cfg.training.n_utterances_per_speaker,
                cfg.preprocessing.n_mels, -1)

            optimizer.zero_grad()
            z, c, vq_loss, perplexity = encoder(mels)
            cpc_loss, accuracy = cpc(z, c)
            loss = cpc_loss + vq_loss
            loss.backward()
            optimizer.step()

            # Incremental (running) means over the epoch.
            average_cpc_loss += (cpc_loss.item() - average_cpc_loss) / i
            average_vq_loss += (vq_loss.item() - average_vq_loss) / i
            average_perplexity += (perplexity.item() - average_perplexity) / i
            average_accuracies += (np.array(accuracy) - average_accuracies) / i

        scheduler.step()

        if epoch % cfg.training.log_interval == 0 and epoch != start_epoch:
            writer.add_scalar("cpc_loss/train", average_cpc_loss, epoch)
            writer.add_scalar("vq_loss/train", average_vq_loss, epoch)
            writer.add_scalar("perplexity/train", average_perplexity, epoch)
            print("epoch:{}, cpc loss:{:.2E}, vq loss:{:.2E}, perplexity:{:.3f}"
                  .format(epoch, average_cpc_loss, average_vq_loss, average_perplexity))
            print(100 * average_accuracies)

        if epoch % cfg.training.checkpoint_interval == 0 and epoch != start_epoch:
            save_checkpoint(encoder, cpc, optimizer, scheduler, epoch, checkpoint_dir)
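# The running averages above use the incremental-mean update
# mean_i = mean_{i-1} + (x_i - mean_{i-1}) / i, which avoids storing all
# per-batch values. A standalone check of the identity:
import numpy as np

xs = np.random.rand(100)
mean = 0.0
for i, x in enumerate(xs, 1):
    mean += (x - mean) / i
assert np.isclose(mean, xs.mean())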
def main(epoch_num, batch_size, verbose, UNSEEN, SEEN, MODE):
    [hownet_file, sememe_file, word_index_file, word_vector_file,
     dictionary_file, word_cilinClass_file] = [
        'hownet.json', 'sememe.json', 'word_index.json', 'word_vector.npy',
        'dictionary_sense.json', 'word_cilinClass.json'
    ]
    (word2index, index2word, word2vec, sememe_num, label_size,
     label_size_chara, word_defi_idx_all) = load_data(
        hownet_file, sememe_file, word_index_file, word_vector_file,
        dictionary_file, word_cilinClass_file)
    (word_defi_idx_TrainDev, word_defi_idx_seen, word_defi_idx_test2000,
     word_defi_idx_test200, word_defi_idx_test272) = word_defi_idx_all
    index2word = np.array(index2word)

    length = len(word_defi_idx_TrainDev)
    valid_dataset = MyDataset(word_defi_idx_TrainDev[int(0.9 * length):])
    test_dataset = MyDataset(word_defi_idx_test2000 + word_defi_idx_test200 +
                             word_defi_idx_test272)
    if SEEN:
        mode = 'S_' + MODE
        print('*METHOD: Seen defi.')
        print('*TRAIN: [Train + allSeen(2000+200+272)]')
        print('*TEST: [2000rand1 + 200desc + 272desc]')
        train_dataset = MyDataset(word_defi_idx_TrainDev[:int(0.9 * length)] +
                                  word_defi_idx_seen)
    elif UNSEEN:
        mode = 'U_' + MODE
        print('*METHOD: Unseen All words and defi.')
        print('*TRAIN: [Train]')
        print('*TEST: [2000rand1 + 200desc + 272desc]')
        train_dataset = MyDataset(word_defi_idx_TrainDev[:int(0.9 * length)])
    print('*MODE: [%s]' % mode)

    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   collate_fn=my_collate_fn)
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   collate_fn=my_collate_fn)
    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  collate_fn=my_collate_fn_test)
    print('Train dataset: ', len(train_dataset))
    print('Valid dataset: ', len(valid_dataset))
    print('Test dataset: ', len(test_dataset))

    word_defi_idx = word_defi_idx_TrainDev + word_defi_idx_seen

    # Multi-hot label matrices: sememes, POS tags, characters, and the four
    # levels of the Cilin hierarchy (13 / 96 / 1426 / 4098 classes).
    wd2sem = word2sememe(word_defi_idx, len(word2index), sememe_num)
    wd_sems = label_multihot(wd2sem, sememe_num)
    wd_sems = torch.from_numpy(np.array(wd_sems[:label_size])).to(device)
    wd_POSs = label_multihot(word2POS(word_defi_idx, len(word2index), 13), 13)
    wd_POSs = torch.from_numpy(np.array(wd_POSs[:label_size])).to(device)
    wd_charas = label_multihot(
        word2chara(word_defi_idx, len(word2index), label_size_chara),
        label_size_chara)
    wd_charas = torch.from_numpy(np.array(wd_charas[:label_size])).to(device)
    wd2Cilin1 = word2Cn(word_defi_idx, len(word2index), 'C1', 13)
    wd_C1 = label_multihot(wd2Cilin1, 13)
    wd_C1 = torch.from_numpy(np.array(wd_C1[:label_size])).to(device)
    wd_C2 = label_multihot(word2Cn(word_defi_idx, len(word2index), 'C2', 96), 96)
    wd_C2 = torch.from_numpy(np.array(wd_C2[:label_size])).to(device)
    wd_C3 = label_multihot(word2Cn(word_defi_idx, len(word2index), 'C3', 1426), 1426)
    wd_C3 = torch.from_numpy(np.array(wd_C3[:label_size])).to(device)
    wd_C4 = label_multihot(word2Cn(word_defi_idx, len(word2index), 'C4', 4098), 4098)
    wd_C4 = torch.from_numpy(np.array(wd_C4[:label_size])).to(device)
    # Optional fifth level (disabled in the original source):
    # wd2Cilin = word2Cn(word_defi_idx, len(word2index), 'C', 5633)
    # wd_C0 = label_multihot(wd2Cilin, 5633)
    # wd_C0 = torch.from_numpy(np.array(wd_C0[:label_size])).to(device)
    # wd_C = [wd_C1, wd_C2, wd_C3, wd_C4, wd_C0]
    wd_C = [wd_C1, wd_C2, wd_C3, wd_C4]

    # Masks for words that have no sememes / no Cilin class.
    print('calculating mask of no sememes...')
    mask_s = torch.zeros(label_size, dtype=torch.float32, device=device)
    for i in range(label_size):
        sems = set(wd2sem[i].detach().cpu().numpy().tolist()) - set([sememe_num])
        if len(sems) == 0:
            mask_s[i] = 1

    mask_c = torch.zeros(label_size, dtype=torch.float32, device=device)
    for i in range(label_size):
        cc = set(wd2Cilin1[i].detach().cpu().numpy().tolist()) - set([13])
        if len(cc) == 0:
            mask_c[i] = 1

    model = Encoder(vocab_size=len(word2index),
                    embed_dim=word2vec.shape[1],
                    hidden_dim=200,
                    layers=1,
                    class_num=label_size,
                    sememe_num=sememe_num,
                    chara_num=label_size_chara)
    model.embedding.weight.data = torch.from_numpy(word2vec)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    best_valid_accu = 0
    DEF_UPDATE = True
    for epoch in range(epoch_num):
        print('epoch: ', epoch)
        model.train()
        train_loss = 0
        label_list = list()
        pred_list = list()
        for (words_t, sememes_t, definition_words_t, POS_t, sememes, POSs,
             charas_t, C, C_t) in tqdm(train_dataloader, disable=verbose):
            optimizer.zero_grad()
            loss, _, indices = model('train', x=definition_words_t, w=words_t,
                                     ws=wd_sems, wP=wd_POSs, wc=wd_charas,
                                     wC=wd_C, msk_s=mask_s, msk_c=mask_c,
                                     mode=MODE)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            predicted = indices[:, :100].detach().cpu().numpy().tolist()
            train_loss += loss.item()
            label_list.extend(words_t.detach().cpu().numpy())
            pred_list.extend(predicted)
        train_accu_1, train_accu_10, train_accu_100 = evaluate(label_list, pred_list)
        del label_list
        del pred_list
        gc.collect()
        print('train_loss: ', train_loss / len(train_dataset))
        print('train_accu(1/10/100): %.2f %.2f %.2f' %
              (train_accu_1, train_accu_10, train_accu_100))

        model.eval()
        with torch.no_grad():
            valid_loss = 0
            label_list = []
            pred_list = []
            for (words_t, sememes_t, definition_words_t, POS_t, sememes, POSs,
                 charas_t, C, C_t) in tqdm(valid_dataloader, disable=verbose):
                loss, _, indices = model('train', x=definition_words_t,
                                         w=words_t, ws=wd_sems, wP=wd_POSs,
                                         wc=wd_charas, wC=wd_C, msk_s=mask_s,
                                         msk_c=mask_c, mode=MODE)
                predicted = indices[:, :100].detach().cpu().numpy().tolist()
                valid_loss += loss.item()
                label_list.extend(words_t.detach().cpu().numpy())
                pred_list.extend(predicted)
            valid_accu_1, valid_accu_10, valid_accu_100 = evaluate(label_list, pred_list)
            print('valid_loss: ', valid_loss / len(valid_dataset))
            print('valid_accu(1/10/100): %.2f %.2f %.2f' %
                  (valid_accu_1, valid_accu_10, valid_accu_100))
            del label_list
            del pred_list
            gc.collect()

            if valid_accu_10 > best_valid_accu:
                best_valid_accu = valid_accu_10
                print('-----best_valid_accu-----')
                # torch.save(model, 'saved.model')
                label_list = []
                pred_list = []
                for words_t, definition_words_t in tqdm(test_dataloader, disable=verbose):
                    indices = model('test', x=definition_words_t, w=words_t,
                                    ws=wd_sems, wP=wd_POSs, wc=wd_charas,
                                    wC=wd_C, msk_s=mask_s, msk_c=mask_c,
                                    mode=MODE)
                    predicted = indices[:, :1000].detach().cpu().numpy().tolist()
                    label_list.extend(words_t.detach().cpu().numpy())
                    pred_list.extend(predicted)
                test_accu_1, test_accu_10, test_accu_100, median, variance = \
                    evaluate_test(label_list, pred_list)
                print('test_accu(1/10/100): %.2f %.2f %.2f %.1f %.2f' %
                      (test_accu_1, test_accu_10, test_accu_100, median, variance))
                if epoch > 10:
                    json.dump((index2word[label_list]).tolist(),
                              open(mode + '_label_list.json', 'w'))
                    json.dump((index2word[np.array(pred_list)]).tolist(),
                              open(mode + '_pred_list.json', 'w'))
                del label_list
                del pred_list
                gc.collect()
# (excerpt begins mid-file; the guarding condition is inferred from the
#  test_model flag used below)
if args.test_model:
    assert args.snapshot is not None
else:
    if args.sort_by_freq is False:
        assert args.order_free in ["pla", "mla"]
    else:
        if args.order_free:
            raise ValueError(
                'Sort by freq and order_free are mutually exclusive.')

resume = 0
highest_f1 = 0
epochs_without_imp = 0
iterations = 0

encoder = Encoder(encoder_weights=args.encoder_weights)
decoder = Decoder(args.hidden_size, args.embed_size, args.attention_size,
                  args.dropout)
encoder = encoder.to('cuda')
decoder = decoder.to('cuda')

snapshot = args.snapshot
test_model = args.test_model
train_from_scratch = args.train_from_scratch
swa_params = eval(args.swa_params)  # parses a literal passed on the CLI
finetune_encoder = args.finetune_encoder

if not test_model:
    if finetune_encoder:
        encoder_optimizer = torch.optim.Adam(encoder.parameters(),
                                             lr=args.encoder_lr)
        decoder_optimizer = torch.optim.Adam(decoder.parameters(),
                                             lr=args.decoder_lr)
    else:
        ...  # (excerpt truncated here)
# (excerpt begins mid-call; the constructor line is inferred)
encoder = Encoder(embedding_dimension=opt.embedding_dim,
                  hidden_size=opt.rnn_hidden,
                  num_layer=opt.num_layers)
optimizer1 = torch.optim.Adam(encoder.parameters(), lr=opt.lr)
decoder = BahdanauAttnDecoderRNN(opt.rnn_hidden,
                                 opt.embedding_dim,
                                 len(en_config.word2ix),
                                 n_layers=2,
                                 dropout_p=0.1)
optimizer2 = torch.optim.Adam(decoder.parameters(), lr=opt.lr)

if opt.save_path:
    encoder.load_state_dict(torch.load(opt.save_path + 'encoder.pth'))
    decoder.load_state_dict(torch.load(opt.save_path + 'decoder.pth'))
    print('loaded saved model')

encoder.to(device)
decoder.to(device)
loss_meter = AverageValueMeter()

# Training loop (disabled in the original source, and truncated in this excerpt):
# for epoch in range(200):
#     loss_meter.reset()
#     for ii, ((in_lang, in_lengths), (out_lang, out_lengths)) in tqdm(enumerate(train_dataloader)):
#         in_lang = in_lang.to(device)
#         out_lang = out_lang.to(device)
#         optimizer1.zero_grad()
#         optimizer2.zero_grad()
#         # encoder_outputs: (max_length, batch, embedding_dim)
#         # encoder_hidden: (n_layers, batch, embedding_dim)
#         encoder_outputs, encoder_hidden = encoder(in_lang, in_lengths)
#         # Prepare input and output variables
#         decoder_input = torch.LongTensor([fr_config.word2ix[opt.start]] * in_lang.shape[1]).to(device)
class Solver(object):
    """Solver for training and testing."""

    def __init__(self, data_loader, config):
        """Initialize configurations."""
        # Data loader.
        self.data_loader = data_loader

        # Model configurations.
        self.a_dim = config.a_dim
        self.id_dim = config.id_dim

        # Training configurations.
        self.batch_size = config.batch_size
        self.num_iters = config.num_iters
        self.num_iters_decay = config.num_iters_decay
        self.lr = config.lr
        self.n_critic = config.n_critic
        self.beta1 = config.beta1
        self.beta2 = config.beta2
        self.resume_iters = config.resume_iters

        # Test configurations.
        self.test_iters = config.test_iters

        # Miscellaneous.
        self.use_tensorboard = config.use_tensorboard
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Directories.
        self.log_dir = config.log_dir
        self.sample_dir = config.sample_dir
        self.model_save_dir = config.model_save_dir
        self.result_dir = config.result_dir

        # Step sizes.
        self.log_step = config.log_step
        self.sample_step = config.sample_step
        self.model_save_step = config.model_save_step
        self.lr_update_step = config.lr_update_step

        # Build the model and tensorboard.
        self.build_model()
        if self.use_tensorboard:
            self.build_tensorboard()

    def build_model(self):
        """Create the generator, discriminator, encoders, and attribute net."""
        self.G = Generator()
        self.D = Discriminator()
        self.I = Encoder()
        self.C = Encoder()
        self.A = Attribute()

        self.g_optimizer = torch.optim.Adam(self.G.parameters(), self.lr, [self.beta1, self.beta2])
        self.d_optimizer = torch.optim.Adam(self.D.parameters(), self.lr, [self.beta1, self.beta2])
        self.i_optimizer = torch.optim.Adam(self.I.parameters(), self.lr, [self.beta1, self.beta2])
        self.c_optimizer = torch.optim.Adam(self.C.parameters(), self.lr, [self.beta1, self.beta2])
        self.a_optimizer = torch.optim.Adam(self.A.parameters(), self.lr, [self.beta1, self.beta2])

        self.G.to(self.device)
        self.D.to(self.device)
        self.A.to(self.device)
        self.I.to(self.device)
        self.C.to(self.device)

    def print_network(self, model, name):
        """Print out the network information."""
        num_params = 0
        for p in model.parameters():
            num_params += p.numel()
        print(model)
        print(name)
        print("The number of parameters: {}".format(num_params))

    def restore_model(self, resume_iters):
        """Restore all trained sub-networks."""
        print('Loading the trained models from step {}...'.format(resume_iters))
        G_path = os.path.join(self.model_save_dir, '{}-G.ckpt'.format(resume_iters))
        D_path = os.path.join(self.model_save_dir, '{}-D.ckpt'.format(resume_iters))
        A_path = os.path.join(self.model_save_dir, '{}-A.ckpt'.format(resume_iters))
        I_path = os.path.join(self.model_save_dir, '{}-I.ckpt'.format(resume_iters))
        C_path = os.path.join(self.model_save_dir, '{}-C.ckpt'.format(resume_iters))
        self.A.load_state_dict(torch.load(A_path, map_location=lambda storage, loc: storage))
        self.I.load_state_dict(torch.load(I_path, map_location=lambda storage, loc: storage))
        self.C.load_state_dict(torch.load(C_path, map_location=lambda storage, loc: storage))
        self.G.load_state_dict(torch.load(G_path, map_location=lambda storage, loc: storage))
        self.D.load_state_dict(torch.load(D_path, map_location=lambda storage, loc: storage))

    def build_tensorboard(self):
        """Build a tensorboard logger."""
        from logger import Logger
        self.logger = Logger(self.log_dir)

    def update_lr(self, lr):
        """Decay the learning rates of all optimizers."""
        for param_group in self.g_optimizer.param_groups:
            param_group['lr'] = lr
        for param_group in self.d_optimizer.param_groups:
            param_group['lr'] = lr
        for param_group in self.i_optimizer.param_groups:
            param_group['lr'] = lr
        for param_group in self.a_optimizer.param_groups:
            param_group['lr'] = lr
        for param_group in self.c_optimizer.param_groups:
            param_group['lr'] = lr

    def reset_grad(self):
        """Reset the gradient buffers."""
        self.g_optimizer.zero_grad()
        self.d_optimizer.zero_grad()
        self.i_optimizer.zero_grad()
        self.a_optimizer.zero_grad()
        self.c_optimizer.zero_grad()

    def denorm(self, x):
        """Convert the range from [-1, 1] to [0, 1]."""
        out = (x + 1) / 2
        return out.clamp_(0, 1)

    def classification_loss(self, logit, target):
        """Compute softmax cross-entropy loss."""
        return F.cross_entropy(logit, target)

    def mse_loss(self, out, gt):
        """Compute the MSE between model output and target."""
        return 0.5 * torch.mean(torch.abs(out - gt) ** 2)

    def L1_loss(self, pred, target):
        """Compute the L1 loss."""
        return torch.mean(torch.abs(pred - target))

    def reparameterization(self, mu, logvar):
        """Sample z = mu + std * eps (the VAE reparameterization trick)."""
        std = torch.exp(logvar / 2)
        sampled_z = torch.FloatTensor(np.random.normal(0, 1, (mu.size(0), 8))).to(self.device)
        return sampled_z * std + mu

    def train(self):
        """Train StarGAN within a single dataset."""
        # Set data loader.
        data_loader = self.data_loader

        # Fetch fixed inputs for debugging.
        data_iter = iter(data_loader)
        batch_fixed = next(data_iter)
        for k in batch_fixed:
            batch_fixed[k] = batch_fixed[k].to(self.device)

        # Learning rate cache for decaying.
        lr = self.lr

        # Start training from scratch or resume training.
        start_iters = 0
        if self.resume_iters:
            start_iters = self.resume_iters
            self.restore_model(self.resume_iters)

        # Start training.
        print('Start training...')
        start_time = time.time()
        for i in range(start_iters, self.num_iters):

            # =============== 1. Preprocess input data =============== #

            # Fetch real images and labels.
            try:
                batch = next(data_iter)
            except StopIteration:
                data_iter = iter(data_loader)
                batch = next(data_iter)
            for k in batch:
                batch[k] = batch[k].to(self.device)

            # =============== 2. Train the discriminator =============== #

            loss = {}
            # Identity code from the profile image.
            id_z, _ = self.I(batch['img_profile'])
            # Attribute code from the frontal image.
            mu, logvar = self.A(batch['img_frontal'])
            a_z = self.reparameterization(mu, logvar)
            # Generate x' from the concatenated codes.
            x = torch.cat([id_z, a_z], 1)
            x_fake = self.G(x)
            # Predicted identity.
            id_pred, _ = self.C(batch['img_profile'])
            # Distinguish real from fake.
            d_real, _ = self.D(batch['img_frontal'])
            d_fake, _ = self.D(x_fake.detach())

            # Train I.
            loss_Li = self.classification_loss(id_z, batch['label'])
            # Train A.
            loss_KL = torch.sum(0.5 * (mu ** 2 + torch.exp(logvar) - logvar - 1))
            loss_GR = self.mse_loss(batch['img_frontal'], x_fake)
            # Train C.
            loss_C = self.classification_loss(id_pred, batch['label'])
            # Train D.
            loss_D = -torch.mean(d_real) + torch.mean(d_fake)

            d_loss = loss_D + loss_C + loss_GR + loss_KL + loss_Li
            self.reset_grad()
            d_loss.backward()
            self.d_optimizer.step()
            self.c_optimizer.step()
            self.a_optimizer.step()
            self.i_optimizer.step()

            loss['C/loss_C'] = loss_C.item()
            loss['A/loss_GR'] = loss_GR.item()
            loss['I/loss_Li'] = loss_Li.item()
            loss['D/loss_D'] = loss_D.item()

            # =============== 3. Train the generator =============== #

            if (i + 1) % self.n_critic == 0:
                id_z, _ = self.I(batch['img_profile'])
                mu, logvar = self.A(batch['img_frontal'])
                a_z = self.reparameterization(mu, logvar)
                x = torch.cat([id_z, a_z], 1)
                x_fake = self.G(x)
                _, c_f_s = self.C(batch['img_profile'])
                _, c_f_x = self.C(x_fake)
                d_real, d_f_a = self.D(batch['img_frontal'])
                d_fake, d_f_x = self.D(x_fake)

                loss_GR = self.mse_loss(batch['img_frontal'], x_fake)
                # Train C.
                loss_GC = self.mse_loss(c_f_x, c_f_s)
                loss_GD = self.mse_loss(d_f_x, d_f_a)
                loss_g = -torch.mean(d_fake)

                g_loss = loss_g + loss_GC + loss_GR + loss_GD
                self.reset_grad()
                g_loss.backward()
                self.g_optimizer.step()

                # Logging.
                loss['G/loss_GR'] = loss_GR.item()
                loss['G/loss_GC'] = loss_GC.item()
                loss['G/loss_GD'] = loss_GD.item()
                loss['G/loss_g'] = loss_g.item()

            # =============== 4. Miscellaneous =============== #

            # Print out training information.
            if (i + 1) % self.log_step == 0:
                et = time.time() - start_time
                et = str(datetime.timedelta(seconds=et))[:-7]
                log = "Elapsed [{}], Iteration [{}/{}]".format(et, i + 1, self.num_iters)
                for tag, value in loss.items():
                    log += ", {}: {:.4f}".format(tag, value)
                print(log)
                if self.use_tensorboard:
                    for tag, value in loss.items():
                        self.logger.scalar_summary(tag, value, i + 1)

            # Translate fixed images for debugging.
            if (i + 1) % self.sample_step == 0:
                for k in batch_fixed:
                    batch_fixed[k] = batch_fixed[k].to(self.device)
                with torch.no_grad():
                    x_fake_list = [batch_fixed['img_profile']]
                    id_z, _ = self.I(batch_fixed['img_profile'])
                    mu, logvar = self.A(batch_fixed['img_frontal'])
                    a_z = self.reparameterization(mu, logvar)
                    x = torch.cat([id_z, a_z], 1)
                    x_fake = self.G(x)
                    x_fake_list.append(x_fake)
                    x_concat = torch.cat(x_fake_list, dim=3)
                    sample_path = os.path.join(self.sample_dir, '{}-images.jpg'.format(i + 1))
                    save_image(self.denorm(x_concat.data.cpu()), sample_path, nrow=2, padding=5)
                    print('Saved real and fake images into {}...'.format(sample_path))

            # Save model checkpoints.
            if (i + 1) % self.model_save_step == 0:
                G_path = os.path.join(self.model_save_dir, '{}-G.ckpt'.format(i + 1))
                D_path = os.path.join(self.model_save_dir, '{}-D.ckpt'.format(i + 1))
                torch.save(self.G.state_dict(), G_path)
                torch.save(self.D.state_dict(), D_path)
                print('Saved model checkpoints into {}...'.format(self.model_save_dir))

            # Decay learning rates.
            if (i + 1) % self.lr_update_step == 0 and (i + 1) > (self.num_iters - self.num_iters_decay):
                lr -= (self.lr / float(self.num_iters_decay))
                self.update_lr(lr)
                print('Decayed learning rates, lr: {}'.format(lr))
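# Standalone sketch of the reparameterization trick used in
# Solver.reparameterization above: z is drawn as mu + std * eps with
# eps ~ N(0, I), which keeps the sampling step differentiable with respect
# to mu and logvar. (Shapes here are illustrative only.)
import torch

mu = torch.zeros(4, 8, requires_grad=True)
logvar = torch.zeros(4, 8, requires_grad=True)
eps = torch.randn_like(mu)
z = mu + torch.exp(logvar / 2) * eps
z.sum().backward()  # gradients flow back to mu and logvar
print(mu.grad.shape, logvar.grad.shape)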
def main():
    epoch = 1000
    batch_size = 64
    hidden_dim = 256

    encoder = Encoder(num_words, hidden_dim)
    if args.attn:
        attn_model = 'dot'
        decoder = LuongAttnDecoderLength(attn_model, hidden_dim, num_words, MAX_TGT_LEN)
    else:
        decoder = DecoderTask1(hidden_dim, num_words)

    if args.train:
        weight = torch.ones(num_words)
        weight[word2idx_mapping[PAD_TOKEN]] = 0
        encoder = encoder.to(device)
        decoder = decoder.to(device)
        weight = weight.to(device)

        encoder_optimizer = Adam(encoder.parameters(), lr=0.001)
        decoder_optimizer = Adam(decoder.parameters(), lr=0.001)
        criterion = nn.NLLLoss(ignore_index=int(word2idx_mapping[PAD_TOKEN]),
                               reduction='mean')  # size_average is deprecated
        # criterion = nn.CrossEntropyLoss(weight=weight)

        np.random.seed(1124)
        order = np.arange(len(train_data))
        best_loss = 1e5
        best_epoch = 0

        for e in range(epoch):
            # if e - best_epoch > 20: break  # early stopping (disabled)
            # np.random.shuffle(order)
            choice = np.random.choice(order, 10000, replace=False)
            shuffled_train_data = train_data[choice]
            train_loss = 0
            valid_loss = 0

            for b in tqdm(range(int(len(choice) // batch_size))):
                batch_x = torch.LongTensor(
                    shuffled_train_data[b * batch_size:(b + 1) * batch_size][:, 0].tolist()).t()
                batch_y = torch.LongTensor(
                    shuffled_train_data[b * batch_size:(b + 1) * batch_size][:, 1].tolist()).t()
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                train_loss += train(batch_x, batch_y, encoder, decoder,
                                    encoder_optimizer, decoder_optimizer,
                                    criterion, False)
            train_loss /= b

            # Validation loop (disabled in the original source):
            # for b in range(len(valid_data) // batch_size):
            #     batch_x = torch.LongTensor(valid_data[b * batch_size:(b + 1) * batch_size][:, 0].tolist()).t()
            #     batch_y = torch.LongTensor(valid_data[b * batch_size:(b + 1) * batch_size][:, 1].tolist()).t()
            #     batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            #     valid_loss += train(batch_x, batch_y, max_seqlen, encoder, decoder,
            #                         encoder_optimizer, decoder_optimizer, criterion, True)
            # valid_loss /= b

            print("epoch {}, train_loss {:.4f}, valid_loss {:.4f}, best_epoch {}, best_loss {:.4f}"
                  .format(e, train_loss, valid_loss, best_epoch, best_loss))

            # Best-model checkpointing (disabled in the original source):
            # if valid_loss < best_loss:
            #     best_loss = valid_loss
            #     best_epoch = e
            #     torch.save(encoder.state_dict(), args.encoder_path + '.best')
            #     torch.save(decoder.state_dict(), args.decoder_path + '.best')

        torch.save(encoder.state_dict(), args.encoder_path)
        torch.save(decoder.state_dict(), args.decoder_path)
        print(encoder)
        print(decoder)
    else:
        encoder.load_state_dict(
            torch.load(args.encoder_path, map_location=torch.device(device)))
        decoder.load_state_dict(
            torch.load(args.decoder_path, map_location=torch.device(device)))
        print(encoder)
        print(decoder)

    print("==========================================================")
    predict(encoder, decoder)
# (excerpt begins mid-call; the preceding lines built an image grid, e.g.
#  make_grid(image, padding=2, normalize=False))

netE = Encoder(ngpu=1, nz=nz, nc=3)
netD = Res_Discriminator(channel=6)
netG = ResnetGenerator32(z_dim=nz)
netD2 = ResnetDiscriminator32(stack=6, ch=opt.ch)

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    netE = nn.DataParallel(netE)
    netD = nn.DataParallel(netD)
    netG = nn.DataParallel(netG)
    netD2 = nn.DataParallel(netD2)

netE.to(device)
netD2.to(device)
netG.to(device)
netD.to(device)

optimizerD = optim.Adam(netD.parameters(), lr=opt.lr, betas=(0, .9))
optimizerD2 = optim.Adam(netD2.parameters(), lr=opt.lr, betas=(0, .9))
optimizerG = optim.Adam(netG.parameters(), lr=opt.lr, betas=(0, .9))
optimizerE = optim.Adam(netE.parameters(), lr=opt.lr, betas=(.5, .9))

start = opt.start

def adjust_learning_rate(optimizer, epoch, num_epochs):
    """Linearly decay the LR from its initial value at epoch 45 to zero at num_epochs."""
    lr = opt.lr - opt.lr * (epoch - 45) / (num_epochs - 45)
    # (excerpt truncated here)
def train_model(resume):
    with open(Path("./cfg/cfg.json").absolute()) as file:
        para = json.load(file)

    tensorboard_path = Path("./tensorboard/writer").absolute()
    checkpoint_dir = Path("./checkpoint").absolute()
    writer = SummaryWriter(tensorboard_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(in_channels=para['encoder']['in_channels'],
                      channels=para['encoder']['channels'],
                      n_embeddings=para['encoder']['n_embeddings'],
                      embedding_dim=para['encoder']['embedding_dim'],
                      jitter=para['encoder']['jitter'])
    decoder = Decoder(in_channels=para['decoder']['in_channels'],
                      conditioning_channels=para['decoder']['conditioning_channels'],
                      n_speakers=para['decoder']['n_speakers'],
                      speaker_embedding_dim=para['decoder']['speaker_embedding_dim'],
                      mu_embedding_dim=para['decoder']['mu_embedding_dim'],
                      rnn_channels=para['decoder']['rnn_channels'],
                      fc_channels=para['decoder']['fc_channels'],
                      bits=para['decoder']['bits'],
                      hop_length=para['decoder']['hop_length'])
    encoder.to(device)
    decoder.to(device)

    if resume:
        resume_path = Path("./checkpoint/model.pt").absolute()
        print("Resume checkpoint from: {}".format(str(resume_path)))
        checkpoint = torch.load(resume_path, map_location=lambda storage, loc: storage)
        print(checkpoint.keys())
        encoder.load_state_dict(checkpoint["encoder"])
        decoder.load_state_dict(checkpoint["decoder"])
        optimizer = optim.Adam(chain(encoder.parameters(), decoder.parameters()), lr=1e-5)
        # [encoder, decoder], optimizer = amp.initialize([encoder, decoder], optimizer, opt_level="O1")
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[300000, 400000], gamma=0.5)
        optimizer.load_state_dict(checkpoint["optimizer"])
        # amp.load_state_dict(checkpoint["amp"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        global_step = checkpoint["step"]
    else:
        global_step = 0
        optimizer = optim.Adam(chain(encoder.parameters(), decoder.parameters()), lr=1e-5)
        # [encoder, decoder], optimizer = amp.initialize([encoder, decoder], optimizer, opt_level="O1")
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[300000, 400000], gamma=0.5)

    sdataset = SpeechDataset(root='./preprocessed_file/train',
                             hop_length=para['preprocess']['hop_length'],
                             sr=para['preprocess']['sr'],
                             sample_frames=para['preprocess']['sample_frames'])
    print(len(sdataset))
    dataloader = DataLoader(dataset=sdataset,
                            batch_size=16,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True,
                            drop_last=True)
    print(len(dataloader))

    n_epochs = 1
    # start_epoch = global_step // len(dataloader) + 1
    for epoch in range(global_step, global_step + n_epochs):
        average_recon_loss = average_vq_loss = average_perplexity = 0
        for i, (audio, mels, speakers) in enumerate(tqdm(dataloader), 1):
            audio, mels, speakers = audio.to(device), mels.to(device), speakers.to(device)

            optimizer.zero_grad()
            z, vq_loss, perplexity = encoder(mels)
            output = decoder(audio[:, :-1], z, speakers)
            recon_loss = F.cross_entropy(output.transpose(1, 2), audio[:, 1:])
            loss = recon_loss + vq_loss
            loss.backward()
            # with amp.scale_loss(loss, optimizer) as scaled_loss:
            #     scaled_loss.backward()
            # NOTE: amp is disabled above, so clip the raw parameters instead
            # of amp.master_params(optimizer).
            torch.nn.utils.clip_grad_norm_(chain(encoder.parameters(), decoder.parameters()), 1)
            optimizer.step()
            scheduler.step()

            average_recon_loss += (recon_loss.item() - average_recon_loss) / i
            average_vq_loss += (vq_loss.item() - average_vq_loss) / i
            average_perplexity += (perplexity.item() - average_perplexity) / i

            global_step += 1
            save_checkpoint(encoder, decoder, optimizer, amp, scheduler,
                            global_step, checkpoint_dir)

        writer.add_scalar("recon_loss/train", average_recon_loss, global_step)
        writer.add_scalar("vq_loss/train", average_vq_loss, global_step)
        writer.add_scalar("average_perplexity", average_perplexity, global_step)
        print("epoch:{}, recon loss:{:.2E}, vq loss:{:.2E}, perplexity:{:.3f}"
              .format(epoch, average_recon_loss, average_vq_loss, average_perplexity))
def main():
    input_lang, output_lang, pairs, data1, data2 = read_langs("eng", "fra", True)
    input_tensor = [[input_lang.word2index[s] for s in es.split(' ')] for es in data1]
    target_tensor = [[output_lang.word2index[s] for s in es.split(' ')] for es in data2]
    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)

    input_tensor = [pad_sequences(x, max_length_inp) for x in input_tensor]
    target_tensor = [pad_sequences(x, max_length_tar) for x in target_tensor]
    print(len(target_tensor))

    input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = \
        train_test_split(input_tensor, target_tensor, test_size=0.2)

    # Show lengths
    print(len(input_tensor_train), len(target_tensor_train),
          len(input_tensor_val), len(target_tensor_val))

    BUFFER_SIZE = len(input_tensor_train)
    BATCH_SIZE = 64
    N_BATCH = BUFFER_SIZE // BATCH_SIZE
    embedding_dim = 256
    units = 1024
    vocab_inp_size = len(input_lang.word2index)
    vocab_tar_size = len(output_lang.word2index)

    train_dataset = MyData(input_tensor_train, target_tensor_train)
    val_dataset = MyData(input_tensor_val, target_tensor_val)
    dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                         drop_last=True, shuffle=True)

    device = torch.device("cpu")
    encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
    decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)
    encoder.to(device)
    decoder.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()),
                           lr=0.001)

    EPOCHS = 10
    for epoch in range(EPOCHS):
        start = time.time()  # the original mixed time() and time.time()
        encoder.train()
        decoder.train()
        total_loss = 0

        for (batch, (inp, targ, inp_len)) in enumerate(dataset):
            loss = 0
            xs, ys, lens = sort_batch(inp, targ, inp_len)
            enc_output, enc_hidden = encoder(xs.to(device), lens, device)
            dec_hidden = enc_hidden
            # Start every sequence in the batch with the <sos> token.
            dec_input = torch.tensor([[output_lang.word2index['<sos>']]] * BATCH_SIZE)
            # Teacher forcing: feed the ground-truth token at each step.
            for t in range(1, ys.size(1)):
                predictions, dec_hidden, _ = decoder(dec_input.to(device),
                                                     dec_hidden.to(device),
                                                     enc_output.to(device))
                loss += loss_function(criterion, ys[:, t].to(device), predictions.to(device))
                dec_input = ys[:, t].unsqueeze(1)

            batch_loss = (loss / int(ys.size(1)))
            total_loss += batch_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch + 1, batch, batch_loss.detach().item()))

        # TODO: Save checkpoint for model
        print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / N_BATCH))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
valid_loader = torch.utils.data.DataLoader(CaptionDataset(DATA_FOLDER, 'VAL'),
                                           batch_size=BATCH_SIZE,
                                           shuffle=True,
                                           num_workers=1,
                                           pin_memory=True)

# Optimizer
optimizer = torch.optim.Adam(decoder.parameters(), lr=LEARNING_RATE)

# Parameter count check (trainable decoder parameters only)
model_parameters = filter(lambda p: p.requires_grad, decoder.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print('\n>> {} parameters\n'.format(params))

encoder = encoder.to(DEVICE)
decoder = decoder.to(DEVICE)

# ================================ 3. TRAINING ================================

for epoch in range(START_EPOCH, START_EPOCH + N_EPOCHS):
    decoder.train()
    encoder.train()
    epoch_loss = 0.
    time = datetime.now()
    for i, (image, caption, length) in enumerate(tqdm(train_loader)):
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    if args.model_type == 'no_attention':
        encoder = Encoder(args.embed_size).to(device)
        decoder = Decoder(args.embed_size, args.hidden_size, len(vocab),
                          args.num_layers).to(device)
    elif args.model_type == 'attention':
        encoder = EncoderAtt(encoded_image_size=9).to(device)
        decoder = DecoderAtt(vocab, args.encoder_dim, args.hidden_size,
                             args.attention_dim, args.embed_size,
                             args.dropout_ratio, args.alpha_c).to(device)
    elif args.model_type == 'transformer':
        model = Transformer(len(vocab), args.embed_size,
                            args.transformer_layers, 8,
                            args.dropout_ratio).eval()
    else:
        print('Select model_type attention or no_attention')

    # Load the trained model parameters
    if args.model_type != 'transformer':
        encoder = encoder.to(device)
        decoder = decoder.to(device)
        encoder.load_state_dict(
            torch.load(args.encoder_path, map_location=torch.device('cpu')))
        decoder.load_state_dict(
            torch.load(args.decoder_path, map_location=torch.device('cpu')))
    else:
        model = model.to(device)
        model.load_state_dict(
            torch.load(args.model_path, map_location=torch.device('cpu')))

    filenames = os.listdir(args.image_dir)
    predicted = {}
    for file in tqdm(filenames):
        if file == '.DS_Store':
            continue

        # Prepare an image
        image = load_image(os.path.join(args.image_dir, file), transform)
        image_tensor = image.to(device)

        if args.model_type == 'attention':
            features = encoder(image_tensor)
            sampled_ids, _ = decoder.sample(features)
            sampled_ids = sampled_ids[0].cpu().numpy()
            sampled_caption = ['<start>']
        elif args.model_type == 'no_attention':
            features = encoder(image_tensor)
            sampled_ids = decoder.sample(features)
            sampled_ids = sampled_ids[0].cpu().numpy()
            sampled_caption = ['<start>']
        elif args.model_type == 'transformer':
            # Greedy decoding with a causal (upper-triangular) target mask.
            e_outputs = model.encoder(image_tensor)
            max_seq_length = 20
            sampled_ids = torch.zeros(max_seq_length, dtype=torch.long)
            sampled_ids[0] = torch.LongTensor([[vocab.word2idx['<start>']]]).to(device)
            for i in range(1, max_seq_length):
                trg_mask = np.triu(np.ones((1, i, i)), k=1).astype('uint8')
                trg_mask = Variable(torch.from_numpy(trg_mask) == 0).to(device)
                out = model.decoder(sampled_ids[:i].unsqueeze(0), e_outputs, trg_mask)
                out = model.out(out)
                out = F.softmax(out, dim=-1)
                val, ix = out[:, -1].data.topk(1)
                sampled_ids[i] = ix[0][0]
            sampled_ids = sampled_ids.cpu().numpy()
            sampled_caption = []

        # Convert word ids to words, stopping at the end token.
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)
        predicted[file] = sentence

    json.dump(predicted, open(args.predict_json, 'w'))
s_testloader = DataLoader(s_testset, batch_size=batch_size, shuffle=True)

# Target domain: USPS (specifying a transform here is not allowed).
t_trainset, t_testset = load_usps(data_per_class)
t_trainloader = DataLoader(t_trainset, batch_size=batch_size, shuffle=True)
t_testloader = DataLoader(t_testset, batch_size=64, shuffle=True)

net_g = Encoder()
net_h = classifier()
net_DCD = DCD()

loss_func = torch.nn.CrossEntropyLoss()  # shared loss function

# Part 1: initial training of g (encoder) and h (classifier) on the source domain.
print("part 1 : initial training for g and h")
optimizer = torch.optim.Adam(list(net_g.parameters()) + list(net_h.parameters()),
                             lr=0.001)  # one optimizer updates both networks

net_g = net_g.to(device)
net_h = net_h.to(device)
net_DCD = net_DCD.to(device)
if not device == "cpu":
    net_g = nn.DataParallel(net_g)
    net_h = nn.DataParallel(net_h)
    net_DCD = nn.DataParallel(net_DCD)

for epoch in range(num_ep_init_gh):
    for data, label in s_trainloader:
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        pred = net_h(net_g(data))
        loss = loss_func(pred, label)
        loss.backward()
        optimizer.step()
def main():
    device = torch.device('cuda:0')
    n_features = 256
    n_epochs = 40
    batch_size = 64
    skip_training = False

    # Prepare data first: the vocabularies are needed to size the model.
    # (The original built the model before trainset existed, which would
    # raise a NameError.)
    data_dir = tools.select_data_dir()
    trainset = TranslationDataset(data_dir, train=True)
    trainloader = DataLoader(dataset=trainset,
                             batch_size=batch_size,
                             shuffle=True,
                             collate_fn=collate,
                             pin_memory=True)

    # Create the transformer model
    encoder = Encoder(src_vocab_size=trainset.input_lang.n_words,
                      n_blocks=3, n_features=n_features, n_heads=16,
                      n_hidden=1024)
    decoder = Decoder(tgt_vocab_size=trainset.output_lang.n_words,
                      n_blocks=3, n_features=n_features, n_heads=16,
                      n_hidden=1024)
    encoder.to(device)
    decoder.to(device)

    # Define training-loop parameters
    parameters = list(encoder.parameters()) + list(decoder.parameters())
    adam = torch.optim.Adam(parameters, lr=0, betas=(0.9, 0.98), eps=1e-9)
    optimizer = NoamOptimizer(n_features, 2, 10000, adam)
    loss_method = nn.NLLLoss(ignore_index=0, reduction='mean')

    # Training
    if not skip_training:
        for epoch in range(n_epochs):
            loss = training_loop(encoder, decoder, optimizer, loss_method, trainloader)
            print(f'Train Epoch {epoch+1}: Loss: {loss}')
        # Save the trained model
        tools.save_model(encoder, 'tr_encoder.pth')
        tools.save_model(decoder, 'tr_decoder.pth')
    else:
        # Load a previously trained model
        encoder = Encoder(src_vocab_size=trainset.input_lang.n_words,
                          n_blocks=3, n_features=256, n_heads=16, n_hidden=1024)
        tools.load_model(encoder, 'tr_encoder.pth', device)
        decoder = Decoder(tgt_vocab_size=trainset.output_lang.n_words,
                          n_blocks=3, n_features=256, n_heads=16, n_hidden=1024)
        tools.load_model(decoder, 'tr_decoder.pth', device)

    # Generate translations with the trained model.
    # Translate sentences from the training set:
    print('Translate training data:')
    print('-----------------------------')
    for _ in range(5):
        src_sentence, tgt_sentence = trainset[np.random.choice(len(trainset))]
        print('>', ' '.join(trainset.input_lang.index2word[i.item()] for i in src_sentence))
        print('=', ' '.join(trainset.output_lang.index2word[i.item()] for i in tgt_sentence))
        out_sentence = translate(encoder, decoder, src_sentence)
        print('<', ' '.join(trainset.output_lang.index2word[i.item()] for i in out_sentence), '\n')

    # Translate sentences from the test set:
    testset = TranslationDataset(data_dir, train=False)
    print('Translate test data:')
    print('-----------------------------')
    for _ in range(5):
        input_sentence, target_sentence = testset[np.random.choice(len(testset))]
        print('>', ' '.join(testset.input_lang.index2word[i.item()] for i in input_sentence))
        print('=', ' '.join(testset.output_lang.index2word[i.item()] for i in target_sentence))
        output_sentence = translate(encoder, decoder, input_sentence)
        print('<', ' '.join(testset.output_lang.index2word[i.item()] for i in output_sentence), '\n')
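# NoamOptimizer is not shown in this file; a common implementation follows the
# warmup-then-decay schedule from "Attention Is All You Need":
#     lr = factor * d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)
# A minimal sketch under that assumption (names here are illustrative; the
# arguments mirror NoamOptimizer(n_features, 2, 10000, adam) above):
def noam_lr(step, d_model=256, factor=2, warmup=10000):
    """Learning rate at a given step (step counting starts at 1)."""
    return factor * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# The rate rises linearly for `warmup` steps, then decays as step**-0.5.
print(noam_lr(1), noam_lr(10000), noam_lr(100000))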
def DDF(cfg):
    filter_list_path = Path(utils.to_absolute_path(cfg.filter_list))
    with open(filter_list_path) as file:
        filter_list = json.load(file)

    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])
    encoder.eval()
    decoder.eval()

    meter = pyloudnorm.Meter(cfg.preprocessing.sr)

    # ---------------------------------------
    if cfg.privacy_preference == "Low":
        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            # librosa.load returns the audio time series and its sampling rate.
            wav, _ = librosa.load(wav_path.with_suffix(".wav"), sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999
            path = out_dir / out_filename

            # Return the raw recording as a mel-spectrogram, without filtering.
            if cfg.output_type == "Embedding":
                mel = librosa.feature.melspectrogram(
                    preemphasis(wav, cfg.preprocessing.preemph),
                    sr=cfg.preprocessing.sr,
                    n_fft=cfg.preprocessing.n_fft,
                    n_mels=cfg.preprocessing.n_mels,
                    hop_length=cfg.preprocessing.hop_length,
                    win_length=cfg.preprocessing.win_length,
                    fmin=cfg.preprocessing.fmin,
                    power=1)
                logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
                logmel = logmel / cfg.preprocessing.top_db + 1
                mel = torch.FloatTensor(logmel).squeeze().numpy()
                np.savetxt(path.with_suffix(".mel.txt"), mel)

            # Return the raw recording as a waveform, without filtering.
            if cfg.output_type == "Recording":
                librosa.output.write_wav(path.with_suffix(".wav"),
                                         wav.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

    # ---------------------------------------
    if cfg.privacy_preference == "Moderate":
        dataset_path = Path(utils.to_absolute_path("Training/Datasets")) / cfg.dataset.path
        with open(dataset_path / "speakers.json") as file:
            speakers = sorted(json.load(file))

        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            wav, _ = librosa.load(wav_path.with_suffix(".wav"), sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999

            mel = librosa.feature.melspectrogram(
                preemphasis(wav, cfg.preprocessing.preemph),
                sr=cfg.preprocessing.sr,
                n_fft=cfg.preprocessing.n_fft,
                n_mels=cfg.preprocessing.n_mels,
                hop_length=cfg.preprocessing.hop_length,
                win_length=cfg.preprocessing.win_length,
                fmin=cfg.preprocessing.fmin,
                power=1)
            logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
            logmel = logmel / cfg.preprocessing.top_db + 1

            mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
            speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
            path = out_dir / out_filename

            if cfg.output_type == "Recording":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    output = decoder.generate(vq, speaker)
                output_loudness = meter.integrated_loudness(output)
                output = pyloudnorm.normalize.loudness(output, output_loudness, ref_loudness)
                librosa.output.write_wav(path.with_suffix(".wav"),
                                         output.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

            if cfg.output_type == "Embedding":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    speaker = decoder.speaker(speaker)
                # Move to the CPU before converting to numpy (the original
                # called .to(device).numpy(), which fails on CUDA tensors).
                vq = vq.squeeze().cpu().numpy()
                speaker = speaker.squeeze().cpu().numpy()
                np.savetxt(path.with_suffix(".vq.txt"), vq)
                np.savetxt(path.with_suffix(".speaker.txt"), speaker)

    # ---------------------------------------
    if cfg.privacy_preference == "High":
        dataset_path = Path(utils.to_absolute_path("Training/Datasets")) / cfg.dataset.path
        with open(dataset_path / "speakers.json") as file:
            speakers = sorted(json.load(file))

        for wav_path, speaker_id, out_filename in tqdm(filter_list):
            wav_path = in_dir / wav_path
            wav, _ = librosa.load(wav_path.with_suffix(".wav"), sr=cfg.preprocessing.sr)
            ref_loudness = meter.integrated_loudness(wav)
            wav = wav / np.abs(wav).max() * 0.999

            mel = librosa.feature.melspectrogram(
                preemphasis(wav, cfg.preprocessing.preemph),
                sr=cfg.preprocessing.sr,
                n_fft=cfg.preprocessing.n_fft,
                n_mels=cfg.preprocessing.n_mels,
                hop_length=cfg.preprocessing.hop_length,
                win_length=cfg.preprocessing.win_length,
                fmin=cfg.preprocessing.fmin,
                power=1)
            logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
            logmel = logmel / cfg.preprocessing.top_db + 1

            mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
            speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
            path = out_dir / out_filename

            if cfg.output_type == "Recording":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                    output = decoder.generate(vq, speaker)
                output_loudness = meter.integrated_loudness(output)
                output = pyloudnorm.normalize.loudness(output, output_loudness, ref_loudness)
                librosa.output.write_wav(path.with_suffix(".wav"),
                                         output.astype(np.float32),
                                         sr=cfg.preprocessing.sr)

            if cfg.output_type == "Embedding":
                with torch.no_grad():
                    vq, _ = encoder.encode(mel)
                vq = vq.squeeze().cpu().numpy()
                np.savetxt(path.with_suffix(".vq.txt"), vq)
embedding_sd = checkpoint['embedding']
embedding = nn.Embedding(Config.vocab_size, Config.hidden_size)
embedding.load_state_dict(embedding_sd)

encoder = Encoder(embedding)
attn_model = 'dot'
decoder = Decoder(attn_model, embedding)
encoder.load_state_dict(encoder_sd)
decoder.load_state_dict(decoder_sd)
encoder = encoder.to(Config.device)
decoder = decoder.to(Config.device)

# Set dropout layers to eval mode
encoder.eval()
decoder.eval()

# Initialize the greedy search module
searcher = GreedySearchDecoder(encoder, decoder)

vocab2id = json.load(open('./data/vocab2id.json', 'r'))
id2vocab = json.load(open('./data/id2vocab.json', 'r'))
print(id2vocab)

# Begin chatting
evaluateInput(encoder, decoder, searcher, vocab2id, id2vocab)
def train(save_path, checkpoint, data_root, batch_size, dataset):
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    save_path = Path(save_path)  # needed below for the `/` path operator
    transform = transforms.Compose(
        [transforms.Resize((128, 128)), transforms.ToTensor()])
    # Note: this custom target transform is defined but never used below;
    # both datasets receive `transform` for images and targets alike.
    target_transform = transforms.Compose(
        [transforms.Resize((128, 128)), ToTensor()])
    if dataset == 'cityscapes':
        train_data = Cityscapes(str(data_root), split='train', mode='fine',
                                target_type='semantic', transform=transform,
                                target_transform=transform)
        eG = 35
        dG = [35, 35, 20, 14, 10, 4, 1]
        eC = 8
        dC = 280
        n_classes = len(Cityscapes.classes)
        update_lr = update_lr_default
        epochs = 200
    else:
        train_data = Deepfashion(str(data_root), split='train',
                                 transform=transform,
                                 target_transform=transform)
        n_classes = len(Deepfashion.eclasses)
        eG = 8
        eC = 64
        dG = [8, 8, 4, 4, 2, 2, 1]
        dC = 160
        update_lr = update_lr_deepfashion
        epochs = 100
    data_loader = torch.utils.data.DataLoader(train_data,
                                              batch_size=batch_size,
                                              num_workers=1)
    os.makedirs(save_path, exist_ok=True)
    n_channels = 3
    encoder = Encoder(n_classes * n_channels, C=eC, G=eG)
    decoder = Decoder(8 * eG, n_channels, n_classes, C=dC, Gs=dG)
    discriminator = Discriminator(n_classes + n_channels)
    vgg = Vgg19().eval()
    encoder = torch.nn.DataParallel(encoder)
    decoder = torch.nn.DataParallel(decoder)
    discriminator = torch.nn.DataParallel(discriminator)
    vgg = torch.nn.DataParallel(vgg)
    gen_opt = optim.Adam(list(encoder.parameters()) +
                         list(decoder.parameters()),
                         lr=0.0001, betas=(0, 0.9))
    dis_opt = optim.Adam(discriminator.parameters(), lr=0.0004, betas=(0, 0.9))
    gen_scheduler = optim.lr_scheduler.LambdaLR(gen_opt, update_lr)
    # Fixed: the discriminator scheduler must wrap dis_opt, not gen_opt.
    dis_scheduler = optim.lr_scheduler.LambdaLR(dis_opt, update_lr)
    params = [
        'encoder', 'decoder', 'discriminator', 'gen_opt', 'dis_opt',
        'gen_scheduler', 'dis_scheduler'
    ]
    if os.path.exists(checkpoint):
        cp = torch.load(checkpoint)
        print(f'Load checkpoint: {checkpoint}')
        # eval(param) resolves each name in `params` to the local above.
        for param in params:
            eval(param).load_state_dict(cp[param])

    def to_device_optimizer(opt):
        # Optimizer state restored from disk may live on the wrong device.
        for state in opt.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)

    to_device_optimizer(gen_opt)
    to_device_optimizer(dis_opt)
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    discriminator = discriminator.to(device)
    vgg = vgg.to(device)
    print(len(data_loader))  # batches per epoch
    for epoch in range(epochs):
        e_g_loss = []
        e_d_loss = []
        for i, batch in tqdm(enumerate(data_loader)):
            x, sem = batch
            x = x.to(device)
            sem = sem.to(device)
            sem = sem * 255.0  # undo ToTensor's [0, 1] scaling to recover class ids
            sem = sem.long()
            s = split_class(x, sem, n_classes)
            sem_target = sem.clone()
            del sem
            # One-hot encode the semantic map: (N, n_classes, H, W).
            sem = torch.zeros(x.size()[0], n_classes, sem_target.size()[2],
                              sem_target.size()[3], device=x.device)
            sem.scatter_(1, sem_target, 1)
            s = s.detach()
            s = s.to(device)
            mu, sigma = encoder(s)
            # Reparameterization trick; sigma is the log-variance.
            # Fixed: sample epsilon from N(0, I) with randn rather than the
            # uniform rand of the original.
            z = mu + torch.exp(0.5 * sigma) * torch.randn(mu.size(),
                                                          device=mu.device)
            gen = decoder(z, sem)
            d_fake = discriminator(gen, sem)
            d_real = discriminator(x, sem)
            l1loss = nn.L1Loss()
            gen_opt.zero_grad()
            loss_gen = 0.5 * d_fake[0][-1].mean() + 0.5 * d_fake[1][-1].mean()
            # Feature-matching loss over all discriminator layers.
            loss_fm = sum([
                sum([l1loss(f, g) for f, g in zip(fs, rs)])
                for fs, rs in zip(d_fake, d_real)
            ]).mean()
            f_fake = vgg(gen)
            f_real = vgg(x)
            # Perceptual loss over five VGG19 feature maps.
            loss_p = 1.0 / 32 * l1loss(f_fake[0], f_real[0]) + \
                1.0 / 16 * l1loss(f_fake[1], f_real[1]) + \
                1.0 / 8 * l1loss(f_fake[2], f_real[2]) + \
                1.0 / 4 * l1loss(f_fake[3], f_real[3]) + \
                l1loss(f_fake[4], f_real[4])
            loss_kl = -0.5 * torch.sum(1 + sigma - mu * mu - torch.exp(sigma))
            loss = loss_gen + 10.0 * loss_fm + 10.0 * loss_p + 0.05 * loss_kl
            loss.backward(retain_graph=True)
            gen_opt.step()
            dis_opt.zero_grad()
            # Hinge loss for both discriminator scales.
            loss_dis = torch.mean(
                -torch.mean(torch.min(d_real[0][-1] - 1,
                                      torch.zeros_like(d_real[0][-1]))) +
                -torch.mean(torch.min(-d_fake[0][-1] - 1,
                                      torch.zeros_like(d_fake[0][-1])))) + \
                torch.mean(
                -torch.mean(torch.min(d_real[1][-1] - 1,
                                      torch.zeros_like(d_real[1][-1]))) +
                -torch.mean(torch.min(-d_fake[1][-1] - 1,
                                      torch.zeros_like(d_fake[1][-1]))))
            loss_dis.backward()
            dis_opt.step()
            e_g_loss.append(loss.item())
            e_d_loss.append(loss_dis.item())
            os.makedirs(save_path / str(epoch), exist_ok=True)
            Image.fromarray(
                (gen.detach().cpu().numpy()[0].transpose(1, 2, 0) *
                 255.0).astype(np.uint8)).save(
                     save_path / str(epoch) / f'{i}.png')
        print('g_loss', np.mean(e_g_loss), 'd_loss', np.mean(e_d_loss))
        # Save the latest checkpoint.
        cp = {param: eval(param).state_dict() for param in params}
        torch.save(cp, save_path / 'latest.pth')
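# For reference, the adversarial objectives used above in isolation: a
# minimal, self-contained sketch of the hinge losses and the KL term from the
# training loop, written against plain tensors. relu(1 - x) is the same
# quantity as -min(x - 1, 0) used above; names and shapes here are
# illustrative only. Note the textbook generator hinge loss is -mean(D(fake)).
import torch

def d_hinge_loss(d_real, d_fake):
    # Discriminator hinge loss: push real scores above +1, fake below -1.
    return (torch.relu(1.0 - d_real).mean() +
            torch.relu(1.0 + d_fake).mean())

def g_hinge_loss(d_fake):
    # Generator hinge loss: raise the discriminator's score on fakes.
    return -d_fake.mean()

def kl_loss(mu, logvar):
    # KL(N(mu, exp(logvar)) || N(0, I)), summed over all elements.
    return -0.5 * torch.sum(1 + logvar - mu * mu - torch.exp(logvar))

if __name__ == '__main__':
    scores_real = torch.randn(8, 1)
    scores_fake = torch.randn(8, 1)
    print(d_hinge_loss(scores_real, scores_fake).item())
    print(g_hinge_loss(scores_fake).item())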
def convert(cfg):
    dataset_path = Path(utils.to_absolute_path(
        "datasets")) / cfg.dataset.path  # zerospeech/datasets/2019/english
    # speakers.json lists the speaker names; keep them sorted as `speakers`.
    with open(dataset_path / "speakers.json") as file:
        speakers = sorted(json.load(file))
    # cfg.synthesis_list is marked ??? in the config, so it has to be supplied
    # on the command line; see synthesis.json under datasets/2019/english.
    synthesis_list_path = Path(utils.to_absolute_path(cfg.synthesis_list))
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(file)
    # cfg.in_dir is also ???; pointing it at the zerospeech folder (./) works.
    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    # cfg.out_dir is ???; the directory where the converted voices are written.
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)
    # Fall back to the CPU if no GPU is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = Encoder(**cfg.model.encoder)  # encoder config in ZeroSpeech/config/model/default
    decoder = Decoder(**cfg.model.decoder)  # decoder config in ZeroSpeech/config/model/default
    encoder.to(device)
    decoder.to(device)
    # cfg.checkpoint is ???; point it at a pretrained model, or at your own
    # checkpoint if you have trained one.
    print("Load checkpoint from: {}:".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    # Load the weights stored in the checkpoint.
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])
    encoder.eval()
    decoder.eval()
    # Loudness meter at sr=16000; see
    # https://www.christiansteinmetz.com/projects-blog/pyloudnorm
    meter = pyloudnorm.Meter(cfg.preprocessing.sr)
    # Each entry looks like ["english/test/S002_0379088085", "V002", "V002_0379088085"].
    for wav_path, speaker_id, out_filename in tqdm(synthesis_list):
        wav_path = in_dir / wav_path  # e.g. ./english/test/S002_0379088085
        wav, _ = librosa.load(wav_path.with_suffix(".wav"),
                              sr=cfg.preprocessing.sr)
        ref_loudness = meter.integrated_loudness(wav)  # measure the input's loudness
        wav = wav / np.abs(wav).max() * 0.999
        mel = librosa.feature.melspectrogram(
            preemphasis(wav, cfg.preprocessing.preemph),
            sr=cfg.preprocessing.sr,
            n_fft=cfg.preprocessing.n_fft,
            n_mels=cfg.preprocessing.n_mels,
            hop_length=cfg.preprocessing.hop_length,
            win_length=cfg.preprocessing.win_length,
            fmin=cfg.preprocessing.fmin,
            power=1)
        logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
        logmel = logmel / cfg.preprocessing.top_db + 1
        # unsqueeze() inserts a new (batch) dimension at the given position.
        # https://subinium.github.io/pytorch-Tensor-Variable/#%EB%8D%94%EB%AF%B8-%EC%B0%A8%EC%9B%90-%EC%B6%94%EA%B0%80%EC%99%80-%EC%82%AD%EC%A0%9C--squeeze--unsqueeze
        # https://datascienceschool.net/view-notebook/4f3606fd839f4320a4120a56eec1e228/
        mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
        # Tensors are typed: torch.FloatTensor holds 32-bit floats and
        # torch.LongTensor 64-bit signed integers (torch.cuda.FloatTensor is
        # the GPU variant). The mel spectrogram is floating point, while the
        # speaker index is integral, hence LongTensor here.
        speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
        # Disable autograd tracking for inference.
        # https://bob3rdnewbie.tistory.com/315
        with torch.no_grad():
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)
        # Measure the output's loudness and match it to the input wav's.
        output_loudness = meter.integrated_loudness(output)
        output = pyloudnorm.normalize.loudness(output, output_loudness,
                                               ref_loudness)
        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"),
                                 output.astype(np.float32),
                                 sr=cfg.preprocessing.sr)
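# The loudness matching above in isolation: a minimal sketch of measuring a
# reference signal's integrated loudness with pyloudnorm and normalizing
# another signal to it. The file name is a placeholder, and the second signal
# is random noise standing in for model output.
import numpy as np
import pyloudnorm
import soundfile as sf

rate = 16000
ref, _ = sf.read("reference.wav")             # placeholder path, 16 kHz mono
out = np.random.uniform(-0.5, 0.5, rate * 2)  # stand-in for decoder output

meter = pyloudnorm.Meter(rate)                 # ITU-R BS.1770 loudness meter
ref_loudness = meter.integrated_loudness(ref)  # LUFS of the reference
out_loudness = meter.integrated_loudness(out)  # LUFS of the output
# Rescale `out` so its integrated loudness matches the reference's.
matched = pyloudnorm.normalize.loudness(out, out_loudness, ref_loudness)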
def test(args):
    '''Compute the BLEU score for every image and report the averages.'''
    train_json_path = './data/annotations/captions_train2014.json'
    test_json_path = './data/annotations/captions_val2014.json'
    train_image_dir = './data/train2014'
    test_image_dir = './data/val2014'
    if args.eval == 'eval':
        print('eval bleu')
        jsonPath = test_json_path
        image_dir = test_image_dir
    else:
        print('train bleu')
        jsonPath = train_json_path
        image_dir = train_image_dir
    # Image preprocessing: at generation time we must not random-crop, only
    # convert and normalize.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    # Load the vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    # Build the models.
    encoder = Encoder(embed_size=args.embed_size).eval()
    decoder = Decoder(stateful=False,
                      embed_size=args.embed_size,
                      hidden_size=args.hidden_size,
                      vocab_size=len(vocab),
                      num_layers=args.num_layers).eval()
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    # Load the trained model parameters.
    encoder.load_state_dict(torch.load(args.encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(args.decoder_path, map_location=device))
    name_caption_frame = get_image_name(jsonPath)
    unique_image_names = pd.unique(name_caption_frame['file_name'])
    # Prepend the image directory (train2014 or val2014).
    unique_image_names = [
        os.path.join(image_dir, image_name)
        for image_name in unique_image_names
    ]
    total_generated_score4, total_theoretical_score4 = [], []
    total_generated_score3, total_theoretical_score3 = [], []
    total_generated_score2, total_theoretical_score2 = [], []
    total_generated_score1, total_theoretical_score1 = [], []

    def score_helper(image_path):
        caption = generate_caption(image_path, vocab, encoder, decoder,
                                   transform)
        generated_score4, theoretical_score4 = bleu4_score(
            image_path, caption, name_caption_frame)
        total_generated_score4.append(generated_score4)
        total_theoretical_score4.append(theoretical_score4)
        generated_score3, theoretical_score3 = bleu3_score(
            image_path, caption, name_caption_frame)
        total_generated_score3.append(generated_score3)
        total_theoretical_score3.append(theoretical_score3)
        generated_score2, theoretical_score2 = bleu2_score(
            image_path, caption, name_caption_frame)
        total_generated_score2.append(generated_score2)
        total_theoretical_score2.append(theoretical_score2)
        generated_score1, theoretical_score1 = bleu1_score(
            image_path, caption, name_caption_frame)
        total_generated_score1.append(generated_score1)
        total_theoretical_score1.append(theoretical_score1)

    # Score every image (pd.Series.apply runs sequentially, not in parallel).
    _ = pd.Series(unique_image_names).apply(score_helper)
    print('Average bleu-4 score:',
          sum(total_generated_score4) / len(total_generated_score4),
          ' | Average theoretical bleu-4 score:',
          sum(total_theoretical_score4) / len(total_theoretical_score4))
    print('Average bleu-3 score:',
          sum(total_generated_score3) / len(total_generated_score3),
          ' | Average theoretical bleu-3 score:',
          sum(total_theoretical_score3) / len(total_theoretical_score3))
    print('Average bleu-2 score:',
          sum(total_generated_score2) / len(total_generated_score2),
          ' | Average theoretical bleu-2 score:',
          sum(total_theoretical_score2) / len(total_theoretical_score2))
    print('Average bleu-1 score:',
          sum(total_generated_score1) / len(total_generated_score1),
          ' | Average theoretical bleu-1 score:',
          sum(total_theoretical_score1) / len(total_theoretical_score1))
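# The bleu{1..4}_score helpers are not shown in this file. A minimal sketch of
# a BLEU-4 scorer using NLTK follows; the "theoretical" score here is read as
# the best leave-one-out BLEU of each reference caption against the others,
# which is one plausible interpretation of the name. The tokenization and the
# function name are assumptions, not this repo's code.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def bleu4_score_sketch(candidate, references):
    """candidate: str; references: list of str (human captions)."""
    smooth = SmoothingFunction().method1
    refs = [r.lower().split() for r in references]
    generated = sentence_bleu(refs, candidate.lower().split(),
                              weights=(0.25, 0.25, 0.25, 0.25),
                              smoothing_function=smooth)
    # Upper-bound estimate: score each reference against the remaining ones.
    theoretical = max(
        sentence_bleu(refs[:i] + refs[i + 1:], refs[i],
                      weights=(0.25, 0.25, 0.25, 0.25),
                      smoothing_function=smooth)
        for i in range(len(refs)))
    return generated, theoretical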
start_epoch = checkpoint['epoch'] + 1
epochs_since_improvement = checkpoint['epochs_since_improvement']
best_bleu4 = checkpoint['bleu-4']
decoder = checkpoint['decoder']
decoder_optimizer = checkpoint['decoder_optimizer']
encoder = checkpoint['encoder']
encoder_optimizer = checkpoint['encoder_optimizer']
if fine_tune_encoder is True and encoder_optimizer is None:
    encoder.fine_tune(fine_tune_encoder)
    encoder_optimizer = torch.optim.Adam(
        params=filter(lambda p: p.requires_grad, encoder.parameters()),
        lr=encoder_lr)

# Move to GPU, if available
decoder = decoder.to(device)
encoder = encoder.to(device)

# Loss function
criterion = nn.CrossEntropyLoss().to(device)

# Custom dataloaders
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
train_loader = torch.utils.data.DataLoader(
    CaptionDataset(data_folder, data_name, 'TRAIN',
                   transform=transforms.Compose([normalize])),
    batch_size=batch_size,
    shuffle=True,
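# encoder.fine_tune() is invoked above but defined elsewhere. A common pattern
# for it, and a sketch only under assumed architecture (a ResNet backbone with
# its pooling/fc head removed), is to freeze the CNN and optionally unfreeze
# its last convolutional blocks:
import torch.nn as nn
import torchvision

class EncoderSketch(nn.Module):
    def __init__(self):
        super().__init__()
        resnet = torchvision.models.resnet101(weights=None)
        # Keep only the convolutional feature extractor.
        self.resnet = nn.Sequential(*list(resnet.children())[:-2])
        self.fine_tune(False)

    def fine_tune(self, fine_tune=True):
        # Freeze everything first...
        for p in self.resnet.parameters():
            p.requires_grad = False
        # ...then optionally allow gradients through the later blocks
        # (children 5 onward are layer2..layer4 in a ResNet).
        for child in list(self.resnet.children())[5:]:
            for p in child.parameters():
                p.requires_grad = fine_tune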
def main():
    global epochs_since_improvement, best_loss_tr
    encoder = Encoder()
    decoder = DecoderWithAttention(encoder_dim, lstm_input_dim, decoder_dim,
                                   attention_dim, output_dim)
    encoder_optimizer = torch.optim.Adam(
        params=filter(lambda p: p.requires_grad, encoder.parameters()),
        lr=encoder_lr)
    decoder_optimizer = torch.optim.Adam(
        params=filter(lambda p: p.requires_grad, decoder.parameters()),
        lr=decoder_lr)
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    trainLoader = torch.utils.data.DataLoader(
        Dataset(driver, circuit_tr, curvatureLength, historyLength,
                predLength),
        batch_size=batch_size,
        shuffle=True,
        num_workers=workers,
        pin_memory=True)
    # Normalization statistics computed on the training set and reused for
    # the validation set.
    cMean_tr = trainLoader.dataset.cMean
    cStd_tr = trainLoader.dataset.cStd
    vMean_tr = trainLoader.dataset.vMean
    vStd_tr = trainLoader.dataset.vStd
    aMean_tr = trainLoader.dataset.aMean
    aStd_tr = trainLoader.dataset.aStd
    validLoader = torch.utils.data.DataLoader(
        Dataset(driver, circuit_vl, curvatureLength, historyLength, predLength,
                cMean=cMean_tr, cStd=cStd_tr,
                vMean=vMean_tr, vStd=vStd_tr,
                aMean=aMean_tr, aStd=aStd_tr),
        batch_size=batch_size,
        shuffle=True,
        num_workers=workers,
        pin_memory=True)
    print('Training version.{} (A->V)'.format(vNumber))
    print('Training data ({} - {})'.format(driver, circuit_tr))
    print('Validation data ({} - {})'.format(driver, circuit_vl))
    print('curvature len {}'.format(curvatureLength))
    print('history len {}'.format(historyLength))
    print('pred len {}'.format(predLength))
    print('hiddenDimension {}'.format(hiddenDimension))
    print('\nTraining...\n')
    for epoch in tqdm(range(start_epoch, epochs)):
        loss, vMape, vRmse, vCorr, aCorr = train(
            trainLoader=trainLoader,
            encoder=encoder,
            decoder=decoder,
            criterion=criterion,
            encoder_optimizer=encoder_optimizer,
            decoder_optimizer=decoder_optimizer,
            epoch=epoch)
        writer.add_scalars('Loss', {'tr': loss}, epoch)
        writer.add_scalars('MAPE', {'tr': vMape}, epoch)
        writer.add_scalars('RMSE', {'tr': vRmse}, epoch)
        writer.add_scalars('vCorr', {'tr': vCorr}, epoch)
        writer.add_scalars('aCorr', {'tr': aCorr}, epoch)
        is_best = loss < best_loss_tr
        best_loss_tr = min(loss, best_loss_tr)
        if not is_best:
            epochs_since_improvement += 1
            print('\nEpoch {}: epochs since last improvement: {}\n'.format(
                epoch, epochs_since_improvement))
        else:
            epochs_since_improvement = 0
        # Decay both learning rates after every 8 epochs without improvement.
        if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
            adjust_learning_rate(epoch, encoder_optimizer, 0.8)
            adjust_learning_rate(epoch, decoder_optimizer, 0.8)
        if epoch % 5 == 0:
            loss_vl, vMape_vl, vRmse_vl, vCorr_vl, aCorr_vl = validate(
                validLoader=validLoader,
                encoder=encoder,
                decoder=decoder,
                criterion=criterion)
            writer.add_scalars('Loss', {'vl': loss_vl}, epoch)
            writer.add_scalars('MAPE', {'vl': vMape_vl}, epoch)
            writer.add_scalars('RMSE', {'vl': vRmse_vl}, epoch)
            writer.add_scalars('vCorr', {'vl': vCorr_vl}, epoch)
            writer.add_scalars('aCorr', {'vl': aCorr_vl}, epoch)
        if epoch % 10 == 0:
            save_checkpoint(chptFolderPath, encoder, decoder, epoch, cMean_tr,
                            cStd_tr, vMean_tr, vStd_tr, aMean_tr, aStd_tr,
                            curvatureLength, historyLength)
    writer.close()
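# adjust_learning_rate() is called above but not shown. A minimal sketch that
# matches the call signature used here (epoch, optimizer, shrink factor); the
# epoch argument is assumed to exist only for logging. This is a guess at the
# helper, not the repo's definition.
def adjust_learning_rate(epoch, optimizer, shrink_factor):
    # Multiply every parameter group's learning rate by shrink_factor.
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print('Epoch {}: learning rate decayed to {:.6f}'.format(
        epoch, optimizer.param_groups[0]['lr']))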
def main():
    epochs = 1000
    batch_size = 256
    hidden_dim = 128
    encoder = Encoder(num_words, hidden_dim, n_layers=args.n_layers,
                      bidirectional=args.bidirectional).to(device)
    if args.attn:
        decoder = AttnDecoder(hidden_dim, num_words, max_seqlen,
                              n_layers=args.n_layers).to(device)
    else:
        decoder = Decoder(hidden_dim, num_words,
                          n_layers=args.n_layers).to(device)
    if args.train:
        # Mask out padding when computing the loss.
        weight = torch.ones(num_words)
        weight[word2idx[PAD_TOKEN]] = 0
        encoder = encoder.to(device)
        decoder = decoder.to(device)
        weight = weight.to(device)
        encoder_optimizer = Adam(encoder.parameters(), lr=0.001)
        decoder_optimizer = Adam(decoder.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss(ignore_index=word2idx[PAD_TOKEN])
        np.random.seed(1124)
        order = np.arange(len(train_X))
        best_loss = 1e10
        best_percentage = 0
        best_percentage_epoch = 0
        best_epoch = 0
        start_epoch = 0
        if args.resume:
            start_epoch, best_loss = load_checkpoint(args.model_path, encoder,
                                                     encoder_optimizer,
                                                     decoder,
                                                     decoder_optimizer)
        for e in range(start_epoch, start_epoch + epochs):
            # Early stopping: give up after 2 epochs without improvement.
            if e - best_percentage_epoch > 2:
                break
            np.random.shuffle(order)
            shuffled_train_X = train_X[order]
            shuffled_train_Y = train_Y[order]
            train_loss = 0
            valid_loss = 0
            n_train_batches = int(len(order) // batch_size)
            for b in tqdm(range(n_train_batches)):
                batch_x = torch.LongTensor(
                    shuffled_train_X[b * batch_size:(b + 1) *
                                     batch_size].tolist()).t()
                batch_y = torch.LongTensor(
                    shuffled_train_Y[b * batch_size:(b + 1) *
                                     batch_size].tolist()).t()
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                train_loss += train(batch_x, batch_y, encoder, decoder,
                                    encoder_optimizer, decoder_optimizer,
                                    criterion)
            # Fixed: average over the batch count, not the last loop index.
            train_loss /= n_train_batches
            all_control_cnt, all_hit_cnt = [], []
            n_valid_batches = len(valid_X) // batch_size
            for b in range(n_valid_batches):
                batch_x = torch.LongTensor(
                    valid_X[b * batch_size:(b + 1) * batch_size].tolist()).t()
                batch_y = torch.LongTensor(
                    valid_Y[b * batch_size:(b + 1) * batch_size].tolist()).t()
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                val_loss, control_cnt, hit_cnt = valid(batch_x, batch_y,
                                                       encoder, decoder,
                                                       encoder_optimizer,
                                                       decoder_optimizer,
                                                       criterion)
                valid_loss += val_loss
                all_control_cnt.extend(control_cnt)
                all_hit_cnt.extend(hit_cnt)
            valid_loss /= n_valid_batches
            all_control_cnt = np.array(all_control_cnt)
            all_hit_cnt = np.array(all_hit_cnt)
            # Drop samples with no control tokens to avoid division by zero.
            nonzero = all_control_cnt != 0
            all_control_cnt = all_control_cnt[nonzero]
            all_hit_cnt = all_hit_cnt[nonzero]
            percentage = np.mean(all_hit_cnt / all_control_cnt)
            logger.info(
                "epoch {}, train_loss {:.4f}, valid_loss {:.4f}, "
                "best_epoch {}, best_loss {:.4f}, control_cnt {}, "
                "hit_cnt {}, percentage {:.4f}".format(
                    e, train_loss, valid_loss, best_epoch, best_loss,
                    np.sum(all_control_cnt), np.sum(all_hit_cnt), percentage))
            if percentage > best_percentage:
                best_percentage = percentage
                best_percentage_epoch = e
                torch.save(
                    {
                        'encoder_state_dict': encoder.state_dict(),
                        'encoder_optimizer_state_dict':
                        encoder_optimizer.state_dict(),
                        'decoder_state_dict': decoder.state_dict(),
                        'decoder_optimizer_state_dict':
                        decoder_optimizer.state_dict(),
                        'epoch': e,
                        'loss': valid_loss,
                        'percentage': best_percentage,
                    }, args.model_path)
            if valid_loss < best_loss:
                best_loss = valid_loss
                best_epoch = e
                torch.save(
                    {
                        'encoder_state_dict': encoder.state_dict(),
                        'encoder_optimizer_state_dict':
                        encoder_optimizer.state_dict(),
                        'decoder_state_dict': decoder.state_dict(),
                        'decoder_optimizer_state_dict':
                        decoder_optimizer.state_dict(),
                        'epoch': e,
                        'loss': valid_loss
                    }, args.model_path)
        batch_x = torch.LongTensor(valid_X[:batch_size].tolist()).t()
        batch_y = torch.LongTensor(valid_Y[:batch_size].tolist()).t()
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        input_chinese, output_chinese = predict(batch_x, batch_y, encoder,
                                                decoder, encoder_optimizer,
                                                decoder_optimizer, criterion,
                                                20)
        logger.info('*** Results ***')
        logger.info('Best Hit Accuracy: {}'.format(best_percentage))
        logger.info(
            'Best Hit Accuracy Epoch: {}'.format(best_percentage_epoch))
        for inp, out in zip(input_chinese, output_chinese):
            logger.info('{}\t||\t{}'.format(inp, out))
        logger.info(encoder)
        logger.info(decoder)
        logger.info('\n\n' + '=' * 100 + '\n\n')
    else:
        print(encoder)
        print(decoder)
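# The per-batch train() used above is defined elsewhere. A minimal sketch of a
# teacher-forcing seq2seq step under assumed interfaces (the encoder returns
# outputs plus a final hidden state; the decoder consumes one time step at a
# time); the function name and signatures are illustrative only.
def train_step(batch_x, batch_y, encoder, decoder, encoder_optimizer,
               decoder_optimizer, criterion):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    encoder_outputs, hidden = encoder(batch_x)
    loss = 0.0
    # batch_y is (seq_len, batch); feed the gold token at each step
    # (teacher forcing) and score the prediction of the next one.
    for t in range(batch_y.size(0) - 1):
        logits, hidden = decoder(batch_y[t], hidden, encoder_outputs)
        loss = loss + criterion(logits, batch_y[t + 1])
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item() / (batch_y.size(0) - 1)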