def train_ae(batch, total_loss_ae, start_time, i): autoencoder.train() autoencoder.zero_grad() source, target, lengths = batch source = to_gpu(args.cuda, Variable(source)) target = to_gpu(args.cuda, Variable(target)) # Create sentence length mask over padding mask = target.gt(0) masked_target = target.masked_select(mask) # examples x ntokens output_mask = mask.unsqueeze(1).expand(mask.size(0), ntokens) # output: batch x seq_len x ntokens output = autoencoder(source, lengths, noise=True) # output_size: batch_size, maxlen, self.ntokens flattened_output = output.view(-1, ntokens) masked_output = \ flattened_output.masked_select(output_mask).view(-1, ntokens) loss = criterion_ce(masked_output/args.temp, masked_target) loss.backward() # `clip_grad_norm` to prevent exploding gradient in RNNs / LSTMs torch.nn.utils.clip_grad_norm(autoencoder.parameters(), args.clip) optimizer_ae.step() total_loss_ae += loss.data accuracy = None if i % args.log_interval == 0 and i > 0: # accuracy probs = F.softmax(masked_output) max_vals, max_indices = torch.max(probs, 1) accuracy = torch.mean(max_indices.eq(masked_target).float()).data[0] cur_loss = total_loss_ae[0] / args.log_interval elapsed = time.time() - start_time print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | ' 'loss {:5.2f} | ppl {:8.2f} | acc {:8.2f}' .format(epoch, i, len(train_data), elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), accuracy)) with open("./output/{}/logs.txt".format(args.outf), 'a') as f: f.write('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | ' 'loss {:5.2f} | ppl {:8.2f} | acc {:8.2f}\n'. format(epoch, i, len(train_data), elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), accuracy)) total_loss_ae = 0 start_time = time.time() return total_loss_ae, start_time
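# NOTE: `to_gpu` is used throughout these snippets but never defined in this
# section. A minimal sketch of the two call patterns the code above appears to
# assume (signatures and the `to_gpu_auto` name are assumptions, not the
# original implementation):
import torch


def to_gpu(gpu, var):
    # two-argument form used as to_gpu(args.cuda, Variable(x))
    if gpu:
        return var.cuda()
    return var


def to_gpu_auto(x):
    # one-argument variant used as to_gpu(x) in the Tacotron/WaveNet snippets;
    # named differently here only to avoid clashing with the form above
    return x.cuda(non_blocking=True) if torch.cuda.is_available() else x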
def evaluate_autoencoder(data_source, epoch): # Turn on evaluation mode which disables dropout. autoencoder.eval() total_loss = 0 ntokens = len(corpus.dictionary.word2idx) all_accuracies = 0 bcnt = 0 for i, batch in enumerate(data_source): source, target, lengths = batch source = to_gpu(args.cuda, Variable(source, volatile=True)) target = to_gpu(args.cuda, Variable(target, volatile=True)) mask = target.gt(0) masked_target = target.masked_select(mask) # examples x ntokens output_mask = mask.unsqueeze(1).expand(mask.size(0), ntokens) # output: batch x seq_len x ntokens output = autoencoder(source, lengths, noise=True) flattened_output = output.view(-1, ntokens) masked_output = \ flattened_output.masked_select(output_mask).view(-1, ntokens) total_loss += criterion_ce(masked_output/args.temp, masked_target).data # accuracy max_vals, max_indices = torch.max(masked_output, 1) all_accuracies += \ torch.mean(max_indices.eq(masked_target).float()).data[0] bcnt += 1 aeoutf = "./output/%s/%d_autoencoder.txt" % (args.outf, epoch) with open(aeoutf, "a") as f: max_values, max_indices = torch.max(output, 2) max_indices = \ max_indices.view(output.size(0), -1).data.cpu().numpy() target = target.view(output.size(0), -1).data.cpu().numpy() for t, idx in zip(target, max_indices): # real sentence chars = " ".join([corpus.dictionary.idx2word[x] for x in t]) f.write(chars) f.write("\n") # autoencoder output sentence chars = " ".join([corpus.dictionary.idx2word[x] for x in idx]) f.write(chars) f.write("\n\n") return total_loss[0] / len(data_source), all_accuracies/bcnt
def encode(self, indices, lengths, noise):
    embeddings = self.embedding(indices)
    packed_embeddings = pack_padded_sequence(input=embeddings,
                                             lengths=lengths,
                                             batch_first=True)

    # Encode
    packed_output, state = self.encoder(packed_embeddings)
    hidden, cell = state

    # batch_size x nhidden
    hidden = hidden[-1]  # get hidden state of last layer of encoder

    # normalize to unit ball (l2 norm of 1) - p=2, dim=1
    norms = torch.norm(hidden, 2, 1)
    # For older versions of PyTorch use:
    hidden = torch.div(hidden, norms.expand_as(hidden))
    # For newest version of PyTorch (as of 8/25) use this:
    # hidden = torch.div(hidden, norms.unsqueeze(1).expand_as(hidden))

    if noise and self.noise_radius > 0:
        gauss_noise = torch.normal(means=torch.zeros(hidden.size()),
                                   std=self.noise_radius)
        hidden = hidden + to_gpu(self.gpu, Variable(gauss_noise))

    return hidden
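# The unit-ball normalization in encode() can be written more compactly with
# F.normalize, which divides by the l2 norm along the given dimension
# (a sketch, not the original code; add an eps if zero vectors are possible):
import torch
import torch.nn.functional as F

hidden = torch.randn(4, 300)                   # batch_size x nhidden (illustrative)
hidden_unit = F.normalize(hidden, p=2, dim=1)
assert torch.allclose(hidden_unit.norm(p=2, dim=1), torch.ones(4), atol=1e-5)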
def train_gan_d(batch):
    # clamp parameters to a cube
    for p in gan_disc.parameters():
        p.data.clamp_(-args.gan_clamp, args.gan_clamp)

    autoencoder.train()
    autoencoder.zero_grad()
    gan_disc.train()
    gan_disc.zero_grad()

    # positive samples ----------------------------
    # generate real codes
    source, target, lengths = batch
    source = to_gpu(args.cuda, Variable(source))
    target = to_gpu(args.cuda, Variable(target))

    # batch_size x nhidden
    real_hidden = autoencoder(source, lengths, noise=False, encode_only=True)
    real_hidden.register_hook(grad_hook)

    # loss / backprop
    errD_real = gan_disc(real_hidden)
    errD_real.backward(one)

    # negative samples ----------------------------
    # generate fake codes
    noise = to_gpu(args.cuda,
                   Variable(torch.ones(args.batch_size, args.z_size)))
    noise.data.normal_(0, 1)

    # loss / backprop
    fake_hidden = gan_gen(noise)
    errD_fake = gan_disc(fake_hidden.detach())
    errD_fake.backward(mone)

    # `clip_grad_norm` to prevent exploding gradient problem in RNNs / LSTMs
    torch.nn.utils.clip_grad_norm(autoencoder.parameters(), args.clip)

    optimizer_gan_d.step()
    optimizer_ae.step()

    errD = -(errD_real - errD_fake)

    return errD, errD_real, errD_fake
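# `grad_hook`, registered on real_hidden above, is not shown in this section.
# In the ARAE-style setup it typically rescales and sign-flips the gradient
# flowing from the critic back into the encoder. A sketch under that
# assumption (args.enc_grad_norm, args.gan_toenc and autoencoder.grad_norm are
# assumed to exist; this is not necessarily the original hook):
import math
import torch


def grad_hook(grad):
    if args.enc_grad_norm:
        # match the critic gradient norm to the autoencoder gradient norm
        gan_norm = torch.norm(grad, 2, 1).detach().data.mean()
        normed_grad = grad * autoencoder.grad_norm / gan_norm
    else:
        normed_grad = grad
    # weight factor and sign flip so the encoder moves against the critic
    normed_grad *= -math.fabs(args.gan_toenc)
    return normed_grad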
def train_gan_g():
    gan_gen.train()
    gan_gen.zero_grad()

    noise = to_gpu(args.cuda,
                   Variable(torch.ones(args.batch_size, args.z_size)))
    noise.data.normal_(0, 1)

    fake_hidden = gan_gen(noise)
    errG = gan_disc(fake_hidden)

    # loss / backprop
    errG.backward(one)
    optimizer_gan_g.step()

    return errG
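# `one` and `mone` used in train_gan_g / train_gan_d follow the WGAN critic
# convention: backward(one) ascends the critic output, backward(mone) descends
# it. They are created in the setup snippets further below, essentially as:
#     one = to_gpu(args.cuda, torch.FloatTensor([1]))
#     mone = one * -1
# so errG.backward(one) has the same gradient as errG.sum().backward(), and
# errD_fake.backward(mone) the same gradient as (-errD_fake).sum().backward().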
def train_lm(eval_path, save_path): # generate examples indices = [] noise = to_gpu(args.cuda, Variable(torch.ones(100, args.z_size))) for i in range(1000): noise.data.normal_(0, 1) fake_hidden = gan_gen(noise) max_indices = autoencoder.generate(fake_hidden, args.maxlen) indices.append(max_indices.data.cpu().numpy()) indices = np.concatenate(indices, axis=0) # write generated sentences to text file with open(save_path+".txt", "w") as f: # laplacian smoothing for word in corpus.dictionary.word2idx.keys(): f.write(word+"\n") for idx in indices: # generated sentence words = [corpus.dictionary.idx2word[x] for x in idx] # truncate sentences to first occurrence of <eos> truncated_sent = [] for w in words: if w != '<eos>': truncated_sent.append(w) else: break chars = " ".join(truncated_sent) f.write(chars+"\n") # train language model on generated examples lm = train_ngram_lm(kenlm_path=args.kenlm_path, data_path=save_path+".txt", output_path=save_path+".arpa", N=args.N) # load sentences to evaluate on with open(eval_path, 'r') as f: lines = f.readlines() sentences = [l.replace('\n', '') for l in lines] ppl = get_ppl(lm, sentences) return ppl
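# `train_ngram_lm` / `get_ppl` are external KenLM-based helpers not shown in
# this section. A sketch of the corpus-level perplexity computation get_ppl
# presumably performs, assuming a kenlm.Model whose score() returns log10
# probabilities (the bos/eos choices here are assumptions):
import kenlm


def get_ppl(lm, sentences):
    total_logprob = 0.0
    total_words = 0
    for sent in sentences:
        total_logprob += lm.score(sent, bos=True, eos=False)
        total_words += len(sent.strip().split())
    # perplexity = 10 ** (-average log10 likelihood per word)
    return 10 ** (-total_logprob / total_words)


# lm = kenlm.Model(save_path + ".arpa")
# ppl = get_ppl(lm, sentences)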
def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
             padding=None, dilation=1, bias=True, w_init_gain='linear'):
    super(ConvNorm, self).__init__()
    if padding is None:
        assert (kernel_size % 2 == 1)
        padding = int(dilation * (kernel_size - 1) / 2)

    self.conv = torch.nn.Conv1d(in_channels, out_channels,
                                kernel_size=kernel_size, stride=stride,
                                padding=padding, dilation=dilation,
                                bias=bias)
    self.conv = to_gpu(self.conv)

    torch.nn.init.xavier_uniform_(
        self.conv.weight,
        gain=torch.nn.init.calculate_gain(w_init_gain))
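# Example use of the ConvNorm wrapper defined above (shapes are illustrative;
# only the .conv attribute set in __init__ is assumed here):
conv = ConvNorm(in_channels=80, out_channels=512, kernel_size=5,
                w_init_gain='relu')
x = to_gpu(torch.randn(8, 80, 100))   # batch x channels x frames
y = conv.conv(x)                      # -> 8 x 512 x 100 ("same" length for odd kernel sizes)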
def encode(self, indices, lengths=None, noise=True): embeddings = self.embedding(indices) embeddings = embeddings.transpose(1, 2) c_pre_lin = self.encoder(embeddings) c_pre_lin = c_pre_lin.squeeze(2) hidden = self.linear(c_pre_lin.permute(0, 2, 1)) # normalize to unit ball (l2 norm of 1) - p=2, dim=1 norms = torch.norm(hidden, 2, 1) if norms.ndimension() == 1: norms = norms.unsqueeze(1) hidden = torch.div(hidden, norms.unsqueeze(1).expand_as(hidden)) if noise and self.noise_radius > 0: normal = Normal(torch.zeros(hidden.size()), self.noise_radius * torch.ones(hidden.size())) gauss_noise = normal.sample() # gauss_noise = torch.normal(means=torch.zeros(hidden.size()), # std=self.noise_radius*torch.ones(hidden.size())) if self.gpu: gauss_noise = gauss_noise.cuda() hidden = hidden + to_gpu(self.gpu, Variable(gauss_noise)) return hidden
def __init__(self, emsize, nhidden, ntokens, nlayers, noise_radius=0.2, hidden_init=False, dropout=0, gpu=False): super(Seq2Seq, self).__init__() self.nhidden = nhidden self.emsize = emsize self.ntokens = ntokens self.nlayers = nlayers self.noise_radius = noise_radius self.hidden_init = hidden_init self.dropout = dropout self.gpu = gpu self.start_symbols = to_gpu(gpu, Variable(torch.ones(10, 1).long())) # Vocabulary embedding self.embedding = nn.Embedding(ntokens, emsize) self.embedding_decoder = nn.Embedding(ntokens, emsize) # RNN Encoder and Decoder self.encoder = nn.LSTM(input_size=emsize, hidden_size=nhidden, num_layers=nlayers, dropout=dropout, batch_first=True) decoder_input_size = emsize+nhidden self.decoder = nn.LSTM(input_size=decoder_input_size, hidden_size=nhidden, num_layers=1, dropout=dropout, batch_first=True) # Initialize Linear Transformation self.linear = nn.Linear(nhidden, ntokens) self.init_weights()
def parse_batch(self, batch):
    # text_padded, input_lengths, mel_padded, gate_padded, \
    #     output_lengths = batch
    input_lengths, mask_padded, words_sorted, \
        select_target_padded, mel_padded, gate_padded, output_lengths = batch

    input_lengths = to_gpu(input_lengths).long()
    mask_padded = to_gpu(mask_padded).float()
    # mask_padded = to_gpu(mask_padded).long()
    select_target_padded = to_gpu(select_target_padded).long()
    max_len = torch.max(input_lengths.data).item()
    mel_padded = to_gpu(mel_padded).float()
    gate_padded = to_gpu(gate_padded).float()
    output_lengths = to_gpu(output_lengths).long()

    return ((input_lengths, mask_padded, select_target_padded, words_sorted,
             mel_padded, max_len, output_lengths),
            (mel_padded, gate_padded, select_target_padded))
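# parse_batch above splits a collated batch into (model inputs, targets); a
# typical Tacotron-style training step would consume it like this (model,
# criterion and batch are assumed to exist):
x, y = model.parse_batch(batch)
y_pred = model(x)
loss = criterion(y_pred, y)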
def train(model_directory, epochs, learning_rate, epochs_per_checkpoint,
          batch_size, seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cuda()
    # model.upsample = torch.nn.Sequential()  # replace the upsample step with no
    #                                         # operation as we manually control samples
    # model.upsample.weight = None
    # model.upsample.bias = None
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Load checkpoint if one exists
    iteration = 0
    checkpoint_path = find_checkpoint(model_directory)
    if checkpoint_path is not None:
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = SimpleWaveLoader()
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=None, batch_size=batch_size,
                              pin_memory=False, drop_last=True)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            x, y = batch
            x = to_gpu(x).float()
            y = to_gpu(y)
            x = (x, y)  # auto-regressive takes outputs as inputs
            y_pred = model(x)
            loss = criterion(y_pred, y)
            reduced_loss = loss.data.item()
            loss.backward()
            optimizer.step()

            # print out the loss, and save to a file
            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            with open(os.path.join(model_directory, 'loss_history.txt'), 'a') as f:
                f.write('%s\n' % str(reduced_loss))

            iteration += 1
            torch.cuda.empty_cache()

        if (epoch != 0 and epoch % epochs_per_checkpoint == 0):
            checkpoint_path = os.path.join(model_directory,
                                           'checkpoint_%d' % iteration)
            save_checkpoint(model, optimizer, learning_rate, iteration,
                            checkpoint_path)
def alignment(self, sentences, poss, customs, lengths, bert_sent,
              bert_sent_type, bert_sent_mask):
    batch_size = lengths.size(0)

    if self.config.use_bert:
        bert_output = self.bertmodel(input_ids=bert_sent,
                                     attention_mask=bert_sent_mask,
                                     token_type_ids=bert_sent_type)
        bert_output = bert_output[0]

        # masked mean
        masked_output = torch.mul(bert_sent_mask.unsqueeze(2), bert_output)
        mask_len = torch.sum(bert_sent_mask, dim=1, keepdim=True)
        bert_output = torch.sum(masked_output, dim=1, keepdim=False) / mask_len

        utterance_text = bert_output
    else:
        # extract features from text modality
        sentences = self.embed(sentences)
        final_h1t, final_h2t = self.extract_features(
            sentences, lengths, self.trnn1, self.trnn2, self.tlayer_norm)
        utterance_text = torch.cat(
            (final_h1t, final_h2t),
            dim=2).permute(1, 0, 2).contiguous().view(batch_size, -1)

    # extract features from pos modality (can be swapped in directly)
    final_h1p, final_h2p = self.extract_features(poss, lengths, self.prnn1,
                                                 self.prnn2, self.player_norm)
    utterance_pos = torch.cat(
        (final_h1p, final_h2p),
        dim=2).permute(1, 0, 2).contiguous().view(batch_size, -1)

    utterance_cust = customs

    # Shared-private encoders
    self.shared_private(utterance_text, utterance_pos, utterance_cust)

    # For reconstruction
    self.reconstruct()

    # 1-LAYER TRANSFORMER FUSION
    # shape = [9*96*32]
    h = torch.stack((
        self.utt_private_t,
        self.utt_private_p,
        self.utt_private_c,
        self.utt_shared_t,
        self.utt_shared_p,
        self.utt_shared_c,
    ), dim=0)
    h = self.transformer_encoder(h)
    h = torch.cat((h[0], h[1], h[2], h[3], h[4], h[5]), dim=1)

    features = to_gpu(torch.empty((0, 12 * self.config.hidden_size)))
    hx = self.hx
    for x in h:
        x = x.unsqueeze(0).unsqueeze(0)
        # if self.config.rnncell == "lstm":
        #     _, (hx, _) = self.conversation_rnn(input=x, hx=hx.detach())
        # else:
        #     _, hx = self.conversation_rnn(input=x, hx=hx.detach())
        _, hx = self.conversation_rnn(input=x, hx=hx.detach())
        features = torch.cat((features, hx.view(1, -1)), dim=0)

    self.hx = hx.detach()

    o = self.fusion(features)
    return o
def epoch_step(loader, desc, model, criterion, metrics, scaler, opt=None, batch_accum=1): is_train = opt is not None if is_train: model.train() criterion.train() else: model.eval() criterion.eval() pbar = tqdm.tqdm(total=len(loader), desc=desc, leave=False, mininterval=2) loc_loss = n = 0 loc_accum = 1 for x, y in loader: x = to_gpu(x, args.dist.gpu) y = to_gpu(y, args.dist.gpu) # x = x.to(memory_format=torch.channels_last) with torch.cuda.amp.autocast(): logits = model(x) loss = criterion(logits, y) / batch_accum if is_train: scaler.scale(loss).backward() if loc_accum == batch_accum: scaler.step(opt) scaler.update() for p in model.parameters(): p.grad = None # opt.zero_grad() loc_accum = 1 else: loc_accum += 1 logits = logits.detach() bs = len(x) loc_loss += loss.item() * bs * batch_accum n += bs for metric in metrics.values(): metric.update(logits, y) torch.cuda.synchronize() if args.dist.local_rank == 0: postfix = {"loss": f"{loc_loss / n:.3f}"} postfix.update({ k: f"{metric.evaluate():.3f}" for k, metric in metrics.items() }) if is_train: postfix.update( {"lr": f'{next(iter(opt.param_groups))["lr"]:.3}'}) pbar.set_postfix(**postfix) pbar.update() if is_train and loc_accum != batch_accum: scaler.step(opt) scaler.update() for p in model.parameters(): p.grad = None # opt.zero_grad() pbar.close() return loc_loss / n
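# epoch_step above expects a torch.cuda.amp.GradScaler plus an optimizer for
# the training phase and no optimizer for evaluation. A sketch of the calling
# convention (model, criterion, metrics and the loaders are assumed to exist):
scaler = torch.cuda.amp.GradScaler()
opt = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

train_loss = epoch_step(train_loader, "train", model, criterion, metrics,
                        scaler, opt=opt, batch_accum=2)
with torch.no_grad():
    val_loss = epoch_step(val_loader, "val", model, criterion, metrics,
                          scaler, opt=None)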
def __init__(self, emsize, nhidden, ntokens, nlayers, conv_windows="5-5-3", conv_strides="2-2-2", conv_layer="500-700-1000", activation=nn.LeakyReLU(0.2, inplace=True), noise_r=0.2, share_decoder_emb=False, hidden_init=False, dropout=0, gpu=False, pooling_enc="avg"): super(Seq2Seq2CNNLSTMEncoderDecoder, self).__init__() self.nhidden = nhidden self.emsize = emsize self.ntokens = ntokens self.nlayers = nlayers self.noise_r = noise_r self.hidden_init = hidden_init self.dropout = dropout self.gpu = gpu # for CNN encoder self.arch_conv_filters = conv_layer self.arch_conv_strides = conv_strides self.arch_conv_windows = conv_windows self.start_symbols = to_gpu(gpu, Variable(torch.ones(10, 1).long())) # Vocabulary embedding self.embedding = nn.Embedding(ntokens, emsize) self.embedding_prem = nn.Embedding(ntokens, emsize) self.embedding_decoder1 = nn.Embedding(ntokens, emsize) self.embedding_decoder2 = nn.Embedding(ntokens, emsize) self.embedding_decoder3 = nn.Embedding(ntokens, emsize) # for CNN hypo encoder conv_layer_sizes = [emsize] + [int(x) for x in conv_layer.split('-')] conv_strides_sizes = [int(x) for x in conv_strides.split('-')] conv_windows_sizes = [int(x) for x in conv_windows.split('-')] self.encoder = nn.Sequential() for i in range(len(conv_layer_sizes) - 1): layer = nn.Conv1d(conv_layer_sizes[i], conv_layer_sizes[i + 1], \ conv_windows_sizes[i], stride=conv_strides_sizes[i]) self.encoder.add_module("layer-" + str(i + 1), layer) bn = nn.BatchNorm1d(conv_layer_sizes[i + 1]) self.encoder.add_module("bn-" + str(i + 1), bn) self.encoder.add_module("activation-" + str(i + 1), activation) if pooling_enc == "max": self.pooling_enc = nn.AdaptiveMaxPool1d(1) else: self.pooling_enc = nn.AdaptiveAvgPool1d(1) self.linear_enc = nn.Linear(1000, nhidden) # for CNN prem encoder self.encoder_prem = nn.LSTM(input_size=emsize, hidden_size=nhidden, num_layers=nlayers, dropout=dropout, batch_first=True) decoder_input_size = emsize + nhidden * 2 self.decoder1 = nn.LSTM(input_size=decoder_input_size, hidden_size=nhidden, num_layers=1, dropout=dropout, batch_first=True) self.decoder2 = nn.LSTM(input_size=decoder_input_size, hidden_size=nhidden, num_layers=1, dropout=dropout, batch_first=True) self.decoder3 = nn.LSTM(input_size=decoder_input_size, hidden_size=nhidden, num_layers=1, dropout=dropout, batch_first=True) # Initialize Linear Transformation self.linear = nn.Linear(nhidden, ntokens) self.init_weights() if share_decoder_emb: self.embedding_decoder2.weight = self.embedding_decoder1.weight self.embedding_decoder3.weight = self.embedding_decoder1.weight
def evaluate_autoencoder(data_source, epoch): # Turn on evaluation mode which disables dropout. autoencoder.eval() enc_classifier.eval() total_loss = 0 ntokens = args.ntokens nclasses = args.nclasses all_accuracies = 0 all_class_accuracies = 0 bcnt = 0 for i, batch in enumerate(data_source): source, target, lengths, tags = batch source = to_gpu(args.cuda, Variable(source, volatile=True)) target = to_gpu(args.cuda, Variable(target, volatile=True)) mask = target.gt(0) masked_target = target.masked_select(mask) # examples x ntokens output_mask = mask.unsqueeze(1).expand(mask.size(0), ntokens) # output: batch x seq_len x ntokens output = autoencoder(source, lengths, noise=True) output_encode_only = autoencoder(source, lengths, noise=False, encode_only=True) output_classifier = enc_classifier(output_encode_only) _, output_classifier = torch.max(output_classifier, -1) flattened_output = output.view(-1, ntokens) masked_output = \ flattened_output.masked_select(output_mask).view(-1, ntokens) total_loss += criterion_ce(masked_output / args.temp, masked_target).data # accuracy max_vals, max_indices = torch.max(masked_output, 1) all_accuracies += \ torch.mean(max_indices.eq(masked_target).float()).item() bcnt += 1 output_classifier = output_classifier.data.cpu().numpy() tags = tags.numpy() all_class_accuracies += \ np.equal(output_classifier, tags).sum() aeoutf = "./output/%s/%d_autoencoder.txt" % (args.outf, epoch) with open(aeoutf, "a") as f: max_values, max_indices = torch.max(output, 2) max_indices = \ max_indices.view(output.size(0), -1).data.cpu().numpy() target = target.view(output.size(0), -1).data.cpu().numpy() for t, idx, cls, cls_real in zip(target, max_indices, output_classifier, tags): # real sentence chars = " ".join([corpus.dictionary.idx2word[x] for x in t]) f.write(str(cls_real)) f.write("\t") f.write(chars) f.write("\n") # autoencoder output sentence chars = " ".join([corpus.dictionary.idx2word[x] for x in idx]) f.write(str(cls)) f.write("\t") f.write(chars) f.write("\n\n") return total_loss.item( ) / bcnt, all_accuracies / bcnt, all_class_accuracies / len(data_source)
def inference(self, inputs, mask_padded): hidden_features, label_scores_charts, embedding_outputs = self.tree_encoder.parse_batch( inputs, return_label_scores_charts=True) batch_size_inner = hidden_features.size(0) sentence_max_length = hidden_features.size(1) structure_features = [] for i, label_scores_chart in enumerate(label_scores_charts): sentence_length = label_scores_chart.size(0) label_scores_cnn_output = self.structure_cnn(label_scores_chart) label_scores_cnn_output = label_scores_cnn_output[1:-1, :] # label_scores_cnn_output = to_gpu(label_scores_cnn_output).float() label_scores_cnn_output = label_scores_cnn_output.float() label_scores_cnn_output_padder = torch.zeros( [sentence_max_length - sentence_length + 2, 300]) label_scores_cnn_output_padder = to_gpu( label_scores_cnn_output_padder).float() # label_scores_cnn_output_padder = label_scores_cnn_output_padder.float() label_scores_cnn_output_padded = torch.cat( [label_scores_cnn_output, label_scores_cnn_output_padder], 0) structure_features.append(label_scores_cnn_output_padded) structure_features_reshape = torch.cat(structure_features, 0) structure_features = structure_features_reshape structure_features = torch.reshape(structure_features, [batch_size_inner, -1, 300]) additional_select_features = torch.cat( [embedding_outputs, structure_features], 2) # select_pred = self.poly_phoneme_classifier.inference(additional_select_features) select_pred = self.poly_phoneme_classifier(additional_select_features, mask_padded) yinsu_id_inputs = torch.matmul(select_pred, self.pinyin_to_yinsu_dict) yinsu_id_inputs = torch.reshape(yinsu_id_inputs, [batch_size_inner, -1]).long() yinsu_embedded_inputs = self.yinsu_embedding(yinsu_id_inputs) hidden_inputs = hidden_features.transpose(1, 2) structure_features = structure_features.transpose(1, 2) additional_features = torch.cat([hidden_inputs, structure_features], 1) additional_features = additional_features.permute(0, 2, 1) additional_features_repeat = torch.repeat_interleave( additional_features, repeats=4, dim=1) features_for_encoder = torch.cat( [additional_features_repeat, yinsu_embedded_inputs], 2) features_for_encoder = features_for_encoder.permute(0, 2, 1) encoder_outputs = self.encoder.inference(features_for_encoder) mel_outputs, gate_outputs, alignments = self.decoder.inference( encoder_outputs) # hidden_inputs = hidden_features.transpose(1, 2) # structure_features = structure_features.transpose(1, 2) # additional_features = torch.cat([hidden_inputs, structure_features], 1) # additional_features = additional_features.permute(0, 2, 1) # additional_features_repeat = torch.repeat_interleave(additional_features, repeats=4, dim=1) # features_for_decoder = torch.cat([additional_features_repeat, yinsu_embedded_inputs], 2) # mel_outputs, gate_outputs, alignments = self.decoder.inference(features_for_decoder) mel_outputs_postnet = self.postnet(mel_outputs) mel_outputs_postnet = mel_outputs + mel_outputs_postnet # embedded_inputs = self.embedding(inputs).transpose(1, 2) # encoder_outputs = self.encoder.inference(embedded_inputs) # mel_outputs, gate_outputs, alignments = self.decoder.inference( # encoder_outputs) # mel_outputs_postnet = self.postnet(mel_outputs) # mel_outputs_postnet = mel_outputs + mel_outputs_postnet outputs = self.parse_output( [mel_outputs, mel_outputs_postnet, gate_outputs, alignments], select_pred) return outputs
def init_hidden(self, bsz):
    zeros1 = Variable(torch.zeros(self.nlayers, bsz, self.nhidden))
    zeros2 = Variable(torch.zeros(self.nlayers, bsz, self.nhidden))
    return (to_gpu(self.gpu, zeros1), to_gpu(self.gpu, zeros2))
def train(num_gpus, rank, group_name, output_directory, log_directory, checkpoint_path, hparams): torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(hparams.sigma) model = WaveGlow(hparams).cuda() Taco2 = load_pretrained_taco('tacotron2.pt', hparams) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path: model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 trainset = TextMelLoader(hparams.training_files, hparams) collate_fn = TextMelCollate() # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== batch_size = hparams.batch_size train_loader = DataLoader(trainset, num_workers=0, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True, collate_fn=collate_fn) # Get shared output_directory readya if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if hparams.with_tensorboard and rank == 0: logger = prepare_directories_and_logger(output_directory, log_directory) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) print("Total Epochs: {}".format(hparams.epochs)) print("Batch Size: {}".format(hparams.batch_size)) print("learning rate: {}".format(hparams.learning_rate)) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): model.zero_grad() text_padded, input_lengths, mel_padded, max_len, output_lengths = parse_batch( batch) with torch.no_grad(): enc_outputs, alignments = Taco2( (text_padded, input_lengths, mel_padded, max_len, output_lengths)) # mel_padded = mel_padded.transpose(1, 2) # mel_padded = mel_padded / torch.abs(mel_padded).max().item() mel_pos = torch.arange(1000) mel_pos = to_gpu(mel_pos).long().unsqueeze(0) mel_pos = mel_pos.expand(hparams.batch_size, -1) src_pos = torch.arange(hparams.n_position) src_pos = to_gpu(src_pos).long().unsqueeze(0) src_pos = src_pos.expand(hparams.batch_size, -1) mel_padded = (mel_padded + 5) / 10 z, log_s_list, log_det_w_list, dec_enc_attn = model( mel_padded, enc_outputs, mel_pos, src_pos, input_lengths) outputs = (z, log_s_list, log_det_w_list, dec_enc_attn) loss = criterion(outputs, alignments) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() print("{}:\t{:.9f}".format(iteration, reduced_loss)) if hparams.with_tensorboard and rank == 0: logger.log_training(reduced_loss, grad_norm, learning_rate, iteration) if (iteration % hparams.iters_per_checkpoint == 0): if rank == 0: mel_predict, test_attn = model.test( mel_padded, enc_outputs, mel_pos, src_pos, input_lengths) logger.log_alignment(model, dec_enc_attn, alignments, mel_padded, mel_predict, test_attn, iteration) checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, iters_per_checkpoint, iters_per_eval, batch_size, seed, checkpoint_path, log_dir, ema_decay=0.9999): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== if train_data_config["no_chunks"]: criterion = MaskedCrossEntropyLoss() else: criterion = CrossEntropyLoss() model = WaveNet(**wavenet_config).cuda() ema = ExponentialMovingAverage(ema_decay) for name, param in model.named_parameters(): if param.requires_grad: ema.register(name, param.data) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) scheduler = StepLR(optimizer, step_size=200000, gamma=0.5) # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, scheduler, iteration, ema = load_checkpoint(checkpoint_path, model, optimizer, scheduler, ema) iteration += 1 # next iteration is iteration + 1 trainset = Mel2SampOnehot(audio_config=audio_config, verbose=True, **train_data_config) validset = Mel2SampOnehot(audio_config=audio_config, verbose=False, **valid_data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None valid_sampler = DistributedSampler(validset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== print(train_data_config) if train_data_config["no_chunks"]: collate_fn = utils.collate_fn else: collate_fn = torch.utils.data.dataloader.default_collate train_loader = DataLoader(trainset, num_workers=1, shuffle=False, collate_fn=collate_fn, sampler=train_sampler, batch_size=batch_size, pin_memory=True, drop_last=True) valid_loader = DataLoader(validset, num_workers=1, shuffle=False, sampler=valid_sampler, batch_size=1, pin_memory=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) writer = SummaryWriter(log_dir) print("Checkpoints writing to: {}".format(log_dir)) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): if low_memory: torch.cuda.empty_cache() scheduler.step() model.zero_grad() if train_data_config["no_chunks"]: x, y, seq_lens = batch seq_lens = to_gpu(seq_lens) else: x, y = batch x = to_gpu(x).float() y = to_gpu(y) x = (x, y) # auto-regressive takes outputs as inputs y_pred = model(x) if train_data_config["no_chunks"]: loss = criterion(y_pred, y, seq_lens) else: loss = criterion(y_pred, y) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus)[0] else: reduced_loss = loss.data[0] loss.backward() optimizer.step() for name, param in model.named_parameters(): if name in ema.shadow: ema.update(name, param.data) print("{}:\t{:.9f}".format(iteration, reduced_loss)) if rank == 0: writer.add_scalar('loss', reduced_loss, iteration) if (iteration % iters_per_checkpoint == 0 and iteration): if rank == 0: checkpoint_path = "{}/wavenet_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, scheduler, learning_rate, iteration, checkpoint_path, ema, wavenet_config) if (iteration % iters_per_eval == 0 and iteration > 0 and not config["no_validation"]): if low_memory: torch.cuda.empty_cache() if rank == 0: model_eval = nv_wavenet.NVWaveNet(**(model.export_weights())) for j, valid_batch in enumerate(valid_loader): mel, audio = valid_batch mel = to_gpu(mel).float() cond_input = model.get_cond_input(mel) predicted_audio = model_eval.infer(cond_input, nv_wavenet.Impl.AUTO) predicted_audio = utils.mu_law_decode_numpy(predicted_audio[0, :].cpu().numpy(), 256) writer.add_audio("valid/predicted_audio_{}".format(j), predicted_audio, iteration, 22050) audio = utils.mu_law_decode_numpy(audio[0, :].cpu().numpy(), 256) writer.add_audio("valid_true/audio_{}".format(j), audio, iteration, 22050) if low_memory: torch.cuda.empty_cache() iteration += 1
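# ExponentialMovingAverage is referenced above (register / shadow / update)
# but not defined in this section. A minimal sketch consistent with those
# calls (the exact update rule used originally is an assumption):
class ExponentialMovingAverage(object):
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}

    def register(self, name, value):
        self.shadow[name] = value.clone()

    def update(self, name, value):
        # shadow <- decay * shadow + (1 - decay) * current value
        new_average = self.decay * self.shadow[name] + (1.0 - self.decay) * value
        self.shadow[name] = new_average.clone()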
def forward(self, inputs): text_lengths, mask_padded, select_target, words_sorted, mels, max_len, output_lengths = inputs text_lengths, output_lengths = text_lengths.data, output_lengths.data # batch_size_inner = select_target.size(0) # sentence_max_length = select_target.size(1) # print('CHECK words_sorted:', words_sorted) # predicted_trees, scores = self.tree_encoder.parse_batch(words_sorted) # print('CHECK scores:', scores) # tree_shows = [p.convert().linearize() for p in predicted_trees] # print('CHECK scores:', tree_shows) hidden_features, label_scores_charts, embedding_outputs = self.tree_encoder.parse_batch( words_sorted, return_label_scores_charts=True) # print('CHECK character_padded:', character_padded.shape) # print('CHECK hidden_features:', hidden_features.shape) batch_size_inner = hidden_features.size(0) sentence_max_length = hidden_features.size(1) structure_features = [] for i, label_scores_chart in enumerate(label_scores_charts): sentence_length = label_scores_chart.size(0) label_scores_cnn_output = self.structure_cnn(label_scores_chart) label_scores_cnn_output = label_scores_cnn_output[1:-1, :] # label_scores_cnn_output = to_gpu(label_scores_cnn_output).float() label_scores_cnn_output = label_scores_cnn_output.float() label_scores_cnn_output_padder = torch.zeros( [sentence_max_length - sentence_length + 2, 300]) label_scores_cnn_output_padder = to_gpu( label_scores_cnn_output_padder).float() # label_scores_cnn_output_padder = label_scores_cnn_output_padder.float() label_scores_cnn_output_padded = torch.cat( [label_scores_cnn_output, label_scores_cnn_output_padder], 0) structure_features.append(label_scores_cnn_output_padded) structure_features_reshape = torch.cat(structure_features, 0) structure_features = structure_features_reshape structure_features = torch.reshape(structure_features, [batch_size_inner, -1, 300]) # print('CHECK structure_features:', structure_features.shape) # select_target_to_loss = torch.reshape(select_target, [-1, 6]) # print('CHECK select_target:', select_target_to_loss) # select_target = select_target_to_loss.unsqueeze(-1) # character_embedded_inputs = self.character_embedding(character_padded) # poly_yinsu_embedded_inputs = self.yinsu_embedding(poly_yinsu_padded) # Pretain to have structure features with character_embedded_inputs [B, L, 512], actually 300 # character_embedded_inputs = self.character_embedding(character_padded) additional_select_features = torch.cat( [embedding_outputs, structure_features], 2) # print('CHECK embedding_outputs:', embedding_outputs) # print('CHECK mask_padded:', mask_padded) # select_pred = self.poly_phoneme_classifier(embedding_outputs, mask_padded) select_pred = self.poly_phoneme_classifier(additional_select_features, mask_padded) # print('CHECK select_pred:', select_pred) # select_pred_to_loss = torch.reshape(select_pred, [-1, 6]) # print('CHECK select_pred:', select_pred_to_loss) # select_pred = select_pred_to_loss.unsqueeze(-1) select_accuracy = self.poly_phoneme_classifier.select_acc( select_target, select_pred, mask_padded) print('CHECK select_accuracy:', select_accuracy) # poly_yinsu_embedded_inputs = torch.reshape(poly_yinsu_embedded_inputs, [-1, 6, 512]) # poly_yinsu_embedded_inputs = poly_yinsu_embedded_inputs.permute(0, 2, 1) # phoneme_selected_inputs = torch.bmm(poly_yinsu_embedded_inputs, select_pred) # phoneme_selected_inputs = phoneme_selected_inputs.squeeze(-1) # phoneme_selected_inputs = torch.reshape(phoneme_selected_inputs, [batch_size_inner, -1, 512]) # phoneme_selected_inputs = 
phoneme_selected_inputs.permute(0, 2, 1) # print('CHECK pinyin_to_yinsu_dict:', self.pinyin_to_yinsu_dict) # print('CHECK pinyin_to_yinsu_dict:', self.pinyin_to_yinsu_dict.shape) # print('CHECK select_pred:', select_pred.shape) # yinsu_id_pred = torch.argmax(select_pred, 2) yinsu_id_inputs = torch.matmul(select_pred, self.pinyin_to_yinsu_dict) yinsu_id_inputs = torch.reshape(yinsu_id_inputs, [batch_size_inner, -1]).long() yinsu_embedded_inputs = self.yinsu_embedding(yinsu_id_inputs) # print('CHECK yinsu_embedded_inputs:', yinsu_embedded_inputs.shape) # print('CHECK yinsu_embedded_inputs:', yinsu_embedded_inputs) # Encoder Features Shape = [B, Features length, L] hidden_inputs = hidden_features.transpose(1, 2) structure_features = structure_features.transpose(1, 2) # additional_features = torch.cat([hidden_inputs, phoneme_selected_inputs, structure_features], 1) additional_features = torch.cat([hidden_inputs, structure_features], 1) # print('CHECK additional_features:', additional_features.shape) additional_features = additional_features.permute(0, 2, 1) additional_features_repeat = torch.repeat_interleave( additional_features, repeats=4, dim=1) # features_for_decoder = torch.cat([additional_features_repeat, yinsu_embedded_inputs], 2) features_for_encoder = torch.cat( [additional_features_repeat, yinsu_embedded_inputs], 2) features_for_encoder = features_for_encoder.permute(0, 2, 1) encoder_outputs = self.encoder(features_for_encoder, text_lengths * 4) # print('CHECK encoder_outputs:', encoder_outputs.shape) # mel_outputs, gate_outputs, alignments = self.decoder(features_for_decoder, mels, memory_lengths=text_lengths*4) mel_outputs, gate_outputs, alignments = self.decoder( encoder_outputs, mels, memory_lengths=text_lengths * 4) mel_outputs_postnet = self.postnet(mel_outputs) mel_outputs_postnet = mel_outputs + mel_outputs_postnet return self.parse_output( [mel_outputs, mel_outputs_postnet, gate_outputs, alignments], select_pred, output_lengths)
corpus = Corpus(datafiles, maxlen=args.maxlen, vocab_size=args.vocab_size, lowercase=args.lowercase, vocab=vocabdict) # save arguments ntokens = len(corpus.dictionary.word2idx) print("Vocabulary Size: {}".format(ntokens)) args.ntokens = ntokens eval_batch_size = 100 en_data = batchify(corpus.data[args.corpus_name], eval_batch_size, shuffle=False) print(len(en_data)) print("Loaded data!") model_args, idx2word, autoencoder, gan_gen, gan_disc = load_models( args.outf, args.epochs, twodecoders=True) if args.cuda: autoencoder = autoencoder.cuda() gan_gen = gan_gen.cuda() gan_disc = gan_disc.cuda() one = to_gpu(args.cuda, torch.FloatTensor([1])) mone = one * -1 evaluate_generator(1, False)
def train_gan_d(ae_index, batch):
    autoencoder, optimizer_ae = autoencoders[ae_index], ae_optimizers[ae_index]
    gan_disc, optimizer_gan_d = gan_discs[ae_index], gan_d_optimizers[ae_index]

    # clamp parameters to a cube
    for p in gan_disc.parameters():
        p.data.clamp_(-args.gan_clamp, args.gan_clamp)

    autoencoder.train()
    autoencoder.zero_grad()
    gan_disc.train()
    gan_disc.zero_grad()

    # positive samples ----------------------------
    # generate real codes
    source, target, lengths = batch
    source = to_gpu(args.cuda, Variable(source))
    target = to_gpu(args.cuda, Variable(target))

    # batch_size x nhidden
    real_hidden = autoencoder(source, lengths, noise=False, encode_only=True)
    real_hidden.register_hook(make_grad_hook(autoencoder))

    # loss / backprop
    errD_real = gan_disc(real_hidden)
    errD_real.backward(one)

    # negative samples ----------------------------
    # generate fake codes
    # noise = to_gpu(args.cuda,
    #                Variable(torch.ones(args.batch_size, args.z_size)))
    # noise.data.normal_(0, 1)
    fake_hiddens = []
    for other_index, other_autoencoder in enumerate(autoencoders):
        if other_index == ae_index:
            continue
        fake_hidden = other_autoencoder(source, lengths, noise=False,
                                        encode_only=True)  # TODO: noise=True
        fake_hidden.register_hook(make_grad_hook(
            other_autoencoder))  # maybe register hook? Not sure.
        fake_hiddens.append(fake_hidden)

    # loss / backprop
    # fake_hidden = gan_gen(noise)
    total_errD_fake = None
    errD_fakes = [gan_disc(fh.detach()) for fh in fake_hiddens]
    for errD_fake in errD_fakes:
        errD_fake.backward(mone)
        if total_errD_fake is None:
            total_errD_fake = errD_fake
        else:
            total_errD_fake += errD_fake
    # Alternatively, we might prefer: total_errD_fake.backward(mone)

    # `clip_grad_norm` to prevent exploding gradient problem in RNNs / LSTMs
    torch.nn.utils.clip_grad_norm(autoencoder.parameters(), args.clip)

    optimizer_gan_d.step()
    optimizer_ae.step()

    errD = -(errD_real - total_errD_fake)

    return errD, errD_real, total_errD_fake
def train_ae(ae_index, batch, total_loss_ae, start_time, i): autoencoder, ae_optimizer = autoencoders[ae_index], ae_optimizers[ae_index] ae_args = autoencoders_args[ae_index] autoencoder.train() autoencoder.zero_grad() source, target, lengths = batch source = to_gpu(args.cuda, Variable(source)) target = to_gpu(args.cuda, Variable(target)) # Create sentence length mask over padding mask = target.gt(0) masked_target = target.masked_select(mask) # examples x ntokens output_mask = mask.unsqueeze(1).expand(mask.size(0), ntokens) # output: batch x seq_len x ntokens output = autoencoder(source, lengths, noise=True) # output_size: batch_size, maxlen, self.ntokens flattened_output = output.view(-1, ntokens) masked_output = \ flattened_output.masked_select(output_mask).view(-1, ntokens) loss = criterion_ce(masked_output / args.temp, masked_target) loss.backward() # `clip_grad_norm` to prevent exploding gradient in RNNs / LSTMs torch.nn.utils.clip_grad_norm(autoencoder.parameters(), args.clip) ae_optimizer.step() total_loss_ae += loss.data accuracy = None if i % args.log_interval == 0 and i > 0: # accuracy probs = F.softmax(masked_output) max_vals, max_indices = torch.max(probs, 1) accuracy = torch.mean(max_indices.eq(masked_target).float()).data[0] cur_loss = total_loss_ae[0] / args.log_interval elapsed = time.time() - start_time print( '| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | ' 'loss {:5.2f} | ppl {:8.2f} | acc {:8.2f}'.format( epoch, i, len(ae_args.train_data), elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), accuracy)) with open("./output/{}/logs.txt".format(ae_args.outf), 'a') as f: f.write('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | ' 'loss {:5.2f} | ppl {:8.2f} | acc {:8.2f}\n'.format( epoch, i, len(ae_args.train_data), elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), accuracy)) total_loss_ae = 0 start_time = time.time() return total_loss_ae, start_time
def __init__(self, config): super(MISA, self).__init__() self.config = config self.text_size = config.embedding_size self.pos_size = config.pos_size self.cust_size = config.custom_size self.input_sizes = input_sizes = [ self.text_size, self.pos_size, self.cust_size ] self.hidden_sizes = hidden_sizes = [ int(self.text_size), int(self.pos_size), int(self.cust_size) ] self.output_size = output_size = config.num_classes self.dropout_rate = dropout_rate = config.dropout self.activation = self.config.activation() self.tanh = nn.Tanh() self.hx = to_gpu(torch.randn((2, 1, config.hidden_size * 6), )) rnn = nn.LSTM if self.config.rnncell == "lstm" else nn.GRU # defining modules - two layer bidirectional LSTM with layer norm in between if self.config.use_bert: # Initializing a BERT bert-base-uncased style configuration bertconfig = BertConfig.from_pretrained('bert-base-uncased', output_hidden_states=True) self.bertmodel = BertModel.from_pretrained('bert-base-uncased', config=bertconfig) else: # self.embed = nn.Embedding(len(config.word2id), input_sizes[0]) self.embed = self.add_embeddings self.trnn1 = rnn(input_sizes[0], hidden_sizes[0], bidirectional=True) self.trnn2 = rnn(2 * hidden_sizes[0], hidden_sizes[0], bidirectional=True) self.prnn1 = rnn(input_sizes[1], hidden_sizes[1], bidirectional=True) self.prnn2 = rnn(2 * hidden_sizes[1], hidden_sizes[1], bidirectional=True) self.conversation_rnn = nn.GRU(input_size=config.hidden_size * 6, hidden_size=config.hidden_size * 6, num_layers=1, bidirectional=True, batch_first=True) ########################################## # mapping modalities to same sized space ########################################## if self.config.use_bert: self.project_t = nn.Sequential() self.project_t.add_module( 'project_t', nn.Linear(in_features=768, out_features=config.hidden_size)) self.project_t.add_module('project_t_activation', self.activation) self.project_t.add_module('project_t_layer_norm', nn.LayerNorm(config.hidden_size)) else: self.project_t = nn.Sequential() self.project_t.add_module( 'project_t', nn.Linear(in_features=hidden_sizes[0] * 4, out_features=config.hidden_size)) self.project_t.add_module('project_t_activation', self.activation) self.project_t.add_module('project_t_layer_norm', nn.LayerNorm(config.hidden_size)) self.project_p = nn.Sequential() self.project_p.add_module( 'project_p', nn.Linear(in_features=hidden_sizes[1] * 4, out_features=config.hidden_size)) self.project_p.add_module('project_p_activation', self.activation) self.project_p.add_module('project_p_layer_norm', nn.LayerNorm(config.hidden_size)) self.project_c = nn.Sequential() self.project_c.add_module( 'project_c', nn.Linear(in_features=self.cust_size, out_features=config.hidden_size)) self.project_c.add_module('project_c_activation', self.activation) self.project_c.add_module('project_c_layer_norm', nn.LayerNorm(config.hidden_size)) ########################################## # private encoders ########################################## self.private_t = nn.Sequential() self.private_t.add_module( 'private_t_1', nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size)) self.private_t.add_module('private_t_activation_1', self.activation) self.private_p = nn.Sequential() self.private_p.add_module( 'private_p_1', nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size)) self.private_p.add_module('private_p_activation_1', self.activation) self.private_c = nn.Sequential() self.private_c.add_module( 'private_c_1', nn.Linear(in_features=config.hidden_size, 
out_features=config.hidden_size)) self.private_c.add_module('private_c_activation_1', self.activation) ########################################## # shared encoder ########################################## self.shared = nn.Sequential() self.shared.add_module( 'shared_1', nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size)) self.shared.add_module('shared_activation_1', self.activation) ########################################## # reconstruct ########################################## self.recon_t = nn.Sequential() self.recon_t.add_module( 'recon_t_1', nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size)) self.recon_p = nn.Sequential() self.recon_p.add_module( 'recon_p_1', nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size)) self.recon_c = nn.Sequential() self.recon_c.add_module( 'recon_c_1', nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size)) self.fusion = nn.Sequential() self.fusion.add_module( 'fusion_layer_1', nn.Linear(in_features=self.config.hidden_size * 12, out_features=self.config.hidden_size)) self.fusion.add_module('fusion_layer_1_dropout', nn.Dropout(dropout_rate)) self.fusion.add_module('fusion_layer_1_activation', self.activation) self.fusion.add_module( 'fusion_layer_3', nn.Linear(in_features=self.config.hidden_size, out_features=output_size)) self.tlayer_norm = nn.LayerNorm((hidden_sizes[0] * 2, )) self.player_norm = nn.LayerNorm((hidden_sizes[1] * 2, )) encoder_layer = nn.TransformerEncoderLayer( d_model=self.config.hidden_size, nhead=2) self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=1)
# In[6]: print("Training...") with open("{}/log.txt".format(args.outf), 'a') as f: f.write('Training...\n') # schedule of increasing GAN training loops if args.niters_gan_schedule != "": gan_schedule = [int(x) for x in args.niters_gan_schedule.split("-")] else: gan_schedule = [] niter_gan = 25 fixed_noise = to_gpu(args.cuda,Variable(torch.ones(args.batch_size, args.z_size))) fixed_noise.data.normal_(0, 1) one = to_gpu(args.cuda, torch.FloatTensor([1])) mone = one * -1 #one = to_gpu(args.cuda, torch.FloatTensor([1])) #mone = Variable(torch.tensor(-1.0).cuda())#one * -1 #one = Variable(one, requires_grad=True).cuda()#torch.tensor(1.0, dtype=torch.float64,device=torch.device('cuda:0')) #mone = #Variable(mone, requires_grad=True).cuda() #mone = torch.tensor(-1.0, dtype=torch.float64,device=torch.device('cuda:0')) for epoch in range(1, args.epochs + 1): # update gan training schedule if epoch in gan_schedule: niter_gan += 1
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, iters_per_checkpoint, batch_size, seed, checkpoint_path): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = CrossEntropyLoss() model = WaveNet(**wavenet_config).cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 #trainset = Mel2SampOnehot(**data_config) trainset = DeepMels(**data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=1, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs): total_loss = 0 print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): model.zero_grad() x, y = batch x = to_gpu(x).float() y = to_gpu(y) x = (x, y) # auto-regressive takes outputs as inputs y_pred = model(x) loss = criterion(y_pred, y) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus)[0] else: reduced_loss = loss.data[0] loss.backward() optimizer.step() total_loss += reduced_loss if (iteration % iters_per_checkpoint == 0): if rank == 0: checkpoint_path = "{}/wavenet_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1 print("epoch:{}, total epoch loss:{}".format(epoch, total_loss))
def main(): state_dict = torch.load(args.ae_model) with open(args.ae_args) as f: ae_args = json.load(f) corpus = Corpus(args.data_file, args.dict_file, vocab_size=ae_args['vocab_size']) autoencoder = Seq2Seq(emsize=ae_args['emsize'], nhidden=ae_args['nhidden'], ntokens=ae_args['ntokens'], nlayers=ae_args['nlayers'], noise_radius=ae_args['noise_radius'], hidden_init=ae_args['hidden_init'], dropout=ae_args['dropout'], gpu=args.cuda) autoencoder.load_state_dict(state_dict) for param in autoencoder.parameters(): param.requires_grad = False # save arguments with open(os.path.join(out_dir, 'args.json'), 'w') as f: json.dump(vars(args), f) log.info('[Data and AE model loaded.]') gan_gen = MLP_G(ninput=args.nhidden, noutput=args.nhidden, layers=args.arch_g) gan_disc = MLP_D(ninput=2 * args.nhidden, noutput=1, layers=args.arch_d) optimizer_gan_g = optim.Adam(gan_gen.parameters(), lr=args.lr_gan_g, betas=(args.beta1, 0.999)) optimizer_gan_d = optim.Adam(gan_disc.parameters(), lr=args.lr_gan_d, betas=(args.beta1, 0.999)) criterion_ce = nn.CrossEntropyLoss() if args.cuda: autoencoder = autoencoder.cuda() gan_gen = gan_gen.cuda() gan_disc = gan_disc.cuda() criterion_ce = criterion_ce.cuda() one = to_gpu(args.cuda, torch.FloatTensor([1])) mone = one * -1 train_pairs = BatchGen(corpus.get_chunks(size=2), args.batch_size) def train_gan_g(batch): gan_gen.train() gan_gen.zero_grad() source, _ = batch source = to_gpu(args.cuda, Variable(source)) source_hidden = autoencoder(source, noise=False, encode_only=True) fake_hidden = gan_gen(source_hidden) errG = gan_disc(source_hidden, fake_hidden) # loss / backprop errG.backward(one) optimizer_gan_g.step() return errG def train_gan_d(batch): # clamp parameters to a cube for p in gan_disc.parameters(): p.data.clamp_(-args.gan_clamp, args.gan_clamp) gan_disc.train() gan_disc.zero_grad() # positive samples ---------------------------- # generate real codes source, target = batch source = to_gpu(args.cuda, Variable(source)) target = to_gpu(args.cuda, Variable(target)) # batch_size x nhidden source_hidden = autoencoder(source, noise=False, encode_only=True) target_hidden = autoencoder(target, noise=False, encode_only=True) # loss / backprop errD_real = gan_disc(source_hidden, target_hidden) errD_real.backward(one) # negative samples ---------------------------- # loss / backprop fake_hidden = gan_gen(source_hidden) errD_fake = gan_disc(source_hidden.detach(), fake_hidden.detach()) errD_fake.backward(mone) optimizer_gan_d.step() errD = -(errD_real - errD_fake) return errD, errD_real, errD_fake niter = 0 start_time = datetime.now() for t in range(args.updates): niter += 1 # train discriminator/critic for i in range(args.niters_gan_d): # feed a seen sample within this epoch; good for early training errD, errD_real, errD_fake = \ train_gan_d(next(train_pairs)) # train generator for i in range(args.niters_gan_g): errG = train_gan_g(next(train_pairs)) if niter % args.log_interval == 0: eta = str((datetime.now() - start_time) / (t + 1) * (args.updates - t - 1)).split('.')[0] log.info('[{}/{}] Loss_D: {:.6f} (real: {:.6f} ' 'fake: {:.6f}) Loss_G: {:.6f} ETA: {}'.format( niter, args.updates, errD.data.cpu()[0], errD_real.data.cpu()[0], errD_fake.data.cpu()[0], errG.data.cpu()[0], eta)) if niter % args.save_interval == 0: save_model(gan_gen, out_dir, 'gan_gen_model_{}.pt'.format(t)) save_model(gan_disc, out_dir, 'gan_disc_model_{}.pt'.format(t))
return errD, errD_real, errD_fake print("Training...") #1204delete #with open("./output/{}/logs.txt".format(outf), 'a') as f: # f.write('Training...\n') # schedule of increasing GAN training loops if niters_gan_schedule != "": gan_schedule = [int(x) for x in niters_gan_schedule.split("-")] else: gan_schedule = [] niter_gan = 1 fixed_noise = to_gpu(cuda, Variable(torch.ones(batch_size, z_size))) fixed_noise.data.normal_(0, 1) one = to_gpu(cuda, torch.FloatTensor([1])) mone = one * -1 best_ppl = None impatience = 0 all_ppl = [] for epoch in range(1, epochs + 1): # update gan training schedule if epoch in gan_schedule: niter_gan += 1 print("GAN training loop schedule increased to {}".format(niter_gan)) #1204 delete #with open("./output/{}/logs.txt".format(outf), 'a') as f: # f.write("GAN training loop schedule increased to {}\n".
def evaluate_autoencoder(whichdecoder, data_source, epoch, seper=""): # Turn on evaluation mode which disables dropout. autoencoder.eval() total_loss = 0 ntokens = len(corpus.dictionary.word2idx) all_accuracies = 0 bcnt = 0 for i, batch in enumerate(data_source): source, target, lengths = batch source = to_gpu(args.cuda, Variable(source, volatile=True)) target = to_gpu(args.cuda, Variable(target, volatile=True)) mask = target.gt(0) masked_target = target.masked_select(mask) # examples x ntokens output_mask = mask.unsqueeze(1).expand(mask.size(0), ntokens) hidden = autoencoder(0, source, lengths, noise=False, encode_only=True) # output: batch x seq_len x ntokens if whichdecoder == 1: output = autoencoder(1, source, lengths, noise=False) flattened_output = output.view(-1, ntokens) masked_output = \ flattened_output.masked_select(output_mask).view(-1, ntokens) # accuracy max_vals1, max_indices1 = torch.max(masked_output, 1) all_accuracies += \ torch.mean(max_indices1.eq(masked_target).float()).data[0] max_values1, max_indices1 = torch.max(output, 2) max_indices2 = autoencoder.generate(2, hidden, maxlen=50) else: output = autoencoder(2, source, lengths, noise=False) flattened_output = output.view(-1, ntokens) masked_output = \ flattened_output.masked_select(output_mask).view(-1, ntokens) # accuracy max_vals2, max_indices2 = torch.max(masked_output, 1) all_accuracies += \ torch.mean(max_indices2.eq(masked_target).float()).data[0] max_values2, max_indices2 = torch.max(output, 2) max_indices1 = autoencoder.generate(1, hidden, maxlen=50) total_loss += criterion_ce(masked_output/args.temp, masked_target).data bcnt += 1 aeoutf_targ = "%s/%d_output_decoder_%d_targ%s.txt"%(args.outf, epoch, whichdecoder, seper) aeoutf_one = "%s/%d_output_decoder_%d_one%s.txt"%(args.outf, epoch, whichdecoder, seper) aeoutf_two = "%s/%d_output_decoder_%d_two%s.txt"%(args.outf, epoch, whichdecoder, seper) with open(aeoutf_targ, 'w') as f_targ, open(aeoutf_one,'w') as f_one, open(aeoutf_two,'w') as f_two: max_indices1 = \ max_indices1.view(output.size(0), -1).data.cpu().numpy() max_indices2 = \ max_indices2.view(output.size(0), -1).data.cpu().numpy() target = target.view(output.size(0), -1).data.cpu().numpy() for t, idx1, idx2 in zip(target, max_indices1, max_indices2): # real sentence chars = " ".join([corpus.dictionary.idx2word[x] for x in t]) f_targ.write(chars + "\n") # transfer sentence chars = " ".join([corpus.dictionary.idx2word[x] for x in idx1]) f_one.write(chars + "\n") # transfer sentence chars = " ".join([corpus.dictionary.idx2word[x] for x in idx2]) f_two.write(chars + "\n") return total_loss[0] / len(data_source), all_accuracies/bcnt
def train_ae_and_classifier(batch, total_loss_ae, start_time, i, perturb=None, epsilon=0.0, alpha=0.0, pgd_iters=0): autoencoder.train() autoencoder.zero_grad() enc_classifier.train() enc_classifier.zero_grad() source, target, lengths, tags = batch source = to_gpu(args.cuda, Variable(source)) target = to_gpu(args.cuda, Variable(target)) tags = to_gpu(args.cuda, Variable(tags)) # Create sentence length mask over padding mask = target.gt(0) masked_target = target.masked_select(mask) # examples x ntokens output_mask = mask.unsqueeze(1).expand(mask.size(0), ntokens) # output: batch x seq_len x ntokens output = autoencoder(source, lengths, noise=True) # output tags: batch_size x nclasses output_encode_only = autoencoder(source, lengths, noise=False, encode_only=True) output_classifier = enc_classifier(output_encode_only) perturbed_code = None if perturb == 'fgsm': output_encode_only.retain_grad() classifier_loss = criterion_ce(output_classifier, tags) enc_classifier.zero_grad() classifier_loss.backward(retain_graph=True) code_grad = output_encode_only.grad.data perturbed_code = fgsm_attack(output_encode_only, epsilon, code_grad) elif perturb == 'pgd': perturbed_code = output_encode_only.clone().detach() for step_idx in range(pgd_iters): perturbed_code.requires_grad = True adv_scores = enc_classifier(perturbed_code) tmp_loss = criterion_ce(adv_scores, tags) enc_classifier.zero_grad() tmp_loss.backward(retain_graph=True) # step in the direction of the gradient perturbed_code = perturbed_code + alpha * perturbed_code.grad.sign() # Workaround as PyTorch doesn't have elementwise clip # from: https://gist.github.com/oscarknagg/45b187c236c6262b1c4bbe2d0920ded6#file-projected_gradient_descent-py perturbed_code = torch.max( torch.min(perturbed_code, output_encode_only + epsilon), output_encode_only - epsilon).detach() perturbed_code = torch.clamp(perturbed_code, -0.34, 0.32) # output_size: batch_size, maxlen, self.ntokens flattened_output = output.view(-1, ntokens) masked_output = \ flattened_output.masked_select(output_mask).view(-1, ntokens) loss = criterion_ce(masked_output / args.temp, masked_target) classifier_loss = criterion_ce(output_classifier, tags) loss += classifier_loss if perturbed_code is not None: output_classifier_adversarial = enc_classifier(perturbed_code) classifier_adversarial_loss = criterion_ce( output_classifier_adversarial, tags) loss += classifier_adversarial_loss loss.backward() # `clip_grad_norm` to prevent exploding gradient in RNNs / LSTMs torch.nn.utils.clip_grad_norm(autoencoder.parameters(), args.clip) torch.nn.utils.clip_grad_norm(enc_classifier.parameters(), args.clip) optimizer_ae.step() optimizer_enc_classifier.step() total_loss_ae += loss.data accuracy = None if i % args.log_interval == 0 and i > 0: # accuracy probs = F.softmax(masked_output, dim=-1) max_vals, max_indices = torch.max(probs, 1) _, predicted_tags = torch.max(output_classifier, 1) accuracy = torch.mean(max_indices.eq(masked_target).float()).item() accuracy_classifier = torch.mean( predicted_tags.eq(tags).float()).item() cur_loss = total_loss_ae.item() / args.log_interval cur_loss_classifier = classifier_loss.item() elapsed = time.time() - start_time print( '| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | ' 'loss {:5.2f} | ppl {:8.2f} | acc {:8.2f} | acc_cla {:8.2f} | loss_cla {:8.2f}' .format(epoch, i, len(train_data), elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), accuracy, accuracy_classifier, cur_loss_classifier)) with 
open("./output/{}/logs.txt".format(args.outf), 'a') as f: f.write( '| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | ' 'loss {:5.2f} | ppl {:8.2f} | acc {:8.2f} | acc_cla {:8.2f} | loss_cla {:8.2f}\n' .format(epoch, i, len(train_data), elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), accuracy, accuracy_classifier, cur_loss_classifier)) total_loss_ae = 0 start_time = time.time() return total_loss_ae, start_time
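# `fgsm_attack` is referenced in train_ae_and_classifier but not defined in this snippet.
# A minimal sketch of the usual single signed-gradient step on the latent code; the
# project's actual helper may also clamp or re-project the result:
def fgsm_attack(code, epsilon, code_grad):
    # Move the code in the direction that increases the classifier loss.
    return (code + epsilon * code_grad.sign()).detach()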
def parse_batch(self, batch): a, b = batch a = to_gpu(a).float() b = b.cuda() return (a, b)
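# parse_batch calls a one-argument `to_gpu`, unlike the two-argument variant used in the
# other snippets. A plausible sketch consistent with this call site (an assumption, not
# the project's actual helper):
import torch

def to_gpu(x):
    return x.cuda() if torch.cuda.is_available() else x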
inverter = inverter.cpu() gan_gen = gan_gen.cpu() gan_disc = gan_disc.cpu() print("Training...") with open("./output/{}/logs.txt".format(args.outf), 'a') as f: f.write('Training...\n') # schedule of increasing GAN training loops if args.niters_gan_schedule != "": gan_schedule = [int(x) for x in args.niters_gan_schedule.split("-")] else: gan_schedule = [] niter_gan = 1 fixed_noise = to_gpu(args.cuda, Variable(torch.ones(args.batch_size, args.z_size))) fixed_noise.data.normal_(0, 1) one = to_gpu(args.cuda, torch.tensor(1, dtype=torch.float)) mone = one * -1 impatience = 0 all_ppl = [] best_ppl = None for epoch in range(start_epoch, args.epochs + 1): # update gan training schedule if epoch in gan_schedule: niter_gan += 1 print("GAN training loop schedule increased to {}".format(niter_gan)) with open("./output/{}/logs.txt".format(args.outf), 'a') as f:
def train(self): curr_patience = patience = self.train_config.patience num_trials = 1 self.criterion = criterion = nn.CrossEntropyLoss(reduction="mean") self.loss_diff = DiffLoss() self.loss_recon = MSE() self.loss_cmd = CMD() self.loss_sim = SimLoss() best_valid_loss = float('inf') # lr_scheduler # lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, factor=0.5, patience=2) # lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3, gamma=0.65) lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.3) train_losses = [] # self.eval(mode="test", to_print=True) for epoch in range(self.train_config.n_epoch): print("epoch: ", epoch) print( f"learning rate: {self.optimizer.state_dict()['param_groups'][0]['lr']}" ) self.model.train() train_loss, train_loss_cls, train_loss_sim, train_loss_diff, train_loss_recon, train_floor = [], [], [], [], [], [] for batch in self.train_data_loader: # print(f"learning rate batch: {self.optimizer.state_dict()['param_groups'][0]['lr']}") self.model.zero_grad() #t = text, p = pos, c = cust,y = label, l = length t, p, c, y, l, bert_sent, bert_sent_type, bert_sent_mask = batch t = to_gpu(t) p = to_gpu(p) c = to_gpu(c) y = to_gpu(y) l = to_gpu(l) bert_sent = to_gpu(bert_sent) bert_sent_type = to_gpu(bert_sent_type) bert_sent_mask = to_gpu(bert_sent_mask) y_tilde = self.model(t, p, c, l, bert_sent, bert_sent_type, bert_sent_mask) y = y.squeeze() cls_loss = criterion(y_tilde, y.long()) diff_loss = self.get_diff_loss() recon_loss = self.get_recon_loss() similarity_loss = self.get_sim_loss() loss = cls_loss + \ self.train_config.diff_weight * diff_loss + \ self.train_config.sim_weight * similarity_loss + \ self.train_config.recon_weight * recon_loss # loss = cls_loss bottom = 0.6 floor = (loss - bottom).abs() + bottom floor.backward() torch.nn.utils.clip_grad_value_([ param for param in self.model.parameters() if param.requires_grad ], 1.5 * self.train_config.clip) self.optimizer.step() train_loss_cls.append(cls_loss.item()) train_loss_diff.append(diff_loss.item()) train_loss_recon.append(recon_loss.item()) train_loss_sim.append(similarity_loss.item()) train_loss.append(loss.item()) train_floor.append(floor.item()) train_losses.append(train_loss) print(f"Training loss: {round(np.mean(train_loss), 4)}") print(f"Training floor: {round(np.mean(train_floor), 4)}") print(f"cls loss: {round(np.mean(train_loss_cls), 4)}") print(f"diff loss: {round(np.mean(train_loss_diff), 4)}") print(f"sim loss: {round(np.mean(train_loss_sim), 4)}") print(f"recon loss: {round(np.mean(train_loss_recon), 4)}") valid_loss, valid_acc = self.eval(mode="dev") print(f"valid_loss : {round(valid_loss,4)}") print(f"Current patience: {curr_patience}.") if valid_loss <= best_valid_loss: best_valid_loss = round(valid_loss, 6) print("Found new best model on dev set!") if not os.path.exists('checkpoints'): os.makedirs('checkpoints') torch.save(self.model.state_dict(), f'checkpoints/model.std') torch.save(self.optimizer.state_dict(), f'checkpoints/optim.std') self.eval(mode="test", to_print=True) curr_patience = patience else: print(f"best_valid_loss : {round(best_valid_loss, 6)}") curr_patience -= 1 if curr_patience <= -1: print( "Running out of patience, loading previous best model." 
) self.eval(mode="test", to_print=True) num_trials -= 1 curr_patience = patience self.model.load_state_dict( torch.load(f'checkpoints/model.std')) self.optimizer.load_state_dict( torch.load(f'checkpoints/optim.std')) lr_scheduler.step() print( f"Current learning rate: {self.optimizer.state_dict()['param_groups'][0]['lr']}" ) if num_trials <= 0: print("Running out of patience, early stopping.") break self.eval(mode="test", to_print=True)
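# The `floor` term in the training loop above implements the loss-flooding trick: once the
# loss drops below `bottom`, the sign of its gradient flips, so optimization pushes the loss
# back up toward that level instead of driving it to zero. A tiny numerical illustration
# (values are made up):
loss = 0.4                          # already below the flood level of 0.6
floor = abs(loss - 0.6) + 0.6       # = 0.8, so minimizing `floor` now increases `loss`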
def grad_hook(grad): global g_factor newgrad = grad * to_gpu(args.cuda, Variable(g_factor)) return newgrad
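# `grad_hook` is meant to be registered on the latent code so gradients flowing back from
# the critic are rescaled during the autoencoder update. A sketch of the registration;
# `g_factor` and `real_hidden` are assumptions about the surrounding code:
g_factor = torch.FloatTensor([0.1])   # hypothetical scaling factor
real_hidden.register_hook(grad_hook)  # every gradient reaching the code is multiplied by g_factor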
def init_state(self, bsz): zeros = Variable(torch.zeros(self.nlayers, bsz, self.nhidden)) return to_gpu(self.gpu, zeros)
def evaluate_autoencoder(whichdecoder, data_source, references, epoch): # Turn on evaluation mode which disables dropout. autoencoder.eval() total_loss = 0 ntokens = len(corpus.dictionary.word2idx) all_accuracies = 0 bcnt = 0 for i, batch in enumerate(data_source): source, target, lengths = batch source = to_gpu(args.cuda, Variable(source, volatile=True)) target = to_gpu(args.cuda, Variable(target, volatile=True)) mask = target.gt(0) masked_target = target.masked_select(mask) # examples x ntokens output_mask = mask.unsqueeze(1).expand(mask.size(0), ntokens) hidden = autoencoder(0, source, lengths, noise=False, encode_only=True) # output: batch x seq_len x ntokens if whichdecoder == 1: output = autoencoder(1, source, lengths, noise=False) flattened_output = output.view(-1, ntokens) masked_output = flattened_output.masked_select(output_mask).view(-1, ntokens) # accuracy max_vals1, max_indices1 = torch.max(masked_output, 1) all_accuracies += torch.mean(max_indices1.eq(masked_target).float()).data[0] max_values1, max_indices1 = torch.max(output, 2) max_indices2 = autoencoder.generate(2, hidden, maxlen=50) else: output = autoencoder(2, source, lengths, noise=False) flattened_output = output.view(-1, ntokens) masked_output = flattened_output.masked_select(output_mask).view(-1, ntokens) # accuracy max_vals2, max_indices2 = torch.max(masked_output, 1) all_accuracies += torch.mean(max_indices2.eq(masked_target).float()).data[0] max_values2, max_indices2 = torch.max(output, 2) max_indices1 = autoencoder.generate(1, hidden, maxlen=50) total_loss += criterion_ce(masked_output / args.temp, masked_target).data bcnt += 1 aeoutf_from = "{}/{}_output_decoder_{}_from.txt".format( args.outf, epoch, whichdecoder) aeoutf_tran = "{}/{}_output_decoder_{}_tran.txt".format( args.outf, epoch, whichdecoder) aeoutf_bleu = "{}/{}_output_decoder_{}_bleu.txt".format( args.outf, epoch, whichdecoder) candidate = [] counter = 0 with open(aeoutf_from, 'w') as f_from, open(aeoutf_tran, 'w') as f_trans, open(aeoutf_bleu, 'w') as f_bleu: max_indices1 = max_indices1.view(output.size(0), -1).data.cpu().numpy() max_indices2 = max_indices2.view(output.size(0), -1).data.cpu().numpy() target = target.view(output.size(0), -1).data.cpu().numpy() tran_indices = max_indices2 if whichdecoder == 1 else max_indices1 for t, tran_idx in zip(target, tran_indices): # real sentence chars = " ".join([corpus.dictionary.idx2word[x] for x in t]) f_from.write(chars) f_from.write("\n") # transfer sentence tran_tokens = [corpus.dictionary.idx2word[x] for x in tran_idx] chars = " ".join(tran_tokens) candidate = tran_tokens # sentence_bleu expects a tokenized hypothesis, not a joined string f_trans.write(chars) f_trans.write("\n") if counter < len(references): BLEU_score = sentence_bleu(references[counter], candidate) f_bleu.write(str(BLEU_score)) f_bleu.write("\n") counter = counter + 1 return total_loss[0] / len(data_source), all_accuracies / bcnt
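# NLTK's sentence_bleu returns scores at or near zero for short sentences when no
# higher-order n-grams match. A hedged sketch of adding smoothing, which is not part of
# the original code above:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
smoother = SmoothingFunction().method1
BLEU_score = sentence_bleu(references[counter], candidate, smoothing_function=smoother)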
return errD, errD_real, errD_fake print("Training...") with open("./output/{}/logs.txt".format(args.outf), 'a') as f: f.write('Training...\n') # schedule of increasing GAN training loops if args.niters_gan_schedule != "": gan_schedule = [int(x) for x in args.niters_gan_schedule.split("-")] else: gan_schedule = [] niter_gan = 1 fixed_noise = to_gpu(args.cuda, Variable(torch.ones(args.batch_size, args.z_size))) fixed_noise.data.normal_(0, 1) one = to_gpu(args.cuda, torch.FloatTensor([1])) mone = one * -1 best_ppl = None impatience = 0 all_ppl = [] for epoch in range(1, args.epochs+1): # update gan training schedule if epoch in gan_schedule: niter_gan += 1 print("GAN training loop schedule increased to {}".format(niter_gan)) with open("./output/{}/logs.txt".format(args.outf), 'a') as f: f.write("GAN training loop schedule increased to {}\n". format(niter_gan))
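# `fixed_noise` is typically reused every epoch so generated samples stay comparable
# across checkpoints. A sketch of one such monitoring step; the generate signature shown
# matches the single-decoder snippets and the maxlen value is illustrative, so treat this
# as an assumption rather than the project's actual logging code:
fake_hidden = gan_gen(fixed_noise)
sample_ids = autoencoder.generate(fake_hidden, maxlen=30, sample=True)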