def train_model(model, lr, epochs, train_loader, val_loader, patience):
    optimizer = Adagrad(model.parameters(), lr)
    criterion = nn.MSELoss()
    best_rmse = float('inf')
    rounds_no_improve = 0
    for epoch in range(epochs):
        for users, items, x, y in train_loader:
            y_pred = model(users, items, x)
            loss = criterion(y_pred.reshape(-1), y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        logging.info('Last train loss: {0:.3f}'.format(loss.detach().cpu().item()))
        with torch.no_grad():
            errors = np.array([])
            for users, items, x, y in val_loader:
                y_pred = model(users, items, x)
                group_errors = (y_pred - y).reshape(-1).cpu().numpy()
                errors = np.concatenate([errors, group_errors])
            rmse = (errors ** 2).mean() ** 0.5
        logging.info('Validation RMSE: {0:.3f}'.format(rmse))
        # Early stopping: keep the best validation RMSE and count stale rounds.
        if rmse < best_rmse:
            best_rmse = rmse
            rounds_no_improve = 0
        else:
            rounds_no_improve += 1
        if rounds_no_improve >= patience:
            return model
    return model
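# ------------------------------------------------------------------
# Usage sketch (not from the original source): how a trainer like
# train_model above might be invoked. The toy model and the synthetic
# loaders below are illustrative assumptions.
# ------------------------------------------------------------------
import logging

import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adagrad
from torch.utils.data import DataLoader, TensorDataset

logging.basicConfig(level=logging.INFO)

class ToyRecommender(nn.Module):
    """Tiny stand-in for the real model: ignores the ids, regresses on x."""
    def __init__(self, n_features):
        super().__init__()
        self.linear = nn.Linear(n_features, 1)

    def forward(self, users, items, x):
        return self.linear(x).squeeze(-1)

def make_loader(n=256, n_features=8):
    users = torch.randint(0, 10, (n,))
    items = torch.randint(0, 10, (n,))
    x = torch.randn(n, n_features)
    y = x.sum(dim=1)  # synthetic regression target
    return DataLoader(TensorDataset(users, items, x, y), batch_size=32)

model = train_model(ToyRecommender(8), lr=0.1, epochs=20,
                    train_loader=make_loader(), val_loader=make_loader(),
                    patience=3)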
def fit(self, data_loader, print_freq=1000, num_epochs=10):
    '''Fit the model to the data.

    Parameters
    ----------
    data_loader : DataLoader
        When enumerated, returns an array-like object of shape
        (batch_size, length), where each element is a word index.
    print_freq : int
        How frequently to print the loss.
    num_epochs : int
        The number of epochs.
    '''
    def repackage_hidden(h):
        """Wrap hidden states in new tensors to detach them from their history.
        (Defined for detaching the hidden state between batches; not used below.)"""
        if isinstance(h, torch.Tensor):
            return h.detach()
        else:
            return tuple(repackage_hidden(v) for v in h)

    if self.padding_idx is None:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=self.padding_idx)
    optimizer = Adagrad(self.parameters())
    i = 0
    running_loss = 0
    for epoch in range(num_epochs):
        for each_idx, each_batch in enumerate(data_loader):
            batch_var = each_batch
            if self.use_gpu:
                batch_var = batch_var.cuda()
            pred_batch = self.forward(batch_var[:, :-1])
            # Next-token prediction: the targets are the inputs shifted by one.
            tgt = batch_var[:, 1:].contiguous()
            optimizer.zero_grad()
            loss = criterion(pred_batch.contiguous().view(-1, self.vocab_size),
                             tgt.view(-1))
            loss.backward()
            optimizer.step()
            self.init_hidden()
            # Print statistics.
            running_loss += loss.item()
            i += 1
            if i % print_freq == print_freq - 1:
                print('epoch: {}\t total examples: {}\t loss: {}'.format(
                    epoch + 1, (i + 1) * self.batch_size,
                    running_loss / print_freq))
                running_loss = 0.0
    print('Finished Training')
def demo_pytorch_vae_mnist(hidden_sizes=[200, 200], latent_dim=5,
                           distribution_type='bernoulli', minibatch_size=20,
                           checkpoints=100, n_epochs=20):
    cp = Checkpoints(checkpoints)
    model = VAEModel(
        encoder=make_mlp_encoder(visible_dim=784, hidden_sizes=hidden_sizes,
                                 latent_dim=latent_dim),
        decoder=make_mlp_decoder(latent_dim=latent_dim, hidden_sizes=hidden_sizes,
                                 visible_dim=784, dist_type=distribution_type),
        latent_dim=latent_dim,
    )
    # Alternatives tried: Adam, RMSprop, Adamax, SGD(lr=0.001).
    optimizer = Adagrad(params=model.parameters())
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([transforms.ToTensor()])),
        batch_size=minibatch_size, shuffle=True)
    for epoch in range(n_epochs):
        for batch_idx, (x, y) in enumerate(train_loader):
            epoch_pt = epoch + batch_idx / len(train_loader)
            optimizer.zero_grad()
            loss = -model.elbo(x.flatten(1)).sum()
            loss.backward()
            optimizer.step()
            rate = measure_global_rate('training')
            if cp():
                print(f'Mean Rate at Epoch {epoch_pt:.2g}: {rate:.3g}iter/s')
                z_samples = model.prior().sample((64,))
                x_dist = model.decode(z_samples)
                dbplot(x_dist.mean.reshape(-1, 28, 28), 'Sample Means',
                       title=f'Sample Means at epoch {epoch_pt:.2g}')
def fit(self, seq_list: List, objective='cross_entropy', print_freq=1000,
        num_epochs=10, sgd_kwargs={}):
    '''Train the LSTM.

    Parameters
    ----------
    seq_list : list
        Each element corresponds to a sequence.
    objective : str
        Objective function.
    print_freq : int
        How frequently the loss is printed.
    num_epochs : int
        The number of training epochs.
    sgd_kwargs : dict
        Keyword arguments fed into the optimizer.
    '''
    if objective == 'cross_entropy':
        criterion = nn.CrossEntropyLoss()
    elif objective == 'mse':
        criterion = nn.MSELoss()
    elif objective == 'nll':  # negative log-likelihood
        criterion = nn.NLLLoss()
    else:
        raise NotImplementedError
    optimizer = Adagrad(self.parameters(), **sgd_kwargs)
    i = 0
    running_loss = 0
    for epoch in range(num_epochs):
        for each_idx in range(0, len(seq_list), self.batch_size):
            each_seq = torch.stack(seq_list[each_idx:each_idx + self.batch_size], dim=1)
            optimizer.zero_grad()
            pred_seq = self.forward(each_seq[:-1])
            # Next-step prediction: the targets are the inputs shifted by one step.
            loss = criterion(pred_seq, each_seq[1:])
            loss.backward()
            optimizer.step()
            self.init_hidden()
            # Print statistics.
            running_loss += loss.item()
            i += 1
            if i % print_freq == print_freq - 1:
                print('epoch: {}\t total examples: {}\t loss: {}'.format(
                    epoch + 1, i + 1, running_loss / print_freq))
                running_loss = 0.0
    print('Finished Training')
def train(inputs, outputs, model, l1, l2, lr=1e-5, epochs=10000):
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = Adagrad(model.parameters(), lr=lr)
    log = []
    for _ in range(epochs):
        prediction = model(inputs)
        acc = tn((prediction.max(1)[1] == outputs).float().mean())
        original_loss = criterion(prediction, outputs)
        penalty = model.penalty(l1, l2)
        error = original_loss + penalty
        optimizer.zero_grad()
        error.backward()
        log.append((tn(original_loss), tn(penalty), tn(get_sparsity(model)), acc))
        optimizer.step()
    return np.array(log)
def train(self, device):
    set_random_seed()
    self.loaded_data.negative_sample()
    # Compose the graph NN.
    gnn_channel = GNNChannel(self.sr_ent_num, self.tg_ent_num, self.dim,
                             self.layer_num, self.drop_out, self.channels)
    self.gnn_channel = gnn_channel
    gnn_channel.to(device)
    gnn_channel.train()
    # Prepare the optimizer.
    optimizer = Adagrad(filter(lambda p: p.requires_grad, gnn_channel.parameters()),
                        lr=self.learning_rate, weight_decay=self.l2_regularization)
    criterion = AlignLoss(self.margin_gamma)
    best_hit_at_1 = 0
    best_epoch_num = 0
    for epoch_num in range(1, self.epoch_num + 1):
        gnn_channel.train()
        optimizer.zero_grad()
        sr_seed_hid, tg_seed_hid, _, _ = gnn_channel.forward(
            self.loaded_data.train_sr_ent_seeds, self.loaded_data.train_tg_ent_seeds)
        loss = criterion(sr_seed_hid, tg_seed_hid)
        loss.backward()
        optimizer.step()
        if epoch_num % self.nega_sample_freq == 0:
            if str(self.directory).find('DWY100k') >= 0:
                self.loaded_data.negative_sample()
            else:
                self.negative_sample()
            hit_at_1 = self.evaluate(epoch_num, gnn_channel, print_info=False,
                                     device=device)
            if hit_at_1 > best_hit_at_1:
                best_hit_at_1 = hit_at_1
                best_epoch_num = epoch_num
    print('Model best Hit@1 on valid set is %.2f at %d epoch.'
          % (best_hit_at_1, best_epoch_num))
    return best_hit_at_1, best_epoch_num
def train(inputs, outputs, model, l1, l2, lr=1e-5, epochs=10000):
    criterion = torch.nn.MSELoss()
    optimizer = Adagrad(model.parameters(), lr=lr)
    # Everyone starts the same way: optionally reset all parameters here,
    # e.g. p.data.fill_(1.0) for each p in model.parameters().
    log = []
    for _ in range(epochs):
        original_loss = criterion(model(inputs), outputs)
        penalty = model.penalty(l1, l2)
        error = original_loss + penalty
        optimizer.zero_grad()
        error.backward()
        log.append((tn(original_loss), tn(penalty), tn(get_sparsity(model))))
        optimizer.step()
    return np.array(log)
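# ------------------------------------------------------------------
# The two train() variants above rely on helpers that are not shown
# (tn, get_sparsity, model.penalty). A minimal sketch of what such
# helpers might look like, assuming tn converts a tensor to a plain
# number and penalty is an elastic-net term; these definitions are
# assumptions, not the original source.
# ------------------------------------------------------------------
import torch
import torch.nn as nn

def tn(t):
    """Tensor -> plain float, for logging (assumed helper)."""
    return float(t.detach().cpu())

def get_sparsity(model, eps=1e-6):
    """Fraction of near-zero parameters (assumed definition)."""
    total, zeros = 0, 0
    for p in model.parameters():
        total += p.numel()
        zeros += (p.abs() < eps).sum().item()
    return torch.tensor(zeros / max(total, 1))

class PenalizedLinear(nn.Module):
    """Linear model exposing the penalty(l1, l2) interface used above."""
    def __init__(self, n_in, n_out):
        super().__init__()
        self.linear = nn.Linear(n_in, n_out)

    def forward(self, x):
        return self.linear(x)

    def penalty(self, l1, l2):
        # Elastic net over the weight matrix: l1 * |W|_1 + l2 * |W|_2^2.
        w = self.linear.weight
        return l1 * w.abs().sum() + l2 * (w ** 2).sum()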
def run(dim, ds, epochs, attempts, lrs, reg_coef):
    losses = pd.DataFrame(columns=['lr', 'epoch', 'attempt', 'loss'])
    total_epochs = len(lrs) * len(attempts) * len(epochs)
    with tqdm(total=total_epochs, desc='lr = NA, attempt = NA, epoch = NA, loss = NA',
              unit='epochs', ncols=140) as pbar:
        for lr in lrs:
            for attempt in attempts:
                x = torch.empty(dim, requires_grad=True, dtype=torch.double)
                torch.nn.init.normal_(x)
                opt = Adagrad([x], lr=lr)
                for epoch in epochs:
                    train_loss = 0
                    for X, y in DataLoader(ds, shuffle=True, batch_size=1):
                        opt.zero_grad()
                        # Per-example logistic-style loss with L2 regularization.
                        if y.item() == 0:
                            score = -torch.dot(X[0, :], x)
                        else:
                            score = torch.dot(X[0, :], x)
                        loss = torch.log1p(torch.exp(score)) + (reg_coef / 2) * torch.dot(x, x)
                        loss.backward()
                        train_loss += loss.item()
                        opt.step()
                    train_loss /= len(ds)
                    # DataFrame.append was removed in pandas 2.0; pd.concat is
                    # the equivalent.
                    losses = pd.concat([losses, pd.DataFrame.from_dict(
                        {'loss': [train_loss], 'epoch': [epoch],
                         'lr': [lr], 'attempt': [attempt]})], sort=True)
                    pbar.update()
                    pbar.set_description(
                        desc=f'lr = {lr}, attempt = {attempt}, epoch = {epoch}, loss = {train_loss}')
    return losses
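# ------------------------------------------------------------------
# Usage sketch (not from the original source): run() expects a dataset
# of (features, label) pairs; a TensorDataset works. The dimensions and
# hyperparameter grids below are made up for illustration.
# ------------------------------------------------------------------
import torch
from torch.utils.data import TensorDataset

# Synthetic data: 100 examples, 5 features, binary labels.
X = torch.randn(100, 5, dtype=torch.double)
w_true = torch.randn(5, dtype=torch.double)
y = (X @ w_true > 0).long()
ds = TensorDataset(X, y)

losses = run(dim=5, ds=ds, epochs=range(10), attempts=range(3),
             lrs=[0.01, 0.1, 1.0], reg_coef=0.1)
print(losses.groupby('lr')['loss'].min())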
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--build_pre_train", action='store_true',
                        help="Whether to build Pre-Train data.")
    parser.add_argument("--train_path", type=str, default="../data/tacred_train.json",
                        help="Path to unlabeled data.")
    parser.add_argument("--dev_path", type=str, default="../data/tacred_dev.json",
                        help="Path to unlabeled data.")
    parser.add_argument("--test_path", type=str, default="../data/tacred_test.json",
                        help="Path to unlabeled data.")
    parser.add_argument("--explanation_data_path", type=str,
                        default="../data/tacred_explanations.json",
                        help="Path to explanation data.")
    parser.add_argument("--train_batch_size", default=64, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=128, type=int,
                        help="Total batch size for evaluation.")
    parser.add_argument("--learning_rate", default=0.001, type=float,
                        help="The initial learning rate for the optimizer.")
    parser.add_argument("--epochs", default=25,  # trains for 24; stopping criterion of 0.9 F1
                        type=int, help="Number of epochs for training.")
    parser.add_argument('--embeddings', type=str, default="glove.840B.300d",
                        help="Initial embeddings to use.")
    parser.add_argument('--seed', type=int, default=42,
                        help="Random seed for initialization.")
    parser.add_argument('--gamma', type=float, default=0.5, help="Weight of sim_loss.")
    parser.add_argument('--emb_dim', type=int, default=300, help="Embedding vector size.")
    parser.add_argument('--hidden_dim', type=int, default=300,
                        help="Hidden vector size of the LSTM (really 2*hidden_dim, due to the biLSTM).")
    parser.add_argument('--model_save_dir', type=str, default="",
                        help="Where to save the model.")
    parser.add_argument('--experiment_name', type=str, default="official",
                        help="What to save the model file as.")
    parser.add_argument('--load_model', action='store_true',
                        help="Whether to load a model.")
    parser.add_argument('--start_epoch', type=int, default=0, help="Start epoch.")
    parser.add_argument('--use_adagrad', action='store_true',
                        help="Use the Adagrad optimizer.")
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    random.seed(args.seed)

    sample_rate = 0.6
    lower_bound = -20.0
    dataset = "tacred"
    if args.build_pre_train:
        build_pre_train_find_datasets_from_splits(
            args.train_path, args.dev_path, args.test_path,
            args.explanation_data_path, embedding_name=args.embeddings,
            sample_rate=sample_rate, dataset=dataset)

    save_string = generate_save_string(dataset, args.embeddings, sample=sample_rate)
    with open("../data/pre_train_data/train_data_{}.p".format(save_string), "rb") as f:
        train_dataset = pickle.load(f)
    primary_eval_path = "../data/pre_train_data/rq_data_{}.p".format(save_string)
    # Optional secondary eval; can be set to the empty string.
    secondary_eval_path = "../data/pre_train_data/dev_data_{}.p".format(save_string)
    with open("../data/vocabs/vocab_{}.p".format(save_string), "rb") as f:
        vocab = pickle.load(f)
    with open("../data/pre_train_data/sim_data_{}.p".format(save_string), "rb") as f:
        sim_data = pickle.load(f)

    pad_idx = vocab["<pad>"]
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    custom_vocab = build_custom_vocab(dataset, len(vocab))
    custom_vocab_length = len(custom_vocab)

    model = Find_Module.Find_Module(emb_weight=vocab.vectors, padding_idx=pad_idx,
                                    emb_dim=args.emb_dim, hidden_dim=args.hidden_dim,
                                    cuda=torch.cuda.is_available(),
                                    custom_token_count=custom_vocab_length)
    del vocab

    # Prep variables for storing training progress.
    epochs = args.epochs
    epoch_string = str(epochs)
    epoch_losses = []
    dev_2_epoch_losses = []
    best_f1_score = -1
    best_dev_2_f1_score = -1
    best_dev_loss = float('inf')

    if args.load_model:
        model.load_state_dict(torch.load(
            "../data/saved_models/Find-Module-pt_{}.p".format(args.experiment_name)))
        print("loaded model")
        with open("../data/result_data/loss_per_epoch_Find-Module-pt_{}.csv".format(
                args.experiment_name)) as f:
            reader = csv.reader(f)
            next(reader)
            for row in reader:
                epoch_losses.append(row)
                if float(row[-1]) > best_f1_score:
                    best_f1_score = float(row[-1])
                if float(row[3]) < best_dev_loss:
                    best_dev_loss = float(row[3])
        with open("../data/result_data/dev_2_loss_per_epoch_Find-Module-pt_{}.csv".format(
                args.experiment_name)) as f:
            reader = csv.reader(f)
            next(reader)
            for row in reader:
                dev_2_epoch_losses.append(row)
                if float(row[-1]) > best_dev_2_f1_score:
                    best_dev_2_f1_score = float(row[-1])
        print("loaded past results")

    model = model.to(device)

    # Get the L_sim data ready.
    real_query_tokens, _ = BaseVariableLengthDataset.variable_length_batch_as_tensors(
        sim_data["queries"], pad_idx)
    real_query_tokens = real_query_tokens.to(device)
    query_labels = sim_data["labels"]

    queries_by_label = {}
    for i, label in enumerate(query_labels):
        if label in queries_by_label:
            queries_by_label[label][i] = 1
        else:
            queries_by_label[label] = [0] * len(query_labels)
            queries_by_label[label][i] = 1

    query_index_matrix = []
    for i, label in enumerate(query_labels):
        query_index_matrix.append(queries_by_label[label][:])
    query_index_matrix = torch.tensor(query_index_matrix)
    neg_query_index_matrix = 1 - query_index_matrix
    for i, row in enumerate(neg_query_index_matrix):
        neg_query_index_matrix[i][i] = 1
    query_index_matrix = query_index_matrix.to(device)
    neg_query_index_matrix = neg_query_index_matrix.to(device)

    # Define the optimizer.
    if args.use_adagrad:
        optimizer = Adagrad(model.parameters(), lr=args.learning_rate)
    else:
        optimizer = AdamW(model.parameters(), lr=args.learning_rate)

    # Define the loss functions.
    find_loss_function = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([20.0]).to(device))
    sim_loss_function = similarity_loss_function

    for epoch in range(args.start_epoch, args.start_epoch + epochs):
        print('\n Epoch {:} / {:}'.format(epoch + 1, args.start_epoch + epochs))
        total_loss, find_total_loss, sim_total_loss = 0, 0, 0
        batch_count = 0
        model.train()
        # Iterate over batches.
        for step, batch in enumerate(tqdm(
                train_dataset.as_batches(batch_size=args.train_batch_size, seed=epoch))):
            # Push the batch to the GPU.
            batch = [r.to(device) for r in batch]
            tokens, queries, labels = batch
            # Clear previously calculated gradients.
            model.zero_grad()
            # Get model predictions for the current batch.
            token_scores = model.find_forward(tokens, queries, lower_bound)
            pos_scores, neg_scores = model.sim_forward(real_query_tokens,
                                                       query_index_matrix,
                                                       neg_query_index_matrix)
            # Compute the loss between actual and predicted values.
            find_loss = find_loss_function(token_scores, labels)
            sim_loss = sim_loss_function(pos_scores, neg_scores)
            string_loss = find_loss + args.gamma * sim_loss
            # Add on to the total loss.
            find_total_loss = find_total_loss + find_loss.item()
            sim_total_loss = sim_total_loss + sim_loss.item()
            total_loss = total_loss + string_loss.item()
            batch_count += 1
            if batch_count % 100 == 0 and batch_count > 0:
                print((find_total_loss, sim_total_loss, total_loss, batch_count))
            # Backward pass to calculate the gradients.
            string_loss.backward()
            # Clip the gradients to 1.0; this helps prevent exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update the parameters.
            optimizer.step()

        # Compute the training loss of the epoch.
        train_avg_loss = total_loss / batch_count
        train_avg_find_loss = find_total_loss / batch_count
        train_avg_sim_loss = sim_total_loss / batch_count

        print("Starting Primary Evaluation")
        eval_results = evaluate_find_module(
            primary_eval_path, real_query_tokens, query_index_matrix,
            neg_query_index_matrix, lower_bound, model, find_loss_function,
            sim_loss_function, args.eval_batch_size, args.gamma)
        dev_avg_loss, dev_avg_find_loss, dev_avg_sim_loss, dev_f1_score, \
            total_og_scores, total_new_scores = eval_results
        print("Finished Primary Evaluation")

        if dev_f1_score > best_f1_score or (dev_f1_score == best_f1_score
                                            and dev_avg_loss < best_dev_loss):
            print("Saving Model")
            if len(args.model_save_dir) > 0:
                dir_name = args.model_save_dir
            else:
                dir_name = "../data/saved_models/"
            torch.save(model.state_dict(),
                       "{}Find-Module-pt_{}.p".format(dir_name, args.experiment_name))
            with open("../data/result_data/best_dev_total_og_scores_{}.p".format(
                    args.experiment_name), "wb") as f:
                pickle.dump(total_og_scores, f)
            with open("../data/result_data/best_dev_total_new_scores_{}.p".format(
                    args.experiment_name), "wb") as f:
                pickle.dump(total_new_scores, f)
            best_f1_score = dev_f1_score
            best_dev_loss = dev_avg_loss

        epoch_losses.append((train_avg_loss, train_avg_find_loss, train_avg_sim_loss,
                             dev_avg_loss, dev_avg_find_loss, dev_avg_sim_loss,
                             dev_f1_score))
        print("Best Primary F1: {}".format(str(best_f1_score)))
        print(epoch_losses[-3:])

        if len(secondary_eval_path) > 0:
            print("Starting Secondary Evaluation")
            eval_results = evaluate_find_module(
                secondary_eval_path, real_query_tokens, query_index_matrix,
                neg_query_index_matrix, lower_bound, model, find_loss_function,
                sim_loss_function, args.eval_batch_size, args.gamma)
            dev_2_avg_loss, dev_2_avg_find_loss, dev_2_avg_sim_loss, dev_2_f1_score, \
                total_og_scores, total_new_scores = eval_results
            print("Finished Secondary Evaluation")
            if dev_2_f1_score > best_dev_2_f1_score:
                best_dev_2_f1_score = dev_2_f1_score
                with open("../data/result_data/best_dev_2_total_og_scores_{}.p".format(
                        args.experiment_name), "wb") as f:
                    pickle.dump(total_og_scores, f)
                with open("../data/result_data/best_dev_2_total_new_scores_{}.p".format(
                        args.experiment_name), "wb") as f:
                    pickle.dump(total_new_scores, f)
            dev_2_epoch_losses.append((dev_2_avg_loss, dev_2_avg_find_loss,
                                       dev_2_avg_sim_loss, dev_2_f1_score))
            print("Best Secondary F1: {}".format(str(best_dev_2_f1_score)))
            print(dev_2_epoch_losses[-3:])

        if best_f1_score > 0.9:
            break

    with open("../data/result_data/loss_per_epoch_Find-Module-pt_{}.csv".format(
            args.experiment_name), "w") as f:
        writer = csv.writer(f)
        writer.writerow(['train_loss', 'train_find_loss', 'train_sim_loss',
                         'dev_loss', 'dev_find_loss', 'dev_sim_loss', 'dev_f1_score'])
        for row in epoch_losses:
            writer.writerow(row)

    if len(secondary_eval_path) > 0:
        with open("../data/result_data/dev_2_loss_per_epoch_Find-Module-pt_{}.csv".format(
                args.experiment_name), "w") as f:
            writer = csv.writer(f)
            writer.writerow(["dev_2_avg_loss", "dev_2_avg_find_loss",
                             "dev_2_avg_sim_loss", "dev_2_f1_score"])
            for row in dev_2_epoch_losses:
                writer.writerow(row)
class VAE(BaseEstimator, TransformerMixin):
    """Variational autoencoder with a scikit-learn-style interface.

    :param decoder: Type of the decoder network. Possible values are
        'bernoulli' and 'gaussian'.
    """

    def __init__(self, nohiddens: int = 400, nolatents: int = 20,
                 nosamples: int = 1, noepochs: int = 15, batch_size: int = 100,
                 show_every: int = 100, decoder: str = 'bernoulli',
                 outdir: str = 'output/'):
        super(BaseEstimator, self).__init__()
        makedirs(outdir, exist_ok=True)
        self.noinputs = None
        self.nohiddens = nohiddens
        self.nolatents = nolatents
        self.nosamples = nosamples
        self.noepochs = noepochs
        self.batch_size = batch_size
        self.show_every = show_every
        self.outdir = outdir
        if decoder == 'bernoulli':
            self.fit = self.__fit_bernoulli
        elif decoder == 'gaussian':
            self.fit = self.__fit_gaussian
        else:
            raise ValueError(f'Unknown decoder type: "{decoder}".')
        self.logger = getLogger(__name__)
        self.model = None
        self.opt = None

    def fit(self, X):
        """Method fit is overloaded during construction of the VAE estimator.
        See the constructor for details.
        """

    def transform(self, X: Tensor) -> LatentVariable:
        latvar = LatentVariable(self.model, *self.model.encode(X))
        return latvar

    def inverse_transform(self, X: Tensor) -> Tensor:
        origin = self.model.decode(X)
        if isinstance(origin, tuple):
            return origin[0]
        else:
            return origin

    def __fit(self, dataset: Tensor, model: VAEBase):
        it = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
        dur = self.noepochs * ceil(len(dataset) / self.batch_size)
        history = History(zeros(dur), zeros(dur), zeros(dur), zeros(dur), zeros(dur))
        hooks = CombinedHook()
        hooks.add(LossHook)
        hooks.add(RekonstruktHook, dataset[:10, :])
        hooks.add(LatentSamplerHook, self.nolatents)
        hooks.prehook(self, history)
        self.model = model
        self.noinputs = model.noinputs
        self.opt = Adagrad(self.model.parameters(), lr=0.01)  # See Section 5.
        for epoch in range(self.noepochs):
            for i, x in enumerate(it):
                self.opt.zero_grad()
                # Apply the model in the following steps:
                # (a) encode the datapoint into the latent space;
                # (b) sample points from the latent space;
                # (c) decode the sampled points.
                mu, logsigma2 = self.model.encode(x)
                z = self.model.sample(mu, logsigma2)
                X = self.model.decode(z)
                # Estimate the KL divergence and the reconstruction error (RE).
                kl = self.model.kl(mu, logsigma2)
                re = self.model.re(x, X)
                # Backpropagate the error.
                loss = kl + re
                loss.backward()
                self.opt.step()
                # Aggregate runtime statistics.
                history.append(epoch=epoch, batch=i,
                               kl=float(kl / self.batch_size),
                               re=float(re / self.batch_size))
                if i % self.show_every == 0:
                    hooks.hook(self, history)
        # Print the status before exit.
        hooks.posthook(self, history)
        # Return itself for call chaining.
        return self

    def __fit_bernoulli(self, dataset: Tensor):
        params = self.get_params()
        params['noinputs'] = dataset.shape[1]
        model = VAEBernoulliDecoder(**params)
        return self.__fit(dataset, model)

    def __fit_gaussian(self, dataset: Tensor):
        params = self.get_params()
        params['noinputs'] = dataset.shape[1]
        model = VAEGaussianDecoder(**params)
        return self.__fit(dataset, model)
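# ------------------------------------------------------------------
# Usage sketch (not from the original source): since VAE follows the
# scikit-learn estimator protocol, fit/transform chain as usual. The
# data tensor below is an assumption.
# ------------------------------------------------------------------
import torch

# Hypothetical input: 60k flattened MNIST-like rows in [0, 1].
X = torch.rand(60_000, 784)

vae = VAE(nohiddens=400, nolatents=20, noepochs=5, decoder='bernoulli')
vae.fit(X)                       # dispatches to __fit_bernoulli
latent = vae.transform(X[:100])  # encode into the latent space
recon = vae.inverse_transform(torch.randn(100, 20))  # decode latent samples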
class trainW2V:
    """Train a word2vec model on text obtained from PubMed scraping."""

    def __init__(self, text, windowSize=5, negWords=15, embedDim=200,
                 vocabSize=None, nOccur=10, phMinCount=5, phThresh=10,
                 phDepth=2, wInit='scaled-uniform', epochs=50, batchSize=1024,
                 optimizer='SGD', lr=0.01, patience=5, epsilon=1e-5, raw=False,
                 tShuff=False, saveFreq=-1, restoreBest=True, outPath='./'):
        """
        Args:
            text (nested list): input text as a list of sentences.
            windowSize (int): size of the context window.
            negWords (int): number of negative words used in training.
            embedDim (int): dimensionality of the embedded space (default 200).
            vocabSize (int): size of the vocabulary (default None).
            nOccur (int): minimum number of occurrences to keep a word in the
                dictionary; can be overwritten by vocabSize (default 10).
            phMinCount (int): minimum number of occurrences to keep a phrase
                (default 5).
            phThresh (float): minimum score to keep a phrase (default 10).
            phDepth (int): number of recursions during the phrase search
                (1 = bi-grams, default 2).
            wInit (string): distribution from which to draw the initial node
                weights (only 'scaled-uniform' and 'xavier' are currently
                available, default 'scaled-uniform').
            epochs (int): number of epochs (default 50).
            batchSize (int): size of batches (default 1024).
            optimizer (str): optimizer choice, 'SGD' and 'Adagrad' only
                (default 'SGD').
            lr (float): learning rate (default .01).
            patience (int): early-stop patience (default 5).
            epsilon (float): early-stop epsilon (default 1e-5).
            raw (bool): if True, clean the input text (default False).
            tShuff (bool): shuffle the training set at each epoch (default False).
            saveFreq (int): frequency of model checkpoints; if < 0, don't save
                checkpoints (default -1).
            restoreBest (bool): restore and save the best model found by early
                stopping.
            outPath (string): path to the directory where the trained models
                are saved.
        """
        # Set up the training dataset and batches.
        self.trainDs = textDataset(text, windowSize, negWords, vocabSize=vocabSize,
                                   nOccur=nOccur, phMinCount=phMinCount,
                                   phThresh=phThresh, phDepth=phDepth, raw=raw)
        self.trainBatch = DataLoader(self.trainDs, batch_size=batchSize,
                                     shuffle=tShuff)

        # Set up the model.
        self.model = skipGram(int(self.trainDs.wDict.shape[0]), embedDim, wInit)

        # Send the model to the GPU if available.
        if torch.cuda.is_available():
            self.model.cuda()

        self.epochs = epochs
        if optimizer == 'SGD':
            # No momentum allowed with sparse matrices.
            self.optimizer = SGD(self.model.parameters(), lr=lr)
        elif optimizer == 'Adagrad':
            self.optimizer = Adagrad(self.model.parameters(), lr=lr)
        else:
            print('ERROR: ' + optimizer + ' is not available, please select SGD or Adagrad.')
            sys.exit(1)

        self.losses = []

        # Set up early stopping.
        self.earlStop = EarlyStopping(patience=patience, epsilon=epsilon,
                                      keepBest=True)
        self.restoreBest = restoreBest

        self.saveFreq = saveFreq
        if self.saveFreq < 0:
            self.saveFreq = self.epochs + 1

        self.outPath = outPath
        if not os.path.exists(self.outPath):
            os.makedirs(self.outPath)

    def train(self):
        """Run the training of the model."""
        for epoch in tqdm(range(self.epochs), desc='Epoch'):
            pBarB = tqdm(enumerate(self.trainBatch), total=len(self.trainBatch),
                         desc='Batch')
            for batchNum, batch in pBarB:
                wordBatch = batch[0]
                contBatch = batch[1]
                negaBatch = batch[2]

                # Move the batches to the GPU if available.
                if torch.cuda.is_available():
                    wordBatch = wordBatch.cuda()
                    contBatch = contBatch.cuda()
                    negaBatch = negaBatch.cuda()

                # Core of the training.
                self.optimizer.zero_grad()
                loss = self.model(wordBatch, contBatch, negaBatch)
                loss.backward()
                self.optimizer.step()

                pBarB.set_postfix({'loss': '{:.5f}'.format(loss.item())})

            # Store the loss.
            self.losses.append(loss.item())

            # Save a checkpoint model every n-th epoch.
            if epoch > 0 and epoch % self.saveFreq == 0:
                self.saveModel(name='_{:d}_{:.5f}'.format(epoch, loss.item()))

            # Early-stop check.
            self.earlStop(loss, self.model)
            if self.earlStop.earlyStop:
                print('Limit loss improvement reached, stopping the training.')
                break

        # Restore and save the best model.
        if self.restoreBest:
            self.model = self.earlStop.bestModel

    def saveModel(self, name):
        """Save any model and its dictionary.

        Args:
            name (string): file name.
        """
        torch.save({'model_state_dict': self.model.state_dict(),
                    'word_to_ix': self.trainDs.wDict['word'].to_dict()},
                   os.path.join(self.outPath, 'model_' + name + '.pt'))

    def getEmbedded(self):
        """Return the embedding layer weights, equivalent to the word vectors
        in the embedded space.

        Returns:
            (numpy array): the embedding layer weights.
        """
        return self.model.getEmbedded()
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir,
                                       'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)
        params = list(self.model.encoder.parameters()) + \
            list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(params, lr=initial_lr,
                                 initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()
        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)
        s_t_1_origin = s_t_1
        batch_size = batch.batch_size

        step_losses = []
        sample_idx = []
        sample_log_probs = torch.zeros(batch_size)
        baseline_idx = []

        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # teacher forcing, shape [batch_size]
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

            # --- sampled rollout ---
            if di == 0:  # start from decoder input[0], which is <BOS>
                sample_t_1 = dec_batch[:, di]
                s_t_sample = s_t_1_origin
                c_t_sample = torch.zeros((batch_size, 2 * config.hidden_dim))
            final_dist, s_t_sample, c_t_sample, attn_dist, p_gen, next_coverage = self.model.decoder(
                sample_t_1, s_t_sample, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_sample, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            # Sample the next input token from final_dist.
            dist = torch.distributions.Categorical(final_dist)
            sample_t_1 = dist.sample()
            # Record the sampled ids (a list of tensors).
            sample_idx.append(sample_t_1)
            # Accumulate the sample log-probability; gather along axis=1, then
            # squeeze so shapes stay (batch_size,).
            sample_log_probs += torch.log(
                final_dist.gather(1, sample_t_1.view(-1, 1)).squeeze(1))

            # --- greedy baseline rollout ---
            if di == 0:  # start from decoder input[0], which is <BOS>
                baseline_t_1 = dec_batch[:, di]
                s_t_baseline = s_t_1_origin
                c_t_baseline = torch.zeros((batch_size, 2 * config.hidden_dim))
            final_dist, s_t_baseline, c_t_baseline, attn_dist, p_gen, next_coverage = self.model.decoder(
                baseline_t_1, s_t_baseline, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_baseline, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            # The baseline takes the argmax along axis=1.
            baseline_t_1 = final_dist.max(1)[1]
            # Record the baseline ids.
            baseline_idx.append(baseline_t_1)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        # Use sample_idx and baseline_idx to compute the RL loss: map the ids
        # to strings, compute ROUGE scores, then combine.
        sample_idx = torch.stack(sample_idx, dim=1).squeeze()      # (batch_size, seq_len)
        baseline_idx = torch.stack(baseline_idx, dim=1).squeeze()
        rl_loss = torch.zeros(batch_size)
        for i in range(sample_idx.shape[0]):  # each example in the batch
            sample_y = data.outputids2words(
                sample_idx[i], self.vocab,
                (batch.art_oovs[i] if config.pointer_gen else None))
            baseline_y = data.outputids2words(
                baseline_idx[i], self.vocab,
                (batch.art_oovs[i] if config.pointer_gen else None))
            true_y = batch.original_abstracts[i]
            sample_score = rouge_l_f(sample_y, true_y)
            baseline_score = rouge_l_f(baseline_y, true_y)
            # Self-critical reward: baseline score minus sample score.
            rl_loss[i] = baseline_score - sample_score
        # Reduce to a scalar so it can be combined with the NLL loss.
        rl_loss = (rl_loss * sample_log_probs).mean()

        gamma = 0.9984
        loss = (1 - gamma) * loss + gamma * rl_loss

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()
        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 5000 == 0:
                self.save_model(running_avg_loss, iter)
class Train(object):
    def __init__(self, model_file_path=None):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)

        if not model_file_path:
            train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
            if not os.path.exists(train_dir):
                os.mkdir(train_dir)
        else:
            train_dir = re.sub('/model/model.*', '', model_file_path)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.create_file_writer(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir,
                                       'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)
        params = list(self.model.encoder.parameters()) + \
            list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(params, lr=initial_lr,
                                 initial_accumulator_value=config.adagrad_init_acc)
        # self.optimizer = Adam(params)

        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()
        return start_iter, start_loss

    def f(self, x, alpha):
        # Abstractiveness weighting used by get_loss_mask: 1 - x ** alpha.
        # (An alternative linear schedule based on utils.EPOCH was tried and
        # left disabled.)
        return 1 - x ** alpha

    def get_loss_mask(self, src, tgt, absts, alpha=config.alpha):
        # Weight each example's loss by how abstractive its reference summary
        # is; absts holds the precomputed abstractiveness scores.
        loss_mask = []
        for i in range(len(src)):
            abst = absts[i]
            loss_factor = self.f(abst, alpha)
            loss_mask.append(loss_factor)
        return torch.Tensor(loss_mask).cuda()

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)
        loss_mask = self.get_loss_mask(enc_batch, dec_batch, batch.absts)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage, tau = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            if config.loss_mask:
                step_loss = step_loss * loss_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        if not config.DEBUG:
            loss.backward()
            self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                        config.max_grad_norm)
            clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)
            self.optimizer.step()
        return loss.item(), tau

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        start_iter = iter
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss, tau = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1
            if config.DEBUG:
                debug('iter', iter)
                if iter - start_iter > config.BREAK_POINT:
                    break
            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 100
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                if config.adaptive_sparsemax:
                    print('tau + eps', [round(e[0], 4) for e in
                                        (tau + config.eps).detach().cpu().numpy().tolist()])
                start = time.time()
            if iter % 5000 == 0:
                self.save_model(running_avg_loss, iter)
class Train:
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, moving_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': moving_avg_loss
        }
        model_save_path = os.path.join(self.model_dir,
                                       'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)
        params = list(self.model.encoder.parameters()) + \
            list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.do_coverage else config.lr
        self.optimizer = Adagrad(params, lr=initial_lr,
                                 initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            # At a certain epoch, training switches to the coverage architecture,
            # which requires a fresh optimizer state; this controls when the
            # switch happens.
            if not config.do_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()
        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, context_v, coverage = \
            get_encoder_variables(batch, use_cuda)
        # dec_lens_var: lengths of the decoder target sequences in the batch.
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_decoder_variables(batch, use_cuda)

        self.optimizer.zero_grad()

        if 0 in enc_lens:
            print('=================')
            print(enc_batch.shape)
            print(enc_lens)
            print(enc_batch)
            print('=================')

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        d_hc = self.model.reduce_state(encoder_hidden)  # initial decoder (h, c)

        step_losses = []
        for step in range(min(max_dec_len, config.max_dec_steps)):
            d_inp = dec_batch[:, step]  # teacher forcing
            final_dist, d_hc, context_v, attn_dist, p_gen, next_coverage = self.model.decoder(
                d_inp, d_hc, encoder_outputs, encoder_feature, enc_padding_mask,
                context_v, extra_zeros, enc_batch_extend_vocab, coverage, step)
            target = target_batch[:, step]
            # Gather the predicted probability of the target id at each step.
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.do_coverage:
                # Use the accumulated encoder attention as a loss; see the
                # original paper.
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, step]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()
        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, moving_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        pbar = tqdm.tqdm(total=n_iters)
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            moving_avg_loss = calc_moving_avg_loss(loss, moving_avg_loss,
                                                   self.summary_writer, iter)
            iter += 1
            pbar.update(1)

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 100
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 5000 == 0:
                self.save_model(moving_avg_loss, iter)
        pbar.close()
class Trainer:
    def __init__(self, config):
        self.config = config
        self.device = config['device']
        self.step = 0
        if os.path.exists('../vocab.pt'):
            self.vocab = torch.load('../vocab.pt')
        else:
            self.vocab = Vocab(config['vocab_file'], config['vocab_size'])
            torch.save(self.vocab, '../vocab.pt')
        self.train_data = CNNDMDataset('train', config['data_path'], config, self.vocab)
        self.validate_data = CNNDMDataset('val', config['data_path'], config, self.vocab)
        self.setup(config)

    def setup(self, config):
        self.model = Model(config).to(config['device'])
        self.optimizer = Adagrad(self.model.parameters(),
                                 lr=config['learning_rate'],
                                 initial_accumulator_value=0.1)
        # self.optimizer = Adam(self.model.parameters(), lr=config['learning_rate'],
        #                       betas=config['betas'])
        checkpoint = None
        if config['train_from'] != '':
            # Note: between two most_common() calls, a Counter may yield
            # elements of equal frequency in a different order!
            logging('Train from %s' % config['train_from'])
            checkpoint = torch.load(config['train_from'], map_location='cpu')
            self.model.load_state_dict(checkpoint['model'])
            self.step = checkpoint['step']
            self.vocab = checkpoint['vocab']
            self.optimizer.load_state_dict(checkpoint['optimizer'])

    def train_one(self, batch):
        """Coverage not implemented."""
        config = self.config
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros = \
            get_input_from_batch(batch, config, self.device)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, self.device)
        pred = self.model(enc_batch, dec_batch, enc_padding_mask, dec_padding_mask,
                          enc_batch_extend_vocab, extra_zeros)
        # loss = self.model.nll_loss(pred, target_batch, dec_lens_var)
        loss = self.model.label_smoothing_loss(pred, target_batch)
        return loss

    def train(self):
        config = self.config
        train_loader = DataLoader(self.train_data, batch_size=config['batch_size'],
                                  shuffle=True, collate_fn=Collate())
        running_avg_loss = 0
        self.model.train()
        for _ in range(config['train_epoch']):
            for batch in train_loader:
                self.step += 1
                loss = self.train_one(batch)
                running_avg_loss = calc_running_avg_loss(loss.item(), running_avg_loss)
                loss.div(float(config['gradient_accum'])).backward()
                if self.step % config['gradient_accum'] == 0:  # gradient accumulation
                    clip_grad_norm_(self.model.parameters(), config['max_grad_norm'])
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                if self.step % config['report_every'] == 0:
                    logging("Step %d Train loss %.3f" % (self.step, running_avg_loss))
                if self.step % config['save_every'] == 0:
                    self.save()
                if self.step % config['validate_every'] == 0:
                    self.validate()

    @torch.no_grad()
    def validate(self):
        self.model.eval()
        validate_loader = DataLoader(self.validate_data,
                                     batch_size=self.config['batch_size'],
                                     shuffle=False, collate_fn=Collate())
        losses = []
        for batch in tqdm(validate_loader):
            loss = self.train_one(batch)
            losses.append(loss.item())
        self.model.train()
        ave_loss = sum(losses) / len(losses)
        logging('Validate loss : %f' % ave_loss)

    def save(self):
        state = {
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'step': self.step,
            'vocab': self.vocab
        }
        save_path = os.path.join(self.config['model_path'], 'model_s%d.pt' % self.step)
        logging('Saving model step %d to %s...' % (self.step, save_path))
        torch.save(state, save_path)
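# ------------------------------------------------------------------
# Several of the pointer-generator trainers above pass
# initial_accumulator_value to Adagrad (0.1 is hard-coded in Trainer).
# A small illustration (not from the original source) of what that
# parameter does: Adagrad's per-parameter step is
# lr * grad / (sqrt(state_sum) + eps), where state_sum starts at
# initial_accumulator_value and accumulates squared gradients.
# ------------------------------------------------------------------
import torch
from torch.optim import Adagrad

p = torch.zeros(1, requires_grad=True)
opt = Adagrad([p], lr=0.15, initial_accumulator_value=0.1)

p.grad = torch.ones(1)      # pretend the gradient is 1.0
opt.step()
# state_sum = 0.1 + 1.0**2 = 1.1, so the step is 0.15 / sqrt(1.1) ~= 0.143
print(p)                    # tensor([-0.1430], requires_grad=True)
print(opt.state[p]['sum'])  # tensor([1.1000])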
class TrainSeq2Seq(object):
    def __init__(self, is_word_level=False, is_combined=False, alpha=0.3):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        # Note: train_nll expects self.batcher, which is left commented out here.
        # self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
        #                        batch_size=config.batch_size, single_pass=False)
        self.dataset = DailyMailDataset("train", self.vocab)
        self.is_word_level = is_word_level
        self.is_combined = is_combined
        self.alpha = alpha

        if is_word_level:
            print("Using Word Level Policy Gradient")
        elif is_combined:
            print("Using Combined Policy Gradient w/ alpha = ", alpha)
        else:
            print("Using Sentence Level Policy Gradient")

        train_dir = './train_dumps'
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(
            train_dir, 'dumps_model_{:%m_%d_%H_%M}'.format(datetime.now()))
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir,
                                       'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)
        return model_save_path

    def setup(self, seqseq_model, model_file_path):
        self.model = seqseq_model
        params = list(self.model.encoder.parameters()) + \
            list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(params, lr=initial_lr,
                                 initial_accumulator_value=config.adagrad_init_acc)
        # self.optimizer = Adam(params, lr=initial_lr)

        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            print("Loading checkpoint ....")
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if config.use_gpu:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()
        return start_iter, start_loss

    def train_one_batch_nll(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, config.use_gpu)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, config.use_gpu)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()
        return loss.item()

    def train_nll(self, n_iters, iter, running_avg_loss):
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch_nll(batch)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)
            print("Iteration:", iter, " loss:", loss,
                  " Running avg loss:", running_avg_loss)
            iter += 1

            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 1000 == 0:
                self.save_model(running_avg_loss, iter)

    def train_pg(self, n_iters, start_iter, start_running_avg_loss,
                 start_pg_losses, start_run_avg_losses, num_epochs=50):
        """Train the generator with policy gradients, using the reward from
        the discriminator. Training runs for num_epochs epochs.
        """
        dataloader = DataLoader(self.dataset, batch_size=config.batch_size,
                                shuffle=True, num_workers=1,
                                collate_fn=create_batch_collate(self.vocab,
                                                                config.batch_size))
        start = time.time()
        running_avg_loss = start_running_avg_loss
        pg_losses = start_pg_losses
        run_avg_losses = start_run_avg_losses
        iteration = start_iter

        for epoch in range(num_epochs):
            print("Epoch :", epoch + 1)
            for batch in dataloader:
                iteration += 1
                loss = self.train_one_batch_pg(batch)
                running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                         iteration)
                print("Iteration:", iteration, " PG loss:", loss,
                      " Running avg loss:", running_avg_loss)
                pg_losses.append(loss)
                run_avg_losses.append(running_avg_loss)

                print_interval = 10
                if iteration % print_interval == 0:
                    print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                          (iteration, print_interval, time.time() - start, loss))
                    start = time.time()
                if iteration % 10 == 0:
                    # Dump the model and losses.
                    model_file_path = self.save_model(running_avg_loss, iteration)
                    pickle.dump(pg_losses,
                                open(os.path.join(self.model_dir,
                                                  'train_pg_losses_{}.p'.format(iteration)), 'wb'))
                    pickle.dump(run_avg_losses,
                                open(os.path.join(self.model_dir,
                                                  'train_run_avg_losses_{}.p'.format(iteration)), 'wb'))
                    # Run eval.
                    eval_processor = Evaluate_pg(model_file_path,
                                                 is_word_level=self.is_word_level,
                                                 is_combined=self.is_combined,
                                                 alpha=self.alpha)
                    eval_losses = eval_processor.run_eval(self.model_dir, iteration)
                    # Check whether we should stop.
                    avg_eval_loss = np.mean(eval_losses)
                    if running_avg_loss < avg_eval_loss:
                        print("Stopping at iteration {}".format(iteration))
                        break

    def compute_policy_grads_using_rewards(self, sentence_rewards, word_rewards,
                                           sentence_losses, word_losses,
                                           word_to_sent_ind):
        if self.is_combined:
            # Mix word- and sentence-level rewards with weight alpha.
            pg_losses = [
                [(self.alpha * word_reward +
                  (1 - self.alpha) * sentence_rewards[i][word_to_sent_ind[i][j]]) *
                 word_losses[i][j]
                 for j, word_reward in enumerate(abstract_rewards)
                 if j < len(word_to_sent_ind[i])]
                for i, abstract_rewards in enumerate(word_rewards)]
            pg_losses = [sum(pg) for pg in pg_losses]
        elif self.is_word_level:
            pg_losses = [
                [word_reward * word_losses[i][j]
                 for j, word_reward in enumerate(abstract_rewards)
                 if j < len(word_to_sent_ind[i])]
                for i, abstract_rewards in enumerate(word_rewards)]
            pg_losses = [sum(pg) for pg in pg_losses]
        else:
            pg_losses = [[rs * sentence_losses[ri][rsi] for rsi, rs in enumerate(r)]
                         for ri, r in enumerate(sentence_rewards)]
            pg_losses = [sum(pg) for pg in pg_losses]
        return pg_losses

    def compute_pg_loss(self, orig, pred, sentence_losses, split_predictions,
                        word_losses, word_to_sent_ind):
        sentence_rewards = None
        word_rewards = None
        # First compute the rewards.
        if not self.is_word_level or self.is_combined:
            sentence_rewards = get_sentence_rewards(orig, pred)
        if self.is_word_level or self.is_combined:
            word_rewards = get_word_level_rewards(orig, split_predictions)

        pg_losses = self.compute_policy_grads_using_rewards(
            sentence_rewards=sentence_rewards,
            word_rewards=word_rewards,
            sentence_losses=sentence_losses,
            word_losses=word_losses,
            word_to_sent_ind=word_to_sent_ind)
        return pg_losses

    def compute_batched_sentence_loss(self, word_losses, orig, pred):
        orig_sum = []
        new_pred = []
        pred_sum = []
        sentence_losses = []
        # Convert each original summary into one single string per article.
        for i in range(len(orig)):
            orig_sum.append(' '.join(map(str, orig[i])))
            new_pred.append([])
            pred_sum.append([])
            sentence_losses.append([])

        batch_sent_indices = []
        for i in range(len(pred)):
            sentence = pred[i]
            losses = word_losses[i]
            sentence_indices = []
            count = 0
            # Split the prediction into sentences at '.' tokens, accumulating
            # the per-sentence losses.
            while len(sentence) > 0:
                try:
                    idx = sentence.index(".")
                except ValueError:
                    idx = len(sentence)
                sentence_indices.extend([count for _ in range(idx)])
                if count > 0:
                    new_pred[i].append(new_pred[i][count - 1] + sentence[:idx + 1])
                else:
                    new_pred[i].append(sentence[:idx + 1])
                sentence_losses[i].append(sum(losses[:idx + 1]))
                sentence = sentence[idx + 1:]
                losses = losses[idx + 1:]
                count += 1
            batch_sent_indices.append(sentence_indices)

        for i in range(len(pred)):
            for j in range(len(new_pred[i])):
                pred_sum[i].append(' '.join(map(str, new_pred[i][j])))

        pg_losses = self.compute_pg_loss(orig_sum, pred_sum, sentence_losses,
                                         split_predictions=pred,
                                         word_losses=word_losses,
                                         word_to_sent_ind=batch_sent_indices)
        return pg_losses

    def train_one_batch_pg(self, batch):
        batch_size = batch.batch_size
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, config.use_gpu)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, config.use_gpu)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        output_ids = []
        # Begin with the START symbol.
        y_t_1 = torch.ones(batch_size, dtype=torch.long) * self.vocab.word2id(
            data.START_DECODING)
        if config.use_gpu:
            y_t_1 = y_t_1.cuda()
        for _ in range(batch_size):
            output_ids.append([])
            step_losses.append([])

        for di in range(min(max_dec_len, config.max_dec_steps)):
            # No teacher forcing here: the decoder consumes its own predictions.
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)  # NLL
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            # Move on to the next token (greedy decoding).
            _, idx = torch.max(final_dist, 1)
            idx = idx.reshape(batch_size, -1).squeeze()
            y_t_1 = idx
            for i, pred in enumerate(y_t_1):
                if not pred.item() == data.PAD_TOKEN:
                    output_ids[i].append(pred.item())
            for i, loss in enumerate(step_loss):
                step_losses[i].append(step_loss[i])

        # Obtain the original and predicted summaries.
        original_abstracts = batch.original_abstracts_sents
        predicted_abstracts = [data.outputids2words(ids, self.vocab, None)
                               for ids in output_ids]

        # Compute the batched policy-gradient loss.
        batched_losses = self.compute_batched_sentence_loss(step_losses,
                                                            original_abstracts,
                                                            predicted_abstracts)
        losses = torch.stack(batched_losses)
        losses = losses / dec_lens_var
        loss = torch.mean(losses)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()
        return loss.item()
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)

        stamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
        train_dir = os.path.join(config.log_root, 'train_{}'.format(stamp))
        if not os.path.exists(train_dir):
            os.makedirs(train_dir)
        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
        self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter_step):
        """Save the model."""
        state = {
            'iter': iter_step,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        stamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
        model_save_path = os.path.join(self.model_dir, 'model_{}_{}'.format(iter_step, stamp))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        """Initialize or load the model; set up the iteration counter, loss, and optimizer."""
        # Initialize the model
        self.model = Model(model_file_path)
        # List of model parameters
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr  # one of lr_coverage / lr
        # Define the optimizer
        self.optimizer = Adagrad(params, lr=initial_lr,
                                 initial_accumulator_value=config.adagrad_init_acc)
        # Initialize iteration count and loss
        start_iter, start_loss = 0, 0
        # If an existing model path was passed in, load it and resume training
        if model_file_path is not None:
            state = torch.load(model_file_path, map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if USE_CUDA:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.to(DEVICE)
        return start_iter, start_loss

    def train_one_batch(self, batch):
        """
        Train on one batch and return its loss.

        enc_batch: torch.Size([16, 400]); encodings of 16 articles, padded to 400 tokens, OOV words encoded as 0
        enc_padding_mask: torch.Size([16, 400]); 0 at pad positions, 1 elsewhere
        enc_lens: numpy.ndarray; number of tokens in each article
        enc_batch_extend_vocab: torch.Size([16, 400]); article encodings where OOV words get ids beyond the vocabulary
        extra_zeros: torch.Size([16, number of article OOV words]), zero tensor
        c_t_1: torch.Size([16, 512]), zero tensor
        coverage: Variable(torch.zeros(batch_size, max_enc_seq_len)) if is_coverage==True else None; filled in later in coverage mode
        ----------------------------------------
        dec_batch: torch.Size([16, 100]); abstract encodings, including the START symbol and PAD
        dec_padding_mask: torch.Size([16, 100]); 0 at pad positions, 1 elsewhere
        max_dec_len: scalar; number of abstract tokens, excluding PAD
        dec_lens_var: torch.Size([16]); number of abstract tokens
        target_batch: torch.Size([16, 100]); target abstract encodings, including the STOP symbol and PAD
        """
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)  # (meaning of extra_zeros not fully understood yet)

        self.optimizer.zero_grad()

        """
        Disabled debug block; remember to add a vocab attribute to the Batch class first.
        print("Model input, article encodings:", "*" * 100)
        print("enc_batch:", enc_batch, enc_batch.size())
        print("enc_batch[-1]:", enc_batch[-1])
        print("enc_batch[-1] as text:", [batch.vocab.id2word(idx) for idx in enc_batch[-1].cpu().numpy()])
        print("enc_padding_mask:", enc_padding_mask, enc_padding_mask.size())
        print("enc_lens:", enc_lens, enc_lens.shape)
        print("enc_batch_extend_vocab:", enc_batch_extend_vocab, enc_batch_extend_vocab.size())
        print("enc_batch_extend_vocab[-1] as text:",
              [batch.vocab.id2word(idx) if idx < 50000 else '[UNK]+{}'.format(idx - 50000)
               for idx in enc_batch_extend_vocab[-1].cpu().numpy()])
        print("extra_zeros:", extra_zeros, extra_zeros.size())
        print("c_t_1:", c_t_1, c_t_1.size())
        print("coverage:", coverage)
        print("Model input, abstract encodings (source and target):", "*" * 100)
        print("dec_batch:", dec_batch, dec_batch.size())
        print("dec_batch[0] as text:", [batch.vocab.id2word(idx) for idx in dec_batch[0].cpu().numpy()])
        print("dec_padding_mask:", dec_padding_mask, dec_padding_mask.size())
        print("max_dec_len:", max_dec_len)
        print("dec_lens_var:", dec_lens_var, dec_lens_var.size())
        print("target_batch:", target_batch, target_batch.size())
        print("target_batch[0] as text:",
              [batch.vocab.id2word(idx) if idx < 50000 else '[UNK]+{}'.format(idx - 50000)
               for idx in target_batch[0].cpu().numpy()])
        input("Press any key to continue >>>")
        """

        # [B, max(seq_lens), 2*hid_dim], [B*max(seq_lens), 2*hid_dim], tuple([2, B, hid_dim], [2, B, hid_dim])
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)  # (h, c) = ([3, B, hid_dim], [3, B, hid_dim])

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            # One abstract token: the token at the same position in every example of the batch
            y_t_1 = dec_batch[:, di]
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]  # encoding of the next abstract token
            # final_dist holds a probability for every word in the extended
            # vocabulary, i.e. its size can exceed the preset 50_000
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()  # probability of the target word
            step_loss = -torch.log(gold_probs + config.eps)  # maximizing gold_probs = minimizing step_loss (hence the minus sign)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)
        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()
        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        # Training setup
        iter_step, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        print_interval = 100
        while iter_step < n_iters:
            # Fetch the next batch
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter_step)
            iter_step += 1
            if iter_step % 100 == 0:
                self.summary_writer.flush()
            if iter_step % print_interval == 0:
                # lr = self.optimizer.state_dict()['param_groups'][0]['lr']
                logging.info('steps %d, seconds for %d steps: %.2f, loss: %f' %
                             (iter_step, print_interval, time.time() - start, loss))
                start = time.time()
            # Save the model every 50000 iterations
            if iter_step % 50000 == 0:
                logging.info("model saved = {}/{}".format(
                    int(iter_step / 50000) + 1, int(config.max_iterations / 50000) + 1))
                self.save_model(running_avg_loss, iter_step)
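For reference, the per-step objective that train_one_batch accumulates above (and that the remaining Train/Trainer classes in this section repeat) is the pointer-generator loss; in the coverage setting, with attention distribution $a^t$, coverage vector $c^t$, padding mask $m_t$, gold token $w^*_t$, $\lambda$ = cov_loss_wt, and $\epsilon$ = config.eps:

\[
\mathcal{L}_t = -\log\big(P(w^*_t) + \epsilon\big) + \lambda \sum_i \min(a_i^t, c_i^t),
\qquad
\mathcal{L} = \frac{1}{B}\sum_{b=1}^{B} \frac{\sum_t m_t \,\mathcal{L}_t}{\mathrm{dec\_lens\_var}_b}
\]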
class Trainer():
    def __init__(self, model, args, train_dataset, eval_dataset, test_dataset,
                 vocab, is_train=True):
        self.model = model  # .to(args.device)
        self.args = args
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.test_dataset = test_dataset
        self.is_train = is_train
        self.vocab = vocab
        self.params = list(model.encoder.parameters()) + \
            list(model.decoder.parameters()) + list(model.reduce_state.parameters())
        initial_lr = args.lr_coverage if args.is_coverage else args.lr
        self.optimizer = Adagrad(self.params, lr=initial_lr,
                                 initial_accumulator_value=args.adagrad_init_acc)

    def get_train_dataloader(self):
        if self.train_dataset is None:
            raise ValueError('Trainer: training requires a train_dataset.')
        return BucketIterator(dataset=self.train_dataset,
                              batch_size=self.args.batch_size,
                              device=self.args.device,
                              sort_key=lambda x: len(x.source),
                              sort_within_batch=True)

    def get_eval_dataloader(self):
        if self.eval_dataset is None:
            raise ValueError('Trainer: eval requires an eval_dataset.')
        return BucketIterator(dataset=self.eval_dataset,
                              batch_size=self.args.batch_size,
                              device=self.args.device,
                              sort_key=lambda x: len(x.source),
                              sort_within_batch=True)

    def get_test_dataloader(self):
        if self.test_dataset is None:
            raise ValueError('Trainer: testing requires a test_dataset.')
        return BucketIterator(dataset=self.test_dataset,
                              batch_size=self.args.batch_size,
                              device=self.args.device,
                              sort_key=lambda x: len(x.source),
                              sort_within_batch=True)

    def get_mask(self, batch):
        # batch is the (tensor, lengths) pair produced by include_lengths=True
        maxlen = batch[0].size()[1]
        max_enc_seq_len = batch[1]
        mask = torch.arange(maxlen).to(self.args.device)
        mask = mask[None, :] < max_enc_seq_len[:, None]
        return mask

    def get_extra_features(self, batch):
        # Re-number UNK tokens with temporary per-article OOV ids beyond the vocabulary
        unk_index = self.vocab.stoi[UNKNOWN_TOKEN]
        batch = batch.cpu().detach().numpy()
        batch_size = batch.shape[0]
        enc_batch_extend_vocab = np.full_like(batch, fill_value=self.vocab.stoi[PAD_TOKEN])
        max_art_oovs = 0
        for i, sample_index in enumerate(batch):
            oov_word_count = len(self.vocab)
            for j, word_index in enumerate(sample_index):
                if word_index == unk_index:
                    enc_batch_extend_vocab[i, j] = oov_word_count
                    oov_word_count += 1
            max_art_oovs = max(max_art_oovs, oov_word_count)
        max_art_oovs -= len(self.vocab)
        enc_batch_extend_vocab = Variable(torch.from_numpy(enc_batch_extend_vocab).long())
        extra_zeros = Variable(torch.zeros((batch_size, max_art_oovs)))
        return extra_zeros, enc_batch_extend_vocab, max_art_oovs

    def save_model(self, running_avg_loss, iter, model_dir):
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss,
            'vocab': self.vocab
        }
        model_save_path = os.path.join(model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def evaluate(self, eval_dataset=None, iter=0, is_test=False):
        if is_test:
            eval_iter = self.get_test_dataloader()
        else:
            eval_iter = self.get_eval_dataloader()
        self.model.eval()
        running_avg_loss = 0
        with torch.no_grad():
            for i, batch in tqdm(enumerate(eval_iter), total=len(eval_iter)):
                batch_size = batch.batch_size
                # encoder part
                enc_padding_mask = self.get_mask(batch.source)
                enc_batch = batch.source[0]
                enc_lens = batch.source[1]
                encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
                s_t_1 = self.model.reduce_state(encoder_hidden)
                coverage = Variable(torch.zeros(batch.source[0].size())).to(self.args.device)
                c_t_1 = Variable(torch.zeros((batch_size, 2 * self.args.hidden_dim))).to(self.args.device)
                extra_zeros, enc_batch_extend_vocab, max_art_oovs = self.get_extra_features(batch.source[0])
                extra_zeros = extra_zeros.to(self.args.device)
                enc_batch_extend_vocab = enc_batch_extend_vocab.to(self.args.device)
                # decoder part
                dec_batch = batch.target[0][:, :-1]
                target_batch = batch.target[0][:, 1:]  # shifted so the target is the next token
                dec_lens_var = batch.target[1]
                dec_padding_mask = self.get_mask(batch.target)
                max_dec_len = max(dec_lens_var)

                step_losses = []
                for di in range(min(max_dec_len, self.args.max_dec_steps) - 1):
                    y_t_1 = dec_batch[:, di]  # Teacher forcing
                    final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                        y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                        c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
                    target = target_batch[:, di]
                    gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
                    step_loss = -torch.log(gold_probs + self.args.eps)
                    if self.args.is_coverage:
                        step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                        step_loss = step_loss + self.args.cov_loss_wt * step_coverage_loss
                        coverage = next_coverage
                    step_mask = dec_padding_mask[:, di]
                    step_loss = step_loss * step_mask
                    step_losses.append(step_loss)

                sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
                batch_avg_loss = sum_losses / dec_lens_var
                loss = torch.mean(batch_avg_loss)
                # No gradient clipping or optimizer step during evaluation
                name = 'Test' if is_test else 'Evaluation'
                running_avg_loss = calc_running_avg_loss(loss.item(), running_avg_loss,
                                                         summary_writer, iter, name)

    def train(self, model_path=None):
        train_iter = self.get_train_dataloader()
        iter, running_avg_loss = 0, 0
        start = time.time()
        for epoch in range(self.args.epoches):
            print(f"Epoch: {epoch+1}")
            self.model.train()
            for i, batch in tqdm(enumerate(train_iter), total=len(train_iter)):
                batch_size = batch.batch_size
                self.optimizer.zero_grad()  # reset gradients each step
                # encoder part
                enc_padding_mask = self.get_mask(batch.source)
                enc_batch = batch.source[0]
                enc_lens = batch.source[1]
                encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
                s_t_1 = self.model.reduce_state(encoder_hidden)
                coverage = Variable(torch.zeros(batch.source[0].size())).to(self.args.device)
                c_t_1 = Variable(torch.zeros((batch_size, 2 * self.args.hidden_dim))).to(self.args.device)
                extra_zeros, enc_batch_extend_vocab, max_art_oovs = self.get_extra_features(batch.source[0])
                extra_zeros = extra_zeros.to(self.args.device)
                enc_batch_extend_vocab = enc_batch_extend_vocab.to(self.args.device)
                # decoder part
                dec_batch = batch.target[0][:, :-1]
                target_batch = batch.target[0][:, 1:]  # shifted so the target is the next token
                dec_lens_var = batch.target[1]
                dec_padding_mask = self.get_mask(batch.target)
                max_dec_len = max(dec_lens_var)

                step_losses = []
                for di in range(min(max_dec_len, self.args.max_dec_steps) - 1):
                    y_t_1 = dec_batch[:, di]  # Teacher forcing
                    final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                        y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                        c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
                    target = target_batch[:, di]
                    gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
                    step_loss = -torch.log(gold_probs + self.args.eps)
                    if self.args.is_coverage:
                        step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                        step_loss = step_loss + self.args.cov_loss_wt * step_coverage_loss
                        coverage = next_coverage
                    step_mask = dec_padding_mask[:, di]
                    step_loss = step_loss * step_mask
                    step_losses.append(step_loss)

                sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
                batch_avg_loss = sum_losses / dec_lens_var
                loss = torch.mean(batch_avg_loss)
                loss.backward()

                norm = clip_grad_norm_(self.model.encoder.parameters(), self.args.max_grad_norm)
                clip_grad_norm_(self.model.decoder.parameters(), self.args.max_grad_norm)
                clip_grad_norm_(self.model.reduce_state.parameters(), self.args.max_grad_norm)
                self.optimizer.step()

                # summary_writer and model_dir are assumed to be module-level globals here
                running_avg_loss = calc_running_avg_loss(loss.item(), running_avg_loss,
                                                         summary_writer, iter, 'Train')
                iter += 1
                if iter % self.args.flush == 0:
                    summary_writer.flush()
            self.save_model(running_avg_loss, iter, model_dir)
            self.evaluate(self.eval_dataset, epoch)
            self.evaluate(self.test_dataset, epoch, True)
class WeightedHolE(nn.Module):
    def __init__(self, *args, **kwargs):
        super(WeightedHolE, self).__init__()
        # self.add_hyperparam('rparam', kwargs.pop('rparam', 0.0))
        self.learning_rate = kwargs.get('lr', _DEF_LEARNING_RATE)
        entity_dim, _, relation_dim = args[0]
        embed_dim = args[1]
        self._max_epochs = kwargs.get('max_epochs', _DEF_MAX_EPOCHS)

        init_relations = kwargs.get('init_relations')
        if init_relations is not None:
            self.R = nn.Parameter(init_relations)
        else:
            self.R = nn.Parameter(torch.FloatTensor(relation_dim, embed_dim).uniform_(-.1, .1))
        self.R.my_name = 'R'
        self.R.grad = torch.zeros_like(self.R)

        pretrained_ent = kwargs.get('pretrained_entities')
        if pretrained_ent is not None:
            self.E = nn.Parameter(pretrained_ent)
        else:
            self.E = nn.Parameter(torch.FloatTensor(entity_dim, embed_dim).uniform_(-.1, .1))
        self.E.my_name = 'E'
        self.E.grad = torch.zeros_like(self.E)

        self.loss_function = nn.SoftMarginLoss(reduction='sum')
        self.optim = Adagrad(list(self.parameters()), lr=self.learning_rate)

    def forward(self, xs, ys, minibatch_size):
        for loss, grads in self._optim(list(zip(xs, ys)), minibatch_size):
            yield loss, grads

    def _optim(self, xys, minibatch_size):
        for self._epoch in range(1, self._max_epochs + 1):
            self.loss = 0
            self.optim.zero_grad()
            self.train()
            # shuffle training examples
            indices = list(range(len(xys)))
            shuffle(indices)
            # store epoch start for callbacks
            self.epoch_start = timeit.default_timer()
            # process mini-batches
            lower_iter = count(0, minibatch_size)
            upper_iter = count(minibatch_size, minibatch_size)
            for lower, upper in zip(lower_iter, upper_iter):
                # select indices for the current batch
                if lower >= len(indices):
                    break
                batch_examples = [xys[idx] for idx in indices[lower:upper]]
                triples, ys = zip(*batch_examples)
                ss, ps, os = zip(*triples)  # subjects, predicates, objects (shadows the os module locally)
                ss, ps, os, ys = (torch.LongTensor(ss), torch.LongTensor(ps),
                                  torch.LongTensor(os), torch.FloatTensor(ys))
                yscores = self._scores(ss, ps, os)  # see Holographic Embeddings, eq. 2
                self.loss = self.loss_function(yscores, ys)
                print('loss', self.loss)
                fs = -(ys * torch.sigmoid(-yscores)).unsqueeze(1)
                entity_grad, entity_idxs = self._fn_Entity_Grad(yscores, ss, os, ps, fs)
                relation_grad, relation_idxs = self._fn_Relation_Grad(yscores, ss, os, ps, fs)
                # Write the analytically computed gradients into the parameters
                for param in self.parameters():
                    if param.my_name == 'R':
                        self.R.grad = relation_grad
                    if param.my_name == 'E':
                        for col, row_grads in zip(entity_idxs, entity_grad):
                            # FIXME use index_put_
                            self.E.grad[col] = row_grads
                self.optim.step()
                # The original left the per-batch yield commented out; restored so
                # forward() can actually iterate over (loss, grads) pairs.
                yield self.loss, (entity_grad, relation_grad)

    def _fn_Entity_Grad(self, yscores, ss, os, ps, fs):
        sparse_indices, Sm, n = grad_sum_matrix(torch.cat((ss, os)))
        combined = torch.cat((fs * ccorr(self.R[ps], self.E[os]),
                              fs * cconv(self.E[ss], self.R[ps])), dim=0)
        grads = torch.mm(Sm, combined) / n.unsqueeze(1)
        return grads, sparse_indices

    def _fn_Relation_Grad(self, yscores, ss, os, ps, fs):
        sparse_indices, Sm, n = grad_sum_matrix(ps)
        grads = torch.mm(Sm, fs * ccorr(self.E[ss], self.E[os])) / n
        return grads, sparse_indices

    def _scores(self, ss, ps, os):
        return torch.sum(self.R[ps] * ccorr(self.E[ss], self.E[os]), dim=1)

    def _update(self, g, idx=None):
        # Leftover manual (numpy-based) Adagrad update; unused when self.optim is used
        self.p2[idx] += g * g
        H = np.maximum(np.sqrt(self.p2[idx]), 1e-7)
        self.param[idx] -= self.learning_rate * g / H
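WeightedHolE assumes ccorr and cconv helpers: circular correlation and circular convolution from the HolE paper. A minimal FFT-based sketch of both (treating the last tensor dimension as the embedding dimension):

import torch

def cconv(a, b):
    # circular convolution via the convolution theorem
    n = a.shape[-1]
    return torch.fft.irfft(torch.fft.rfft(a) * torch.fft.rfft(b), n=n)

def ccorr(a, b):
    # circular correlation: conjugate one operand in the Fourier domain
    n = a.shape[-1]
    return torch.fft.irfft(torch.conj(torch.fft.rfft(a)) * torch.fft.rfft(b), n=n)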
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)
        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(params, lr=initial_lr,
                                 initial_accumulator_value=config.adagrad_init_acc)
        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            state = torch.load(model_file_path, map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()
        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)
        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()
        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1
            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 50
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 5000 == 0:
                self.save_model(running_avg_loss, iter)
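A typical entry point for this Train class might look as follows (config.max_iterations is the same constant the earlier logging counts against; any argument parsing is omitted as an assumption):

if __name__ == '__main__':
    trainer = Train()
    trainer.trainIters(config.max_iterations)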
class Trainer:
    def __init__(self, config):
        self.config = config
        self.step = 0
        self.vocab = Vocab(config.vocab_file, config.vocab_size)
        self.train_data = CNNDMDataset('train', config.data_path, config, self.vocab)
        self.validate_data = CNNDMDataset('val', config.data_path, config, self.vocab)
        self.setup(config)

    def setup(self, config):
        model = Model(config)
        checkpoint = None
        if config.train_from != '':
            logging('Train from %s' % config.train_from)
            checkpoint = torch.load(config.train_from, map_location='cpu')
            model.load_state_dict(checkpoint['model'])
            self.step = checkpoint['step']
        self.model = model.to(device)
        self.optimizer = Adagrad(model.parameters(), lr=config.learning_rate,
                                 initial_accumulator_value=config.initial_acc)
        if checkpoint is not None:
            self.optimizer.load_state_dict(checkpoint['optimizer'])

    def train_one(self, batch):
        config = self.config
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, config, device)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, device)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(max_dec_len):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)
        return loss

    def train(self):
        config = self.config
        train_loader = DataLoader(self.train_data, batch_size=config.batch_size,
                                  shuffle=True, collate_fn=Collate())
        running_avg_loss = 0
        self.model.train()
        for e in range(config.train_epoch):
            for batch in train_loader:
                self.step += 1
                self.optimizer.zero_grad()
                loss = self.train_one(batch)
                loss.backward()
                clip_grad_norm_(self.model.parameters(), config.max_grad_norm)
                self.optimizer.step()
                running_avg_loss = calc_running_avg_loss(loss.item(), running_avg_loss)
                if self.step % config.report_every == 0:
                    logging("Step %d Train loss %.3f" % (self.step, running_avg_loss))
                if self.step % config.validate_every == 0:
                    self.validate()
                if self.step % config.save_every == 0:
                    self.save(self.step)
                if self.step % config.test_every == 0:
                    pass

    @torch.no_grad()
    def validate(self):
        self.model.eval()
        validate_loader = DataLoader(self.validate_data, batch_size=self.config.batch_size,
                                     shuffle=False, collate_fn=Collate())
        losses = []
        for batch in validate_loader:
            loss = self.train_one(batch)
            losses.append(loss.item())
        self.model.train()
        ave_loss = sum(losses) / len(losses)
        logging('Validate loss : %f' % ave_loss)

    def save(self, step):
        state = {
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'step': step
        }
        save_path = os.path.join(self.config.model_path, 'model_s%d.pt' % step)
        logging('Saving model step %d to %s...' % (step, save_path))
        torch.save(state, save_path)
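Note that this Trainer calls logging(...) as a plain function, so it cannot be the stdlib logging module. A minimal stand-in consistent with that usage (purely an assumption about the missing helper):

from datetime import datetime

def logging(msg):
    # timestamped print, mimicking a logger's default format
    print('[{}] {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), msg))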
def train():
    target_field = Field(sequential=True, init_token=START_DECODING,
                         eos_token=STOP_DECODING, pad_token=PAD_TOKEN,
                         batch_first=True, include_lengths=True,
                         unk_token=UNKNOWN_TOKEN, lower=True)
    source_field = Field(sequential=True, init_token=SENTENCE_START,
                         eos_token=SENTENCE_END, pad_token=PAD_TOKEN,
                         batch_first=True, include_lengths=True,
                         unk_token=UNKNOWN_TOKEN, lower=True)

    train_path = '../data/incar_alexa/train_public.pickle'
    dev_path = '../data/incar_alexa/dev_public.pickle'
    test_path = '../data/incar_alexa/test_public.pickle'
    path = '../data/cnn_stories_tokenized'
    summary_writer = SummaryWriter(config.summary_path)

    train_src, train_tgt, train_id = load_data(train_path)
    dev_src, dev_tgt, dev_id = load_data(dev_path)
    test_src, test_tgt, test_id = load_data(test_path)
    # Alternative: build the splits from the tokenized CNN stories instead
    # train_data = prepare_data_cnn(path)
    # train_src = [dt['src'] for dt in train_data]
    # train_tgt = [dt['tgt'] for dt in train_data]
    # train_id = [dt['id'] for dt in train_data]
    # train_src, test_src, train_tgt, test_tgt = train_test_split(
    #     train_src, train_tgt, test_size=0.15, random_state=123)
    # train_id, test_id = train_test_split(train_id, test_size=0.15, random_state=123)
    # train_src, dev_src, train_tgt, dev_tgt = train_test_split(
    #     train_src, train_tgt, test_size=0.15, random_state=123)
    # train_id, dev_id = train_test_split(train_id, test_size=0.15, random_state=123)

    train_src_preprocessed = [source_field.preprocess(x) for x in train_src]
    dev_src_preprocessed = [source_field.preprocess(x) for x in dev_src]
    test_src_preprocessed = [source_field.preprocess(x) for x in test_src]

    train_tgt_preprocessed = [target_field.preprocess(x) for x in train_tgt]
    dev_tgt_preprocessed = [target_field.preprocess(x) for x in dev_tgt]
    test_tgt_preprocessed = [target_field.preprocess(x) for x in test_tgt]

    vectors = Vectors(name='/home/binhna/Downloads/shared_resources/cc.en.300.vec',
                      cache='/home/binhna/Downloads/shared_resources/')

    source_field.build_vocab([train_src_preprocessed, dev_src_preprocessed,
                              train_tgt_preprocessed, dev_tgt_preprocessed], vectors=vectors)
    target_field.build_vocab([train_src_preprocessed, dev_src_preprocessed,
                              train_tgt_preprocessed, dev_tgt_preprocessed], vectors=vectors)

    train_data = [{'src': src, 'tgt': tgt, 'id': id}
                  for src, tgt, id in zip(train_src, train_tgt, train_id)]
    train_data = Mydataset(data=train_data,
                           fields=(('source', source_field), ('target', target_field)))
    dev_data = [{'src': src, 'tgt': tgt, 'id': id}
                for src, tgt, id in zip(dev_src, dev_tgt, dev_id)]
    dev_data = Mydataset(data=dev_data,
                         fields=(('source', source_field), ('target', target_field)))
    test_data = [{'src': src, 'tgt': tgt, 'id': id}
                 for src, tgt, id in zip(test_src, test_tgt, test_id)]
    test_data = Mydataset(data=test_data,
                          fields=(('source', source_field), ('target', target_field)))

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    train_iter, test_iter, dev_iter = BucketIterator.splits(
        datasets=(train_data, test_data, dev_data),
        batch_sizes=(config.batch_size, config.batch_size, config.batch_size),
        device=device,
        sort_key=lambda x: len(x.source),
        sort_within_batch=True)

    args = ARGS()
    setattr(args, 'vectors', source_field.vocab.vectors)
    setattr(args, 'vocab_size', len(source_field.vocab.itos))
    setattr(args, 'emb_dim', vectors.dim)

    model = Model(args)
    params = list(model.encoder.parameters()) + list(model.decoder.parameters()) + \
        list(model.reduce_state.parameters())
    initial_lr = config.lr_coverage if config.is_coverage else config.lr
    optimizer = Adagrad(params, lr=initial_lr,
                        initial_accumulator_value=config.adagrad_init_acc)

    iter, running_avg_loss = 0, 0
    start = time.time()
    for epoch in range(500):
        print(f"Epoch: {epoch+1}")
        for i, batch in tqdm(enumerate(train_iter), total=len(train_iter)):
            batch_size = batch.batch_size
            optimizer.zero_grad()  # reset gradients each step

            # encoder part
            enc_padding_mask = get_mask(batch.source, device)
            enc_batch = batch.source[0]
            enc_lens = batch.source[1]
            encoder_outputs, encoder_feature, encoder_hidden = model.encoder(enc_batch, enc_lens)
            s_t_1 = model.reduce_state(encoder_hidden)
            coverage = Variable(torch.zeros(batch.source[0].size())).to(device)
            c_t_1 = Variable(torch.zeros((batch_size, 2 * config.hidden_dim))).to(device)
            extra_zeros, enc_batch_extend_vocab, max_art_oovs = get_extra_features(
                batch.source[0], source_field.vocab)
            extra_zeros = extra_zeros.to(device)
            enc_batch_extend_vocab = enc_batch_extend_vocab.to(device)

            # decoder part
            dec_batch = batch.target[0][:, :-1]
            target_batch = batch.target[0][:, 1:]  # shifted so the target is the next token
            dec_lens_var = batch.target[1]
            dec_padding_mask = get_mask(batch.target, device)
            max_dec_len = max(dec_lens_var)

            step_losses = []
            for di in range(min(max_dec_len, config.max_dec_steps) - 1):
                y_t_1 = dec_batch[:, di]  # Teacher forcing
                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = model.decoder(
                    y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                    c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
                target = target_batch[:, di]
                gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
                step_loss = -torch.log(gold_probs + config.eps)
                if config.is_coverage:
                    step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                    step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                    coverage = next_coverage
                step_mask = dec_padding_mask[:, di]
                step_loss = step_loss * step_mask
                step_losses.append(step_loss)

            sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
            batch_avg_loss = sum_losses / dec_lens_var
            loss = torch.mean(batch_avg_loss)
            loss.backward()

            norm = clip_grad_norm_(model.encoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.decoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.reduce_state.parameters(), config.max_grad_norm)
            optimizer.step()

            running_avg_loss = calc_running_avg_loss(loss.item(), running_avg_loss,
                                                     summary_writer, iter)
            iter += 1
            summary_writer.flush()
            if iter % 300 == 0:
                save_model(model, optimizer, running_avg_loss, iter, config.model_dir)
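train() above uses get_mask as a free function; presumably it mirrors the Trainer.get_mask method earlier in this section. A sketch under that assumption, returning a float mask so it can be multiplied into the step losses:

def get_mask(field_batch, device):
    # field_batch is the (tensor, lengths) pair torchtext produces
    # when a Field is built with include_lengths=True
    tokens, lengths = field_batch
    maxlen = tokens.size(1)
    positions = torch.arange(maxlen, device=device)
    return (positions[None, :] < lengths[:, None]).float()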
class Train(object):
    def __init__(self):
        if config.is_hierarchical:
            raise Exception("Hierarchical PGN-AMI not supported!")
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.pad_id = self.vocab.word2id(PAD_TOKEN)
        self.start_id = self.vocab.word2id(START_DECODING)
        self.stop_id = self.vocab.word2id(STOP_DECODING)
        self.print_interval = config.print_interval

        train_dir = config.train_dir
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)
        self.model_dir = train_dir
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir, 'iter{}.pt'.format(iter))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(params, lr=initial_lr,
                                 initial_accumulator_value=config.adagrad_init_acc)
        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            state = torch.load(model_file_path, map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()
        return start_iter, start_loss

    def train_one_batch(self, ami_data, idx):
        enc_pack, dec_pack = get_a_batch(ami_data, idx, self.vocab,
                                         config.batch_size, config.max_enc_steps,
                                         config.max_dec_steps, self.start_id,
                                         self.stop_id, self.pad_id,
                                         sum_type='short', use_cuda=use_cuda)
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = enc_pack
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = dec_pack

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state.forward1(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder.forward1(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()
        clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)
        self.optimizer.step()
        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        sys.stdout.flush()

        ami_data = load_ami_data('train')
        valid_data = load_ami_data('valid')
        # make the training data 100 (move a few validation meetings into train)
        random.shuffle(valid_data)
        ami_data.extend(valid_data[:6])
        valid_data = valid_data[6:]
        num_batches = len(ami_data)
        idx = 0

        # validation & early stopping
        best_valid_loss = 1000000000
        stop_counter = 0

        while iter < n_iters:
            if idx == 0:
                print("shuffle training data")
                random.shuffle(ami_data)

            loss = self.train_one_batch(ami_data, idx)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)
            iter += 1
            idx += config.batch_size
            if idx >= num_batches:  # >= guards against num_batches not dividing evenly
                idx = 0

            if iter % self.print_interval == 0:
                print("[{}] iter {}, loss: {:.5f}".format(str(datetime.now()), iter, loss))
                sys.stdout.flush()
            if iter % config.save_every == 0:
                self.save_model(running_avg_loss, iter)
            if iter % config.eval_every == 0:
                valid_loss = self.run_eval(valid_data)
                print("valid_loss = {:.5f}".format(valid_loss))
                if valid_loss < best_valid_loss:
                    stop_counter = 0
                    best_valid_loss = valid_loss
                    print("VALID better")
                else:
                    stop_counter += 1
                    print("VALID NOT better, counter = {}".format(stop_counter))
                    if stop_counter == config.stop_after:
                        print("Stop training")
                        return
        print("Finished training!")

    def eval_one_batch(self, eval_data, idx):
        enc_pack, dec_pack = get_a_batch(eval_data, idx, self.vocab, 1,
                                         config.max_enc_steps, config.max_dec_steps,
                                         self.start_id, self.stop_id, self.pad_id,
                                         sum_type='short', use_cuda=use_cuda)
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = enc_pack
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = dec_pack

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state.forward1(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder.forward1(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, dim=1, index=target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)
        return loss.data.item()

    def run_eval(self, eval_data):
        running_avg_loss, iter = 0, 0
        batch_losses = []
        num_batches = len(eval_data)
        print("valid data size = {}".format(num_batches))
        for idx in range(num_batches):
            loss = self.eval_one_batch(eval_data, idx)
            batch_losses.append(loss)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)
            print("#", end="")
            sys.stdout.flush()
        print()
        return sum(batch_losses) / len(batch_losses)
class Train(object):
    def __init__(self, opt):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)
        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)
        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
        self.opt = opt
        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        # Training setup
        if self.opt.load_model is not None:
            model_file_path = os.path.join(self.model_dir, self.opt.load_model)
        else:
            model_file_path = None
        self.model = Model(model_file_path)
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(params, lr=initial_lr,
                                 initial_accumulator_value=config.adagrad_init_acc)
        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            state = torch.load(model_file_path, map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()
        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        # --------------MLE training----------------------------------------------------
        if self.opt.train_mle == "yes":
            step_losses = []
            for di in range(min(max_dec_len, config.max_dec_steps)):
                y_t_1 = dec_batch[:, di]  # Teacher forcing
                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                    y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                    c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
                target = target_batch[:, di]
                gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
                step_loss = -torch.log(gold_probs + config.eps)
                if config.is_coverage:
                    step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                    step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                    coverage = next_coverage
                step_mask = dec_padding_mask[:, di]
                step_loss = step_loss * step_mask
                step_losses.append(step_loss)
            sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
            batch_avg_loss = sum_losses / dec_lens_var
            mle_loss = torch.mean(batch_avg_loss)
        else:
            mle_loss = get_cuda(torch.FloatTensor([0]))

        # --------------RL training-----------------------------------------------------
        if self.opt.train_rl == "yes":
            # multinomial sampling
            sample_sents, RL_log_probs = self.train_batch_RL(
                encoder_outputs, encoder_hidden, enc_padding_mask, encoder_feature,
                enc_batch_extend_vocab, extra_zeros, c_t_1, batch.art_oovs,
                coverage, greedy=False)
            with torch.autograd.no_grad():
                # greedy sampling
                greedy_sents, _ = self.train_batch_RL(
                    encoder_outputs, encoder_hidden, enc_padding_mask, encoder_feature,
                    enc_batch_extend_vocab, extra_zeros, c_t_1, batch.art_oovs,
                    coverage, greedy=True)
            sample_reward = self.reward_function(sample_sents, batch.original_abstracts)
            baseline_reward = self.reward_function(greedy_sents, batch.original_abstracts)
            # if iter % 200 == 0:
            #     self.write_to_file(sample_sents, greedy_sents, batch.original_abstracts,
            #                        sample_reward, baseline_reward, iter)
            # Self-critic policy gradient training (eq. 15 in https://arxiv.org/pdf/1705.04304.pdf)
            rl_loss = -(sample_reward - baseline_reward) * RL_log_probs
            rl_loss = torch.mean(rl_loss)
            batch_reward = torch.mean(sample_reward).item()
        else:
            rl_loss = get_cuda(torch.FloatTensor([0]))
            batch_reward = 0

        (self.opt.mle_weight * mle_loss + self.opt.rl_weight * rl_loss).backward()
        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)
        self.optimizer.step()
        return mle_loss.item(), batch_reward

    def train_batch_RL(self, encoder_outputs, encoder_hidden, enc_padding_mask,
                       encoder_feature, enc_batch_extend_vocab, extra_zeros, c_t_1,
                       article_oovs, coverage, greedy):
        '''Generate sentences from the decoder entirely using sampled tokens as input.
        These sentences are used for ROUGE evaluation.

        Args
        :param encoder_outputs: outputs of the encoder for all time steps (batch_size, length_input_sequence, 2*hidden_size)
        :param encoder_hidden: tuple containing the final hidden and cell state of the encoder; shape of h & c: (batch_size, hidden_size)
        :param enc_padding_mask: mask for encoder input; tensor of size (batch_size, length_input_sequence) with 0 for pad tokens and 1 elsewhere
        :param c_t_1: encoder context vector for time_step=0 (eq. 5 in https://arxiv.org/pdf/1705.04304.pdf)
        :param extra_zeros: tensor used to extend the vocab distribution for the pointer mechanism
        :param enc_batch_extend_vocab: input batch that stores OOV ids
        :param article_oovs: batch containing the list of OOVs in each example
        :param greedy: if True, perform greedy sampling; otherwise multinomial sampling

        Returns
        :decoded_strs: list of decoded sentences
        :log_probs: log probabilities of sampled words
        '''
        s_t_1 = self.model.reduce_state(encoder_hidden)  # decoder hidden states
        y_t_1 = get_cuda(torch.LongTensor(len(encoder_outputs)).fill_(
            self.vocab.word2id(data.START_DECODING)))  # input to the decoder
        # Used for intra-temporal attention (section 2.1 in https://arxiv.org/pdf/1705.04304.pdf)
        inds = []  # stores sampled indices for each time step
        decoder_padding_mask = []  # stores padding masks of generated samples
        log_probs = []  # stores log probabilities of generated samples
        # 1 => [STOP] not yet encountered, 0 otherwise
        mask = get_cuda(torch.LongTensor(len(encoder_outputs)).fill_(1))

        for t in range(config.max_dec_steps):
            probs, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, t)
            if greedy is False:
                multi_dist = Categorical(probs)
                y_t_1 = multi_dist.sample()  # multinomial sampling from the distribution
                log_prob = multi_dist.log_prob(y_t_1)
                log_probs.append(log_prob)
            else:
                _, y_t_1 = torch.max(probs, dim=1)  # greedy sampling: take the most probable word
            y_t_1 = y_t_1.detach()
            inds.append(y_t_1)
            mask_t = get_cuda(torch.zeros(len(encoder_outputs)))  # padding mask for the current step
            mask_t[mask == 1] = 1  # if [STOP] has not been produced before this step, mask_t = 1
            # once [STOP] is produced, zero the running mask for that example
            mask[(mask == 1) & (y_t_1 == self.vocab.word2id(data.STOP_DECODING))] = 0
            decoder_padding_mask.append(mask_t)
            is_oov = (y_t_1 >= config.vocab_size).long()  # whether the sampled word is OOV
            y_t_1 = (1 - is_oov) * y_t_1 + is_oov * self.vocab.word2id(data.UNKNOWN_TOKEN)  # replace OOVs with [UNK]

        inds = torch.stack(inds, dim=1)
        decoder_padding_mask = torch.stack(decoder_padding_mask, dim=1)
        if greedy is False:
            # for multinomial sampling, compute the normalized log probability per sentence
            log_probs = torch.stack(log_probs, dim=1)
            log_probs = log_probs * decoder_padding_mask  # ignore positions past [STOP]
            lens = torch.sum(decoder_padding_mask, dim=1)  # length of each sampled sentence
            log_probs = torch.sum(log_probs, dim=1) / lens  # (bs,)

        decoded_strs = []
        for i in range(len(encoder_outputs)):
            id_list = inds[i].cpu().numpy()
            oovs = article_oovs[i]
            S = data.outputids2words(id_list, self.vocab, oovs)  # sentence for the sampled ids
            try:
                end_idx = S.index(data.STOP_DECODING)
                S = S[:end_idx]
            except ValueError:
                pass
            if len(S) < 2:
                # Sentences shorter than 2 words (e.g. ".") throw errors when
                # calculating ROUGE, so replace them with "xxx"
                S = ["xxx"]
            decoded_strs.append(" ".join(S))
        return decoded_strs, log_probs

    def reward_function(self, decoded_sents, original_sents):
        rouge = Rouge()
        try:
            scores = rouge.get_scores(decoded_sents, original_sents)
        except Exception:
            print("Rouge failed for multi sentence evaluation.. Finding exact pair")
            scores = []
            for i in range(len(decoded_sents)):
                try:
                    score = rouge.get_scores(decoded_sents[i], original_sents[i])
                except Exception:
                    print("Error occurred at:")
                    print("decoded_sents:", decoded_sents[i])
                    print("original_sents:", original_sents[i])
                    score = [{"rouge-1": {"p": 0.0}}]
                scores.append(score[0])
        rouge_l_p1 = [score["rouge-1"]["p"] for score in scores]
        rouge_l_p1 = get_cuda(torch.FloatTensor(rouge_l_p1))
        return rouge_l_p1

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            # train_one_batch returns (mle_loss, batch_reward); unpack it
            loss, reward = self.train_one_batch(batch)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1
            if iter % 50 == 0:
                self.summary_writer.flush()
            print_interval = 50
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 100 == 0:
                self.save_model(running_avg_loss, iter)
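The RL term above is self-critical policy gradient: the greedy decode serves as the baseline, so only samples that beat it are reinforced. A toy numeric check of the sign behaviour (all numbers invented for illustration):

import torch

sample_reward = torch.tensor([0.42, 0.17])    # ROUGE of sampled summaries
baseline_reward = torch.tensor([0.35, 0.25])  # ROUGE of greedy summaries
log_probs = torch.tensor([-12.3, -9.8])       # normalized sentence log-probs

rl_loss = -(sample_reward - baseline_reward) * log_probs
# The gradient w.r.t. each log-prob is -(advantage): sample 1 beat the
# baseline, so its log-prob is pushed up; sample 2 lost, so it is pushed down.
print(rl_loss.mean())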
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        # print("MODE MUST BE train")
        # time.sleep(15)
        self.print_interval = config.print_interval

        train_dir = config.train_dir
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)
        self.model_dir = train_dir
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
        # self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir, 'iter{}.pt'.format(iter))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(params, lr=initial_lr,
                                 initial_accumulator_value=config.adagrad_init_acc)
        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            state = torch.load(model_file_path, map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()
        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        if not config.is_hierarchical:
            encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
            s_t_1 = self.model.reduce_state.forward1(encoder_hidden)
        else:
            stop_id = self.vocab.word2id('.')
            pad_id = self.vocab.word2id('[PAD]')
            enc_sent_pos = get_sent_position(enc_batch, stop_id, pad_id)
            dec_sent_pos = get_sent_position(dec_batch, stop_id, pad_id)
            encoder_outputs, encoder_feature, encoder_hidden, sent_enc_outputs, sent_enc_feature, \
                sent_enc_hidden, sent_enc_padding_mask, sent_lens, seq_lens2 = \
                self.model.encoder(enc_batch, enc_lens, enc_sent_pos)
            s_t_1, sent_s_t_1 = self.model.reduce_state(encoder_hidden, sent_enc_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            if not config.is_hierarchical:
                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder.forward1(
                    y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                    c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            else:
                max_doc_len = enc_batch.size(1)
                final_dist, sent_s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                    y_t_1, sent_s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                    seq_lens2, sent_s_t_1, sent_enc_outputs, sent_enc_feature,
                    sent_enc_padding_mask, sent_lens, max_doc_len,
                    c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()
        clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)
        self.optimizer.step()
        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        sys.stdout.flush()
        # Alternative: iterate over pre-batched pickled data
        # data_path = "lib/data/batches_train.vocab50000.batch16.pk.bin"
        # with open(data_path, 'rb') as f:
        #     stored_batches = pickle.load(f, encoding="bytes")
        # print("loaded data: {}".format(data_path))
        # num_batches = len(stored_batches)
        while iter < n_iters:
            batch = self.batcher.next_batch()
            # batch_id = iter % num_batches
            # batch = stored_batches[batch_id]
            loss = self.train_one_batch(batch)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)
            iter += 1
            if iter % self.print_interval == 0:
                print("[{}] iter {}, loss: {:.5f}".format(str(datetime.now()), iter, loss))
                sys.stdout.flush()
            if iter % config.save_every == 0:
                self.save_model(running_avg_loss, iter)
        print("Finished training!")
class Train(object): def __init__(self): self.vocab = Vocab(config.vocab_path, config.vocab_size) self.batcher = Batcher(config.train_data_path, self.vocab, mode='train', batch_size=config.batch_size, single_pass=False) time.sleep(15) train_dir = os.path.join(config.ouput_root, 'train_%d' % (int(time.time()))) if not os.path.exists(train_dir): os.makedirs(train_dir) self.checkpoint_dir = os.path.join(train_dir, 'checkpoints') if not os.path.exists(self.checkpoint_dir): os.makedirs(self.checkpoint_dir) self.train_summary_writer = tf.summary.create_file_writer( os.path.join(train_dir, 'log', 'train')) self.eval_summary_writer = tf.summary.create_file_writer( os.path.join(train_dir, 'log', 'eval')) def save_model(self, model_path, running_avg_loss, iter): state = { 'iter': iter, 'encoder_state_dict': self.model.encoder.state_dict(), 'decoder_state_dict': self.model.decoder.state_dict(), 'reduce_state_dict': self.model.reduce_state.state_dict(), 'optimizer': self.optimizer.state_dict(), 'current_loss': running_avg_loss } torch.save(state, model_path) def setup_train(self, model_file_path=None): self.model = Model(device, model_file_path) params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \ list(self.model.reduce_state.parameters()) initial_lr = config.lr_coverage if config.is_coverage else config.lr self.optimizer = Adagrad( params, lr=initial_lr, initial_accumulator_value=config.adagrad_init_acc) start_iter, start_loss = 0, 0 if model_file_path is not None: state = torch.load(model_file_path, map_location=lambda storage, location: storage) start_iter = state['iter'] start_loss = state['current_loss'] if not config.is_coverage: self.optimizer.load_state_dict(state['optimizer']) for state in self.optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.to(device) return start_iter, start_loss def train_one_batch(self, batch, forcing_ratio=1): enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \ get_input_from_batch(batch, device) dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \ get_output_from_batch(batch, device) self.optimizer.zero_grad() encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder( enc_batch, enc_lens) s_t_1 = self.model.reduce_state(encoder_hidden) step_losses = [] y_t_1_hat = None for di in range(min(max_dec_len, config.max_dec_steps)): y_t_1 = dec_batch[:, di] # decide the next input if di == 0 or random.random() < forcing_ratio: x_t = y_t_1 # teacher forcing, use label from last time step as input else: # use embedding of UNK for all oov word y_t_1_hat[y_t_1_hat > self.vocab.size()] = self.vocab.word2id( UNKNOWN_TOKEN) x_t = y_t_1_hat.flatten( ) # use prediction from last time step as input final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder( x_t, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di) _, y_t_1_hat = final_dist.data.topk(1) target = target_batch[:, di].unsqueeze(1) step_loss = cal_NLLLoss(target, final_dist) if config.is_coverage: # if not using coverge, keep coverage=None step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1) step_loss = step_loss + config.cov_loss_wt * step_coverage_loss coverage = next_coverage step_mask = dec_padding_mask[:, di] # padding in target should not count into loss step_loss = step_loss * step_mask step_losses.append(step_loss) sum_losses = torch.sum(torch.stack(step_losses, 1), 1) 
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()
        return loss.item()

    def train(self, n_iters, init_model_path=None):
        iter, avg_loss = self.setup_train(init_model_path)
        start = time.time()
        cnt = 0
        best_model_path = None
        min_eval_loss = float('inf')

        while iter < n_iters:
            # Anneal the teacher-forcing ratio from s toward 0 over k iterations.
            s = config.forcing_ratio
            k = config.decay_to_0_iter
            x = iter
            near_zero = 0.0001
            if config.forcing_decay_type:
                if x >= config.decay_to_0_iter:
                    forcing_ratio = 0
                elif config.forcing_decay_type == 'linear':
                    forcing_ratio = s * (k - x) / k
                elif config.forcing_decay_type == 'exp':
                    p = pow(near_zero, 1 / k)
                    forcing_ratio = s * (p ** x)
                elif config.forcing_decay_type == 'sig':
                    r = math.log((1 / near_zero) - 1) / k
                    forcing_ratio = s / (1 + pow(math.e, r * (x - k / 2)))
                else:
                    raise ValueError('Unrecognized forcing_decay_type: ' +
                                     config.forcing_decay_type)
            else:
                forcing_ratio = config.forcing_ratio

            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch, forcing_ratio=forcing_ratio)
            model_path = os.path.join(self.checkpoint_dir, 'model_step_%d' % (iter + 1))
            avg_loss = calc_avg_loss(loss, avg_loss)

            if (iter + 1) % config.print_interval == 0:
                with self.train_summary_writer.as_default():
                    tf.summary.scalar(name='loss', data=loss, step=iter)
                    self.train_summary_writer.flush()
                logger.info('steps %d, took %.2f seconds, train avg loss: %f' %
                            (iter + 1, time.time() - start, avg_loss))
                start = time.time()

            if config.eval_interval is not None and (iter + 1) % config.eval_interval == 0:
                start = time.time()
                logger.info("Start Evaluation on model %s" % model_path)
                eval_processor = Evaluate(self.model, self.vocab)
                eval_loss = eval_processor.run_eval()
                logger.info("Evaluation finished, took %.2f seconds, eval loss: %f" %
                            (time.time() - start, eval_loss))
                with self.eval_summary_writer.as_default():
                    tf.summary.scalar(name='eval_loss', data=eval_loss, step=iter)
                    self.eval_summary_writer.flush()

                if eval_loss < min_eval_loss:
                    logger.info("This is the best model so far, saving it to disk.")
                    min_eval_loss = eval_loss
                    best_model_path = model_path
                    self.save_model(model_path, eval_loss, iter)
                    cnt = 0
                else:
                    cnt += 1
                    if cnt > config.patience:
                        logger.info(
                            "Eval loss hasn't dropped for %d straight evaluations, "
                            "early stopping.\nBest model: %s (eval loss: %f)" %
                            (config.patience, best_model_path, min_eval_loss))
                        break
                start = time.time()
            elif (iter + 1) % config.save_interval == 0:
                self.save_model(model_path, avg_loss, iter)

            iter += 1
        else:
            # The while/else branch runs only when the loop was not broken early.
            logger.info("Training finished, best model: %s, with eval loss %f" %
                        (best_model_path, min_eval_loss))
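# `cal_NLLLoss` above is not defined in this excerpt. A sketch that mirrors the
# inline -log(gold_probs + eps) computation used by the other trainers in this
# file; the eps default is an assumption:
import torch

def cal_NLLLoss(target, final_dist, eps=1e-12):
    # target: (B, 1) gold ids into the extended vocab; final_dist: (B, V_ext).
    gold_probs = torch.gather(final_dist, 1, target).squeeze(1)
    return -torch.log(gold_probs + eps)

# The three forcing-ratio decay schedules in train() are easier to compare as a
# pure function; a sketch with illustrative defaults (names not from the source):
import math

def forcing_ratio_at(x, s=1.0, k=10000, kind='linear', near_zero=1e-4):
    # Teacher-forcing ratio at step x, decaying from s to ~0 over k steps.
    if x >= k:
        return 0.0
    if kind == 'linear':
        return s * (k - x) / k
    if kind == 'exp':
        # Equivalent to s * p**x with p = near_zero**(1/k).
        return s * near_zero ** (x / k)
    if kind == 'sig':
        # Logistic decay centered at k/2; r sets the steepness.
        r = math.log(1 / near_zero - 1) / k
        return s / (1 + math.exp(r * (x - k / 2)))
    raise ValueError('Unrecognized decay kind: ' + kind)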
train_loss = 0
for _iter in tqdm(range(cfg.EPOCH_ITERS)):
    # Note: re-creating the iterator every step draws the first batch of a
    # freshly shuffled DataLoader, i.e. one random batch per step; create the
    # iterator once outside the loop to sweep the dataset instead.
    batch_iterator = iter(dataloader)

    # zero the gradient buffers
    optimizer.zero_grad()
    (gt, patch_2, patch_3) = next(batch_iterator)

    # Use CUDA if possible.
    if torch.cuda.device_count():
        gt = gt.cuda()
        patch_2 = patch_2.cuda()
        patch_3 = patch_3.cuda()

    softmax_scores = model.forward(patch_2=Variable(patch_2), patch_3=Variable(patch_3))
    loss = px3_loss(softmax_scores, gt, loss_weights)
    loss.backward()
    optimizer.step()
    train_loss += loss.item()  # loss.data[0] is deprecated since PyTorch 0.4

# `epoch` comes from an enclosing loop not shown in this excerpt.
print('Epoch: {} Loss: {}'.format(epoch, train_loss / cfg.EPOCH_ITERS / cfg.BATCH_SIZE))
torch.save(model.state_dict(), cfg.SAVE_PATH.format(epoch))
print('Done.')
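# `px3_loss` and `loss_weights` are not defined in this excerpt. Given that the
# call takes softmax scores, targets, and per-class weights, one plausible
# reading is a class-weighted negative log-likelihood; a sketch under that
# assumption (the body is guesswork, only the name and signature come from the
# call above):
import torch
import torch.nn.functional as F

def px3_loss(softmax_scores, gt, loss_weights):
    # Assumes softmax_scores holds probabilities of shape (N, C) and gt holds
    # integer class ids of shape (N,); both shapes are assumptions.
    log_probs = torch.log(softmax_scores.clamp_min(1e-12))
    return F.nll_loss(log_probs, gt, weight=loss_weights)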
optim = Adagrad(model.parameters(), lr=0.01, weight_decay=0.003)
batch_size = 1000
# Ceil division; the original `n_train // batch_size + 1` produces an empty
# trailing batch whenever n_train is divisible by batch_size.
n_batch = (n_train + batch_size - 1) // batch_size

for epoch in range(n_epoch):
    loss_t = 0
    for i in range(n_batch):
        # Slice out the current mini-batch.
        start = i * batch_size
        end = min(n_train, (i + 1) * batch_size)
        x_batch = x_train[start:end]
        y_batch = y_train[start:end]

        model.zero_grad()
        y_p = model(x_batch)
        loss = l1(y_p, y_batch)
        loss.backward()
        optim.step()
        loss_t += loss.item()

    # Evaluation passes need no gradients.
    with torch.no_grad():
        y_pred_v = model(x_valid)
        loss_v = l1(y_pred_v, y_valid)
        y_pred_a = model(x_all)
        loss_a = l1(y_pred_a, y_all)

    print('Epoch: {}, Loss: {}, Loss_valid: {}, Loss_all: {}'.format(
        epoch, loss_t / n_batch, loss_v.item(), loss_a.item()))
    print(y_valid.topk(10))
    print(y_pred_v.topk(10))
    print(y_all.topk(20))
    print(y_pred_a.topk(20))

print('\ntesting.........................B5')
with torch.no_grad():
    y_p_B5 = model(x_B5)
    loss_b5 = l1(y_p_B5, y_B5)
print(loss_b5)
print(y_p_B5)
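# The loop above assumes `model`, `l1`, and the data tensors already exist. A
# minimal sketch of that setup, assuming `l1` is mean absolute error; all names,
# shapes, and sizes here are illustrative stand-ins, not from the source:
import torch
import torch.nn as nn

n_train, n_epoch = 5000, 10
x_train = torch.randn(n_train, 16)       # hypothetical feature matrix
y_train = torch.randn(n_train, 1)        # hypothetical regression targets

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 1))
l1 = nn.L1Loss()  # assumption: `l1` is the L1 / mean-absolute-error criterion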
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)  # give the batcher's queue threads time to fill

        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir,
                                       'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + \
            list(self.model.decoder.parameters()) + \
            list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(params, lr=initial_lr,
                                 initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0

        # Resume from the saved state if training stopped earlier.
        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                # Move the restored optimizer state onto the GPU.
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, batch):
        # Unpack encoder and decoder tensors (ids, masks, lengths, extended vocab).
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, \
            extra_zeros, c_t_1, coverage = get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        # The reduced final encoder state initializes the decoder state at t=0.
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)  # Eq. (6)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)   # Eq. (13a)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss     # Eq. (13b)
                coverage = next_coverage
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()
        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        print("trainIters started, model_file_path:", model_file_path)
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        print("max iterations:", n_iters)
        print("starting from iter:", iter)

        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 100
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f, loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 500 == 0:
                self.save_model(running_avg_loss, iter)
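# The Eq. numbers in the comments above follow the pointer-generator paper
# (See et al., 2017), as numbered in this code:
#
#   loss_t    = -log P(w*_t)                              (Eq. 6: NLL of the gold token)
#   covloss_t = sum_i min(a_i^t, c_i^t)                   (Eq. 13a: coverage loss, with
#                attention a^t and coverage c^t = sum_{t' < t} a^{t'})
#   loss_t    = -log P(w*_t) + cov_loss_wt * covloss_t    (Eq. 13b: composite loss)
#
# The batch loss then averages the per-step losses over each sequence's true
# decoded length (dec_lens_var) before averaging over the batch.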
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--build_data", action="store_true",
                        help="Whether to build data.")
    parser.add_argument("--train_path", type=str, default="../data/tacred_train.json",
                        help="Path to train data.")
    parser.add_argument("--dev_path", type=str, default="../data/tacred_dev.json",
                        help="Path to dev data.")
    parser.add_argument("--test_path", type=str, default="../data/tacred_test.json",
                        help="Path to test data.")
    parser.add_argument("--explanation_data_path", type=str,
                        default="../data/tacred_explanations.json",
                        help="Path to explanation data.")
    parser.add_argument("--vocab_path", type=str,
                        default="../data/vocabs/vocab_glove.840B.300d_-1_0.6.p",
                        help="Path to vocab created in pre-training.")
    parser.add_argument("--match_batch_size", default=50, type=int,
                        help="Match batch size for train.")
    parser.add_argument("--unlabeled_batch_size", default=100, type=int,
                        help="Unlabeled batch size for train.")
    parser.add_argument("--eval_batch_size", default=50, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=0.1, type=float,
                        help="The initial learning rate for the optimizer.")
    parser.add_argument("--epochs", default=60, type=int,
                        help="Number of epochs for training.")
    parser.add_argument("--embeddings", type=str, default="glove.840B.300d",
                        help="Initial embeddings to use.")
    parser.add_argument("--seed", type=int, default=7698,
                        help="Random seed for initialization.")
    parser.add_argument("--emb_dim", type=int, default=300,
                        help="Embedding vector size.")
    parser.add_argument("--hidden_dim", type=int, default=100,
                        help="Hidden vector size of the LSTM "
                             "(effectively 2*hidden_dim, due to the BiLSTM).")
    parser.add_argument("--model_save_dir", type=str, default="",
                        help="Where to save the model.")
    parser.add_argument("--experiment_name", type=str,
                        help="What to save the model file as.")
    parser.add_argument("--load_clf_model", action="store_true",
                        help="Whether to load a trained classifier model.")
    parser.add_argument("--start_epoch", type=int, default=0, help="Start epoch.")
    parser.add_argument("--use_adagrad", action="store_true",
                        help="Use the Adagrad optimizer (otherwise SGD).")
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    random.seed(args.seed)

    lower_bound = -20.0
    dataset = "tacred"
    save_string = generate_save_string(dataset, args.embeddings)
    number_of_classes = len(TACRED_LABEL_MAP)
    none_label_id = TACRED_LABEL_MAP["no_relation"]
    set_re_dataset_ner_label_space(dataset)
    task = "re"

    if args.build_data:
        build_datasets_from_splits(args.train_path, args.dev_path, args.test_path,
                                   args.vocab_path, TACRED_LABEL_MAP,
                                   args.explanation_data_path, save_string,
                                   task=task, dataset=dataset)

    with open("../data/training_data/{}_data_{}.p".format("matched", save_string), "rb") as f:
        strict_match_data = pickle.load(f)

    with open(args.vocab_path, "rb") as f:
        vocab = pickle.load(f)

    dev_path = "../data/training_data/dev_data_{}.p".format(save_string)
    test_path = "../data/training_data/test_data_{}.p".format(save_string)
    pad_idx = vocab["<pad>"]

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    tacred_vocab = build_custom_vocab("tacred", len(vocab))
    custom_vocab_length = len(tacred_vocab)

    clf = BiLSTM_Att_Clf.BiLSTM_Att_Clf(vocab.vectors, pad_idx, args.emb_dim,
                                        args.hidden_dim, torch.cuda.is_available(),
                                        number_of_classes,
                                        custom_token_count=custom_vocab_length)
    del vocab

    epochs = args.epochs
    epoch_string = str(epochs)
    test_epoch_f1_scores = []
    dev_epoch_f1_scores = []
    best_test_f1_score = -1
    best_dev_f1_score = -1
    strict_loss_epoch = []
    if args.load_clf_model:
        clf.load_state_dict(torch.load(
            "../data/saved_models/Clf_{}.p".format(args.experiment_name)))
        print("loaded model")

        # Reload per-epoch scores from earlier runs so best-score tracking resumes.
        with open("../data/result_data/test_f1_per_epoch_Clf_{}.csv".format(
                args.experiment_name)) as f:
            reader = csv.reader(f)
            next(reader)  # skip header
            for row in reader:
                test_epoch_f1_scores.append(row)
                if float(row[-1]) > best_test_f1_score:
                    best_test_f1_score = float(row[-1])

        with open("../data/result_data/dev_f1_per_epoch_Clf_{}.csv".format(
                args.experiment_name)) as f:
            reader = csv.reader(f)
            next(reader)  # skip header
            for row in reader:
                dev_epoch_f1_scores.append(row)
                if float(row[-1]) > best_dev_f1_score:
                    best_dev_f1_score = float(row[-1])

        print("loaded past results")

    clf = clf.to(device)

    if args.use_adagrad:
        optimizer = Adagrad(clf.parameters(), lr=args.learning_rate)
    else:
        optimizer = SGD(clf.parameters(), lr=args.learning_rate)

    # Initial hidden/cell states for the BiLSTM (4 = 2 layers * 2 directions).
    h0 = torch.empty(4, args.match_batch_size, args.hidden_dim).to(device)
    c0 = torch.empty(4, args.match_batch_size, args.hidden_dim).to(device)
    nn.init.xavier_normal_(h0)
    nn.init.xavier_normal_(c0)

    # define loss functions
    strict_match_loss_function = nn.CrossEntropyLoss()

    for epoch in range(args.start_epoch, args.start_epoch + epochs):
        print('\n Epoch {:} / {:}'.format(epoch + 1, args.start_epoch + epochs))

        total_loss, strict_total_loss, soft_total_loss, sim_total_loss = 0, 0, 0, 0
        batch_count = 0
        clf.train()

        for step, batch in enumerate(tqdm(
                strict_match_data.as_batches(batch_size=args.match_batch_size,
                                             seed=epoch))):
            # prepping batch data
            strict_match_tokens, strict_match_lengths, strict_match_labels = batch
            strict_match_tokens = strict_match_tokens.to(device)
            strict_match_labels = strict_match_labels.to(device)

            # Added: without zeroing, gradients accumulate across batches.
            optimizer.zero_grad()
            strict_match_predictions = clf.forward(strict_match_tokens,
                                                   strict_match_lengths, h0, c0)
            strict_match_loss = strict_match_loss_function(strict_match_predictions,
                                                           strict_match_labels)
            strict_total_loss = strict_total_loss + strict_match_loss.item()
            batch_count += 1

            if batch_count % 50 == 0 and batch_count > 0:
                print((total_loss, strict_total_loss, soft_total_loss,
                       sim_total_loss, batch_count))

            strict_match_loss.backward()
            torch.nn.utils.clip_grad_norm_(clf.parameters(), 5.0)
            optimizer.step()

        # compute the training loss of the epoch
        train_avg_loss = total_loss / batch_count
        train_avg_strict_loss = strict_total_loss / batch_count
        train_avg_soft_loss = soft_total_loss / batch_count
        train_avg_sim_loss = sim_total_loss / batch_count

        print("Train Losses")
        loss_tuples = ("%.5f" % train_avg_loss, "%.5f" % train_avg_strict_loss,
                       "%.5f" % train_avg_soft_loss, "%.5f" % train_avg_sim_loss)
        print("Avg Train Total Loss: {}, Avg Train Strict Loss: {}, "
              "Avg Train Soft Loss: {}, Avg Train Sim Loss: {}".format(*loss_tuples))
        strict_loss_epoch.append(train_avg_strict_loss)

        train_path = "../data/training_data/{}_data_{}.p".format("matched", save_string)
        train_results = evaluate_next_clf(train_path, clf, strict_match_loss_function,
                                          number_of_classes,
                                          batch_size=args.eval_batch_size,
                                          none_label_id=none_label_id)
        avg_loss, avg_train_ent_f1_score, avg_train_val_f1_score, \
            total_train_class_probs, no_relation_thresholds = train_results

        print("Train Results")
        train_tuple = ("%.5f" % avg_loss, "%.5f" % avg_train_ent_f1_score,
                       "%.5f" % avg_train_val_f1_score, str(no_relation_thresholds))
        print("Avg Train Loss: {}, Avg Train Entropy F1 Score: {}, "
              "Avg Train Max Value F1 Score: {}, Thresholds: {}".format(*train_tuple))

        dev_results = evaluate_next_clf(dev_path, clf, strict_match_loss_function,
                                        number_of_classes,
                                        batch_size=args.eval_batch_size,
                                        none_label_id=none_label_id)
        avg_loss, avg_dev_ent_f1_score, avg_dev_val_f1_score, \
            total_dev_class_probs, no_relation_thresholds = dev_results

        print("Dev Results")
        dev_tuple = ("%.5f" % avg_loss, "%.5f" % avg_dev_ent_f1_score,
                     "%.5f" % avg_dev_val_f1_score, str(no_relation_thresholds))
        print("Avg Dev Loss: {}, Avg Dev Entropy F1 Score: {}, "
              "Avg Dev Max Value F1 Score: {}, Thresholds: {}".format(*dev_tuple))

        dev_epoch_f1_scores.append((avg_loss, avg_dev_ent_f1_score, avg_dev_val_f1_score,
                                    max(avg_dev_ent_f1_score, avg_dev_val_f1_score)))

        if max(avg_dev_ent_f1_score, avg_dev_val_f1_score) > best_dev_f1_score:
            best_dev_f1_score = max(avg_dev_ent_f1_score, avg_dev_val_f1_score)
            print("Updated Dev F1 Score")

        # Evaluate on test with the no_relation thresholds tuned on dev.
        test_results = evaluate_next_clf(test_path, clf, strict_match_loss_function,
                                         number_of_classes,
                                         no_relation_thresholds=no_relation_thresholds,
                                         batch_size=args.eval_batch_size,
                                         none_label_id=none_label_id)
        avg_loss, avg_test_ent_f1_score, avg_test_val_f1_score, \
            total_test_class_probs, _ = test_results

        print("Test Results")
        test_tuple = ("%.5f" % avg_loss, "%.5f" % avg_test_ent_f1_score,
                      "%.5f" % avg_test_val_f1_score, str(no_relation_thresholds))
        print("Avg Test Loss: {}, Avg Test Entropy F1 Score: {}, "
              "Avg Test Max Value F1 Score: {}, Thresholds: {}".format(*test_tuple))

        test_epoch_f1_scores.append((avg_loss, avg_test_ent_f1_score, avg_test_val_f1_score,
                                     max(avg_test_ent_f1_score, avg_test_val_f1_score)))

        if best_test_f1_score < max(avg_test_ent_f1_score, avg_test_val_f1_score):
            print("Saving Model")
            if len(args.model_save_dir) > 0:
                dir_name = args.model_save_dir
            else:
                dir_name = "../data/saved_models/"
            torch.save(clf.state_dict(),
                       "{}Clf_{}.p".format(dir_name, args.experiment_name))

            with open("../data/result_data/test_predictions_Clf_{}.csv".format(
                    args.experiment_name), "wb") as f:
                pickle.dump(total_test_class_probs, f)
            with open("../data/result_data/dev_predictions_Clf_{}.csv".format(
                    args.experiment_name), "wb") as f:
                pickle.dump(total_dev_class_probs, f)
            with open("../data/result_data/thresholds.p", "wb") as f:
                pickle.dump({"thresholds": no_relation_thresholds}, f)

            best_test_f1_score = max(avg_test_ent_f1_score, avg_test_val_f1_score)

        print("Best Test F1: {}".format("%.5f" % best_test_f1_score))
        print(test_epoch_f1_scores[-3:])

    with open("../data/result_data/train_strict_loss_per_epoch_Clf_{}.csv".format(
            args.experiment_name), "w") as f:
        writer = csv.writer(f)
        writer.writerow(['train_loss'])
        for row in strict_loss_epoch:
            writer.writerow([row])

    with open("../data/result_data/dev_f1_per_epoch_Clf_{}.csv".format(
            args.experiment_name), "w") as f:
        writer = csv.writer(f)
        # Four separate header columns (the original fused the first two into one string).
        writer.writerow(['avg_loss', 'entropy_f1_score', 'max_value_f1_score', 'max'])
        for row in dev_epoch_f1_scores:
            writer.writerow(row)

    with open("../data/result_data/test_f1_per_epoch_Clf_{}.csv".format(
            args.experiment_name), "w") as f:
        writer = csv.writer(f)
        writer.writerow(['avg_loss', 'entropy_f1_score', 'max_value_f1_score', 'max'])
        for row in test_epoch_f1_scores:
            writer.writerow(row)
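# `evaluate_next_clf` is not shown in this excerpt; the calls above imply it
# tunes a no_relation confidence threshold on dev and reuses it on test. A
# sketch of that thresholding idea with hypothetical names; this is not the
# source's implementation, which also computes an entropy-based variant and
# the resulting F1 scores:
import torch

def predict_with_threshold(class_probs, none_label_id, max_value_threshold):
    # class_probs: (N, C) per-example class probabilities. If the top
    # probability falls below the threshold, back off to no_relation.
    max_probs, preds = class_probs.max(dim=1)
    preds[max_probs < max_value_threshold] = none_label_id
    return preds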