def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100,
               learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [util.tensorsFromPair(random.choice(pairs),
                                           input_lang, output_lang)
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (util.timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100,
                                         print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)
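The `timeSince` progress helper (here reached via `util.timeSince`) is not shown in any of these snippets. A minimal sketch in the style of the PyTorch seq2seq tutorial, assuming the two-argument form used here, where `since` is a `time.time()` timestamp and `percent` is the fraction of work completed:

import math
import time

def asMinutes(s):
    # Format a duration in seconds as "Xm Ys".
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    # Elapsed time plus a linear estimate of the time remaining.
    now = time.time()
    s = now - since
    es = s / percent  # estimated total time
    rs = es - s       # estimated time remaining
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))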
def train(dataset : SequenceSequenceDataset, hidden_size : int,
          learning_rate : float, num_encoder_layers : int,
          num_decoder_layers : int, max_length : int, num_epochs : int,
          batch_size : int, print_every : int,
          context_vocab_size : int, tactic_vocab_size : int) \
          -> Iterable[Checkpoint]:
    print("Initializing PyTorch...")
    in_stream = [inputFromSentence(datum[0], max_length) for datum in dataset]
    out_stream = [inputFromSentence(datum[1], max_length) for datum in dataset]
    data_loader = data.DataLoader(data.TensorDataset(
        torch.LongTensor(out_stream),
        torch.LongTensor(in_stream)),
                                  batch_size=batch_size, num_workers=0,
                                  shuffle=True, pin_memory=True,
                                  drop_last=True)

    encoder = EncoderRNN(context_vocab_size, hidden_size, num_encoder_layers,
                         batch_size=batch_size)
    decoder = DecoderRNN(hidden_size, tactic_vocab_size, num_decoder_layers,
                         batch_size=batch_size)
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    optimizers = [encoder_optimizer, decoder_optimizer]
    criterion = maybe_cuda(nn.NLLLoss())

    start = time.time()
    num_items = len(dataset) * num_epochs
    total_loss = 0

    print("Training...")
    for epoch in range(num_epochs):
        print("Epoch {}".format(epoch))
        adjustLearningRates(learning_rate, optimizers, epoch)
        for batch_num, (output_batch, input_batch) in enumerate(data_loader):
            target_length = output_batch.size()[1]

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            predictor_output = decoder.run_teach(
                encoder.run(cast(SomeLongTensor, input_batch)),
                cast(SomeLongTensor, output_batch))
            # Accumulate the loss in a float tensor; NLLLoss returns floats,
            # so a LongTensor accumulator would truncate them.
            loss = maybe_cuda(Variable(torch.zeros(1)))
            output_var = maybe_cuda(Variable(output_batch))
            for i in range(target_length):
                loss += criterion(predictor_output[i], output_var[:, i])
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            total_loss += (loss.item() / target_length) * batch_size

            if (batch_num + 1) % print_every == 0:
                items_processed = (batch_num + 1) * batch_size \
                    + epoch * len(dataset)
                progress = items_processed / num_items
                print("{} ({} {:.2f}%) {:.4f}"
                      .format(timeSince(start, progress), items_processed,
                              progress * 100, total_loss / items_processed))
        yield encoder.state_dict(), decoder.state_dict()
def trainIters(epoch, pairs, lang, encoder, decoder, print_every=100,
               plot_every=200, learning_rate=0.001):
    start = time.time()
    print("Starting training")
    plot_losses = []
    num = len(pairs)

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epo in range(epoch):
        print("epoch: ", epo + 1)
        print_loss_total = 0  # Reset every print_every
        plot_loss_total = 0  # Reset every plot_every
        for iter in range(1, len(pairs) + 1):
            training_pair = variablesFromPair(pairs[iter - 1], lang)
            input_variable = training_pair[0]
            target_variable = training_pair[1]

            loss = train(input_variable, target_variable, encoder, decoder,
                         encoder_optimizer, decoder_optimizer, criterion)
            # .item() reads the scalar loss; unlike .numpy(), it also works
            # on CUDA tensors and tensors that still require grad, and
            # accumulating the float avoids keeping the graph alive.
            loss = loss.item()
            print("loss: ", loss)
            print_loss_total += loss
            plot_loss_total += loss

            if iter % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, iter / num),
                                             iter, iter / num * 100,
                                             print_loss_avg))
                evaluateRandomly(lang, encoder, decoder, 1)

            if iter % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

    showPlot(plot_losses)
def checkpoints(self, inputs : List[List[float]], outputs : List[int]) \
    -> Iterable[NeuralPredictorState]:
    print("Building tensors")
    dataloader = data.DataLoader(data.TensorDataset(
        torch.FloatTensor(inputs),
        torch.LongTensor(outputs)),
                                 batch_size=self.batch_size, num_workers=0,
                                 shuffle=True, pin_memory=True,
                                 drop_last=True)
    num_batches = int(len(inputs) / self.batch_size)
    dataset_size = num_batches * self.batch_size

    print("Initializing model...")
    training_start = time.time()

    # The range must include the final epoch; range(1, self.num_epochs)
    # would silently train one epoch too few.
    for epoch in range(1, self.num_epochs + 1):
        self.adjuster.step()
        print("Epoch {} (learning rate {:.6f})".format(
            epoch, self._optimizer.param_groups[0]['lr']))

        epoch_loss = 0.
        for batch_num, data_batch in enumerate(dataloader, start=1):
            self._optimizer.zero_grad()
            input_batch, output_batch = data_batch
            # with autograd.detect_anomaly():
            predictionDistribution = self._model(input_batch)
            output_var = maybe_cuda(Variable(output_batch))
            loss = self._criterion(predictionDistribution, output_var)
            loss.backward()
            self._optimizer.step()

            epoch_loss += loss.item()

            if batch_num % self.print_every == 0:
                items_processed = batch_num * self.batch_size + \
                    (epoch - 1) * dataset_size
                progress = items_processed / (dataset_size * self.num_epochs)
                print("{} ({:7} {:5.2f}%) {:.4f}".format(
                    timeSince(training_start, progress),
                    items_processed, progress * 100,
                    epoch_loss / batch_num))

        state = self._model.state_dict()
        loss = epoch_loss / num_batches
        checkpoint = NeuralPredictorState(epoch, loss, state)
        yield checkpoint
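The `self._optimizer` and `self.adjuster` attributes (a learning-rate scheduler stepped once per epoch) are wired up elsewhere. A hypothetical minimal sketch of that setup: the class name, constructor parameters, and choice of SGD are illustrative, not taken from the source; only the attribute names match the code above.

import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

class NeuralPredictor:
    def __init__(self, model, learning_rate, epoch_step, gamma,
                 batch_size, num_epochs, print_every):
        # Hypothetical wiring for the attributes used by checkpoints().
        self._model = model
        self._criterion = nn.NLLLoss()
        self._optimizer = optim.SGD(model.parameters(), lr=learning_rate)
        # Decay the learning rate by `gamma` every `epoch_step` scheduler
        # steps; checkpoints() steps the scheduler once per epoch.
        self.adjuster = lr_scheduler.StepLR(self._optimizer, epoch_step,
                                            gamma=gamma)
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.print_every = print_every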
def train_vectors(data_path, model_file):
    """
    Trains a doc2vec model from character sequences split into 3-letter
    words, then saves it to a file.
    """
    start = time.time()
    print("getting data")
    ids, sentences_ls = dp.get_paragraphs(data_path)
    print("got processed data")
    tagged_data = [
        TaggedDocument(words=dp.preprocess_str_hp(_d).split(), tags=[str(i)])
        for i, _d in sentences_ls
    ]
    print("made ", len(sentences_ls), " sentences")
    embedding = build_model(tagged_data)
    print("made model in ", timeSince(start))
    embedding.save(model_file)
    return ids, embedding
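`build_model` is not shown. A plausible minimal sketch using gensim's Doc2Vec; the `vector_size`, `min_count`, and `epochs` values here are illustrative defaults, not taken from the source:

from gensim.models.doc2vec import Doc2Vec

def build_model(tagged_data, vector_size=100, min_count=2, epochs=20):
    # Build the vocabulary from the tagged documents, then train.
    model = Doc2Vec(vector_size=vector_size, min_count=min_count,
                    epochs=epochs)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count,
                epochs=model.epochs)
    return model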
def trainIters(encoder, decoder, pairs, input_lang, output_lang, config):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(),
                                  lr=config.learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(),
                                  lr=config.learning_rate)
    training_pairs = [
        variablesFromPair(random.choice(pairs), input_lang, output_lang,
                          config)
        for i in range(config.n_iters)
    ]
    criterion = nn.NLLLoss()

    for iter in range(1, config.n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_variable = training_pair[0]
        target_variable = training_pair[1]

        loss = train(input_variable, target_variable, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion, config)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % config.print_every == 0:
            print_loss_avg = print_loss_total / config.print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' %
                  (timeSince(start, iter / config.n_iters), iter,
                   iter / config.n_iters * 100, print_loss_avg))

        if iter % config.plot_every == 0:
            plot_loss_avg = plot_loss_total / config.plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)
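Several of the trainIters variants above finish by calling `showPlot`, which is also not shown. A minimal sketch following the PyTorch seq2seq tutorial's version, assuming matplotlib is available:

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def showPlot(points):
    # Plot the running loss averages with y-axis ticks at regular intervals.
    plt.figure()
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    plt.show()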
# Generate a random episode
# refactor training line
# t = time.time()
train_loss = train_batched_step(samples, model)  # TODO
# print("optim time:", time.time() - t)
avg_train_loss += train_loss
counter += 1

if episode == 1 or episode % args.print_freq == 0 \
        or episode == model.num_pretrain_episodes:
    val_loss = eval_ll(val_states, model)
    print('{:s} ({:d} {:.0f}% finished) TrainLoss: {:.4f}, ValLoss: {:.4f}'
          .format(timeSince(start,
                            float(episode)
                            / float(model.num_pretrain_episodes)),
                  episode,
                  float(episode) / float(model.num_pretrain_episodes) * 100.,
                  avg_train_loss / counter, val_loss),
          flush=True)
    avg_train_loss = 0.
    counter = 0
    print('gen sample stats', batchtime.items())
    batchtime['max'] = 0
    batchtime['mean'] = 0
    batchtime['count'] = 0

if episode % args.save_freq == 0 \
        or episode == model.num_pretrain_episodes:
    model.save(path)

if episode % 10000 == 0 or episode == model.num_pretrain_episodes:
def supervised_q(args: argparse.Namespace) -> None:
    replay_memory = []
    with open(args.tmp_file, 'r') as f:
        for idx, line in enumerate(tqdm(f, desc="Loading data")):
            replay_memory.append(LabeledTransition.from_dict(
                json.loads(line)))

    if args.max_tuples is not None:
        replay_memory = replay_memory[-args.max_tuples:]

    # Load the predictor
    predictor = cast(
        features_polyarg_predictor.FeaturesPolyargPredictor,
        predict_tactic.loadPredictorByFile(args.predictor_weights))

    q_estimator: QEstimator
    # Create an initial Q Estimator
    if args.estimator == "polyarg":
        q_estimator = PolyargQEstimator(args.learning_rate,
                                        args.epoch_step,
                                        args.gamma,
                                        predictor)
    else:
        q_estimator = FeaturesQEstimator(args.learning_rate,
                                         args.epoch_step,
                                         args.gamma)
    if args.start_from:
        q_estimator_name, *saved = torch.load(args.start_from)
        if args.estimator == "polyarg":
            assert q_estimator_name == "polyarg evaluator", \
                q_estimator_name
        else:
            assert q_estimator_name == "features evaluator", \
                q_estimator_name
        q_estimator.load_saved_state(*saved)

    training_start = time.time()
    training_samples = assign_scores(args, q_estimator, predictor,
                                     replay_memory, progress=True)
    input_tensors = q_estimator.get_input_tensors(training_samples)
    rescore_lr = args.learning_rate

    for epoch in range(1, args.num_epochs + 1):
        scores = torch.FloatTensor(
            [score for _, _, _, score in training_samples])
        batches: Sequence[Sequence[torch.Tensor]] = data.DataLoader(
            data.TensorDataset(*(input_tensors + [scores])),
            batch_size=args.batch_size,
            num_workers=0, shuffle=True, pin_memory=True, drop_last=True)

        epoch_loss = 0.
        eprint("Epoch {}: Learning rate {:.12f}".format(
            epoch, q_estimator.optimizer.param_groups[0]['lr']),
            guard=args.show_loss)

        for idx, batch in enumerate(batches, start=1):
            q_estimator.optimizer.zero_grad()
            word_features_batch, vec_features_batch, \
                expected_outputs_batch = batch
            outputs = q_estimator.model(word_features_batch,
                                        vec_features_batch)
            loss = q_estimator.criterion(outputs,
                                         maybe_cuda(expected_outputs_batch))
            loss.backward()
            q_estimator.optimizer.step()
            q_estimator.total_batches += 1
            epoch_loss += loss.item()

            if idx % args.print_every == 0:
                items_processed = idx * args.batch_size + \
                    (epoch - 1) * len(replay_memory)
                progress = items_processed / \
                    (len(replay_memory) * args.num_epochs)
                eprint("{} ({:7} {:5.2f}%) {:.4f}".format(
                    timeSince(training_start, progress),
                    items_processed, progress * 100,
                    epoch_loss * (len(batches) / idx)),
                    guard=args.show_loss)
        q_estimator.adjuster.step()

        q_estimator.save_weights(args.out_weights, args)
        if epoch % args.score_every == 0 and epoch < args.num_epochs:
            training_samples = assign_scores(args, q_estimator,
                                             predictor,
                                             replay_memory,
                                             progress=True)
            rescore_lr *= args.rescore_gamma
            q_estimator.optimizer.param_groups[0]['lr'] = rescore_lr
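`eprint` with its `guard` keyword is a small logging helper defined elsewhere in this codebase. A plausible minimal sketch, assuming the guard simply gates printing to stderr:

import sys

def eprint(*args, **kwargs):
    # Print to stderr, but only when `guard` is true; this lets callers
    # gate verbose training output on a command-line switch.
    guard = kwargs.pop('guard', True)
    if guard:
        print(*args, file=sys.stderr, **kwargs)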
def train(self, triplets, n_iters, d_steps, d_optimizer, g_steps,
          g_optimizer, batch_size, max_len, criterion, word2index,
          index2word, embeddings_index, embeddings_size, print_every,
          plot_every, checkpoint_every, to_file=False, loss_f=None,
          sample_out_f=None, path_to_exp_out=None):
    # criterion is for both G and D

    # record start time for logging
    begin_time = time.time()

    print_d_loss_total = 0  # Reset every print_every
    plot_d_loss_total = 0  # Reset every plot_every
    print_g_loss_total = 0  # Reset every print_every
    plot_g_loss_total = 0  # Reset every plot_every
    plot_d_loss_avgs = []
    plot_g_loss_avgs = []

    for iter in range(1, n_iters + 1):

        # train D
        for d_train_idx in range(d_steps):
            # 1. Train D on real+fake
            self.D.zero_grad()

            # 1A: Train D on real
            # get data: prepare batch
            training_batch, seq_lens = get_random_batch(triplets, batch_size)
            # concat the context_ans batch with the question batch;
            # each element in the training batch is context + question + answer
            cqa_batch, _, cqa_lens = prepare_batch_var(
                training_batch, seq_lens, batch_size, word2index,
                embeddings_index, embeddings_size, mode=['word'],
                concat_opt='cqa')
            # embedding vectors, size = [seq len x batch size x embedding dim]
            train_input = Variable(cqa_batch[0].cuda()) if use_cuda \
                else Variable(cqa_batch[0])
            d_real_decision = self.D.forward(train_input, cqa_lens[0])
            real_target = Variable(torch.FloatTensor([1] * batch_size)).cuda() \
                if use_cuda else Variable(torch.FloatTensor([1] * batch_size))
            d_real_error = criterion(d_real_decision, real_target)  # ones = true
            # compute/store gradients, but don't change params
            d_real_error.backward()

            # 1B: Train D on fake
            fake_cqa_batch, fake_cqa_lens = prepare_fake_batch_var(
                self.G, training_batch, max_len, batch_size, word2index,
                index2word, embeddings_index, embeddings_size, mode=('word'))
            # # sanity check: prepare fake batch and prepare batch have the same order
            # print(fake_cqa_batch[0][12] == cqa_batch[0][12])
            d_fake_data = Variable(fake_cqa_batch[0].cuda()) if use_cuda \
                else Variable(fake_cqa_batch[0])
            d_fake_decision = self.D.forward(d_fake_data, fake_cqa_lens[0])
            fake_target = Variable(torch.FloatTensor([0] * batch_size)).cuda() \
                if use_cuda else Variable(torch.FloatTensor([0] * batch_size))
            # d_fake_error = criterion(d_fake_decision, fake_target)  # zeros = fake
            # d_fake_error.backward()
            # d_optimizer.step()

            # accumulate loss
            # FIXME I don't think the implementation below works for the batch version
            # WGAN loss:
            d_error = torch.mean(d_fake_decision) \
                - torch.mean(d_real_decision)
            # GAN loss:
            # d_error = -torch.mean(self.log(1 - d_fake_decision)) - torch.mean(self.log(d_real_decision))
            d_error.backward()
            d_optimizer.step()
            # d_error = d_real_error + d_fake_error

        # train G
        for g_train_idx in range(g_steps):
            self.G.zero_grad()

            # conditional data for generator
            training_batch, seq_lens = get_random_batch(triplets, batch_size)
            fake_cqa_batch, fake_cqa_lens = prepare_fake_batch_var(
                self.G, training_batch, max_len, batch_size, word2index,
                index2word, embeddings_index, embeddings_size, mode=('word'),
                detach=False)
            g_fake_data = Variable(fake_cqa_batch[0].cuda()) if use_cuda \
                else Variable(fake_cqa_batch[0])
            dg_fake_decision = self.D.forward(g_fake_data, fake_cqa_lens[0])
            target = Variable(torch.FloatTensor([1] * batch_size).cuda()) \
                if use_cuda else Variable(torch.FloatTensor([1] * batch_size))
            # g_error = criterion(dg_fake_decision, target)
            g_error = -torch.mean(dg_fake_decision)  # WGAN loss
            # g_error = -torch.mean(self.log(dg_fake_decision))  # GAN loss
            g_error.backward()
            g_optimizer.step()  # Only optimizes G's parameters

        # log error; .item() replaces the deprecated .data[0] scalar read
        print_d_loss_total += d_error.item()
        print_g_loss_total += g_error.item()
        plot_d_loss_total += d_error.item()
        plot_g_loss_total += g_error.item()

        if iter % print_every == 0:
            print_d_loss_avg = print_d_loss_total / print_every
            print_g_loss_avg = print_g_loss_total / print_every
            print_d_loss_total = 0
            print_g_loss_total = 0
            if not to_file:
                print('%s (%d %d%%)' %
                      (timeSince(begin_time, iter / float(n_iters)),
                       iter, iter / n_iters * 100))
                # print("errors: D: real-%s/fake-%s G: %s " % (d_real_error.item(), d_fake_error.item(), g_error.item()))
                print("errors: D: %s G: %s " %
                      (print_d_loss_avg, print_g_loss_avg))
                print('---sample generated question---')
                # sample a triple and print the generated question
                evaluate(self.G, triplets, embeddings_index, embeddings_size,
                         word2index, index2word, max_len)
            else:
                sample_out_f.write('%s (%d %d%%)\n' %
                                   (timeSince(begin_time,
                                              iter / float(n_iters)),
                                    iter,
                                    float(iter) / float(n_iters) * 100))
                evaluate(self.G, triplets, embeddings_index, embeddings_size,
                         word2index, index2word, max_len, to_file,
                         sample_out_f)
                sample_out_f.write('\n')

        if iter % plot_every == 0:
            plot_d_loss_avg = plot_d_loss_total / plot_every
            plot_d_loss_avgs.append(plot_d_loss_avg)
            plot_g_loss_avg = plot_g_loss_total / plot_every
            plot_g_loss_avgs.append(plot_g_loss_avg)
            plot_d_loss_total = 0
            plot_g_loss_total = 0
            if to_file:
                loss_f.write('%s (%d %d%%)\n' %
                             (timeSince(begin_time, iter / float(n_iters)),
                              iter, float(iter) / float(n_iters) * 100))
                loss_f.write("errors: D: %s G: %s " %
                             (print_d_loss_avg, print_g_loss_avg))
                loss_f.write('\n')

        if (iter % checkpoint_every == 0) or (iter == n_iters):
            checkpoint_fname = 'checkpoint_iter_' + str(iter) + '.pth.tar'
            state = {
                'iteration': iter + 1,
                'd_state_dict': self.D.state_dict(),
                'g_state_dict': self.G.state_dict(),
                'd_optimizer': d_optimizer.state_dict(),
                'g_optimizer': g_optimizer.state_dict(),
            }
            torch.save(state, path_to_exp_out + '/' + checkpoint_fname)
            plotLoss(plot_d_loss_avgs, plot_every,
                     save_path=path_to_exp_out,
                     f_name='d_loss_itr_' + str(iter) + '.png',
                     title='training loss D (monitoring purpose)',
                     from_file=False)
            plotLoss(plot_g_loss_avgs, plot_every,
                     save_path=path_to_exp_out,
                     f_name='g_loss_itr_' + str(iter) + '.png',
                     title='training loss G (monitoring purpose)',
                     from_file=False)
def train(dataset : List[Sentence], token_vocab_size : int,
          max_length : int, hidden_size : int, learning_rate : float,
          epoch_step : int, gamma : float, num_encoder_layers : int,
          num_decoder_layers : int, num_epochs : int, batch_size : int,
          print_every : int, optimizer_f : Callable[..., Optimizer]) \
          -> Iterable[Checkpoint]:
    curtime = time.time()
    print("Building pytorch dataset...", end="")
    sys.stdout.flush()
    # The autoencoder reconstructs its input, so the same tensor serves as
    # both input and target.
    data_loader = data.DataLoader(data.TensorDataset(
        torch.LongTensor(dataset[:]),
        torch.LongTensor(dataset[:])),
                                  batch_size=batch_size, num_workers=0,
                                  shuffle=True, pin_memory=True,
                                  drop_last=True)
    print(" {:.2f}s".format(time.time() - curtime))

    curtime = time.time()
    print("Initializing model...", end="")
    sys.stdout.flush()
    encoder = maybe_cuda(EncoderRNN(token_vocab_size, hidden_size,
                                    num_encoder_layers,
                                    batch_size=batch_size))
    decoder = maybe_cuda(DecoderRNN(hidden_size, token_vocab_size,
                                    num_decoder_layers,
                                    batch_size=batch_size))
    encoder_optimizer = optimizer_f(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optimizer_f(decoder.parameters(), lr=learning_rate)
    encoder_adjuster = scheduler.StepLR(encoder_optimizer, epoch_step, gamma)
    decoder_adjuster = scheduler.StepLR(decoder_optimizer, epoch_step, gamma)

    criterion = maybe_cuda(nn.NLLLoss())
    print(" {:.2f}s".format(time.time() - curtime))

    start = time.time()
    num_items = len(dataset) * num_epochs
    total_loss = 0

    print("Training...")
    for epoch in range(num_epochs):
        print("Epoch {}".format(epoch))
        # Adjust learning rates if needed
        encoder_adjuster.step()
        decoder_adjuster.step()

        # Process batches of data
        for batch_num, (input_batch, output_batch) in enumerate(data_loader):
            # Reset the optimizers
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            # Run the autoencoder
            decoded_output = decoder.run_teach(
                encoder.run(cast(torch.LongTensor, input_batch)),
                cast(torch.LongTensor, output_batch))

            # Gather the losses
            loss = maybe_cuda(Variable(torch.zeros(1, dtype=torch.float32)))
            output_var = maybe_cuda(Variable(output_batch))
            target_length = output_batch.size()[1]
            for i in range(target_length):
                loss += criterion(decoded_output[i], output_var[:, i])
            total_loss += (loss.data.item() / target_length) * batch_size
            # NaN check: NaN is the only value not equal to itself
            assert total_loss == total_loss
            assert isinstance(total_loss, float)

            # Update the weights
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            # Print status every once in a while
            if (batch_num + 1) % print_every == 0:
                items_processed = (batch_num + 1) * batch_size \
                    + epoch * len(dataset)
                progress = items_processed / num_items
                print("{} ({} {:.2f}%) {:.4f}".format(
                    timeSince(start, progress), items_processed,
                    progress * 100, total_loss / items_processed))

        yield Checkpoint(encoder_state=encoder.state_dict(),
                         decoder_state=decoder.state_dict(),
                         training_loss=total_loss)
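Both `train` variants above yield a stream of checkpoints rather than returning once, so the caller decides how often to persist them. A minimal consumption sketch, assuming the `Checkpoint` fields used above; the file-name scheme and save format are illustrative, and `dataset` and the hyperparameters are taken from the surrounding context:

import torch
import torch.optim as optim

# Hypothetical driver: save each epoch's checkpoint as it is yielded,
# tracking the best training loss seen so far.
best_loss = float('inf')
for epoch, checkpoint in enumerate(
        train(dataset, token_vocab_size, max_length, hidden_size,
              learning_rate, epoch_step, gamma, num_encoder_layers,
              num_decoder_layers, num_epochs, batch_size, print_every,
              optim.SGD)):
    torch.save({'encoder': checkpoint.encoder_state,
                'decoder': checkpoint.decoder_state,
                'loss': checkpoint.training_loss},
               "autoencoder-epoch{}.pt".format(epoch))
    if checkpoint.training_loss < best_loss:
        best_loss = checkpoint.training_loss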