def train_model(model, loss_fn, X_train, t_train, X_val, t_val,
                num_epochs=1000, learning_rate=0.1, batch_size=100,
                weight_decay=0.001, print_every=10):
    for ep in range(num_epochs):
        for X_batch, t_batch in util.get_batches(X_train, t_train, batch_size):
            # compute the activations
            act = model.compute_activations(X_batch)

            # do the gradient descent update
            dLdz = loss_fn.derivatives(act['z'], t_batch)
            param_derivs = model.cost_derivatives(X_batch, act, dLdz)
            model.gradient_descent_update(param_derivs, learning_rate)

            # apply weight decay
            model.apply_weight_decay(weight_decay, learning_rate)

        if ep % print_every == 0:
            # evaluate the training loss and error
            act = model.compute_activations(X_train)
            train_loss = loss_fn.value(act['z'], t_train).mean()
            y = model.get_predictions(act)
            train_err = np.mean(y != t_train)

            # evaluate the validation loss and error
            act = model.compute_activations(X_val)
            val_loss = loss_fn.value(act['z'], t_val).mean()
            y = model.get_predictions(act)
            val_err = np.mean(y != t_val)

            print('Epoch {}; train_loss={:1.5f}, train_err={:1.5f}, '
                  'val_loss={:1.5f}, val_err={:1.5f}'.format(
                      ep, train_loss, train_err, val_loss, val_err))

    return model
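# NOTE: the snippets in this section all rely on a project-local `get_batches`
# helper whose implementation is not shown. A minimal sketch of a mini-batch
# iterator matching the call `util.get_batches(X_train, t_train, batch_size)`
# above; the name, signature, and shuffle behaviour are assumptions, not the
# original implementation.
import numpy as np

def get_batches(X, t, batch_size, shuffle=True):
    """Yield (X_batch, t_batch) pairs of at most `batch_size` rows."""
    idx = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(idx)
    for start in range(0, len(idx), batch_size):
        sel = idx[start:start + batch_size]
        yield X[sel], t[sel]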
def fit(self, sess, saver, data, epochs):
    prev_loss = 999
    train_writer = tf.summary.FileWriter('script/log', sess.graph)
    x, y = self.preprocess_sequence_data(data)
    for epoch in range(epochs):
        print('Start epoch: {}'.format(epoch))
        train_loss = []
        # fresh zero state for the multi-layer RNN at the start of each epoch
        state = np.zeros((self.config.batch_size,
                          self.config.state_size * self.config.n_layers))
        for inputs_batch, labels_batch in util.get_batches(x, y, self.config.seq_len):
            loss, state, log_summary = self.train(sess, inputs_batch, labels_batch, state)
            train_writer.add_summary(log_summary)
            train_loss.append(loss)

        # average training loss over the epoch
        train_loss = sum(train_loss) / len(train_loss)
        print('Epoch: {0} Training loss {1:.4f}'.format(epoch, train_loss))

        # checkpoint whenever the training loss improves
        if train_loss < prev_loss:
            prev_loss = train_loss
            if not os.path.exists('script/rnn_check_points/{}'.format(self.config.country_code)):
                os.makedirs('script/rnn_check_points/{}'.format(self.config.country_code))
            saver.save(sess,
                       'script/rnn_check_points/{}/rnn'.format(self.config.country_code),
                       global_step=self.global_step)
            print('Best training loss. Parameters saved.')
train_op_ = optimizer_.apply_gradients(capped_gradients_, global_step=global_step_)

loss_list = []
reward_list = []
loss_list_critic = []
reward_valid_list = []
count = 0

with critic_sess.as_default():
    with critic_graph.as_default():
        critic_sess.run(tf.global_variables_initializer())

for epoch_i in range(epochs):
    for batch_i, (source_batch, target_batch, sources_lengths, targets_lengths) in enumerate(
            util.get_batches(train_source, train_target, batch_size,
                             source_vocab_to_int['<PAD>'],
                             target_vocab_to_int['<PAD>'])):
        if batch_i == 0:
            with train_sess.as_default():
                with train_graph.as_default():
                    rewards_all = 0
                    for batch_j, (valid_sources_batch, valid_targets_batch,
                                  valid_sources_lengths, valid_targets_lengths) in enumerate(
                            util.get_batches(valid_source, valid_target, batch_size,
                                             source_vocab_to_int['<PAD>'],
                                             target_vocab_to_int['<PAD>'])):
                        batch_valid_logits = train_sess.run(
                            inference_logits,
                            {input_data: valid_sources_batch,
                             source_sequence_length: valid_sources_lengths,
                             target_sequence_length: valid_targets_lengths,
                             keep_prob: 1.0})
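# NOTE: the seq2seq snippets above and below call a four-argument variant,
# util.get_batches(source, target, batch_size, source_pad_int, target_pad_int),
# that yields padded batches plus per-sequence lengths. A minimal sketch of
# such a generator; `pad_sentence_batch` and the exact padding policy (pad to
# the longest sequence in each batch) are assumptions, not the original code.
import numpy as np

def pad_sentence_batch(sentence_batch, pad_int):
    max_len = max(len(s) for s in sentence_batch)
    return [s + [pad_int] * (max_len - len(s)) for s in sentence_batch]

def get_batches(sources, targets, batch_size, source_pad_int, target_pad_int):
    for start in range(0, len(sources) // batch_size * batch_size, batch_size):
        sources_batch = sources[start:start + batch_size]
        targets_batch = targets[start:start + batch_size]
        pad_sources = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        pad_targets = np.array(pad_sentence_batch(targets_batch, target_pad_int))
        source_lengths = [len(s) for s in sources_batch]
        target_lengths = [len(t) for t in targets_batch]
        yield pad_sources, pad_targets, source_lengths, target_lengths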
chunk_vocab.add(u"<chunk_unk>")
print("chunk count:", chunk_vocab.size)

if args.mode == "baseline":
    lm = models.BaselineLanguageModel(model, args, vocab)
elif args.mode == "lattice":
    lm = models.LatticeLanguageModel(model, args, vocab, chunk_vocab)
elif args.mode == "memb":
    lm = models.MultiEmbLanguageModel(model, args, vocab)
else:
    raise Exception("unrecognized mode")

if args.load:
    model.populate(args.save)

if not args.evaluate and not args.debug:
    train_batches = util.get_batches(train_data, args.minibatch_size,
                                     args.max_batched_sentence_len)
    valid_batches = util.get_batches(valid_data, args.minibatch_size,
                                     args.max_batched_sentence_len)

    best_score = None
    args.update_num = 0
    train_accumulator = Accumulator(accs, disps)
    _start = time.time()
    for epoch_i in range(args.epochs):
        args.completed_epochs = epoch_i
        print("Epoch %d. Shuffling..." % epoch_i, end=' ')
        if epoch_i == 0:
            train_batches = util.shuffle_preserve_first(train_batches)
        else:
            random.shuffle(train_batches)
        print("done.")

        for i, batch in enumerate(train_batches):
def train(args):
    torch.manual_seed(args.seed)

    # Get data loader
    train_data, dev_data, word2id, id2word, char2id, new_args = data_loader(args)
    model = QAxl(new_args)
    if args.use_cuda:
        model = model.cuda()
    dev_batches = get_batches(dev_data, args.batch_size, evaluation=True)

    # Get optimizer and scheduler
    # (a list, not a filter object, so the parameters can be iterated again
    # for gradient clipping inside the training loop)
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adamax(parameters, lr=args.lrate)
    lrate = args.lrate

    if args.eval:
        model.load_state_dict(torch.load(args.model_dir))
        model.eval()
        model.SelfEvaluate(dev_batches,
                           args.data_dir + 'dev_eval.json',
                           answer_file=args.answer_file,
                           drop_file=args.data_dir + 'drop.json',
                           dev=args.data_dir + 'dev.json')
        exit()

    if args.load_model:
        model.load_state_dict(torch.load(args.model_dir))

    best_score = 0.0

    ## Training
    for epoch in range(1, args.epochs + 1):
        train_batches = get_batches(train_data, args.batch_size)
        dev_batches = get_batches(dev_data, args.batch_size, evaluation=True)
        model.train()
        for i, train_batch in enumerate(train_batches):
            loss = model(train_batch)
            model.zero_grad()
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(parameters, new_args['grad_clipping'])
            optimizer.step()
            model.reset_parameters()
            if i % 100 == 0:
                print('epoch = %d, loss = %.5f, step = %d, lrate = %.5f best_score = %.3f'
                      % (epoch, model.train_loss.value, i, lrate, best_score))
                sys.stdout.flush()

        model.eval()
        exact_match_score, F1 = model.SelfEvaluate(
            dev_batches,
            args.data_dir + 'dev_eval.json',
            answer_file=args.answer_file,
            drop_file=args.data_dir + 'drop.json',
            dev=args.data_dir + 'dev-v2.0.json')
        if best_score < F1:
            best_score = F1
            print('saving %s ...' % args.model_dir)
            torch.save(model.state_dict(), args.model_dir)

        if epoch > 0 and epoch % args.decay_period == 0:
            lrate *= args.decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lrate
def train(epochs=20, clip=5, val_frac=0.1, print_every=100):
    global data
    net.train()

    # create training and validation data
    val_idx = int(len(data) * (1 - val_frac))
    data, val_data = data[:val_idx], data[val_idx:]

    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)

        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1

            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            inputs, targets = inputs.to(device), targets.to(device)

            # detach the hidden state so we don't backprop through the entire history
            h = tuple([each.data for each in h])

            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)

            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size * seq_length).long())
            loss.backward()
            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)

                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])

                    inputs, targets = x, y
                    if on_gpu():
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size * seq_length).long())
                    val_losses.append(val_loss.item())

                net.train()  # reset to train mode after iterating through validation data

                print("Epoch: {}/{}...".format(e + 1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))
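# NOTE: `one_hot_encode` in the char-RNN loop above is a project-local helper,
# not part of PyTorch. A minimal sketch, assuming it maps an integer array of
# shape (batch, seq_len) to a float32 array of shape (batch, seq_len, n_labels):
import numpy as np

def one_hot_encode(arr, n_labels):
    # flat one-hot matrix, then reshape back to the input's shape plus a label axis
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    one_hot[np.arange(arr.size), arr.flatten()] = 1.0
    return one_hot.reshape((*arr.shape, n_labels))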
train_op_ = optimizer_.apply_gradients(capped_gradients_, global_step=global_step_)

with critic_sess.as_default():
    with critic_sess.graph.as_default():
        critic_sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        tf.add_to_collection('train_op_critic', train_op_)
        tf.add_to_collection('loss_critic', l_)

        loss_list = []
        count = 0
        for batch_i, (source_batch, target_batch, sources_lengths, targets_lengths) in enumerate(
                util.get_batches(source_train, target_train, batch_size,
                                 source_vocab_to_int['<PAD>'],
                                 target_vocab_to_int['<PAD>'])):
            count += 1
            with train_sess.as_default():
                with train_sess.graph.as_default():
                    translate_logits = train_sess.run(
                        logits,
                        {input_data: source_batch,
                         target_sequence_length: targets_lengths,
                         source_sequence_length: sources_lengths,
                         keep_prob: 1.0})

            lens = [[translate_logits.shape[1]] * batch_size]
            lens = np.squeeze(lens)