def __init__(self, train, test, dir):
    self.model = dy.Model()
    self.trainer = dy.AdamTrainer(self.model)
    self.pW = self.model.add_parameters((args.m, args.s))
    self.pU = self.model.add_parameters((vocab.size(), args.m))
    self.trainData = train
    self.testData = test
    self.dir = dir
    # start from a clean output directory
    if os.path.exists(self.dir):
        shutil.rmtree(self.dir)
def train(self, epochs, trainer, lr, no_decay, patience, end_patience):
    if trainer == "sgd":
        trainer = dy.MomentumSGDTrainer(self.model, learning_rate=lr)
        trainer.set_clip_threshold(5.0)
    else:
        trainer = dy.AdamTrainer(self.model)

    best_acc = 0
    print(len(self.training_data))
    # evaluate roughly five times per epoch; guard against a zero interval on small datasets
    check_val = max(1, int(len(self.training_data) / (5.0 * self.batch_size)))
    best_ep = -1

    for ep in range(epochs):
        logging.info("Epoch: %d" % ep)
        ep_loss = 0
        num_batches = 0
        random.shuffle(self.training_data)

        for i in range(0, len(self.training_data), self.batch_size):
            if num_batches % check_val == 0:
                v_acc = self.get_accuracy(self.dev_data, print_out="dev.temp.")
                logging.info("Validation F1: %f" % v_acc)
                if v_acc > best_acc:
                    self.save_model()
                    best_acc = v_acc
                    logging.info("Saved!")
                    best_ep = ep

            cur_size = min(self.batch_size, len(self.training_data) - i)
            loss = self.calculate_loss(self.training_data[i:i + cur_size])
            ep_loss += loss.scalar_value()
            loss.backward()
            trainer.update()
            num_batches += 1

        logging.info("Training loss: %f" % ep_loss)

        # stop entirely once no improvement is seen for end_patience epochs
        if (ep - best_ep) > end_patience:
            self.model.populate(self.model_file)
            logging.info("Training patience reached.\n")
            break

        # otherwise reload the best weights and decay the learning rate
        if not no_decay and (ep - best_ep) > patience:
            self.model.populate(self.model_file)
            # best_ep = ep
            lr = trainer.learning_rate / 1.05
            trainer.learning_rate = lr
            logging.info("New learning rate: " + str(lr))

    logging.info("\n")
def train_network(self, train_data, epochs=3):
    trainer = dy.AdamTrainer(self.pc)
    i = 0
    mloss = 0.0
    goods = 0.0
    for e in range(epochs):
        shuffle(train_data)
        for x, y in train_data:
            i += 1
            loss = self.eval_loss(x, y)
            good = y == self.last_case_class()
            # print y, self.last_output_value(), np.argmax(self.last_output_value()), self.last_case_class()
            mloss += loss.value()
            goods += int(good)
            loss.backward()
            trainer.update()
        print("average loss: {} acc: {}".format(mloss / i, goods / i))
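# A minimal, self-contained sketch of the DyNet cycle that train_network and the
# other training loops in this file rely on: renew the computation graph, build
# the loss, run the forward pass, backpropagate, update. The toy regression
# model and data below are illustrative, not part of the original code.
# (Direct use of Parameters as expressions assumes DyNet >= 2.1; older versions
# need dy.parameter(W).)
import dynet as dy

pc = dy.ParameterCollection()
W = pc.add_parameters((1, 2))
b = pc.add_parameters((1,))
trainer = dy.AdamTrainer(pc)

data = [([0.0, 1.0], 1.0), ([1.0, 0.0], 0.0)]
for epoch in range(10):
    for x, y in data:
        dy.renew_cg()                      # fresh computation graph per example
        y_hat = W * dy.inputVector(x) + b  # build the graph
        loss = dy.squared_distance(y_hat, dy.inputVector([y]))
        loss.value()                       # forward pass
        loss.backward()                    # backward pass (gradients)
        trainer.update()                   # apply the parameter update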
def __init__(self, word_size, context_fre, context_size, vocab, window=2,
             subsample_n=2000, mode='bow', embed_size=200, batch_size=128,
             num_sampled=5, epoch=6):
    self.embed_size = embed_size
    self.mode = mode
    self.window = window
    self.vocab = vocab
    self.word_size = word_size
    self.subsample_n = subsample_n
    self.context_size = context_size
    self.num_sampled = num_sampled
    self.epoch = epoch
    self.context_fre = context_fre
    self.batch_size = batch_size

    self.pc = dy.ParameterCollection()
    self.optimizer = dy.AdamTrainer(self.pc)
    self.word_embeddings = self.pc.add_lookup_parameters(
        (self.word_size, self.embed_size), name="word-embeddings")
    self.context_embeddings = self.pc.add_lookup_parameters(
        (self.context_size, self.embed_size), name="context-embeddings")
    dy.renew_cg()
    print([(param.name(), param.shape())
           for param in self.pc.lookup_parameters_list() + self.pc.parameters_list()])
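# A sketch (not from the original class) of the skip-gram negative-sampling
# objective that paired word/context embedding tables like the ones above are
# commonly trained with, matching the num_sampled parameter. word_emb/ctx_emb
# are dy.LookupParameters; the function name and ids are illustrative.
import random
import dynet as dy

def neg_sampling_loss(word_emb, ctx_emb, w_id, pos_id, neg_pool, k=5):
    w = word_emb[w_id]                                   # center-word embedding
    # maximize similarity with the observed context...
    loss = -dy.log(dy.logistic(dy.dot_product(w, ctx_emb[pos_id])))
    # ...and minimize it with k sampled negative contexts
    for n_id in random.sample(neg_pool, k):
        loss += -dy.log(dy.logistic(-dy.dot_product(w, ctx_emb[n_id])))
    return loss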
def main():
    parser = argparse.ArgumentParser(
        description='Convolutional Neural Networks for Sentence Classification in DyNet')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--train_x_path', type=str, default='./data/train_x.txt',
                        help='File path of train x data [default: `./data/train_x.txt`]')
    parser.add_argument('--train_y_path', type=str, default='./data/train_y.txt',
                        help='File path of train y data [default: `./data/train_y.txt`]')
    parser.add_argument('--valid_x_path', type=str, default='./data/valid_x.txt',
                        help='File path of valid x data [default: `./data/valid_x.txt`]')
    parser.add_argument('--valid_y_path', type=str, default='./data/valid_y.txt',
                        help='File path of valid y data [default: `./data/valid_y.txt`]')
    parser.add_argument('--n_epochs', type=int, default=10,
                        help='Number of epochs [default: 10]')
    parser.add_argument('--batch_size', type=int, default=64,
                        help='Mini batch size [default: 64]')
    parser.add_argument('--win_sizes', type=int, nargs='*', default=[3, 4, 5],
                        help='Window sizes of filters [default: [3, 4, 5]]')
    parser.add_argument('--num_fil', type=int, default=100,
                        help='Number of filters in each window size [default: 100]')
    parser.add_argument('--s', type=float, default=3.0,
                        help='L2 norm constraint on w [default: 3.0]')
    parser.add_argument('--dropout_prob', type=float, default=0.5,
                        help='Dropout probability [default: 0.5]')
    parser.add_argument('--v_strategy', type=str, default='static',
                        help='Embedding strategy. '
                             'rand: Random initialization. '
                             'static: Load pretrained embeddings and do not update during the training. '
                             'non-static: Load pretrained embeddings and update during the training. '
                             '[default: static]')
    parser.add_argument('--alloc_mem', type=int, default=4096,
                        help='Amount of memory to allocate [mb] [default: 4096]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    N_EPOCHS = args.n_epochs
    WIN_SIZES = args.win_sizes
    BATCH_SIZE = args.batch_size
    EMB_DIM = 300
    OUT_DIM = 1
    L2_NORM_LIM = args.s
    NUM_FIL = args.num_fil
    DROPOUT_PROB = args.dropout_prob
    V_STRATEGY = args.v_strategy
    ALLOC_MEM = args.alloc_mem

    if V_STRATEGY in ['rand', 'static', 'non-static']:
        NUM_CHA = 1
    else:
        NUM_CHA = 2

    # File paths
    W2V_PATH = './GoogleNews-vectors-negative300.bin'
    TRAIN_X_PATH = args.train_x_path
    TRAIN_Y_PATH = args.train_y_path
    VALID_X_PATH = args.valid_x_path
    VALID_Y_PATH = args.valid_y_path

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Load pretrained embeddings
    pretrained_model = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)
    vocab = pretrained_model.wv.vocab.keys()
    w2v = pretrained_model.wv

    # Build dataset =======================================================================
    w2c = build_w2c(TRAIN_X_PATH, vocab=vocab)
    w2i, i2w = build_w2i(TRAIN_X_PATH, w2c, unk='unk')
    train_x, train_y = build_dataset(TRAIN_X_PATH, TRAIN_Y_PATH, w2i, unk='unk')
    valid_x, valid_y = build_dataset(VALID_X_PATH, VALID_Y_PATH, w2i, unk='unk')

    train_x, train_y = sort_data_by_length(train_x, train_y)
    valid_x, valid_y = sort_data_by_length(valid_x, valid_y)

    VOCAB_SIZE = len(w2i)
    print('VOCAB_SIZE:', VOCAB_SIZE)

    V_init = init_V(w2v, w2i)

    with open(os.path.join(RESULTS_DIR, './w2i.dump'), 'wb') as f_w2i, \
         open(os.path.join(RESULTS_DIR, './i2w.dump'), 'wb') as f_i2w:
        pickle.dump(w2i, f_w2i)
        pickle.dump(i2w, f_i2w)

    # Build model =========================================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    # V1
    V1 = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    if V_STRATEGY in ['static', 'non-static', 'multichannel']:
        V1.init_from_array(V_init)
    if V_STRATEGY in ['static', 'multichannel']:
        V1_UPDATE = False
    else:  # 'rand', 'non-static'
        V1_UPDATE = True
    make_emb_zero(V1, [w2i['<s>'], w2i['</s>']], EMB_DIM)

    # V2
    if V_STRATEGY == 'multichannel':
        V2 = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
        V2.init_from_array(V_init)
        V2_UPDATE = True
        make_emb_zero(V2, [w2i['<s>'], w2i['</s>']], EMB_DIM)

    layers = [
        CNNText(model, EMB_DIM, WIN_SIZES, NUM_CHA, NUM_FIL, dy.tanh, DROPOUT_PROB),
        Dense(model, 3 * NUM_FIL, OUT_DIM, dy.logistic)
    ]

    # Train model =========================================================================
    n_batches_train = math.ceil(len(train_x) / BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_x) / BATCH_SIZE)
    start_time = time.time()

    for epoch in range(N_EPOCHS):
        # Train
        loss_all_train = []
        pred_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters(layers)

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            x = build_batch(train_x[start:end], w2i, max(WIN_SIZES)).T
            t = np.array(train_y[start:end])
            sen_len = x.shape[0]

            if V_STRATEGY in ['rand', 'static', 'non-static']:
                x_embs = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs = dy.transpose(x_embs)
                x_embs = dy.reshape(x_embs, (sen_len, EMB_DIM, 1))
            else:  # multichannel
                x_embs1 = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs2 = dy.concatenate_cols(
                    [dy.lookup_batch(V2, x_t, update=V2_UPDATE) for x_t in x])
                x_embs1 = dy.transpose(x_embs1)
                x_embs2 = dy.transpose(x_embs2)
                x_embs = dy.concatenate([x_embs1, x_embs2], d=2)

            t = dy.inputTensor(t, batched=True)
            y = forwards(layers, x_embs, test=False)
            mb_loss = dy.mean_batches(dy.binary_log_loss(y, t))

            # Forward prop
            loss_all_train.append(mb_loss.value())
            pred_all_train.extend(list(binary_pred(y.npvalue().flatten())))

            # Backward prop
            mb_loss.backward()
            trainer.update()

            # L2 norm constraint
            layers[1].scale_W(L2_NORM_LIM)

            # Make padding embs zero
            if V_STRATEGY in ['rand', 'non-static']:
                make_emb_zero(V1, [w2i['<s>'], w2i['</s>']], EMB_DIM)
            elif V_STRATEGY in ['multichannel']:
                make_emb_zero(V2, [w2i['<s>'], w2i['</s>']], EMB_DIM)

        # Valid
        loss_all_valid = []
        pred_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters(layers)

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            x = build_batch(valid_x[start:end], w2i, max(WIN_SIZES)).T
            t = np.array(valid_y[start:end])
            sen_len = x.shape[0]

            if V_STRATEGY in ['rand', 'static', 'non-static']:
                x_embs = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs = dy.transpose(x_embs)
                x_embs = dy.reshape(x_embs, (sen_len, EMB_DIM, 1))
            else:  # multichannel
                x_embs1 = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs2 = dy.concatenate_cols(
                    [dy.lookup_batch(V2, x_t, update=V2_UPDATE) for x_t in x])
                x_embs1 = dy.transpose(x_embs1)
                x_embs2 = dy.transpose(x_embs2)
                x_embs = dy.concatenate([x_embs1, x_embs2], d=2)

            t = dy.inputTensor(t, batched=True)
            y = forwards(layers, x_embs, test=True)
            mb_loss = dy.mean_batches(dy.binary_log_loss(y, t))

            # Forward prop
            loss_all_valid.append(mb_loss.value())
            pred_all_valid.extend(list(binary_pred(y.npvalue().flatten())))

        print('EPOCH: %d, Train Loss: %.3f (F1: %.3f, Acc: %.3f), Valid Loss: %.3f (F1: %.3f, Acc: %.3f), Time: %.3f[s]' % (
            epoch + 1,
            np.mean(loss_all_train),
            f1_score(train_y, pred_all_train),
            accuracy_score(train_y, pred_all_train),
            np.mean(loss_all_valid),
            f1_score(valid_y, pred_all_valid),
            accuracy_score(valid_y, pred_all_valid),
            time.time() - start_time,
        ))

        # Save model ======================================================================
        if V_STRATEGY in ['rand', 'static', 'non-static']:
            dy.save(os.path.join(RESULTS_DIR, './model_e' + str(epoch + 1)), [V1] + layers)
        else:
            dy.save(os.path.join(RESULTS_DIR, './model_e' + str(epoch + 1)), [V1, V2] + layers)
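# A minimal sketch of the max-norm constraint that layers[1].scale_W(L2_NORM_LIM)
# applies above. Assumption: scale_W rescales the dense layer's weight matrix in
# place whenever its L2 norm exceeds the cap s (the constraint from Kim's CNN
# paper); this standalone helper and its name are illustrative, not the original.
import numpy as np

def scale_to_max_norm(param, s):
    """Rescale a dy.Parameters object in place so that ||W||_2 <= s."""
    W = param.as_array()
    norm = np.linalg.norm(W)
    if norm > s:
        param.set_value(W * (s / norm))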
def train(builder, model, model_parameters, X_train, y_train, nepochs, alpha=0.01, update=True,
          dropout=0.0, x_y_vectors=None, num_hidden_layers=0):
    """
    Train the LSTM
    :param builder: the LSTM builder
    :param model: LSTM RNN model
    :param model_parameters: the model parameters
    :param X_train: the lstm instances
    :param y_train: the lstm labels
    :param nepochs: number of epochs
    :param alpha: the learning rate (only for SGD)
    :param update: whether to update the lemma embeddings
    :param dropout: dropout probability for all component embeddings
    :param x_y_vectors: the word vectors of x and y
    :param num_hidden_layers: the number of hidden layers for the term-pair classification network
    """
    trainer = dy.AdamTrainer(model, alpha=alpha)
    minibatch_size = min(MINIBATCH_SIZE, len(y_train))
    # force float division so the last partial minibatch is not dropped
    nminibatches = int(math.ceil(len(y_train) * 1.0 / minibatch_size))
    previous_loss = 1000

    for epoch in range(nepochs):
        total_loss = 0.0
        epoch_indices = np.random.permutation(len(y_train))

        for minibatch in range(nminibatches):
            path_cache = {}
            batch_indices = epoch_indices[minibatch * minibatch_size:(minibatch + 1) * minibatch_size]

            dy.renew_cg()
            loss = dy.esum([
                -dy.log(dy.pick(
                    process_one_instance(
                        builder, model, model_parameters, X_train[batch_indices[i]],
                        path_cache, update, dropout,
                        x_y_vectors=x_y_vectors[batch_indices[i]] if x_y_vectors is not None else None,
                        num_hidden_layers=num_hidden_layers),
                    y_train[batch_indices[i]]))
                for i in range(len(batch_indices))
            ])
            total_loss += loss.value()  # forward computation
            loss.backward()
            trainer.update()

        # trainer.update_epoch() is deprecated and requires an argument (the epoch, presumably):
        # http://dynet.readthedocs.io/en/latest/python_ref.html#optimizers
        # trainer.update_epoch()

        total_loss /= len(y_train)
        print 'Epoch', (epoch + 1), '/', nepochs, 'Loss =', total_loss

        # Early stopping
        if math.fabs(previous_loss - total_loss) < LOSS_EPSILON:
            break
        previous_loss = total_loss
def main():
    parser = argparse.ArgumentParser(description='Selective Encoding for Abstractive Sentence Summarization in DyNet')
    parser.add_argument('--gpu', type=str, default='0', help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=3, help='Number of epochs [default: 3]')
    parser.add_argument('--n_train', type=int, default=3803957, help='Number of training data (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651, help='Number of validation data (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32, help='Mini batch size [default: 32]')
    parser.add_argument('--vocab_size', type=int, default=124404, help='Vocabulary size [default: 124404]')
    parser.add_argument('--emb_dim', type=int, default=256, help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256, help='Hidden state size [default: 256]')
    parser.add_argument('--maxout_dim', type=int, default=2, help='Maxout size [default: 2]')
    parser.add_argument('--alloc_mem', type=int, default=10000, help='Amount of memory to allocate [mb] [default: 10000]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = args.vocab_size
    EMB_DIM = args.emb_dim
    HID_DIM = args.hid_dim
    MAXOUT_DIM = args.maxout_dim
    ALLOC_MEM = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset
    dataset = Dataset(
        TRAIN_X_FILE,
        TRAIN_Y_FILE,
        VALID_X_FILE,
        VALID_Y_FILE,
        vocab_size=VOCAB_SIZE,
        batch_size=BATCH_SIZE,
        n_train=N_TRAIN,
        n_valid=N_VALID
    )
    VOCAB_SIZE = len(dataset.w2i)
    print('VOCAB_SIZE', VOCAB_SIZE)

    # Build model
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    V = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    encoder = SelectiveBiGRU(model, EMB_DIM, HID_DIM)
    decoder = AttentionalGRU(model, EMB_DIM, HID_DIM, MAXOUT_DIM, VOCAB_SIZE)

    # Train model
    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        loss_all_train = []
        dataset.reset_train_iter()
        for train_x_mb, train_y_mb in tqdm(dataset.train_iter):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters([encoder, decoder])

            losses = []
            for x, t in zip(train_x_mb, train_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                hp, hb_1 = encoder(x_embs)

                # Decoder
                decoder.set_initial_states(hp, hb_1)
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                y = decoder(t_embs)

                # Loss
                loss = dy.esum(
                    [dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)]
                )
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid
        loss_all_valid = []
        dataset.reset_valid_iter()
        for valid_x_mb, valid_y_mb in dataset.valid_iter:
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters([encoder, decoder])

            losses = []
            for x, t in zip(valid_x_mb, valid_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                hp, hb_1 = encoder(x_embs)

                # Decoder
                decoder.set_initial_states(hp, hb_1)
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                y = decoder(t_embs)

                # Loss
                loss = dy.esum(
                    [dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)]
                )
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f, Time: %.3f[s]' % (
            epoch + 1,
            np.mean(loss_all_train),
            np.mean(loss_all_valid),
            time.time() - start_time
        ))

        # Save model
        dy.save('./model_e' + str(epoch + 1), [V, encoder, decoder])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(dataset.w2i, f_w2i)
            pickle.dump(dataset.i2w, f_i2w)
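# Toy illustration of the teacher-forcing split used in both loops above: the
# decoder reads t_in and is trained to predict t_out, i.e. the same sequence
# shifted left by one position. The token ids below are made up.
t = [0, 5, 9, 2, 1]          # e.g. [<s>, w1, w2, w3, </s>]
t_in, t_out = t[:-1], t[1:]  # t_in = [<s>, w1, w2, w3]; t_out = [w1, w2, w3, </s>]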
def main():
    parser = argparse.ArgumentParser(description='A Neural Attention Model for Abstractive Sentence Summarization in DyNet')
    parser.add_argument('--gpu', type=str, default='0', help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=10, help='Number of epochs [default: 10]')
    parser.add_argument('--n_train', type=int, default=3803957, help='Number of training data (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651, help='Number of validation data (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32, help='Mini batch size [default: 32]')
    parser.add_argument('--vocab_size', type=int, default=60000, help='Vocabulary size [default: 60000]')
    parser.add_argument('--emb_dim', type=int, default=256, help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256, help='Hidden state size [default: 256]')
    parser.add_argument('--encoder_type', type=str, default='attention', help='Encoder type. bow: Bag-of-words encoder. attention: Attention-based encoder [default: attention]')
    parser.add_argument('--c', type=int, default=5, help='Window size in neural language model [default: 5]')
    parser.add_argument('--q', type=int, default=2, help='Window size in attention-based encoder [default: 2]')
    parser.add_argument('--alloc_mem', type=int, default=4096, help='Amount of memory to allocate [mb] [default: 4096]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = args.vocab_size
    EMB_DIM = args.emb_dim
    HID_DIM = args.hid_dim
    ENCODER_TYPE = args.encoder_type
    C = args.c
    Q = args.q
    ALLOC_MEM = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset ====================================================================================
    w2c = build_word2count(TRAIN_X_FILE, n_data=N_TRAIN)
    w2c = build_word2count(TRAIN_Y_FILE, w2c=w2c, n_data=N_TRAIN)

    train_X, w2i, i2w = build_dataset(TRAIN_X_FILE, w2c=w2c, padid=False, eos=True,
                                      unksym='<unk>', target=False, n_data=N_TRAIN,
                                      vocab_size=VOCAB_SIZE)
    train_y, _, _ = build_dataset(TRAIN_Y_FILE, w2i=w2i, target=True, n_data=N_TRAIN)
    valid_X, _, _ = build_dataset(VALID_X_FILE, w2i=w2i, target=False, n_data=N_VALID)
    valid_y, _, _ = build_dataset(VALID_Y_FILE, w2i=w2i, target=True, n_data=N_VALID)

    VOCAB_SIZE = len(w2i)
    OUT_DIM = VOCAB_SIZE
    print('VOCAB_SIZE:', VOCAB_SIZE)

    # Build model ======================================================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    rush_abs = ABS(model, EMB_DIM, HID_DIM, VOCAB_SIZE, Q, C, encoder_type=ENCODER_TYPE)

    # Padding
    train_y = [[w2i['<s>']] * (C - 1) + instance_y for instance_y in train_y]
    valid_y = [[w2i['<s>']] * (C - 1) + instance_y for instance_y in valid_y]

    n_batches_train = math.ceil(len(train_X) / BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_X) / BATCH_SIZE)
    start_time = time.time()

    for epoch in range(N_EPOCHS):
        # Train
        train_X, train_y = shuffle(train_X, train_y)
        loss_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            rush_abs.associate_parameters()

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            train_X_mb = train_X[start:end]
            train_y_mb = train_y[start:end]

            losses = []
            for x, t in zip(train_X_mb, train_y_mb):
                t_in, t_out = t[:-1], t[C:]

                y = rush_abs(x, t_in)
                loss = dy.esum([dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid
        loss_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            rush_abs.associate_parameters()

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            valid_X_mb = valid_X[start:end]
            valid_y_mb = valid_y[start:end]

            losses = []
            for x, t in zip(valid_X_mb, valid_y_mb):
                t_in, t_out = t[:-1], t[C:]

                y = rush_abs(x, t_in)
                loss = dy.esum([dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f' % (
            epoch + 1,
            np.mean(loss_all_train),
            np.mean(loss_all_valid)
        ))

        # Save model =======================================================================
        dy.save('./model_e' + str(epoch + 1), [rush_abs])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(w2i, f_w2i)
            pickle.dump(i2w, f_i2w)
def __init__(self, model, type, lrate, moment=None):
    # Map to constructors and call only the selected one; building the dict of
    # already-constructed trainers would needlessly register all three
    # optimizers (and their shadow parameters) with the model.
    self._tt = {
        "sgd": lambda: dy.SimpleSGDTrainer(model, lrate),
        "momentum": lambda: dy.MomentumSGDTrainer(model, lrate, moment),
        "adam": lambda: dy.AdamTrainer(model, lrate),
    }[type]()
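# The same dispatch as a standalone factory function, for reference -- a
# sketch, not part of the original class; the function name and the error
# handling for unknown types are illustrative.
import dynet as dy

def make_trainer(model, kind, lrate, moment=None):
    if kind == "sgd":
        return dy.SimpleSGDTrainer(model, lrate)
    if kind == "momentum":
        return dy.MomentumSGDTrainer(model, lrate, moment)
    if kind == "adam":
        return dy.AdamTrainer(model, lrate)
    raise ValueError("unknown trainer type: %s" % kind)

# e.g. trainer = make_trainer(dy.ParameterCollection(), "adam", 0.001)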
def main():
    print_config(opt)

    # Load the relations
    with codecs.open(args.dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = {relation: i for i, relation in enumerate(relations)}

    # Load the datasets
    if args.debug:
        trainname = '../datasets/wn-bo/train_sample.tsv'
        print 'Loading the dataset...', trainname, '*' * 10
        train_set = load_dataset(trainname, relations)
        val_set = load_dataset(trainname, relations)
        test_set = load_dataset(trainname, relations)
    else:
        trainname = '/' + args.trainname + '.tsv'
        valname = '/' + args.valname + '.tsv'
        testname = '/' + args.testname + '.tsv'
        print 'Loading the dataset...', trainname, '*' * 10
        train_set = load_dataset(args.dataset_prefix + trainname, relations)
        print 'Loading the dataset...', valname, '*' * 10
        val_set = load_dataset(args.dataset_prefix + valname, relations)
        print 'Loading the dataset...', testname, '*' * 10
        test_set = load_dataset(args.dataset_prefix + testname, relations)

    # y_train = [relation_index[label] for label in train_set.values()]
    # y_val = [relation_index[label] for label in val_set.values()]
    # y_test = [relation_index[label] for label in test_set.values()]
    dataset_keys = train_set.keys() + val_set.keys() + test_set.keys()

    # Add (x, root) pairs to dataset_keys
    vocab = set()
    for (x, y) in dataset_keys:
        vocab.add(x)
        vocab.add(y)
    dataset_keys += [(term, 'root007') for term in vocab]

    if not args.debug:
        trees = read_tree_file(
            "../datasets/wn-bo/wn-bo-trees-4-11-50-train533-lower.ptb",
            given_root=args.given_root_train, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_val = read_tree_file(
            "../datasets/wn-bo/wn-bo-trees-4-11-50-dev114-lower.ptb",
            given_root=args.given_root_test, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_test = read_tree_file(
            "../datasets/wn-bo/wn-bo-trees-4-11-50-test114-lower.ptb",
            given_root=args.given_root_test, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_semeval = read_edge_files(
            "../datasets/SemEval-2016/original/",
            given_root=True, filter_root=args.filter_root, allow_up=False)
    else:
        trees = read_tree_file(
            "../datasets/wn-bo/train_sample.ptb2",
            given_root=args.given_root_train, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_val = read_tree_file(
            "../datasets/wn-bo/train_sample.ptb2",
            given_root=args.given_root_train, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_test = read_tree_file(
            "../datasets/wn-bo/train_sample.ptb2",
            given_root=args.given_root_test, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_semeval = read_tree_file(
            "../datasets/wn-bo/train_sample.ptb2",
            given_root=args.given_root_test, filter_root=args.filter_root, allow_up=args.allow_up)

    # Load the resource (processed corpus)
    print 'Loading the corpus...', args.corpus_prefix, '*' * 10
    corpus = KnowledgeResource(args.corpus_prefix)

    preload_path = 'pickled_data/preload_data_{}_debug{}.pkl'.format(args.model_prefix_file, args.debug)
    if not os.path.exists(preload_path):
        print 'Loading the vocabulary...'
        # path_lemmas_name = "pickled_data/path_lemmas_3in1.pkl"
        # print 'reload path_lemmas from:', path_lemmas_name
        # path_lemmas = pickle.load(open(path_lemmas_name, 'rb'))
        path_lemmas, x_y_words, keys = get_vocabulary(corpus, dataset_keys, None)
        if not args.debug:
            pickle.dump(path_lemmas, open('pickled_data/path_lemmas_{}.pkl'.format(args.model_prefix_file), 'wb'))
            pickle.dump(x_y_words, open('pickled_data/x_y_words_{}.pkl'.format(args.model_prefix_file), 'wb'))

        # Load the word embeddings
        print 'Initializing word embeddings...'
        word_vectors, word_index, word_set = load_embeddings(args.embeddings_file, path_lemmas,
                                                             x_y_words, debug=args.debug)

        # Load the paths and create the feature vectors
        print 'Loading path files...'
        dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, \
            dir_inverted_index = load_paths_and_word_vectors(corpus, dataset_keys, word_index, keys)

        print 'saving pkl...'
        pickle.dump((word_vectors, word_index, word_set, dataset_instances, pos_index, dep_index,
                     dir_index, pos_inverted_index, dep_inverted_index, dir_inverted_index),
                    open(preload_path, 'wb'))
    else:
        print 'Data loaded from', preload_path, 'make sure pkl is correct'
        (word_vectors, word_index, word_set, dataset_instances, pos_index, dep_index,
         dir_index, pos_inverted_index, dep_inverted_index, dir_inverted_index) = \
            pickle.load(open(preload_path, 'rb'))

    print 'Number of words: %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' % \
        (len(word_index), len(pos_index), len(dep_index), len(dir_index))

    # dataset_instances is now (paths, x_y_vectors, features)
    X_train = dataset_instances[:len(train_set)]
    X_val = dataset_instances[len(train_set):len(train_set) + len(val_set)]
    X_test = dataset_instances[len(train_set) + len(val_set):]
    print len(X_train), len(X_val), len(X_test)

    # check_data(train_set, X_train, word_set)
    # check_data(val_set, X_val, word_set)
    # check_data(test_set, X_test, word_set)
    # save_path_info(dataset_keys, dataset_instances)
    # scores_save = []
    # scores_save_test = []
    # prob_save = []
    # prob_save_test = []

    policy = Policy(dataset_keys, dataset_instances, num_lemmas=len(word_index),
                    num_pos=len(pos_index), num_dep=len(dep_index), num_directions=len(dir_index),
                    opt=opt, num_relations=len(relations), lemma_embeddings=word_vectors)
    trainer = dy.AdamTrainer(policy.model, alpha=args.lr)

    n_epoch = 1000  # the original used the same value in both debug and full runs

    best = [0] * 6
    best_idx = [0] * 6
    best_val = [0] * 6
    best_val_idx = [0] * 6
    best_test = [0] * 6
    best_test_idx = [0] * 6
    best_semeval = [0] * 6
    best_semeval_idx = [0] * 6
    policy_save_test = defaultdict(list)
    wrong_total_l = []

    # check_limit(trees, policy, policy.unk_hard)
    # check_limit(trees, policy, policy.unk_soft)
    # check_limit(trees_test, policy, policy.unk_hard)
    # check_limit(trees_test, policy, policy.unk_soft)
    # exit(0)

    # TRAIN / TEST START HERE
    if args.load_model_file is None:
        for epoch in range(n_epoch):
            best, best_idx = train(epoch, trees, policy, trainer, best, best_idx, wrong_total_l)
            # policy_save_test, best_test, best_test_idx = test(epoch, trees_test, policy,
            #                                                   policy_save_test, best_test, best_test_idx)
            _, best_val, best_val_idx = test_single(epoch, trees_val, policy, [], best_val,
                                                    best_val_idx, wrong_total_l)
            policy_save_test, best_test, best_test_idx = test_single(epoch, trees_test, policy,
                                                                     policy_save_test, best_test,
                                                                     best_test_idx, wrong_total_l)
    else:
        load_candidate_from_pickle(trees_semeval)
        _, best_semeval, best_semeval_idx = test_single(0, trees_semeval, policy, [], best_semeval,
                                                        best_semeval_idx, wrong_total_l,
                                                        reward_type='print_each')
def main(args):
    import dynet as dy

    get_data = {"ag": lambda: ag_data_reader.get_dataset(args.num_NE),
                "dw": lambda: dw_data_reader.get_dataset(args.num_NE),
                "bl": lambda: blog_data_reader.get_dataset(),
                "tp_fr": lambda: trustpilot_data_reader.get_dataset("fr"),
                "tp_de": lambda: trustpilot_data_reader.get_dataset("de"),
                "tp_dk": lambda: trustpilot_data_reader.get_dataset("dk"),
                "tp_us": lambda: trustpilot_data_reader.get_dataset("us"),
                "tp_uk": lambda: trustpilot_data_reader.get_dataset("uk")}

    train, dev, test = get_data[args.dataset]()

    labels_main_task = set([ex.get_label() for ex in train])
    labels_main_task.add(0)
    assert sorted(labels_main_task) == list(range(len(labels_main_task)))
    labels_adve_task = get_aux_labels(train)

    print("Train size: {}".format(len(train)))
    print("Dev size: {}".format(len(dev)))
    print("Test size: {}".format(len(test)))

    print("Train data distribution")
    mfb_train = print_data_distributions(train)
    print("Dev data distribution")
    mfb_dev = print_data_distributions(dev)
    print("Test data distribution")
    mfb_test = print_data_distributions(test)

    results = {}

    model = dy.Model()

    # if args.use_demographics:
    symbols = ["<g={}>".format(i) for i in ["F", "M"]] + ["<a={}>".format(i) for i in ["U", "O"]]
    vocabulary = extract_vocabulary(train, add_symbols=symbols)

    bilstm = HierarchicalBiLSTM(args, vocabulary, model)
    input_size = bilstm.size()
    main_classifier = MLP(input_size, len(labels_main_task), args.hidden_layers,
                          args.dim_hidden, dy.rectify, model)
    trainer = dy.AdamTrainer(model)
    trainer.set_clip_threshold(5)
    args.learning_rate = trainer.learning_rate

    if args.subset:
        train = train[:args.subset]
        dev = dev[:args.subset]

    output_size = len(labels_adve_task)
    adversary_classifier = MLP_sigmoid(input_size, output_size, args.hidden_layers,
                                       args.dim_hidden, dy.rectify, model)

    discriminator = None
    if args.atraining:
        discriminator = Discriminator(input_size, output_size, args.hidden_layers,
                                      args.dim_hidden, dy.rectify, model, trainer)
    generator = None
    if args.generator:
        generator = Generator(args, vocabulary, model, trainer)

    # Add adversary classifier
    mod = PrModel(args, model, trainer, bilstm, main_classifier,
                  adversary_classifier, discriminator, generator, vocabulary)

    if args.baseline:
        _, ftest = mod.train_baseline(train, dev, test, args.iterations)
        print(ftest)
        return

    print("Train main task")
    results["000_main_dev_acc"] = mod.train_main(train, dev)

    targets_test = [ex.get_label() for ex in test]
    loss_test, acc_test, _ = mod.evaluate_main(test, targets_test)
    print("\t Test results : l={} acc={}".format(loss_test, acc_test))
    results["001_main_test_acc"] = acc_test

    # Adversary training / evaluate privacy
    train_hidden, dev_hidden, test_hidden = [mod.get_adversary_dataset(dataset)
                                             for dataset in [train, dev, test]]
    trainer.restart()

    print("Train adversary")
    results["002_adv_dev_F"] = mod.train_adversary(train_hidden, dev_hidden)

    targets_test = [ex.get_aux_labels() for ex in test]
    loss_test, acc_test, predictions_test = mod.evaluate_adversary(test_hidden)
    print("\t Adversary Test results : l={} acc={}".format(loss_test, acc_test))

    outsize = mod.adversary_classifier.output_size()
    Fscore = compute_eval_metrics(outsize, targets_test, predictions_test)
    print("\tF = {} ".format(Fscore))
    results["003_adv_test_fscore"] = Fscore[2]
    results["004_adv_test_precision"] = Fscore[0]
    results["005_adv_test_recall"] = Fscore[1]

    for i, acc in enumerate(Fscore[3]):
        results["{}_adv_test_acc_task_{}".format(str(i + 6).zfill(3), i)] = acc

    preds = [set(range(outsize)) for _ in targets_test]
    Fscore = compute_eval_metrics(outsize, targets_test, preds)
    baseline_str = [Fscore[2], Fscore[0], Fscore[1]] + [x if x > 50.0 else 100 - x for x in Fscore[3]]
    line = ["Baseline", "NA", "NA", "NA", "NA", "NA", "NA", "NA",
            str(round(mfb_train * 100, 2)), str(round(mfb_test * 100, 2)), "0"]
    print("\t".join(line) + "\t" + "\t".join(map(str, baseline_str)))

    for k in results:
        if type(results[k]) == float:
            results[k] = round(results[k], 2)

    results["#H"] = args.dim_hidden
    results["#h"] = args.hidden_layers
    results["#w"] = args.dim_word
    results["#W"] = args.dim_wrnn
    results["#Zatr"] = int(args.atraining)
    results["#Zptr"] = int(args.ptraining)
    results["#Zalpha"] = args.alpha

    keys = sorted(results)
    print("Model\t", end="")
    print("\t".join(keys))
    print("\t".join(map(str, [results[k] for k in keys])))
def main():
    parser = argparse.ArgumentParser(
        description='Deep Recurrent Generative Decoder for Abstractive Text Summarization in DyNet')
    parser.add_argument('--gpu', type=str, default='0', help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=3, help='Number of epochs [default: 3]')
    parser.add_argument('--n_train', type=int, default=3803957,
                        help='Number of training examples (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651,
                        help='Number of validation examples (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32, help='Mini batch size [default: 32]')
    parser.add_argument('--emb_dim', type=int, default=256, help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256, help='Hidden state size [default: 256]')
    parser.add_argument('--lat_dim', type=int, default=256, help='Latent size [default: 256]')
    parser.add_argument('--alloc_mem', type=int, default=8192,
                        help='Amount of memory to allocate [mb] [default: 8192]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = 60000
    EMB_DIM = args.emb_dim
    HID_DIM = args.hid_dim
    LAT_DIM = args.lat_dim
    ALLOC_MEM = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset ====================================================================================
    w2c = build_word2count(TRAIN_X_FILE, n_data=N_TRAIN)
    w2c = build_word2count(TRAIN_Y_FILE, w2c=w2c, n_data=N_TRAIN)

    train_X, w2i, i2w = build_dataset(TRAIN_X_FILE, w2c=w2c, padid=False, eos=True,
                                      unksym='<unk>', target=False, n_data=N_TRAIN,
                                      vocab_size=VOCAB_SIZE)
    train_y, _, _ = build_dataset(TRAIN_Y_FILE, w2i=w2i, target=True, n_data=N_TRAIN)
    valid_X, _, _ = build_dataset(VALID_X_FILE, w2i=w2i, target=False, n_data=N_VALID)
    valid_y, _, _ = build_dataset(VALID_Y_FILE, w2i=w2i, target=True, n_data=N_VALID)

    VOCAB_SIZE = len(w2i)
    OUT_DIM = VOCAB_SIZE
    print(VOCAB_SIZE)

    # Build model ======================================================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    V = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    encoder = BiGRU(model, EMB_DIM, 2 * HID_DIM)
    decoder = RecurrentGenerativeDecoder(model, EMB_DIM, 2 * HID_DIM, LAT_DIM, OUT_DIM)

    # Train model ======================================================================================
    n_batches_train = math.ceil(len(train_X) / BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_X) / BATCH_SIZE)
    start_time = time.time()

    for epoch in range(N_EPOCHS):
        # Train
        train_X, train_y = shuffle(train_X, train_y)
        loss_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            encoder.associate_parameters()
            decoder.associate_parameters()

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            train_X_mb = train_X[start:end]
            train_y_mb = train_y[start:end]

            losses = []
            for x, t in zip(train_X_mb, train_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                he = encoder(x_embs)

                # Decoder
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                decoder.set_initial_states(he)
                y, KL = decoder(t_embs)

                loss = dy.esum([
                    dy.pickneglogsoftmax(y_t, t_t) + KL_t
                    for y_t, t_t, KL_t in zip(y, t_out, KL)
                ])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid
        loss_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            encoder.associate_parameters()
            decoder.associate_parameters()

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            valid_X_mb = valid_X[start:end]
            valid_y_mb = valid_y[start:end]

            losses = []
            for x, t in zip(valid_X_mb, valid_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                he = encoder(x_embs)

                # Decoder
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                decoder.set_initial_states(he)
                y, KL = decoder(t_embs)

                loss = dy.esum([
                    dy.pickneglogsoftmax(y_t, t_t) + KL_t
                    for y_t, t_t, KL_t in zip(y, t_out, KL)
                ])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f' % (
            epoch + 1,
            np.mean(loss_all_train),
            np.mean(loss_all_valid)
        ))

        # Save model ===================================================================================
        dy.save('./model_e' + str(epoch + 1), [V, encoder, decoder])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(w2i, f_w2i)
            pickle.dump(i2w, f_i2w)
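# The per-step KL terms added to the cross-entropy above come from the
# decoder's variational layer. A minimal sketch of the usual closed-form KL
# between a diagonal Gaussian q = N(mu, sigma^2) and the standard normal prior,
# as DyNet expressions; this helper is illustrative, and mu / log_var are
# assumed to be expressions produced inside the decoder.
def gaussian_kl(mu, log_var):
    # KL(N(mu, sigma^2) || N(0, I)) = -0.5 * sum(1 + log sigma^2 - mu^2 - sigma^2)
    return -0.5 * dy.sum_elems(1 + log_var - dy.square(mu) - dy.exp(log_var))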
def train_model(model, encoder, decoder, params, train_inputs, train_outputs, dev_inputs,
                dev_outputs, y2int, int2y, epochs, optimization, results_file_path, plot,
                batch_size, eval_after, min_epochs):
    print 'training...'
    sys.stdout.flush()

    np.random.seed(17)
    random.seed(17)

    # sort training sentences by length in descending order
    train_data = zip(train_inputs, train_outputs)
    train_data.sort(key=lambda t: -len(t[0]))
    train_order = [x * batch_size for x in range(len(train_data) / batch_size + 1)]

    # sort dev sentences by length in descending order
    dev_batch_size = 1
    dev_data = zip(dev_inputs, dev_outputs)
    dev_data.sort(key=lambda t: -len(t[0]))
    dev_order = [x * dev_batch_size for x in range(len(dev_data) / dev_batch_size + 1)]

    if optimization == 'ADAM':
        # lam=REGULARIZATION, alpha=LEARNING_RATE, beta_1=0.9, beta_2=0.999, eps=1e-8
        trainer = dn.AdamTrainer(model)
    elif optimization == 'MOMENTUM':
        trainer = dn.MomentumSGDTrainer(model)
    elif optimization == 'SGD':
        trainer = dn.SimpleSGDTrainer(model)
    elif optimization == 'ADAGRAD':
        trainer = dn.AdagradTrainer(model)
    elif optimization == 'ADADELTA':
        trainer = dn.AdadeltaTrainer(model)
    else:
        trainer = dn.SimpleSGDTrainer(model)
    trainer.set_clip_threshold(float(arguments['--grad-clip']))

    seen_examples_count = 0
    total_loss = 0
    best_dev_epoch = 0
    best_train_epoch = 0
    patience = 0
    train_len = len(train_outputs)
    dev_len = len(dev_inputs)
    avg_train_loss = -1
    train_loss_patience = 0
    train_loss_patience_threshold = 99999999
    max_patience = int(arguments['--max-patience'])
    log_path = results_file_path + '_log.txt'
    start_epoch, checkpoints_x, train_loss_y, dev_loss_y, dev_accuracy_y = read_from_log(log_path)

    if len(train_loss_y) > 0:
        total_batches = checkpoints_x[-1]
        # when resuming, the best loss so far is the lowest one, not the highest
        best_avg_train_loss = min(train_loss_y)
        best_dev_loss = min(dev_loss_y)
        best_dev_accuracy = max(dev_accuracy_y)
    else:
        total_batches = 0
        best_avg_train_loss = 999999
        best_dev_loss = 999999
        best_dev_accuracy = 0

    # progress bar init
    # noinspection PyArgumentList
    # widgets = [progressbar.Bar('>'), ' ', progressbar.ETA()]
    # train_progress_bar = progressbar.ProgressBar(widgets=widgets, maxval=epochs).start()

    e = -1
    for e in xrange(start_epoch, epochs):
        try:
            # shuffle the batch start indices in each epoch
            random.shuffle(train_order)
            batches_per_epoch = len(train_order)
            start = time.time()

            # go through batches
            for i, batch_start_index in enumerate(train_order, start=1):
                # get batch examples
                batch_inputs = [x[0] for x in train_data[batch_start_index:batch_start_index + batch_size]]
                batch_outputs = [x[1] for x in train_data[batch_start_index:batch_start_index + batch_size]]
                actual_batch_size = len(batch_inputs)

                # skip empty batches
                if actual_batch_size == 0 or len(batch_inputs[0]) == 0:
                    continue

                # compute batch loss
                # debug prints for batch seq lengths
                # print 'batch {} seq lens'.format(i)
                # print [len(s) for s in batch_inputs]
                loss = compute_batch_loss(encoder, decoder, batch_inputs, batch_outputs, y2int)

                # forward pass
                total_loss += loss.scalar_value()
                loss.backward()
                total_batches += 1

                # update parameters
                trainer.update()

                seen_examples_count += actual_batch_size

                # avg loss per sample
                avg_train_loss = total_loss / float(i * batch_size + e * train_len)

                # start patience counts only after 20 batches
                if avg_train_loss < best_avg_train_loss and total_batches > 20:
                    best_avg_train_loss = avg_train_loss
                    train_loss_patience = 0
                else:
                    train_loss_patience += 1
                    if train_loss_patience > train_loss_patience_threshold:
                        print 'train loss patience exceeded: {}'.format(train_loss_patience)
                        sys.stdout.flush()
                        return model, params, e, best_dev_epoch

                if total_batches % 100 == 0 and total_batches > 0:
                    print 'epoch {}: {} batches out of {} ({} examples out of {}) total: {} batches, ' \
                          '{} examples. avg loss per example: {}'.format(
                              e, i, batches_per_epoch, i * batch_size, train_len,
                              total_batches, total_batches * batch_size, avg_train_loss)
                    sys.stdout.flush()

                    # print sentences per second
                    end = time.time()
                    elapsed_seconds = end - start
                    print '{} sentences per second'.format(seen_examples_count / elapsed_seconds)
                    sys.stdout.flush()
                    seen_examples_count = 0
                    start = time.time()

                # checkpoint
                if total_batches % eval_after == 0:
                    print 'starting checkpoint evaluation'
                    sys.stdout.flush()
                    dev_bleu, dev_loss = checkpoint_eval(
                        encoder, decoder, params, dev_batch_size, dev_data, dev_inputs, dev_len,
                        dev_order, dev_outputs, int2y, y2int, results_file_path=results_file_path)

                    log_to_file(log_path, e, total_batches, avg_train_loss, dev_loss, dev_bleu)
                    save_model(model, results_file_path, total_batches,
                               models_to_save=int(arguments['--models-to-save']))

                    if dev_bleu > best_dev_accuracy:
                        best_dev_accuracy = dev_bleu
                        best_dev_epoch = e

                        # save best model to disk
                        save_best_model(model, results_file_path)
                        print 'saved new best model'
                        sys.stdout.flush()
                        patience = 0
                    else:
                        patience += 1

                    if dev_loss < best_dev_loss:
                        best_dev_loss = dev_loss

                    print 'epoch: {0} train loss: {1:.4f} dev loss: {2:.4f} dev bleu: {3:.4f} ' \
                          'best dev bleu {4:.4f} (epoch {5}) patience = {6}'.format(
                              e, avg_train_loss, dev_loss, dev_bleu, best_dev_accuracy,
                              best_dev_epoch, patience)
                    sys.stdout.flush()

                    if (patience == max_patience) and (e >= min_epochs):
                        print 'out of patience after {0} checkpoints'.format(str(e))
                        sys.stdout.flush()
                        # train_progress_bar.finish()
                        if plot:
                            plt.cla()
                        print 'checkpoint patience exceeded'
                        sys.stdout.flush()
                        return model, params, e, best_dev_epoch

                    # plotting results from checkpoint evaluation
                    if plot:
                        train_loss_y.append(avg_train_loss)
                        checkpoints_x.append(total_batches)
                        dev_accuracy_y.append(dev_bleu)
                        dev_loss_y.append(dev_loss)
                        y_vals = [('train_loss', train_loss_y),
                                  ('dev loss', dev_loss_y),
                                  ('dev_bleu', dev_accuracy_y)]
                        common.plot_to_file(y_vals, x_name='total batches', x_vals=checkpoints_x,
                                            file_path=results_file_path + '_learning_curve.png')
        except RuntimeError as exception:
            # sometimes the above instructions fail due to memory allocation failure.
            # I was unable to find a fix for these failures; perhaps we can just skip them.
            print 'WARNING: Skipping epoch due to RuntimeError (' + str(exception) + ')'
            sys.stdout.flush()

        # update progress bar after completing epoch
        # train_progress_bar.update(e)

    # update progress bar after completing training
    # train_progress_bar.finish()
    if plot:
        # clear plot when done
        plt.cla()

    print 'finished training. average loss: {} best epoch on dev: {} best epoch on train: {}'.format(
        str(avg_train_loss), best_dev_epoch, best_train_epoch)
    sys.stdout.flush()
    return model, params, e, best_dev_epoch