Example #1
    def __init__(self, train, test, dir):
        self.model = dy.Model()
        self.trainer = dy.AdamTrainer(self.model)

        self.pW = self.model.add_parameters((args.m, args.s))
        self.pU = self.model.add_parameters((vocab.size(), args.m))

        self.training_data = train
        self.test_data = test
        self.dir = dir
        if os.path.exists(self.dir):
            shutil.rmtree(self.dir)
    def train(self, epochs, trainer, lr, no_decay, patience, end_patience):
        if trainer == "sgd":
            trainer = dy.MomentumSGDTrainer(self.model, learning_rate=lr)
            trainer.set_clip_threshold(5.0)
        else:
            trainer = dy.AdamTrainer(self.model)
        best_acc = 0

        print(len(self.training_data))

        check_val = int(len(self.training_data) / (5.0 * self.batch_size))
        best_ep = -1
        for ep in range(epochs):
            logging.info("Epoch: %d" % ep)
            ep_loss = 0
            num_batches = 0
            random.shuffle(self.training_data)
            for i in range(0, len(self.training_data), self.batch_size):
                if num_batches % check_val == 0:
                    v_acc = self.get_accuracy(self.dev_data, print_out="dev.temp.")
                    logging.info("Validation F1: %f" % v_acc)

                    if v_acc > best_acc:
                        self.save_model()
                        best_acc = v_acc
                        logging.info("Saved!")
                        best_ep = ep
                cur_size = min(self.batch_size, len(self.training_data) - i)
                loss = self.calculate_loss(self.training_data[i : i + cur_size])
                ep_loss += loss.scalar_value()
                loss.backward()
                trainer.update()
                num_batches += 1
            logging.info("Training loss: %f" % ep_loss)
            if (ep - best_ep) > end_patience:
                self.model.populate(self.model_file)
                logging.info("Training patience reached.\n")
                break
            if not no_decay and (ep - best_ep) > patience:
                self.model.populate(self.model_file)
                # best_ep = ep
                lr = trainer.learning_rate / 1.05
                trainer.learning_rate = lr
                logging.info("New learning rate: " + str(lr))
            logging.info("\n")
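
For reference, the loop above follows DyNet's usual training step: build a fresh computation graph, compute the loss, run the forward pass, call backward(), then trainer.update(). Below is a minimal, self-contained sketch of that pattern with dy.AdamTrainer; the toy linear model and the data points are illustrative assumptions, not taken from the example above.

import random
import dynet as dy

# Minimal sketch: a toy one-output regression trained with dy.AdamTrainer.
pc = dy.ParameterCollection()
trainer = dy.AdamTrainer(pc)
pw = pc.add_parameters((2,))                       # toy weight vector (illustrative)

data = [([1.0, 2.0], 5.0), ([3.0, 1.0], 7.0)]      # toy (x, y) pairs (illustrative)

for epoch in range(10):
    random.shuffle(data)
    for x, y in data:
        dy.renew_cg()                              # fresh graph per example
        w = dy.parameter(pw)
        y_hat = dy.dot_product(w, dy.inputVector(x))
        loss = dy.squared_distance(y_hat, dy.scalarInput(y))
        loss.value()                               # forward pass
        loss.backward()                            # gradients
        trainer.update()                           # Adam step
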
Example #3
 def train_network(self, train_data, epochs = 3):
     trainer = dy.AdamTrainer(self.pc)
     i = 0
     mloss = 0.
     goods = 0.
         
     for e in range(epochs):
         shuffle(train_data)
         for x, y in train_data:
             i = i + 1
             loss = self.eval_loss(x, y)
             good = y == self.last_case_class()
             #print y, self.last_output_value(), np.argmax(self.last_output_value()), self.last_case_class()
             mloss += loss.value()
             goods += int(good)
             loss.backward()
             trainer.update()
     print("average loss: {} acc: {}".format(mloss/i, goods/i))
Example #4
 def __init__(self, word_size, context_fre, context_size, vocab, window=2,
              subsample_n=2000, mode='bow', embed_size=200, batch_size=128,
              num_sampled=5, epoch=6):
     self.embed_size = embed_size
     self.mode = mode
     self.window = window
     self.vocab = vocab
     self.word_size = word_size
     self.subsample_n = subsample_n
     self.context_size = context_size
     self.num_sampled = num_sampled
     self.epoch = epoch
     self.context_fre = context_fre
     self.batch_size=batch_size
     self.pc = dy.ParameterCollection()
     self.optimizer = dy.AdamTrainer(self.pc)
     self.word_embeddings = self.pc.add_lookup_parameters((self.word_size, self.embed_size), name="word-embeddings")
     self.context_embeddings = self.pc.add_lookup_parameters((self.context_size, self.embed_size), name="context-embeddings")
     dy.renew_cg()
     print ([(param.name(), param.shape()) for param in self.pc.lookup_parameters_list() + self.pc.parameters_list()])
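
The class above allocates separate word and context lookup tables together with a num_sampled count, which points to a word2vec-style negative-sampling objective. The sketch below is a hedged illustration of such a per-example loss over those two tables; the function name and the integer ids target, context, and negatives are assumptions for illustration, not taken from the class.

import dynet as dy

def neg_sampling_loss(word_emb, ctx_emb, target, context, negatives):
    # Hedged sketch: score the true (target, context) pair high and the
    # sampled negative context ids low under a log-sigmoid objective.
    w = dy.lookup(word_emb, target)
    pos = dy.log(dy.logistic(dy.dot_product(w, dy.lookup(ctx_emb, context))))
    neg = dy.esum([dy.log(dy.logistic(-dy.dot_product(w, dy.lookup(ctx_emb, n))))
                   for n in negatives])
    return -(pos + neg)
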
def main():
    parser = argparse.ArgumentParser(
        description=
        'Convolutional Neural Networks for Sentence Classification in DyNet')

    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument(
        '--train_x_path',
        type=str,
        default='./data/train_x.txt',
        help='File path of train x data [default: `./data/train_x.txt`]')
    parser.add_argument(
        '--train_y_path',
        type=str,
        default='./data/train_y.txt',
        help='File path of train y data [default: `./data/train_y.txt`]')
    parser.add_argument(
        '--valid_x_path',
        type=str,
        default='./data/valid_x.txt',
        help='File path of valid x data [default: `./data/valid_x.txt`]')
    parser.add_argument(
        '--valid_y_path',
        type=str,
        default='./data/valid_y.txt',
        help='File path of valid y data [default: `./data/valid_y.txt`]')
    parser.add_argument('--n_epochs',
                        type=int,
                        default=10,
                        help='Number of epochs [default: 10]')
    parser.add_argument('--batch_size',
                        type=int,
                        default=64,
                        help='Mini batch size [default: 64]')
    parser.add_argument('--win_sizes',
                        type=int,
                        nargs='*',
                        default=[3, 4, 5],
                        help='Window sizes of filters [default: [3, 4, 5]]')
    parser.add_argument(
        '--num_fil',
        type=int,
        default=100,
        help='Number of filters in each window size [default: 100]')
    parser.add_argument('--s',
                        type=float,
                        default=3.0,
                        help='L2 norm constraint on w [default: 3.0]')
    parser.add_argument('--dropout_prob',
                        type=float,
                        default=0.5,
                        help='Dropout probability [default: 0.5]')
    parser.add_argument(
        '--v_strategy',
        type=str,
        default='static',
        help=
        'Embedding strategy. rand: Random initialization. static: Load pretrained embeddings and do not update during training. non-static: Load pretrained embeddings and update during training. [default: static]'
    )
    parser.add_argument(
        '--alloc_mem',
        type=int,
        default=4096,
        help='Amount of memory to allocate [mb] [default: 4096]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    N_EPOCHS = args.n_epochs
    WIN_SIZES = args.win_sizes
    BATCH_SIZE = args.batch_size
    EMB_DIM = 300
    OUT_DIM = 1
    L2_NORM_LIM = args.s
    NUM_FIL = args.num_fil
    DROPOUT_PROB = args.dropout_prob
    V_STRATEGY = args.v_strategy
    ALLOC_MEM = args.alloc_mem

    if V_STRATEGY in ['rand', 'static', 'non-static']:
        NUM_CHA = 1
    else:
        NUM_CHA = 2

    # FILE paths
    W2V_PATH = './GoogleNews-vectors-negative300.bin'
    TRAIN_X_PATH = args.train_x_path
    TRAIN_Y_PATH = args.train_y_path
    VALID_X_PATH = args.valid_x_path
    VALID_Y_PATH = args.valid_y_path

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Load pretrained embeddings
    pretrained_model = gensim.models.KeyedVectors.load_word2vec_format(
        W2V_PATH, binary=True)
    vocab = pretrained_model.wv.vocab.keys()
    w2v = pretrained_model.wv

    # Build dataset =======================================================================================================
    w2c = build_w2c(TRAIN_X_PATH, vocab=vocab)
    w2i, i2w = build_w2i(TRAIN_X_PATH, w2c, unk='unk')
    train_x, train_y = build_dataset(TRAIN_X_PATH,
                                     TRAIN_Y_PATH,
                                     w2i,
                                     unk='unk')
    valid_x, valid_y = build_dataset(VALID_X_PATH,
                                     VALID_Y_PATH,
                                     w2i,
                                     unk='unk')

    train_x, train_y = sort_data_by_length(train_x, train_y)
    valid_x, valid_y = sort_data_by_length(valid_x, valid_y)

    VOCAB_SIZE = len(w2i)
    print('VOCAB_SIZE:', VOCAB_SIZE)

    V_init = init_V(w2v, w2i)

    with open(os.path.join(RESULTS_DIR, './w2i.dump'),
              'wb') as f_w2i, open(os.path.join(RESULTS_DIR, './i2w.dump'),
                                   'wb') as f_i2w:
        pickle.dump(w2i, f_w2i)
        pickle.dump(i2w, f_i2w)

    # Build model =================================================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    # V1
    V1 = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    if V_STRATEGY in ['static', 'non-static', 'multichannel']:
        V1.init_from_array(V_init)
    if V_STRATEGY in ['static', 'multichannel']:
        V1_UPDATE = False
    else:  # 'rand', 'non-static'
        V1_UPDATE = True
    make_emb_zero(V1, [w2i['<s>'], w2i['</s>']], EMB_DIM)

    # V2
    if V_STRATEGY == 'multichannel':
        V2 = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
        V2.init_from_array(V_init)
        V2_UPDATE = True
        make_emb_zero(V2, [w2i['<s>'], w2i['</s>']], EMB_DIM)

    layers = [
        CNNText(model, EMB_DIM, WIN_SIZES, NUM_CHA, NUM_FIL, dy.tanh,
                DROPOUT_PROB),
        Dense(model, len(WIN_SIZES) * NUM_FIL, OUT_DIM, dy.logistic)
    ]

    # Train model ================================================================================
    n_batches_train = math.ceil(len(train_x) / BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_x) / BATCH_SIZE)

    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        loss_all_train = []
        pred_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters(layers)

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            x = build_batch(train_x[start:end], w2i, max(WIN_SIZES)).T
            t = np.array(train_y[start:end])

            sen_len = x.shape[0]

            if V_STRATEGY in ['rand', 'static', 'non-static']:
                x_embs = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs = dy.transpose(x_embs)
                x_embs = dy.reshape(x_embs, (sen_len, EMB_DIM, 1))
            else:  # multichannel
                x_embs1 = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs2 = dy.concatenate_cols(
                    [dy.lookup_batch(V2, x_t, update=V2_UPDATE) for x_t in x])
                x_embs1 = dy.transpose(x_embs1)
                x_embs2 = dy.transpose(x_embs2)
                x_embs = dy.concatenate([x_embs1, x_embs2], d=2)

            t = dy.inputTensor(t, batched=True)
            y = forwards(layers, x_embs, test=False)

            mb_loss = dy.mean_batches(dy.binary_log_loss(y, t))

            # Forward prop
            loss_all_train.append(mb_loss.value())
            pred_all_train.extend(list(binary_pred(y.npvalue().flatten())))

            # Backward prop
            mb_loss.backward()
            trainer.update()

            # L2 norm constraint
            layers[1].scale_W(L2_NORM_LIM)

            # Make padding embs zero
            if V_STRATEGY in ['rand', 'non-static']:
                make_emb_zero(V1, [w2i['<s>'], w2i['</s>']], EMB_DIM)
            elif V_STRATEGY in ['multichannel']:
                make_emb_zero(V2, [w2i['<s>'], w2i['</s>']], EMB_DIM)

        # Valid
        loss_all_valid = []
        pred_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters(layers)

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            x = build_batch(valid_x[start:end], w2i, max(WIN_SIZES)).T
            t = np.array(valid_y[start:end])

            sen_len = x.shape[0]

            if V_STRATEGY in ['rand', 'static', 'non-static']:
                x_embs = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs = dy.transpose(x_embs)
                x_embs = dy.reshape(x_embs, (sen_len, EMB_DIM, 1))
            else:  # multichannel
                x_embs1 = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs2 = dy.concatenate_cols(
                    [dy.lookup_batch(V2, x_t, update=V2_UPDATE) for x_t in x])
                x_embs1 = dy.transpose(x_embs1)
                x_embs2 = dy.transpose(x_embs2)
                x_embs = dy.concatenate([x_embs1, x_embs2], d=2)

            t = dy.inputTensor(t, batched=True)
            y = forwards(layers, x_embs, test=True)

            mb_loss = dy.mean_batches(dy.binary_log_loss(y, t))

            # Forward prop
            loss_all_valid.append(mb_loss.value())
            pred_all_valid.extend(list(binary_pred(y.npvalue().flatten())))

        print(
            'EPOCH: %d, Train Loss: %.3f (F1: %.3f, Acc: %.3f), Valid Loss: %.3f (F1: %.3f, Acc: %.3f), Time: %.3f[s]'
            % (
                epoch + 1,
                np.mean(loss_all_train),
                f1_score(train_y, pred_all_train),
                accuracy_score(train_y, pred_all_train),
                np.mean(loss_all_valid),
                f1_score(valid_y, pred_all_valid),
                accuracy_score(valid_y, pred_all_valid),
                time.time() - start_time,
            ))

        # Save model =========================================================================================================================
        if V_STRATEGY in ['rand', 'static', 'non-static']:
            dy.save(os.path.join(RESULTS_DIR, './model_e' + str(epoch + 1)),
                    [V1] + layers)
        else:
            dy.save(os.path.join(RESULTS_DIR, './model_e' + str(epoch + 1)),
                    [V1, V2] + layers)
Example #6
def train(builder,
          model,
          model_parameters,
          X_train,
          y_train,
          nepochs,
          alpha=0.01,
          update=True,
          dropout=0.0,
          x_y_vectors=None,
          num_hidden_layers=0):
    """
    Train the LSTM
    :param builder: the LSTM builder
    :param model: LSTM RNN model
    :param model_parameters: the model parameters
    :param X_train: the lstm instances
    :param y_train: the lstm labels
    :param nepochs: number of epochs
    :param alpha: the learning rate (only for SGD)
    :param update: whether to update the lemma embeddings
    :param dropout: dropout probability for all component embeddings
    :param x_y_vectors: the word vectors of x and y
    :param num_hidden_layers: the number of hidden layers for the term-pair classification network
    """
    trainer = dy.AdamTrainer(model, alpha=alpha)
    minibatch_size = min(MINIBATCH_SIZE, len(y_train))
    nminibatches = int(math.ceil(len(y_train) / minibatch_size))
    previous_loss = 1000

    for epoch in range(nepochs):

        total_loss = 0.0

        epoch_indices = np.random.permutation(len(y_train))

        for minibatch in range(nminibatches):

            path_cache = {}
            batch_indices = epoch_indices[minibatch *
                                          minibatch_size:(minibatch + 1) *
                                          minibatch_size]

            dy.renew_cg()

            loss = dy.esum([
                -dy.log(
                    dy.pick(
                        process_one_instance(
                            builder,
                            model,
                            model_parameters,
                            X_train[batch_indices[i]],
                            path_cache,
                            update,
                            dropout,
                            x_y_vectors=x_y_vectors[batch_indices[i]]
                            if x_y_vectors is not None else None,
                            num_hidden_layers=num_hidden_layers),
                        y_train[batch_indices[i]]))
                for i in range(minibatch_size)
            ])
            total_loss += loss.value()  # forward computation
            loss.backward()
            trainer.update()

        # trainer.update_epoch() is deprecated (see
        # http://dynet.readthedocs.io/en/latest/python_ref.html#optimizers);
        # a learning-rate decay sketch is given after this example.
        # trainer.update_epoch()
        trainer.update()
        total_loss /= len(y_train)
        print 'Epoch', (epoch + 1), '/', nepochs, 'Loss =', total_loss

        # Early stopping
        if math.fabs(previous_loss - total_loss) < LOSS_EPSILON:
            break

        previous_loss = total_loss
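
As noted in the comment above, trainer.update_epoch() is deprecated. DyNet trainers expose the learning rate directly (Example #1 above sets trainer.learning_rate, and Example #11 below reads it), so a per-epoch decay can stand in for it. A minimal hedged sketch, with an assumed initial alpha and decay factor:

import dynet as dy

model = dy.ParameterCollection()
trainer = dy.AdamTrainer(model, alpha=0.001)       # assumed initial learning rate

for epoch in range(10):
    # ... run the minibatch loop for this epoch here ...
    trainer.learning_rate = trainer.learning_rate / 1.05   # gentle per-epoch decay
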
Example #7
def main():
    parser = argparse.ArgumentParser(description='Selective Encoding for Abstractive Sentence Summarization in DyNet')

    parser.add_argument('--gpu', type=str, default='0', help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=3, help='Number of epochs [default: 3]')
    parser.add_argument('--n_train', type=int, default=3803957, help='Number of training data (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651, help='Number of validation data (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32, help='Mini batch size [default: 32]')
    parser.add_argument('--vocab_size', type=int, default=124404, help='Vocabulary size [default: 124404]')
    parser.add_argument('--emb_dim', type=int, default=256, help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256, help='Hidden state size [default: 256]')
    parser.add_argument('--maxout_dim', type=int, default=2, help='Maxout size [default: 2]')
    parser.add_argument('--alloc_mem', type=int, default=10000, help='Amount of memory to allocate [mb] [default: 10000]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS   = args.n_epochs
    N_TRAIN    = args.n_train
    N_VALID    = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = args.vocab_size
    EMB_DIM    = args.emb_dim
    HID_DIM    = args.hid_dim
    MAXOUT_DIM = args.maxout_dim
    ALLOC_MEM  = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset
    dataset = Dataset(
        TRAIN_X_FILE,
        TRAIN_Y_FILE,
        VALID_X_FILE,
        VALID_Y_FILE,
        vocab_size=VOCAB_SIZE,
        batch_size=BATCH_SIZE,
        n_train=N_TRAIN,
        n_valid=N_VALID
    )
    VOCAB_SIZE = len(dataset.w2i)
    print('VOCAB_SIZE', VOCAB_SIZE)

    # Build model
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    V = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    encoder = SelectiveBiGRU(model, EMB_DIM, HID_DIM)
    decoder = AttentionalGRU(model, EMB_DIM, HID_DIM, MAXOUT_DIM, VOCAB_SIZE)

    # Train model
    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        loss_all_train = []
        dataset.reset_train_iter()
        for train_x_mb, train_y_mb in tqdm(dataset.train_iter):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters([encoder, decoder])
            losses = []
            for x, t in zip(train_x_mb, train_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                hp, hb_1 = encoder(x_embs)

                # Decoder
                decoder.set_initial_states(hp, hb_1)
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                y = decoder(t_embs)

                # Loss
                loss = dy.esum(
                    [dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)]
                )
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid
        loss_all_valid = []
        dataset.reset_valid_iter()
        for valid_x_mb, valid_y_mb in dataset.valid_iter:
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters([encoder, decoder])
            losses = []
            for x, t in zip(valid_x_mb, valid_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                hp, hb_1 = encoder(x_embs)

                # Decoder
                decoder.set_initial_states(hp, hb_1)
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                y = decoder(t_embs)

                # Loss
                loss = dy.esum(
                    [dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)]
                )
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f, Time: %.3f[s]' % (
            epoch+1,
            np.mean(loss_all_train),
            np.mean(loss_all_valid),
            time.time()-start_time
        ))

        # Save model
        dy.save('./model_e'+str(epoch+1), [V, encoder, decoder])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(dataset.w2i, f_w2i)
            pickle.dump(dataset.i2w, f_i2w)
Example #8
def main():
    parser = argparse.ArgumentParser(description='A Neural Attention Model for Abstractive Sentence Summarization in DyNet')

    parser.add_argument('--gpu', type=str, default='0', help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=10, help='Number of epochs [default: 10]')
    parser.add_argument('--n_train', type=int, default=3803957, help='Number of training data (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651, help='Number of validation data (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32, help='Mini batch size [default: 32]')
    parser.add_argument('--vocab_size', type=int, default=60000, help='Vocabulary size [default: 60000]')
    parser.add_argument('--emb_dim', type=int, default=256, help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256, help='Hidden state size [default: 256]')
    parser.add_argument('--encoder_type', type=str, default='attention', help='Encoder type. bow: Bag-of-words encoder. attention: Attention-based encoder [default: attention]')
    parser.add_argument('--c', type=int, default=5, help='Window size in neural language model [default: 5]')
    parser.add_argument('--q', type=int, default=2, help='Window size in attention-based encoder [default: 2]')
    parser.add_argument('--alloc_mem', type=int, default=4096, help='Amount of memory to allocate [mb] [default: 4096]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS     = args.n_epochs
    N_TRAIN      = args.n_train
    N_VALID      = args.n_valid
    BATCH_SIZE   = args.batch_size
    VOCAB_SIZE   = args.vocab_size
    EMB_DIM      = args.emb_dim
    HID_DIM      = args.hid_dim
    ENCODER_TYPE = args.encoder_type
    C            = args.c
    Q            = args.q
    ALLOC_MEM    = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset ====================================================================================
    w2c = build_word2count(TRAIN_X_FILE, n_data=N_TRAIN)
    w2c = build_word2count(TRAIN_Y_FILE, w2c=w2c, n_data=N_TRAIN)

    train_X, w2i, i2w = build_dataset(TRAIN_X_FILE, w2c=w2c, padid=False, eos=True, unksym='<unk>', target=False, n_data=N_TRAIN, vocab_size=VOCAB_SIZE)
    train_y, _, _     = build_dataset(TRAIN_Y_FILE, w2i=w2i, target=True, n_data=N_TRAIN)

    valid_X, _, _ = build_dataset(VALID_X_FILE, w2i=w2i, target=False, n_data=N_VALID)
    valid_y, _, _ = build_dataset(VALID_Y_FILE, w2i=w2i, target=True, n_data=N_VALID)

    VOCAB_SIZE = len(w2i)
    OUT_DIM = VOCAB_SIZE
    print('VOCAB_SIZE:', VOCAB_SIZE)

    # Build model ======================================================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    rush_abs = ABS(model, EMB_DIM, HID_DIM, VOCAB_SIZE, Q, C, encoder_type=ENCODER_TYPE)

    # Padding
    train_y = [[w2i['<s>']]*(C-1)+instance_y for instance_y in train_y]
    valid_y = [[w2i['<s>']]*(C-1)+instance_y for instance_y in valid_y]

    n_batches_train = math.ceil(len(train_X)/BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_X)/BATCH_SIZE)

    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        train_X, train_y = shuffle(train_X, train_y)
        loss_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            rush_abs.associate_parameters()

            # Create a mini batch
            start = i*BATCH_SIZE
            end = start + BATCH_SIZE
            train_X_mb = train_X[start:end]
            train_y_mb = train_y[start:end]

            losses = []
            for x, t in zip(train_X_mb, train_y_mb):
                t_in, t_out = t[:-1], t[C:]

                y = rush_abs(x, t_in)
                loss = dy.esum([dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid
        loss_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            rush_abs.associate_parameters()

            # Create a mini batch
            start = i*BATCH_SIZE
            end = start + BATCH_SIZE
            valid_X_mb = valid_X[start:end]
            valid_y_mb = valid_y[start:end]

            losses = []
            for x, t in zip(valid_X_mb, valid_y_mb):
                t_in, t_out = t[:-1], t[C:]

                y = rush_abs(x, t_in)
                loss = dy.esum([dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f' % (
            epoch+1,
            np.mean(loss_all_train),
            np.mean(loss_all_valid)
        ))

        # Save model ========================================================================
        dy.save('./model_e'+str(epoch+1), [rush_abs])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(w2i, f_w2i)
            pickle.dump(i2w, f_i2w)
Example #9
 def __init__(self, model, type, lrate, moment=None):
     self._tt = {
         "sgd": dy.SimpleSGDTrainer(model, lrate),
         "momentum": dy.MomentumSGDTrainer(model, lrate, moment),
         "adam": dy.AdamTrainer(model, lrate)
     }[type]
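
Note that the dict above constructs all three trainers eagerly, so the momentum trainer is built (and needs a usable moment value) even when "adam" or "sgd" is selected. Below is a hedged, standalone sketch of a lazier variant; the function name and the default momentum are assumptions, not from the source. Example #13 further down selects its trainer with the same if/elif style.

import dynet as dy

def make_trainer(model, kind, lrate, moment=0.9):
    # Build only the trainer that was asked for.
    if kind == "sgd":
        return dy.SimpleSGDTrainer(model, lrate)
    if kind == "momentum":
        return dy.MomentumSGDTrainer(model, lrate, moment)
    return dy.AdamTrainer(model, lrate)
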
Example #10
def main():
    print_config(opt)
    # Load the relations
    with codecs.open(args.dataset_prefix + '/relations.txt', 'r', 'utf-8') as f_in:
        relations = [line.strip() for line in f_in]
        relation_index = {relation: i for i, relation in enumerate(relations)}

    # Load the datasets
    if args.debug:
        trainname = '../datasets/wn-bo/train_sample.tsv'
        print 'Loading the dataset...', trainname, '*' * 10
        train_set = load_dataset(trainname, relations)
        val_set = load_dataset(trainname, relations)
        test_set = load_dataset(trainname, relations)
    else:
        trainname = '/' + args.trainname + '.tsv'
        valname = '/' + args.valname + '.tsv'
        testname = '/' + args.testname + '.tsv'
        print 'Loading the dataset...', trainname, '*' * 10
        train_set = load_dataset(args.dataset_prefix + trainname, relations)
        print 'Loading the dataset...', valname, '*' * 10
        val_set = load_dataset(args.dataset_prefix + valname, relations)
        print 'Loading the dataset...', testname, '*' * 10
        test_set = load_dataset(args.dataset_prefix + testname, relations)
    # y_train = [relation_index[label] for label in train_set.values()]
    # y_val = [relation_index[label] for label in val_set.values()]
    # y_test = [relation_index[label] for label in test_set.values()]
    dataset_keys = train_set.keys() + val_set.keys() + test_set.keys()
    # add (x, root) to dataset_keys
    vocab = set()
    for (x, y) in dataset_keys:
        vocab.add(x)
        vocab.add(y)
    dataset_keys += [(term, 'root007') for term in vocab]

    if not args.debug:
        trees = read_tree_file(
            "../datasets/wn-bo/wn-bo-trees-4-11-50-train533-lower.ptb",
            given_root=args.given_root_train, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_val = read_tree_file(
            "../datasets/wn-bo/wn-bo-trees-4-11-50-dev114-lower.ptb",
            given_root=args.given_root_test, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_test = read_tree_file(
            "../datasets/wn-bo/wn-bo-trees-4-11-50-test114-lower.ptb",
            given_root=args.given_root_test, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_semeval = read_edge_files("../datasets/SemEval-2016/original/",
                                        given_root=True, filter_root=args.filter_root, allow_up=False)
    else:
        trees = read_tree_file(
            "../datasets/wn-bo/train_sample.ptb2",
            given_root=args.given_root_train, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_val = read_tree_file(
            "../datasets/wn-bo/train_sample.ptb2",
            given_root=args.given_root_train, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_test = read_tree_file(
            "../datasets/wn-bo/train_sample.ptb2",
            given_root=args.given_root_test, filter_root=args.filter_root, allow_up=args.allow_up)
        trees_semeval = read_tree_file(
            "../datasets/wn-bo/train_sample.ptb2",
            given_root=args.given_root_test, filter_root=args.filter_root, allow_up=args.allow_up)

    # Load the resource (processed corpus)
    print 'Loading the corpus...', args.corpus_prefix, '*' * 10
    corpus = KnowledgeResource(args.corpus_prefix)

    if not os.path.exists('pickled_data/preload_data_{}_debug{}.pkl'.format(args.model_prefix_file, args.debug)):
        print 'Loading the vocabulary...'
        # path_lemmas_name = "pickled_data/path_lemmas_3in1.pkl"
        # print 'reload path_lemmas from:', path_lemmas_name
        # path_lemmas = pickle.load(open(path_lemmas_name, 'rb'))
        path_lemmas, x_y_words, keys = get_vocabulary(corpus, dataset_keys, None)
        if not args.debug:
            pickle.dump(path_lemmas, open('pickled_data/path_lemmas_{}.pkl'.format(args.model_prefix_file), 'wb'))
            pickle.dump(x_y_words, open('pickled_data/x_y_words_{}.pkl'.format(args.model_prefix_file), 'wb'))

        # Load the word embeddings
        print 'Initializing word embeddings...'
        word_vectors, word_index, word_set = load_embeddings(args.embeddings_file, path_lemmas, x_y_words,
                                                             debug=args.debug)
        # Load the paths and create the feature vectors
        print 'Loading path files...'
        dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index, dep_inverted_index, \
        dir_inverted_index = load_paths_and_word_vectors(corpus, dataset_keys, word_index, keys)
        print 'saving pkl...'
        pickle.dump((word_vectors, word_index, word_set, dataset_instances, pos_index, dep_index, dir_index,
                     pos_inverted_index, dep_inverted_index, dir_inverted_index),
                    open('pickled_data/preload_data_{}_debug{}.pkl'.format(args.model_prefix_file, args.debug), 'wb'))
    else:
        print 'Data loaded from', 'pickled_data/preload_data_{}_debug{}.pkl'.format(args.model_prefix_file,
                                                                                    args.debug), 'make sure pkl is correct'
        (word_vectors, word_index, word_set, dataset_instances, pos_index, dep_index, dir_index, pos_inverted_index,
         dep_inverted_index, dir_inverted_index) = pickle.load(
            open('pickled_data/preload_data_{}_debug{}.pkl'.format(args.model_prefix_file, args.debug), 'rb'))

    print 'Number of words %d, number of pos tags: %d, number of dependency labels: %d, number of directions: %d' % \
          (len(word_index), len(pos_index), len(dep_index), len(dir_index))

    # dataset_instances is now (paths, x_y_vectors, features)
    X_train = dataset_instances[:len(train_set)]
    X_val = dataset_instances[len(train_set):len(train_set) + len(val_set)]
    X_test = dataset_instances[len(train_set) + len(val_set):]
    print len(X_train), len(X_val), len(X_test)

    # check_data(train_set, X_train, word_set)
    # check_data(val_set, X_val, word_set)
    # check_data(test_set, X_test, word_set)
    # save_path_info(dataset_keys, dataset_instances)
    # scores_save = []
    # scores_save_test = []
    # prob_save = []
    # prob_save_test = []
    policy = Policy(dataset_keys, dataset_instances, num_lemmas=len(word_index), num_pos=len(pos_index),
                    num_dep=len(dep_index), num_directions=len(dir_index), opt=opt, num_relations=len(relations),
                    lemma_embeddings=word_vectors)
    trainer = dy.AdamTrainer(policy.model, alpha=args.lr)
    n_epoch = 1000
    best = [0] * 6
    best_idx = [0] * 6
    best_val = [0] * 6
    best_val_idx = [0] * 6
    best_test = [0] * 6
    best_test_idx = [0] * 6
    best_semeval = [0] * 6
    best_semeval_idx = [0] * 6
    policy_save_test = defaultdict(list)
    wrong_total_l = []

    # check_limit(trees, policy, policy.unk_hard)
    # check_limit(trees, policy, policy.unk_soft)
    # check_limit(trees_test, policy, policy.unk_hard)
    # check_limit(trees_test, policy, policy.unk_soft)
    # exit(0)

    # TRAIN / TEST START HERE
    if args.load_model_file is None:
        for epoch in range(n_epoch):
            best, best_idx = train(epoch, trees, policy, trainer, best, best_idx, wrong_total_l)
            # policy_save_test, best_test, best_test_idx = test(epoch, trees_test, policy, policy_save_test, best_test,
            #                                                   best_test_idx)
            _, best_val, best_val_idx = test_single(epoch, trees_val, policy, [], best_val, best_val_idx, wrong_total_l)
            policy_save_test, best_test, best_test_idx = test_single(epoch, trees_test, policy, policy_save_test,
                                                                     best_test, best_test_idx, wrong_total_l)
    else:
        load_candidate_from_pickle(trees_semeval)
        _, best_semeval, best_semeval_idx = test_single(0, trees_semeval, policy, [], best_semeval,
                                                        best_semeval_idx, wrong_total_l,
                                                        reward_type='print_each')
Example #11
def main(args):
    import dynet as dy
    
    get_data = {"ag": lambda : ag_data_reader.get_dataset(args.num_NE),
                "dw": lambda : dw_data_reader.get_dataset(args.num_NE),
                "bl": lambda : blog_data_reader.get_dataset(),
                "tp_fr": lambda : trustpilot_data_reader.get_dataset("fr"),
                "tp_de": lambda : trustpilot_data_reader.get_dataset("de"),
                "tp_dk": lambda : trustpilot_data_reader.get_dataset("dk"),
                "tp_us": lambda : trustpilot_data_reader.get_dataset("us"),
                "tp_uk": lambda : trustpilot_data_reader.get_dataset("uk")}
    
    train, dev, test = get_data[args.dataset]()
    
    labels_main_task = set([ex.get_label() for ex in train])
    labels_main_task.add(0)
    
    assert(sorted(labels_main_task) == list(range(len(labels_main_task))))
    
    labels_adve_task = get_aux_labels(train)
    
    print("Train size: {}".format(len(train)))
    print("Dev size:   {}".format(len(dev)))
    print("Test size:  {}".format(len(test)))
    
    print("Train data distribution")
    mfb_train = print_data_distributions(train)

    print("Dev data distribution")
    mfb_dev = print_data_distributions(dev)

    print("Test data distribution")
    mfb_test = print_data_distributions(test)

    results = {}

    model = dy.Model()
    
    #if args.use_demographics:
    symbols = ["<g={}>".format(i) for i in ["F", "M"]] + ["<a={}>".format(i) for i in ["U", "O"]]
    vocabulary = extract_vocabulary(train, add_symbols=symbols)
    
    bilstm = HierarchicalBiLSTM(args, vocabulary, model)
    input_size = bilstm.size()
    main_classifier = MLP(input_size, len(labels_main_task), args.hidden_layers, args.dim_hidden, dy.rectify, model)
    
    trainer = dy.AdamTrainer(model)
    trainer.set_clip_threshold(5)
    
    args.learning_rate = trainer.learning_rate
    
    if args.subset:
        train = train[:args.subset]
        dev = dev[:args.subset]

    output_size = len(labels_adve_task)
    adversary_classifier = MLP_sigmoid(input_size, output_size, args.hidden_layers, args.dim_hidden, dy.rectify, model)
    
    discriminator = None
    if args.atraining:
        discriminator = Discriminator(input_size, output_size, args.hidden_layers, args.dim_hidden, dy.rectify, model, trainer)
    
    generator = None
    if args.generator:
        generator = Generator(args, vocabulary, model, trainer)

    #### add adversary classifier
    mod = PrModel(args, model, trainer, bilstm, main_classifier, adversary_classifier, discriminator, generator, vocabulary)
    
    
    if args.baseline:
        _, ftest = mod.train_baseline(train, dev, test, args.iterations)
        print(ftest)
        return
    
    
    print("Train main task")
    results["000_main_dev_acc"] = mod.train_main(train, dev)
    
    targets_test = [ex.get_label() for ex in test]
    loss_test, acc_test, _ = mod.evaluate_main(test, targets_test)
    print("\t Test results : l={} acc={}".format(loss_test, acc_test))
    results["001_main_test_acc"] = acc_test
    
    
    
    ##############
    ##############    Adversary training / evaluate privacy
    ##############

    train_hidden, dev_hidden, test_hidden = [mod.get_adversary_dataset(dataset) for dataset in [train, dev, test]]
    
    
    trainer.restart()
    print("Train adversary")
    results["002_adv_dev_F"] = mod.train_adversary(train_hidden, dev_hidden)
    targets_test = [ex.get_aux_labels() for ex in test]
    loss_test, acc_test, predictions_test = mod.evaluate_adversary(test_hidden)
    
    print("\t Adversary Test results : l={} acc={}".format(loss_test, acc_test))
    outsize = mod.adversary_classifier.output_size()
    Fscore = compute_eval_metrics(outsize, targets_test, predictions_test)
    print("\tF          = {} ".format(Fscore))


    results["003_adv_test_fscore"] = Fscore[2]
    results["004_adv_test_precision"] = Fscore[0]
    results["005_adv_test_recall"] = Fscore[1]
    for i, acc in enumerate(Fscore[3]):
        results["{}_adv_test_acc_task_{}".format(str(i+6).zfill(3), i)] = acc


    preds = [set(range(outsize)) for _ in targets_test]
    Fscore = compute_eval_metrics(outsize, targets_test, preds)
    
    baseline_str = [Fscore[2], Fscore[0], Fscore[1]] + [x if x > 50.0 else 100 - x for x in Fscore[3]]
    
    
    
    line = ["Baseline", "NA", "NA", "NA", "NA", "NA", "NA", "NA", str(round(mfb_train * 100, 2)), str(round(mfb_test*100, 2)), "0"]
    print("\t".join(line) + "\t" + "\t".join(map(str, baseline_str)))
    
    
    for k in results:
        if type(results[k]) == float:
            results[k] = round(results[k], 2)
    
    
    results["#H"] = args.dim_hidden
    results["#h"] = args.hidden_layers
    results["#w"] = args.dim_word
    results["#W"] = args.dim_wrnn
    results["#Zatr"] = int(args.atraining)
    results["#Zptr"] = int(args.ptraining)
    results["#Zalpha"] = args.alpha
    
    keys = sorted(results)
    
    print("Model\t", end="")
    print("\t".join(keys))
    print("\t".join(map(str, [results[k] for k in keys])))
Example #12
def main():
    parser = argparse.ArgumentParser(
        description=
        'Deep Recurrent Generative Decoder for Abstractive Text Summarization in DyNet'
    )

    parser.add_argument('--gpu',
                        type=str,
                        default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs',
                        type=int,
                        default=3,
                        help='Number of epochs [default: 3]')
    parser.add_argument(
        '--n_train',
        type=int,
        default=3803957,
        help=
        'Number of training examples (up to 3803957 in gigaword) [default: 3803957]'
    )
    parser.add_argument(
        '--n_valid',
        type=int,
        default=189651,
        help=
        'Number of validation examples (up to 189651 in gigaword) [default: 189651]'
    )
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Mini batch size [default: 32]')
    parser.add_argument('--emb_dim',
                        type=int,
                        default=256,
                        help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim',
                        type=int,
                        default=256,
                        help='Hidden state size [default: 256]')
    parser.add_argument('--lat_dim',
                        type=int,
                        default=256,
                        help='Latent size [default: 256]')
    parser.add_argument(
        '--alloc_mem',
        type=int,
        default=8192,
        help='Amount of memory to allocate [mb] [default: 8192]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = 60000
    EMB_DIM = args.emb_dim
    HID_DIM = args.hid_dim
    LAT_DIM = args.lat_dim
    ALLOC_MEM = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset ====================================================================================
    w2c = build_word2count(TRAIN_X_FILE, n_data=N_TRAIN)
    w2c = build_word2count(TRAIN_Y_FILE, w2c=w2c, n_data=N_TRAIN)

    train_X, w2i, i2w = build_dataset(TRAIN_X_FILE,
                                      w2c=w2c,
                                      padid=False,
                                      eos=True,
                                      unksym='<unk>',
                                      target=False,
                                      n_data=N_TRAIN,
                                      vocab_size=VOCAB_SIZE)
    train_y, _, _ = build_dataset(TRAIN_Y_FILE,
                                  w2i=w2i,
                                  target=True,
                                  n_data=N_TRAIN)

    valid_X, _, _ = build_dataset(VALID_X_FILE,
                                  w2i=w2i,
                                  target=False,
                                  n_data=N_VALID)
    valid_y, _, _ = build_dataset(VALID_Y_FILE,
                                  w2i=w2i,
                                  target=True,
                                  n_data=N_VALID)

    VOCAB_SIZE = len(w2i)
    OUT_DIM = VOCAB_SIZE
    print(VOCAB_SIZE)

    # Build model ======================================================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    V = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))

    encoder = BiGRU(model, EMB_DIM, 2 * HID_DIM)
    decoder = RecurrentGenerativeDecoder(model, EMB_DIM, 2 * HID_DIM, LAT_DIM,
                                         OUT_DIM)

    # Train model =======================================================================================
    n_batches_train = math.ceil(len(train_X) / BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_X) / BATCH_SIZE)

    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        train_X, train_y = shuffle(train_X, train_y)
        loss_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            encoder.associate_parameters()
            decoder.associate_parameters()

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            train_X_mb = train_X[start:end]
            train_y_mb = train_y[start:end]

            losses = []
            for x, t in zip(train_X_mb, train_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                he = encoder(x_embs)

                # Decoder
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                decoder.set_initial_states(he)
                y, KL = decoder(t_embs)

                loss = dy.esum([
                    dy.pickneglogsoftmax(y_t, t_t) + KL_t
                    for y_t, t_t, KL_t in zip(y, t_out, KL)
                ])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid
        loss_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            encoder.associate_parameters()
            decoder.associate_parameters()

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            valid_X_mb = valid_X[start:end]
            valid_y_mb = valid_y[start:end]

            losses = []
            for x, t in zip(valid_X_mb, valid_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                he = encoder(x_embs)

                # Decoder
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                decoder.set_initial_states(he)
                y, KL = decoder(t_embs)

                loss = dy.esum([
                    dy.pickneglogsoftmax(y_t, t_t) + KL_t
                    for y_t, t_t, KL_t in zip(y, t_out, KL)
                ])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f' %
              (epoch + 1, np.mean(loss_all_train), np.mean(loss_all_valid)))

        # Save model ======================================================================================
        dy.save('./model_e' + str(epoch + 1), [V, encoder, decoder])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump',
                                                     'wb') as f_i2w:
            pickle.dump(w2i, f_w2i)
            pickle.dump(i2w, f_i2w)
Example #13
def train_model(model, encoder, decoder, params, train_inputs, train_outputs,
                dev_inputs, dev_outputs, y2int, int2y, epochs, optimization,
                results_file_path, plot, batch_size, eval_after, min_epochs):
    print 'training...'
    sys.stdout.flush()

    np.random.seed(17)
    random.seed(17)

    # sort training sentences by length in descending order
    train_data = zip(train_inputs, train_outputs)
    train_data.sort(key=lambda t: -len(t[0]))
    train_order = [
        x * batch_size for x in range(len(train_data) / batch_size + 1)
    ]

    # sort dev sentences by length in descending order
    dev_batch_size = 1
    dev_data = zip(dev_inputs, dev_outputs)
    dev_data.sort(key=lambda t: -len(t[0]))
    dev_order = [
        x * dev_batch_size for x in range(len(dev_data) / dev_batch_size + 1)
    ]

    if optimization == 'ADAM':
        trainer = dn.AdamTrainer(
            model
        )  # lam=REGULARIZATION, alpha=LEARNING_RATE, beta_1=0.9, beta_2=0.999, eps=1e-8)
    elif optimization == 'MOMENTUM':
        trainer = dn.MomentumSGDTrainer(model)
    elif optimization == 'SGD':
        trainer = dn.SimpleSGDTrainer(model)
    elif optimization == 'ADAGRAD':
        trainer = dn.AdagradTrainer(model)
    elif optimization == 'ADADELTA':
        trainer = dn.AdadeltaTrainer(model)
    else:
        trainer = dn.SimpleSGDTrainer(model)

    trainer.set_clip_threshold(float(arguments['--grad-clip']))
    seen_examples_count = 0
    total_loss = 0
    best_dev_epoch = 0
    best_train_epoch = 0
    patience = 0
    train_len = len(train_outputs)
    dev_len = len(dev_inputs)
    avg_train_loss = -1
    train_loss_patience = 0
    train_loss_patience_threshold = 99999999
    max_patience = int(arguments['--max-patience'])
    log_path = results_file_path + '_log.txt'
    start_epoch, checkpoints_x, train_loss_y, dev_loss_y, dev_accuracy_y = read_from_log(
        log_path)

    if len(train_loss_y) > 0:
        total_batches = checkpoints_x[-1]
        best_avg_train_loss = min(train_loss_y)
        best_dev_accuracy = max(dev_accuracy_y)
        best_dev_loss = min(dev_loss_y)
    else:
        total_batches = 0
        best_avg_train_loss = 999999
        best_dev_loss = 999999
        best_dev_accuracy = 0

    # progress bar init
    # noinspection PyArgumentList
    # widgets = [progressbar.Bar('>'), ' ', progressbar.ETA()]
    # train_progress_bar = progressbar.ProgressBar(widgets=widgets, maxval=epochs).start()

    e = -1
    for e in xrange(start_epoch, epochs):
        try:
            # shuffle the batch start indices in each epoch
            random.shuffle(train_order)
            batches_per_epoch = len(train_order)
            start = time.time()

            # go through batches
            for i, batch_start_index in enumerate(train_order, start=1):
                # get batch examples
                batch_inputs = [
                    x[0]
                    for x in train_data[batch_start_index:batch_start_index +
                                        batch_size]
                ]
                batch_outputs = [
                    x[1]
                    for x in train_data[batch_start_index:batch_start_index +
                                        batch_size]
                ]
                actual_batch_size = len(batch_inputs)

                # skip empty batches
                if actual_batch_size == 0 or len(batch_inputs[0]) == 0:
                    continue

                # compute batch loss

                # debug prints for batch seq lengths
                # print 'batch {} seq lens'.format(i)
                # print [len(s) for s in batch_inputs]
                loss = compute_batch_loss(encoder, decoder, batch_inputs,
                                          batch_outputs, y2int)

                # forward pass
                total_loss += loss.scalar_value()
                loss.backward()

                total_batches += 1

                # update parameters
                trainer.update()

                seen_examples_count += actual_batch_size

                # avg loss per sample
                avg_train_loss = total_loss / float(i * batch_size +
                                                    e * train_len)

                # start patience counts only after 20 batches
                if avg_train_loss < best_avg_train_loss and total_batches > 20:
                    best_avg_train_loss = avg_train_loss
                    train_loss_patience = 0
                else:
                    train_loss_patience += 1
                    if train_loss_patience > train_loss_patience_threshold:
                        print 'train loss patience exceeded: {}'.format(
                            train_loss_patience)
                        sys.stdout.flush()
                        return model, params, e, best_dev_epoch

                if total_batches % 100 == 0 and total_batches > 0:
                    print 'epoch {}: {} batches out of {} ({} examples out of {}) total: {} batches, {} examples. avg \
	loss per example: {}'.format(e, i, batches_per_epoch, i * batch_size,
                              train_len, total_batches,
                              total_batches * batch_size, avg_train_loss)
                    sys.stdout.flush()

                    # print sentences per second
                    end = time.time()
                    elapsed_seconds = end - start
                    print '{} sentences per second'.format(
                        seen_examples_count / elapsed_seconds)
                    sys.stdout.flush()
                    seen_examples_count = 0
                    start = time.time()

                # checkpoint
                if total_batches % eval_after == 0:

                    print 'starting checkpoint evaluation'
                    sys.stdout.flush()
                    dev_bleu, dev_loss = checkpoint_eval(
                        encoder,
                        decoder,
                        params,
                        dev_batch_size,
                        dev_data,
                        dev_inputs,
                        dev_len,
                        dev_order,
                        dev_outputs,
                        int2y,
                        y2int,
                        results_file_path=results_file_path)

                    log_to_file(log_path, e, total_batches, avg_train_loss,
                                dev_loss, dev_bleu)
                    save_model(model,
                               results_file_path,
                               total_batches,
                               models_to_save=int(
                                   arguments['--models-to-save']))
                    if dev_bleu > best_dev_accuracy:
                        best_dev_accuracy = dev_bleu
                        best_dev_epoch = e

                        # save best model to disk
                        save_best_model(model, results_file_path)
                        print 'saved new best model'
                        sys.stdout.flush()
                        patience = 0
                    else:
                        patience += 1

                    if dev_loss < best_dev_loss:
                        best_dev_loss = dev_loss

                    print 'epoch: {0} train loss: {1:.4f} dev loss: {2:.4f} dev bleu: {3:.4f} \
	best dev bleu {4:.4f} (epoch {5}) patience = {6}'.format(
                        e, avg_train_loss, dev_loss, dev_bleu,
                        best_dev_accuracy, best_dev_epoch, patience)
                    sys.stdout.flush()

                    if (patience == max_patience) and (e >= min_epochs):
                        print 'out of patience after {0} checkpoints'.format(
                            str(e))
                        sys.stdout.flush()
                        # train_progress_bar.finish()
                        if plot:
                            plt.cla()
                        print 'checkpoint patience exceeded'
                        sys.stdout.flush()
                        return model, params, e, best_dev_epoch

                    # plotting results from checkpoint evaluation
                    if plot:
                        train_loss_y.append(avg_train_loss)
                        checkpoints_x.append(total_batches)
                        dev_accuracy_y.append(dev_bleu)
                        dev_loss_y.append(dev_loss)

                        y_vals = [('train_loss', train_loss_y),
                                  ('dev loss', dev_loss_y),
                                  ('dev_bleu', dev_accuracy_y)]
                        common.plot_to_file(y_vals,
                                            x_name='total batches',
                                            x_vals=checkpoints_x,
                                            file_path=results_file_path +
                                            '_learning_curve.png')
        except RuntimeError as exception:
            # sometimes the above two instructions fail due to memory allocation failure.
            # I was unable to find a fix for these failures.
            # perhaps we can just "skip" the failures.
            print 'WARNING: Skipping epoch due to RuntimeError (' + str(
                exception) + ')'
            sys.stdout.flush()

    # update progress bar after completing epoch
    # train_progress_bar.update(e)

    # update progress bar after completing training
    # train_progress_bar.finish()
    if plot:
        # clear plot when done
        plt.cla()
    print 'finished training. average loss: {} best epoch on dev: {} best epoch on train: {}'.format(
        str(avg_train_loss), best_dev_epoch, best_train_epoch)
    sys.stdout.flush()

    return model, params, e, best_dev_epoch