Esempio n. 1
0
    def __init__(self, train, test, dir):
        """Create the model, trainer and parameters, and clear the output path.

        Args:
            train: Stored unchanged on ``self.trainData``.
            test: Stored unchanged on ``self.testData``.
            dir: Stored on ``self.dir``; if a path with that name already
                exists it is removed recursively.
        """
        collection = dy.Model()
        self.model = collection
        self.trainer = dy.AdamTrainer(collection)

        # Parameter shapes are read from `args` and `vocab`, which are not
        # defined inside this method (they must exist in an enclosing scope).
        w_shape = (args.m, args.s)
        self.pW = collection.add_parameters(w_shape)
        u_shape = (vocab.size(), args.m)
        self.pU = collection.add_parameters(u_shape)

        self.trainData, self.testData = train, test
        self.dir = dir
        # Remove anything that already lives at the target path.
        if os.path.exists(self.dir):
            shutil.rmtree(self.dir)
Esempio n. 2
0
def do_cpu():
    """Time one forward and backward pass of a chained matrix product.

    Imports and initialises ``_dynet``, multiplies a 1000x1000 parameter
    with itself six times, evaluates the squared distance of the result
    to itself, back-propagates, and prints the elapsed wall-clock time
    prefixed with "CPU time:".
    """
    import _dynet as C
    C.init()
    collection = C.Model()
    weights = collection.add_parameters((1000, 1000))
    started = time.time()
    C.renew_cg()
    base = C.parameter(weights)
    # Left-associated product of seven copies of the parameter expression.
    chained = base
    for _ in range(6):
        chained = chained * base
    dist = C.squared_distance(chained, chained)
    dist.value()
    dist.backward()
    print("CPU time:", time.time() - started)
Esempio n. 3
0
def do_gpu():
    """Time one forward and backward pass with DyNet asked to use ``GPU:0``.

    Appends ``--dynet-devices GPU:0`` to ``sys.argv`` before calling
    ``init()``, then runs the same chained-product workload as the CPU
    variant and prints the elapsed wall-clock time prefixed with
    "GPU time:".
    """
    import _dynet as G
    import sys
    # The device selection is passed through the process arguments.
    sys.argv.extend(['--dynet-devices', 'GPU:0'])
    G.init()
    collection = G.Model()
    weights = collection.add_parameters((1000, 1000))
    started = time.time()
    G.renew_cg()
    base = G.parameter(weights)
    # Left-associated product of seven copies of the parameter expression.
    chained = base
    for _ in range(6):
        chained = chained * base
    dist = G.squared_distance(chained, chained)
    dist.value()
    dist.backward()
    print("GPU time:", time.time() - started)
Esempio n. 4
0
 def __init__(self, input_vector_size, *argc):
     """Create four LSTM builders plus the ``W`` / ``v`` parameters.

     When ``input_vector_size`` is 0 the constructor returns immediately
     and neither ``self.params`` nor ``self.model`` is set.

     Args:
         input_vector_size: Input dimension of the first two builders.
         *argc: Extra positional arguments; accepted and ignored.
     """
     if input_vector_size == 0:
         return
     model = dy.Model()

     # Two builders take the raw input, the next two take an input of
     # size LSTM_HIDDEN_DIM * 2.
     input_builders = [
         dy.LSTMBuilder(1, input_vector_size, LSTM_HIDDEN_DIM, model)
         for _ in range(2)
     ]
     stacked_builders = [
         dy.LSTMBuilder(1, LSTM_HIDDEN_DIM * 2, LSTM_HIDDEN_DIM, model)
         for _ in range(2)
     ]
     weight = model.add_parameters((LINEAR_DIM, LSTM_HIDDEN_DIM * 2))
     vector = model.add_parameters(LINEAR_DIM)

     self.params = {
         "builders": input_builders + stacked_builders,
         "W": weight,
         "v": vector,
     }
     self.model = model
Esempio n. 5
0
def main():
    """Train the sentence-classification CNN and checkpoint it every epoch.

    Steps: parse the command line, initialise DyNet, load the word2vec
    vectors, build the train/valid sets, dump ``w2i``/``i2w`` with pickle,
    then run ``--n_epochs`` epochs of mini-batch training followed by a
    validation pass. After each epoch the loss, F1 and accuracy for both
    splits are printed and the model is saved under ``RESULTS_DIR``.

    Relies on names that are not defined in this function (``RANDOM_SEED``,
    ``RESULTS_DIR``, the ``build_*`` / ``init_V`` / ``make_emb_zero`` /
    ``associate_parameters`` / ``forwards`` / ``binary_pred`` helpers and
    the ``CNNText`` / ``Dense`` layers).
    """
    parser = argparse.ArgumentParser(
        description=
        'Convolutional Neural Networks for Sentence Classification in DyNet')

    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument(
        '--train_x_path',
        type=str,
        default='./data/train_x.txt',
        help='File path of train x data [default: `./data/train_x.txt`]')
    parser.add_argument(
        '--train_y_path',
        type=str,
        default='./data/train_y.txt',
        help='File path of train y data [default: `./data/train_y.txt`]')
    parser.add_argument(
        '--valid_x_path',
        type=str,
        default='./data/valid_x.txt',
        help='File path of valid x data [default: `./data/valid_x.txt`]')
    parser.add_argument(
        '--valid_y_path',
        type=str,
        default='./data/valid_y.txt',
        help='File path of valid y data [default: `./data/valid_y.txt`]')
    parser.add_argument('--n_epochs',
                        type=int,
                        default=10,
                        help='Number of epochs [default: 10]')
    parser.add_argument('--batch_size',
                        type=int,
                        default=64,
                        help='Mini batch size [default: 64]')
    parser.add_argument('--win_sizes',
                        type=int,
                        nargs='*',
                        default=[3, 4, 5],
                        help='Window sizes of filters [default: [3, 4, 5]]')
    parser.add_argument(
        '--num_fil',
        type=int,
        default=100,
        help='Number of filters in each window size [default: 100]')
    parser.add_argument('--s',
                        type=float,
                        default=3.0,
                        help='L2 norm constraint on w [default: 3.0]')
    parser.add_argument('--dropout_prob',
                        type=float,
                        default=0.5,
                        help='Dropout probability [default: 0.5]')
    # `choices` rejects unknown strategies up front; previously they fell
    # into the multichannel code path and crashed on the undefined V2.
    parser.add_argument(
        '--v_strategy',
        type=str,
        default='static',
        choices=['rand', 'static', 'non-static', 'multichannel'],
        help=
        'Embedding strategy. rand: Random  initialization. static: Load pretrained embeddings and do not update during the training. non-static: Load pretrained embeddings and update during the training. multichannel: Load pretrained embeddings into two channels and update only one of them during the training. [default: static]'
    )
    parser.add_argument(
        '--alloc_mem',
        type=int,
        default=4096,
        help='Amount of memory to allocate [mb] [default: 4096]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    N_EPOCHS = args.n_epochs
    WIN_SIZES = args.win_sizes
    BATCH_SIZE = args.batch_size
    EMB_DIM = 300
    OUT_DIM = 1
    L2_NORM_LIM = args.s
    NUM_FIL = args.num_fil
    DROPOUT_PROB = args.dropout_prob
    V_STRATEGY = args.v_strategy
    ALLOC_MEM = args.alloc_mem

    # Every strategy except 'multichannel' uses a single embedding channel.
    MULTICHANNEL = V_STRATEGY == 'multichannel'
    NUM_CHA = 2 if MULTICHANNEL else 1

    # FILE paths
    W2V_PATH = './GoogleNews-vectors-negative300.bin'
    TRAIN_X_PATH = args.train_x_path
    TRAIN_Y_PATH = args.train_y_path
    VALID_X_PATH = args.valid_x_path
    VALID_Y_PATH = args.valid_y_path

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Load pretrained embeddings
    pretrained_model = gensim.models.KeyedVectors.load_word2vec_format(
        W2V_PATH, binary=True)
    vocab = pretrained_model.wv.vocab.keys()
    w2v = pretrained_model.wv

    # Build dataset
    w2c = build_w2c(TRAIN_X_PATH, vocab=vocab)
    w2i, i2w = build_w2i(TRAIN_X_PATH, w2c, unk='unk')
    train_x, train_y = build_dataset(TRAIN_X_PATH,
                                     TRAIN_Y_PATH,
                                     w2i,
                                     unk='unk')
    valid_x, valid_y = build_dataset(VALID_X_PATH,
                                     VALID_Y_PATH,
                                     w2i,
                                     unk='unk')

    train_x, train_y = sort_data_by_length(train_x, train_y)
    valid_x, valid_y = sort_data_by_length(valid_x, valid_y)

    VOCAB_SIZE = len(w2i)
    print('VOCAB_SIZE:', VOCAB_SIZE)

    V_init = init_V(w2v, w2i)

    with open(os.path.join(RESULTS_DIR, './w2i.dump'),
              'wb') as f_w2i, open(os.path.join(RESULTS_DIR, './i2w.dump'),
                                   'wb') as f_i2w:
        pickle.dump(w2i, f_w2i)
        pickle.dump(i2w, f_i2w)

    # Ids of the sentence-boundary symbols whose embeddings are kept at zero.
    pad_ids = [w2i['<s>'], w2i['</s>']]
    max_win = max(WIN_SIZES)

    # Build model
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    # V1: initialised from word2vec unless the strategy is 'rand';
    # only updated for 'rand' and 'non-static'.
    V1 = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    if V_STRATEGY in ['static', 'non-static', 'multichannel']:
        V1.init_from_array(V_init)
    V1_UPDATE = V_STRATEGY in ['rand', 'non-static']
    make_emb_zero(V1, pad_ids, EMB_DIM)

    # V2: second, trainable channel that only exists for 'multichannel'.
    if MULTICHANNEL:
        V2 = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
        V2.init_from_array(V_init)
        V2_UPDATE = True
        make_emb_zero(V2, pad_ids, EMB_DIM)

    layers = [
        CNNText(model, EMB_DIM, WIN_SIZES, NUM_CHA, NUM_FIL, dy.tanh,
                DROPOUT_PROB),
        # One block of NUM_FIL features per window size; this used to be a
        # hard-coded 3, which only matched the default --win_sizes.
        Dense(model, len(WIN_SIZES) * NUM_FIL, OUT_DIM, dy.logistic)
    ]

    # Train model
    n_batches_train = math.ceil(len(train_x) / BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_x) / BATCH_SIZE)

    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        loss_all_train = []
        pred_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters(layers)

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            x = build_batch(train_x[start:end], w2i, max_win).T
            t = np.array(train_y[start:end])

            sen_len = x.shape[0]

            if not MULTICHANNEL:
                x_embs = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs = dy.transpose(x_embs)
                x_embs = dy.reshape(x_embs, (sen_len, EMB_DIM, 1))
            else:  # multichannel
                x_embs1 = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs2 = dy.concatenate_cols(
                    [dy.lookup_batch(V2, x_t, update=V2_UPDATE) for x_t in x])
                x_embs1 = dy.transpose(x_embs1)
                x_embs2 = dy.transpose(x_embs2)
                x_embs = dy.concatenate([x_embs1, x_embs2], d=2)

            t = dy.inputTensor(t, batched=True)
            y = forwards(layers, x_embs, test=False)

            mb_loss = dy.mean_batches(dy.binary_log_loss(y, t))

            # Forward prop
            loss_all_train.append(mb_loss.value())
            pred_all_train.extend(list(binary_pred(y.npvalue().flatten())))

            # Backward prop
            mb_loss.backward()
            trainer.update()

            # L2 norm constraint
            layers[1].scale_W(L2_NORM_LIM)

            # Re-zero the padding embeddings of whichever table was updated.
            if V_STRATEGY in ['rand', 'non-static']:
                make_emb_zero(V1, pad_ids, EMB_DIM)
            elif MULTICHANNEL:
                make_emb_zero(V2, pad_ids, EMB_DIM)

        # Valid
        loss_all_valid = []
        pred_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters(layers)

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            x = build_batch(valid_x[start:end], w2i, max_win).T
            t = np.array(valid_y[start:end])

            sen_len = x.shape[0]

            if not MULTICHANNEL:
                x_embs = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs = dy.transpose(x_embs)
                x_embs = dy.reshape(x_embs, (sen_len, EMB_DIM, 1))
            else:  # multichannel
                x_embs1 = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs2 = dy.concatenate_cols(
                    [dy.lookup_batch(V2, x_t, update=V2_UPDATE) for x_t in x])
                x_embs1 = dy.transpose(x_embs1)
                x_embs2 = dy.transpose(x_embs2)
                x_embs = dy.concatenate([x_embs1, x_embs2], d=2)

            t = dy.inputTensor(t, batched=True)
            y = forwards(layers, x_embs, test=True)

            mb_loss = dy.mean_batches(dy.binary_log_loss(y, t))

            # Forward prop
            loss_all_valid.append(mb_loss.value())
            pred_all_valid.extend(list(binary_pred(y.npvalue().flatten())))

        print(
            'EPOCH: %d, Train Loss:: %.3f (F1:: %.3f, Acc:: %.3f), Valid Loss:: %.3f (F1:: %.3f, Acc:: %.3f), Time:: %.3f[s]'
            % (
                epoch + 1,
                np.mean(loss_all_train),
                f1_score(train_y, pred_all_train),
                accuracy_score(train_y, pred_all_train),
                np.mean(loss_all_valid),
                f1_score(valid_y, pred_all_valid),
                accuracy_score(valid_y, pred_all_valid),
                time.time() - start_time,
            ))

        # Save model: the embedding table(s) come first, then the layers.
        if not MULTICHANNEL:
            dy.save(os.path.join(RESULTS_DIR, './model_e' + str(epoch + 1)),
                    [V1] + layers)
        else:
            dy.save(os.path.join(RESULTS_DIR, './model_e' + str(epoch + 1)),
                    [V1, V2] + layers)
Esempio n. 6
0
def _mean_sequence_loss(V, encoder, decoder, x_mb, y_mb):
    """Return the mini-batch average of the per-example summed token losses.

    For every (source, target) pair the source ids are embedded with ``V``
    and encoded, the decoder is initialised from the encoder outputs and fed
    the target without its last token, and ``pickneglogsoftmax`` is summed
    against the target without its first token.
    """
    losses = []
    for x, t in zip(x_mb, y_mb):
        t_in, t_out = t[:-1], t[1:]

        # Encoder
        x_embs = [dy.lookup(V, x_t) for x_t in x]
        hp, hb_1 = encoder(x_embs)

        # Decoder
        decoder.set_initial_states(hp, hb_1)
        t_embs = [dy.lookup(V, t_t) for t_t in t_in]
        y = decoder(t_embs)

        # Loss
        losses.append(dy.esum(
            [dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)]
        ))
    return dy.average(losses)


def main():
    """Train the selective-encoding summarizer and checkpoint it every epoch.

    Parses the command line, initialises DyNet with autobatching, builds the
    ``Dataset``, then for each epoch runs a training pass (with parameter
    updates) and a validation pass (loss only), prints both mean losses and
    saves the model plus the ``w2i`` / ``i2w`` pickles to the working
    directory.

    Relies on names not defined in this function (``RANDOM_SEED``,
    ``Dataset``, ``SelectiveBiGRU``, ``AttentionalGRU``,
    ``associate_parameters``).
    """
    parser = argparse.ArgumentParser(description='Selective Encoding for Abstractive Sentence Summarization in DyNet')

    parser.add_argument('--gpu', type=str, default='0', help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=3, help='Number of epochs [default: 3]')
    parser.add_argument('--n_train', type=int, default=3803957, help='Number of training data (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651, help='Number of validation data (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32, help='Mini batch size [default: 32]')
    parser.add_argument('--vocab_size', type=int, default=124404, help='Vocabulary size [default: 124404]')
    parser.add_argument('--emb_dim', type=int, default=256, help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256, help='Hidden state size [default: 256]')
    parser.add_argument('--maxout_dim', type=int, default=2, help='Maxout size [default: 2]')
    parser.add_argument('--alloc_mem', type=int, default=10000, help='Amount of memory to allocate [mb] [default: 10000]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS   = args.n_epochs
    N_TRAIN    = args.n_train
    N_VALID    = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = args.vocab_size
    EMB_DIM    = args.emb_dim
    HID_DIM    = args.hid_dim
    MAXOUT_DIM = args.maxout_dim
    ALLOC_MEM  = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset
    dataset = Dataset(
        TRAIN_X_FILE,
        TRAIN_Y_FILE,
        VALID_X_FILE,
        VALID_Y_FILE,
        vocab_size=VOCAB_SIZE,
        batch_size=BATCH_SIZE,
        n_train=N_TRAIN,
        n_valid=N_VALID
    )
    # The effective vocabulary size is whatever the dataset actually built.
    VOCAB_SIZE = len(dataset.w2i)
    print('VOCAB_SIZE', VOCAB_SIZE)

    # Build model
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    V = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    encoder = SelectiveBiGRU(model, EMB_DIM, HID_DIM)
    decoder = AttentionalGRU(model, EMB_DIM, HID_DIM, MAXOUT_DIM, VOCAB_SIZE)

    # Train model
    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        loss_all_train = []
        dataset.reset_train_iter()
        for train_x_mb, train_y_mb in tqdm(dataset.train_iter):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters([encoder, decoder])

            mb_loss = _mean_sequence_loss(V, encoder, decoder,
                                          train_x_mb, train_y_mb)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid (forward only, no parameter update)
        loss_all_valid = []
        dataset.reset_valid_iter()
        for valid_x_mb, valid_y_mb in dataset.valid_iter:
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters([encoder, decoder])

            mb_loss = _mean_sequence_loss(V, encoder, decoder,
                                          valid_x_mb, valid_y_mb)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f, Time: %.3f[s]' % (
            epoch+1,
            np.mean(loss_all_train),
            np.mean(loss_all_valid),
            time.time()-start_time
        ))

        # Save model
        dy.save('./model_e'+str(epoch+1), [V, encoder, decoder])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(dataset.w2i, f_w2i)
            pickle.dump(dataset.i2w, f_i2w)
Esempio n. 7
0
    def __init__(self, num_layers, input_dim, hidden_dim, tasks, src_domain,
                 main_task, adversarial_task, vocabularies, update_embs,
                 exp_path, prediction_layer, additional_params):
        """Store the configuration and build every layer of the network.

        Layers are created in this order on a fresh parameter collection:
        the shared RNN, one embedding layer per vocabulary (sorted by
        name), one output layer per trainable task (sorted by task id), the
        binary ``adversarial`` output layer and the gradient-reversal
        layer. Finally ``_set_model_params()`` is called.

        Args:
            num_layers: Number of layers of the RNN.
            input_dim: Dimension of the word embeddings.
            hidden_dim: Dimension of the RNN hidden states.
            tasks: Mapping from task id to task object; each task exposes
                ``trainable``, ``vocab_name``, ``num_classes`` and
                ``task_name``.
            src_domain: The source domain.
            main_task: Id of the task that is the target of optimization.
            adversarial_task: Task whose ``vocab_name`` selects the
                embedding layer used for the adversarial data.
            vocabularies: Mapping from vocabulary name to an object
                exposing ``vocab_size`` and ``embeds``.
            update_embs: Forwarded to ``setup_embedding_layer``.
            exp_path: Stored on ``self.exp_path``.
            prediction_layer: Layer used for prediction when the main task
                is not trainable.
            additional_params: Extra parameters kept for result reporting.
        """
        # parameter collection
        self.model = dn.Model()

        # plain configuration values
        self.exp_path = exp_path
        self.input_dim = input_dim      # word-embedding dimension
        self.hidden_dim = hidden_dim    # rnn hidden-state dimension
        self.num_layers = num_layers    # number of rnn layers
        self.tasks = tasks
        self.src_domain = src_domain

        # A trainable main task predicts with its own layer; otherwise the
        # caller-supplied prediction layer is used.
        self.prediction_layer = (
            main_task if tasks[main_task].trainable else prediction_layer)

        # Not applied to the model itself; kept so it can be reported later.
        self.additional_params = additional_params
        self.main_task = main_task
        self.vocabularies = vocabularies
        self.update_embs = update_embs

        # shared rnn
        self.rnn = self.setup_rnn(model=self.model,
                                  num_layers=self.num_layers,
                                  input_dim=self.input_dim,
                                  hidden_dim=self.hidden_dim)

        # one embedding layer per vocabulary; tasks may share a layer
        self.embedding_layers = {}
        for voc_name, vocab_builder in sorted(self.vocabularies.items()):
            layer = self.setup_embedding_layer(
                model=self.model,
                emb_dim=self.input_dim,
                vocab_size=vocab_builder.vocab_size,
                layername='{}#emb'.format(voc_name),
                update_embs=self.update_embs,
                embs=vocab_builder.embeds)
            self.embedding_layers[voc_name] = layer

        # task id -> name of the vocabulary / embedding layer it uses
        self.task2embedding_layers = {
            tid: task.vocab_name
            for tid, task in sorted(self.tasks.items())
        }

        # task-specific output layers; tasks without training data get none
        self.output_layers = {}
        for tid, task in sorted(self.tasks.items()):
            if not task.trainable:
                continue
            self.output_layers[tid] = self.setup_output_layer(
                model=self.model,
                input_dim=self.hidden_dim,
                output_dim=task.num_classes,
                layername='{}#out'.format(task.task_name))

        # adversarial data: its own embedding mapping and a binary output
        # layer that predicts the domain
        self.task2embedding_layers['adversarial'] = adversarial_task.vocab_name
        self.output_layers['adversarial'] = self.setup_output_layer(
            model=self.model,
            input_dim=self.hidden_dim,
            output_dim=2,
            layername='adversarialout')

        self.gradient_reversal_layer = self.setup_gradient_reversal_layer(
            model=self.model,
            input_dim=self.hidden_dim,
            output_dim=self.hidden_dim,
            layername='gr')

        # store all the model parameters in the model_params dict
        self._set_model_params()
Esempio n. 8
0
import _gdynet as G
print()
import _dynet as C

# One parameter collection per imported backend: `C` is `_dynet` and `G` is
# `_gdynet` (presumably the CPU and GPU builds -- confirm).
cm = C.Model()
gm = G.Model()

# A 1000x1000 parameter in each collection, used by do_cpu() / do_gpu().
cpW = cm.add_parameters((1000,1000))
gpW = gm.add_parameters((1000,1000))

def do_cpu():
	"""Run one forward and backward pass of the chained product on backend `C`.

	Multiplies the module-level parameter `cpW` with itself six times and
	evaluates / back-propagates the squared distance of the result to itself.
	"""
	C.renew_cg()
	base = C.parameter(cpW)
	# Left-associated product of seven copies of the parameter expression.
	chained = base
	for _ in range(6):
		chained = chained * base
	dist = C.squared_distance(chained, chained)
	dist.value()
	dist.backward()

def do_gpu():
	"""Run one forward and backward pass of the chained product on backend `G`.

	Multiplies the module-level parameter `gpW` with itself six times and
	evaluates / back-propagates the squared distance of the result to itself.
	"""
	G.renew_cg()
	base = G.parameter(gpW)
	# Left-associated product of seven copies of the parameter expression.
	chained = base
	for _ in range(6):
		chained = chained * base
	dist = G.squared_distance(chained, chained)
	dist.value()
	dist.backward()

import time
# Time a single do_cpu() call and print the elapsed wall-clock seconds.
s = time.time()
do_cpu()
print("CPU time:",time.time() - s)
Esempio n. 9
0
def main():
    """Decode a test file with beam search and write one prediction per line.

    Parses the command line, initialises DyNet, loads the ``w2i`` / ``i2w``
    pickles and the saved ``[V, encoder, decoder]`` model, then decodes each
    input with a beam of size ``--beam_size`` for at most ``--max_len``
    steps. Beams are ranked by accumulated log-probability divided by the
    decoded length. The first ``</s>`` id is removed from the best
    hypothesis before it is converted back to words.

    Relies on names not defined in this function (``RANDOM_SEED``,
    ``build_dataset``, ``associate_parameters``).
    """
    parser = argparse.ArgumentParser(
        description=
        'Deep Recurrent Generative Decoder for Abstractive Text Summarization in DyNet'
    )

    parser.add_argument('--gpu',
                        type=str,
                        default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_test',
                        type=int,
                        default=189651,
                        help='Number of test examples [default: 189651]')
    parser.add_argument('--beam_size',
                        type=int,
                        default=5,
                        help='Beam size [default: 5]')
    parser.add_argument('--max_len',
                        type=int,
                        default=100,
                        help='Maximum length of decoding [default: 100]')
    parser.add_argument('--model_file',
                        type=str,
                        default='./model_e1',
                        help='Trained model file path [default: ./model_e1]')
    parser.add_argument(
        '--input_file',
        type=str,
        default='./data/valid.article.filter.txt',
        help='Test file path [default: ./data/valid.article.filter.txt]')
    parser.add_argument('--output_file',
                        type=str,
                        default='./pred_y.txt',
                        help='Output file path [default: ./pred_y.txt]')
    parser.add_argument('--w2i_file',
                        type=str,
                        default='./w2i.dump',
                        help='Word2Index file path [default: ./w2i.dump]')
    parser.add_argument('--i2w_file',
                        type=str,
                        default='./i2w.dump',
                        help='Index2Word file path [default: ./i2w.dump]')
    parser.add_argument(
        '--alloc_mem',
        type=int,
        default=1024,
        help='Amount of memory to allocate [mb] [default: 1024]')
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_TEST = args.n_test
    K = args.beam_size
    MAX_LEN = args.max_len
    ALLOC_MEM = args.alloc_mem

    # File paths
    MODEL_FILE = args.model_file
    INPUT_FILE = args.input_file
    OUTPUT_FILE = args.output_file
    W2I_FILE = args.w2i_file
    I2W_FILE = args.i2w_file

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Load trained model
    # NOTE(security): pickle.load can execute arbitrary code; only point
    # --w2i_file / --i2w_file at dumps you produced yourself.
    with open(W2I_FILE, 'rb') as f_w2i, open(I2W_FILE, 'rb') as f_i2w:
        w2i = pickle.load(f_w2i)
        i2w = pickle.load(f_i2w)

    test_X, _, _ = build_dataset(INPUT_FILE,
                                 w2i=w2i,
                                 n_data=N_TEST,
                                 target=False)

    model = dy.Model()
    V, encoder, decoder = dy.load(MODEL_FILE, model)

    BOS, EOS = w2i['<s>'], w2i['</s>']

    # Decode
    pred_y = []
    for x in tqdm(test_X):
        dy.renew_cg()
        associate_parameters([encoder, decoder])

        # Initial states
        x_embs = [dy.lookup(V, x_t) for x_t in x]
        hp, hb_1 = encoder(x_embs)
        decoder.set_initial_states(hp, hb_1)
        s_0, c_0 = decoder.s_0, decoder.c_0

        # Each candidate: [accum log prob, last id, s, c, decoded ids]
        candidates = [[0, BOS, s_0, c_0, []]]

        for _ in range(MAX_LEN):
            tmp_candidates = []
            end_flag = True
            for score_tm1, y_tm1, s_tm1, c_tm1, y_02tm1 in candidates:
                if y_tm1 == EOS:
                    # Finished hypotheses are carried over unchanged.
                    tmp_candidates.append(
                        [score_tm1, y_tm1, s_tm1, c_tm1, y_02tm1])
                else:
                    end_flag = False
                    y_tm1_emb = dy.lookup(V, y_tm1)
                    s_t, c_t, _q_t = decoder(y_tm1_emb,
                                             tm1s=[s_tm1, c_tm1],
                                             test=True)
                    _q_t = np.log(_q_t.npvalue())  # Calculate log probs
                    # Ids of the K highest log probs (best first); one
                    # argsort yields both the ids and their scores.
                    y_t = np.argsort(_q_t)[::-1][:K]
                    q_t = _q_t[y_t]
                    score_t = score_tm1 + q_t  # Accumulate log probs
                    tmp_candidates.extend(
                        [[score_tk, y_tk, s_t, c_t, y_02tm1 + [y_tk]]
                         for score_tk, y_tk in zip(score_t, y_t)])
            if end_flag:
                break
            # Sort in normalized log probs and pick K highest candidates
            candidates = sorted(
                tmp_candidates, key=lambda cand: -cand[0] / len(cand[-1])
            )[:K]

        # Pick the candidate with the highest score
        pred = candidates[0][-1]
        if EOS in pred:
            pred.remove(EOS)
        pred_y.append(pred)

    # Build the whole text before opening the file so that a failed id
    # lookup cannot leave a truncated output file behind.
    pred_y_txt = ''.join(
        ' '.join([i2w[com] for com in pred]) + '\n' for pred in pred_y)

    with open(OUTPUT_FILE, 'w') as f:
        f.write(pred_y_txt)
Esempio n. 10
0
        #me = - e
        #last = dy.cmult(layers[-1], me) + e

        #print("gradient", last.value())
        #log_loss = dy.log(last + epsilon)
        #print(log_loss.value())
        ys = dy.vecInput(self.dim_out)
        ys.set([1 if i in targets else 0 for i in range(self.dim_out)])
        loss = dy.binary_log_loss(layers[-1], ys)
        return dy.sum_elems(loss)


if __name__ == "__main__":
    # NOTE(review): `dynet` is imported here but everything below goes
    # through the `dy` alias -- presumably bound at module level; confirm.
    import dynet

    model = dy.Model()
    trainer = dy.SimpleSGDTrainer(model)
    classifier = MLP_sigmoid(2, 2, 2, 10, dy.rectify, model)

    # Four (input vector, target set) pairs: inputs whose components
    # differ in sign map to {1}, the others to {0}.
    dataset = [
        ([-1, -1], {0}),
        ([-1, 1], {1}),
        ([1, -1], {1}),
        ([1, 1], {0}),
    ]

    n_epochs = 10040
    for _ in range(n_epochs):
        for features, targets in dataset:
            # One SGD step per example on a fresh computation graph.
            dy.renew_cg()
            inp = dy.vecInput(2)
            inp.set(features)
            loss = classifier.get_loss(inp, targets)
            loss.backward()
            trainer.update()
Esempio n. 11
0
def main():
    """Decode a test file with beam search and write one prediction per line.

    Parses the command line, initialises DyNet, loads the ``w2i`` / ``i2w``
    pickles and the first object of the saved model, then decodes each
    input with a beam of size ``--beam_size`` for at most ``--max_len``
    steps. Every hypothesis carries a context window of the last ``C``
    generated ids (``C`` is read from the model's ``c`` attribute). Beams
    are ranked by accumulated log-probability divided by the decoded length.

    Relies on names not defined in this function (``RANDOM_STATE``,
    ``build_dataset``).
    """
    parser = argparse.ArgumentParser(
        description=
        'A Neural Attention Model for Abstractive Sentence Summarization in DyNet'
    )

    parser.add_argument('--gpu',
                        type=str,
                        default='0',
                        help='GPU ID to use. For cpu, set -1 [default: `0`]')
    parser.add_argument('--n_test',
                        type=int,
                        default=189651,
                        help='Number of test examples [default: `189651`]')
    parser.add_argument('--beam_size',
                        type=int,
                        default=5,
                        help='Beam size [default: `5`]')
    parser.add_argument('--max_len',
                        type=int,
                        default=100,
                        help='Maximum length of decoding [default: `100`]')
    parser.add_argument('--model_file',
                        type=str,
                        default='./model_e1',
                        help='Trained model file path [default: `./model_e1`]')
    parser.add_argument(
        '--input_file',
        type=str,
        default='./data/valid.article.filter.txt',
        help='Test file path [default: `./data/valid.article.filter.txt`]')
    parser.add_argument('--output_file',
                        type=str,
                        default='./pred_y.txt',
                        help='Output file path [default: `./pred_y.txt`]')
    parser.add_argument('--w2i_file',
                        type=str,
                        default='./w2i.dump',
                        help='Word2Index file path [default: `./w2i.dump`]')
    parser.add_argument('--i2w_file',
                        type=str,
                        default='./i2w.dump',
                        help='Index2Word file path [default: `./i2w.dump`]')
    parser.add_argument(
        '--alloc_mem',
        type=int,
        default=1024,
        help='Amount of memory to allocate [mb] [default: `1024`]')
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_TEST = args.n_test
    K = args.beam_size
    MAX_LEN = args.max_len
    ALLOC_MEM = args.alloc_mem

    # File paths
    MODEL_FILE = args.model_file
    INPUT_FILE = args.input_file
    OUTPUT_FILE = args.output_file
    W2I_FILE = args.w2i_file
    I2W_FILE = args.i2w_file

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Load trained model
    # NOTE(security): pickle.load can execute arbitrary code; only point
    # --w2i_file / --i2w_file at dumps you produced yourself.
    with open(W2I_FILE, 'rb') as f_w2i, open(I2W_FILE, 'rb') as f_i2w:
        w2i = pickle.load(f_w2i)
        i2w = pickle.load(f_i2w)

    test_X, _, _ = build_dataset(INPUT_FILE, w2i=w2i, n_data=N_TEST)

    model = dy.Model()
    rush_abs = dy.load(MODEL_FILE, model)[0]
    ENCODER_TYPE = rush_abs.encoder_type
    C = rush_abs.c

    BOS, EOS = w2i['<s>'], w2i['</s>']

    # Decode
    pred_y = []
    for x in tqdm(test_X):
        dy.renew_cg()
        rush_abs.associate_parameters()

        # Initial states
        rush_abs.set_initial_states(x)

        # [accum log prob, last id, context window, decoded sequence]
        candidates = [[0, BOS, [BOS] * C, []]]

        for _ in range(MAX_LEN):
            tmp_candidates = []
            end_flag = True
            for score_tm1, y_tm1, y_c, y_02tm1 in candidates:
                if y_tm1 == EOS:
                    # Finished hypotheses are carried over unchanged.
                    tmp_candidates.append([score_tm1, y_tm1, y_c, y_02tm1])
                else:
                    end_flag = False
                    _q_t = rush_abs(t=y_c, test=True)
                    _q_t = np.log(_q_t.npvalue())  # Log probs
                    # Ids of the K highest log probs (best first); one
                    # argsort yields both the ids and their scores.
                    y_t = np.argsort(_q_t)[::-1][:K]
                    q_t = _q_t[y_t]
                    score_t = score_tm1 + q_t  # Accum log probs
                    # Slide the context window by one id per extension.
                    tmp_candidates.extend(
                        [[score_tk, y_tk, y_c[1:] + [y_tk], y_02tm1 + [y_tk]]
                         for score_tk, y_tk in zip(score_t, y_t)])

            if end_flag:
                break
            # Sort in normalized score and pick K highest candidates
            candidates = sorted(
                tmp_candidates, key=lambda cand: -cand[0] / len(cand[-1])
            )[:K]

        # Pick the highest-scored candidate
        pred_y.append(candidates[0][-1])

    # Build the whole text before opening the file so that a failed id
    # lookup cannot leave a truncated output file behind.
    pred_y_txt = ''.join(
        ' '.join([i2w[com] for com in pred]) + '\n' for pred in pred_y)

    with open(OUTPUT_FILE, 'w') as f:
        f.write(pred_y_txt)
Esempio n. 12
0
def main():
    """Label every sentence of an input file with a saved CNN classifier.

    Command-line options choose the GPU id, the saved model, the input and
    output paths, the two pickled vocabulary files and the DyNet memory
    budget. One predicted label is written per input sentence, joined by
    newlines, to the output path.
    """
    cli = argparse.ArgumentParser(description='Convolutional Neural Networks for Sentence Classification in DyNet')

    # (flag, type, default, help), registered in this order.
    option_table = (
        ('--gpu', int, -1, 'GPU ID to use. For cpu, set -1 [default: -1]'),
        ('--model_file', str, './model', 'Model to use for prediction [default: ./model]'),
        ('--input_file', str, './data/valid_x.txt', 'Input file path [default: ./data/valid_x.txt]'),
        ('--output_file', str, './pred_y.txt', 'Output file path [default: ./pred_y.txt]'),
        ('--w2i_file', str, './w2i.dump', 'Word2Index file path [default: ./w2i.dump]'),
        ('--i2w_file', str, './i2w.dump', 'Index2Word file path [default: ./i2w.dump]'),
        ('--alloc_mem', int, 1024, 'Amount of memory to allocate [mb] [default: 1024]'),
    )
    for flag, kind, fallback, text in option_table:
        cli.add_argument(flag, type=kind, default=fallback, help=text)
    opts = cli.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = str(opts.gpu)

    # Configure and initialise DyNet with the requested memory budget.
    dynet_config = dy.DynetParams()
    dynet_config.set_mem(opts.alloc_mem)
    dynet_config.init()

    # A saved model with exactly three components carries one embedding
    # table; any other length is treated as two tables followed by the layers.
    collection = dy.Model()
    restored = dy.load(opts.model_file, collection)
    multichannel = len(restored) != 3
    if multichannel:
        emb_tables, layers = [restored[0], restored[1]], restored[2:]
    else:
        emb_tables, layers = [restored[0]], restored[1:]

    emb_dim = emb_tables[0].shape()[0]
    window_sizes = layers[0].win_sizes

    # Both vocabulary pickles are read; only w2i is used afterwards.
    with open(opts.w2i_file, 'rb') as w2i_fh, open(opts.i2w_file, 'rb') as i2w_fh:
        w2i = pickle.load(w2i_fh)
        i2w = pickle.load(i2w_fh)

    # Surround every sentence with max(window size) zero ids on each side.
    pad = [0] * max(window_sizes)
    encoded, _, _ = build_dataset(opts.input_file, w2i=w2i, unksym='unk')
    padded_inputs = [pad + sentence + pad for sentence in encoded]

    labels = []
    for sentence in tqdm(padded_inputs):
        # Fresh computation graph for every sentence.
        dy.renew_cg()
        associate_parameters(layers)

        n_tokens = len(sentence)

        # Per table: look up each token (frozen), join along axis 1, then
        # transpose. All joins are built before any transpose.
        channels = [
            dy.concatenate([dy.lookup(table, token_id, update=False) for token_id in sentence], d=1)
            for table in emb_tables
        ]
        channels = [dy.transpose(channel) for channel in channels]

        if multichannel:
            features = dy.concatenate(channels, d=2)
        else:
            features = dy.reshape(channels[0], (n_tokens, emb_dim, 1))

        score = f_props(layers, features, train=False)
        labels.append(str(int(binary_pred(score.value()))))

    with open(opts.output_file, 'w') as out_fh:
        out_fh.write('\n'.join(labels))
Esempio n. 13
0
def main():
    """Train the ABS summarization model and checkpoint it after every epoch.

    Hyper-parameters come from the command line (see the argparse options
    below). Training and validation data are read from fixed paths under
    ``./data``. After each epoch the mean training and validation minibatch
    losses are printed, the model is saved to ``./model_e<epoch>`` and the
    vocabulary mappings are pickled to ``./w2i.dump`` and ``./i2w.dump``.
    """
    parser = argparse.ArgumentParser(description='A Neural Attention Model for Abstractive Sentence Summarization in DyNet')

    parser.add_argument('--gpu', type=str, default='0', help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=10, help='Number of epochs [default: 10]')
    parser.add_argument('--n_train', type=int, default=3803957, help='Number of training data (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651, help='Number of validation data (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32, help='Mini batch size [default: 32]')
    parser.add_argument('--vocab_size', type=int, default=60000, help='Vocabulary size [default: 60000]')
    parser.add_argument('--emb_dim', type=int, default=256, help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256, help='Hidden state size [default: 256]')
    parser.add_argument('--encoder_type', type=str, default='attention', help='Encoder type. bow: Bag-of-words encoder. attention: Attention-based encoder [default: attention]')
    parser.add_argument('--c', type=int, default=5, help='Window size in neural language model [default: 5]')
    parser.add_argument('--q', type=int, default=2, help='Window size in attention-based encoder [default: 2]')
    parser.add_argument('--alloc_mem', type=int, default=4096, help='Amount of memory to allocate [mb] [default: 4096]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = args.vocab_size
    EMB_DIM = args.emb_dim
    HID_DIM = args.hid_dim
    ENCODER_TYPE = args.encoder_type
    C = args.c
    Q = args.q
    ALLOC_MEM = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset ====================================================================================
    # The second call receives the counts from the first, so articles and
    # titles contribute to one shared word-count table.
    w2c = build_word2count(TRAIN_X_FILE, n_data=N_TRAIN)
    w2c = build_word2count(TRAIN_Y_FILE, w2c=w2c, n_data=N_TRAIN)

    train_X, w2i, i2w = build_dataset(TRAIN_X_FILE, w2c=w2c, padid=False, eos=True, unksym='<unk>', target=False, n_data=N_TRAIN, vocab_size=VOCAB_SIZE)
    train_y, _, _ = build_dataset(TRAIN_Y_FILE, w2i=w2i, target=True, n_data=N_TRAIN)

    valid_X, _, _ = build_dataset(VALID_X_FILE, w2i=w2i, target=False, n_data=N_VALID)
    valid_y, _, _ = build_dataset(VALID_Y_FILE, w2i=w2i, target=True, n_data=N_VALID)

    # From here on the vocabulary size is whatever the built mapping holds.
    VOCAB_SIZE = len(w2i)
    print('VOCAB_SIZE:', VOCAB_SIZE)

    # Build model ======================================================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    rush_abs = ABS(model, EMB_DIM, HID_DIM, VOCAB_SIZE, Q, C, encoder_type=ENCODER_TYPE)

    # Padding: prepend C-1 start symbols to every target sequence.
    # NOTE(review): presumably build_dataset(target=True) already adds one
    # '<s>', giving C context tokens before the first word - confirm.
    train_y = [[w2i['<s>']]*(C-1)+instance_y for instance_y in train_y]
    valid_y = [[w2i['<s>']]*(C-1)+instance_y for instance_y in valid_y]

    n_batches_train = math.ceil(len(train_X)/BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_X)/BATCH_SIZE)

    def _minibatch_loss(X_mb, y_mb):
        """Return the average over the minibatch of each pair's summed loss.

        Must be called after ``dy.renew_cg()`` and
        ``rush_abs.associate_parameters()`` for the current graph.
        """
        losses = []
        for x, t in zip(X_mb, y_mb):
            # Inputs drop the last token; targets start at index C.
            t_in, t_out = t[:-1], t[C:]

            y = rush_abs(x, t_in)
            loss = dy.esum([dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)])
            losses.append(loss)
        return dy.average(losses)

    for epoch in range(N_EPOCHS):
        # Train
        train_X, train_y = shuffle(train_X, train_y)
        loss_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            rush_abs.associate_parameters()

            # Create a mini batch
            start = i*BATCH_SIZE
            end = start + BATCH_SIZE
            mb_loss = _minibatch_loss(train_X[start:end], train_y[start:end])

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid (forward pass only, no parameter update)
        loss_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            rush_abs.associate_parameters()

            # Create a mini batch
            start = i*BATCH_SIZE
            end = start + BATCH_SIZE
            mb_loss = _minibatch_loss(valid_X[start:end], valid_y[start:end])

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f' % (
            epoch+1,
            np.mean(loss_all_train),
            np.mean(loss_all_valid)
        ))

        # Save model ========================================================================
        dy.save('./model_e'+str(epoch+1), [rush_abs])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(w2i, f_w2i)
            pickle.dump(i2w, f_i2w)
Esempio n. 14
0
def main(args):
    """Train the main classifier, then an adversary on its hidden states, and print the results.

    Args:
        args: option namespace. This function reads ``dataset``, ``num_NE``,
            ``hidden_layers``, ``dim_hidden``, ``subset``, ``atraining``,
            ``generator``, ``baseline``, ``iterations``, ``dim_word``,
            ``dim_wrnn``, ``ptraining`` and ``alpha``, and writes
            ``learning_rate`` (set from the trainer). The namespace is also
            passed on to the model constructors.

    Returns:
        None. With ``args.baseline`` set, only the baseline is trained and
        its test score printed. Otherwise a tab-separated baseline line and
        a header/value table of all collected results are printed.
    """
    import dynet as dy

    # Dataset key -> zero-argument loader returning (train, dev, test).
    get_data = {"ag": lambda : ag_data_reader.get_dataset(args.num_NE),
                "dw": lambda : dw_data_reader.get_dataset(args.num_NE),
                "bl": lambda : blog_data_reader.get_dataset(),
                "tp_fr": lambda : trustpilot_data_reader.get_dataset("fr"),
                "tp_de": lambda : trustpilot_data_reader.get_dataset("de"),
                "tp_dk": lambda : trustpilot_data_reader.get_dataset("dk"),
                "tp_us": lambda : trustpilot_data_reader.get_dataset("us"),
                "tp_uk": lambda : trustpilot_data_reader.get_dataset("uk")}

    train, dev, test = get_data[args.dataset]()

    # Label 0 is always included, even when absent from the training set.
    labels_main_task = {ex.get_label() for ex in train}
    labels_main_task.add(0)

    # Labels must be exactly 0..n-1 with no gaps.
    assert sorted(labels_main_task) == list(range(len(labels_main_task)))

    labels_adve_task = get_aux_labels(train)

    print("Train size: {}".format(len(train)))
    print("Dev size:   {}".format(len(dev)))
    print("Test size:  {}".format(len(test)))

    print("Train data distribution")
    mfb_train = print_data_distributions(train)

    # The dev distribution is printed only; its return value is not needed.
    print("Dev data distribution")
    print_data_distributions(dev)

    print("Test data distribution")
    mfb_test = print_data_distributions(test)

    results = {}

    model = dy.Model()

    #if args.use_demographics:
    symbols = ["<g={}>".format(i) for i in ["F", "M"]] + ["<a={}>".format(i) for i in ["U", "O"]]
    vocabulary = extract_vocabulary(train, add_symbols=symbols)

    bilstm = HierarchicalBiLSTM(args, vocabulary, model)
    input_size = bilstm.size()
    main_classifier = MLP(input_size, len(labels_main_task), args.hidden_layers, args.dim_hidden, dy.rectify, model)

    trainer = dy.AdamTrainer(model)
    trainer.set_clip_threshold(5)

    # Record the trainer's learning rate on the option namespace.
    args.learning_rate = trainer.learning_rate

    # Optionally truncate train and dev (test is left whole).
    if args.subset:
        train = train[:args.subset]
        dev = dev[:args.subset]

    output_size = len(labels_adve_task)
    adversary_classifier = MLP_sigmoid(input_size, output_size, args.hidden_layers, args.dim_hidden, dy.rectify, model)

    discriminator = None
    if args.atraining:
        discriminator = Discriminator(input_size, output_size, args.hidden_layers, args.dim_hidden, dy.rectify, model, trainer)

    generator = None
    if args.generator:
        generator = Generator(args, vocabulary, model, trainer)

    #### add adversary classifier
    mod = PrModel(args, model, trainer, bilstm, main_classifier, adversary_classifier, discriminator, generator, vocabulary)

    if args.baseline:
        _, ftest = mod.train_baseline(train, dev, test, args.iterations)
        print(ftest)
        return

    print("Train main task")
    results["000_main_dev_acc"] = mod.train_main(train, dev)

    targets_test = [ex.get_label() for ex in test]
    loss_test, acc_test, _ = mod.evaluate_main(test, targets_test)
    print("\t Test results : l={} acc={}".format(loss_test, acc_test))
    results["001_main_test_acc"] = acc_test

    ##############
    ##############    Adversary training / evaluate privacy
    ##############

    train_hidden, dev_hidden, test_hidden = [mod.get_adversary_dataset(dataset) for dataset in [train, dev, test]]

    # Restart the trainer before the adversary is trained.
    trainer.restart()
    print("Train adversary")
    results["002_adv_dev_F"] = mod.train_adversary(train_hidden, dev_hidden)
    targets_test = [ex.get_aux_labels() for ex in test]
    loss_test, acc_test, predictions_test = mod.evaluate_adversary(test_hidden)

    print("\t Adversary Test results : l={} acc={}".format(loss_test, acc_test))
    outsize = mod.adversary_classifier.output_size()
    Fscore = compute_eval_metrics(outsize, targets_test, predictions_test)
    print("\tF          = {} ".format(Fscore))

    # Fscore is indexed as [0]=precision, [1]=recall, [2]=F-score and
    # [3]=per-task accuracies, going by the result keys below.
    results["003_adv_test_fscore"] = Fscore[2]
    results["004_adv_test_precision"] = Fscore[0]
    results["005_adv_test_recall"] = Fscore[1]
    for i, acc in enumerate(Fscore[3]):
        results["{}_adv_test_acc_task_{}".format(str(i+6).zfill(3), i)] = acc

    # Baseline: predict every auxiliary label for every test example.
    preds = [set(range(outsize)) for _ in targets_test]
    Fscore = compute_eval_metrics(outsize, targets_test, preds)

    # Per-task accuracies at or below 50.0 are reported as 100 - x.
    baseline_str = [Fscore[2], Fscore[0], Fscore[1]] + [x if x > 50.0 else 100 - x for x in Fscore[3]]

    line = ["Baseline", "NA", "NA", "NA", "NA", "NA", "NA", "NA", str(round(mfb_train * 100, 2)), str(round(mfb_test*100, 2)), "0"]
    print("\t".join(line) + "\t" + "\t".join(map(str, baseline_str)))

    # Round every float result to two decimals. isinstance (rather than an
    # exact type comparison) also covers float subclasses such as
    # numpy.float64; only values are replaced, so iterating is safe.
    for k, value in results.items():
        if isinstance(value, float):
            results[k] = round(value, 2)

    # Hyper-parameter columns; the '#' prefix sorts them before the metrics.
    results["#H"] = args.dim_hidden
    results["#h"] = args.hidden_layers
    results["#w"] = args.dim_word
    results["#W"] = args.dim_wrnn
    results["#Zatr"] = int(args.atraining)
    results["#Zptr"] = int(args.ptraining)
    results["#Zalpha"] = args.alpha

    keys = sorted(results)

    print("Model\t", end="")
    print("\t".join(keys))
    print("\t".join(map(str, [results[k] for k in keys])))
Esempio n. 15
0
def main():
    """Train the BiGRU encoder and recurrent generative decoder, checkpointing every epoch.

    Hyper-parameters come from the command line (see the argparse options
    below); the initial vocabulary size limit is fixed at 60000. Training
    and validation data are read from fixed paths under ``./data``. After
    each epoch the mean training and validation minibatch losses are
    printed, the embedding table, encoder and decoder are saved to
    ``./model_e<epoch>`` and the vocabulary mappings are pickled to
    ``./w2i.dump`` and ``./i2w.dump``.
    """
    parser = argparse.ArgumentParser(
        description=
        'Deep Recurrent Generative Decoder for Abstractive Text Summarization in DyNet'
    )

    # The help text states the real default ('0').
    parser.add_argument('--gpu',
                        type=str,
                        default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs',
                        type=int,
                        default=3,
                        help='Number of epochs [default: 3]')
    parser.add_argument(
        '--n_train',
        type=int,
        default=3803957,
        help=
        'Number of training examples (up to 3803957 in gigaword) [default: 3803957]'
    )
    parser.add_argument(
        '--n_valid',
        type=int,
        default=189651,
        help=
        'Number of validation examples (up to 189651 in gigaword) [default: 189651]'
    )
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Mini batch size [default: 32]')
    parser.add_argument('--emb_dim',
                        type=int,
                        default=256,
                        help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim',
                        type=int,
                        default=256,
                        help='Hidden state size [default: 256]')
    parser.add_argument('--lat_dim',
                        type=int,
                        default=256,
                        help='Latent size [default: 256]')
    parser.add_argument(
        '--alloc_mem',
        type=int,
        default=8192,
        help='Amount of memory to allocate [mb] [default: 8192]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = 60000
    EMB_DIM = args.emb_dim
    HID_DIM = args.hid_dim
    LAT_DIM = args.lat_dim
    ALLOC_MEM = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset ====================================================================================
    # The second call receives the counts from the first, so articles and
    # titles contribute to one shared word-count table.
    w2c = build_word2count(TRAIN_X_FILE, n_data=N_TRAIN)
    w2c = build_word2count(TRAIN_Y_FILE, w2c=w2c, n_data=N_TRAIN)

    train_X, w2i, i2w = build_dataset(TRAIN_X_FILE,
                                      w2c=w2c,
                                      padid=False,
                                      eos=True,
                                      unksym='<unk>',
                                      target=False,
                                      n_data=N_TRAIN,
                                      vocab_size=VOCAB_SIZE)
    train_y, _, _ = build_dataset(TRAIN_Y_FILE,
                                  w2i=w2i,
                                  target=True,
                                  n_data=N_TRAIN)

    valid_X, _, _ = build_dataset(VALID_X_FILE,
                                  w2i=w2i,
                                  target=False,
                                  n_data=N_VALID)
    valid_y, _, _ = build_dataset(VALID_Y_FILE,
                                  w2i=w2i,
                                  target=True,
                                  n_data=N_VALID)

    # From here on the vocabulary size is whatever the built mapping holds.
    VOCAB_SIZE = len(w2i)
    OUT_DIM = VOCAB_SIZE
    print(VOCAB_SIZE)

    # Build model ======================================================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    # One embedding table shared by source and target tokens.
    V = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))

    encoder = BiGRU(model, EMB_DIM, 2 * HID_DIM)
    decoder = RecurrentGenerativeDecoder(model, EMB_DIM, 2 * HID_DIM, LAT_DIM,
                                         OUT_DIM)

    # Train model =======================================================================================
    n_batches_train = math.ceil(len(train_X) / BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_X) / BATCH_SIZE)

    def _minibatch_loss(X_mb, y_mb):
        """Return the average over the minibatch of each pair's summed loss.

        Each step contributes its negative log-softmax of the target token
        plus the decoder's KL term for that step. Must be called after
        ``dy.renew_cg()`` and both ``associate_parameters()`` calls for the
        current graph.
        """
        losses = []
        for x, t in zip(X_mb, y_mb):
            # Inputs drop the last token; targets drop the first.
            t_in, t_out = t[:-1], t[1:]

            # Encoder
            x_embs = [dy.lookup(V, x_t) for x_t in x]
            he = encoder(x_embs)

            # Decoder
            t_embs = [dy.lookup(V, t_t) for t_t in t_in]
            decoder.set_initial_states(he)
            y, KL = decoder(t_embs)

            loss = dy.esum([
                dy.pickneglogsoftmax(y_t, t_t) + KL_t
                for y_t, t_t, KL_t in zip(y, t_out, KL)
            ])
            losses.append(loss)
        return dy.average(losses)

    for epoch in range(N_EPOCHS):
        # Train
        train_X, train_y = shuffle(train_X, train_y)
        loss_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            encoder.associate_parameters()
            decoder.associate_parameters()

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            mb_loss = _minibatch_loss(train_X[start:end], train_y[start:end])

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid (forward pass only, no parameter update)
        loss_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            encoder.associate_parameters()
            decoder.associate_parameters()

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            mb_loss = _minibatch_loss(valid_X[start:end], valid_y[start:end])

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f' %
              (epoch + 1, np.mean(loss_all_train), np.mean(loss_all_valid)))

        # Save model ======================================================================================
        dy.save('./model_e' + str(epoch + 1), [V, encoder, decoder])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump',
                                                     'wb') as f_i2w:
            pickle.dump(w2i, f_w2i)
            pickle.dump(i2w, f_i2w)