Code Example #1
 def epoch_train(self,examples):
     count = 0
     dy.renew_cg()
     current_losses = []
     loss_list = []
     for word, context in examples:
         loss = self.get_score(word,context)
         current_losses.append(loss)
         loss_list.append(loss.value())
         if len(current_losses) >= self.batch_size:
             mean_loss = dy.esum(current_losses) / float(len(current_losses))
             mean_loss.forward()
             mean_loss.backward()
             self.optimizer.update()
             current_losses = [ ]
             dy.renew_cg()
         count += 1
         ## Print the average loss roughly every 1M examples
         if count % 1000000 == 1000:
             print(count, np.mean(np.array(loss_list)))
             loss_list = []
     if current_losses:
         mean_loss = dy.esum(current_losses) / float(len(current_losses))
         mean_loss.forward()
         mean_loss.backward()
         self.optimizer.update()
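The example above follows the usual DyNet mini-batching recipe: accumulate individual loss expressions, combine them with dy.esum, divide by the batch size, and only then call backward() and update() before renewing the computation graph. Below is a minimal, self-contained sketch of the same recipe; the toy linear regressor, the random data, and SimpleSGDTrainer are stand-ins for this class's get_score() and optimizer, not part of the original code.

import dynet as dy
import random

# A toy linear regressor trained with the same esum-based mini-batch pattern.
# Everything here (data, model, trainer) is hypothetical and only illustrates the recipe.
model = dy.ParameterCollection()
pW = model.add_parameters((1, 3))
pb = model.add_parameters((1,))
trainer = dy.SimpleSGDTrainer(model)

examples = [([random.random() for _ in range(3)], random.random()) for _ in range(10)]
batch_size = 4

for start in range(0, len(examples), batch_size):
    batch = examples[start:start + batch_size]
    dy.renew_cg()                                  # one computation graph per mini-batch
    W, b = dy.parameter(pW), dy.parameter(pb)      # load parameters into the current graph
    losses = []
    for x, y in batch:
        pred = W * dy.inputVector(x) + b
        losses.append(dy.squared_distance(pred, dy.scalarInput(y)))
    mean_loss = dy.esum(losses) / len(losses)      # average the accumulated loss expressions
    mean_loss.forward()
    mean_loss.backward()
    trainer.update()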
Code Example #2
    def decode_loss(self, src1, src2, tgt):
        src1_mat, src2_mat, src1_w1dt, src2_w1dt, decoder_state = self.encoder_forward(
            src1, src2
        )
        _, prev_coverage = self.get_coverage(
            a_t=dy.vecInput(len(src1)), prev_coverage=dy.vecInput(len(src1))
        )

        loss = []
        cov_loss = []
        diag_loss = []

        embedded_tgt = self.embed_idx(tgt, self.tgt_lookup)
        last_output_embeddings = self.tgt_lookup[self.tgt_vocab.str2int(EOS)]

        for t, (char, embedded_char) in enumerate(zip(tgt, embedded_tgt)):
            a_t, c1_t = self.attend(
                src1_mat,
                decoder_state,
                src1_w1dt,
                self.att1_w2,
                self.att1_v,
                prev_coverage,
            )
            if not self.single_source:
                _, c2_t = self.attend(
                    src2_mat, decoder_state, src2_w1dt, self.att2_w2, self.att2_v, None
                )
            else:
                c2_t = dy.vecInput(2 * HIDDEN_DIM)

            x_t = dy.concatenate([c1_t, c2_t, last_output_embeddings])
            decoder_state = decoder_state.add_input(x_t)

            out_vector = self.dec_w * decoder_state.output() + self.dec_b
            probs = dy.softmax(out_vector)
            probs, _ = self.get_pointergen_probs(
                c1_t, decoder_state, x_t, a_t, probs, src1
            )

            loss.append(-dy.log(dy.pick(probs, char)))
            cov_loss_cur, prev_coverage = self.get_coverage(a_t, prev_coverage)
            cov_loss.append(cov_loss_cur)
            diag_loss.append(self.get_diag_loss(a_t, t))

            last_output_embeddings = embedded_char

        loss = dy.esum(loss)
        cov_loss = dy.esum(cov_loss)
        diag_loss = dy.esum(diag_loss)
        return loss + COV_LOSS_WEIGHT * cov_loss + DIAG_LOSS_WEIGHT * diag_loss
Code Example #3
 def get_pointergen_probs(self, c_t, state, x_t, a_t, probs, src1):
     if not self.pointer_gen:
         return probs, 1.0
     unk_idx = self.tgt_vocab.str2int(UNK)
     # p_gen: probability of generating from the vocabulary rather than copying from the source
     p_gen = dy.logistic(
         self.ptr_w_c * c_t
         + self.ptr_w_s * dy.concatenate(list(state.s()))
         + self.ptr_w_x * x_t
     )
     gen_probs = probs * p_gen
     copy_probs = a_t * (1 - p_gen)
     # per target-vocabulary index, collect the generation probability plus any copy probabilities
     copy_probs_update = [[i] for i in gen_probs]
     for char, prob in zip(src1, copy_probs):
         cur_idx = self.tgt_vocab.str2int(self.src1_vocab.int2str(char))
         if cur_idx == unk_idx:
             continue
         if isinstance(cur_idx, int):
             copy_probs_update[cur_idx].append(prob)
         else:
             # a source symbol that maps to several target indices shares its copy probability evenly
             for idx in cur_idx:
                 copy_probs_update[idx].append(prob / len(cur_idx))
     sum_probs = dy.concatenate([dy.esum(exps) for exps in copy_probs_update])
     return sum_probs, p_gen.scalar_value()
Code Example #4
 def get_diag_loss(self, a_t, t):
     if self.diag_loss < 0:
         return dy.scalarInput(0)
     off_diag_elems = [dy.scalarInput(0)]
     for i, prob in enumerate(a_t):
         if i < (t - self.diag_loss) or i > (t + self.diag_loss):
             off_diag_elems.append(prob)
     return dy.esum(off_diag_elems)
Code Example #5
 def compute_loss_multilabel(self, task, seq, multi_y):
     """
     computes the loss for a multi-label instance by averaging the negative log
     probabilities of all correct labels (each -log p(y) is divided by the number of gold labels)
     """
     out_probs = self(task, seq)
     losses = []
     for y in multi_y:
         assigned_prob = dn.pick(out_probs, y)
         losses.append(-dn.log(assigned_prob) / len(multi_y))
     return dn.esum(losses)
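Because each gold label contributes -log p(y) divided by the number of gold labels, the returned expression is simply the mean negative log-probability over the correct labels. The short sketch below illustrates this with a made-up 4-class distribution in place of self(task, seq); it only demonstrates the esum/average equivalence and is not code from this project.

import dynet as dy

# A toy check of the multi-label loss above, assuming a made-up 4-class distribution.
dy.renew_cg()
out_probs = dy.softmax(dy.inputVector([2.0, 0.5, 0.1, 1.0]))
multi_y = [0, 3]  # gold labels of one multi-label instance

loss = dy.esum([-dy.log(dy.pick(out_probs, y)) / len(multi_y) for y in multi_y])
# the same value, written as an explicit average over the gold labels
loss_avg = dy.average([-dy.log(dy.pick(out_probs, y)) for y in multi_y])
print(loss.value(), loss_avg.value())  # both print the mean negative log-probability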
Code Example #6
    def train_batch(self, words):
        losses = []

        W = dy.parameter(self.W)
        b = dy.parameter(self.b)

        for word in words:
            wlosses = []

            word = self.word_to_indices(word)

            s = self.lstm.initial_state()

            for c, next_c in zip(word, word[1:]):
                s = s.add_input(self.lookup[c])
                unnormalized = dy.affine_transform([b, W, s.output()])
                wlosses.append(dy.pickneglogsoftmax(unnormalized, next_c))

            losses.append(dy.esum(wlosses) / len(word))

        return dy.esum(losses) / len(words)
Code Example #7
File: model_RL.py Project: xcgfth/TaxoRL
        def process_one_instance(instance,
                                 update=True,
                                 x_y_vectors=None,
                                 features=None,
                                 mode='train'):
            lemma_lookup = self.model_parameters['lemma_lookup']
            if self.opt['use_path']:
                pos_lookup = self.model_parameters['pos_lookup']
                dep_lookup = self.model_parameters['dep_lookup']
                dir_lookup = self.model_parameters['dir_lookup']
                # Add the empty path
                paths = instance
                if len(paths) == 0:
                    paths[EMPTY_PATH] = 1

                # Compute the averaged path
                num_paths = reduce(lambda x, y: x + y, instance.itervalues())
                path_embeddings = [
                    self.get_path_embedding_from_cache(
                        lemma_lookup, pos_lookup, dep_lookup, dir_lookup, path,
                        update, mode) * count
                    for path, count in instance.iteritems()
                ]
                input_vec = dy.esum(path_embeddings) * (1.0 / num_paths)

            # Concatenate x and y embeddings
            if self.opt['use_xy_embeddings']:
                x_vector = dy.lookup(lemma_lookup, x_y_vectors[0])
                y_vector = dy.lookup(lemma_lookup, x_y_vectors[1])
                if self.opt['use_path']:
                    input_vec = dy.concatenate([x_vector, input_vec, y_vector])
                else:
                    input_vec = dy.concatenate([x_vector, y_vector])
            if self.opt['use_features']:
                for k in feat_dims:
                    if 'diff' in k and not self.opt['use_freq_features']:
                        continue
                    feat = dy.lookup(self.model_parameters[k], features[k])
                    input_vec = dy.concatenate([input_vec, feat])

            if self.opt['use_height_ebd']:
                if j in tree.term_height:
                    h = tree.get_height(j) - 1
                else:
                    h = 0
                height_vector = dy.lookup(
                    self.model_parameters['height_lookup'], h)
                input_vec = dy.concatenate([input_vec, height_vector])
            return input_vec
Code Example #8
    def calculate_loss(self, sents):
        dy.renew_cg()
        losses = []
        for sent in sents:
            features, t_features, feat_reconstruct = self.get_features_for_tagging(
                sent, True
            )
            gold_tags = [tag for chars, word, feats, tag in sent]
            cur_loss = self.crf_module.negative_log_loss(
                features, t_features, gold_tags
            )
            if self.autoencoder:
                autoencoder_loss = [
                    dy.binary_log_loss(reconstruct, dy.inputTensor(feats))
                    for reconstruct, (chars, word, feats, tag) in zip(
                        feat_reconstruct, sent
                    )
                ]
            else:  # remove autoencoder loss
                autoencoder_loss = [dy.scalarInput(0)]
            losses.append(cur_loss + (dy.esum(autoencoder_loss) / self.featsize))

        return dy.esum(losses)
Code Example #9
File: decomposable.py Project: kishkash555/biu
    def calc_attend(self, a_vecs, b_vecs, dropout):
        l_a = a_vecs.dim()[1]
        l_b = b_vecs.dim()[1]

        fa = self.attend.evaluate_network(a_vecs, True, dropout)
        fb = self.attend.evaluate_network(b_vecs, True, dropout)

        e_ij = list()
        for i in range(l_a):
            e_ij.append(list())
            for j in range(l_b):
                e_ij[i].append(
                    dy.dot_product(dy.pick_batch_elem(fa, i),
                                   dy.pick_batch_elem(fb, j)))

        beta_softmaxes = [
            dy.softmax(dy.concatenate(e_ij[i])) for i in range(l_a)
        ]
        # normalize over the positions of a (index i) for each position j of b
        alpha_softmaxes = [
            dy.softmax(dy.concatenate([e_ij[i][j] for i in range(l_a)]))
            for j in range(l_b)
        ]

        betas = [
            dy.esum([
                dy.pick_batch_elem(b_vecs, j) * beta_softmaxes[i][j]
                for j in range(l_b)
            ]) for i in range(l_a)
        ]
        alphas = [
            dy.esum([
                dy.pick_batch_elem(a_vecs, i) * alpha_softmaxes[j][i]
                for i in range(l_a)
            ]) for j in range(l_b)
        ]
        return alphas, betas
Code Example #10
    def set_initial_states(self, x):
        self.xt_embs = [dy.lookup(self.F, x_t) for x_t in x]

        if self.encoder_type == 'bow':
            self.W_enc = self.W * dy.average(self.xt_embs)

        elif self.encoder_type == 'attention':
            self.xb = dy.concatenate([
                dy.esum(self.xt_embs[max(i - self.q, 0):min(len(x), i + self.q + 1)]) / self.q
                for i in range(len(x))
            ], d=1)
            self.xt = dy.transpose(dy.concatenate(self.xt_embs, d=1))
Code Example #11
File: mnnl.py Project: mullikine/bilstm-aux
 def predict_sequence(self, seq, inputs, train=False, output_confidences=False, unk_tag=None, dictionary=None, type_constraint=False, **kwargs):
     output = [self.network_builder(x, **kwargs) for x in inputs]
     if not train:
         if dictionary and type_constraint:  # apply type-constrained decoding only during testing
             pred_tags = []
             for i, o in enumerate(output):
                 softmax_distr = o.npvalue()
                 word = seq.words[i]
                 softmax_distr = self.prune_softmax(softmax_distr, word, dictionary)
                 tag_best = self.index2tag[np.argmax(softmax_distr)]
                 pred_tags.append(tag_best)
             seq.pred_tags = pred_tags
         else:
             seq.pred_tags = [self.index2tag[np.argmax(o.npvalue())] for o in output]  # logprobs to indices
     if output_confidences:
         seq.tag_confidences = array.array('f', [np.max(o.npvalue()) for o in output])
     if train:
         # return loss per tag
         gold_tag_indices = array.array('I',[self.tag2index[t] for t in seq.tags])
         return dynet.esum([pick_neg_log(pred,gold) for pred, gold in zip(output, gold_tag_indices)])
Code Example #12
File: mnnl.py Project: bplank/bilstm-aux
 def predict_sequence(self, seq, inputs, train=False, output_confidences=False, unk_tag=None, dictionary=None, type_constraint=False, **kwargs):
     output = [self.network_builder(x, **kwargs) for x in inputs]
     if not train:
         if dictionary and type_constraint:  # apply type-constrained decoding only during testing
             pred_tags = []
             for i, o in enumerate(output):
                 softmax_distr = o.npvalue()
                 word = seq.words[i]
                 softmax_distr = self.prune_softmax(softmax_distr, word, dictionary)
                 tag_best = self.index2tag[np.argmax(softmax_distr)]
                 pred_tags.append(tag_best)
             seq.pred_tags = pred_tags
         else:
             seq.pred_tags = [self.index2tag[np.argmax(o.npvalue())] for o in output]  # logprobs to indices
     if output_confidences:
         seq.tag_confidences = array.array('f', [np.max(o.npvalue()) for o in output])
     if train:
         # return loss per tag
         gold_tag_indices = array.array('I',[self.tag2index[t] for t in seq.tags])
         return dynet.esum([pick_neg_log(pred,gold) for pred, gold in zip(output, gold_tag_indices)])
Code Example #13
def train(builder,
          model,
          model_parameters,
          X_train,
          y_train,
          nepochs,
          alpha=0.01,
          update=True,
          dropout=0.0,
          x_y_vectors=None,
          num_hidden_layers=0):
    """
    Train the LSTM
    :param builder: the LSTM builder
    :param model: LSTM RNN model
    :param model_parameters: the model parameters
    :param X_train: the lstm instances
    :param y_train: the lstm labels
    :param nepochs: number of epochs
    :param alpha: the learning rate (only for SGD)
    :param update: whether to update the lemma embeddings
    :param dropout: dropout probability for all component embeddings
    :param x_y_vectors: the word vectors of x and y
    :param num_hidden_layers: The number of hidden layers for the term-pair classification network
    """
    trainer = dy.AdamTrainer(model, alpha=alpha)
    minibatch_size = min(MINIBATCH_SIZE, len(y_train))
    nminibatches = int(math.ceil(len(y_train) / minibatch_size))
    previous_loss = 1000

    for epoch in range(nepochs):

        total_loss = 0.0

        epoch_indices = np.random.permutation(len(y_train))

        for minibatch in range(nminibatches):

            path_cache = {}
            batch_indices = epoch_indices[minibatch *
                                          minibatch_size:(minibatch + 1) *
                                          minibatch_size]

            dy.renew_cg()

            loss = dy.esum([
                -dy.log(
                    dy.pick(
                        process_one_instance(
                            builder,
                            model,
                            model_parameters,
                            X_train[batch_indices[i]],
                            path_cache,
                            update,
                            dropout,
                            x_y_vectors=x_y_vectors[batch_indices[i]]
                            if x_y_vectors is not None else None,
                            num_hidden_layers=num_hidden_layers),
                        y_train[batch_indices[i]]))
                for i in range(minibatch_size)
            ])
            total_loss += loss.value()  # forward computation
            loss.backward()
            trainer.update()

        # trainer.update_epoch() is deprecated and requires an argument (presumably the epoch);
        # see http://dynet.readthedocs.io/en/latest/python_ref.html#optimizers
        # trainer.update_epoch()
        trainer.update()
        total_loss /= len(y_train)
        print 'Epoch', (epoch + 1), '/', nepochs, 'Loss =', total_loss

        # Early stopping
        if math.fabs(previous_loss - total_loss) < LOSS_EPSILON:
            break

        previous_loss = total_loss
Code Example #14
def process_one_instance(builder,
                         model,
                         model_parameters,
                         instance,
                         path_cache,
                         update=True,
                         dropout=0.0,
                         x_y_vectors=None,
                         num_hidden_layers=0):
    """
    Return the LSTM output vector of a single term-pair - the average path embedding
    :param builder: the LSTM builder
    :param model: the LSTM model
    :param model_parameters: the model parameters
    :param instance: a Counter object with paths
    :param path_cache: the cache for path embeddings
    :param update: whether to update the lemma embeddings
    :param dropout: word dropout rate
    :param x_y_vectors: the current word vectors for x and y
    :param num_hidden_layers: The number of hidden layers for the term-pair classification network
    :return: the LSTM output vector of a single term-pair
    """
    W1 = dy.parameter(model_parameters['W1'])
    b1 = dy.parameter(model_parameters['b1'])
    W2 = None
    b2 = None

    if num_hidden_layers == 1:
        W2 = dy.parameter(model_parameters['W2'])
        b2 = dy.parameter(model_parameters['b2'])

    lemma_lookup = model_parameters['lemma_lookup']
    pos_lookup = model_parameters['pos_lookup']
    dep_lookup = model_parameters['dep_lookup']
    dir_lookup = model_parameters['dir_lookup']

    # Use the LSTM output vector and feed it to the MLP

    # Add the empty path
    paths = instance

    if len(paths) == 0:
        paths[EMPTY_PATH] = 1

    # Compute the averaged path
    num_paths = reduce(lambda x, y: x + y, instance.itervalues())
    path_embeddings = [
        get_path_embedding_from_cache(path_cache, builder, lemma_lookup,
                                      pos_lookup, dep_lookup, dir_lookup, path,
                                      update, dropout) * count
        for path, count in instance.iteritems()
    ]
    input_vec = dy.esum(path_embeddings) * (1.0 / num_paths)

    # Concatenate x and y embeddings
    if x_y_vectors is not None:
        x_vector = dy.lookup(lemma_lookup, x_y_vectors[0])
        y_vector = dy.lookup(lemma_lookup, x_y_vectors[1])
        input_vec = dy.concatenate([x_vector, input_vec, y_vector])

    h = W1 * input_vec + b1

    if num_hidden_layers == 1:
        h = W2 * dy.tanh(h) + b2

    output = dy.softmax(h)

    return output
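The core of process_one_instance is the count-weighted average of path embeddings: each path embedding is scaled by its count, the scaled vectors are summed with dy.esum, and the sum is divided by the total number of paths. The sketch below isolates that step; the lookup table and the path-count mapping are invented for illustration and are not part of the original project.

import dynet as dy

# A small sketch of the count-weighted path average used above.
model = dy.ParameterCollection()
lookup = model.add_lookup_parameters((10, 4))   # 10 hypothetical paths, 4-dim embeddings

dy.renew_cg()
path_counts = {1: 3, 4: 1}                      # Counter-like mapping: path id -> count
num_paths = sum(path_counts.values())
weighted = [lookup[path] * count for path, count in path_counts.items()]
avg_path_embedding = dy.esum(weighted) * (1.0 / num_paths)
print(avg_path_embedding.npvalue())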
Code Example #15
File: layers.py Project: toru34/zhou_acl_2017
    def __call__(self, x, tm1s=None, test=False):
        if test:
            # Initial states
            s_tm1 = tm1s[0]
            c_tm1 = tm1s[1]
            w_tm1 = x

            # GRU
            s_t = self.GRUBuilder.initial_state().set_s([s_tm1]).add_input(
                dy.concatenate([w_tm1, c_tm1])).output()

            # Attention
            e_t = dy.pick(
                self.va *
                dy.tanh(dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
            a_t = dy.softmax(e_t)
            c_t = dy.esum([
                dy.cmult(a_t_i, h_i)
                for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))
            ])
            #c_t = self.hp*a_t # memory error?

            # Output
            r_t = dy.concatenate_cols([
                Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
                for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)
            ])  # Maxout
            m_t = dy.max_dim(r_t, d=1)
            y_t = dy.softmax(self.Wo * m_t)

            return s_t, c_t, y_t

        else:
            w_embs = x
            # Initial states
            s_tm1 = self.s_0
            c_tm1 = self.c_0
            GRU = self.GRUBuilder.initial_state().set_s([s_tm1])

            y = []
            for w_tm1 in w_embs:
                # GRU
                GRU = GRU.add_input(dy.concatenate([w_tm1, c_tm1]))
                s_t = GRU.output()

                # Attention
                e_t = dy.pick(
                    self.va * dy.tanh(
                        dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
                a_t = dy.softmax(e_t)
                c_t = dy.esum([
                    dy.cmult(a_t_i, h_i)
                    for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))
                ])
                #c_t = self.hp*a_t # memory error?

                # Output
                r_t = dy.concatenate_cols([
                    Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
                    for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)
                ])  # Maxout
                m_t = dy.max_dim(r_t, d=1)

                y_t = self.Wo * m_t
                y.append(y_t)

                # t -> tm1
                s_tm1 = s_t
                c_tm1 = c_t

            return y
Code Example #16
def main():
    parser = argparse.ArgumentParser(description='Selective Encoding for Abstractive Sentence Summarization in DyNet')

    parser.add_argument('--gpu', type=str, default='0', help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=3, help='Number of epochs [default: 3]')
    parser.add_argument('--n_train', type=int, default=3803957, help='Number of training data (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651, help='Number of validation data (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32, help='Mini batch size [default: 32]')
    parser.add_argument('--vocab_size', type=int, default=124404, help='Vocabulary size [default: 124404]')
    parser.add_argument('--emb_dim', type=int, default=256, help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256, help='Hidden state size [default: 256]')
    parser.add_argument('--maxout_dim', type=int, default=2, help='Maxout size [default: 2]')
    parser.add_argument('--alloc_mem', type=int, default=10000, help='Amount of memory to allocate [mb] [default: 10000]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS   = args.n_epochs
    N_TRAIN    = args.n_train
    N_VALID    = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = args.vocab_size
    EMB_DIM    = args.emb_dim
    HID_DIM    = args.hid_dim
    MAXOUT_DIM = args.maxout_dim
    ALLOC_MEM  = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset
    dataset = Dataset(
        TRAIN_X_FILE,
        TRAIN_Y_FILE,
        VALID_X_FILE,
        VALID_Y_FILE,
        vocab_size=VOCAB_SIZE,
        batch_size=BATCH_SIZE,
        n_train=N_TRAIN,
        n_valid=N_VALID
    )
    VOCAB_SIZE = len(dataset.w2i)
    print('VOCAB_SIZE', VOCAB_SIZE)

    # Build model
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    V = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    encoder = SelectiveBiGRU(model, EMB_DIM, HID_DIM)
    decoder = AttentionalGRU(model, EMB_DIM, HID_DIM, MAXOUT_DIM, VOCAB_SIZE)

    # Train model
    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        loss_all_train = []
        dataset.reset_train_iter()
        for train_x_mb, train_y_mb in tqdm(dataset.train_iter):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters([encoder, decoder])
            losses = []
            for x, t in zip(train_x_mb, train_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                hp, hb_1 = encoder(x_embs)

                # Decoder
                decoder.set_initial_states(hp, hb_1)
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                y = decoder(t_embs)

                # Loss
                loss = dy.esum(
                    [dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)]
                )
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid
        loss_all_valid = []
        dataset.reset_valid_iter()
        for valid_x_mb, valid_y_mb in dataset.valid_iter:
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters([encoder, decoder])
            losses = []
            for x, t in zip(valid_x_mb, valid_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                hp, hb_1 = encoder(x_embs)

                # Decoder
                decoder.set_initial_states(hp, hb_1)
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                y = decoder(t_embs)

                # Loss
                loss = dy.esum(
                    [dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)]
                )
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f, Time: %.3f[s]' % (
            epoch+1,
            np.mean(loss_all_train),
            np.mean(loss_all_valid),
            time.time()-start_time
        ))

        # Save model
        dy.save('./model_e'+str(epoch+1), [V, encoder, decoder])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(dataset.w2i, f_w2i)
            pickle.dump(dataset.i2w, f_i2w)
Code Example #17
    def train_batched(self, tasks, batch_size, scale_gradient_factor,
                      validation_data, seqs_trg, early_stopping, patience,
                      num_epochs, min_num_epochs, num_updates, prob_main_task,
                      prob_adv):
        trainer = dn.SimpleSGDTrainer(self.model)

        # stores best observed validation accuracy
        val_best = 0
        # stores the number of iterations without improvement
        no_improvement = 0
        val_prev = 0

        for epoch in range(num_epochs):
            sum_losses = 0
            adversarial_loss = 0
            losses_prediction_task = []
            losses_aux_task = []
            batch_dict = self.generate_batches_across_tasks(tasks, batch_size)

            # the number of updates is twice the length of the main-task batch list
            # (this overrides the num_updates argument passed in)
            num_updates = len(batch_dict[self.prediction_layer]) * 2
            print(num_updates)
            # logging.info('Number of updates to do: {}'.format(num_updates))
            # sample batches according to some schema
            update_counter = 0
            while update_counter <= num_updates:
                update_counter += 1

                # with prob 1-prob_adv, do a task update
                outcome = np.random.binomial(1, prob_adv, size=None)
                if outcome == 0:
                    task_id, batch_ids = self.sample_task_batch(
                        batch_dict, prob_main_task=prob_main_task)
                    losses = []
                    dn.renew_cg()
                    # iterate through the batch
                    for i in batch_ids:
                        seq = tasks[task_id].train_seqs[i]
                        label = tasks[task_id].train_labels[i]
                        loss = self.compute_loss_multilabel(
                            task_id, seq, label)
                        losses.append(loss)

                    batch_loss = dn.esum(losses) / len(batch_ids)
                    batch_loss_value = batch_loss.value()
                    batch_loss.backward()
                    trainer.update()
                    sum_losses += batch_loss_value

                    if task_id == self.prediction_layer:
                        losses_prediction_task.append(batch_loss_value)
                    else:
                        losses_aux_task.append(batch_loss_value)
                else:
                    # do adversarial step
                    losses = []
                    dn.renew_cg()
                    seqs, labels = self.generate_adversarial_batch(
                        seqs_src=tasks[self.src_domain].train_seqs,
                        seqs_trg=seqs_trg,
                        batch_size=batch_size)
                    for i in range(len(seqs)):
                        seq = seqs[i]
                        label = labels[i]
                        loss = self.compute_loss_multilabel(task='adversarial',
                                                            seq=seq,
                                                            multi_y=label)
                        losses.append(loss)
                    batch_loss = dn.esum(losses) / len(seqs)
                    batch_loss_value = batch_loss.value()
                    batch_loss.backward()
                    trainer.update()
                    adversarial_loss += batch_loss_value

            # compute the validation accuracy to monitor early stopping
            # use the micro averaged f as criterion
            res = evaluate_model_predictions(
                self.predict(self.main_task, validation_data['seq']),
                validation_data['label'], validation_data['labelset'])
            f_avg = res['f_avg']
            logging.info(
                'Epoch {}. Sum loss: {}. Avg loss: {}. Avg loss predtask {}. Avg loss aux tasks: {}. No improv: {}. Best f_val: {}. Avg f_val: {}'
                .format(epoch, sum_losses, sum_losses / num_updates,
                        np.mean(losses_prediction_task),
                        np.mean(losses_aux_task), no_improvement, val_best,
                        f_avg))
            logging.info(
                'Epoch {}. Adv loss: {}. Avg loss: {}. Avg loss predtask {}. Avg loss aux tasks: {}. No improv: {}. Best f_val: {}. Avg f_val: {}'
                .format(epoch, adversarial_loss, sum_losses / num_updates,
                        np.mean(losses_prediction_task),
                        np.mean(losses_aux_task), no_improvement, val_best,
                        f_avg))

            # init early stopping after min number of epochs
            if epoch == min_num_epochs - 1:
                val_prev = f_avg
                no_improvement = 0
                self.save(self.exp_path)

            # if early_stopping:
            if f_avg <= val_prev:
                no_improvement += 1
                if early_stopping:
                    if no_improvement >= patience and epoch > min_num_epochs:
                        break
            else:
                if epoch >= min_num_epochs:
                    self.save(self.exp_path)
                no_improvement = 0
                if f_avg >= val_best:
                    val_best = f_avg
                val_prev = f_avg

        return epoch, f_avg, sum_losses, no_improvement, val_best
Code Example #18
def do_one_sentence(encoder, decoder, params_encoder, params_decoder, sentence,
                    output, env, first, previous):
    pos_lookup = params_encoder["pos_lookup"]
    char_lookup = params_encoder["char_lookup"]
    char_v = params_decoder["attention_v"]
    char_w1 = params_decoder["attention_wc"]
    char_w2 = params_decoder["attention_bc"]
    sc_vector = []
    for i, world in enumerate(_state(env)):
        sc0 = char_encoder.initial_state()
        sc = sc0
        for char in world:
            sc = sc.add_input(char_lookup[char2int[char]])
        sc_vector.append(dy.concatenate([sc.output(), pos_lookup[i]]))
    dy_sc_vector = dy.concatenate(sc_vector, d=1)
    s0 = encoder.initial_state()
    s = s0
    lookup = params_encoder["lookup"]
    attention_w = params_decoder["attention_w"]
    attention_b = params_decoder["attention_b"]
    sentence = sentence + ' <end>'
    sentence = [
        vocab.index(c) if c in vocab else vocab.index('<unknown>')
        for c in sentence.split(' ')
    ]
    loss = []
    generate = []
    s_vector = []
    for word in sentence:
        s = s.add_input(lookup[word])
        s_vector.append(dy.softmax(attention_w * s.output() + attention_b))
    encode_output = s.output()
    dy_s_vector = dy.concatenate(s_vector, d=1)
    _s0 = decoder.initial_state(s.s())
    _s = _s0
    R = params_decoder["R"]
    bias = params_decoder["bias"]
    index = 1
    input_word = "<start>"
    _lookup = params_decoder["lookup"]
    while True:
        dy_env = dy.inputTensor(get_state_embed3(env))
        word = vocab_out.index(input_word)
        gt_y = vocab_out.index(output[index])

        weight = dy.softmax(
            dy.concatenate([dy.dot_product(x, _s.output()) for x in s_vector]))
        weight_char = dy.softmax(
            dy.concatenate([
                char_v * dy.tanh(char_w1 * x + char_w2 * _s.output())
                for x in sc_vector
            ]))

        encode_output = dy_s_vector * weight
        encode_state = dy_sc_vector * weight_char
        _s = _s.add_input(
            dy.concatenate([_lookup[word], encode_output, encode_state]))
        probs = dy.softmax(R * _s.output() + bias)
        prediction = np.argsort(probs.npvalue())[-1]
        if (vocab_out[prediction]) == '<start>':
            prediction = np.argsort(probs.npvalue())[-2]
        generate.append(vocab_out[prediction])
        loss.append(-dy.log(dy.pick(probs, gt_y)))
        if output[index] == '<end>':
            break
        index += 1
        input_word = vocab_out[prediction]
        if input_word == '<end>':
            continue
        env = str(execute(env, [input_word]))
        if env == 'None':
            env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'
    loss = dy.esum(loss)
    while '<start>' in generate:
        generate.remove('<start>')
    previous = s.output()
    return loss, generate, previous
Code Example #19
    def fit(self,
            train_dict,
            num_epochs,
            val_X=None,
            val_Y=None,
            patience=2,
            model_path=None,
            seed=None,
            word_dropout_rate=0.25,
            trg_vectors=None,
            unsup_weight=1.0,
            clip_threshold=5.0,
            orthogonality_weight=0.0,
            adversarial=False,
            adversarial_weight=1.0,
            ignore_src_Ft=False):
        """
        Train the tagger.
        :param trg_vectors: the prediction targets used for the unsupervised loss
                            in temporal ensembling
        :param unsup_weight: weight for the unsupervised consistency loss
                             used in temporal ensembling
        :param adversarial: note: to use the adversarial loss,
                            add_adversarial_loss has to be called beforehand
        :param adversarial_weight: 1 by default (do not weight the adversarial loss)
        :param ignore_src_Ft: in the 2nd stage of asymmetric tri-training, do not further train Ft on 'src'
        :param train_dict: a dictionary mapping tasks ("F0", "F1", and "Ft")
                           to a dictionary
                           {"X": list of examples,
                            "Y": list of labels,
                            "domain": list of domain tags (0/1) of the examples}
        The three tasks are indexed as "F0", "F1" and "Ft".

        Note: if a task 'src' is given, then a single model with three heads is trained
        and all data is given to all outputs.
        """
        print("read training data")

        widCount = Counter()
        train_data = []
        for task, task_dict in train_dict.items():  #task: eg. "F0"
            for key in ["X", "Y", "domain"]:
                assert key in task_dict, "Error: %s is not available." % key
            examples, labels, domain_tags = task_dict["X"], task_dict[
                "Y"], task_dict["domain"]
            assert len(examples) == len(labels)
            if word_dropout_rate > 0.0:
                # keep track of the counts for word dropout
                for sentence, _ in examples:
                    widCount.update([w for w in sentence])

            # train data is a list of 4-tuples: (example, label, task_id, domain_id)
            train_data += list(
                zip(examples, labels, [[task] * len(labels)][0], domain_tags))

        # if we use target vectors, keep track of the targets per sentence
        if trg_vectors is not None:
            trg_start_id = 0
            sentence_trg_vectors = []
            for i, (example, y) in enumerate(train_data):
                sentence_trg_vectors.append(
                    trg_vectors[trg_start_id:trg_start_id +
                                len(example[0]), :])
                trg_start_id += len(example[0])
            assert trg_start_id == len(trg_vectors),\
                'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors))

        print('Starting training for {} epochs...'.format(num_epochs))
        best_val_acc, epochs_no_improvement = 0., 0
        if val_X is not None and val_Y is not None and model_path is not None:
            print(
                'Using early stopping with patience of {}...'.format(patience))

        if seed:
            random.seed(seed)

        for cur_iter in range(num_epochs):
            bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1,
                                                       num_epochs),
                      max=len(train_data),
                      flush=True)

            random_indices = np.arange(len(train_data))
            random.shuffle(random_indices)

            total_loss, total_tagged, total_constraint, total_adversarial = 0.0, 0.0, 0.0, 0.0
            total_orth_constr = 0  # count how many updates

            # log separate losses
            log_losses = {}
            log_total = {}
            for task_id in self.task_ids:
                log_losses[task_id] = 0.0
                log_total[task_id] = 0

            for i, idx in enumerate(random_indices):
                (word_indices,
                 char_indices), y, task_id, domain_id = train_data[idx]

                if word_dropout_rate > 0.0:
                    word_indices = [
                        self.w2i["_UNK"] if
                        (random.random() >
                         (widCount.get(w) /
                          (word_dropout_rate + widCount.get(w)))) else w
                        for w in word_indices
                    ]

                output, constraint, adv = self.predict(
                    word_indices,
                    char_indices,
                    task_id,
                    train=True,
                    orthogonality_weight=orthogonality_weight,
                    domain_id=domain_id if adversarial else None)

                if task_id not in ['src', 'trg']:

                    if len(y) == 1 and y[0] == 0:
                        # in temporal ensembling, we assign a dummy label of [0] for
                        # unlabeled sequences; we skip the supervised loss for these
                        loss = dynet.scalarInput(0)
                    else:
                        loss = dynet.esum([
                            self.pick_neg_log(pred, gold)
                            for pred, gold in zip(output, y)
                        ])

                    if trg_vectors is not None:
                        # the consistency loss in temporal ensembling is used for
                        # both supervised and unsupervised input
                        targets = sentence_trg_vectors[idx]
                        assert len(output) == len(targets)
                        other_loss = unsup_weight * dynet.average([
                            dynet.squared_distance(o, dynet.inputVector(t))
                            for o, t in zip(output, targets)
                        ])
                        loss += other_loss

                    if orthogonality_weight != 0.0 and task_id != 'Ft':
                        # add the orthogonality constraint to the loss
                        total_constraint += constraint.value(
                        ) * orthogonality_weight
                        total_orth_constr += 1
                        loss += constraint * orthogonality_weight

                    if adversarial:
                        total_adversarial += adv.value() * adversarial_weight
                        loss += adv * adversarial_weight

                    total_loss += loss.value()  # for output

                    log_losses[task_id] += loss.value()
                    total_tagged += len(word_indices)
                    log_total[task_id] += len(word_indices)

                    loss.backward()
                    self.trainer.update()
                    bar.next()
                else:
                    # bootstrap=False, the output contains list of outputs one for each task
                    assert trg_vectors is None, 'temporal ensembling not implemented for bootstrap=False'
                    loss = dynet.scalarInput(1)  #initialize
                    if ignore_src_Ft:
                        output = output[:-1]  # ignore the last head (Ft) when further training with 'src'

                    for t_i, output_t in enumerate(
                            output):  # get loss for each task
                        task_loss = dynet.esum([
                            self.pick_neg_log(pred, gold)
                            for pred, gold in zip(output_t, y)
                        ])
                        loss += task_loss
                        task_id = self.task_ids[t_i]
                        log_losses[task_id] += task_loss.value()
                        log_total[task_id] += len(word_indices)

                    if orthogonality_weight != 0.0:
                        # add the orthogonality constraint to the loss
                        total_constraint += constraint.value(
                        ) * orthogonality_weight
                        total_orth_constr += 1
                        loss += constraint * orthogonality_weight

                    if adversarial:
                        total_adversarial += adv.value() * adversarial_weight
                        loss += adv * adversarial_weight

                    total_loss += loss.value()  # for output
                    total_tagged += len(word_indices)

                    loss.backward()
                    self.trainer.update()
                    bar.next()

            if adversarial and orthogonality_weight:
                print(
                    "iter {}. Total loss: {:.3f}, total penalty: {:.3f}, total weighted adv loss: {:.3f}"
                    .format(cur_iter, total_loss / total_tagged,
                            total_constraint / total_orth_constr,
                            total_adversarial / total_tagged),
                    file=sys.stderr)
            elif orthogonality_weight:
                print("iter {}. Total loss: {:.3f}, total penalty: {:.3f}".
                      format(cur_iter, total_loss / total_tagged,
                             total_constraint / total_orth_constr),
                      file=sys.stderr)
            else:
                print("iter {}. Total loss: {:.3f} ".format(
                    cur_iter, total_loss / total_tagged),
                      file=sys.stderr)

            for task_id in self.task_ids:
                if log_total[task_id] > 0:
                    print("{0}: {1:.3f}".format(
                        task_id, log_losses[task_id] / log_total[task_id]))

            if val_X is not None and val_Y is not None and model_path is not None:
                # get the best accuracy on the validation set
                val_correct, val_total = self.evaluate(val_X, val_Y)
                val_accuracy = val_correct / val_total

                if val_accuracy > best_val_acc:
                    print(
                        'Accuracy {:.4f} is better than best val accuracy {:.4f}.'
                        .format(val_accuracy, best_val_acc))
                    best_val_acc = val_accuracy
                    epochs_no_improvement = 0
                    save_tagger(self, model_path)
                else:
                    print(
                        'Accuracy {:.4f} is worse than best val accuracy {:.4f}.'.
                        format(val_accuracy, best_val_acc))
                    epochs_no_improvement += 1
                if epochs_no_improvement == patience:
                    print('No improvement for {} epochs. Early stopping...'.
                          format(epochs_no_improvement))
                    break
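The temporal-ensembling consistency term used in this fit method is a squared distance between the model's current per-token outputs and the stored target vectors, averaged over the tokens and scaled by unsup_weight. Below is a minimal sketch of that term in isolation; the random outputs and targets stand in for self.predict() and trg_vectors and are purely illustrative.

import dynet as dy
import random

# A minimal sketch of the temporal-ensembling consistency term, with made-up
# per-token outputs and targets in place of the tagger's predictions.
dy.renew_cg()
unsup_weight = 1.0
n_tokens, n_labels = 3, 5

outputs = [dy.softmax(dy.inputVector([random.random() for _ in range(n_labels)]))
           for _ in range(n_tokens)]
targets = [[random.random() for _ in range(n_labels)] for _ in range(n_tokens)]

consistency = unsup_weight * dy.average([
    dy.squared_distance(o, dy.inputVector(t))
    for o, t in zip(outputs, targets)
])
print(consistency.value())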
Code Example #20
File: model_RL.py Project: xcgfth/TaxoRL
    def process_one(self, i, j, tree, mode):
        def process_one_instance(instance,
                                 update=True,
                                 x_y_vectors=None,
                                 features=None,
                                 mode='train'):
            lemma_lookup = self.model_parameters['lemma_lookup']
            if self.opt['use_path']:
                pos_lookup = self.model_parameters['pos_lookup']
                dep_lookup = self.model_parameters['dep_lookup']
                dir_lookup = self.model_parameters['dir_lookup']
                # Add the empty path
                paths = instance
                if len(paths) == 0:
                    paths[EMPTY_PATH] = 1

                # Compute the averaged path
                num_paths = reduce(lambda x, y: x + y, instance.itervalues())
                path_embeddings = [
                    self.get_path_embedding_from_cache(
                        lemma_lookup, pos_lookup, dep_lookup, dir_lookup, path,
                        update, mode) * count
                    for path, count in instance.iteritems()
                ]
                input_vec = dy.esum(path_embeddings) * (1.0 / num_paths)

            # Concatenate x and y embeddings
            if self.opt['use_xy_embeddings']:
                x_vector = dy.lookup(lemma_lookup, x_y_vectors[0])
                y_vector = dy.lookup(lemma_lookup, x_y_vectors[1])
                if self.opt['use_path']:
                    input_vec = dy.concatenate([x_vector, input_vec, y_vector])
                else:
                    input_vec = dy.concatenate([x_vector, y_vector])
            if self.opt['use_features']:
                for k in feat_dims:
                    if 'diff' in k and not self.opt['use_freq_features']:
                        continue
                    feat = dy.lookup(self.model_parameters[k], features[k])
                    input_vec = dy.concatenate([input_vec, feat])

            if self.opt['use_height_ebd']:
                if j in tree.term_height:
                    h = tree.get_height(j) - 1
                else:
                    h = 0
                height_vector = dy.lookup(
                    self.model_parameters['height_lookup'], h)
                input_vec = dy.concatenate([input_vec, height_vector])
            return input_vec

        if (i, j) not in self.f_cache:
            data = self.get_data(i, j)
            f = process_one_instance(instance=data[0],
                                     update=self.opt['update_word_ebd'],
                                     x_y_vectors=data[1],
                                     features=data[2],
                                     mode=mode)
            self.f_cache[(i, j)] = f
        if not self.opt['use_sibling']:
            # return dy.concatenate([self.f_cache[(i, j)], self.history[0].output()])
            return self.f_cache[(i, j)]
        else:
            sib = [
                self.f_cache[(sibling, j)] for sibling in tree.get_children(j)
            ]
            if len(sib) == 0:
                return self.f_cache[(i, j)]
            else:
                return self.f_cache[(i, j)] + dy.esum(sib) / len(sib)
Code Example #21
File: decomposable.py Project: kishkash555/biu
 def calc_aggregate(self, v1_i, v2_j, dropout):
     v1 = dy.esum(v1_i)
     v2 = dy.esum(v2_j)
     ret = self.aggregate.evaluate_network(dy.concatenate([v1, v2]), False,
                                           dropout)
     return ret
Code Example #22
    def fit(self,
            train_X,
            train_Y,
            num_epochs,
            val_X=None,
            val_Y=None,
            patience=2,
            model_path=None,
            seed=None,
            word_dropout_rate=0.25,
            trg_vectors=None,
            unsup_weight=1.0,
            variance_weights=None,
            labeled_weight_proportion=1.0):
        """
        train the tagger
        :param trg_vectors: the prediction targets used for the unsupervised loss
                            in temporal ensembling
        :param unsup_weight: weight for the unsupervised consistency loss
                                    used in temporal ensembling
        :param clip_threshold: use gradient clipping with threshold (on if >0; default: 5.0)
        :param labeled_weight_proportion: proportion of the unsupervised weight
                                          that should be assigned to labeled
                                          examples
        """
        print("read training data", file=sys.stderr)

        if variance_weights is not None:
            print('First 20 variance weights:', variance_weights[:20])

        if seed:
            print(">>> using seed: ", seed, file=sys.stderr)
            random.seed(seed)  #setting random seed

        # if we use word dropout keep track of counts
        if word_dropout_rate > 0.0:
            widCount = Counter()
            for sentence, _ in train_X:
                widCount.update([w for w in sentence])

        assert (len(train_X) == len(train_Y))
        train_data = list(zip(train_X, train_Y))

        # if we use target vectors, keep track of the targets per sentence
        if trg_vectors is not None:
            trg_start_id = 0
            sentence_trg_vectors = []
            sentence_var_weights = []
            for i, (example, y) in enumerate(train_data):
                sentence_trg_vectors.append(
                    trg_vectors[trg_start_id:trg_start_id +
                                len(example[0]), :])
                if variance_weights is not None:
                    sentence_var_weights.append(
                        variance_weights[trg_start_id:trg_start_id +
                                         len(example[0])])
                trg_start_id += len(example[0])
            assert trg_start_id == len(trg_vectors),\
                'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors))
            assert len(sentence_trg_vectors) == len(train_X)
            if variance_weights is not None:
                assert trg_start_id == len(variance_weights)
                assert len(sentence_var_weights) == len(train_X)

        print('Starting training for {} epochs...'.format(num_epochs))
        best_val_acc, epochs_no_improvement = 0., 0
        if val_X is not None and val_Y is not None and model_path is not None:
            print(
                'Using early stopping with patience of {}...'.format(patience))

        for cur_iter in range(num_epochs):
            bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1,
                                                       num_epochs),
                      max=len(train_data),
                      flush=True)
            total_loss = 0.0
            total_tagged = 0.0

            total_other_loss, total_other_loss_weighted = 0.0, 0.0

            random_indices = np.arange(len(train_data))
            random.shuffle(random_indices)

            for i, idx in enumerate(random_indices):
                (word_indices, char_indices), y = train_data[idx]

                if word_dropout_rate > 0.0:
                    word_indices = [
                        self.w2i["_UNK"] if
                        (random.random() >
                         (widCount.get(w) /
                          (word_dropout_rate + widCount.get(w)))) else w
                        for w in word_indices
                    ]
                output = self.predict(word_indices, char_indices, train=True)

                if len(y) == 1 and y[0] == 0:
                    # in temporal ensembling, we assign a dummy label of [0] for
                    # unlabeled sequences; we skip the supervised loss for these
                    loss = dynet.scalarInput(0)
                else:
                    loss = dynet.esum([
                        self.pick_neg_log(pred, gold)
                        for pred, gold in zip(output, y)
                    ])

                if trg_vectors is not None:
                    # the consistency loss in temporal ensembling is used for
                    # both supervised and unsupervised input
                    targets = sentence_trg_vectors[idx]
                    assert len(output) == len(targets)
                    if variance_weights is not None:
                        var_weights = sentence_var_weights[idx]
                        assert len(output) == len(var_weights)
                        # multiply the normalized mean variance with each loss
                        other_loss = dynet.esum([
                            v * dynet.squared_distance(o, dynet.inputVector(t))
                            for o, t, v in zip(output, targets, var_weights)
                        ])
                    else:
                        other_loss = dynet.esum([
                            dynet.squared_distance(o, dynet.inputVector(t))
                            for o, t in zip(output, targets)
                        ])

                    total_other_loss += other_loss.value()
                    if len(y) == 1 and y[0] == 0:  #unlab_ex
                        other_loss += other_loss * unsup_weight
                    else:  #lab_ex
                        # assign the unsupervised weight for labeled examples
                        other_loss += other_loss * unsup_weight * labeled_weight_proportion
                    # keep track for logging
                    total_loss += loss.value()  # main loss
                    total_tagged += len(word_indices)
                    total_other_loss_weighted += other_loss.value()

                    # combine losses
                    loss += other_loss

                else:
                    # keep track for logging
                    total_loss += loss.value()
                    total_tagged += len(word_indices)

                loss.backward()
                self.trainer.update()
                bar.next()

            if trg_vectors is None:
                print("iter {2} {0:>12}: {1:.2f}".format(
                    "total loss", total_loss / total_tagged, cur_iter),
                      file=sys.stderr)
            else:
                print(
                    "iter {2} {0:>12}: {1:.2f} unsupervised loss: {3:.2f} (weighted: {4:.2f})"
                    .format("supervised loss", total_loss / total_tagged,
                            cur_iter, total_other_loss / total_tagged,
                            total_other_loss_weighted / total_tagged),
                    file=sys.stderr)

            if val_X is not None and val_Y is not None and model_path is not None:
                # get the best accuracy on the validation set
                val_correct, val_total = self.evaluate(val_X, val_Y)
                val_accuracy = val_correct / val_total

                if val_accuracy > best_val_acc:
                    print(
                        'Accuracy {:.4f} is better than best val accuracy {:.4f}'
                        .format(val_accuracy, best_val_acc))
                    best_val_acc = val_accuracy
                    epochs_no_improvement = 0
                    save_tagger(self, model_path)
                else:
                    print(
                        'Accuracy {:.4f} is worse than best val accuracy {:.4f}.'.
                        format(val_accuracy, best_val_acc))
                    epochs_no_improvement += 1
                if epochs_no_improvement == patience:
                    print('No improvement for {} epochs. Early stopping...'.
                          format(epochs_no_improvement))
                    break
Code Example #23
def main():
    parser = argparse.ArgumentParser(description='A Neural Attention Model for Abstractive Sentence Summarization in DyNet')

    parser.add_argument('--gpu', type=str, default='0', help='GPU ID to use. For CPU, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=10, help='Number of epochs [default: 10]')
    parser.add_argument('--n_train', type=int, default=3803957, help='Number of training examples (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651, help='Number of validation examples (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32, help='Mini batch size [default: 32]')
    parser.add_argument('--vocab_size', type=int, default=60000, help='Vocabulary size [default: 60000]')
    parser.add_argument('--emb_dim', type=int, default=256, help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256, help='Hidden state size [default: 256]')
    parser.add_argument('--encoder_type', type=str, default='attention', help='Encoder type. bow: Bag-of-words encoder. attention: Attention-based encoder [default: attention]')
    parser.add_argument('--c', type=int, default=5, help='Window size in neural language model [default: 5]')
    parser.add_argument('--q', type=int, default=2, help='Window size in attention-based encoder [default: 2]')
    parser.add_argument('--alloc_mem', type=int, default=4096, help='Amount of memory to allocate [mb] [default: 4096]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS     = args.n_epochs
    N_TRAIN      = args.n_train
    N_VALID      = args.n_valid
    BATCH_SIZE   = args.batch_size
    VOCAB_SIZE   = args.vocab_size
    EMB_DIM      = args.emb_dim
    HID_DIM      = args.hid_dim
    ENCODER_TYPE = args.encoder_type
    C            = args.c
    Q            = args.q
    ALLOC_MEM    = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset ====================================================================================
    w2c = build_word2count(TRAIN_X_FILE, n_data=N_TRAIN)
    w2c = build_word2count(TRAIN_Y_FILE, w2c=w2c, n_data=N_TRAIN)

    train_X, w2i, i2w = build_dataset(TRAIN_X_FILE, w2c=w2c, padid=False, eos=True, unksym='<unk>', target=False, n_data=N_TRAIN, vocab_size=VOCAB_SIZE)
    train_y, _, _     = build_dataset(TRAIN_Y_FILE, w2i=w2i, target=True, n_data=N_TRAIN)

    valid_X, _, _ = build_dataset(VALID_X_FILE, w2i=w2i, target=False, n_data=N_VALID)
    valid_y, _, _ = build_dataset(VALID_Y_FILE, w2i=w2i, target=True, n_data=N_VALID)

    VOCAB_SIZE = len(w2i)
    OUT_DIM = VOCAB_SIZE
    print('VOCAB_SIZE:', VOCAB_SIZE)

    # Build model ======================================================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    rush_abs = ABS(model, EMB_DIM, HID_DIM, VOCAB_SIZE, Q, C, encoder_type=ENCODER_TYPE)

    # Padding
    train_y = [[w2i['<s>']]*(C-1)+instance_y for instance_y in train_y]
    valid_y = [[w2i['<s>']]*(C-1)+instance_y for instance_y in valid_y]

    n_batches_train = math.ceil(len(train_X)/BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_X)/BATCH_SIZE)

    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        train_X, train_y = shuffle(train_X, train_y)
        loss_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            rush_abs.associate_parameters()

            # Create a mini batch
            start = i*BATCH_SIZE
            end = start + BATCH_SIZE
            train_X_mb = train_X[start:end]
            train_y_mb = train_y[start:end]

            losses = []
            for x, t in zip(train_X_mb, train_y_mb):
                t_in, t_out = t[:-1], t[C:]

                y = rush_abs(x, t_in)
                loss = dy.esum([dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid
        loss_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            rush_abs.associate_parameters()

            # Create a mini batch
            start = i*BATCH_SIZE
            end = start + BATCH_SIZE
            valid_X_mb = valid_X[start:end]
            valid_y_mb = valid_y[start:end]

            losses = []
            for x, t in zip(valid_X_mb, valid_y_mb):
                t_in, t_out = t[:-1], t[C:]

                y = rush_abs(x, t_in)
                loss = dy.esum([dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f' % (
            epoch+1,
            np.mean(loss_all_train),
            np.mean(loss_all_valid)
        ))

        # Save model ========================================================================
        dy.save('./model_e'+str(epoch+1), [rush_abs])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(w2i, f_w2i)
            pickle.dump(i2w, f_i2w)
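
The training loop in this example follows the usual DyNet autobatching recipe: one dy.renew_cg() per mini batch, one loss expression per instance, dy.average over the batch, then a single backward pass and trainer.update(). Below is a stripped-down sketch of that recipe with a toy bag-of-embeddings model; the model, vocabulary size, and data are illustrative assumptions, not the ABS model.

import dynet as dy

# autobatching is enabled above via dy.DynetParams().set_autobatch(True);
# it can also be enabled with the command-line flag --dynet-autobatch 1
model = dy.Model()
trainer = dy.AdamTrainer(model)
E = model.add_lookup_parameters((100, 16))   # toy vocabulary of 100 ids
W = model.add_parameters((100, 16))

data = [([1, 2, 3], 4), ([5, 6], 7)] * 8     # toy (input ids, gold id) pairs
BATCH_SIZE = 4

for i in range(0, len(data), BATCH_SIZE):
    dy.renew_cg()                            # one computation graph per mini batch
    losses = []
    for ids, gold in data[i:i + BATCH_SIZE]:
        h = dy.average([E[t] for t in ids])  # toy bag-of-embeddings encoder
        scores = dy.parameter(W) * h
        losses.append(dy.pickneglogsoftmax(scores, gold))
    mb_loss = dy.average(losses)             # scalar mini-batch loss
    mb_loss.backward()                       # single backward pass for the whole batch
    trainer.update()
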
Code example #24
File: structbilty.py Project: bplank/bilstm-aux
    def fit(self, train, num_iterations, dev=None, model_path=None, patience=0, minibatch_size=0, log_losses=False):
        """
        train the tagger
        """
        losses_log = {} # log losses

        print("init parameters")
        self.init_parameters(train)

        # init lookup parameters and define graph
        print("build graph")
        self.build_computation_graph(len(self.w2i),  len(self.c2i))

        update_embeds = True
        if self.backprob_embeds == False: ## disable backprop into embeds
            print(">>> disable wembeds update <<<")
            update_embeds = False
            
        best_val_acc, epochs_no_improvement = 0.0, 0

        if dev and model_path is not None and patience > 0:
            print('Using early stopping with patience of {}...'.format(patience))

        batch = []
        print("train..")
        for iteration in range(num_iterations):

            total_loss=0.0
            total_tagged=0.0

            indices = [i for i in range(len(train.seqs))]
            random.shuffle(indices)

            loss_accum_loss = defaultdict(float)
            loss_accum_tagged = defaultdict(float)

            for idx in indices:
                seq = train.seqs[idx]

                if seq.task_id not in losses_log:
                    losses_log[seq.task_id] = [] #initialize

                if minibatch_size > 1:
                    # accumulate instances for minibatch update
                    loss1 = self.predict(seq, train=True, update_embeds=update_embeds)
                    total_tagged += len(seq.words)
                    batch.append(loss1)
                    if len(batch) == minibatch_size:
                        loss = dynet.esum(batch)
                        total_loss += loss.value()

                        # logging
                        loss_accum_tagged[seq.task_id] += len(seq.words)
                        loss_accum_loss[seq.task_id] += loss.value()

                        loss.backward()
                        self.trainer.update()
                        dynet.renew_cg()  # use new computational graph for each BATCH when batching is active
                        batch = []
                else:
                    dynet.renew_cg() # new graph per item
                    loss1 = self.predict(seq, train=True, update_embeds=update_embeds)
                    total_tagged += len(seq.words)
                    lv = loss1.value()
                    total_loss += lv

                    # logging
                    loss_accum_tagged[seq.task_id] += len(seq.words)
                    loss_accum_loss[seq.task_id] += loss1.value()

                    loss1.backward()
                    self.trainer.update()

            print("iter {2} {0:>12}: {1:.2f}".format("total loss", total_loss/total_tagged, iteration))

            # log losses
            for task_id in sorted(losses_log):
                losses_log[task_id].append(loss_accum_loss[task_id] / loss_accum_tagged[task_id])

            if log_losses:
                dill.dump(losses_log, open(model_path + ".model" + ".losses.pickle", "wb"))

            if dev:
                # evaluate after every epoch
                correct, total = self.evaluate(dev, "task0")
                val_accuracy = correct/total
                print("dev accuracy: {0:.4f}".format(val_accuracy))

                if val_accuracy > best_val_acc:
                    print('Accuracy {0:.4f} is better than best val accuracy '
                          '{1:.4f}.'.format(val_accuracy, best_val_acc))
                    best_val_acc = val_accuracy
                    epochs_no_improvement = 0
                    save(self, model_path)
                else:
                    print('Accuracy {0:.4f} is worse than best val accuracy {1:.4f}.'.format(val_accuracy, best_val_acc))
                    epochs_no_improvement += 1

                if patience > 0:
                    if epochs_no_improvement == patience:
                        print('No improvement for {} epochs. Early stopping...'.format(epochs_no_improvement))
                        break
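
fit() above implements early stopping with patience: keep the best dev accuracy seen so far, save and reset the counter on improvement, otherwise increment it, and stop once the counter reaches patience. The same control flow in isolation is sketched below with stubbed evaluation and saving; evaluate_stub and save_stub are placeholders, not the repository's API.

import random

def evaluate_stub(epoch):
    # stand-in for self.evaluate(dev, "task0"); returns a fake dev accuracy
    return 0.5 + 0.04 * epoch + random.uniform(-0.02, 0.02)

def save_stub(path):
    print("saving model to", path)

def train_with_patience(num_iterations, patience, model_path="model"):
    best_val_acc, epochs_no_improvement = 0.0, 0
    for iteration in range(num_iterations):
        # ... one training epoch would run here ...
        val_accuracy = evaluate_stub(iteration)
        if val_accuracy > best_val_acc:
            best_val_acc = val_accuracy
            epochs_no_improvement = 0
            save_stub(model_path)            # keep only the best checkpoint
        else:
            epochs_no_improvement += 1
        if patience > 0 and epochs_no_improvement == patience:
            print("No improvement for {} epochs. Early stopping...".format(patience))
            break

train_with_patience(num_iterations=20, patience=3)
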
Code example #25
File: network.py Project: kishkash555/biu
    def train_network(self, train_data, epochs = 3, dev_data = None, test_data = None):
        trainer = dy.SimpleSGDTrainer(self.pc,0.05)
        i = 0
        mloss = 0.
        goods = 0.
        loss = []
        dy.renew_cg()
 
        max_dev_acc = MIN_SAVE_ACC
        run_id = randint(0,9999)
        save_path = "{}{:04d}".format(SAVE_TO,run_id)
        report_path = "{}{:04d}.txt".format(SAVE_REPORT_TO,run_id)
        test_path = "{}{:04d}.txt".format(SAVE_TAGGED_TEST_TO,run_id)
        rprt = open(report_path,'wt')
        print(report_path)
        for e in range(epochs):
            shuffle(train_data)
            for x, y in train_data:
                i = i + 1
                loss = loss + [self.eval_loss(x, y, dropout=True)]
                good = y == self.last_case_class
                goods += int(good)
                if i % UPDATE_EVERY == 0:
                    losses = dy.esum(loss)
                    mloss += losses.value()
                    losses.backward()
                    trainer.update()
                    loss = []
                    dy.renew_cg()
    
                if i % EVALUATE_LOSS_EVERY == 1000:
                    goods_dev = 0.
                    j = 0
                    for d in dev_data or []:
                        dy.renew_cg()
                        j+=1
                        x, y = d
                        self.eval_loss(x, y)
                        goods_dev += 1 if y==self.last_case_class else 0
                    dev_acc = goods_dev / len(dev_data or 'a')  # len('a') == 1 avoids division by zero when dev_data is empty

                    message = "{} average loss after {} iterations: {} acc: {}".format(
                        now_string(), i, mloss/EVALUATE_LOSS_EVERY, goods/EVALUATE_LOSS_EVERY)
                    dev_acc_str = " dev acc: {}".format(dev_acc) if dev_data else ""
                    print(message + dev_acc_str)
                    rprt.write(message + dev_acc_str+'\n')
                    mloss = 0.
                    goods = 0.

                    if dev_acc > max_dev_acc and i > START_SAVE_AFTER:
                        max_dev_acc = dev_acc
                        print("saving.")
                        rprt.write("saving.\n")
                        self.save(save_path)
                        if test_data:
                            outf = open(test_path,'wt')
                            k = 0
                            goods_test = 0.
                            print("tagging test data.")
                            for dd in test_data:
                                dy.renew_cg()
                                k += 1
                                x, y = dd
                                self.eval_loss(x,y)
                                y_hat = self.last_case_class
                                goods_test += 1 if y == y_hat else 0
                                outf.write("{}{}{}\n".format(x, y, y_hat))
                            outf.close()
                            test_acc = goods_test / len(test_data)
                            print("accurcy on test: {}".format(test_acc))



                rprt.flush()
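
train_network above batches updates by hand: it accumulates per-instance loss expressions and, every UPDATE_EVERY instances, sums them with dy.esum, runs one backward pass and update, and renews the computation graph. A compact sketch of that accumulation pattern follows, with a toy classifier standing in for self.eval_loss; all names and dimensions here are illustrative assumptions.

import dynet as dy

model = dy.Model()
trainer = dy.SimpleSGDTrainer(model, 0.05)
E = model.add_lookup_parameters((50, 8))            # toy lookup table
W = model.add_parameters((2, 8))                    # toy binary classifier

UPDATE_EVERY = 4
train_data = [(i % 50, i % 2) for i in range(20)]   # toy (id, label) pairs

dy.renew_cg()
pending = []
for i, (x, y) in enumerate(train_data, 1):
    scores = dy.parameter(W) * E[x]                 # stand-in for self.eval_loss(x, y)
    pending.append(dy.pickneglogsoftmax(scores, y))
    if i % UPDATE_EVERY == 0:
        batch_loss = dy.esum(pending)               # sum the accumulated losses
        batch_loss.backward()
        trainer.update()
        pending = []
        dy.renew_cg()                               # expressions from the old graph must not be reused
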
Code example #26
File: structbilty.py Project: SigridK/bilstm-aux
    def fit(self,
            train,
            num_iterations,
            dev=None,
            model_path=None,
            patience=0,
            minibatch_size=0,
            log_losses=False):
        """
        train the tagger
        """
        losses_log = {}  # log losses

        print("init parameters")
        self.init_parameters(train)

        # init lookup parameters and define graph
        print("build graph")
        self.build_computation_graph(len(self.w2i), len(self.c2i))

        update_embeds = True
        if self.backprob_embeds == False:  ## disable backprop into embeds
            print(">>> disable wembeds update <<<")
            update_embeds = False

        best_val_acc, epochs_no_improvement = 0.0, 0

        if dev and model_path is not None and patience > 0:
            print(
                'Using early stopping with patience of {}...'.format(patience))

        batch = []
        print("train..")
        for iteration in range(num_iterations):

            total_loss = 0.0
            total_tagged = 0.0

            indices = [i for i in range(len(train.seqs))]
            random.shuffle(indices)

            loss_accum_loss = defaultdict(float)
            loss_accum_tagged = defaultdict(float)

            for idx in indices:
                seq = train.seqs[idx]

                if seq.task_id not in losses_log:
                    losses_log[seq.task_id] = []  #initialize

                if minibatch_size > 1:
                    # accumulate instances for minibatch update
                    loss1 = self.predict(seq,
                                         train=True,
                                         update_embeds=update_embeds)
                    total_tagged += len(seq.words)
                    batch.append(loss1)
                    if len(batch) == minibatch_size:
                        loss = dynet.esum(batch)
                        total_loss += loss.value()

                        # logging
                        loss_accum_tagged[seq.task_id] += len(seq.words)
                        loss_accum_loss[seq.task_id] += loss.value()

                        loss.backward()
                        self.trainer.update()
                        dynet.renew_cg()  # use new computational graph for each BATCH when batching is active
                        batch = []
                else:
                    dynet.renew_cg()  # new graph per item
                    loss1 = self.predict(seq,
                                         train=True,
                                         update_embeds=update_embeds)
                    total_tagged += len(seq.words)
                    lv = loss1.value()
                    total_loss += lv

                    # logging
                    loss_accum_tagged[seq.task_id] += len(seq.words)
                    loss_accum_loss[seq.task_id] += loss1.value()

                    loss1.backward()
                    self.trainer.update()

            print("iter {2} {0:>12}: {1:.2f}".format("total loss",
                                                     total_loss / total_tagged,
                                                     iteration))

            # log losses
            for task_id in sorted(losses_log):
                losses_log[task_id].append(loss_accum_loss[task_id] /
                                           loss_accum_tagged[task_id])

            if log_losses:
                dill.dump(losses_log,
                          open(model_path + ".model" + ".losses.pickle", "wb"))

            if dev:
                # evaluate after every epoch
                correct, total = self.evaluate(dev, "task0")
                val_accuracy = correct / total
                print("dev accuracy: {0:.4f}".format(val_accuracy))

                if val_accuracy > best_val_acc:
                    print('Accuracy {0:.4f} is better than best val accuracy '
                          '{1:.4f}.'.format(val_accuracy, best_val_acc))
                    best_val_acc = val_accuracy
                    epochs_no_improvement = 0
                    save(self, model_path)
                else:
                    print(
                        'Accuracy {0:.4f} is worse than best val accuracy {1:.4f}.'
                        .format(val_accuracy, best_val_acc))
                    epochs_no_improvement += 1

                if patience > 0:
                    if epochs_no_improvement == patience:
                        print(
                            'No improvement for {} epochs. Early stopping...'.
                            format(epochs_no_improvement))
                        break
Code example #27
File: AttentionBasedDecoder.py Project: ufwt/TraFix
    def compute_decoder_batch_loss(self, encoded_inputs, input_masks,
                                   output_word_ids, output_masks, batch_size):
        self.readout = dn.parameter(self.params['readout'])
        self.bias = dn.parameter(self.params['bias'])
        self.w_c = dn.parameter(self.params['w_c'])
        self.u_a = dn.parameter(self.params['u_a'])
        self.v_a = dn.parameter(self.params['v_a'])
        self.w_a = dn.parameter(self.params['w_a'])

        # initialize the decoder rnn
        s_0 = self.decoder_rnn.initial_state()

        # initial "input feeding" vectors to feed decoder - 3*h
        init_input_feeding = dn.lookup_batch(self.init_lookup,
                                             [0] * batch_size)

        # initial feedback embeddings for the decoder, use begin seq symbol embedding
        init_feedback = dn.lookup_batch(
            self.output_lookup, [self.y2int[common.BEGIN_SEQ]] * batch_size)

        # init decoder rnn
        decoder_init = dn.concatenate([init_feedback, init_input_feeding])
        s = s_0.add_input(decoder_init)

        # loss per timestep
        losses = []

        # run the decoder through the output sequences and aggregate loss
        for i, step_word_ids in enumerate(output_word_ids):

            # returns h x batch size matrix
            decoder_rnn_output = s.output()

            # compute attention context vector for each sequence in the batch (returns 2h x batch size matrix)
            attention_output_vector, alphas = self.attend(
                encoded_inputs, decoder_rnn_output, input_masks)

            # compute output scores (returns vocab_size x batch size matrix)
            # h = readout * attention_output_vector + bias
            h = dn.affine_transform(
                [self.bias, self.readout, attention_output_vector])

            # encourage diversity by punishing highly confident predictions
            # TODO: support batching - esp. w.r.t. scalar inputs
            if self.diverse:
                soft = dn.softmax(dn.tanh(h))
                batch_loss = dn.pick_batch(-dn.log(soft), step_word_ids) \
                    - dn.log(dn.scalarInput(1) - dn.pick_batch(soft, step_word_ids)) - dn.log(dn.scalarInput(4))
            else:
                # get batch loss for this timestep
                batch_loss = dn.pickneglogsoftmax_batch(h, step_word_ids)

            # mask the loss if at least one sentence is shorter
            if output_masks and output_masks[i][-1] != 1:
                mask_expr = dn.inputVector(output_masks[i])
                # noinspection PyArgumentList
                mask_expr = dn.reshape(mask_expr, (1, ), batch_size)
                batch_loss = batch_loss * mask_expr

            # input feeding approach - input h (attention_output_vector) to the decoder
            # prepare for the next iteration - "feedback"
            feedback_embeddings = dn.lookup_batch(self.output_lookup,
                                                  step_word_ids)
            decoder_input = dn.concatenate(
                [feedback_embeddings, attention_output_vector])
            s = s.add_input(decoder_input)

            losses.append(batch_loss)

        # sum the loss over the time steps and batch
        total_batch_loss = dn.sum_batches(dn.esum(losses))

        return total_batch_loss
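
compute_decoder_batch_loss masks padded positions by multiplying the batched per-timestep loss with a 0/1 mask reshaped to a (1,)-dimensional expression with a batch dimension. A self-contained sketch of that masking step on dummy scores is shown below; the dimensions, gold ids, and mask values are illustrative.

import numpy as np
import dynet as dy

dy.renew_cg()
batch_size = 3
vocab_size = 5

# dummy per-timestep scores: a (vocab_size,) expression with a batch dimension of 3
scores = dy.inputTensor(np.random.randn(vocab_size, batch_size), batched=True)
gold_ids = [2, 0, 4]                                # one gold symbol per sequence in the batch

batch_loss = dy.pickneglogsoftmax_batch(scores, gold_ids)

# mask: 1 for real tokens, 0 where a sequence has already ended
mask_expr = dy.inputVector([1.0, 1.0, 0.0])
mask_expr = dy.reshape(mask_expr, (1,), batch_size) # move the mask values into the batch dimension
batch_loss = batch_loss * mask_expr                 # zero out the loss of finished sequences

total = dy.sum_batches(batch_loss)                  # collapse the batch dimension to a scalar
print(total.value())
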
Code example #28
    def __call__(self, x=None, t=None, test=False):
        if test:
            tt_embs = [dy.lookup(self.E, t_t) for t_t in t]

            if self.encoder_type == 'bow':
                # Neural language model
                tt_c = dy.concatenate(tt_embs)
                h = dy.tanh(self.U * tt_c)

                # Output with softmax
                y_t = dy.softmax(self.V * h + self.W_enc)

            elif self.encoder_type == 'attention':
                ttp_embs = [dy.lookup(self.G, t_t) for t_t in t]

                # Neural language model
                tt_c = dy.concatenate(tt_embs)
                h = dy.tanh(self.U * tt_c)

                # Attention
                ttp_c = dy.concatenate(ttp_embs)
                p = dy.softmax(self.xt * self.P * ttp_c)  # Attention weight
                enc = self.xb * p  # Context vector

                # Output with softmax
                y_t = dy.softmax(self.V * h + self.W * enc)

            return y_t

        else:
            xt_embs = [dy.lookup(self.F, x_t) for x_t in x]
            tt_embs = [dy.lookup(self.E, t_t) for t_t in t]

            y = []
            if self.encoder_type == 'bow':
                # BoW
                enc = dy.average(xt_embs)
                W_enc = self.W * enc
                for i in range(len(t) - self.c + 1):
                    # Neural language model
                    tt_c = dy.concatenate(tt_embs[i:i + self.c])
                    h = dy.tanh(self.U * tt_c)

                    # Output without softmax
                    y_t = self.V * h + W_enc
                    y.append(y_t)

            elif self.encoder_type == 'attention':
                # Smoothed source embeddings: sum over a window of size q around each position, scaled by 1/q
                xb = dy.concatenate([
                    dy.esum(xt_embs[max(i - self.q, 0):min(len(x), i + self.q + 1)]) / self.q
                    for i in range(len(x))
                ], d=1)
                xt = dy.transpose(dy.concatenate(xt_embs, d=1))
                ttp_embs = [dy.lookup(self.G, t_t) for t_t in t]

                for i in range(len(t) - self.c + 1):
                    # Neural language model
                    tt_c = dy.concatenate(tt_embs[i:i + self.c])
                    h = dy.tanh(self.U * tt_c)

                    # Attention
                    ttp_c = dy.concatenate(
                        ttp_embs[i:i + self.c])  # Window-sized embedding
                    p = dy.softmax(xt * self.P * ttp_c)  # Attention weight
                    enc = xb * p  # Context vector

                    # Output without softmax
                    y_t = self.V * h + self.W * enc
                    y.append(y_t)

            return y
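
In the attention branch above, the context vector enc is a weighted combination of the (window-smoothed) source embeddings xb, with weights p obtained from a softmax over the bilinear score xt * P * ttp_c between source embeddings and the window of previous output embeddings. Below is a minimal numeric sketch of that bilinear attention, using raw (unsmoothed) source embeddings and toy sizes; every name and dimension is an illustrative assumption.

import dynet as dy

model = dy.Model()
emb_dim, window = 8, 3

P = model.add_parameters((emb_dim, window * emb_dim))  # bilinear attention matrix
E = model.add_lookup_parameters((100, emb_dim))        # shared toy embedding table

dy.renew_cg()
src_ids = [3, 14, 15, 9, 26, 5]                        # toy source sentence
ctx_ids = [7, 1, 8]                                    # window of previous output symbols

xt_embs = [E[i] for i in src_ids]
xb = dy.concatenate(xt_embs, d=1)                      # emb_dim x src_len matrix of source embeddings
xt = dy.transpose(xb)                                  # src_len x emb_dim
ttp_c = dy.concatenate([E[i] for i in ctx_ids])        # (window * emb_dim) context embedding

p = dy.softmax(xt * dy.parameter(P) * ttp_c)           # attention weights over source positions
enc = xb * p                                           # context vector: weighted sum of source embeddings
print(enc.dim())                                       # ((emb_dim,), 1)
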
Code example #29
File: train.py Project: y5460y/li_emnlp_2017
def main():
    parser = argparse.ArgumentParser(
        description='Deep Recurrent Generative Decoder for Abstractive Text Summarization in DyNet')

    parser.add_argument('--gpu', type=str, default='0',
                        help='GPU ID to use. For CPU, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=3,
                        help='Number of epochs [default: 3]')
    parser.add_argument('--n_train', type=int, default=3803957,
                        help='Number of training examples (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651,
                        help='Number of validation examples (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Mini batch size [default: 32]')
    parser.add_argument('--emb_dim', type=int, default=256,
                        help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256,
                        help='Hidden state size [default: 256]')
    parser.add_argument('--lat_dim', type=int, default=256,
                        help='Latent size [default: 256]')
    parser.add_argument('--alloc_mem', type=int, default=8192,
                        help='Amount of memory to allocate [mb] [default: 8192]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = 60000
    EMB_DIM = args.emb_dim
    HID_DIM = args.hid_dim
    LAT_DIM = args.lat_dim
    ALLOC_MEM = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset ====================================================================================
    w2c = build_word2count(TRAIN_X_FILE, n_data=N_TRAIN)
    w2c = build_word2count(TRAIN_Y_FILE, w2c=w2c, n_data=N_TRAIN)

    train_X, w2i, i2w = build_dataset(TRAIN_X_FILE,
                                      w2c=w2c,
                                      padid=False,
                                      eos=True,
                                      unksym='<unk>',
                                      target=False,
                                      n_data=N_TRAIN,
                                      vocab_size=VOCAB_SIZE)
    train_y, _, _ = build_dataset(TRAIN_Y_FILE,
                                  w2i=w2i,
                                  target=True,
                                  n_data=N_TRAIN)

    valid_X, _, _ = build_dataset(VALID_X_FILE,
                                  w2i=w2i,
                                  target=False,
                                  n_data=N_VALID)
    valid_y, _, _ = build_dataset(VALID_Y_FILE,
                                  w2i=w2i,
                                  target=True,
                                  n_data=N_VALID)

    VOCAB_SIZE = len(w2i)
    OUT_DIM = VOCAB_SIZE
    print(VOCAB_SIZE)

    # Build model ======================================================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    V = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))

    encoder = BiGRU(model, EMB_DIM, 2 * HID_DIM)
    decoder = RecurrentGenerativeDecoder(model, EMB_DIM, 2 * HID_DIM, LAT_DIM,
                                         OUT_DIM)

    # Train model =======================================================================================
    n_batches_train = math.ceil(len(train_X) / BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_X) / BATCH_SIZE)

    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        train_X, train_y = shuffle(train_X, train_y)
        loss_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            encoder.associate_parameters()
            decoder.associate_parameters()

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            train_X_mb = train_X[start:end]
            train_y_mb = train_y[start:end]

            losses = []
            for x, t in zip(train_X_mb, train_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                he = encoder(x_embs)

                # Decoder
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                decoder.set_initial_states(he)
                y, KL = decoder(t_embs)

                loss = dy.esum([
                    dy.pickneglogsoftmax(y_t, t_t) + KL_t
                    for y_t, t_t, KL_t in zip(y, t_out, KL)
                ])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid
        loss_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            encoder.associate_parameters()
            decoder.associate_parameters()

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            valid_X_mb = valid_X[start:end]
            valid_y_mb = valid_y[start:end]

            losses = []
            for x, t in zip(valid_X_mb, valid_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                he = encoder(x_embs)

                # Decoder
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                decoder.set_initial_states(he)
                y, KL = decoder(t_embs)

                loss = dy.esum([
                    dy.pickneglogsoftmax(y_t, t_t) + KL_t
                    for y_t, t_t, KL_t in zip(y, t_out, KL)
                ])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f' %
              (epoch + 1, np.mean(loss_all_train), np.mean(loss_all_valid)))

        # Save model ======================================================================================
        dy.save('./model_e' + str(epoch + 1), [V, encoder, decoder])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump',
                                                     'wb') as f_i2w:
            pickle.dump(w2i, f_w2i)
            pickle.dump(i2w, f_i2w)
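
Each per-token loss in this example adds a KL term from the recurrent generative decoder to the cross-entropy. For reference, below is a minimal sketch of the standard closed-form KL divergence between a diagonal Gaussian posterior and a standard-normal prior, computed in DyNet; the toy mu and log_var vectors are illustrative and not how the repository's decoder produces them.

import dynet as dy

dy.renew_cg()

# toy posterior parameters q(z|x) = N(mu, diag(exp(log_var)))
mu = dy.inputVector([0.1, -0.3, 0.2, 0.0])
log_var = dy.inputVector([-0.5, -0.1, 0.0, -0.2])

# KL(q || N(0, I)) = 0.5 * sum(mu^2 + sigma^2 - log sigma^2 - 1)
kl = 0.5 * dy.sum_elems(dy.square(mu) + dy.exp(log_var) - log_var - 1)
print(kl.value())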