Exemple #1
0
def loss_func(ps1, ps2, s1, s2, po1, po2, o1, o2, mask):
    ps1 = layers.concat([1 - ps1, ps1], axis=-1)
    ps2 = layers.concat([1 - ps2, ps2], axis=-1)
    s1 = layers.unsqueeze(s1, -1)
    s2 = layers.unsqueeze(s2, -1)

    s1_loss = layers.cross_entropy(ps1, s1)
    s1_loss = layers.reduce_sum(s1_loss * mask) / layers.reduce_sum(mask)

    s2_loss = layers.cross_entropy(ps2, s2)
    s2_loss = layers.reduce_sum(s2_loss * mask) / layers.reduce_sum(mask)

    po1, o1 = layers.unsqueeze(po1, -1), layers.unsqueeze(o1, -1)
    po1 = layers.concat([1 - po1, po1], axis=-1)
    o1_loss = layers.reduce_sum(layers.cross_entropy(po1, o1), 2)
    o1_loss = layers.reduce_sum(o1_loss * mask) / layers.reduce_sum(mask)

    po2, o2 = layers.unsqueeze(po2, -1), layers.unsqueeze(o2, -1)
    po2 = layers.concat([1 - po2, po2], axis=-1)
    o2_loss = layers.reduce_sum(layers.cross_entropy(po2, o2), 2)
    o2_loss = layers.reduce_sum(o2_loss * mask) / layers.reduce_sum(mask)

    loss = (s1_loss + s2_loss) + (o1_loss + o2_loss)

    return loss
Exemple #2
0
def rc_model(hidden_size, vocab, args):
    emb_shape = [vocab.size(), vocab.embed_dim]
    start_labels = layers.data(name="start_lables",
                               shape=[1],
                               dtype='float32',
                               lod_level=1)
    end_labels = layers.data(name="end_lables",
                             shape=[1],
                             dtype='float32',
                             lod_level=1)

    # stage 1:encode
    q_id0 = get_data('q_id0', 1, args)

    q_ids = get_data('q_ids', 2, args)
    p_ids_name = 'p_ids'

    p_ids = get_data('p_ids', 2, args)
    p_embs = embedding(p_ids, emb_shape, args)
    q_embs = embedding(q_ids, emb_shape, args)
    drnn = layers.DynamicRNN()
    with drnn.block():
        p_emb = drnn.step_input(p_embs)
        q_emb = drnn.step_input(q_embs)

        p_enc = encoder(p_emb, 'p_enc', hidden_size, args)
        q_enc = encoder(q_emb, 'q_enc', hidden_size, args)

        # stage 2:match
        g_i = attn_flow(q_enc, p_enc, p_ids_name, args)
        # stage 3:fusion
        m_i = fusion(g_i, args)
        drnn.output(m_i, q_enc)

    ms, q_encs = drnn()
    p_vec = layers.lod_reset(x=ms, y=start_labels)
    q_vec = layers.lod_reset(x=q_encs, y=q_id0)

    # stage 4:decode
    start_probs, end_probs = point_network_decoder(p_vec=p_vec,
                                                   q_vec=q_vec,
                                                   hidden_size=hidden_size,
                                                   args=args)

    cost0 = layers.sequence_pool(
        layers.cross_entropy(input=start_probs,
                             label=start_labels,
                             soft_label=True), 'sum')
    cost1 = layers.sequence_pool(
        layers.cross_entropy(input=end_probs,
                             label=end_labels,
                             soft_label=True), 'sum')

    cost0 = layers.mean(cost0)
    cost1 = layers.mean(cost1)
    cost = cost0 + cost1
    cost.persistable = True

    feeding_list = ["q_ids", "start_lables", "end_lables", "p_ids", "q_id0"]
    return cost, start_probs, end_probs, ms, feeding_list
    def _collect_metrics(self, inputs, outputs):
        """ Calculate loss function by using inputs and outputs. """
        metrics = {}

        tgt_len = layers.reduce_sum(
            layers.reduce_sum(inputs["tgt_mask"], dim=1) - 1)
        tgt_len.stop_gradient = True

        label = inputs["tgt_token"][:, 1:]
        if self.label_smooth > 0:
            one_hot_label = layers.one_hot(label, self.num_token_embeddings)
            smooth_label = layers.label_smooth(one_hot_label,
                                               epsilon=self.label_smooth,
                                               dtype=self._dtype)
            nll = layers.cross_entropy(outputs["dec_pred"],
                                       smooth_label,
                                       soft_label=True,
                                       ignore_index=self.padding_idx)
        else:
            nll = layers.cross_entropy(outputs["dec_probs"],
                                       label,
                                       ignore_index=self.padding_idx)
        nll = layers.reduce_sum(nll, dim=1)
        token_nll = layers.reduce_sum(nll) / tgt_len
        nll = layers.reduce_mean(nll)
        metrics["nll"] = nll
        metrics["token_nll"] = token_nll
        loss = nll

        if self.num_latent > 0 and self.with_bow:
            bow_probs = F.unsqueeze(outputs["bow_probs"], [1])
            bow_probs = layers.expand(bow_probs, [1, label.shape[1], 1])
            if self.label_smooth > 0:
                bow = layers.cross_entropy(bow_probs,
                                           smooth_label,
                                           soft_label=True,
                                           ignore_index=self.padding_idx)
            else:
                bow = layers.cross_entropy(bow_probs,
                                           label,
                                           ignore_index=self.padding_idx)
            bow = layers.reduce_sum(bow, dim=1)
            token_bow = layers.reduce_sum(bow) / tgt_len
            bow = layers.reduce_mean(bow)
            metrics["bow"] = bow
            metrics["token_bow"] = token_bow
            loss = loss + bow

        if self.num_latent > 0 and self.use_discriminator:
            dis = 0.0 - (layers.log(outputs["pos_probs"]) +
                         layers.log(1.0 - outputs["neg_probs"]))
            dis = layers.reduce_mean(dis)
            metrics["dis"] = dis
            loss = loss + dis * self.dis_ratio

        metrics["loss"] = loss
        metrics["token_num"] = tgt_len
        return metrics
Exemple #4
0
def loss_function(s_arc, s_rel, arcs, rels, mask):
    """Loss function"""
    arcs = nn.masked_select(arcs, mask)
    rels = nn.masked_select(rels, mask)
    s_arc = nn.masked_select(s_arc, mask)
    s_rel = nn.masked_select(s_rel, mask)
    s_rel = nn.index_sample(s_rel, layers.unsqueeze(arcs, 1))
    arc_loss = layers.cross_entropy(layers.softmax(s_arc), arcs)
    rel_loss = layers.cross_entropy(layers.softmax(s_rel), rels)
    loss = layers.reduce_mean(arc_loss + rel_loss)

    return loss
Exemple #5
0
def dynamic(train_data, use_cuda=False, use_parallel_exe=False):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        fluid.default_startup_program().random_seed = SEED
        fluid.default_main_program().random_seed = SEED
        dy_layer = DygraphLayer()
        adam = fluid.optimizer.Adam(learning_rate=LR,
                                    parameter_list=dy_layer.parameters())
        sgd = fluid.optimizer.SGD(learning_rate=LR,
                                  parameter_list=dy_layer.parameters())

        for epoch in range(EPOCH_NUM):
            image_data, label = train_data[epoch]
            var_input = fluid.dygraph.to_variable(image_data)
            var_label = fluid.dygraph.to_variable(label)
            hidden, prediction = dy_layer(var_input)

            if epoch % 2 == 0:
                cross_entropy_loss = layers.cross_entropy(
                    prediction, var_label)
                loss = layers.mean(cross_entropy_loss)
                loss.backward()
                adam.minimize(loss)
            else:
                softmax_loss = layers.softmax_with_cross_entropy(
                    prediction, var_label)
                loss = layers.mean(softmax_loss)
                loss.backward()
                sgd.minimize(loss)

            dy_layer.clear_gradients()
        return hidden.numpy(), prediction.numpy(), loss.numpy()
Exemple #6
0
def node_classify_model(word2id, num_labels, embed_dim=16):
    """Build node classify model.

    Args:
        word2id(dict): map word(node) to its corresponding index

        num_labels: The number of labels.

        embed_dim: The dimension of embedding.
    """

    nodes = fl.data('nodes', shape=[None, 1], dtype='int64')
    labels = fl.data('labels', shape=[None, 1], dtype='int64')

    embed_nodes = fl.embedding(input=nodes,
                               size=[len(word2id), embed_dim],
                               param_attr=fluid.ParamAttr(name='content'))

    embed_nodes.stop_gradient = True
    probs = fl.fc(input=embed_nodes, size=num_labels, act='softmax')
    predict = fl.argmax(probs, axis=-1)
    loss = fl.cross_entropy(input=probs, label=labels)
    loss = fl.reduce_mean(loss)

    return {
        'loss': loss,
        'probs': probs,
        'predict': predict,
        'labels': labels,
    }
Exemple #7
0
    def not_test_raw_api(self):
        prog = Program()
        startup_prog = Program()
        with program_guard(prog, startup_prog):
            image = layers.data(name='x', shape=[784], dtype='float32')

            label = layers.data(name='y', shape=[1], dtype='int64')

            limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
            cond = layers.less_than(x=label, y=limit)
            true_image, false_image = split_lod_tensor(input=image, mask=cond)

            true_out = layers.create_tensor(dtype='float32')
            true_cond = ConditionalBlock([cond])

            with true_cond.block():
                hidden = layers.fc(input=true_image, size=100, act='tanh')
                prob = layers.fc(input=hidden, size=10, act='softmax')
                layers.assign(input=prob, output=true_out)

            false_out = layers.create_tensor(dtype='float32')
            false_cond = ConditionalBlock([cond])

            with false_cond.block():
                hidden = layers.fc(input=false_image, size=200, act='tanh')
                prob = layers.fc(input=hidden, size=10, act='softmax')
                layers.assign(input=prob, output=false_out)

            prob = merge_lod_tensor(
                in_true=true_out, in_false=false_out, mask=cond, x=image)
            loss = layers.cross_entropy(input=prob, label=label)
            avg_loss = layers.mean(loss)

            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
            optimizer.minimize(avg_loss, startup_prog)

        train_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.mnist.train(), buf_size=8192),
            batch_size=10)

        place = core.CPUPlace()
        exe = Executor(place)

        exe.run(startup_prog)
        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for data in train_reader():
                x_data = np.array([x[0] for x in data]).astype("float32")
                y_data = np.array([x[1] for x in data]).astype("int64")
                y_data = np.expand_dims(y_data, axis=1)

                outs = exe.run(prog,
                               feed={'x': x_data,
                                     'y': y_data},
                               fetch_list=[avg_loss])
                print(outs[0])
                if outs[0] < 1.0:
                    return
        self.assertFalse(True)
Exemple #8
0
def mlp_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(name="input",
                            shape=[batch_size, sequence_len, hidden_size],
                            dtype='float32')
        label = static.data(name="label",
                            shape=[batch_size, sequence_len, 1],
                            dtype='float32')

        auto.shard_tensor(input,
                          dist_attr={
                              "process_mesh": _global_process_mesh,
                              "dims_mappig": [-1, -1, -1]
                          })

        mlp = MLPLayer(hidden_size=hidden_size,
                       intermediate_size=4 * hidden_size,
                       dropout_ratio=0.1,
                       initializer_range=0.02)

        predict = mlp(input)

        cost = layers.cross_entropy(input=predict, label=label)
        avg_cost = layers.mean(x=cost)

    return avg_cost, train_program, start_program
Exemple #9
0
    def test_recognize_digits_conv(self):
        program = Program()
        with program_guard(program, startup_program=Program()):
            images = layers.data(name='pixel',
                                 shape=[1, 28, 28],
                                 dtype='float32')
            label = layers.data(name='label', shape=[1], dtype='int32')
            conv_pool_1 = nets.simple_img_conv_pool(input=images,
                                                    filter_size=5,
                                                    num_filters=2,
                                                    pool_size=2,
                                                    pool_stride=2,
                                                    act="relu")
            conv_pool_2 = nets.simple_img_conv_pool(input=conv_pool_1,
                                                    filter_size=5,
                                                    num_filters=4,
                                                    pool_size=2,
                                                    pool_stride=2,
                                                    act="relu")

            predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
            cost = layers.cross_entropy(input=predict, label=label)
            avg_cost = layers.mean(cost)

        print(str(program))
Exemple #10
0
    def test_recognize_digits_conv(self):
        program = Program()
        with program_guard(program, startup_program=Program()):
            images = layers.data(
                name='pixel', shape=[1, 28, 28], dtype='float32')
            label = layers.data(name='label', shape=[1], dtype='int32')
            conv_pool_1 = nets.simple_img_conv_pool(
                input=images,
                filter_size=5,
                num_filters=2,
                pool_size=2,
                pool_stride=2,
                act="relu")
            conv_pool_2 = nets.simple_img_conv_pool(
                input=conv_pool_1,
                filter_size=5,
                num_filters=4,
                pool_size=2,
                pool_stride=2,
                act="relu")

            predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
            cost = layers.cross_entropy(input=predict, label=label)
            avg_cost = layers.mean(cost)

        print(str(program))
Exemple #11
0
    def create_loss_op(self, predict, label, epsilon=1e-7):
        """compute loss with tensor

         Args:
         predict: model output tensor activated by softmax
         label: a non-sparse tensor

         Returns:
         loss: cross-entropy loss
         """
        if self.loss_type == "nl" and self.model_type == "train":
            one_hot_label = fluid.one_hot(label, depth=predict.shape[-1])
            one_hot_label = FL.squeeze(one_hot_label, axes=[-2])
            # log
            neg_prob = 1 - predict
            log_neg_prob = FL.log(
                fluid.layers.clip(neg_prob, min=epsilon, max=1.))
            ce_loss = -1 * log_neg_prob * one_hot_label
            cost = FL.reduce_sum(ce_loss, dim=-1, keep_dim=True)
        else:  # PL or evaluation
            cost = FL.cross_entropy(predict, label)

        loss = FL.mean(cost)

        return loss
def train_program(is_sparse):
    context = encoder(is_sparse)
    rnn_out = train_decoder(context, is_sparse)
    label = pd.data(
        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
    cost = pd.cross_entropy(input=rnn_out, label=label)
    avg_cost = pd.mean(cost)
    return avg_cost
Exemple #13
0
def train_program(is_sparse):
    context = encoder(is_sparse)
    rnn_out = train_decoder(context, is_sparse)
    label = pd.data(
        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
    cost = pd.cross_entropy(input=rnn_out, label=label)
    avg_cost = pd.mean(cost)
    return avg_cost
Exemple #14
0
def loss_func(logits, label, trg_sequence_length):
    probs = layers.softmax(logits)
    loss = layers.cross_entropy(input=probs, label=label)
    trg_mask = layers.sequence_mask(trg_sequence_length,
                                    maxlen=layers.shape(logits)[1],
                                    dtype="float32")
    avg_cost = layers.reduce_sum(loss * trg_mask) / layers.reduce_sum(trg_mask)
    return avg_cost
Exemple #15
0
 def test_cross_entropy(self):
     program = Program()
     with program_guard(program):
         x = layers.data(name="x", shape=[30, 10], dtype="float32")
         label = layers.data(name="label", shape=[30, 1], dtype="int32")
         mode = 'channel'
         out = layers.cross_entropy(x, label, False, 4)
         self.assertIsNotNone(out)
    def test_ifelse(self):
        prog = Program()
        startup_prog = Program()
        with program_guard(prog, startup_prog):
            image = layers.data(name='x', shape=[784], dtype='float32')

            label = layers.data(name='y', shape=[1], dtype='int64')

            limit = layers.fill_constant_batch_size_like(input=label,
                                                         dtype='int64',
                                                         shape=[1],
                                                         value=5.0)
            cond = layers.less_than(x=label, y=limit)
            ie = layers.IfElse(cond)

            with ie.true_block():
                true_image = ie.input(image)
                hidden = layers.fc(input=true_image, size=100, act='tanh')
                prob = layers.fc(input=hidden, size=10, act='softmax')
                ie.output(prob)

            with ie.false_block():
                false_image = ie.input(image)
                hidden = layers.fc(input=false_image, size=200, act='tanh')
                prob = layers.fc(input=hidden, size=10, act='softmax')
                ie.output(prob)

            prob = ie()
            loss = layers.cross_entropy(input=prob[0], label=label)
            avg_loss = layers.mean(loss)

            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
            optimizer.minimize(avg_loss, startup_prog)
        train_reader = paddle.batch(paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=8192),
                                    batch_size=200)

        place = core.CPUPlace()
        exe = Executor(place)

        exe.run(kwargs['startup_program'])
        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for data in train_reader():
                x_data = np.array(map(lambda x: x[0], data)).astype("float32")
                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                y_data = y_data.reshape((y_data.shape[0], 1))

                outs = exe.run(kwargs['main_program'],
                               feed={
                                   'x': x_data,
                                   'y': y_data
                               },
                               fetch_list=[avg_loss])
                print outs[0]
                if outs[0] < 1.0:
                    return
        self.assertFalse(True)
    def _compute_loss_acc(self, pred):

        loss = layers.cross_entropy(pred, label=self.label, soft_label=False)

        loss = layers.reshape(loss, shape=[self.batch_size, -1])
        loss = layers.reduce_mean(loss)
        acc = fluid.layers.accuracy(input=pred, label=self.label)

        return loss, acc
Exemple #18
0
 def learn(self, probs, label, weight=None, length=None):
     loss = layers.cross_entropy(input=probs, label=label, soft_label=False)
     max_seq_len = layers.shape(probs)[1]
     mask = layers.sequence_mask(length, maxlen=max_seq_len, dtype="float32")
     loss = loss * mask
     loss = layers.reduce_mean(loss, dim=[0])
     loss = layers.reduce_sum(loss)
     optimizer = fluid.optimizer.Adam(self.lr)
     optimizer.minimize(loss)
     return loss
Exemple #19
0
def loss_func(logits, label, trg_sequence_length):
    probs = layers.softmax(logits)
    # 使用交叉熵损失函数
    loss = layers.cross_entropy(input=probs, label=label)
    # 根据长度生成掩码,并依此剔除 padding 部分计算的损失
    trg_mask = layers.sequence_mask(trg_sequence_length,
                                    maxlen=layers.shape(logits)[1],
                                    dtype="float32")
    avg_cost = layers.reduce_sum(loss * trg_mask) / layers.reduce_sum(trg_mask)
    return avg_cost
Exemple #20
0
 def get_losses(self, out, cls_out, mask, gt_labels):
     loss_cls = L.mean(L.cross_entropy(cls_out,
                                       gt_labels)) * self.train_cfg['w_cls']
     loss_tir = 0
     for feat in out[:-1]:
         feat = L.squeeze(self.avgpool(feat), axes=[2, 3])
         loss_tir += self.triple_loss(feat,
                                      gt_labels) * self.train_cfg['w_tri']
     loss = loss_cls + loss_tir
     return dict(loss_cls=loss_cls, loss_tir=loss_tir, loss=loss)
    def test_ifelse(self):
        prog = Program()
        startup_prog = Program()
        with program_guard(prog, startup_prog):
            image = layers.data(name='x', shape=[784], dtype='float32')

            label = layers.data(name='y', shape=[1], dtype='int64')

            limit = layers.fill_constant_batch_size_like(
                input=label, dtype='int64', shape=[1], value=5.0)
            cond = layers.less_than(x=label, y=limit)
            ie = layers.IfElse(cond)

            with ie.true_block():
                true_image = ie.input(image)
                hidden = layers.fc(input=true_image, size=100, act='tanh')
                prob = layers.fc(input=hidden, size=10, act='softmax')
                ie.output(prob)

            with ie.false_block():
                false_image = ie.input(image)
                hidden = layers.fc(input=false_image, size=200, act='tanh')
                prob = layers.fc(input=hidden, size=10, act='softmax')
                ie.output(prob)

            prob = ie()
            loss = layers.cross_entropy(input=prob[0], label=label)
            avg_loss = layers.mean(loss)

            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
            optimizer.minimize(avg_loss, startup_prog)
        train_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.mnist.train(), buf_size=8192),
            batch_size=200)

        place = core.CPUPlace()
        exe = Executor(place)

        exe.run(kwargs['startup_program'])
        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for data in train_reader():
                x_data = np.array(map(lambda x: x[0], data)).astype("float32")
                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                y_data = y_data.reshape((y_data.shape[0], 1))

                outs = exe.run(kwargs['main_program'],
                               feed={'x': x_data,
                                     'y': y_data},
                               fetch_list=[avg_loss])
                print outs[0]
                if outs[0] < 1.0:
                    return
        self.assertFalse(True)
def main():
    rnn_out = encoder_decoder()
    label = layers.data(name="target_language_next_word",
                        shape=[1],
                        dtype='int64',
                        lod_level=1)
    cost = layers.cross_entropy(input=rnn_out, label=label)
    avg_cost = fluid.layers.mean(cost)

    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
    optimizer.minimize(avg_cost)

    # fluid.memory_optimize(fluid.default_main_program())
    fluid.release_memory(fluid.default_main_program())

    # fix the order of training data
    train_data = paddle.batch(paddle.dataset.wmt14.train(dict_size),
                              batch_size=batch_size)

    # train_data = paddle.batch(
    #     paddle.reader.shuffle(
    #         paddle.dataset.wmt14.train(dict_size), buf_size=1000),
    #     batch_size=batch_size)

    place = core.CPUPlace()
    exe = Executor(place)

    exe.run(framework.default_startup_program())

    feed_order = [
        'src_word_id', 'target_language_word', 'target_language_next_word'
    ]

    feed_list = [
        fluid.default_main_program().global_block().var(var_name)
        for var_name in feed_order
    ]
    feeder = fluid.DataFeeder(feed_list, place)

    batch_id = 0
    for pass_id in xrange(10):
        for data in train_data():
            outs = exe.run(fluid.default_main_program(),
                           feed=feeder.feed(data),
                           fetch_list=[avg_cost])
            avg_cost_val = np.array(outs[0])
            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                  " avg_cost=" + str(avg_cost_val))
            if batch_id > 2:
                exit(0)
            if math.isnan(float(avg_cost_val)):
                sys.exit("got NaN loss, training failed.")
            batch_id += 1
Exemple #23
0
 def learn(self, act_prob, action, reward, length=None):
     """
     update policy model self.model with policy gradient algorithm
     """
     self.reward = fluid.layers.py_func(
         func=reward_func, x=[action, length], out=reward)
     neg_log_prob = layers.cross_entropy(act_prob, action)
     cost = neg_log_prob * reward
     cost = (layers.reduce_sum(cost) / layers.reduce_sum(length)
             ) if length is not None else layers.reduce_mean(cost)
     optimizer = fluid.optimizer.Adam(self.lr)
     optimizer.minimize(cost)
     return cost
Exemple #24
0
    def learn(self, obs, action, reward):
        obs = fluid.dygraph.to_variable(obs)
        obs = layers.cast(obs, dtype='float32')
        act_prob = self.model(obs)
        action = fluid.dygraph.to_variable(action)
        reward = fluid.dygraph.to_variable(reward)

        log_prob = layers.cross_entropy(act_prob, action)
        cost = log_prob * reward
        cost = layers.cast(cost, dtype='float32')
        cost = layers.reduce_mean(cost)
        cost.backward()
        self.optimizer.minimize(cost)
        self.model.clear_gradients()
        return cost
def main():
    rnn_out = encoder_decoder()
    label = layers.data(
        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
    cost = layers.cross_entropy(input=rnn_out, label=label)
    avg_cost = fluid.layers.mean(cost)

    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
    optimizer.minimize(avg_cost)

    # fluid.memory_optimize(fluid.default_main_program())
    fluid.release_memory(fluid.default_main_program())

    # fix the order of training data
    train_data = paddle.batch(
        paddle.dataset.wmt14.train(dict_size), batch_size=batch_size)

    # train_data = paddle.batch(
    #     paddle.reader.shuffle(
    #         paddle.dataset.wmt14.train(dict_size), buf_size=1000),
    #     batch_size=batch_size)

    place = core.CPUPlace()
    exe = Executor(place)

    exe.run(framework.default_startup_program())

    batch_id = 0
    for pass_id in xrange(10):
        for data in train_data():
            word_data = to_lodtensor(map(lambda x: x[0], data), place)
            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
            outs = exe.run(fluid.default_main_program(),
                           feed={
                               'src_word_id': word_data,
                               'target_language_word': trg_word,
                               'target_language_next_word': trg_word_next
                           },
                           fetch_list=[avg_cost])
            avg_cost_val = np.array(outs[0])
            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                  " avg_cost=" + str(avg_cost_val))
            if batch_id > 2:
                exit(0)
            if math.isnan(float(avg_cost_val)):
                sys.exit("got NaN loss, training failed.")
            batch_id += 1
Exemple #26
0
def train_main(use_cuda):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    context = encoder()
    state_cell = decoder_state_cell(context)
    rnn_out = decoder_train(state_cell)
    label = layers.data(name="target_next_word",
                        shape=[1],
                        dtype='int64',
                        lod_level=1)
    cost = layers.cross_entropy(input=rnn_out, label=label)
    avg_cost = layers.mean(x=cost)

    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-3)
    optimizer.minimize(avg_cost)

    train_reader = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.wmt14.train(dict_size), buf_size=1000),
                                batch_size=batch_size)
    feed_order = ['src_word', 'target_word', 'target_next_word']

    exe = Executor(place)

    def train_loop(main_program):
        exe.run(framework.default_startup_program())

        feed_list = [
            main_program.global_block().var(var_name)
            for var_name in feed_order
        ]
        feeder = fluid.DataFeeder(feed_list, place)

        for pass_id in range(1):
            for batch_id, data in enumerate(train_reader()):
                outs = exe.run(main_program,
                               feed=feeder.feed(data),
                               fetch_list=[avg_cost])
                avg_cost_val = np.array(outs[0])
                print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                      " avg_cost=" + str(avg_cost_val))
                if batch_id > 3:
                    break

    train_loop(framework.default_main_program())
Exemple #27
0
    def test_recognize_digits_mlp(self):
        program = Program()
        with program_guard(program, startup_program=Program()):
            # Change g_program, so the rest layers use `g_program`
            images = layers.data(name='pixel', shape=[784], dtype='float32')
            label = layers.data(name='label', shape=[1], dtype='int32')
            hidden1 = layers.fc(input=images, size=128, act='relu')
            hidden2 = layers.fc(input=hidden1, size=64, act='relu')
            predict = layers.fc(input=[hidden2, hidden1],
                                size=10,
                                act='softmax',
                                param_attr=["sftmax.w1", "sftmax.w2"])
            cost = layers.cross_entropy(input=predict, label=label)
            avg_cost = layers.mean(cost)
            self.assertIsNotNone(avg_cost)

        print(str(program))
Exemple #28
0
    def test_word_embedding(self):
        program = Program()
        with program_guard(program, startup_program=Program()):
            dict_size = 10000
            embed_size = 32
            first_word = layers.data(name='firstw', shape=[1], dtype='int64')
            second_word = layers.data(name='secondw', shape=[1], dtype='int64')
            third_word = layers.data(name='thirdw', shape=[1], dtype='int64')
            forth_word = layers.data(name='forthw', shape=[1], dtype='int64')
            next_word = layers.data(name='nextw', shape=[1], dtype='int64')

            embed_first = layers.embedding(
                input=first_word,
                size=[dict_size, embed_size],
                dtype='float32',
                param_attr='shared_w')
            embed_second = layers.embedding(
                input=second_word,
                size=[dict_size, embed_size],
                dtype='float32',
                param_attr='shared_w')

            embed_third = layers.embedding(
                input=third_word,
                size=[dict_size, embed_size],
                dtype='float32',
                param_attr='shared_w')
            embed_forth = layers.embedding(
                input=forth_word,
                size=[dict_size, embed_size],
                dtype='float32',
                param_attr='shared_w')

            concat_embed = layers.concat(
                input=[embed_first, embed_second, embed_third, embed_forth],
                axis=1)

            hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid')
            predict_word = layers.fc(input=hidden1,
                                     size=dict_size,
                                     act='softmax')
            cost = layers.cross_entropy(input=predict_word, label=next_word)
            avg_cost = layers.mean(cost)
            self.assertIsNotNone(avg_cost)

        print(str(program))
Exemple #29
0
 def forward(self, cue, label, return_loss=True):
     out = self.conv1(cue)
     out = self.norm1(out)
     out = self.maxpool(out)
     out = self.conv2(out)
     out = self.norm2(out)
     out = self.avgpool(out)
     if return_loss:
         cls_out = L.dropout(out, dropout_prob=0.5, is_test=False)
         cls_out = self.fc(L.squeeze(cls_out, axes=[2, 3]))
         loss_cls = L.mean(L.cross_entropy(cls_out, label))
         losses = dict(loss_cls=loss_cls, loss=loss_cls)
         return losses
     else:
         cls_out = self.fc(L.squeeze(out, axes=[2, 3]))
         cls_out = L.softmax(cls_out).numpy()[:, 0]
         return cls_out
Exemple #30
0
    def test_word_embedding(self):
        program = Program()
        with program_guard(program, startup_program=Program()):
            dict_size = 10000
            embed_size = 32
            first_word = layers.data(name='firstw', shape=[1], dtype='int64')
            second_word = layers.data(name='secondw', shape=[1], dtype='int64')
            third_word = layers.data(name='thirdw', shape=[1], dtype='int64')
            forth_word = layers.data(name='forthw', shape=[1], dtype='int64')
            next_word = layers.data(name='nextw', shape=[1], dtype='int64')

            embed_first = layers.embedding(
                input=first_word,
                size=[dict_size, embed_size],
                dtype='float32',
                param_attr='shared_w')
            embed_second = layers.embedding(
                input=second_word,
                size=[dict_size, embed_size],
                dtype='float32',
                param_attr='shared_w')

            embed_third = layers.embedding(
                input=third_word,
                size=[dict_size, embed_size],
                dtype='float32',
                param_attr='shared_w')
            embed_forth = layers.embedding(
                input=forth_word,
                size=[dict_size, embed_size],
                dtype='float32',
                param_attr='shared_w')

            concat_embed = layers.concat(
                input=[embed_first, embed_second, embed_third, embed_forth],
                axis=1)

            hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid')
            predict_word = layers.fc(input=hidden1,
                                     size=dict_size,
                                     act='softmax')
            cost = layers.cross_entropy(input=predict_word, label=next_word)
            avg_cost = layers.mean(cost)
            self.assertIsNotNone(avg_cost)

        print(str(program))
Exemple #31
0
    def test_recognize_digits_mlp(self):
        program = Program()
        with program_guard(program, startup_program=Program()):
            # Change g_program, so the rest layers use `g_program`
            images = layers.data(name='pixel', shape=[784], dtype='float32')
            label = layers.data(name='label', shape=[1], dtype='int32')
            hidden1 = layers.fc(input=images, size=128, act='relu')
            hidden2 = layers.fc(input=hidden1, size=64, act='relu')
            predict = layers.fc(input=[hidden2, hidden1],
                                size=10,
                                act='softmax',
                                param_attr=["sftmax.w1", "sftmax.w2"])
            cost = layers.cross_entropy(input=predict, label=label)
            avg_cost = layers.mean(cost)
            self.assertIsNotNone(avg_cost)

        print(str(program))
Exemple #32
0
 def get_losses(self, out, cls_out, mask, gt_labels):
     loss_cls = L.mean(L.cross_entropy(cls_out,
                                       gt_labels)) * self.train_cfg['w_cls']
     cue = out[-1] if self.train_cfg['with_mask'] else L.elementwise_mul(
         out[-1], L.cast(gt_labels, 'float32'), axis=0)
     num_reg = L.cast(
         L.reduce_sum(gt_labels) * cue.shape[1] * cue.shape[2] *
         cue.shape[3], 'float32')
     loss_reg = L.reduce_sum(
         L.abs(mask - cue)) / (num_reg + 1e-8) * self.train_cfg['w_reg']
     loss_tir = 0
     for feat in out[:-1]:
         feat = L.squeeze(self.avgpool(feat), axes=[2, 3])
         loss_tir += self.triple_loss(feat,
                                      gt_labels) * self.train_cfg['w_tri']
     loss = loss_cls + loss_reg + loss_tir
     return dict(loss_cls=loss_cls,
                 loss_reg=loss_reg,
                 loss_tir=loss_tir,
                 loss=loss)
Exemple #33
0
def node_classify_model(config):
    """Build node classify model.
    """
    nodes = fl.data('nodes', shape=[None, 1], dtype='int64')
    labels = fl.data('labels', shape=[None, 1], dtype='int64')

    embed_nodes = fl.embedding(input=nodes,
                               size=[config.num_nodes, config.embed_dim],
                               param_attr=fluid.ParamAttr(name='weight'))

    embed_nodes.stop_gradient = True
    probs = fl.fc(input=embed_nodes, size=config.num_labels, act='softmax')
    predict = fl.argmax(probs, axis=-1)
    loss = fl.cross_entropy(input=probs, label=labels)
    loss = fl.reduce_mean(loss)

    return {
        'loss': loss,
        'probs': probs,
        'predict': predict,
        'labels': labels,
    }
    def test_raw_api(self):
        prog = Program()
        startup_prog = Program()
        with program_guard(prog, startup_prog):
            image = layers.data(name='x', shape=[784], dtype='float32')

            label = layers.data(name='y', shape=[1], dtype='int64')

            limit = layers.fill_constant_batch_size_like(
                input=label, dtype='int64', shape=[1], value=5.0)
            cond = layers.less_than(x=label, y=limit)
            true_image, false_image = layers.split_lod_tensor(
                input=image, mask=cond)

            true_out = layers.create_tensor(dtype='float32')
            true_cond = layers.ConditionalBlock([true_image])

            with true_cond.block():
                hidden = layers.fc(input=true_image, size=100, act='tanh')
                prob = layers.fc(input=hidden, size=10, act='softmax')
                layers.assign(input=prob, output=true_out)

            false_out = layers.create_tensor(dtype='float32')
            false_cond = layers.ConditionalBlock([false_image])

            with false_cond.block():
                hidden = layers.fc(input=false_image, size=200, act='tanh')
                prob = layers.fc(input=hidden, size=10, act='softmax')
                layers.assign(input=prob, output=false_out)

            prob = layers.merge_lod_tensor(
                in_true=true_out, in_false=false_out, mask=cond, x=image)
            loss = layers.cross_entropy(input=prob, label=label)
            avg_loss = layers.mean(loss)

            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
            optimizer.minimize(avg_loss, startup_prog)

        train_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.mnist.train(), buf_size=8192),
            batch_size=200)

        place = core.CPUPlace()
        exe = Executor(place)

        exe.run(startup_prog)
        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for data in train_reader():
                x_data = np.array(map(lambda x: x[0], data)).astype("float32")
                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                y_data = np.expand_dims(y_data, axis=1)

                outs = exe.run(prog,
                               feed={'x': x_data,
                                     'y': y_data},
                               fetch_list=[avg_loss])
                print outs[0]
                if outs[0] < 1.0:
                    return
        self.assertFalse(True)
def knowledge_seq2seq(config):
    """ knowledge seq2seq """
    emb_size = config.embed_size
    hidden_size = config.hidden_size
    input_size = emb_size
    num_layers = config.num_layers
    bi_direc = config.bidirectional
    batch_size = config.batch_size
    vocab_size = config.vocab_size
    run_type = config.run_type

    enc_input = layers.data(name="enc_input",
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #enc_input --> goal
    enc_mask = layers.data(name="enc_mask", shape=[-1, 1], dtype='float32')
    goal_input = layers.data(name="goal_input",
                             shape=[1],
                             dtype='int64',
                             lod_level=1)  #goal_input --> x
    cue_input = layers.data(name="cue_input",
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #cue_input --> kg
    #cue_mask = layers.data(name='cue_mask', shape=[-1, 1], dtype='float32')
    memory_mask = layers.data(name='memory_mask',
                              shape=[-1, 1],
                              dtype='float32')
    tar_input = layers.data(name='tar_input',
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #tar_input --> y
    # tar_mask = layers.data(name="tar_mask", shape=[-1, 1], dtype='float32')

    rnn_hidden_size = hidden_size
    if bi_direc:
        rnn_hidden_size //= 2

    enc_out, enc_last_hidden = \
        rnn_encoder(enc_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc")
    goal_out, goal_last_hidden = \
        rnn_encoder(goal_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc1")
    context_goal_out = fluid.layers.concat(
        input=[enc_last_hidden, goal_last_hidden], axis=2)
    context_goal_out = layers.reshape(context_goal_out,
                                      shape=[-1, 1, rnn_hidden_size * 4])
    # context_goal_out = layers.squeeze(context_goal_out, axes=[1])
    context_goal_out = fluid.layers.fc(context_goal_out,
                                       size=rnn_hidden_size * 2,
                                       bias_attr=False)
    context_goal_out = layers.unsqueeze(context_goal_out, axes=[0])
    bridge_out = fc(context_goal_out, hidden_size, hidden_size, name="bridge")
    bridge_out = layers.tanh(bridge_out)

    cue_last_mask = layers.data(name='cue_last_mask',
                                shape=[-1],
                                dtype='float32')
    knowledge_out, knowledge_last_hidden = \
        rnn_encoder(cue_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, last_mask=cue_last_mask, name="knowledge_enc")

    query = layers.slice(bridge_out, axes=[0], starts=[0], ends=[1])
    query = layers.squeeze(query, axes=[0])
    query = layers.unsqueeze(query, axes=[1])
    query = layers.reshape(query, shape=[batch_size, -1, hidden_size])
    cue_memory = layers.slice(knowledge_last_hidden,
                              axes=[0],
                              starts=[0],
                              ends=[1])
    cue_memory = layers.reshape(cue_memory,
                                shape=[batch_size, -1, hidden_size])
    memory_mask = layers.reshape(memory_mask, shape=[batch_size, 1, -1])

    weighted_cue, cue_att = dot_attention(query, cue_memory, mask=memory_mask)

    cue_att = layers.reshape(cue_att, shape=[batch_size, -1])

    knowledge = weighted_cue
    if config.use_posterior:
        print("config.use_posterior", config.use_posterior)
        target_out, target_last_hidden = \
            rnn_encoder(tar_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                        dropout=0.0, batch_first=True, name="knowledge_enc1")
        target_goal_out = fluid.layers.concat(
            input=[target_last_hidden, goal_last_hidden], axis=2)
        target_goal_out = layers.reshape(target_goal_out,
                                         shape=[-1, 1, rnn_hidden_size * 4])
        # target_goal_out = layers.squeeze(target_goal_out, axes=[1])
        target_goal_out = fluid.layers.fc(target_goal_out,
                                          size=rnn_hidden_size * 2,
                                          bias_attr=False)
        target_goal_out = layers.unsqueeze(target_goal_out, axes=[0])

        # get attenion
        # target_query = layers.slice(target_last_hidden, axes=[0], starts=[0], ends=[1])
        target_query = layers.slice(target_goal_out,
                                    axes=[0],
                                    starts=[0],
                                    ends=[1])
        target_query = layers.squeeze(target_query, axes=[0])
        target_query = layers.unsqueeze(target_query, axes=[1])
        target_query = layers.reshape(target_query,
                                      shape=[batch_size, -1, hidden_size])

        weight_target, target_att = dot_attention(target_query,
                                                  cue_memory,
                                                  mask=memory_mask)
        target_att = layers.reshape(target_att, shape=[batch_size, -1])
        # add to output
        knowledge = weight_target

    enc_memory_mask = layers.data(name="enc_memory_mask",
                                  shape=[-1, 1],
                                  dtype='float32')
    enc_memory_mask = layers.unsqueeze(enc_memory_mask, axes=[1])
    # decoder init_hidden, enc_memory, enc_mask
    dec_init_hidden = bridge_out
    pad_value = fluid.layers.assign(input=np.array([0.0], dtype='float32'))

    enc_memory, origl_len_1 = layers.sequence_pad(x=enc_out,
                                                  pad_value=pad_value)
    enc_memory.persistable = True

    gru_unit = GRU_unit(input_size + hidden_size,
                        hidden_size,
                        num_layers=num_layers,
                        dropout=0.0,
                        name="decoder_gru_unit")

    cue_gru_unit = GRU_unit(hidden_size + hidden_size,
                            hidden_size,
                            num_layers=num_layers,
                            dropout=0.0,
                            name="decoder_cue_gru_unit")

    tgt_vocab_size = config.vocab_size
    if run_type == "train":
        if config.use_bow:
            bow_logits = fc(knowledge,
                            hidden_size,
                            hidden_size,
                            name='bow_fc_1')
            bow_logits = layers.tanh(bow_logits)
            bow_logits = fc(bow_logits,
                            hidden_size,
                            tgt_vocab_size,
                            name='bow_fc_2')
            bow_logits = layers.softmax(bow_logits)

            bow_label = layers.data(name='bow_label',
                                    shape=[-1, config.max_len],
                                    dtype='int64')
            bow_mask = layers.data(name="bow_mask",
                                   shape=[-1, config.max_len],
                                   dtype='float32')

            bow_logits = layers.expand(bow_logits, [1, config.max_len, 1])
            bow_logits = layers.reshape(bow_logits, shape=[-1, tgt_vocab_size])
            bow_label = layers.reshape(bow_label, shape=[-1, 1])
            bow_loss = layers.cross_entropy(bow_logits,
                                            bow_label,
                                            soft_label=False)
            bow_loss = layers.reshape(bow_loss, shape=[-1, config.max_len])

            bow_loss *= bow_mask
            bow_loss = layers.reduce_sum(bow_loss, dim=[1])
            bow_loss = layers.reduce_mean(bow_loss)

        dec_input = layers.data(name="dec_input",
                                shape=[-1, 1, 1],
                                dtype='int64')
        dec_mask = layers.data(name="dec_mask", shape=[-1, 1], dtype='float32')

        dec_knowledge = weight_target

        knowledge_goal_out = fluid.layers.concat(
            input=[dec_knowledge, target_query], axis=2)
        knowledge_goal_out = layers.reshape(knowledge_goal_out,
                                            shape=[-1, 1, rnn_hidden_size * 4])
        # knowledge_goal_out = layers.squeeze(knowledge_goal_out, axes=[1])
        knowledge_goal_out = fluid.layers.fc(knowledge_goal_out,
                                             size=rnn_hidden_size * 2,
                                             bias_attr=False)
        knowledge_goal_out = layers.unsqueeze(knowledge_goal_out, axes=[0])

        decoder_logits = \
            rnn_decoder(gru_unit, cue_gru_unit, dec_input, input_size, hidden_size, num_layers,
                         enc_memory, enc_memory_mask, dec_knowledge, vocab_size,
                         init_hidden=dec_init_hidden, mask=dec_mask, dropout=config.dropout)

        target_label = layers.data(name='target_label',
                                   shape=[-1, 1],
                                   dtype='int64')
        target_mask = layers.data(name='target_mask',
                                  shape=[-1, 1],
                                  dtype='float32')

        decoder_logits = layers.reshape(decoder_logits,
                                        shape=[-1, tgt_vocab_size])
        target_label = layers.reshape(target_label, shape=[-1, 1])

        nll_loss = layers.cross_entropy(decoder_logits,
                                        target_label,
                                        soft_label=False)
        nll_loss = layers.reshape(nll_loss, shape=[batch_size, -1])
        nll_loss *= target_mask
        nll_loss = layers.reduce_sum(nll_loss, dim=[1])
        nll_loss = layers.reduce_mean(nll_loss)

        prior_attn = cue_att + 1e-10
        posterior_att = target_att
        posterior_att.stop_gradient = True

        prior_attn = layers.log(prior_attn)

        kl_loss = posterior_att * (layers.log(posterior_att + 1e-10) -
                                   prior_attn)
        kl_loss = layers.reduce_mean(kl_loss)

        kl_and_nll_factor = layers.data(name='kl_and_nll_factor',
                                        shape=[1],
                                        dtype='float32')
        kl_and_nll_factor = layers.reshape(kl_and_nll_factor, shape=[-1])

        final_loss = bow_loss + kl_loss * kl_and_nll_factor + nll_loss * kl_and_nll_factor

        return [bow_loss, kl_loss, nll_loss, final_loss]

    elif run_type == "test":
        beam_size = config.beam_size
        batch_size = config.batch_size
        token = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                     value=config.bos_id,
                                     dtype='int64')

        token = layers.reshape(token, shape=[-1, 1])
        max_decode_len = config.max_dec_len

        dec_knowledge = knowledge
        INF = 100000000.0

        init_score_np = np.ones([beam_size * batch_size],
                                dtype='float32') * -INF

        for i in range(batch_size):
            init_score_np[i * beam_size] = 0.0

        pre_score = layers.assign(init_score_np)

        pos_index_np = np.arange(batch_size).reshape(-1, 1)
        pos_index_np = \
            np.tile(pos_index_np, (1, beam_size)).reshape(-1).astype('int32') * beam_size

        pos_index = layers.assign(pos_index_np)

        id_array = []
        score_array = []
        index_array = []
        init_enc_memory = layers.expand(enc_memory, [1, beam_size, 1])
        init_enc_memory = layers.reshape(
            init_enc_memory, shape=[batch_size * beam_size, -1, hidden_size])
        init_enc_mask = layers.expand(enc_memory_mask, [1, beam_size, 1])
        init_enc_mask = layers.reshape(init_enc_mask,
                                       shape=[batch_size * beam_size, 1, -1])

        dec_knowledge = layers.reshape(dec_knowledge,
                                       shape=[-1, 1, hidden_size])
        init_dec_knowledge = layers.expand(dec_knowledge, [1, beam_size, 1])
        init_dec_knowledge = layers.reshape(
            init_dec_knowledge,
            shape=[batch_size * beam_size, -1, hidden_size])

        dec_init_hidden = layers.expand(dec_init_hidden, [1, 1, beam_size])
        dec_init_hidden = layers.reshape(dec_init_hidden,
                                         shape=[1, -1, hidden_size])

        length_average = config.length_average
        UNK = config.unk_id
        EOS = config.eos_id
        for i in range(1, max_decode_len + 1):
            dec_emb = get_embedding(token, input_size, vocab_size)
            dec_out, dec_last_hidden = \
                decoder_step(gru_unit, cue_gru_unit,
                             dec_emb, dec_init_hidden, input_size, hidden_size,
                             init_enc_memory, init_enc_mask, init_dec_knowledge, mask=None)
            output_in_size = hidden_size + hidden_size

            rnnout = layers.dropout(dec_out,
                                    dropout_prob=config.dropout,
                                    is_test=True)
            rnnout = fc(rnnout,
                        output_in_size,
                        hidden_size,
                        name='dec_out_fc1')
            rnnout = fc(rnnout, hidden_size, vocab_size, name='dec_out_fc2')

            log_softmax_output = log_softmax(rnnout)
            log_softmax_output = layers.squeeze(log_softmax_output, axes=[1])

            if i > 1:
                if length_average:
                    log_softmax_output = layers.elementwise_add(
                        (log_softmax_output / i),
                        (pre_score * (1.0 - 1.0 / i)),
                        axis=0)
                else:
                    log_softmax_output = layers.elementwise_add(
                        log_softmax_output, pre_score, axis=0)
            else:
                log_softmax_output = layers.elementwise_add(log_softmax_output,
                                                            pre_score,
                                                            axis=0)

            log_softmax_output = layers.reshape(log_softmax_output,
                                                shape=[batch_size, -1])

            topk_score, topk_index = layers.topk(log_softmax_output,
                                                 k=beam_size)
            topk_score = layers.reshape(topk_score, shape=[-1])
            topk_index = layers.reshape(topk_index, shape=[-1])

            vocab_var = layers.fill_constant([1],
                                             dtype='int64',
                                             value=vocab_size)
            new_token = topk_index % vocab_var

            index = topk_index // vocab_var
            id_array.append(new_token)
            index_array.append(index)
            index = index + pos_index

            score_array.append(topk_score)

            eos_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=EOS)
            unk_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=UNK)
            eos_eq = layers.cast(layers.equal(new_token, eos_ids),
                                 dtype='float32')

            topk_score += eos_eq * -100000000.0

            unk_eq = layers.cast(layers.equal(new_token, unk_ids),
                                 dtype='float32')
            topk_score += unk_eq * -100000000.0

            # update
            token = new_token
            pre_score = topk_score
            token = layers.reshape(token, shape=[-1, 1])

            index = layers.cast(index, dtype='int32')
            dec_last_hidden = layers.squeeze(dec_last_hidden, axes=[0])
            dec_init_hidden = layers.gather(dec_last_hidden, index=index)
            dec_init_hidden = layers.unsqueeze(dec_init_hidden, axes=[0])
            init_enc_memory = layers.gather(init_enc_memory, index)
            init_enc_mask = layers.gather(init_enc_mask, index)
            init_dec_knowledge = layers.gather(init_dec_knowledge, index)

        final_score = layers.concat(score_array, axis=0)
        final_ids = layers.concat(id_array, axis=0)
        final_index = layers.concat(index_array, axis=0)

        final_score = layers.reshape(
            final_score, shape=[max_decode_len, beam_size * batch_size])
        final_ids = layers.reshape(
            final_ids, shape=[max_decode_len, beam_size * batch_size])
        final_index = layers.reshape(
            final_index, shape=[max_decode_len, beam_size * batch_size])

        return final_score, final_ids, final_index
def transformer(
        src_vocab_size,
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        src_pad_idx,
        trg_pad_idx,
        pos_pad_idx, ):
    file_obj = open_recordio_file(
        filename=os.environ.get('RECORDIO_FILENAME', '/tmp/wmt16.recordio'),
        shapes=[
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size, n_head, max_length, max_length],
            [batch_size, n_head, max_length, max_length],
            [batch_size, n_head, max_length, max_length],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
        ],
        dtypes=[
            'int64',
            'int64',
            'int64',
            'int64',
            'float32',
            'float32',
            'float32',
            'int64',
            'float32',
        ],
        lod_levels=[0] * 9)

    src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, gold, weights = fluid.layers.read_file(
        file_obj)

    enc_input = prepare_encoder(
        src_word,
        src_pos,
        src_vocab_size,
        d_model,
        src_pad_idx,
        max_length,
        dropout_rate, )
    enc_output = encoder(
        enc_input,
        src_slf_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate, )

    dec_input = prepare_decoder(
        trg_word,
        trg_pos,
        trg_vocab_size,
        d_model,
        trg_pad_idx,
        max_length,
        dropout_rate, )
    dec_output = decoder(
        dec_input,
        enc_output,
        trg_slf_attn_bias,
        trg_src_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate, )

    # TODO(guosheng): Share the weight matrix between the embedding layers and
    # the pre-softmax linear transformation.
    predict = layers.reshape(
        x=layers.fc(input=dec_output,
                    size=trg_vocab_size,
                    param_attr=fluid.initializer.Xavier(uniform=False),
                    bias_attr=False,
                    num_flatten_dims=2),
        shape=[-1, trg_vocab_size],
        act="softmax")

    cost = layers.cross_entropy(input=predict, label=gold)
    weighted_cost = cost * weights
    return layers.reduce_sum(weighted_cost)
def train_main(use_cuda, is_sparse, is_local=True):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    context = encoder(is_sparse)
    rnn_out = decoder_train(context, is_sparse)
    label = pd.data(name="target_language_next_word",
                    shape=[1],
                    dtype='int64',
                    lod_level=1)
    cost = pd.cross_entropy(input=rnn_out, label=label)
    avg_cost = pd.mean(cost)

    optimizer = fluid.optimizer.Adagrad(
        learning_rate=1e-4,
        regularization=fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.1))
    optimize_ops, params_grads = optimizer.minimize(avg_cost)

    train_data = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.wmt14.train(dict_size), buf_size=1000),
                              batch_size=batch_size)

    exe = Executor(place)

    def train_loop(main_program):
        exe.run(framework.default_startup_program())

        batch_id = 0
        for pass_id in xrange(1):
            for data in train_data():
                word_data = to_lodtensor(map(lambda x: x[0], data), place)
                trg_word = to_lodtensor(map(lambda x: x[1], data), place)
                trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
                outs = exe.run(main_program,
                               feed={
                                   'src_word_id': word_data,
                                   'target_language_word': trg_word,
                                   'target_language_next_word': trg_word_next
                               },
                               fetch_list=[avg_cost])
                avg_cost_val = np.array(outs[0])
                print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                      " avg_cost=" + str(avg_cost_val))
                if batch_id > 3:
                    break
                batch_id += 1

    if is_local:
        train_loop(framework.default_main_program())
    else:
        port = os.getenv("PADDLE_INIT_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())
Exemple #38
0
def transformer(
        src_vocab_size,
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        src_pad_idx,
        trg_pad_idx,
        pos_pad_idx, ):
    file_obj = fluid.layers.open_recordio_file(
        filename='./wmt16.recordio',
        shapes=[
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size, n_head, max_length, max_length],
            [batch_size, n_head, max_length, max_length],
            [batch_size, n_head, max_length, max_length],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
        ],
        dtypes=[
            'int64',
            'int64',
            'int64',
            'int64',
            'float32',
            'float32',
            'float32',
            'int64',
            'float32',
        ],
        lod_levels=[0] * 9)

    src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, gold, weights = fluid.layers.read_file(
        file_obj)

    enc_input = prepare_encoder(
        src_word,
        src_pos,
        src_vocab_size,
        d_model,
        src_pad_idx,
        max_length,
        dropout_rate, )
    enc_output = encoder(
        enc_input,
        src_slf_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate, )

    dec_input = prepare_decoder(
        trg_word,
        trg_pos,
        trg_vocab_size,
        d_model,
        trg_pad_idx,
        max_length,
        dropout_rate, )
    dec_output = decoder(
        dec_input,
        enc_output,
        trg_slf_attn_bias,
        trg_src_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate, )

    # TODO(guosheng): Share the weight matrix between the embedding layers and
    # the pre-softmax linear transformation.
    predict = layers.reshape(
        x=layers.fc(input=dec_output,
                    size=trg_vocab_size,
                    param_attr=fluid.initializer.Xavier(uniform=False),
                    bias_attr=False,
                    num_flatten_dims=2),
        shape=[-1, trg_vocab_size],
        act="softmax")

    cost = layers.cross_entropy(input=predict, label=gold)
    weighted_cost = cost * weights
    return layers.reduce_sum(weighted_cost)