def loss_func(ps1, ps2, s1, s2, po1, po2, o1, o2, mask):
    ps1 = layers.concat([1 - ps1, ps1], axis=-1)
    ps2 = layers.concat([1 - ps2, ps2], axis=-1)
    s1 = layers.unsqueeze(s1, -1)
    s2 = layers.unsqueeze(s2, -1)
    s1_loss = layers.cross_entropy(ps1, s1)
    s1_loss = layers.reduce_sum(s1_loss * mask) / layers.reduce_sum(mask)
    s2_loss = layers.cross_entropy(ps2, s2)
    s2_loss = layers.reduce_sum(s2_loss * mask) / layers.reduce_sum(mask)

    po1, o1 = layers.unsqueeze(po1, -1), layers.unsqueeze(o1, -1)
    po1 = layers.concat([1 - po1, po1], axis=-1)
    o1_loss = layers.reduce_sum(layers.cross_entropy(po1, o1), 2)
    o1_loss = layers.reduce_sum(o1_loss * mask) / layers.reduce_sum(mask)

    po2, o2 = layers.unsqueeze(po2, -1), layers.unsqueeze(o2, -1)
    po2 = layers.concat([1 - po2, po2], axis=-1)
    o2_loss = layers.reduce_sum(layers.cross_entropy(po2, o2), 2)
    o2_loss = layers.reduce_sum(o2_loss * mask) / layers.reduce_sum(mask)

    loss = (s1_loss + s2_loss) + (o1_loss + o2_loss)
    return loss
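# A minimal, self-contained sketch of the masked-average pattern used in
# loss_func above. It is NOT part of that model: the shapes, values and
# variable names below are invented for illustration only.
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers

with fluid.dygraph.guard():
    # probs: [batch, seq_len, 2], labels: [batch, seq_len, 1], mask: [batch, seq_len]
    probs = fluid.dygraph.to_variable(
        np.array([[[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]]], dtype='float32'))
    labels = fluid.dygraph.to_variable(np.array([[[0], [1], [0]]], dtype='int64'))
    mask = fluid.dygraph.to_variable(
        np.array([[1.0, 1.0, 0.0]], dtype='float32'))  # last position is padding

    per_pos = layers.cross_entropy(probs, labels)   # [batch, seq_len, 1]
    per_pos = layers.squeeze(per_pos, axes=[2])     # [batch, seq_len]
    # zero out padding positions, then normalise by the number of real tokens
    loss = layers.reduce_sum(per_pos * mask) / layers.reduce_sum(mask)
    print(loss.numpy())  # roughly (0.105 + 0.223) / 2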
def rc_model(hidden_size, vocab, args):
    emb_shape = [vocab.size(), vocab.embed_dim]
    start_labels = layers.data(
        name="start_lables", shape=[1], dtype='float32', lod_level=1)
    end_labels = layers.data(
        name="end_lables", shape=[1], dtype='float32', lod_level=1)

    # stage 1: encode
    q_id0 = get_data('q_id0', 1, args)
    q_ids = get_data('q_ids', 2, args)
    p_ids_name = 'p_ids'
    p_ids = get_data('p_ids', 2, args)
    p_embs = embedding(p_ids, emb_shape, args)
    q_embs = embedding(q_ids, emb_shape, args)
    drnn = layers.DynamicRNN()
    with drnn.block():
        p_emb = drnn.step_input(p_embs)
        q_emb = drnn.step_input(q_embs)
        p_enc = encoder(p_emb, 'p_enc', hidden_size, args)
        q_enc = encoder(q_emb, 'q_enc', hidden_size, args)

        # stage 2: match
        g_i = attn_flow(q_enc, p_enc, p_ids_name, args)
        # stage 3: fusion
        m_i = fusion(g_i, args)
        drnn.output(m_i, q_enc)

    ms, q_encs = drnn()
    p_vec = layers.lod_reset(x=ms, y=start_labels)
    q_vec = layers.lod_reset(x=q_encs, y=q_id0)

    # stage 4: decode
    start_probs, end_probs = point_network_decoder(
        p_vec=p_vec, q_vec=q_vec, hidden_size=hidden_size, args=args)

    cost0 = layers.sequence_pool(
        layers.cross_entropy(
            input=start_probs, label=start_labels, soft_label=True), 'sum')
    cost1 = layers.sequence_pool(
        layers.cross_entropy(
            input=end_probs, label=end_labels, soft_label=True), 'sum')

    cost0 = layers.mean(cost0)
    cost1 = layers.mean(cost1)
    cost = cost0 + cost1
    cost.persistable = True

    feeding_list = ["q_ids", "start_lables", "end_lables", "p_ids", "q_id0"]
    return cost, start_probs, end_probs, ms, feeding_list
def _collect_metrics(self, inputs, outputs):
    """ Calculate loss function by using inputs and outputs. """
    metrics = {}

    tgt_len = layers.reduce_sum(
        layers.reduce_sum(inputs["tgt_mask"], dim=1) - 1)
    tgt_len.stop_gradient = True

    label = inputs["tgt_token"][:, 1:]
    if self.label_smooth > 0:
        one_hot_label = layers.one_hot(label, self.num_token_embeddings)
        smooth_label = layers.label_smooth(
            one_hot_label, epsilon=self.label_smooth, dtype=self._dtype)
        nll = layers.cross_entropy(
            outputs["dec_pred"], smooth_label, soft_label=True,
            ignore_index=self.padding_idx)
    else:
        nll = layers.cross_entropy(
            outputs["dec_probs"], label, ignore_index=self.padding_idx)
    nll = layers.reduce_sum(nll, dim=1)
    token_nll = layers.reduce_sum(nll) / tgt_len
    nll = layers.reduce_mean(nll)
    metrics["nll"] = nll
    metrics["token_nll"] = token_nll
    loss = nll

    if self.num_latent > 0 and self.with_bow:
        bow_probs = F.unsqueeze(outputs["bow_probs"], [1])
        bow_probs = layers.expand(bow_probs, [1, label.shape[1], 1])
        if self.label_smooth > 0:
            bow = layers.cross_entropy(
                bow_probs, smooth_label, soft_label=True,
                ignore_index=self.padding_idx)
        else:
            bow = layers.cross_entropy(
                bow_probs, label, ignore_index=self.padding_idx)
        bow = layers.reduce_sum(bow, dim=1)
        token_bow = layers.reduce_sum(bow) / tgt_len
        bow = layers.reduce_mean(bow)
        metrics["bow"] = bow
        metrics["token_bow"] = token_bow
        loss = loss + bow

    if self.num_latent > 0 and self.use_discriminator:
        dis = 0.0 - (layers.log(outputs["pos_probs"]) +
                     layers.log(1.0 - outputs["neg_probs"]))
        dis = layers.reduce_mean(dis)
        metrics["dis"] = dis
        loss = loss + dis * self.dis_ratio

    metrics["loss"] = loss
    metrics["token_num"] = tgt_len
    return metrics
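# Toy sketch of the label-smoothing branch in _collect_metrics above (not the
# original code; epsilon, the 3-class vocabulary and all values are assumed).
# The smoothed target equals (1 - eps) * one_hot + eps / num_classes, which is
# what layers.label_smooth produces, and it is consumed by cross_entropy with
# soft_label=True.
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers

with fluid.dygraph.guard():
    probs = fluid.dygraph.to_variable(np.array([[0.7, 0.2, 0.1]], dtype='float32'))
    eps, num_classes, gold = 0.1, 3, 0
    smooth = np.full((1, num_classes), eps / num_classes, dtype='float32')
    smooth[0, gold] += 1.0 - eps
    soft_label = fluid.dygraph.to_variable(smooth)
    nll = layers.cross_entropy(probs, soft_label, soft_label=True)
    print(nll.numpy())  # cross-entropy against the smoothed distribution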
def loss_function(s_arc, s_rel, arcs, rels, mask):
    """Loss function"""
    arcs = nn.masked_select(arcs, mask)
    rels = nn.masked_select(rels, mask)
    s_arc = nn.masked_select(s_arc, mask)
    s_rel = nn.masked_select(s_rel, mask)
    s_rel = nn.index_sample(s_rel, layers.unsqueeze(arcs, 1))
    arc_loss = layers.cross_entropy(layers.softmax(s_arc), arcs)
    rel_loss = layers.cross_entropy(layers.softmax(s_rel), rels)
    loss = layers.reduce_mean(arc_loss + rel_loss)
    return loss
def dynamic(train_data, use_cuda=False, use_parallel_exe=False):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        fluid.default_startup_program().random_seed = SEED
        fluid.default_main_program().random_seed = SEED
        dy_layer = DygraphLayer()
        adam = fluid.optimizer.Adam(
            learning_rate=LR, parameter_list=dy_layer.parameters())
        sgd = fluid.optimizer.SGD(
            learning_rate=LR, parameter_list=dy_layer.parameters())

        for epoch in range(EPOCH_NUM):
            image_data, label = train_data[epoch]
            var_input = fluid.dygraph.to_variable(image_data)
            var_label = fluid.dygraph.to_variable(label)
            hidden, prediction = dy_layer(var_input)

            if epoch % 2 == 0:
                cross_entropy_loss = layers.cross_entropy(prediction, var_label)
                loss = layers.mean(cross_entropy_loss)
                loss.backward()
                adam.minimize(loss)
            else:
                softmax_loss = layers.softmax_with_cross_entropy(
                    prediction, var_label)
                loss = layers.mean(softmax_loss)
                loss.backward()
                sgd.minimize(loss)

            dy_layer.clear_gradients()
        return hidden.numpy(), prediction.numpy(), loss.numpy()
def node_classify_model(word2id, num_labels, embed_dim=16):
    """Build node classify model.

    Args:
        word2id(dict): map word(node) to its corresponding index
        num_labels: The number of labels.
        embed_dim: The dimension of embedding.
    """
    nodes = fl.data('nodes', shape=[None, 1], dtype='int64')
    labels = fl.data('labels', shape=[None, 1], dtype='int64')

    embed_nodes = fl.embedding(
        input=nodes,
        size=[len(word2id), embed_dim],
        param_attr=fluid.ParamAttr(name='content'))
    embed_nodes.stop_gradient = True

    probs = fl.fc(input=embed_nodes, size=num_labels, act='softmax')
    predict = fl.argmax(probs, axis=-1)
    loss = fl.cross_entropy(input=probs, label=labels)
    loss = fl.reduce_mean(loss)

    return {
        'loss': loss,
        'probs': probs,
        'predict': predict,
        'labels': labels,
    }
def not_test_raw_api(self):
    prog = Program()
    startup_prog = Program()
    with program_guard(prog, startup_prog):
        image = layers.data(name='x', shape=[784], dtype='float32')
        label = layers.data(name='y', shape=[1], dtype='int64')

        limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
        cond = layers.less_than(x=label, y=limit)
        true_image, false_image = split_lod_tensor(input=image, mask=cond)

        true_out = layers.create_tensor(dtype='float32')
        true_cond = ConditionalBlock([cond])
        with true_cond.block():
            hidden = layers.fc(input=true_image, size=100, act='tanh')
            prob = layers.fc(input=hidden, size=10, act='softmax')
            layers.assign(input=prob, output=true_out)

        false_out = layers.create_tensor(dtype='float32')
        false_cond = ConditionalBlock([cond])
        with false_cond.block():
            hidden = layers.fc(input=false_image, size=200, act='tanh')
            prob = layers.fc(input=hidden, size=10, act='softmax')
            layers.assign(input=prob, output=false_out)

        prob = merge_lod_tensor(
            in_true=true_out, in_false=false_out, mask=cond, x=image)
        loss = layers.cross_entropy(input=prob, label=label)
        avg_loss = layers.mean(loss)

        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
        optimizer.minimize(avg_loss, startup_prog)

    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=8192),
        batch_size=10)

    place = core.CPUPlace()
    exe = Executor(place)
    exe.run(startup_prog)

    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
        for data in train_reader():
            x_data = np.array([x[0] for x in data]).astype("float32")
            y_data = np.array([x[1] for x in data]).astype("int64")
            y_data = np.expand_dims(y_data, axis=1)

            outs = exe.run(prog,
                           feed={'x': x_data, 'y': y_data},
                           fetch_list=[avg_loss])
            print(outs[0])
            if outs[0] < 1.0:
                return
    self.assertFalse(True)
def mlp_pretrain_forward(train_program, start_program):
    with static.program_guard(train_program, start_program), \
            utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sequence_len = 512
        input = static.data(
            name="input",
            shape=[batch_size, sequence_len, hidden_size],
            dtype='float32')
        label = static.data(
            name="label", shape=[batch_size, sequence_len, 1], dtype='float32')

        auto.shard_tensor(
            input,
            dist_attr={
                "process_mesh": _global_process_mesh,
                "dims_mapping": [-1, -1, -1]
            })

        mlp = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            dropout_ratio=0.1,
            initializer_range=0.02)

        predict = mlp(input)
        cost = layers.cross_entropy(input=predict, label=label)
        avg_cost = layers.mean(x=cost)

    return avg_cost, train_program, start_program
def test_recognize_digits_conv(self):
    program = Program()
    with program_guard(program, startup_program=Program()):
        images = layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
        label = layers.data(name='label', shape=[1], dtype='int32')
        conv_pool_1 = nets.simple_img_conv_pool(
            input=images,
            filter_size=5,
            num_filters=2,
            pool_size=2,
            pool_stride=2,
            act="relu")
        conv_pool_2 = nets.simple_img_conv_pool(
            input=conv_pool_1,
            filter_size=5,
            num_filters=4,
            pool_size=2,
            pool_stride=2,
            act="relu")
        predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
        cost = layers.cross_entropy(input=predict, label=label)
        avg_cost = layers.mean(cost)

        print(str(program))
def create_loss_op(self, predict, label, epsilon=1e-7):
    """compute loss with tensor

    Args:
        predict: model output tensor activated by softmax
        label: a non-sparse tensor

    Returns:
        loss: cross-entropy loss
    """
    if self.loss_type == "nl" and self.model_type == "train":
        one_hot_label = fluid.one_hot(label, depth=predict.shape[-1])
        one_hot_label = FL.squeeze(one_hot_label, axes=[-2])
        # log
        neg_prob = 1 - predict
        log_neg_prob = FL.log(
            fluid.layers.clip(neg_prob, min=epsilon, max=1.))
        ce_loss = -1 * log_neg_prob * one_hot_label
        cost = FL.reduce_sum(ce_loss, dim=-1, keep_dim=True)
    else:  # PL or evaluation
        cost = FL.cross_entropy(predict, label)

    loss = FL.mean(cost)
    return loss
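# Numeric illustration (plain numpy, invented values) of the "nl"
# (negative-learning) branch in create_loss_op above: for the gold class it
# penalises -log(1 - p) instead of the usual positive-learning -log(p).
import numpy as np

p = np.array([0.7, 0.2, 0.1], dtype='float32')  # softmax output for one sample
gold = 0
epsilon = 1e-7
nl_loss = -np.log(np.clip(1.0 - p[gold], epsilon, 1.0))  # ~1.204
pl_loss = -np.log(p[gold])                                # ~0.357
print(nl_loss, pl_loss)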
def train_program(is_sparse):
    context = encoder(is_sparse)
    rnn_out = train_decoder(context, is_sparse)
    label = pd.data(
        name="target_language_next_word",
        shape=[1],
        dtype='int64',
        lod_level=1)
    cost = pd.cross_entropy(input=rnn_out, label=label)
    avg_cost = pd.mean(cost)
    return avg_cost
def loss_func(logits, label, trg_sequence_length):
    probs = layers.softmax(logits)
    loss = layers.cross_entropy(input=probs, label=label)
    trg_mask = layers.sequence_mask(
        trg_sequence_length, maxlen=layers.shape(logits)[1], dtype="float32")
    avg_cost = layers.reduce_sum(loss * trg_mask) / layers.reduce_sum(trg_mask)
    return avg_cost
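# Hedged illustration (toy lengths, not part of the model above) of how
# layers.sequence_mask turns target lengths into the 0/1 mask that loss_func
# multiplies into the per-token cross-entropy before averaging.
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers

with fluid.dygraph.guard():
    lengths = fluid.dygraph.to_variable(np.array([2, 3], dtype='int64'))
    mask = layers.sequence_mask(lengths, maxlen=4, dtype='float32')
    print(mask.numpy())
    # [[1. 1. 0. 0.]
    #  [1. 1. 1. 0.]]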
def test_cross_entropy(self):
    program = Program()
    with program_guard(program):
        x = layers.data(name="x", shape=[30, 10], dtype="float32")
        label = layers.data(name="label", shape=[30, 1], dtype="int32")
        mode = 'channel'
        out = layers.cross_entropy(x, label, False, 4)
        self.assertIsNotNone(out)
def test_ifelse(self):
    prog = Program()
    startup_prog = Program()
    with program_guard(prog, startup_prog):
        image = layers.data(name='x', shape=[784], dtype='float32')
        label = layers.data(name='y', shape=[1], dtype='int64')

        limit = layers.fill_constant_batch_size_like(
            input=label, dtype='int64', shape=[1], value=5.0)
        cond = layers.less_than(x=label, y=limit)
        ie = layers.IfElse(cond)

        with ie.true_block():
            true_image = ie.input(image)
            hidden = layers.fc(input=true_image, size=100, act='tanh')
            prob = layers.fc(input=hidden, size=10, act='softmax')
            ie.output(prob)

        with ie.false_block():
            false_image = ie.input(image)
            hidden = layers.fc(input=false_image, size=200, act='tanh')
            prob = layers.fc(input=hidden, size=10, act='softmax')
            ie.output(prob)

        prob = ie()
        loss = layers.cross_entropy(input=prob[0], label=label)
        avg_loss = layers.mean(loss)

        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
        optimizer.minimize(avg_loss, startup_prog)

    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=8192),
        batch_size=200)

    place = core.CPUPlace()
    exe = Executor(place)
    exe.run(kwargs['startup_program'])

    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
        for data in train_reader():
            x_data = np.array(map(lambda x: x[0], data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = y_data.reshape((y_data.shape[0], 1))

            outs = exe.run(kwargs['main_program'],
                           feed={'x': x_data, 'y': y_data},
                           fetch_list=[avg_loss])
            print outs[0]
            if outs[0] < 1.0:
                return
    self.assertFalse(True)
def _compute_loss_acc(self, pred):
    loss = layers.cross_entropy(pred, label=self.label, soft_label=False)
    loss = layers.reshape(loss, shape=[self.batch_size, -1])
    loss = layers.reduce_mean(loss)
    acc = fluid.layers.accuracy(input=pred, label=self.label)
    return loss, acc
def learn(self, probs, label, weight=None, length=None):
    loss = layers.cross_entropy(input=probs, label=label, soft_label=False)
    max_seq_len = layers.shape(probs)[1]
    mask = layers.sequence_mask(length, maxlen=max_seq_len, dtype="float32")
    loss = loss * mask
    loss = layers.reduce_mean(loss, dim=[0])
    loss = layers.reduce_sum(loss)
    optimizer = fluid.optimizer.Adam(self.lr)
    optimizer.minimize(loss)
    return loss
def loss_func(logits, label, trg_sequence_length):
    probs = layers.softmax(logits)
    # cross-entropy loss
    loss = layers.cross_entropy(input=probs, label=label)
    # build a mask from the target lengths and use it to drop the loss
    # computed on padding positions
    trg_mask = layers.sequence_mask(
        trg_sequence_length, maxlen=layers.shape(logits)[1], dtype="float32")
    avg_cost = layers.reduce_sum(loss * trg_mask) / layers.reduce_sum(trg_mask)
    return avg_cost
def get_losses(self, out, cls_out, mask, gt_labels):
    loss_cls = L.mean(L.cross_entropy(cls_out, gt_labels)) * self.train_cfg['w_cls']

    loss_tir = 0
    for feat in out[:-1]:
        feat = L.squeeze(self.avgpool(feat), axes=[2, 3])
        loss_tir += self.triple_loss(feat, gt_labels) * self.train_cfg['w_tri']

    loss = loss_cls + loss_tir
    return dict(loss_cls=loss_cls, loss_tir=loss_tir, loss=loss)
def main():
    rnn_out = encoder_decoder()
    label = layers.data(
        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
    cost = layers.cross_entropy(input=rnn_out, label=label)
    avg_cost = fluid.layers.mean(cost)

    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
    optimizer.minimize(avg_cost)

    # fluid.memory_optimize(fluid.default_main_program())
    fluid.release_memory(fluid.default_main_program())

    # fix the order of training data
    train_data = paddle.batch(
        paddle.dataset.wmt14.train(dict_size), batch_size=batch_size)

    # train_data = paddle.batch(
    #     paddle.reader.shuffle(
    #         paddle.dataset.wmt14.train(dict_size), buf_size=1000),
    #     batch_size=batch_size)

    place = core.CPUPlace()
    exe = Executor(place)

    exe.run(framework.default_startup_program())

    feed_order = [
        'src_word_id', 'target_language_word', 'target_language_next_word'
    ]

    feed_list = [
        fluid.default_main_program().global_block().var(var_name)
        for var_name in feed_order
    ]
    feeder = fluid.DataFeeder(feed_list, place)

    batch_id = 0
    for pass_id in xrange(10):
        for data in train_data():
            outs = exe.run(fluid.default_main_program(),
                           feed=feeder.feed(data),
                           fetch_list=[avg_cost])
            avg_cost_val = np.array(outs[0])
            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                  " avg_cost=" + str(avg_cost_val))
            if batch_id > 2:
                exit(0)
            if math.isnan(float(avg_cost_val)):
                sys.exit("got NaN loss, training failed.")
            batch_id += 1
def learn(self, act_prob, action, reward, length=None):
    """
    update policy model self.model with policy gradient algorithm
    """
    self.reward = fluid.layers.py_func(
        func=reward_func, x=[action, length], out=reward)
    neg_log_prob = layers.cross_entropy(act_prob, action)
    cost = neg_log_prob * reward
    cost = (layers.reduce_sum(cost) / layers.reduce_sum(length)
            ) if length is not None else layers.reduce_mean(cost)
    optimizer = fluid.optimizer.Adam(self.lr)
    optimizer.minimize(cost)
    return cost
def learn(self, obs, action, reward):
    obs = fluid.dygraph.to_variable(obs)
    obs = layers.cast(obs, dtype='float32')
    act_prob = self.model(obs)

    action = fluid.dygraph.to_variable(action)
    reward = fluid.dygraph.to_variable(reward)

    log_prob = layers.cross_entropy(act_prob, action)
    cost = log_prob * reward
    cost = layers.cast(cost, dtype='float32')
    cost = layers.reduce_mean(cost)
    cost.backward()
    self.optimizer.minimize(cost)
    self.model.clear_gradients()
    return cost
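# Side note (toy sketch with invented values): with hard labels,
# layers.cross_entropy(act_prob, action) returns -log(act_prob[action]), i.e.
# the negative log-likelihood of the chosen action, which is the quantity the
# policy-gradient updates above scale by the reward.
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers

with fluid.dygraph.guard():
    act_prob = fluid.dygraph.to_variable(np.array([[0.2, 0.8]], dtype='float32'))
    action = fluid.dygraph.to_variable(np.array([[1]], dtype='int64'))
    neg_log_prob = layers.cross_entropy(act_prob, action)
    print(neg_log_prob.numpy())  # ~[[0.223]]
    print(-np.log(0.8))          # same value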
def main():
    rnn_out = encoder_decoder()
    label = layers.data(
        name="target_language_next_word",
        shape=[1],
        dtype='int64',
        lod_level=1)
    cost = layers.cross_entropy(input=rnn_out, label=label)
    avg_cost = fluid.layers.mean(cost)

    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
    optimizer.minimize(avg_cost)

    # fluid.memory_optimize(fluid.default_main_program())
    fluid.release_memory(fluid.default_main_program())

    # fix the order of training data
    train_data = paddle.batch(
        paddle.dataset.wmt14.train(dict_size), batch_size=batch_size)

    # train_data = paddle.batch(
    #     paddle.reader.shuffle(
    #         paddle.dataset.wmt14.train(dict_size), buf_size=1000),
    #     batch_size=batch_size)

    place = core.CPUPlace()
    exe = Executor(place)

    exe.run(framework.default_startup_program())

    batch_id = 0
    for pass_id in xrange(10):
        for data in train_data():
            word_data = to_lodtensor(map(lambda x: x[0], data), place)
            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
            outs = exe.run(fluid.default_main_program(),
                           feed={
                               'src_word_id': word_data,
                               'target_language_word': trg_word,
                               'target_language_next_word': trg_word_next
                           },
                           fetch_list=[avg_cost])
            avg_cost_val = np.array(outs[0])
            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                  " avg_cost=" + str(avg_cost_val))
            if batch_id > 2:
                exit(0)
            if math.isnan(float(avg_cost_val)):
                sys.exit("got NaN loss, training failed.")
            batch_id += 1
def train_main(use_cuda):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    context = encoder()
    state_cell = decoder_state_cell(context)
    rnn_out = decoder_train(state_cell)
    label = layers.data(
        name="target_next_word", shape=[1], dtype='int64', lod_level=1)
    cost = layers.cross_entropy(input=rnn_out, label=label)
    avg_cost = layers.mean(x=cost)

    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-3)
    optimizer.minimize(avg_cost)

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
        batch_size=batch_size)
    feed_order = ['src_word', 'target_word', 'target_next_word']

    exe = Executor(place)

    def train_loop(main_program):
        exe.run(framework.default_startup_program())

        feed_list = [
            main_program.global_block().var(var_name)
            for var_name in feed_order
        ]
        feeder = fluid.DataFeeder(feed_list, place)

        for pass_id in range(1):
            for batch_id, data in enumerate(train_reader()):
                outs = exe.run(main_program,
                               feed=feeder.feed(data),
                               fetch_list=[avg_cost])
                avg_cost_val = np.array(outs[0])
                print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                      " avg_cost=" + str(avg_cost_val))
                if batch_id > 3:
                    break

    train_loop(framework.default_main_program())
def test_recognize_digits_mlp(self):
    program = Program()
    with program_guard(program, startup_program=Program()):
        # Change g_program, so the rest layers use `g_program`
        images = layers.data(name='pixel', shape=[784], dtype='float32')
        label = layers.data(name='label', shape=[1], dtype='int32')
        hidden1 = layers.fc(input=images, size=128, act='relu')
        hidden2 = layers.fc(input=hidden1, size=64, act='relu')
        predict = layers.fc(
            input=[hidden2, hidden1],
            size=10,
            act='softmax',
            param_attr=["sftmax.w1", "sftmax.w2"])
        cost = layers.cross_entropy(input=predict, label=label)
        avg_cost = layers.mean(cost)
        self.assertIsNotNone(avg_cost)

    print(str(program))
def test_word_embedding(self):
    program = Program()
    with program_guard(program, startup_program=Program()):
        dict_size = 10000
        embed_size = 32
        first_word = layers.data(name='firstw', shape=[1], dtype='int64')
        second_word = layers.data(name='secondw', shape=[1], dtype='int64')
        third_word = layers.data(name='thirdw', shape=[1], dtype='int64')
        forth_word = layers.data(name='forthw', shape=[1], dtype='int64')
        next_word = layers.data(name='nextw', shape=[1], dtype='int64')

        embed_first = layers.embedding(
            input=first_word,
            size=[dict_size, embed_size],
            dtype='float32',
            param_attr='shared_w')
        embed_second = layers.embedding(
            input=second_word,
            size=[dict_size, embed_size],
            dtype='float32',
            param_attr='shared_w')
        embed_third = layers.embedding(
            input=third_word,
            size=[dict_size, embed_size],
            dtype='float32',
            param_attr='shared_w')
        embed_forth = layers.embedding(
            input=forth_word,
            size=[dict_size, embed_size],
            dtype='float32',
            param_attr='shared_w')

        concat_embed = layers.concat(
            input=[embed_first, embed_second, embed_third, embed_forth],
            axis=1)

        hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid')
        predict_word = layers.fc(input=hidden1, size=dict_size, act='softmax')
        cost = layers.cross_entropy(input=predict_word, label=next_word)
        avg_cost = layers.mean(cost)
        self.assertIsNotNone(avg_cost)

    print(str(program))
def forward(self, cue, label, return_loss=True):
    out = self.conv1(cue)
    out = self.norm1(out)
    out = self.maxpool(out)
    out = self.conv2(out)
    out = self.norm2(out)
    out = self.avgpool(out)
    if return_loss:
        cls_out = L.dropout(out, dropout_prob=0.5, is_test=False)
        cls_out = self.fc(L.squeeze(cls_out, axes=[2, 3]))
        loss_cls = L.mean(L.cross_entropy(cls_out, label))
        losses = dict(loss_cls=loss_cls, loss=loss_cls)
        return losses
    else:
        cls_out = self.fc(L.squeeze(out, axes=[2, 3]))
        cls_out = L.softmax(cls_out).numpy()[:, 0]
        return cls_out
def get_losses(self, out, cls_out, mask, gt_labels):
    loss_cls = L.mean(L.cross_entropy(cls_out, gt_labels)) * self.train_cfg['w_cls']

    cue = out[-1] if self.train_cfg['with_mask'] else L.elementwise_mul(
        out[-1], L.cast(gt_labels, 'float32'), axis=0)
    num_reg = L.cast(
        L.reduce_sum(gt_labels) * cue.shape[1] * cue.shape[2] * cue.shape[3],
        'float32')
    loss_reg = L.reduce_sum(
        L.abs(mask - cue)) / (num_reg + 1e-8) * self.train_cfg['w_reg']

    loss_tir = 0
    for feat in out[:-1]:
        feat = L.squeeze(self.avgpool(feat), axes=[2, 3])
        loss_tir += self.triple_loss(feat, gt_labels) * self.train_cfg['w_tri']

    loss = loss_cls + loss_reg + loss_tir
    return dict(loss_cls=loss_cls, loss_reg=loss_reg, loss_tir=loss_tir, loss=loss)
def node_classify_model(config):
    """Build node classify model."""
    nodes = fl.data('nodes', shape=[None, 1], dtype='int64')
    labels = fl.data('labels', shape=[None, 1], dtype='int64')

    embed_nodes = fl.embedding(
        input=nodes,
        size=[config.num_nodes, config.embed_dim],
        param_attr=fluid.ParamAttr(name='weight'))
    embed_nodes.stop_gradient = True

    probs = fl.fc(input=embed_nodes, size=config.num_labels, act='softmax')
    predict = fl.argmax(probs, axis=-1)
    loss = fl.cross_entropy(input=probs, label=labels)
    loss = fl.reduce_mean(loss)

    return {
        'loss': loss,
        'probs': probs,
        'predict': predict,
        'labels': labels,
    }
def test_raw_api(self):
    prog = Program()
    startup_prog = Program()
    with program_guard(prog, startup_prog):
        image = layers.data(name='x', shape=[784], dtype='float32')
        label = layers.data(name='y', shape=[1], dtype='int64')

        limit = layers.fill_constant_batch_size_like(
            input=label, dtype='int64', shape=[1], value=5.0)
        cond = layers.less_than(x=label, y=limit)
        true_image, false_image = layers.split_lod_tensor(
            input=image, mask=cond)

        true_out = layers.create_tensor(dtype='float32')
        true_cond = layers.ConditionalBlock([true_image])
        with true_cond.block():
            hidden = layers.fc(input=true_image, size=100, act='tanh')
            prob = layers.fc(input=hidden, size=10, act='softmax')
            layers.assign(input=prob, output=true_out)

        false_out = layers.create_tensor(dtype='float32')
        false_cond = layers.ConditionalBlock([false_image])
        with false_cond.block():
            hidden = layers.fc(input=false_image, size=200, act='tanh')
            prob = layers.fc(input=hidden, size=10, act='softmax')
            layers.assign(input=prob, output=false_out)

        prob = layers.merge_lod_tensor(
            in_true=true_out, in_false=false_out, mask=cond, x=image)
        loss = layers.cross_entropy(input=prob, label=label)
        avg_loss = layers.mean(loss)

        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
        optimizer.minimize(avg_loss, startup_prog)

    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=8192),
        batch_size=200)

    place = core.CPUPlace()
    exe = Executor(place)
    exe.run(startup_prog)

    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
        for data in train_reader():
            x_data = np.array(map(lambda x: x[0], data)).astype("float32")
            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
            y_data = np.expand_dims(y_data, axis=1)

            outs = exe.run(prog,
                           feed={'x': x_data, 'y': y_data},
                           fetch_list=[avg_loss])
            print outs[0]
            if outs[0] < 1.0:
                return
    self.assertFalse(True)
def knowledge_seq2seq(config):
    """ knowledge seq2seq """
    emb_size = config.embed_size
    hidden_size = config.hidden_size
    input_size = emb_size
    num_layers = config.num_layers
    bi_direc = config.bidirectional
    batch_size = config.batch_size
    vocab_size = config.vocab_size
    run_type = config.run_type

    enc_input = layers.data(
        name="enc_input", shape=[1], dtype='int64', lod_level=1)  # enc_input --> goal
    enc_mask = layers.data(name="enc_mask", shape=[-1, 1], dtype='float32')
    goal_input = layers.data(
        name="goal_input", shape=[1], dtype='int64', lod_level=1)  # goal_input --> x
    cue_input = layers.data(
        name="cue_input", shape=[1], dtype='int64', lod_level=1)  # cue_input --> kg
    # cue_mask = layers.data(name='cue_mask', shape=[-1, 1], dtype='float32')
    memory_mask = layers.data(name='memory_mask', shape=[-1, 1], dtype='float32')
    tar_input = layers.data(
        name='tar_input', shape=[1], dtype='int64', lod_level=1)  # tar_input --> y
    # tar_mask = layers.data(name="tar_mask", shape=[-1, 1], dtype='float32')

    rnn_hidden_size = hidden_size
    if bi_direc:
        rnn_hidden_size //= 2

    enc_out, enc_last_hidden = \
        rnn_encoder(enc_input, vocab_size, input_size, rnn_hidden_size,
                    batch_size, num_layers, bi_direc, dropout=0.0,
                    batch_first=True, name="rnn_enc")
    goal_out, goal_last_hidden = \
        rnn_encoder(goal_input, vocab_size, input_size, rnn_hidden_size,
                    batch_size, num_layers, bi_direc, dropout=0.0,
                    batch_first=True, name="rnn_enc1")

    context_goal_out = fluid.layers.concat(
        input=[enc_last_hidden, goal_last_hidden], axis=2)
    context_goal_out = layers.reshape(
        context_goal_out, shape=[-1, 1, rnn_hidden_size * 4])
    # context_goal_out = layers.squeeze(context_goal_out, axes=[1])
    context_goal_out = fluid.layers.fc(
        context_goal_out, size=rnn_hidden_size * 2, bias_attr=False)
    context_goal_out = layers.unsqueeze(context_goal_out, axes=[0])
    bridge_out = fc(context_goal_out, hidden_size, hidden_size, name="bridge")
    bridge_out = layers.tanh(bridge_out)

    cue_last_mask = layers.data(name='cue_last_mask', shape=[-1], dtype='float32')
    knowledge_out, knowledge_last_hidden = \
        rnn_encoder(cue_input, vocab_size, input_size, rnn_hidden_size,
                    batch_size, num_layers, bi_direc, dropout=0.0,
                    batch_first=True, last_mask=cue_last_mask,
                    name="knowledge_enc")

    query = layers.slice(bridge_out, axes=[0], starts=[0], ends=[1])
    query = layers.squeeze(query, axes=[0])
    query = layers.unsqueeze(query, axes=[1])
    query = layers.reshape(query, shape=[batch_size, -1, hidden_size])
    cue_memory = layers.slice(
        knowledge_last_hidden, axes=[0], starts=[0], ends=[1])
    cue_memory = layers.reshape(cue_memory, shape=[batch_size, -1, hidden_size])
    memory_mask = layers.reshape(memory_mask, shape=[batch_size, 1, -1])

    weighted_cue, cue_att = dot_attention(query, cue_memory, mask=memory_mask)
    cue_att = layers.reshape(cue_att, shape=[batch_size, -1])

    knowledge = weighted_cue
    if config.use_posterior:
        print("config.use_posterior", config.use_posterior)
        target_out, target_last_hidden = \
            rnn_encoder(tar_input, vocab_size, input_size, rnn_hidden_size,
                        batch_size, num_layers, bi_direc, dropout=0.0,
                        batch_first=True, name="knowledge_enc1")
        target_goal_out = fluid.layers.concat(
            input=[target_last_hidden, goal_last_hidden], axis=2)
        target_goal_out = layers.reshape(
            target_goal_out, shape=[-1, 1, rnn_hidden_size * 4])
        # target_goal_out = layers.squeeze(target_goal_out, axes=[1])
        target_goal_out = fluid.layers.fc(
            target_goal_out, size=rnn_hidden_size * 2, bias_attr=False)
        target_goal_out = layers.unsqueeze(target_goal_out, axes=[0])

        # get attenion
        # target_query = layers.slice(target_last_hidden, axes=[0], starts=[0], ends=[1])
        target_query = layers.slice(
            target_goal_out, axes=[0], starts=[0], ends=[1])
        target_query = layers.squeeze(target_query, axes=[0])
        target_query = layers.unsqueeze(target_query, axes=[1])
        target_query = layers.reshape(
            target_query, shape=[batch_size, -1, hidden_size])

        weight_target, target_att = dot_attention(
            target_query, cue_memory, mask=memory_mask)
        target_att = layers.reshape(target_att, shape=[batch_size, -1])
        # add to output
        knowledge = weight_target

    enc_memory_mask = layers.data(
        name="enc_memory_mask", shape=[-1, 1], dtype='float32')
    enc_memory_mask = layers.unsqueeze(enc_memory_mask, axes=[1])
    # decoder init_hidden, enc_memory, enc_mask
    dec_init_hidden = bridge_out
    pad_value = fluid.layers.assign(input=np.array([0.0], dtype='float32'))

    enc_memory, origl_len_1 = layers.sequence_pad(x=enc_out, pad_value=pad_value)
    enc_memory.persistable = True

    gru_unit = GRU_unit(
        input_size + hidden_size, hidden_size,
        num_layers=num_layers, dropout=0.0, name="decoder_gru_unit")
    cue_gru_unit = GRU_unit(
        hidden_size + hidden_size, hidden_size,
        num_layers=num_layers, dropout=0.0, name="decoder_cue_gru_unit")

    tgt_vocab_size = config.vocab_size
    if run_type == "train":
        if config.use_bow:
            bow_logits = fc(knowledge, hidden_size, hidden_size, name='bow_fc_1')
            bow_logits = layers.tanh(bow_logits)
            bow_logits = fc(bow_logits, hidden_size, tgt_vocab_size, name='bow_fc_2')
            bow_logits = layers.softmax(bow_logits)

            bow_label = layers.data(
                name='bow_label', shape=[-1, config.max_len], dtype='int64')
            bow_mask = layers.data(
                name="bow_mask", shape=[-1, config.max_len], dtype='float32')

            bow_logits = layers.expand(bow_logits, [1, config.max_len, 1])
            bow_logits = layers.reshape(bow_logits, shape=[-1, tgt_vocab_size])
            bow_label = layers.reshape(bow_label, shape=[-1, 1])
            bow_loss = layers.cross_entropy(bow_logits, bow_label, soft_label=False)
            bow_loss = layers.reshape(bow_loss, shape=[-1, config.max_len])

            bow_loss *= bow_mask
            bow_loss = layers.reduce_sum(bow_loss, dim=[1])
            bow_loss = layers.reduce_mean(bow_loss)

        dec_input = layers.data(name="dec_input", shape=[-1, 1, 1], dtype='int64')
        dec_mask = layers.data(name="dec_mask", shape=[-1, 1], dtype='float32')

        dec_knowledge = weight_target
        knowledge_goal_out = fluid.layers.concat(
            input=[dec_knowledge, target_query], axis=2)
        knowledge_goal_out = layers.reshape(
            knowledge_goal_out, shape=[-1, 1, rnn_hidden_size * 4])
        # knowledge_goal_out = layers.squeeze(knowledge_goal_out, axes=[1])
        knowledge_goal_out = fluid.layers.fc(
            knowledge_goal_out, size=rnn_hidden_size * 2, bias_attr=False)
        knowledge_goal_out = layers.unsqueeze(knowledge_goal_out, axes=[0])

        decoder_logits = \
            rnn_decoder(gru_unit, cue_gru_unit, dec_input, input_size,
                        hidden_size, num_layers, enc_memory, enc_memory_mask,
                        dec_knowledge, vocab_size,
                        init_hidden=dec_init_hidden, mask=dec_mask,
                        dropout=config.dropout)

        target_label = layers.data(name='target_label', shape=[-1, 1], dtype='int64')
        target_mask = layers.data(name='target_mask', shape=[-1, 1], dtype='float32')

        decoder_logits = layers.reshape(decoder_logits, shape=[-1, tgt_vocab_size])
        target_label = layers.reshape(target_label, shape=[-1, 1])

        nll_loss = layers.cross_entropy(decoder_logits, target_label, soft_label=False)
        nll_loss = layers.reshape(nll_loss, shape=[batch_size, -1])
        nll_loss *= target_mask
        nll_loss = layers.reduce_sum(nll_loss, dim=[1])
        nll_loss = layers.reduce_mean(nll_loss)

        prior_attn = cue_att + 1e-10
        posterior_att = target_att
        posterior_att.stop_gradient = True

        prior_attn = layers.log(prior_attn)

        kl_loss = posterior_att * (layers.log(posterior_att + 1e-10) - prior_attn)
        kl_loss = layers.reduce_mean(kl_loss)

        kl_and_nll_factor = layers.data(
            name='kl_and_nll_factor', shape=[1], dtype='float32')
        kl_and_nll_factor = layers.reshape(kl_and_nll_factor, shape=[-1])

        final_loss = bow_loss + kl_loss * kl_and_nll_factor + nll_loss * kl_and_nll_factor

        return [bow_loss, kl_loss, nll_loss, final_loss]

    elif run_type == "test":
        beam_size = config.beam_size
        batch_size = config.batch_size
        token = layers.fill_constant(
            shape=[batch_size * beam_size, 1], value=config.bos_id, dtype='int64')
        token = layers.reshape(token, shape=[-1, 1])
        max_decode_len = config.max_dec_len

        dec_knowledge = knowledge
        INF = 100000000.0

        init_score_np = np.ones([beam_size * batch_size], dtype='float32') * -INF
        for i in range(batch_size):
            init_score_np[i * beam_size] = 0.0

        pre_score = layers.assign(init_score_np)

        pos_index_np = np.arange(batch_size).reshape(-1, 1)
        pos_index_np = \
            np.tile(pos_index_np, (1, beam_size)).reshape(-1).astype('int32') * beam_size
        pos_index = layers.assign(pos_index_np)

        id_array = []
        score_array = []
        index_array = []

        init_enc_memory = layers.expand(enc_memory, [1, beam_size, 1])
        init_enc_memory = layers.reshape(
            init_enc_memory, shape=[batch_size * beam_size, -1, hidden_size])
        init_enc_mask = layers.expand(enc_memory_mask, [1, beam_size, 1])
        init_enc_mask = layers.reshape(
            init_enc_mask, shape=[batch_size * beam_size, 1, -1])

        dec_knowledge = layers.reshape(dec_knowledge, shape=[-1, 1, hidden_size])
        init_dec_knowledge = layers.expand(dec_knowledge, [1, beam_size, 1])
        init_dec_knowledge = layers.reshape(
            init_dec_knowledge, shape=[batch_size * beam_size, -1, hidden_size])

        dec_init_hidden = layers.expand(dec_init_hidden, [1, 1, beam_size])
        dec_init_hidden = layers.reshape(dec_init_hidden, shape=[1, -1, hidden_size])

        length_average = config.length_average
        UNK = config.unk_id
        EOS = config.eos_id

        for i in range(1, max_decode_len + 1):
            dec_emb = get_embedding(token, input_size, vocab_size)
            dec_out, dec_last_hidden = \
                decoder_step(gru_unit, cue_gru_unit, dec_emb, dec_init_hidden,
                             input_size, hidden_size, init_enc_memory,
                             init_enc_mask, init_dec_knowledge, mask=None)
            output_in_size = hidden_size + hidden_size

            rnnout = layers.dropout(dec_out, dropout_prob=config.dropout, is_test=True)
            rnnout = fc(rnnout, output_in_size, hidden_size, name='dec_out_fc1')
            rnnout = fc(rnnout, hidden_size, vocab_size, name='dec_out_fc2')

            log_softmax_output = log_softmax(rnnout)
            log_softmax_output = layers.squeeze(log_softmax_output, axes=[1])

            if i > 1:
                if length_average:
                    log_softmax_output = layers.elementwise_add(
                        (log_softmax_output / i),
                        (pre_score * (1.0 - 1.0 / i)), axis=0)
                else:
                    log_softmax_output = layers.elementwise_add(
                        log_softmax_output, pre_score, axis=0)
            else:
                log_softmax_output = layers.elementwise_add(
                    log_softmax_output, pre_score, axis=0)

            log_softmax_output = layers.reshape(
                log_softmax_output, shape=[batch_size, -1])

            topk_score, topk_index = layers.topk(log_softmax_output, k=beam_size)
            topk_score = layers.reshape(topk_score, shape=[-1])
            topk_index = layers.reshape(topk_index, shape=[-1])

            vocab_var = layers.fill_constant([1], dtype='int64', value=vocab_size)
            new_token = topk_index % vocab_var

            index = topk_index // vocab_var
            id_array.append(new_token)
            index_array.append(index)
            index = index + pos_index

            score_array.append(topk_score)

            eos_ids = layers.fill_constant(
                [beam_size * batch_size], dtype='int64', value=EOS)
            unk_ids = layers.fill_constant(
                [beam_size * batch_size], dtype='int64', value=UNK)
            eos_eq = layers.cast(layers.equal(new_token, eos_ids), dtype='float32')
            topk_score += eos_eq * -100000000.0
            unk_eq = layers.cast(layers.equal(new_token, unk_ids), dtype='float32')
            topk_score += unk_eq * -100000000.0

            # update
            token = new_token
            pre_score = topk_score
            token = layers.reshape(token, shape=[-1, 1])

            index = layers.cast(index, dtype='int32')
            dec_last_hidden = layers.squeeze(dec_last_hidden, axes=[0])
            dec_init_hidden = layers.gather(dec_last_hidden, index=index)
            dec_init_hidden = layers.unsqueeze(dec_init_hidden, axes=[0])
            init_enc_memory = layers.gather(init_enc_memory, index)
            init_enc_mask = layers.gather(init_enc_mask, index)
            init_dec_knowledge = layers.gather(init_dec_knowledge, index)

        final_score = layers.concat(score_array, axis=0)
        final_ids = layers.concat(id_array, axis=0)
        final_index = layers.concat(index_array, axis=0)

        final_score = layers.reshape(
            final_score, shape=[max_decode_len, beam_size * batch_size])
        final_ids = layers.reshape(
            final_ids, shape=[max_decode_len, beam_size * batch_size])
        final_index = layers.reshape(
            final_index, shape=[max_decode_len, beam_size * batch_size])

        return final_score, final_ids, final_index
def transformer(
        src_vocab_size,
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        src_pad_idx,
        trg_pad_idx,
        pos_pad_idx, ):
    file_obj = open_recordio_file(
        filename=os.environ.get('RECORDIO_FILENAME', '/tmp/wmt16.recordio'),
        shapes=[
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size, n_head, max_length, max_length],
            [batch_size, n_head, max_length, max_length],
            [batch_size, n_head, max_length, max_length],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
        ],
        dtypes=[
            'int64', 'int64', 'int64', 'int64', 'float32', 'float32',
            'float32', 'int64', 'float32',
        ],
        lod_levels=[0] * 9)
    src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, gold, weights = fluid.layers.read_file(
        file_obj)

    enc_input = prepare_encoder(
        src_word, src_pos, src_vocab_size, d_model, src_pad_idx, max_length,
        dropout_rate, )
    enc_output = encoder(
        enc_input, src_slf_attn_bias, n_layer, n_head, d_key, d_value,
        d_model, d_inner_hid, dropout_rate, )

    dec_input = prepare_decoder(
        trg_word, trg_pos, trg_vocab_size, d_model, trg_pad_idx, max_length,
        dropout_rate, )
    dec_output = decoder(
        dec_input, enc_output, trg_slf_attn_bias, trg_src_attn_bias, n_layer,
        n_head, d_key, d_value, d_model, d_inner_hid, dropout_rate, )

    # TODO(guosheng): Share the weight matrix between the embedding layers and
    # the pre-softmax linear transformation.
    predict = layers.reshape(
        x=layers.fc(input=dec_output,
                    size=trg_vocab_size,
                    param_attr=fluid.initializer.Xavier(uniform=False),
                    bias_attr=False,
                    num_flatten_dims=2),
        shape=[-1, trg_vocab_size],
        act="softmax")
    cost = layers.cross_entropy(input=predict, label=gold)
    weighted_cost = cost * weights
    return layers.reduce_sum(weighted_cost)
def train_main(use_cuda, is_sparse, is_local=True):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    context = encoder(is_sparse)
    rnn_out = decoder_train(context, is_sparse)
    label = pd.data(
        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
    cost = pd.cross_entropy(input=rnn_out, label=label)
    avg_cost = pd.mean(cost)

    optimizer = fluid.optimizer.Adagrad(
        learning_rate=1e-4,
        regularization=fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.1))
    optimize_ops, params_grads = optimizer.minimize(avg_cost)

    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
        batch_size=batch_size)

    exe = Executor(place)

    def train_loop(main_program):
        exe.run(framework.default_startup_program())

        batch_id = 0
        for pass_id in xrange(1):
            for data in train_data():
                word_data = to_lodtensor(map(lambda x: x[0], data), place)
                trg_word = to_lodtensor(map(lambda x: x[1], data), place)
                trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
                outs = exe.run(main_program,
                               feed={
                                   'src_word_id': word_data,
                                   'target_language_word': trg_word,
                                   'target_language_next_word': trg_word_next
                               },
                               fetch_list=[avg_cost])
                avg_cost_val = np.array(outs[0])
                print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                      " avg_cost=" + str(avg_cost_val))
                if batch_id > 3:
                    break
                batch_id += 1

    if is_local:
        train_loop(framework.default_main_program())
    else:
        port = os.getenv("PADDLE_INIT_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())
def transformer(
        src_vocab_size,
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        src_pad_idx,
        trg_pad_idx,
        pos_pad_idx, ):
    file_obj = fluid.layers.open_recordio_file(
        filename='./wmt16.recordio',
        shapes=[
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size, n_head, max_length, max_length],
            [batch_size, n_head, max_length, max_length],
            [batch_size, n_head, max_length, max_length],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
        ],
        dtypes=[
            'int64', 'int64', 'int64', 'int64', 'float32', 'float32',
            'float32', 'int64', 'float32',
        ],
        lod_levels=[0] * 9)
    src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, gold, weights = fluid.layers.read_file(
        file_obj)

    enc_input = prepare_encoder(
        src_word, src_pos, src_vocab_size, d_model, src_pad_idx, max_length,
        dropout_rate, )
    enc_output = encoder(
        enc_input, src_slf_attn_bias, n_layer, n_head, d_key, d_value,
        d_model, d_inner_hid, dropout_rate, )

    dec_input = prepare_decoder(
        trg_word, trg_pos, trg_vocab_size, d_model, trg_pad_idx, max_length,
        dropout_rate, )
    dec_output = decoder(
        dec_input, enc_output, trg_slf_attn_bias, trg_src_attn_bias, n_layer,
        n_head, d_key, d_value, d_model, d_inner_hid, dropout_rate, )

    # TODO(guosheng): Share the weight matrix between the embedding layers and
    # the pre-softmax linear transformation.
    predict = layers.reshape(
        x=layers.fc(input=dec_output,
                    size=trg_vocab_size,
                    param_attr=fluid.initializer.Xavier(uniform=False),
                    bias_attr=False,
                    num_flatten_dims=2),
        shape=[-1, trg_vocab_size],
        act="softmax")
    cost = layers.cross_entropy(input=predict, label=gold)
    weighted_cost = cost * weights
    return layers.reduce_sum(weighted_cost)
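# Toy illustration (plain numpy, invented values) of the weighted-cost pattern
# at the end of transformer(): per-token cross-entropy is multiplied by a 0/1
# weight so padding tokens contribute nothing to the summed loss.
import numpy as np

cost = np.array([[0.5], [1.2], [0.3]], dtype='float32')     # per-token CE
weights = np.array([[1.0], [1.0], [0.0]], dtype='float32')  # last token is padding
print(float((cost * weights).sum()))  # 1.7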