def infer(self, inputs, outputs):
    """Run model inference.

    In generation mode the call is delegated to ``self.generator.inference``.
    Otherwise the token-level cross-entropy and a per-token count of ones are
    scattered (``overwrite=False``) into batch-sized tensors, indexed by the
    first column of ``inputs["tgt_idx"]``.

    Returns:
        The generator's result in generation mode, else a dict with the keys
        ``"lm_loss"``, ``"tokens_num"`` and ``"data_id"``.
    """
    if self.do_generation:
        return self.generator.inference(self, inputs, outputs)

    enc_out = outputs["enc_out"]
    tgt_idx = inputs["tgt_idx"]

    logits = self._calc_logits(enc_out, tgt_idx)
    token_loss = layers.softmax_with_cross_entropy(
        logits=logits, label=inputs["tgt_label"])

    # Loss accumulator, one slot per row of enc_out.
    loss_acc = layers.fill_constant_batch_size_like(
        enc_out, [-1], self.dtype, 0)
    loss_acc = layers.scatter(
        loss_acc, tgt_idx[:, 0], token_loss[:, 0], overwrite=False)

    # Token counter built the same way, scattering ones instead of losses.
    count_acc = layers.fill_constant_batch_size_like(
        enc_out, [-1], self.dtype, 0)
    token_ones = layers.fill_constant_batch_size_like(
        token_loss, [-1], self.dtype, 1)
    count_acc = layers.scatter(
        count_acc, tgt_idx[:, 0], token_ones, overwrite=False)

    return {
        "lm_loss": loss_acc,
        "tokens_num": count_acc,
        "data_id": inputs["data_id"],
    }
def forward(self, src_word, src_pos, src_slf_attn_bias, trg_src_attn_bias):
    """Encode the source batch and decode it with beam search.

    Returns:
        The first element of the pair returned by ``self.beam_search_decoder``.
    """
    enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)

    # Initial per-layer decoder states (caches); they need to be updated
    # according to the selected beam. The third dimension starts at length 0.
    caches = []
    for _ in range(self.n_layer):
        empty_k = layers.fill_constant_batch_size_like(
            input=enc_output,
            shape=[-1, self.n_head, 0, self.d_key],
            dtype=enc_output.dtype,
            value=0)
        empty_v = layers.fill_constant_batch_size_like(
            input=enc_output,
            shape=[-1, self.n_head, 0, self.d_value],
            dtype=enc_output.dtype,
            value=0)
        caches.append({"k": empty_k, "v": empty_v})

    tile = TransformerBeamSearchDecoder.tile_beam_merge_with_batch
    enc_output = tile(enc_output, self.beam_size)
    trg_src_attn_bias = tile(trg_src_attn_bias, self.beam_size)
    static_caches = self.decoder.decoder.prepare_static_cache(enc_output)

    rs, _ = self.beam_search_decoder(
        inits=caches,
        enc_output=enc_output,
        trg_src_attn_bias=trg_src_attn_bias,
        static_caches=static_caches)
    return rs
def forward(self, inputs, is_infer=False):
    """Run model main forward.

    Args:
        inputs: mapping providing ``token_ids``, ``type_ids``, ``pos_ids``,
            ``generation_mask`` and optionally ``parent_idx``.
        is_infer: when true, empty per-layer generation caches are created
            before the network runs; otherwise the caches are set to ``None``.

    Returns:
        A dict with ``"enc_out"`` and, when not inferring, ``"checkpoints"``.
    """
    if is_infer:
        empty_caches = []
        for _ in range(self.n_layer):
            cache_k = layers.fill_constant_batch_size_like(
                input=inputs["token_ids"],
                shape=[-1, 0, self.d_key * self.n_head],
                dtype=self.dtype,
                value=0)
            cache_v = layers.fill_constant_batch_size_like(
                input=inputs["token_ids"],
                shape=[-1, 0, self.d_value * self.n_head],
                dtype=self.dtype,
                value=0)
            empty_caches.append({"k": cache_k, "v": cache_v})
        self.generation_caches = empty_caches
    else:
        self.generation_caches = None

    enc_out, generation_checkpoints = self._generation_network(
        token_ids=inputs["token_ids"],
        type_ids=inputs["type_ids"],
        pos_ids=inputs["pos_ids"],
        generation_mask=inputs["generation_mask"],
        gather_idx=inputs.get("parent_idx"))

    outputs = {"enc_out": enc_out}
    if not is_infer:
        outputs["checkpoints"] = generation_checkpoints
    return outputs
def _init_generation_caches(self, src_ids):
    """Create the per-layer generation caches once and return them.

    When the caches were already initialised, or ``self._fuse`` is exactly
    ``False``, the currently stored ``self.generation_caches`` is returned
    unchanged.
    """
    if self._init_gen_cache or self._fuse is False:
        return self.generation_caches

    self.generation_caches = []
    gpt = self.gpt
    num_heads = gpt.num_attention_heads
    num_layers = gpt.num_hidden_layers
    mp_n_head = num_heads // gpt.topo.mp_info.size
    head_size = gpt.hidden_size // num_heads

    for _ in range(num_layers):
        if self._fuse:
            # Fused layout: one tensor with a leading dimension of 2; the
            # batch dimension is taken at output index 1.
            fused_kv = layers.fill_constant_batch_size_like(
                input=src_ids,
                shape=[2, -1, mp_n_head, 0, head_size],
                dtype=self._dtype,
                value=0,
                output_dim_idx=1)
            layer_cache = TransformerDecoderLayer.Cache(fused_kv)
        else:
            empty_k = layers.fill_constant_batch_size_like(
                input=src_ids,
                shape=[-1, mp_n_head, 0, head_size],
                dtype=self._dtype,
                value=0)
            empty_v = layers.fill_constant_batch_size_like(
                input=src_ids,
                shape=[-1, mp_n_head, 0, head_size],
                dtype=self._dtype,
                value=0)
            layer_cache = MultiHeadAttention.Cache(empty_k, empty_v)
        self.generation_caches.append(layer_cache)

    self._init_gen_cache = True
    return self.generation_caches
def _prepare_timestep_input(self, state, step_idx):
    """Assemble the model inputs for one decoding step.

    Reads the entries stored at ``step_idx`` in the arrays held by ``state``
    and reorders position ids and the generation mask by
    ``state["parent_idx"]``.

    Returns:
        A tuple ``(model_input, pre_ids, pre_scores)``.
    """
    parent_idx = state["parent_idx"]
    model_input = {"gather_idx": parent_idx}

    # token ids
    pre_ids = layers.array_read(array=state["tgt_ids"], i=step_idx)
    model_input["token_ids"] = layers.unsqueeze(pre_ids, 1)

    # position ids
    step_pos = layers.array_read(array=state["tgt_pos"], i=step_idx)
    model_input["pos_ids"] = layers.gather(step_pos, parent_idx)

    pre_scores = layers.array_read(array=state["scores"], i=step_idx)

    # generation mask: extend by one column of ones along axis 2, then reorder
    step_mask = layers.array_read(state["tgt_generation_mask"], i=step_idx)
    extra_col = layers.fill_constant_batch_size_like(
        pre_ids, [-1, 1, 1], "float32", 1.0)
    step_mask = layers.concat([step_mask, extra_col], axis=2)
    gathered_mask = layers.gather(step_mask, parent_idx)
    model_input["generation_mask"] = gathered_mask

    model_input["type_ids"] = layers.fill_constant_batch_size_like(
        gathered_mask, [-1, 1, 1], "int64", 1)
    if self.use_role:
        model_input["role_ids"] = layers.fill_constant_batch_size_like(
            gathered_mask, [-1, 1, 1], "int64", 0)

    return model_input, pre_ids, pre_scores
def forward(self, inputs, is_infer=False):
    """Run model main forward.

    At inference the latent weights are a one-hot encoding of
    ``inputs["latent_id"]``; otherwise they come from the recognition network
    through ``self._gumbel_softmax``.

    Returns:
        A dict with ``"enc_out"``; when not inferring it also holds
        ``"post_probs"`` and ``"checkpoints"`` (recognition checkpoints
        extended with the generation checkpoints).
    """
    outputs = {}

    if is_infer:
        empty_caches = []
        for _ in range(self.n_layer):
            cache_k = layers.fill_constant_batch_size_like(
                input=inputs["token_ids"],
                shape=[-1, 0, self.d_key * self.n_head],
                dtype=self.dtype,
                value=0)
            cache_v = layers.fill_constant_batch_size_like(
                input=inputs["token_ids"],
                shape=[-1, 0, self.d_value * self.n_head],
                dtype=self.dtype,
                value=0)
            empty_caches.append({"k": cache_k, "v": cache_v})
        self.generation_caches = empty_caches
    else:
        self.generation_caches = None

    latent_attr = fluid.ParamAttr(
        name=self.latent_emb_name, initializer=self.param_initializer)
    latent_embeddings = layers.create_parameter(
        shape=[self.emb_size, self.latent_type_size],
        dtype=self.dtype,
        attr=latent_attr)

    if is_infer:
        weights = layers.one_hot(inputs["latent_id"], self.latent_type_size)
    else:
        logits, recognition_checkpoints = self._recognition_network(
            token_ids=inputs["token_ids"],
            type_ids=inputs["type_ids"],
            pos_ids=inputs["pos_ids"],
            role_ids=inputs.get("role_ids"),
            recognition_mask=inputs["recognition_mask"],
        )
        outputs["post_probs"] = layers.softmax(logits)
        weights = self._gumbel_softmax(logits)
        outputs["checkpoints"] = recognition_checkpoints

    latent_emb = layers.matmul(
        x=weights, y=latent_embeddings, transpose_y=True)

    enc_out, generation_checkpoints = self._generation_network(
        token_ids=inputs["token_ids"],
        type_ids=inputs["type_ids"],
        pos_ids=inputs["pos_ids"],
        role_ids=inputs.get("role_ids"),
        generation_mask=inputs["generation_mask"],
        aux_emb=layers.unsqueeze(latent_emb, axes=[1]),
        gather_idx=inputs.get("parent_idx"),
    )
    outputs["enc_out"] = enc_out

    if not is_infer:
        outputs["checkpoints"].extend(generation_checkpoints)
    return outputs
def _get_statistics(self, inputs, outputs):
    """Collect count statistics for the current batch.

    Each statistic is the sum of a ones tensor shaped like the first
    dimension of the reference input.

    Returns:
        A dict with ``"batch_size"`` and, when ``"tgt_label"`` is present in
        ``inputs``, ``"tokens_num"``.
    """

    def _count_rows(reference):
        ones = layers.fill_constant_batch_size_like(
            input=reference, value=1.0, shape=[-1], dtype="int64")
        return layers.reduce_sum(ones)

    statistics = {}
    if "tgt_label" in inputs:
        statistics["tokens_num"] = _count_rows(inputs["tgt_label"])
    statistics["batch_size"] = _count_rows(inputs["token_ids"])
    return statistics
def fluid_sequence_first_step(lodtensor):
    """Slice every sequence of ``lodtensor`` at offset 0 with length 1.

    Returns:
        The result of ``layers.sequence_slice`` (a LoD tensor per the
        original documentation).
    """
    start = layers.fill_constant_batch_size_like(
        lodtensor, shape=[-1, 1], value=0, dtype='int64')
    one_step = layers.fill_constant_batch_size_like(
        lodtensor, shape=[-1, 1], value=1, dtype='int64')
    return layers.sequence_slice(lodtensor, offset=start, length=one_step)
def forward(self):
    """Build the weighted sigmoid cross-entropy loss over src/dst embeddings.

    The first candidate of each row is labelled positive (1) and weighted by
    ``self.neg_num``; the remaining ``self.neg_num`` candidates are labelled
    0 with weight 1. The resulting loss is stored on ``self.loss``.

    Returns:
        The loss variable.
    """
    src, dst = L.read_file(self.pyreader)

    if self.is_sparse:
        # sparse mode uses 2-dim input
        src = L.reshape(src, [-1, 1])
        dst = L.reshape(dst, [-1, 1])

    embed_args = (self.num_nodes, self.hidden_size, self.embed_init,
                  "weight", self.num_part, self.is_sparse)
    src_embed = split_embedding(src, *embed_args)
    dst_embed = split_embedding(dst, *embed_args)

    if self.is_sparse:
        src_embed = L.reshape(
            src_embed, [-1, 1, self.num_featuers, self.hidden_size])
        dst_embed = L.reshape(
            dst_embed,
            [-1, self.neg_num + 1, self.num_featuers, self.hidden_size])

    src_embed = L.reduce_mean(src_embed, 2)
    dst_embed = L.reduce_mean(dst_embed, 2)

    # [batch_size, 1, neg_num+1]
    logits = L.matmul(src_embed, dst_embed, transpose_y=True)

    positive_label = L.fill_constant_batch_size_like(
        logits, [-1, 1, 1], "float32", 1)
    negative_label = L.fill_constant_batch_size_like(
        logits, [-1, 1, self.neg_num], "float32", 0)
    label = L.concat([positive_label, negative_label], -1)

    positive_weight = L.fill_constant_batch_size_like(
        logits, [-1, 1, 1], "float32", self.neg_num)
    negative_weight = L.fill_constant_batch_size_like(
        logits, [-1, 1, self.neg_num], "float32", 1)
    weight = L.concat([positive_weight, negative_weight], -1)

    weight.stop_gradient = True
    label.stop_gradient = True

    loss = L.sigmoid_cross_entropy_with_logits(logits, label)
    loss = loss * weight
    loss = L.reduce_mean(loss)
    loss = loss * ((self.neg_num + 1) / 2 / self.neg_num)
    loss.persistable = True

    self.loss = loss
    return loss
def erniesage_v2_aggregator(gw, feature, hidden_size, act, initializer,
                            learning_rate, name):
    """Aggregate self and neighbour ERNIE features for a graph wrapper.

    Neighbour messages produced by ``ernie_send`` are sum-pooled; the node's
    own term ids get a leading constant id of 1 and go through ``ErnieModel``.
    Both sides pass through a fully connected layer, are concatenated on
    axis 1 and L2-normalised.

    NOTE(review): ``initializer`` is accepted but not used in this body.
    """

    def _sum_pool(feat):
        return F.layers.sequence_pool(feat, pool_type="sum")

    feature = L.unsqueeze(feature, [-1])
    msg = gw.send(ernie_send, nfeat_list=[("term_ids", feature)])
    neigh_feature = gw.recv(msg, _sum_pool)

    cls = L.fill_constant_batch_size_like(feature, [-1, 1, 1], "int64", 1)
    term_ids = L.concat([cls, feature], 1)
    term_ids.stop_gradient = True

    ernie = ErnieModel(
        term_ids, L.zeros_like(term_ids), config=self.config.ernie_config)
    self_feature = ernie.get_pooled_output()

    self_feature = L.fc(
        self_feature,
        hidden_size,
        act=act,
        param_attr=F.ParamAttr(
            name=name + "_l", learning_rate=learning_rate),
    )
    neigh_feature = L.fc(
        neigh_feature,
        hidden_size,
        act=act,
        param_attr=F.ParamAttr(
            name=name + "_r", learning_rate=learning_rate),
    )

    merged = L.concat([self_feature, neigh_feature], axis=1)
    return L.l2_normalize(merged, axis=1)
def _apply_rule(condition, inputs, gmr_mask, grammar, name=None):
    """Score grammar rules and pad the scores for the remaining slots.

    Args:
        condition: cast to float32 and multiplied into the result along
            axis 0.
        inputs (Variable): shape = [batch_size, max_len, hidden_size];
            max_len is always 1 during inference (per the original note).
        gmr_mask: added element-wise to the fully connected output.
        grammar: provides ``grammar_size``, ``MAX_TABLE``, ``MAX_COLUMN``
            and ``MAX_VALUE``.
        name: optional prefix for the fully connected layer's parameters.

    Returns:
        The masked rule scores concatenated on the last axis with a block
        filled with ``-INF``, multiplied by ``condition``.
    """
    fc_name = name + '_apply_rule_fc' if name is not None else None

    condition = layers.cast(condition, dtype='float32')
    fc_kwargs = nn_utils.param_attr(fc_name, INIT_SCALE, need_bias=True)
    rule_scores = layers.fc(inputs, size=grammar.grammar_size, **fc_kwargs)
    masked_scores = layers.elementwise_add(rule_scores, gmr_mask)

    # Filler for the table/column/value slots; every entry is -INF.
    pad_width = grammar.MAX_TABLE + grammar.MAX_COLUMN + grammar.MAX_VALUE
    neg_inf_pad = layers.fill_constant_batch_size_like(
        masked_scores, shape=[-1, pad_width], dtype='float32', value=-INF)

    padded_scores = tensor.concat([masked_scores, neg_inf_pad], axis=-1)
    return layers.elementwise_mul(padded_scores, condition, axis=0)
def ernie_pool(self, term_ids):
    """Prepend ``self.config.cls_id`` to ``term_ids`` and run ``ErnieModel``.

    Returns:
        The first element of the pair returned by the model call.
    """
    cls_column = L.fill_constant_batch_size_like(
        term_ids, [-1, 1], "int64", self.config.cls_id)
    with_cls = L.concat([cls_column, term_ids], 1)

    ernie_model = ErnieModel(self.config.ernie_config, "")
    feature, _ = ernie_model(with_cls)
    return feature
def _recognition_network(self, token_ids, type_ids, pos_ids, role_ids,
                         input_mask):
    """Run the recognition network.

    Args:
        token_ids: the token id of each token, shape
            [batch_size, max_seq_len, 1].
        type_ids: the type of each token, shape [batch_size, max_seq_len, 1].
        pos_ids: the position of each token, shape
            [batch_size, max_seq_len, 1].
        role_ids: forwarded unchanged to ``self._gen_input``.
        input_mask: the attention masking matrix used in each Transformer
            block, shape [batch_size, max_seq_len + 1, max_seq_len + 1].

    Returns:
        Whatever ``self._encode`` returns; per the original documentation, a
        tuple of the Transformer output embeddings and this pass's
        checkpoints.
    """
    # One ``self.mask_id`` token per sample, embedded with the token table
    # and passed on as the auxiliary embedding.
    mask_token = layers.fill_constant_batch_size_like(
        input=token_ids, shape=[-1, 1, 1], value=self.mask_id, dtype="int64")
    token_table_attr = fluid.ParamAttr(
        name=self.token_emb_name, initializer=self.param_initializer)
    mask_emb = layers.embedding(
        input=mask_token,
        size=[self.vocab_size, self.emb_size],
        dtype=self.dtype,
        param_attr=token_table_attr)

    emb_out, attn_bias = self._gen_input(
        token_ids, type_ids, pos_ids, role_ids, input_mask, aux_emb=mask_emb)
    return self._encode(emb_out, attn_bias)
def fluid_sequence_index(input, index):
    """Pick one step per sequence of ``input`` at the offsets in ``index``.

    Args:
        input: the sequence tensor to slice.
        index: offsets, shape (batch_size, 1).
    """
    unit_length = layers.fill_constant_batch_size_like(
        input, shape=[-1, 1], value=1, dtype='int64')
    return layers.sequence_slice(input, offset=index, length=unit_length)
def _recognition_network(self, token_ids, type_ids, pos_ids, role_ids,
                         recognition_mask):
    """Run the recognition network and project it to latent-type logits.

    A ``self.mask_id`` token embedding is supplied as the auxiliary
    embedding. The slice ``[0:1]`` on axis 1 of the encoder output is passed
    through a tanh fully connected layer and then projected to
    ``self.latent_type_size`` logits.

    Returns:
        A tuple ``(logits, checkpoints)``.
    """
    mask_token = layers.fill_constant_batch_size_like(
        input=token_ids, shape=[-1, 1, 1], value=self.mask_id, dtype="int64")
    token_table_attr = fluid.ParamAttr(
        name=self.token_emb_name, initializer=self.param_initializer)
    mask_emb = layers.embedding(
        input=mask_token,
        size=[self.vocab_size, self.emb_size],
        dtype=self.dtype,
        param_attr=token_table_attr)

    emb_out, self_attn_mask = self._gen_input(
        token_ids, type_ids, pos_ids, role_ids, recognition_mask,
        aux_emb=mask_emb)
    encoded, checkpoints = self._encode(emb_out, self_attn_mask)

    first_step = layers.slice(
        input=encoded, axes=[1], starts=[0], ends=[1])
    hidden_feat = layers.fc(
        input=first_step,
        size=self.hidden_size,
        act="tanh",
        param_attr=fluid.ParamAttr(
            name="recognition_fc.w_0", initializer=self.param_initializer),
        bias_attr="recognition_fc.b_0")

    # The projection weight uses the name ``self.latent_emb_name``.
    logits = layers.fc(
        input=hidden_feat,
        size=self.latent_type_size,
        param_attr=fluid.ParamAttr(
            name=self.latent_emb_name, initializer=self.param_initializer),
        bias_attr="recognition_bias")
    return logits, checkpoints
def forward(self, feat):
    """Run the iterative LSTM/attention readout over ``feat``.

    Args:
        feat: input feature with shape [batch, n_edges, dim].

    Returns:
        Output feature of set2set pooling with shape [batch, 2*dim]
        (per the original documentation).
    """
    seqlen = 1

    # Zero initial LSTM states, transposed to put the layer axis first.
    h = L.transpose(
        L.fill_constant_batch_size_like(
            feat, [1, self.n_layers, self.input_dim], "float32", 0),
        [1, 0, 2])
    c = h

    # [seqlen, batch, dim]
    q_star = L.transpose(
        L.fill_constant_batch_size_like(
            feat, [1, seqlen, self.output_dim], "float32", 0),
        [1, 0, 2])

    for _ in range(self.n_iters):
        # q [seqlen, batch, dim]
        # h [layer, batch, dim]
        q, h, c = L.lstm(
            q_star,
            h,
            c,
            seqlen,
            self.input_dim,
            self.n_layers,
            is_bidirec=False)

        # scores [batch, seqlen, n_edges]
        scores = L.matmul(L.transpose(q, [1, 0, 2]), feat, transpose_y=True)
        # attention [batch, seqlen, n_edges]
        attention = L.softmax(scores)

        # readout [batch, seqlen, dim], then transposed to put seqlen first
        readout = L.transpose(L.matmul(attention, feat), [1, 0, 2])

        # q_star [seqlen, batch, dim + dim]
        q_star = L.concat([q, readout], -1)

    return L.squeeze(q_star, [0])
def fluid_sequence_advance(input, OOV):
    """Shift each sequence right by one step, filling the front with ``OOV``.

    Example (from the original documentation)::

        input.data  = [1,2,3, 4,5]
        input.lod   = [[0, 3, 5]]
        output.data = [0,1,2, 0,4]
        output.lod  = [[0, 3, 5]]
    """
    seq_len = fluid_sequence_get_seq_len(input)
    start = layers.fill_constant_batch_size_like(
        seq_len, shape=[-1, 1], value=0, dtype='int64')
    one_step = layers.fill_constant_batch_size_like(
        seq_len, shape=[-1, 1], value=1, dtype='int64')

    # One-step tensor per sequence whose values are all OOV.
    oov_step = layers.sequence_slice(input, start, one_step) * 0 + OOV
    oov_step.stop_gradient = True

    front_padded = layers.sequence_concat([oov_step, input])
    return layers.sequence_slice(front_padded, start, seq_len)
def test_ifelse(self):
    """Train an ``IfElse``-routed MNIST classifier until the loss is < 1.0.

    Samples are routed by ``label < 5`` into one of two small fully
    connected networks. The test returns as soon as a fetched average loss
    drops below 1.0 and fails if that never happens within ``PASS_NUM``
    passes.
    """
    prog = Program()
    startup_prog = Program()
    with program_guard(prog, startup_prog):
        image = layers.data(name='x', shape=[784], dtype='float32')
        label = layers.data(name='y', shape=[1], dtype='int64')

        limit = layers.fill_constant_batch_size_like(
            input=label, dtype='int64', shape=[1], value=5.0)
        cond = layers.less_than(x=label, y=limit)
        ie = layers.IfElse(cond)

        with ie.true_block():
            true_image = ie.input(image)
            hidden = layers.fc(input=true_image, size=100, act='tanh')
            prob = layers.fc(input=hidden, size=10, act='softmax')
            ie.output(prob)

        with ie.false_block():
            false_image = ie.input(image)
            hidden = layers.fc(input=false_image, size=200, act='tanh')
            prob = layers.fc(input=hidden, size=10, act='softmax')
            ie.output(prob)

        prob = ie()
        loss = layers.cross_entropy(input=prob[0], label=label)
        avg_loss = layers.mean(loss)

        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
        optimizer.minimize(avg_loss, startup_prog)

    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=8192),
        batch_size=200)

    place = core.CPUPlace()
    exe = Executor(place)

    # The original indexed an undefined ``kwargs`` dict here (NameError);
    # the programs built above are the ones that must be run.
    exe.run(startup_prog)

    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
        for data in train_reader():
            # List comprehensions instead of map(): on Python 3 map() returns
            # an iterator, which np.array would wrap as a 0-d object array.
            x_data = np.array([sample[0] for sample in data]).astype("float32")
            y_data = np.array([sample[1] for sample in data]).astype("int64")
            y_data = y_data.reshape((y_data.shape[0], 1))

            outs = exe.run(prog,
                           feed={'x': x_data, 'y': y_data},
                           fetch_list=[avg_loss])
            print(outs[0])
            if outs[0] < 1.0:
                return
    self.fail("average loss never dropped below 1.0")
def body_func(step_idx, pre_ids, pre_scores, gather_idx, caches,
              trg_src_attn_bias):
    """Run one beam-search decoding step.

    Relies on names from the enclosing scope (``wrap_decoder``,
    ``enc_output``, ``beam_size``, ``eos_idx``, ``bos_idx``, the ``ids`` and
    ``scores`` arrays and the model hyper-parameters).

    Returns:
        A tuple ``(step_idx, selected_ids, selected_scores, gather_idx,
        pre_caches, pre_src_attn_bias)`` holding the incremented step index,
        the beam-search selections and the gathered caches/attention bias.
    """
    # gather cell states corresponding to selected parent
    pre_caches = map_structure(
        lambda x: layers.gather(x, index=gather_idx), caches)
    pre_src_attn_bias = layers.gather(trg_src_attn_bias, index=gather_idx)
    # Position input: a [-1, 1] tensor of ones multiplied by the step index.
    pre_pos = layers.elementwise_mul(
        x=layers.fill_constant_batch_size_like(
            input=pre_src_attn_bias,  # can't use lod tensor here
            value=1,
            shape=[-1, 1],
            dtype=pre_ids.dtype),
        y=step_idx,
        axis=0)
    logits = wrap_decoder((pre_ids, pre_pos, None, pre_src_attn_bias),
                          trg_vocab_size,
                          max_in_len,
                          n_layer,
                          n_head,
                          d_key,
                          d_value,
                          d_model,
                          d_inner_hid,
                          prepostprocess_dropout,
                          attention_dropout,
                          relu_dropout,
                          preprocess_cmd,
                          postprocess_cmd,
                          weight_sharing,
                          enc_output=enc_output,
                          caches=pre_caches,
                          bos_idx=bos_idx)
    # intra-beam topK
    topk_scores, topk_indices = layers.topk(
        input=layers.softmax(logits), k=beam_size)
    # Accumulate log-probabilities onto the scores of the previous step.
    accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                         y=pre_scores,
                                         axis=0)
    # beam_search op uses lod to differentiate branches.
    accu_scores = layers.lod_reset(accu_scores, pre_ids)
    # topK reduction across beams, also contains special handling of
    # ended beams and ended sentences (batch reduction)
    selected_ids, selected_scores, gather_idx = layers.beam_search(
        pre_ids=pre_ids,
        pre_scores=pre_scores,
        ids=topk_indices,
        scores=accu_scores,
        beam_size=beam_size,
        end_id=eos_idx,
        return_parent_idx=True)
    # The step index is advanced first, so the selections below are written
    # at the new index.
    step_idx = layers.increment(x=step_idx, value=1.0, in_place=False)
    layers.array_write(selected_ids, i=step_idx, array=ids)
    layers.array_write(selected_scores, i=step_idx, array=scores)
    return (step_idx, selected_ids, selected_scores, gather_idx, pre_caches,
            pre_src_attn_bias)
def gen_batch_like(value, dtype="int64", shape=[-1, 1, 1], is_scalar=True):
    """Create a tensor batch-sized like ``parent_idx`` (from the outer scope).

    With ``is_scalar`` true the tensor is filled with ``value`` directly;
    otherwise a tensor of ones is multiplied by ``value`` along axis 0.
    """
    if not is_scalar:
        ones = layers.fill_constant_batch_size_like(
            input=parent_idx, value=1, shape=shape, dtype=dtype)
        return layers.elementwise_mul(x=ones, y=value, axis=0)

    return layers.fill_constant_batch_size_like(
        input=parent_idx, value=value, shape=shape, dtype=dtype)
def erniesage_v3_aggregator(gw, feature, hidden_size, act, initializer,
                            learning_rate, name):
    """Build the concatenated term-id sequence of a node and its neighbours.

    The result is a constant id 1, then the node's feature without its last
    step, then the neighbour feature received via ``ernie_recv``, all
    concatenated on axis 1 with gradients stopped.

    NOTE(review): ``hidden_size``, ``act``, ``initializer``,
    ``learning_rate`` and ``name`` are not used in this body.
    """
    msg = gw.send(copy_send, nfeat_list=[("h", feature)])
    received = gw.recv(msg, ernie_recv)
    neigh_ids = L.cast(L.unsqueeze(received, [-1]), "int64")

    feature = L.unsqueeze(feature, [-1])
    cls = L.fill_constant_batch_size_like(feature, [-1, 1, 1], "int64", 1)

    term_ids = L.concat([cls, feature[:, :-1], neigh_ids], 1)
    term_ids.stop_gradient = True
    return term_ids
def test_ifelse(self):
    """Train an ``IfElse``-routed MNIST classifier until the loss is < 1.0.

    Samples are routed by ``label < 5`` into one of two small fully
    connected networks. The test returns as soon as a fetched average loss
    drops below 1.0 and fails if that never happens within ``PASS_NUM``
    passes.
    """
    prog = Program()
    startup_prog = Program()
    with program_guard(prog, startup_prog):
        image = layers.data(name='x', shape=[784], dtype='float32')
        label = layers.data(name='y', shape=[1], dtype='int64')

        limit = layers.fill_constant_batch_size_like(
            input=label, dtype='int64', shape=[1], value=5.0)
        cond = layers.less_than(x=label, y=limit)
        ie = layers.IfElse(cond)

        with ie.true_block():
            true_image = ie.input(image)
            hidden = layers.fc(input=true_image, size=100, act='tanh')
            prob = layers.fc(input=hidden, size=10, act='softmax')
            ie.output(prob)

        with ie.false_block():
            false_image = ie.input(image)
            hidden = layers.fc(input=false_image, size=200, act='tanh')
            prob = layers.fc(input=hidden, size=10, act='softmax')
            ie.output(prob)

        prob = ie()
        loss = layers.cross_entropy(input=prob[0], label=label)
        avg_loss = layers.mean(loss)

        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
        optimizer.minimize(avg_loss, startup_prog)

    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=8192),
        batch_size=200)

    place = core.CPUPlace()
    exe = Executor(place)

    # The original indexed an undefined ``kwargs`` dict here (NameError);
    # the programs built above are the ones that must be run.
    exe.run(startup_prog)

    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
        for data in train_reader():
            # List comprehensions instead of map(): on Python 3 map() returns
            # an iterator, which np.array would wrap as a 0-d object array.
            x_data = np.array([sample[0] for sample in data]).astype("float32")
            y_data = np.array([sample[1] for sample in data]).astype("int64")
            y_data = y_data.reshape((y_data.shape[0], 1))

            outs = exe.run(prog,
                           feed={'x': x_data, 'y': y_data},
                           fetch_list=[avg_loss])
            print(outs[0])
            if outs[0] < 1.0:
                return
    self.fail("average loss never dropped below 1.0")
def fluid_sequence_delay(input, OOV):
    """Shift each sequence left by one step, filling the tail with ``OOV``.

    Args:
        input: 1-level LoDTensor.
        OOV: value placed in the appended step.

    Returns:
        The slice of the tail-padded sequences starting at offset 1 with the
        original sequence lengths.
    """
    seq_len = fluid_sequence_get_seq_len(input)
    start = layers.fill_constant_batch_size_like(
        seq_len, shape=[-1, 1], value=0, dtype='int64')
    one_step = layers.fill_constant_batch_size_like(
        seq_len, shape=[-1, 1], value=1, dtype='int64')

    # One-step tensor per sequence whose values are all OOV.
    oov_step = layers.sequence_slice(input, start, one_step) * 0 + OOV
    oov_step.stop_gradient = True

    tail_padded = layers.sequence_concat([input, oov_step])
    return layers.sequence_slice(tail_padded, one_step, seq_len)
def __call__(self, src, src_length, trg=None, trg_length=None):
    """Encode ``src`` and decode with the configured decoding strategy.

    Returns:
        The raw decoder output for ``"beam_search"``; otherwise a tuple
        ``(probs, samples, sample_length)`` where ``probs`` is the softmax
        of the decoder's cell outputs.
    """
    # encoder
    encoder_output, encoder_final_state = self.encoder(
        self.src_embeder(src), src_length)

    cell_states = self.decoder.decoder_cell.get_initial_states(
        batch_ref=encoder_output, shape=[encoder_output.shape[-1]])
    decoder_initial_states = [encoder_final_state, cell_states]

    # Mask value - 1 scaled by 1e9, with an extra axis inserted at index 1.
    src_mask = layers.sequence_mask(
        src_length, maxlen=layers.shape(src)[1], dtype="float32")
    encoder_padding_mask = layers.unsqueeze((src_mask - 1.0) * 1e9, [1])

    # decoder
    strategy = self.decoder.decoding_strategy
    if strategy == "train_greedy":
        decoder_kwargs = {
            "inputs": self.trg_embeder(trg),
            "sequence_length": trg_length,
        }
    elif strategy == "beam_search":
        decoder_kwargs = {
            "embedding_fn": self.trg_embeder,
            "beam_size": self.beam_size,
            "start_token": self.start_token,
            "end_token": self.end_token
        }
    else:
        decoder_kwargs = {
            "embedding_fn": self.trg_embeder,
            "start_tokens": layers.fill_constant_batch_size_like(
                input=encoder_output,
                shape=[-1],
                dtype=src.dtype,
                value=self.start_token),
            "end_token": self.end_token
        }
    decoder_kwargs["output_layer"] = self.output_layer

    decoder_output, _, dec_seq_lengths = self.decoder(
        decoder_initial_states, encoder_output, encoder_padding_mask,
        **decoder_kwargs)

    if self.decoder.decoding_strategy == "beam_search":  # for inference
        return decoder_output

    probs = layers.softmax(decoder_output.cell_outputs)
    return probs, decoder_output.sample_ids, dec_seq_lengths
def spatio_conv_layer(self, x, Ks, c_in, c_out, name):
    """Spatio convolution layer.

    The channel dimension is first brought to ``c_out`` (1x1 convolution
    when shrinking, zero padding when growing). Then ``Ks`` rounds of
    message passing, a bias-free fully connected layer and a biased ReLU
    are applied.
    """
    _, T, n, _ = x.shape

    if c_in > c_out:
        x_input = fl.conv2d(
            input=x,
            num_filters=c_out,
            filter_size=[1, 1],
            stride=[1, 1],
            padding="SAME",
            data_format="NHWC",
            param_attr=fluid.ParamAttr(name="%s_conv2d_1" % name))
    elif c_in < c_out:
        # Fewer input channels than output channels: pad x with zeros on
        # axis 3 up to c_out channels.
        zero_channels = fl.fill_constant_batch_size_like(
            input=x,
            shape=[-1, T, n, c_out - c_in],
            dtype="float32",
            value=0.0)
        x_input = fl.concat([x, zero_channels], axis=3)
    else:
        x_input = x

    for i in range(Ks):
        # x_input shape: [B, T, num_nodes, c_out]
        flat = fl.reshape(x_input, [-1, c_out])

        flat = self.message_passing(
            self.gw,
            flat,
            name="%s_mp_%d" % (name, i),
            norm=self.gw.node_feat["norm"])

        flat = fl.fc(
            flat,
            size=c_out,
            bias_attr=False,
            param_attr=fluid.ParamAttr(name="%s_gcn_fc_%d" % (name, i)))

        bias = fluid.layers.create_parameter(
            shape=[c_out],
            dtype='float32',
            is_bias=True,
            name='%s_gcn_bias_%d' % (name, i))
        flat = fluid.layers.elementwise_add(flat, bias, act="relu")

        x_input = fl.reshape(flat, [-1, T, n, c_out])

    return x_input
def gen_cache(self, key, value=None, type=Cache):
    """Generate a cache for ``forward`` usage in inference.

    Returns:
        ``self.StaticCache`` built from ``self.compute_kv(key, value)`` when
        ``type`` is ``MultiHeadAttention.StaticCache``; otherwise a
        ``self.Cache`` wrapping either ``(key, value)`` as given, or two
        empty tensors when ``value`` is ``None``.
    """
    if type == MultiHeadAttention.StaticCache:  # static_kv
        k, v = self.compute_kv(key, value)
        return self.StaticCache(k, v)

    if value is not None:
        # incremental_state with initial value, mainly for usage like UniLM
        return self.Cache(key, value)

    # incremental_state: start from tensors with a zero-length third dimension
    empty_shape = [-1, self.num_heads, 0, self.head_dim]
    k = layers.fill_constant_batch_size_like(
        input=key, shape=empty_shape, dtype=key.dtype, value=0)
    v = layers.fill_constant_batch_size_like(
        input=key, shape=empty_shape, dtype=key.dtype, value=0)
    return self.Cache(k, v)
def gru_step(self, input, hidden, mask=None):
    """Run one time step through the stacked GRU layers.

    Args:
        input: step input fed to the first layer.
        hidden: stacked hidden states; sliced along axis 0 into one state
            per layer, each reshaped to ``[-1, hidden_size]``.
        mask: optional mask tensor. When given, each layer's new state is
            blended with its previous state as
            ``new * mask + old * (1 - mask)``.

    Returns:
        A tuple ``(output, last_hidden)``: the last layer's output (after
        dropout when enabled) and the new hidden states reshaped to
        ``[num_layers, -1, hidden_size]``.
    """
    hidden_array = []
    for i in range(self.num_layers):
        layer_state = layers.slice(hidden, axes=[0], starts=[i], ends=[i + 1])
        layer_state = layers.reshape(layer_state, shape=[-1, self.hidden_size])
        hidden_array.append(layer_state)

    last_hidden_array = []
    for k in range(self.num_layers):
        prev_hidden = hidden_array[k]

        trans_input = layers.matmul(input, self.weight_input_array[k])
        trans_input += self.bias_input_array[k]
        trans_hidden = layers.matmul(prev_hidden, self.weight_hidden_array[k])
        trans_hidden += self.bias_hidden_array[k]

        input_array = layers.split(trans_input, num_or_sections=3, dim=-1)
        trans_array = layers.split(trans_hidden, num_or_sections=3, dim=-1)

        reset_gate = layers.sigmoid(input_array[0] + trans_array[0])
        input_gate = layers.sigmoid(input_array[1] + trans_array[1])
        new_gate = layers.tanh(input_array[2] + reset_gate * trans_array[2])

        new_hidden = new_gate + input_gate * (prev_hidden - new_gate)

        # Compare against None explicitly: truth-testing a tensor is
        # ambiguous and can raise for multi-element tensors.
        if mask is not None:
            neg_mask = layers.fill_constant_batch_size_like(
                input=mask, shape=[1], value=1.0, dtype='float32') - mask
            new_hidden = new_hidden * mask + prev_hidden * neg_mask

        last_hidden_array.append(new_hidden)
        input = new_hidden

        # Dropout affects only what is fed to the next layer, not the state
        # stored in last_hidden_array.
        if self.dropout and self.dropout > 0.0:
            input = layers.dropout(input, dropout_prob=self.dropout)

    last_hidden = layers.concat(last_hidden_array, 0)
    last_hidden = layers.reshape(
        last_hidden, shape=[self.num_layers, -1, self.hidden_size])

    return input, last_hidden
def fluid_sequence_delay2(input, seq_len, OOV):
    """Shift each sequence left by one step, filling the tail with ``OOV``.

    Args:
        input: 1-level LoDTensor.
        seq_len: per-sequence lengths; also used to build the OOV step.
        OOV: value placed in the appended step.

    Returns:
        The slice of the tail-padded sequences starting at offset 1 with
        length ``seq_len`` (cast to int64).
    """
    oov_step = layers.cast(seq_len * 0 + OOV, input.dtype)
    oov_step.stop_gradient = True

    tail_padded = layers.sequence_concat([input, oov_step])
    skip_first = layers.fill_constant_batch_size_like(
        seq_len, shape=[-1, 1], value=1, dtype='int64')
    lengths = layers.cast(seq_len, 'int64')
    return layers.sequence_slice(tail_padded, skip_first, lengths)
def ernie_send(src_feat, dst_feat, edge_feat):
    """Encode each (source, destination) term-id pair with ``ErnieModel``.

    The source ids get a leading constant id of 1 and sentence id 0; the
    destination ids get sentence id 1. ``self`` is taken from the enclosing
    scope.

    Returns:
        The model's pooled output.
    """
    src_terms = src_feat["term_ids"]
    cls = L.fill_constant_batch_size_like(src_terms, [-1, 1, 1], "int64", 1)
    src_ids = L.concat([cls, src_terms], 1)
    dst_ids = dst_feat["term_ids"]

    sent_ids = L.concat([L.zeros_like(src_ids), L.ones_like(dst_ids)], 1)
    term_ids = L.concat([src_ids, dst_ids], 1)
    term_ids.stop_gradient = True
    sent_ids.stop_gradient = True

    ernie = ErnieModel(term_ids, sent_ids, config=self.config.ernie_config)
    return ernie.get_pooled_output()
def ernie_send_aggregate(self, gw, feature, act, name):
    """Aggregate ERNIE features of a node and of its (src, dst) edge pairs.

    Edge messages are produced by running ``ErnieModel`` on the concatenated
    source/destination term ids and are sum-pooled per node. The node's own
    term ids are encoded separately. Both sides go through ``linear``, are
    concatenated on axis 1 and L2-normalised.
    """

    def ernie_send(src_feat, dst_feat, edge_feat):
        """doc"""

        def build_position_ids(term_ids):
            # Running count of positions with id > 0, shifted to start at 0.
            nonzero = L.cast(term_ids > 0, "int64")
            return L.cumsum(nonzero, axis=1) - 1

        # input_ids
        src_terms = src_feat["term_ids"]
        cls = L.fill_constant_batch_size_like(
            src_terms, [-1, 1], "int64", self.config.cls_id)
        src_ids = L.concat([cls, src_terms], 1)
        dst_ids = dst_feat["term_ids"]

        # sent_ids
        sent_ids = L.concat([L.zeros_like(src_ids), L.ones_like(dst_ids)], 1)
        term_ids = L.concat([src_ids, dst_ids], 1)

        # position_ids
        position_ids = build_position_ids(term_ids)

        ernie_model = ErnieModel(self.config.ernie_config, "")
        pair_feature, _ = ernie_model(term_ids, sent_ids, position_ids)
        return pair_feature

    def _sum_pool(feat):
        return F.layers.sequence_pool(feat, pool_type="sum")

    term_ids = feature
    msg = gw.send(ernie_send, nfeat_list=[("term_ids", term_ids)])
    neigh_feature = gw.recv(msg, _sum_pool)

    cls = L.fill_constant_batch_size_like(
        term_ids, [-1, 1], "int64", self.config.cls_id)
    term_ids = L.concat([cls, term_ids], 1)

    ernie_model = ErnieModel(self.config.ernie_config, "")
    self_feature, _ = ernie_model(term_ids)

    hidden_size = self.config.hidden_size
    self_feature = linear(self_feature, hidden_size, name + "_l", act)
    neigh_feature = linear(neigh_feature, hidden_size, name + "_r", act)

    merged = L.concat([self_feature, neigh_feature], axis=1)
    return L.l2_normalize(merged, axis=1)
def topp_sampling(self, probs):
    """Sample one id per row from the candidates selected by ``self.topp``.

    Probabilities are sorted in descending order. Entries whose exclusive
    cumulative sum is below ``self.topp`` are kept, renormalised and sampled
    from; the sampled positions are mapped back through the sort indices.

    Returns:
        A tuple ``(probs, sampling_ids)`` where ``probs`` is the unmodified
        input.
    """
    sorted_probs, sorted_idx = layers.argsort(probs, descending=True)
    cum_sorted_probs = layers.cumsum(sorted_probs, axis=1, exclusive=True)

    threshold = layers.fill_constant_batch_size_like(
        cum_sorted_probs,
        cum_sorted_probs.shape,
        cum_sorted_probs.dtype,
        self.topp)
    keep = paddle.cast(
        paddle.less_than(cum_sorted_probs, threshold), "float32")

    candidate_probs = sorted_probs * keep
    renormalised = candidate_probs / paddle.sum(
        candidate_probs, axis=-1, keep_dim=True)

    picked = layers.sampling_id(renormalised, dtype="int")
    # Translate positions in the sorted order back to the original ids.
    picked = paddle.index_sample(sorted_idx, paddle.unsqueeze(picked, [1]))
    picked = paddle.squeeze(picked, [1])
    return probs, picked
def test_raw_api(self):
    """Train a label-routed MNIST classifier built from the raw split/merge ops.

    Samples are split by ``label < 5`` with ``split_lod_tensor``, processed
    by one of two small networks inside ``ConditionalBlock``s and merged
    back with ``merge_lod_tensor``. The test returns as soon as a fetched
    average loss drops below 1.0 and fails if that never happens within
    ``PASS_NUM`` passes.
    """
    prog = Program()
    startup_prog = Program()
    with program_guard(prog, startup_prog):
        image = layers.data(name='x', shape=[784], dtype='float32')
        label = layers.data(name='y', shape=[1], dtype='int64')

        limit = layers.fill_constant_batch_size_like(
            input=label, dtype='int64', shape=[1], value=5.0)
        cond = layers.less_than(x=label, y=limit)
        true_image, false_image = layers.split_lod_tensor(
            input=image, mask=cond)

        true_out = layers.create_tensor(dtype='float32')
        true_cond = layers.ConditionalBlock([true_image])

        with true_cond.block():
            hidden = layers.fc(input=true_image, size=100, act='tanh')
            prob = layers.fc(input=hidden, size=10, act='softmax')
            layers.assign(input=prob, output=true_out)

        false_out = layers.create_tensor(dtype='float32')
        false_cond = layers.ConditionalBlock([false_image])

        with false_cond.block():
            hidden = layers.fc(input=false_image, size=200, act='tanh')
            prob = layers.fc(input=hidden, size=10, act='softmax')
            layers.assign(input=prob, output=false_out)

        prob = layers.merge_lod_tensor(
            in_true=true_out, in_false=false_out, mask=cond, x=image)
        loss = layers.cross_entropy(input=prob, label=label)
        avg_loss = layers.mean(loss)

        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
        optimizer.minimize(avg_loss, startup_prog)

    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=8192),
        batch_size=200)

    place = core.CPUPlace()
    exe = Executor(place)

    exe.run(startup_prog)

    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
        for data in train_reader():
            # List comprehensions instead of map(): on Python 3 map() returns
            # an iterator, which np.array would wrap as a 0-d object array.
            x_data = np.array([sample[0] for sample in data]).astype("float32")
            y_data = np.array([sample[1] for sample in data]).astype("int64")
            y_data = np.expand_dims(y_data, axis=1)

            outs = exe.run(prog,
                           feed={'x': x_data, 'y': y_data},
                           fetch_list=[avg_loss])
            print(outs[0])
            if outs[0] < 1.0:
                return
    self.fail("average loss never dropped below 1.0")