def prepare_encoder(src_word,
                    src_pos,
                    src_vocab_size,
                    src_emb_dim,
                    src_pad_idx,
                    src_max_len,
                    dropout=0.,
                    pos_pad_idx=0,
                    pos_enc_param_name=None):
    """Add word embeddings and position encodings.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].
    This module is used at the bottom of the encoder stacks.
    """
    src_word_emb = layers.embedding(
        src_word,
        size=[src_vocab_size, src_emb_dim],
        padding_idx=src_pad_idx,
        param_attr=fluid.initializer.Normal(0., 1.))
    src_pos_enc = layers.embedding(
        src_pos,
        size=[src_max_len, src_emb_dim],
        padding_idx=pos_pad_idx,
        param_attr=fluid.ParamAttr(
            name=pos_enc_param_name, trainable=False))
    enc_input = src_word_emb + src_pos_enc
    # FIXME(guosheng): Decouple the program desc with batch_size.
    enc_input = layers.reshape(
        x=enc_input, shape=[batch_size, -1, src_emb_dim])
    return layers.dropout(
        enc_input, dropout_prob=dropout,
        is_test=False) if dropout else enc_input
def __combine_heads(x):
    """
    Transpose and then reshape the last two dimensions of input tensor x
    so that it becomes one dimension, which is reverse to __split_heads.
    """
    if len(x.shape) == 3: return x
    if len(x.shape) != 4:
        raise ValueError("Input(x) should be a 4-D Tensor.")

    trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
    # FIXME(guosheng): Decouple the program desc with batch_size.
    return layers.reshape(
        x=trans_x,
        shape=list(
            map(int, [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]])))
def __split_heads(x, n_head):
    """
    Reshape the last dimension of input tensor x so that it becomes two
    dimensions and then transpose. Specifically, input a tensor with shape
    [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
    with shape [bs, n_head, max_sequence_length, hidden_dim].
    """
    if n_head == 1: return x

    hidden_size = x.shape[-1]
    # FIXME(guosheng): Decouple the program desc with batch_size.
    reshaped = layers.reshape(
        x=x, shape=[batch_size, -1, n_head, hidden_size // n_head])

    # permute the dimensions into:
    # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
    return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
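# The split/combine pair above is just a reshape/transpose round trip. A
# minimal NumPy sketch of the same shape manipulation (sizes invented for
# illustration, no Paddle dependency assumed):
import numpy as np

batch_size, seq_len, n_head, d_head = 2, 5, 4, 8
x = np.random.rand(batch_size, seq_len, n_head * d_head).astype("float32")

# __split_heads: [bs, seq, n_head * d_head] -> [bs, n_head, seq, d_head]
split = x.reshape(batch_size, seq_len, n_head, d_head).transpose(0, 2, 1, 3)

# __combine_heads: the exact inverse, transpose back then merge the last two dims
combined = split.transpose(0, 2, 1, 3).reshape(batch_size, seq_len,
                                               n_head * d_head)
assert np.allclose(x, combined)  # the round trip recovers the original tensor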
def encoder(x, y, vocab_size, emb_size, init_hidden=None, init_cell=None, para_name='', custom_samples=None, custom_probabilities=None, test_mode=False, args=None): x_emb = layers.embedding( input=x, size=[vocab_size, emb_size], dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr(name='embedding_para')) rnn_input = x_emb rnn_outs = [] rnn_outs_ori = [] cells = [] projs = [] for i in range(args.num_layers): rnn_input = dropout(rnn_input, test_mode, args) if init_hidden and init_cell: h0 = layers.squeeze( layers.slice( init_hidden, axes=[0], starts=[i], ends=[i + 1]), axes=[0]) c0 = layers.squeeze( layers.slice( init_cell, axes=[0], starts=[i], ends=[i + 1]), axes=[0]) else: h0 = c0 = None rnn_out, cell, input_proj = lstmp_encoder( rnn_input, args.hidden_size, h0, c0, para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args) rnn_out_ori = rnn_out if i > 0: rnn_out = rnn_out + rnn_input rnn_out = dropout(rnn_out, test_mode, args) cell = dropout(cell, test_mode, args) rnn_outs.append(rnn_out) rnn_outs_ori.append(rnn_out_ori) rnn_input = rnn_out cells.append(cell) projs.append(input_proj) softmax_weight = layers.create_parameter( [vocab_size, emb_size], dtype="float32", name="softmax_weight") softmax_bias = layers.create_parameter( [vocab_size], dtype="float32", name='softmax_bias') projection = layers.matmul(rnn_outs[-1], softmax_weight, transpose_y=True) projection = layers.elementwise_add(projection, softmax_bias) projection = layers.reshape(projection, shape=[-1, vocab_size]) if args.sample_softmax and (not test_mode): loss = layers.sampled_softmax_with_cross_entropy( logits=projection, label=y, num_samples=args.n_negative_samples_batch, seed=args.random_seed) if args.debug: layers.Print(loss, message='out_loss', summarize=100) else: label = layers.one_hot(input=y, depth=vocab_size) loss = layers.softmax_with_cross_entropy( logits=projection, label=label, soft_label=True) return [x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs
def wrap_decoder(trg_vocab_size, max_length, n_layer, n_head, d_key, d_value,
                 d_model, d_inner_hid, dropout_rate, dec_inputs=None,
                 enc_output=None):
    """
    The wrapper assembles together all needed layers for the decoder.
    """
    if dec_inputs is None:
        # This is used to implement independent decoder program in inference.
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
            enc_output, trg_data_shape, slf_attn_pre_softmax_shape, \
            slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \
            src_attn_post_softmax_shape = make_all_inputs(
                decoder_data_input_fields + decoder_util_input_fields)
    else:
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
            trg_data_shape, slf_attn_pre_softmax_shape, \
            slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \
            src_attn_post_softmax_shape = dec_inputs

    dec_input = prepare_decoder(trg_word, trg_pos, trg_vocab_size, d_model,
                                max_length, dropout_rate, trg_data_shape)
    dec_output = decoder(dec_input, enc_output, trg_slf_attn_bias,
                         trg_src_attn_bias, n_layer, n_head, d_key, d_value,
                         d_model, d_inner_hid, dropout_rate,
                         slf_attn_pre_softmax_shape,
                         slf_attn_post_softmax_shape,
                         src_attn_pre_softmax_shape,
                         src_attn_post_softmax_shape)
    # Return logits for training and probs for inference.
    predict = layers.reshape(
        x=layers.fc(input=dec_output,
                    size=trg_vocab_size,
                    bias_attr=False,
                    num_flatten_dims=2),
        shape=[-1, trg_vocab_size],
        act="softmax" if dec_inputs is None else None)
    return predict
def beam_search(): max_len = layers.fill_constant(shape=[1], dtype=start_tokens.dtype, value=max_out_len, force_cpu=True) step_idx = layers.fill_constant(shape=[1], dtype=start_tokens.dtype, value=0, force_cpu=True) cond = layers.less_than(x=step_idx, y=max_len) # default force_cpu=True while_op = layers.While(cond) # array states will be stored for each step. ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)), step_idx) scores = layers.array_write(init_scores, step_idx) # cell states will be overwrited at each step. # caches contains states of history steps in decoder self-attention # and static encoder output projections in encoder-decoder attention # to reduce redundant computation. caches = [ { "k": # for self attention layers.fill_constant_batch_size_like( input=start_tokens, shape=[-1, n_head, 0, d_key], dtype=enc_output.dtype, value=0), "v": # for self attention layers.fill_constant_batch_size_like( input=start_tokens, shape=[-1, n_head, 0, d_value], dtype=enc_output.dtype, value=0), "static_k": # for encoder-decoder attention layers.create_tensor(dtype=enc_output.dtype), "static_v": # for encoder-decoder attention layers.create_tensor(dtype=enc_output.dtype) } for i in range(n_layer) ] with while_op.block(): pre_ids = layers.array_read(array=ids, i=step_idx) # Since beam_search_op dosen't enforce pre_ids' shape, we can do # inplace reshape here which actually change the shape of pre_ids. pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) pre_scores = layers.array_read(array=scores, i=step_idx) # gather cell states corresponding to selected parent pre_src_attn_bias = layers.gather(trg_src_attn_bias, index=parent_idx) pre_pos = layers.elementwise_mul( x=layers.fill_constant_batch_size_like( input=pre_src_attn_bias, # cann't use lod tensor here value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype), y=step_idx, axis=0) logits = wrap_decoder(trg_vocab_size, max_in_len, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing, dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias), enc_output=enc_output, caches=caches, gather_idx=parent_idx) # intra-beam topK topk_scores, topk_indices = layers.topk( input=layers.softmax(logits), k=beam_size) accu_scores = layers.elementwise_add(x=layers.log(topk_scores), y=pre_scores, axis=0) # beam_search op uses lod to differentiate branches. topk_indices = layers.lod_reset(accu_scores, pre_ids) # topK reduction across beams, also contain special handle of # end beams and end sentences(batch reduction) selected_ids, selected_scores, gather_idx = layers.beam_search( pre_ids=pre_ids, pre_scores=pre_scores, ids=topk_indices, scores=accu_scores, beam_size=beam_size, end_id=eos_idx, return_parent_idx=True) layers.increment(x=step_idx, value=1.0, in_place=True) # cell states(caches) have been updated in wrap_decoder, # only need to update beam search states here. layers.array_write(selected_ids, i=step_idx, array=ids) layers.array_write(selected_scores, i=step_idx, array=scores) layers.assign(gather_idx, parent_idx) layers.assign(pre_src_attn_bias, trg_src_attn_bias) length_cond = layers.less_than(x=step_idx, y=max_len) finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) layers.logical_and(x=length_cond, y=finish_cond, out=cond) finished_ids, finished_scores = layers.beam_search_decode( ids, scores, beam_size=beam_size, end_id=eos_idx) return finished_ids, finished_scores
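# The scoring inside the decode loop above is independent of the Paddle ops.
# A rough NumPy sketch of one expansion step for a single batch element
# (beam_size and vocab are invented; the LoD bookkeeping and end-of-sentence
# handling done by layers.beam_search are omitted):
import numpy as np

beam_size, vocab = 3, 10
pre_scores = np.log(np.array([0.5, 0.3, 0.2]))   # accumulated log-probs, one per beam
probs = np.random.dirichlet(np.ones(vocab), size=beam_size)  # decoder softmax per beam

# intra-beam top-k: keep the beam_size best continuations of every beam
topk_idx = np.argsort(-probs, axis=1)[:, :beam_size]
topk_scores = np.take_along_axis(probs, topk_idx, axis=1)
accu_scores = np.log(topk_scores) + pre_scores[:, None]  # log(topk) + pre_scores

# cross-beam reduction: flatten the beam_size * beam_size candidates and keep
# the best beam_size of them
flat = accu_scores.reshape(-1)
best = np.argsort(-flat)[:beam_size]
parent_idx = best // beam_size                          # beam each survivor extends
selected_ids = topk_idx[parent_idx, best % beam_size]   # token ids of the survivors
selected_scores = flat[best]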
def mask_prob(p, onehot_eos, finished):
    is_finished = L.cast(L.reshape(finished, [-1, 1]) != 0, 'float32')
    p = is_finished * (1. - L.cast(onehot_eos, 'float32')) * -9999. + (
        1. - is_finished) * p
    return p
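# What mask_prob does is easier to see on concrete numbers. A small NumPy
# sketch with a vocabulary of 4, EOS at index 3, and invented scores:
import numpy as np

vocab, eos_id = 4, 3
onehot_eos = np.eye(vocab, dtype="float32")[eos_id]       # [0, 0, 0, 1]
p = np.array([[0.1, 0.2, 0.3, 0.4],                       # an unfinished beam
              [0.1, 0.2, 0.3, 0.4]], dtype="float32")     # a finished beam
finished = np.array([0, 1], dtype="float32").reshape(-1, 1)

masked = finished * (1. - onehot_eos) * -9999. + (1. - finished) * p
# row 0 (unfinished) is untouched, row 1 (finished) keeps only EOS viable:
# [[   0.1,    0.2,    0.3,  0.4],
#  [-9999., -9999., -9999.,  0. ]]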
def get_single_direction_output(rnn_input, unit_list, mask=None, direc_index=0): rnn = StaticRNN() with rnn.step(): step_input = rnn.step_input(rnn_input) if mask: step_mask = rnn.step_input(mask) for i in range(num_layers): if init_hidden: pre_hidden = rnn.memory(init=init_hidden[i, direc_index]) pre_cell = rnn.memory(init=init_cell[i, direc_index]) else: pre_hidden = rnn.memory( batch_ref=rnn_input, shape=[-1, hidden_size]) pre_cell = rnn.memory( batch_ref=rnn_input, shape=[-1, hidden_size]) new_hidden, new_cell = unit_list[i](step_input, pre_hidden, pre_cell) if mask: new_hidden = layers.elementwise_mul( new_hidden, step_mask, axis=0) - layers.elementwise_mul( pre_hidden, (step_mask - 1), axis=0) new_cell = layers.elementwise_mul( new_cell, step_mask, axis=0) - layers.elementwise_mul( pre_cell, (step_mask - 1), axis=0) rnn.update_memory(pre_hidden, new_hidden) rnn.update_memory(pre_cell, new_cell) rnn.step_output(new_hidden) rnn.step_output(new_cell) step_input = new_hidden if dropout_prob != None and dropout_prob > 0.0: step_input = layers.dropout( step_input, dropout_prob=dropout_prob, dropout_implementation='upscale_in_train') rnn.step_output(step_input) rnn_out = rnn() last_hidden_array = [] last_cell_array = [] rnn_output = rnn_out[-1] for i in range(num_layers): last_hidden = rnn_out[i * 2] last_hidden = last_hidden[-1] last_hidden_array.append(last_hidden) last_cell = rnn_out[i * 2 + 1] last_cell = last_cell[-1] last_cell_array.append(last_cell) last_hidden_output = layers.concat(last_hidden_array, axis=0) last_hidden_output = layers.reshape( last_hidden_output, shape=[num_layers, -1, hidden_size]) last_cell_output = layers.concat(last_cell_array, axis=0) last_cell_output = layers.reshape( last_cell_output, shape=[num_layers, -1, hidden_size]) return rnn_output, last_hidden_output, last_cell_output
def beam_search(enc_output, enc_bias, source_length): """ beam_search """ max_len = layers.fill_constant(shape=[1], dtype='int64', value=max_out_len) step_idx = layers.fill_constant(shape=[1], dtype='int64', value=0) cond = layers.less_than(x=step_idx, y=max_len) while_op = layers.While(cond) caches_batch_size = batch_size * beam_size init_score = np.zeros([1, beam_size]).astype('float32') init_score[:, 1:] = -INF initial_log_probs = layers.assign(init_score) alive_log_probs = layers.expand(initial_log_probs, [batch_size, 1]) # alive seq [batch_size, beam_size, 1] initial_ids = layers.zeros([batch_size, 1, 1], 'float32') alive_seq = layers.expand(initial_ids, [1, beam_size, 1]) alive_seq = layers.cast(alive_seq, 'int64') enc_output = layers.unsqueeze(enc_output, axes=[1]) enc_output = layers.expand(enc_output, [1, beam_size, 1, 1]) enc_output = layers.reshape(enc_output, [caches_batch_size, -1, d_model]) tgt_src_attn_bias = layers.unsqueeze(enc_bias, axes=[1]) tgt_src_attn_bias = layers.expand(tgt_src_attn_bias, [1, beam_size, n_head, 1, 1]) enc_bias_shape = layers.shape(tgt_src_attn_bias) tgt_src_attn_bias = layers.reshape( tgt_src_attn_bias, [-1, enc_bias_shape[2], enc_bias_shape[3], enc_bias_shape[4]]) beam_search = BeamSearch(beam_size, batch_size, decode_alpha, trg_vocab_size, d_model) caches = [{ "k": layers.fill_constant(shape=[caches_batch_size, 0, d_model], dtype=enc_output.dtype, value=0), "v": layers.fill_constant(shape=[caches_batch_size, 0, d_model], dtype=enc_output.dtype, value=0) } for i in range(n_layer)] finished_seq = layers.zeros_like(alive_seq) finished_scores = layers.fill_constant([batch_size, beam_size], dtype='float32', value=-INF) finished_flags = layers.fill_constant([batch_size, beam_size], dtype='float32', value=0) with while_op.block(): pos = layers.fill_constant([caches_batch_size, 1, 1], dtype='int64', value=1) pos = layers.elementwise_mul(pos, step_idx, axis=0) alive_seq_1 = layers.reshape(alive_seq, [caches_batch_size, -1]) alive_seq_2 = alive_seq_1[:, -1:] alive_seq_2 = layers.unsqueeze(alive_seq_2, axes=[1]) logits = wrap_decoder(trg_vocab_size, max_in_len, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing, embedding_sharing, dec_inputs=(alive_seq_2, alive_seq_2, pos, None, tgt_src_attn_bias), enc_output=enc_output, caches=caches, is_train=False, params_type=params_type) alive_seq_2, alive_log_probs_2, finished_seq_2, finished_scores_2, finished_flags_2, caches_2 = \ beam_search.inner_func(step_idx, logits, alive_seq_1, alive_log_probs, finished_seq, finished_scores, finished_flags, caches, enc_output, tgt_src_attn_bias) layers.increment(x=step_idx, value=1.0, in_place=True) finish_cond = beam_search.is_finished(step_idx, source_length, alive_log_probs_2, finished_scores_2, finished_flags_2) layers.assign(alive_seq_2, alive_seq) layers.assign(alive_log_probs_2, alive_log_probs) layers.assign(finished_seq_2, finished_seq) layers.assign(finished_scores_2, finished_scores) layers.assign(finished_flags_2, finished_flags) for i in xrange(len(caches_2)): layers.assign(caches_2[i]["k"], caches[i]["k"]) layers.assign(caches_2[i]["v"], caches[i]["v"]) layers.logical_and(x=cond, y=finish_cond, out=cond) finished_flags = layers.reduce_sum( finished_flags, dim=1, keep_dim=True) / beam_size finished_flags = layers.cast(finished_flags, 'bool') mask = layers.cast( layers.reduce_any(input=finished_flags, dim=1, keep_dim=True), 'float32') mask = 
layers.expand(mask, [1, beam_size]) mask2 = 1.0 - mask finished_seq = layers.cast(finished_seq, 'float32') alive_seq = layers.cast(alive_seq, 'float32') #print mask finished_seq = layers.elementwise_mul(finished_seq, mask, axis=0) + \ layers.elementwise_mul(alive_seq, mask2, axis = 0) finished_seq = layers.cast(finished_seq, 'int32') finished_scores = layers.elementwise_mul(finished_scores, mask, axis=0) + \ layers.elementwise_mul(alive_log_probs, mask2) finished_seq.persistable = True finished_scores.persistable = True return finished_seq, finished_scores
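# The final merge above just chooses, per batch element, between the finished
# hypotheses and the still-alive ones. A NumPy sketch of the same masking
# (shapes and values invented for illustration):
import numpy as np

batch_size, beam_size, seq_len = 2, 3, 4
finished_flags = np.array([[1., 0., 1.],   # sample 0: some beams finished
                           [0., 0., 0.]])  # sample 1: nothing finished yet
finished_seq = np.full((batch_size, beam_size, seq_len), 7.)  # stand-in finished ids
alive_seq = np.full((batch_size, beam_size, seq_len), 3.)     # stand-in alive ids

# per-sample flag: did any beam finish?  broadcast it over beams and time
any_finished = (finished_flags.sum(axis=1, keepdims=True) > 0).astype("float32")
mask = np.repeat(any_finished, beam_size, axis=1)[..., None]
mask2 = 1.0 - mask

result_seq = finished_seq * mask + alive_seq * mask2
# sample 0 returns its finished hypotheses, sample 1 falls back to the alive ones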
def graph_linformer(gw, feature, edge_feature, hidden_size, name, num_heads=4, attn_drop=False, concat=True, skip_feat=True, gate=False, norm=True, relu=True, k_hop=2, is_test=False): """Implementation of graph Transformer from UniMP This is an implementation of the paper Unified Massage Passing Model for Semi-Supervised Classification (https://arxiv.org/abs/2009.03509). Args: name: Granph Transformer layer names. gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`) feature: A tensor with shape (num_nodes, feature_size). hidden_size: The hidden size for graph transformer. num_heads: The head number in graph transformer. attn_drop: Dropout rate for attention. edge_feature: A tensor with shape (num_edges, feature_size). num_heads: 8 concat: Reshape the output (num_nodes, num_heads, hidden_size) by concat (num_nodes, hidden_size * num_heads) or mean (num_nodes, hidden_size) skip_feat: Whether use skip connect gate: Whether add skip_feat and output up with gate weight norm: Whether use layer_norm for output relu: Whether use relu activation for output is_test: Whether in test phrase. Return: A tensor with shape (num_nodes, hidden_size * num_heads) or (num_nodes, hidden_size) """ def send_attention(src_feat, dst_feat, edge_feat): if edge_feat is None or not edge_feat: k_h = L.elu( L.reshape(src_feat["k_h"], [-1, num_heads, hidden_size, 1])) + 1 v_h = dst_feat["v_h"] else: edge_feat = edge_feat["edge"] edge_feat = L.reshape(edge_feat, [-1, num_heads, hidden_size]) k_h = L.elu(src_feat["k_h"] + edge_feat) + 1 v_h = dst_feat["v_h"] + edge_feat k_h = L.reshape(k_h, [-1, num_heads, hidden_size, 1]) v_h = L.reshape(v_h, [-1, num_heads, hidden_size, 1]) sum_kTv = L.matmul(k_h, v_h, transpose_y=True) sum_k = L.reshape(k_h, [-1, num_heads * hidden_size]) sum_kTv = L.reshape(sum_kTv, [-1, num_heads * hidden_size * hidden_size]) return {"sum_k": sum_k, "sum_kTv": sum_kTv} def send_copy(src_feat, dst_feat, edge_feat): return src_feat def reduce_sum(msg): return L.sequence_pool(msg, "sum") q = L.elu( linear(feature, hidden_size * num_heads, name=name + '_q_weight', init_type='gcn')) + 1 k = linear(feature, hidden_size * num_heads, name=name + '_k_weight', init_type='gcn') v = linear(feature, hidden_size * num_heads, name=name + '_v_weight', init_type='gcn') reshape_q = L.reshape(q, [-1, num_heads, 1, hidden_size]) reshape_k = L.reshape(k, [-1, num_heads, hidden_size]) reshape_v = L.reshape(v, [-1, num_heads, hidden_size]) msg = gw.send(send_attention, nfeat_list=[("k_h", reshape_k), ("v_h", reshape_v)], efeat_list=[('edge', edge_feature)]) sum_k = gw.recv(msg["sum_k"], reduce_sum) sum_kTv = gw.recv(msg["sum_kTv"], reduce_sum) for i in range(1, k_hop): msg = gw.send(send_copy, nfeat_list=[("sum_k", sum_k), ("sum_kTv", sum_kTv)]) sum_k = gw.recv(msg["sum_k"], reduce_sum) sum_kTv = gw.recv(msg["sum_kTv"], reduce_sum) # sum_k: [-1, num_heads * hidden_size] # sum_kTv: [-1, num_heads * hidden_size * hidden_size] sum_k = L.reshape(sum_k, [-1, num_heads, 1, hidden_size]) sum_kTv = L.reshape(sum_kTv, [-1, num_heads, hidden_size, hidden_size]) out_feat = L.reshape(L.matmul(reshape_q, sum_kTv), [-1, num_heads, hidden_size]) / L.reduce_sum( reshape_q * sum_k, -1) if concat: out_feat = L.reshape(out_feat, [-1, num_heads * hidden_size]) else: out_feat = L.reduce_mean(out_feat, dim=1) if skip_feat: if concat: skip_feature = linear(feature, hidden_size * num_heads, name=name + '_skip_weight', init_type='lin') else: skip_feature = linear(feature, hidden_size, name=name + '_skip_weight', 
init_type='lin') if gate: temp_output = L.concat( [skip_feature, out_feat, out_feat - skip_feature], axis=-1) gate_f = L.sigmoid( linear(temp_output, 1, name=name + '_gate_weight', init_type='lin')) out_feat = skip_feature * gate_f + out_feat * (1 - gate_f) else: out_feat = skip_feature + out_feat if norm: out_feat = layer_norm(out_feat, name="ln_%s" % name) if relu: out_feat = L.relu(out_feat) return out_feat
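# graph_linformer's trick is to accumulate two sufficient statistics per node
# (sum of keys and sum of key-value outer products) and only then apply the
# query. A single-head, single-node NumPy sketch with the elu(x) + 1 feature
# map (all sizes invented):
import numpy as np

def elu1(x):
    return np.where(x > 0, x, np.exp(x) - 1.) + 1.   # elu(x) + 1 keeps features positive

hidden = 8
q = elu1(np.random.randn(hidden))       # query of the receiving node
k = elu1(np.random.randn(5, hidden))    # keys of its 5 neighbours
v = np.random.randn(5, hidden)          # values of its 5 neighbours

# the send/recv steps accumulate these two statistics per node
sum_k = k.sum(axis=0)                   # [hidden]
sum_kTv = k.T @ v                       # [hidden, hidden], sum of outer products k_i v_i^T

out = (q @ sum_kTv) / (q @ sum_k)       # numerator / normaliser, as in the recv step
# equivalent softmax-free attention: sum_i (q . k_i) v_i / sum_i (q . k_i)
ref = (q @ k.T) @ v / (q @ k.T).sum()
assert np.allclose(out, ref)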
def basic_gru(input, init_hidden, hidden_size, num_layers=1, sequence_length=None, dropout_prob=0.0, bidirectional=False, batch_first=True, param_attr=None, bias_attr=None, gate_activation=None, activation=None, dtype='float32', name='basic_gru'): r""" GRU implementation using basic operator, supports multiple layers and bidirectional gru. .. math:: u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u) r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r) m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m) h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t) Args: input (Variable): GRU input tensor, if batch_first = False, shape should be ( seq_len x batch_size x input_size ) if batch_first = True, shape should be ( batch_size x seq_len x hidden_size ) init_hidden(Variable|None): The initial hidden state of the GRU This is a tensor with shape ( num_layers x batch_size x hidden_size) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) and can be reshaped to tensor with ( num_layers x 2 x batch_size x hidden_size) to use. If it's None, it will be set to all 0. hidden_size (int): Hidden size of the GRU num_layers (int): The total number of layers of the GRU sequence_length (Variabe|None): A Tensor (shape [batch_size]) stores each real length of each instance, This tensor will be convert to a mask to mask the padding ids If it's None means NO padding ids dropout_prob(float|0.0): Dropout prob, dropout ONLY works after rnn output of each layers, NOT between time steps bidirectional (bool|False): If it is bidirectional batch_first (bool|True): The shape format of the input and output tensors. If true, the shape format should be :attr:`[batch_size, seq_len, hidden_size]`. If false, the shape format should be :attr:`[seq_len, batch_size, hidden_size]`. By default this function accepts input and emits output in batch-major form to be consistent with most of data format, though a bit less efficient because of extra transposes. param_attr(ParamAttr|None): The parameter attribute for the learnable weight matrix. Note: If it is set to None or one attribute of ParamAttr, gru_unit will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|None): The parameter attribute for the bias of GRU unit. If it is set to None or one attribute of ParamAttr, gru_unit will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. gate_activation (function|None): The activation function for gates (actGate). Default: 'fluid.layers.sigmoid' activation (function|None): The activation function for cell (actNode). Default: 'fluid.layers.tanh' dtype(string): data type used in this unit name(string): name used to identify parameters and biases Returns: rnn_out(Tensor),last_hidden(Tensor) - rnn_out is result of GRU hidden, with shape (seq_len x batch_size x hidden_size) \ if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2) - last_hidden is the hidden state of the last step of GRU \ shape is ( num_layers x batch_size x hidden_size ) \ if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size), can be reshaped to a tensor with shape( num_layers x 2 x batch_size x hidden_size) Examples: .. 
code-block:: python import paddle.fluid.layers as layers from paddle.fluid.contrib.layers import basic_gru batch_size = 20 input_size = 128 hidden_size = 256 num_layers = 2 dropout = 0.5 bidirectional = True batch_first = False input = layers.data( name = "input", shape = [-1, batch_size, input_size], dtype='float32') pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32') sequence_length = layers.data( name="sequence_length", shape=[-1], dtype='int32') rnn_out, last_hidden = basic_gru( input, pre_hidden, hidden_size, num_layers = num_layers, \ sequence_length = sequence_length, dropout_prob=dropout, bidirectional = bidirectional, \ batch_first = batch_first) """ fw_unit_list = [] for i in range(num_layers): new_name = name + "_layers_" + str(i) if param_attr is not None and param_attr.name is not None: layer_param_attr = copy.deepcopy(param_attr) layer_param_attr.name += "_fw_w_" + str(i) else: layer_param_attr = param_attr if bias_attr is not None and bias_attr.name is not None: layer_bias_attr = copy.deepcopy(bias_attr) layer_bias_attr.name += "_fw_b_" + str(i) else: layer_bias_attr = bias_attr fw_unit_list.append( BasicGRUUnit(new_name, hidden_size, layer_param_attr, layer_bias_attr, gate_activation, activation, dtype)) if bidirectional: bw_unit_list = [] for i in range(num_layers): new_name = name + "_reverse_layers_" + str(i) if param_attr is not None and param_attr.name is not None: layer_param_attr = copy.deepcopy(param_attr) layer_param_attr.name += "_bw_w_" + str(i) else: layer_param_attr = param_attr if bias_attr is not None and bias_attr.name is not None: layer_bias_attr = copy.deepcopy(bias_attr) layer_bias_attr.name += "_bw_b_" + str(i) else: layer_bias_attr = bias_attr bw_unit_list.append( BasicGRUUnit(new_name, hidden_size, layer_param_attr, layer_bias_attr, gate_activation, activation, dtype)) if batch_first: input = layers.transpose(input, [1, 0, 2]) mask = None if sequence_length: max_seq_len = layers.shape(input)[0] mask = layers.sequence_mask( sequence_length, maxlen=max_seq_len, dtype='float32') mask = layers.transpose(mask, [1, 0]) direc_num = 1 if bidirectional: direc_num = 2 if init_hidden: init_hidden = layers.reshape( init_hidden, shape=[num_layers, direc_num, -1, hidden_size]) def get_single_direction_output(rnn_input, unit_list, mask=None, direc_index=0): rnn = StaticRNN() with rnn.step(): step_input = rnn.step_input(rnn_input) if mask: step_mask = rnn.step_input(mask) for i in range(num_layers): if init_hidden: pre_hidden = rnn.memory(init=init_hidden[i, direc_index]) else: pre_hidden = rnn.memory( batch_ref=rnn_input, shape=[-1, hidden_size], ref_batch_dim_idx=1) new_hidden = unit_list[i](step_input, pre_hidden) if mask: new_hidden = layers.elementwise_mul( new_hidden, step_mask, axis=0) - layers.elementwise_mul( pre_hidden, (step_mask - 1), axis=0) rnn.update_memory(pre_hidden, new_hidden) rnn.step_output(new_hidden) step_input = new_hidden if dropout_prob != None and dropout_prob > 0.0: step_input = layers.dropout( step_input, dropout_prob=dropout_prob, ) rnn.step_output(step_input) rnn_out = rnn() last_hidden_array = [] rnn_output = rnn_out[-1] for i in range(num_layers): last_hidden = rnn_out[i] last_hidden = last_hidden[-1] last_hidden_array.append(last_hidden) last_hidden_output = layers.concat(last_hidden_array, axis=0) last_hidden_output = layers.reshape( last_hidden_output, shape=[num_layers, -1, hidden_size]) return rnn_output, last_hidden_output # seq_len, batch_size, hidden_size fw_rnn_out, fw_last_hidden = 
get_single_direction_output( input, fw_unit_list, mask, direc_index=0) if bidirectional: bw_input = layers.reverse(input, axis=[0]) bw_mask = None if mask: bw_mask = layers.reverse(mask, axis=[0]) bw_rnn_out, bw_last_hidden = get_single_direction_output( bw_input, bw_unit_list, bw_mask, direc_index=1) bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0]) rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2) last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1) last_hidden = layers.reshape( last_hidden, shape=[num_layers * direc_num, -1, hidden_size]) if batch_first: rnn_out = layers.transpose(rnn_out, [1, 0, 2]) return rnn_out, last_hidden else: rnn_out = fw_rnn_out last_hidden = fw_last_hidden if batch_first: rnn_out = layers.transpose(rnn_out, [1, 0, 2]) return rnn_out, last_hidden
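# The update documented in the basic_gru docstring can be written out in a few
# lines of NumPy. A single-step, single-layer sketch with invented sizes; the
# weights are random stand-ins, not the layer's actual parameters:
import numpy as np

def sigmoid(x):
    return 1. / (1. + np.exp(-x))

input_size, hidden_size, batch = 4, 6, 2
x = np.random.randn(batch, input_size)
h_prev = np.zeros((batch, hidden_size))

# one weight matrix per gate, applied to the concatenation [x, h]
W_u, W_r, W_c = (np.random.randn(input_size + hidden_size, hidden_size)
                 for _ in range(3))
b_u = b_r = b_c = np.zeros(hidden_size)

u = sigmoid(np.hstack([x, h_prev]) @ W_u + b_u)       # update gate
r = sigmoid(np.hstack([x, h_prev]) @ W_r + b_r)       # reset gate
m = np.tanh(np.hstack([x, r * h_prev]) @ W_c + b_c)   # candidate state
h = u * h_prev + (1. - u) * m                         # h_t = dot(u, h_{t-1}) + dot(1 - u, m_t)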
def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): weight_1_arr = [] weight_2_arr = [] bias_arr = [] hidden_array = [] cell_array = [] mask_array = [] for i in range(num_layers): weight_1 = layers.create_parameter( [hidden_size * 2, hidden_size * 4], dtype="float32", name="fc_weight1_" + str(i), default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) weight_1_arr.append(weight_1) bias_1 = layers.create_parameter( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), default_initializer=fluid.initializer.Constant(0.0)) bias_arr.append(bias_1) pre_hidden = layers.slice(init_hidden, axes=[0], starts=[i], ends=[i + 1]) pre_cell = layers.slice(init_cell, axes=[0], starts=[i], ends=[i + 1]) pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size]) pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size]) hidden_array.append(pre_hidden) cell_array.append(pre_cell) input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2]) rnn = PaddingRNN() with rnn.step(): input = rnn.step_input(input_embedding) for k in range(num_layers): pre_hidden = rnn.memory(init=hidden_array[k]) pre_cell = rnn.memory(init=cell_array[k]) weight_1 = weight_1_arr[k] bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) gate_input = layers.matmul(x=nn, y=weight_1) gate_input = layers.elementwise_add(gate_input, bias) i = layers.slice(gate_input, axes=[1], starts=[0], ends=[hidden_size]) j = layers.slice(gate_input, axes=[1], starts=[hidden_size], ends=[hidden_size * 2]) f = layers.slice(gate_input, axes=[1], starts=[hidden_size * 2], ends=[hidden_size * 3]) o = layers.slice(gate_input, axes=[1], starts=[hidden_size * 3], ends=[hidden_size * 4]) c = pre_cell * layers.sigmoid(f) + layers.sigmoid( i) * layers.tanh(j) m = layers.tanh(c) * layers.sigmoid(o) rnn.update_memory(pre_hidden, m) rnn.update_memory(pre_cell, c) rnn.step_output(m) rnn.step_output(c) input = m if dropout != None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, dropout_implementation='upscale_in_train') rnn.step_output(input) rnnout = rnn() last_hidden_array = [] last_cell_array = [] real_res = rnnout[-1] for i in range(num_layers): m = rnnout[i * 2] c = rnnout[i * 2 + 1] m.stop_gradient = True c.stop_gradient = True last_h = layers.slice(m, axes=[0], starts=[num_steps - 1], ends=[num_steps]) last_hidden_array.append(last_h) last_c = layers.slice(c, axes=[0], starts=[num_steps - 1], ends=[num_steps]) last_cell_array.append(last_c) real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = layers.concat(last_hidden_array, 0) last_cell = layers.concat(last_cell_array, 0) return real_res, last_hidden, last_cell
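# The fused gate computation in padding_rnn (one matmul over [input, h],
# then four slices) is the standard LSTM cell. A one-step NumPy sketch with
# invented sizes and random stand-in parameters:
import numpy as np

def sigmoid(x):
    return 1. / (1. + np.exp(-x))

hidden_size, batch = 5, 3
x = np.random.randn(batch, hidden_size)        # step input, already of width hidden_size
h_prev = np.zeros((batch, hidden_size))
c_prev = np.zeros((batch, hidden_size))
W = np.random.randn(2 * hidden_size, 4 * hidden_size)   # same shape as fc_weight1_*
b = np.zeros(4 * hidden_size)

gate_input = np.concatenate([x, h_prev], axis=1) @ W + b
i, j, f, o = np.split(gate_input, 4, axis=1)   # the four slices taken in the step loop

c = c_prev * sigmoid(f) + sigmoid(i) * np.tanh(j)   # new cell state
m = np.tanh(c) * sigmoid(o)                         # new hidden state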
def graph_transformer(gw, feature, edge_feature, hidden_size, name, num_heads=4, attn_drop=False, concat=True, skip_feat=True, gate=False, norm=True, relu=True, is_test=False): """Implementation of graph Transformer from UniMP This is an implementation of the paper Unified Massage Passing Model for Semi-Supervised Classification (https://arxiv.org/abs/2009.03509). Args: name: Granph Transformer layer names. gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`) feature: A tensor with shape (num_nodes, feature_size). hidden_size: The hidden size for graph transformer. num_heads: The head number in graph transformer. attn_drop: Dropout rate for attention. edge_feature: A tensor with shape (num_edges, feature_size). concat: Reshape the output (num_nodes, num_heads, hidden_size) by concat (num_nodes, hidden_size * num_heads) or mean (num_nodes, hidden_size) skip_feat: Whether use skip connect gate: Whether add skip_feat and output up with gate weight norm: Whether use layer_norm for output relu: Whether use relu activation for output is_test: Whether in test phrase. Return: A tensor with shape (num_nodes, hidden_size * num_heads) or (num_nodes, hidden_size) """ def send_attention(src_feat, dst_feat, edge_feat): if edge_feat is None or not edge_feat: output = src_feat["k_h"] * dst_feat["q_h"] output = L.reduce_sum(output, -1) output = output / (hidden_size**0.5) return { "alpha": output, "v": src_feat["v_h"] } # batch x h batch x h x feat else: edge_feat = edge_feat["edge"] edge_feat = L.reshape(edge_feat, [-1, num_heads, hidden_size]) output = (src_feat["k_h"] + edge_feat) * dst_feat["q_h"] output = L.reduce_sum(output, -1) output = output / (hidden_size**0.5) return { "alpha": output, "v": (src_feat["v_h"] + edge_feat) } # batch x h batch x h x feat class Reduce_attention(): def __init__(self, ): self.alpha = None def __call__(self, msg): alpha = msg["alpha"] # lod-tensor (batch_size, num_heads) if attn_drop: old_h = alpha dropout = F.data(name='attn_drop', shape=[1], dtype="int64") u = L.uniform_random(shape=L.cast(L.shape(alpha)[:1], 'int64'), min=0., max=1.) 
keeped = L.cast(u > dropout, dtype="float32") self_attn_mask = L.scale(x=keeped, scale=10000.0, bias=-1.0, bias_after_scale=False) n_head_self_attn_mask = L.stack(x=[self_attn_mask] * num_heads, axis=1) n_head_self_attn_mask.stop_gradient = True alpha = n_head_self_attn_mask + alpha alpha = L.lod_reset(alpha, old_h) h = msg["v"] alpha = paddle_helper.sequence_softmax(alpha) self.alpha = alpha old_h = h h = h * alpha h = L.lod_reset(h, old_h) h = L.sequence_pool(h, "sum") if concat: h = L.reshape(h, [-1, num_heads * hidden_size]) else: h = L.reduce_mean(h, dim=1) return h reduce_attention = Reduce_attention() q = linear(feature, hidden_size * num_heads, name=name + '_q_weight', init_type='gcn') k = linear(feature, hidden_size * num_heads, name=name + '_k_weight', init_type='gcn') v = linear(feature, hidden_size * num_heads, name=name + '_v_weight', init_type='gcn') reshape_q = L.reshape(q, [-1, num_heads, hidden_size]) reshape_k = L.reshape(k, [-1, num_heads, hidden_size]) reshape_v = L.reshape(v, [-1, num_heads, hidden_size]) msg = gw.send(send_attention, nfeat_list=[("q_h", reshape_q), ("k_h", reshape_k), ("v_h", reshape_v)], efeat_list=[('edge', edge_feature)]) out_feat = gw.recv(msg, reduce_attention) if skip_feat: if concat: skip_feature = linear(feature, hidden_size * num_heads, name=name + '_skip_weight', init_type='lin') else: skip_feature = linear(feature, hidden_size, name=name + '_skip_weight', init_type='lin') if gate: temp_output = L.concat( [skip_feature, out_feat, out_feat - skip_feature], axis=-1) gate_f = L.sigmoid( linear(temp_output, 1, name=name + '_gate_weight', init_type='lin')) out_feat = skip_feature * gate_f + out_feat * (1 - gate_f) else: out_feat = skip_feature + out_feat if norm: out_feat = layer_norm(out_feat, name="ln_%s" % name) if relu: out_feat = L.relu(out_feat) return out_feat
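# For one destination node the send/recv pair in graph_transformer reduces to
# ordinary scaled dot-product attention over its incoming edges. A single-head
# NumPy sketch (sizes invented, edge features and attention dropout omitted):
import numpy as np

hidden = 8
q_dst = np.random.randn(hidden)         # query of the destination node
k_src = np.random.randn(4, hidden)      # keys of its 4 source neighbours
v_src = np.random.randn(4, hidden)      # values of its 4 source neighbours

alpha = (k_src * q_dst).sum(axis=1) / hidden ** 0.5   # send_attention: one score per edge
alpha = np.exp(alpha - alpha.max())
alpha /= alpha.sum()                                  # reduce: softmax over incoming edges
out = (alpha[:, None] * v_src).sum(axis=0)            # weighted sum of neighbour values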
def lm_model(hidden_size, vocab_size, num_layers=2, num_steps=20, init_scale=0.1, dropout=None, rnn_model='static', use_dataloader=False): def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): weight_1_arr = [] weight_2_arr = [] bias_arr = [] hidden_array = [] cell_array = [] mask_array = [] for i in range(num_layers): weight_1 = layers.create_parameter( [hidden_size * 2, hidden_size * 4], dtype="float32", name="fc_weight1_" + str(i), default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) weight_1_arr.append(weight_1) bias_1 = layers.create_parameter( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), default_initializer=fluid.initializer.Constant(0.0)) bias_arr.append(bias_1) pre_hidden = layers.slice(init_hidden, axes=[0], starts=[i], ends=[i + 1]) pre_cell = layers.slice(init_cell, axes=[0], starts=[i], ends=[i + 1]) pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size]) pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size]) hidden_array.append(pre_hidden) cell_array.append(pre_cell) input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2]) rnn = PaddingRNN() with rnn.step(): input = rnn.step_input(input_embedding) for k in range(num_layers): pre_hidden = rnn.memory(init=hidden_array[k]) pre_cell = rnn.memory(init=cell_array[k]) weight_1 = weight_1_arr[k] bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) gate_input = layers.matmul(x=nn, y=weight_1) gate_input = layers.elementwise_add(gate_input, bias) i = layers.slice(gate_input, axes=[1], starts=[0], ends=[hidden_size]) j = layers.slice(gate_input, axes=[1], starts=[hidden_size], ends=[hidden_size * 2]) f = layers.slice(gate_input, axes=[1], starts=[hidden_size * 2], ends=[hidden_size * 3]) o = layers.slice(gate_input, axes=[1], starts=[hidden_size * 3], ends=[hidden_size * 4]) c = pre_cell * layers.sigmoid(f) + layers.sigmoid( i) * layers.tanh(j) m = layers.tanh(c) * layers.sigmoid(o) rnn.update_memory(pre_hidden, m) rnn.update_memory(pre_cell, c) rnn.step_output(m) rnn.step_output(c) input = m if dropout != None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, dropout_implementation='upscale_in_train') rnn.step_output(input) rnnout = rnn() last_hidden_array = [] last_cell_array = [] real_res = rnnout[-1] for i in range(num_layers): m = rnnout[i * 2] c = rnnout[i * 2 + 1] m.stop_gradient = True c.stop_gradient = True last_h = layers.slice(m, axes=[0], starts=[num_steps - 1], ends=[num_steps]) last_hidden_array.append(last_h) last_c = layers.slice(c, axes=[0], starts=[num_steps - 1], ends=[num_steps]) last_cell_array.append(last_c) real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = layers.concat(last_hidden_array, 0) last_cell = layers.concat(last_cell_array, 0) return real_res, last_hidden, last_cell def encoder_static(input_embedding, len=3, init_hidden=None, init_cell=None): weight_1_arr = [] weight_2_arr = [] bias_arr = [] hidden_array = [] cell_array = [] mask_array = [] for i in range(num_layers): weight_1 = layers.create_parameter( [hidden_size * 2, hidden_size * 4], dtype="float32", name="fc_weight1_" + str(i), default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) weight_1_arr.append(weight_1) bias_1 = layers.create_parameter( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), default_initializer=fluid.initializer.Constant(0.0)) bias_arr.append(bias_1) pre_hidden = layers.slice(init_hidden, axes=[0], starts=[i], ends=[i + 1]) 
pre_cell = layers.slice(init_cell, axes=[0], starts=[i], ends=[i + 1]) pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size], inplace=True) pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size], inplace=True) hidden_array.append(pre_hidden) cell_array.append(pre_cell) res = [] sliced_inputs = layers.split(input_embedding, num_or_sections=len, dim=1) for index in range(len): input = sliced_inputs[index] input = layers.reshape(input, shape=[-1, hidden_size], inplace=True) for k in range(num_layers): pre_hidden = hidden_array[k] pre_cell = cell_array[k] weight_1 = weight_1_arr[k] bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) gate_input = layers.matmul(x=nn, y=weight_1) gate_input = layers.elementwise_add(gate_input, bias) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) c = pre_cell * layers.sigmoid(f) + layers.sigmoid( i) * layers.tanh(j) m = layers.tanh(c) * layers.sigmoid(o) hidden_array[k] = m cell_array[k] = c input = m if dropout != None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, dropout_implementation='upscale_in_train') res.append(input) last_hidden = layers.concat(hidden_array, 1) last_hidden = layers.reshape(last_hidden, shape=[-1, num_layers, hidden_size], inplace=True) last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = layers.concat(cell_array, 1) last_cell = layers.reshape(last_cell, shape=[-1, num_layers, hidden_size]) last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2]) real_res = layers.concat(res, 0) real_res = layers.reshape(real_res, shape=[len, -1, hidden_size], inplace=True) real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) return real_res, last_hidden, last_cell x = fluid.data(name="x", shape=[None, num_steps, 1], dtype='int64') y = fluid.data(name="y", shape=[None, 1], dtype='int64') if use_dataloader: dataloader = fluid.io.DataLoader.from_generator(feed_list=[x, y], capacity=16, iterable=False, use_double_buffer=True) init_hidden = fluid.data(name="init_hidden", shape=[None, num_layers, hidden_size], dtype='float32') init_cell = fluid.data(name="init_cell", shape=[None, num_layers, hidden_size], dtype='float32') init_cell.persistable = True init_hidden.persistable = True init_hidden = layers.transpose(init_hidden, perm=[1, 0, 2]) init_cell = layers.transpose(init_cell, perm=[1, 0, 2]) init_hidden_reshape = layers.reshape(init_hidden, shape=[num_layers, -1, hidden_size]) init_cell_reshape = layers.reshape(init_cell, shape=[num_layers, -1, hidden_size]) x_emb = layers.embedding( input=x, size=[vocab_size, hidden_size], dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))) x_emb = layers.reshape(x_emb, shape=[-1, num_steps, hidden_size], inplace=True) if dropout != None and dropout > 0.0: x_emb = layers.dropout(x_emb, dropout_prob=dropout, dropout_implementation='upscale_in_train') if rnn_model == "padding": rnn_out, last_hidden, last_cell = padding_rnn( x_emb, len=num_steps, init_hidden=init_hidden_reshape, init_cell=init_cell_reshape) elif rnn_model == "static": rnn_out, last_hidden, last_cell = encoder_static( x_emb, len=num_steps, init_hidden=init_hidden_reshape, init_cell=init_cell_reshape) elif rnn_model == "cudnn": x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) rnn_out, last_hidden, last_cell = layers.lstm( x_emb, init_hidden_reshape, init_cell_reshape, num_steps, hidden_size, num_layers, is_bidirec=False, 
default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2]) elif rnn_model == "basic_lstm": rnn_out, last_hidden, last_cell = basic_lstm( x_emb, init_hidden, init_cell, hidden_size, \ num_layers=num_layers, batch_first=True, dropout_prob=dropout, \ param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale) ), \ bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) ), \ forget_bias = 0.0) else: print("type not support") return rnn_out = layers.reshape(rnn_out, shape=[-1, num_steps, hidden_size], inplace=True) softmax_weight = layers.create_parameter( [hidden_size, vocab_size], dtype="float32", name="softmax_weight", default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) softmax_bias = layers.create_parameter( [vocab_size], dtype="float32", name='softmax_bias', default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) projection = layers.matmul(rnn_out, softmax_weight) projection = layers.elementwise_add(projection, softmax_bias) projection = layers.reshape(projection, shape=[-1, vocab_size], inplace=True) loss = layers.softmax_with_cross_entropy(logits=projection, label=y, soft_label=False) loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True) loss = layers.reduce_mean(loss, dim=[0]) loss = layers.reduce_sum(loss) loss.persistable = True last_cell.persistable = True last_hidden.persistable = True # This will feed last_hidden, last_cell to init_hidden, init_cell, which # can be used directly in next batch. This can avoid the fetching of # last_hidden and last_cell and feeding of init_hidden and init_cell in # each training step. last_hidden = layers.transpose(last_hidden, perm=[1, 0, 2]) last_cell = layers.transpose(last_cell, perm=[1, 0, 2]) feeding_list = ['x', 'y', 'init_hidden', 'init_cell'] if use_dataloader: return loss, last_hidden, last_cell, feeding_list, dataloader else: return loss, last_hidden, last_cell, feeding_list
def encoder_static(input_embedding, len=3, init_hidden=None, init_cell=None): weight_1_arr = [] weight_2_arr = [] bias_arr = [] hidden_array = [] cell_array = [] mask_array = [] for i in range(num_layers): weight_1 = layers.create_parameter( [hidden_size * 2, hidden_size * 4], dtype="float32", name="fc_weight1_" + str(i), default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) weight_1_arr.append(weight_1) bias_1 = layers.create_parameter( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), default_initializer=fluid.initializer.Constant(0.0)) bias_arr.append(bias_1) pre_hidden = layers.slice(init_hidden, axes=[0], starts=[i], ends=[i + 1]) pre_cell = layers.slice(init_cell, axes=[0], starts=[i], ends=[i + 1]) pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size], inplace=True) pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size], inplace=True) hidden_array.append(pre_hidden) cell_array.append(pre_cell) res = [] sliced_inputs = layers.split(input_embedding, num_or_sections=len, dim=1) for index in range(len): input = sliced_inputs[index] input = layers.reshape(input, shape=[-1, hidden_size], inplace=True) for k in range(num_layers): pre_hidden = hidden_array[k] pre_cell = cell_array[k] weight_1 = weight_1_arr[k] bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) gate_input = layers.matmul(x=nn, y=weight_1) gate_input = layers.elementwise_add(gate_input, bias) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) c = pre_cell * layers.sigmoid(f) + layers.sigmoid( i) * layers.tanh(j) m = layers.tanh(c) * layers.sigmoid(o) hidden_array[k] = m cell_array[k] = c input = m if dropout != None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, dropout_implementation='upscale_in_train') res.append(input) last_hidden = layers.concat(hidden_array, 1) last_hidden = layers.reshape(last_hidden, shape=[-1, num_layers, hidden_size], inplace=True) last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = layers.concat(cell_array, 1) last_cell = layers.reshape(last_cell, shape=[-1, num_layers, hidden_size]) last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2]) real_res = layers.concat(res, 0) real_res = layers.reshape(real_res, shape=[len, -1, hidden_size], inplace=True) real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) return real_res, last_hidden, last_cell
def knowledge_seq2seq(config): """ knowledge seq2seq """ emb_size = config.embed_size hidden_size = config.hidden_size input_size = emb_size num_layers = config.num_layers bi_direc = config.bidirectional batch_size = config.batch_size vocab_size = config.vocab_size run_type = config.run_type enc_input = layers.data(name="enc_input", shape=[1], dtype='int64', lod_level=1) #enc_input --> goal enc_mask = layers.data(name="enc_mask", shape=[-1, 1], dtype='float32') goal_input = layers.data(name="goal_input", shape=[1], dtype='int64', lod_level=1) #goal_input --> x cue_input = layers.data(name="cue_input", shape=[1], dtype='int64', lod_level=1) #cue_input --> kg #cue_mask = layers.data(name='cue_mask', shape=[-1, 1], dtype='float32') memory_mask = layers.data(name='memory_mask', shape=[-1, 1], dtype='float32') tar_input = layers.data(name='tar_input', shape=[1], dtype='int64', lod_level=1) #tar_input --> y # tar_mask = layers.data(name="tar_mask", shape=[-1, 1], dtype='float32') rnn_hidden_size = hidden_size if bi_direc: rnn_hidden_size //= 2 enc_out, enc_last_hidden = \ rnn_encoder(enc_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc, dropout=0.0, batch_first=True, name="rnn_enc") goal_out, goal_last_hidden = \ rnn_encoder(goal_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc, dropout=0.0, batch_first=True, name="rnn_enc1") context_goal_out = fluid.layers.concat( input=[enc_last_hidden, goal_last_hidden], axis=2) context_goal_out = layers.reshape(context_goal_out, shape=[-1, 1, rnn_hidden_size * 4]) # context_goal_out = layers.squeeze(context_goal_out, axes=[1]) context_goal_out = fluid.layers.fc(context_goal_out, size=rnn_hidden_size * 2, bias_attr=False) context_goal_out = layers.unsqueeze(context_goal_out, axes=[0]) bridge_out = fc(context_goal_out, hidden_size, hidden_size, name="bridge") bridge_out = layers.tanh(bridge_out) cue_last_mask = layers.data(name='cue_last_mask', shape=[-1], dtype='float32') knowledge_out, knowledge_last_hidden = \ rnn_encoder(cue_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc, dropout=0.0, batch_first=True, last_mask=cue_last_mask, name="knowledge_enc") query = layers.slice(bridge_out, axes=[0], starts=[0], ends=[1]) query = layers.squeeze(query, axes=[0]) query = layers.unsqueeze(query, axes=[1]) query = layers.reshape(query, shape=[batch_size, -1, hidden_size]) cue_memory = layers.slice(knowledge_last_hidden, axes=[0], starts=[0], ends=[1]) cue_memory = layers.reshape(cue_memory, shape=[batch_size, -1, hidden_size]) memory_mask = layers.reshape(memory_mask, shape=[batch_size, 1, -1]) weighted_cue, cue_att = dot_attention(query, cue_memory, mask=memory_mask) cue_att = layers.reshape(cue_att, shape=[batch_size, -1]) knowledge = weighted_cue if config.use_posterior: print("config.use_posterior", config.use_posterior) target_out, target_last_hidden = \ rnn_encoder(tar_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc, dropout=0.0, batch_first=True, name="knowledge_enc1") target_goal_out = fluid.layers.concat( input=[target_last_hidden, goal_last_hidden], axis=2) target_goal_out = layers.reshape(target_goal_out, shape=[-1, 1, rnn_hidden_size * 4]) # target_goal_out = layers.squeeze(target_goal_out, axes=[1]) target_goal_out = fluid.layers.fc(target_goal_out, size=rnn_hidden_size * 2, bias_attr=False) target_goal_out = layers.unsqueeze(target_goal_out, axes=[0]) # get attenion # target_query = layers.slice(target_last_hidden, axes=[0], 
starts=[0], ends=[1]) target_query = layers.slice(target_goal_out, axes=[0], starts=[0], ends=[1]) target_query = layers.squeeze(target_query, axes=[0]) target_query = layers.unsqueeze(target_query, axes=[1]) target_query = layers.reshape(target_query, shape=[batch_size, -1, hidden_size]) weight_target, target_att = dot_attention(target_query, cue_memory, mask=memory_mask) target_att = layers.reshape(target_att, shape=[batch_size, -1]) # add to output knowledge = weight_target enc_memory_mask = layers.data(name="enc_memory_mask", shape=[-1, 1], dtype='float32') enc_memory_mask = layers.unsqueeze(enc_memory_mask, axes=[1]) # decoder init_hidden, enc_memory, enc_mask dec_init_hidden = bridge_out pad_value = fluid.layers.assign(input=np.array([0.0], dtype='float32')) enc_memory, origl_len_1 = layers.sequence_pad(x=enc_out, pad_value=pad_value) enc_memory.persistable = True gru_unit = GRU_unit(input_size + hidden_size, hidden_size, num_layers=num_layers, dropout=0.0, name="decoder_gru_unit") cue_gru_unit = GRU_unit(hidden_size + hidden_size, hidden_size, num_layers=num_layers, dropout=0.0, name="decoder_cue_gru_unit") tgt_vocab_size = config.vocab_size if run_type == "train": if config.use_bow: bow_logits = fc(knowledge, hidden_size, hidden_size, name='bow_fc_1') bow_logits = layers.tanh(bow_logits) bow_logits = fc(bow_logits, hidden_size, tgt_vocab_size, name='bow_fc_2') bow_logits = layers.softmax(bow_logits) bow_label = layers.data(name='bow_label', shape=[-1, config.max_len], dtype='int64') bow_mask = layers.data(name="bow_mask", shape=[-1, config.max_len], dtype='float32') bow_logits = layers.expand(bow_logits, [1, config.max_len, 1]) bow_logits = layers.reshape(bow_logits, shape=[-1, tgt_vocab_size]) bow_label = layers.reshape(bow_label, shape=[-1, 1]) bow_loss = layers.cross_entropy(bow_logits, bow_label, soft_label=False) bow_loss = layers.reshape(bow_loss, shape=[-1, config.max_len]) bow_loss *= bow_mask bow_loss = layers.reduce_sum(bow_loss, dim=[1]) bow_loss = layers.reduce_mean(bow_loss) dec_input = layers.data(name="dec_input", shape=[-1, 1, 1], dtype='int64') dec_mask = layers.data(name="dec_mask", shape=[-1, 1], dtype='float32') dec_knowledge = weight_target knowledge_goal_out = fluid.layers.concat( input=[dec_knowledge, target_query], axis=2) knowledge_goal_out = layers.reshape(knowledge_goal_out, shape=[-1, 1, rnn_hidden_size * 4]) # knowledge_goal_out = layers.squeeze(knowledge_goal_out, axes=[1]) knowledge_goal_out = fluid.layers.fc(knowledge_goal_out, size=rnn_hidden_size * 2, bias_attr=False) knowledge_goal_out = layers.unsqueeze(knowledge_goal_out, axes=[0]) decoder_logits = \ rnn_decoder(gru_unit, cue_gru_unit, dec_input, input_size, hidden_size, num_layers, enc_memory, enc_memory_mask, dec_knowledge, vocab_size, init_hidden=dec_init_hidden, mask=dec_mask, dropout=config.dropout) target_label = layers.data(name='target_label', shape=[-1, 1], dtype='int64') target_mask = layers.data(name='target_mask', shape=[-1, 1], dtype='float32') decoder_logits = layers.reshape(decoder_logits, shape=[-1, tgt_vocab_size]) target_label = layers.reshape(target_label, shape=[-1, 1]) nll_loss = layers.cross_entropy(decoder_logits, target_label, soft_label=False) nll_loss = layers.reshape(nll_loss, shape=[batch_size, -1]) nll_loss *= target_mask nll_loss = layers.reduce_sum(nll_loss, dim=[1]) nll_loss = layers.reduce_mean(nll_loss) prior_attn = cue_att + 1e-10 posterior_att = target_att posterior_att.stop_gradient = True prior_attn = layers.log(prior_attn) kl_loss = posterior_att * 
(layers.log(posterior_att + 1e-10) - prior_attn) kl_loss = layers.reduce_mean(kl_loss) kl_and_nll_factor = layers.data(name='kl_and_nll_factor', shape=[1], dtype='float32') kl_and_nll_factor = layers.reshape(kl_and_nll_factor, shape=[-1]) final_loss = bow_loss + kl_loss * kl_and_nll_factor + nll_loss * kl_and_nll_factor return [bow_loss, kl_loss, nll_loss, final_loss] elif run_type == "test": beam_size = config.beam_size batch_size = config.batch_size token = layers.fill_constant(shape=[batch_size * beam_size, 1], value=config.bos_id, dtype='int64') token = layers.reshape(token, shape=[-1, 1]) max_decode_len = config.max_dec_len dec_knowledge = knowledge INF = 100000000.0 init_score_np = np.ones([beam_size * batch_size], dtype='float32') * -INF for i in range(batch_size): init_score_np[i * beam_size] = 0.0 pre_score = layers.assign(init_score_np) pos_index_np = np.arange(batch_size).reshape(-1, 1) pos_index_np = \ np.tile(pos_index_np, (1, beam_size)).reshape(-1).astype('int32') * beam_size pos_index = layers.assign(pos_index_np) id_array = [] score_array = [] index_array = [] init_enc_memory = layers.expand(enc_memory, [1, beam_size, 1]) init_enc_memory = layers.reshape( init_enc_memory, shape=[batch_size * beam_size, -1, hidden_size]) init_enc_mask = layers.expand(enc_memory_mask, [1, beam_size, 1]) init_enc_mask = layers.reshape(init_enc_mask, shape=[batch_size * beam_size, 1, -1]) dec_knowledge = layers.reshape(dec_knowledge, shape=[-1, 1, hidden_size]) init_dec_knowledge = layers.expand(dec_knowledge, [1, beam_size, 1]) init_dec_knowledge = layers.reshape( init_dec_knowledge, shape=[batch_size * beam_size, -1, hidden_size]) dec_init_hidden = layers.expand(dec_init_hidden, [1, 1, beam_size]) dec_init_hidden = layers.reshape(dec_init_hidden, shape=[1, -1, hidden_size]) length_average = config.length_average UNK = config.unk_id EOS = config.eos_id for i in range(1, max_decode_len + 1): dec_emb = get_embedding(token, input_size, vocab_size) dec_out, dec_last_hidden = \ decoder_step(gru_unit, cue_gru_unit, dec_emb, dec_init_hidden, input_size, hidden_size, init_enc_memory, init_enc_mask, init_dec_knowledge, mask=None) output_in_size = hidden_size + hidden_size rnnout = layers.dropout(dec_out, dropout_prob=config.dropout, is_test=True) rnnout = fc(rnnout, output_in_size, hidden_size, name='dec_out_fc1') rnnout = fc(rnnout, hidden_size, vocab_size, name='dec_out_fc2') log_softmax_output = log_softmax(rnnout) log_softmax_output = layers.squeeze(log_softmax_output, axes=[1]) if i > 1: if length_average: log_softmax_output = layers.elementwise_add( (log_softmax_output / i), (pre_score * (1.0 - 1.0 / i)), axis=0) else: log_softmax_output = layers.elementwise_add( log_softmax_output, pre_score, axis=0) else: log_softmax_output = layers.elementwise_add(log_softmax_output, pre_score, axis=0) log_softmax_output = layers.reshape(log_softmax_output, shape=[batch_size, -1]) topk_score, topk_index = layers.topk(log_softmax_output, k=beam_size) topk_score = layers.reshape(topk_score, shape=[-1]) topk_index = layers.reshape(topk_index, shape=[-1]) vocab_var = layers.fill_constant([1], dtype='int64', value=vocab_size) new_token = topk_index % vocab_var index = topk_index // vocab_var id_array.append(new_token) index_array.append(index) index = index + pos_index score_array.append(topk_score) eos_ids = layers.fill_constant([beam_size * batch_size], dtype='int64', value=EOS) unk_ids = layers.fill_constant([beam_size * batch_size], dtype='int64', value=UNK) eos_eq = layers.cast(layers.equal(new_token, 
eos_ids), dtype='float32') topk_score += eos_eq * -100000000.0 unk_eq = layers.cast(layers.equal(new_token, unk_ids), dtype='float32') topk_score += unk_eq * -100000000.0 # update token = new_token pre_score = topk_score token = layers.reshape(token, shape=[-1, 1]) index = layers.cast(index, dtype='int32') dec_last_hidden = layers.squeeze(dec_last_hidden, axes=[0]) dec_init_hidden = layers.gather(dec_last_hidden, index=index) dec_init_hidden = layers.unsqueeze(dec_init_hidden, axes=[0]) init_enc_memory = layers.gather(init_enc_memory, index) init_enc_mask = layers.gather(init_enc_mask, index) init_dec_knowledge = layers.gather(init_dec_knowledge, index) final_score = layers.concat(score_array, axis=0) final_ids = layers.concat(id_array, axis=0) final_index = layers.concat(index_array, axis=0) final_score = layers.reshape( final_score, shape=[max_decode_len, beam_size * batch_size]) final_ids = layers.reshape( final_ids, shape=[max_decode_len, beam_size * batch_size]) final_index = layers.reshape( final_index, shape=[max_decode_len, beam_size * batch_size]) return final_score, final_ids, final_index
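# The index arithmetic in the decode loop (token from topk_index % vocab,
# parent beam from topk_index // vocab) is worth seeing in isolation. A NumPy
# sketch for one batch element with invented numbers:
import numpy as np

beam_size, vocab_size = 2, 5
pre_score = np.array([0.0, -1.2])       # accumulated log-probs of the 2 beams
log_probs = np.log(np.random.dirichlet(np.ones(vocab_size), size=beam_size))

# flatten beams and vocab into one axis, like reshape(..., [batch_size, -1])
scores = (log_probs + pre_score[:, None]).reshape(-1)
topk_index = np.argsort(-scores)[:beam_size]
topk_score = scores[topk_index]

new_token = topk_index % vocab_size     # which word each surviving beam emits
parent = topk_index // vocab_size       # which beam it extends; used to gather states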
def static_identity(x):
    x = layers.reshape(x, x.shape)
    return x
def network(items_num, hidden_size, step, bs): stdv = 1.0 / math.sqrt(hidden_size) items = fluid.data(name="items", shape=[bs, -1], dtype="int64") #[batch_size, uniq_max] seq_index = fluid.data(name="seq_index", shape=[bs, -1, 2], dtype="int32") #[batch_size, seq_max, 2] last_index = fluid.data(name="last_index", shape=[bs, 2], dtype="int32") #[batch_size, 2] adj_in = fluid.data(name="adj_in", shape=[bs, -1, -1], dtype="float32") #[batch_size, seq_max, seq_max] adj_out = fluid.data(name="adj_out", shape=[bs, -1, -1], dtype="float32") #[batch_size, seq_max, seq_max] mask = fluid.data(name="mask", shape=[bs, -1, 1], dtype="float32") #[batch_size, seq_max, 1] label = fluid.data(name="label", shape=[bs, 1], dtype="int64") #[batch_size, 1] datas = [items, seq_index, last_index, adj_in, adj_out, mask, label] py_reader = fluid.io.DataLoader.from_generator(capacity=256, feed_list=datas, iterable=False) feed_datas = datas items_emb = fluid.embedding( input=items, param_attr=fluid.ParamAttr(name="emb", initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), size=[items_num, hidden_size]) #[batch_size, uniq_max, h] pre_state = items_emb for i in range(step): pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size]) state_in = layers.fc( input=pre_state, name="state_in", size=hidden_size, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)), bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) #[batch_size, uniq_max, h] state_out = layers.fc( input=pre_state, name="state_out", size=hidden_size, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)), bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) #[batch_size, uniq_max, h] state_adj_in = layers.matmul(adj_in, state_in) #[batch_size, uniq_max, h] state_adj_out = layers.matmul(adj_out, state_out) #[batch_size, uniq_max, h] gru_input = layers.concat([state_adj_in, state_adj_out], axis=2) gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2]) gru_fc = layers.fc(input=gru_input, name="gru_fc", size=3 * hidden_size, bias_attr=False) pre_state, _, _ = fluid.layers.gru_unit(input=gru_fc, hidden=layers.reshape( x=pre_state, shape=[-1, hidden_size]), size=3 * hidden_size) final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size]) seq = layers.gather_nd(final_state, seq_index) last = layers.gather_nd(final_state, last_index) seq_fc = layers.fc( input=seq, name="seq_fc", size=hidden_size, bias_attr=False, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) #[batch_size, seq_max, h] last_fc = layers.fc( input=last, name="last_fc", size=hidden_size, bias_attr=False, act=None, num_flatten_dims=1, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) #[bathc_size, h] seq_fc_t = layers.transpose(seq_fc, perm=[1, 0, 2]) #[seq_max, batch_size, h] add = layers.elementwise_add(seq_fc_t, last_fc) #[seq_max, batch_size, h] b = layers.create_parameter( shape=[hidden_size], dtype='float32', default_initializer=fluid.initializer.Constant(value=0.0)) #[h] add = layers.elementwise_add(add, b) #[seq_max, batch_size, h] add_sigmoid = layers.sigmoid(add) #[seq_max, batch_size, h] add_sigmoid = layers.transpose(add_sigmoid, perm=[1, 0, 2]) #[batch_size, seq_max, h] weight = layers.fc( input=add_sigmoid, name="weight_fc", size=1, 
act=None, num_flatten_dims=2, bias_attr=False, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) #[batch_size, seq_max, 1] weight *= mask weight_mask = layers.elementwise_mul(seq, weight, axis=0) #[batch_size, seq_max, h] global_attention = layers.reduce_sum(weight_mask, dim=1) #[batch_size, h] final_attention = layers.concat([global_attention, last], axis=1) #[batch_size, 2*h] final_attention_fc = layers.fc( input=final_attention, name="final_attention_fc", size=hidden_size, bias_attr=False, act=None, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) #[batch_size, h] all_vocab = layers.create_global_var(shape=[items_num - 1], value=0, dtype="int64", persistable=True, name="all_vocab") all_emb = fluid.embedding(input=all_vocab, param_attr=fluid.ParamAttr( name="emb", initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), size=[items_num, hidden_size]) #[all_vocab, h] logits = layers.matmul(x=final_attention_fc, y=all_emb, transpose_y=True) #[batch_size, all_vocab] softmax = layers.softmax_with_cross_entropy(logits=logits, label=label) #[batch_size, 1] loss = layers.reduce_mean(softmax) # [1] acc = layers.accuracy(input=logits, label=label, k=20) return loss, acc, py_reader, feed_datas
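For reference, the readout at the end of network() is a soft attention over the node states: every position is scored against the last-clicked item, padded positions are masked out, and the weighted sum is concatenated with the last state. A small numpy sketch of that readout (the weight arguments are stand-ins for the fc parameters, not the trained values):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def session_readout(seq, last, mask, w_seq, w_last, b, w_att):
    # seq:  [batch, seq_max, h]  node states gathered per position
    # last: [batch, h]           state of the last item in the session
    # mask: [batch, seq_max, 1]  1 for real positions, 0 for padding
    add = seq @ w_seq + (last @ w_last)[:, None, :] + b   # [batch, seq_max, h]
    weight = sigmoid(add) @ w_att                         # [batch, seq_max, 1]
    weight = weight * mask                                # drop padded positions
    global_att = (seq * weight).sum(axis=1)               # [batch, h]
    return np.concatenate([global_att, last], axis=1)     # [batch, 2*h]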
def multi_head_attention(queries, keys, values, attn_bias, d_key, d_value, d_model, n_head=1, dropout_rate=0., cache=None, param_initializer=None, name='multi_head_att'): """ Multi-Head Attention. Note that attn_bias is added to the logit before computing softmax activiation to mask certain selected positions so that they will not considered in attention weights. """ keys = queries if keys is None else keys values = keys if values is None else values if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): raise ValueError( "Inputs: quries, keys and values should all be 3-D tensors.") def __compute_qkv(queries, keys, values, n_head, d_key, d_value): """ Add linear projection to queries, keys, and values. """ q = layers.fc(input=queries, size=d_key * n_head, num_flatten_dims=2, param_attr=fluid.ParamAttr( name=name + '_query_fc.w_0', initializer=param_initializer), bias_attr=name + '_query_fc.b_0') k = layers.fc(input=keys, size=d_key * n_head, num_flatten_dims=2, param_attr=fluid.ParamAttr( name=name + '_key_fc.w_0', initializer=param_initializer), bias_attr=name + '_key_fc.b_0') v = layers.fc(input=values, size=d_value * n_head, num_flatten_dims=2, param_attr=fluid.ParamAttr( name=name + '_value_fc.w_0', initializer=param_initializer), bias_attr=name + '_value_fc.b_0') return q, k, v def __split_heads(x, n_head): """ Reshape the last dimension of inpunt tensor x so that it becomes two dimensions and then transpose. Specifically, input a tensor with shape [bs, max_sequence_length, n_head * hidden_dim] then output a tensor with shape [bs, n_head, max_sequence_length, hidden_dim]. """ hidden_size = x.shape[-1] # The value 0 in shape attr means copying the corresponding dimension # size of the input as the output dimension size. reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) # permuate the dimensions into: # [batch_size, n_head, max_sequence_len, hidden_size_per_head] return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) def __combine_heads(x): """ Transpose and then reshape the last two dimensions of inpunt tensor x so that it becomes one dimension, which is reverse to __split_heads. """ if len(x.shape) == 3: return x if len(x.shape) != 4: raise ValueError("Input(x) should be a 4-D Tensor.") trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) # The value 0 in shape attr means copying the corresponding dimension # size of the input as the output dimension size. return layers.reshape( x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True) def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): """ Scaled Dot-Product Attention """ scaled_q = layers.scale(x=q, scale=d_key**-0.5) product = layers.matmul(x=scaled_q, y=k, transpose_y=True) if attn_bias: product += attn_bias weights = layers.softmax(product) if dropout_rate: weights = layers.dropout(weights, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False) out = layers.matmul(weights, v) return out q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) if cache is not None: # use cache and concat time steps # Since the inplace reshape in __split_heads changes the shape of k and # v, which is the cache input for next time step, reshape the cache # input from the previous time step first. 
k = cache["k"] = layers.concat( [layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1) v = cache["v"] = layers.concat( [layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1) q = __split_heads(q, n_head) k = __split_heads(k, n_head) v = __split_heads(v, n_head) ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate) out = __combine_heads(ctx_multiheads) # Project back to the model size. proj_out = layers.fc(input=out, size=d_model, num_flatten_dims=2, param_attr=fluid.ParamAttr( name=name + '_output_fc.w_0', initializer=param_initializer), bias_attr=name + '_output_fc.b_0') return proj_out
def conditional_gru(input, encode_hidden, init_hidden, encode_hidden_size, hidden_size, num_layers=1, sequence_length=None, dropout_prob=0.0, bidirectional=False, batch_first=True, param_attr=None, bias_attr=None, gate_activation=None, activation=None, dtype="float32", name="conditional_gru"): """ Defines a new GRU variant with the extra parameters Cu, Cr and C. The updated GRU equations are: .. math:: u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + C_u h_i + b_u) r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + C_r h_i + b_r) m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + C_u h_i + C h_i + b_m) h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t) All other definitions are the same as for the standard GRU. Args: input (Variable): GRU input tensor, if batch_first = False, shape should be ( seq_len x batch_size x input_size ) if batch_first = True, shape should be ( batch_size x seq_len x hidden_size ) encode_hidden: The hidden state from the encoder of the GRU. If bidirectional is True, encode_hidden is assumed to contain two parts: the former half is for the forward direction and the latter half is for the backward direction. encode_hidden_size: The size of encode_hidden. If bidirectional is True, encode_hidden_size includes both the former half and the latter half, i.e., the actual size of encode_hidden is encode_hidden_size / 2 init_hidden(Variable|None): The initial hidden state of the GRU This is a tensor with shape ( num_layers x batch_size x hidden_size) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) and can be reshaped to a tensor with shape ( num_layers x 2 x batch_size x hidden_size) to use. If it's None, it will be set to all 0. hidden_size (int): Hidden size of the GRU num_layers (int): The total number of layers of the GRU sequence_length (Variable|None): A Tensor (shape [batch_size]) that stores the real length of each instance. This tensor will be converted to a mask to mask the padding ids. If it's None there are NO padding ids dropout_prob(float|0.0): Dropout prob, dropout ONLY works after rnn output of each layer, NOT between time steps bidirectional (bool|False): If it is bidirectional batch_first (bool|True): The shape format of the input and output tensors. If true, the shape format should be :attr:`[batch_size, seq_len, hidden_size]`. If false, the shape format should be :attr:`[seq_len, batch_size, hidden_size]`. By default this function accepts input and emits output in batch-major form to be consistent with most of data format, though a bit less efficient because of extra transposes. param_attr(ParamAttr|None): The parameter attribute for the learnable weight matrix. Note: If it is set to None or one attribute of ParamAttr, gru_unit will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|None): The parameter attribute for the bias of GRU unit. If it is set to None or one attribute of ParamAttr, gru_unit will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. gate_activation (function|None): The activation function for gates (actGate). Default: 'fluid.layers.sigmoid' activation (function|None): The activation function for cell (actNode).
Default: 'fluid.layers.tanh' dtype(string): data type used in this unit name(string): name used to identify parameters and biases Returns: rnn_out(Tensor),last_hidden(Tensor) - rnn_out is result of GRU hidden, with shape (seq_len x batch_size x hidden_size) \ if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2) - last_hidden is the hidden state of the last step of GRU \ shape is ( num_layers x batch_size x hidden_size ) \ if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size), can be reshaped to a tensor with shape( num_layers x 2 x batch_size x hidden_size) - all_hidden is all the hidden states of the input, including the last_hidden and medium hidden states. \ shape is (num_layers x seq_len x batch_size x hidden_size). if is_bidirec set to True, shape will be (2 x num_layers x seq_len x batch_size x hidden_size) """ if bidirectional: encode_hidden, bw_encode_hidden = layers.split(encode_hidden, num_or_sections=2, dim=-1) encode_hidden_size = int(encode_hidden_size / 2) fw_unit_list = [] for i in range(num_layers): new_name = name + '_layers_' + str(i) if param_attr is not None and param_attr.name is not None: layer_param_attr = copy.deepcopy(param_attr) layer_param_attr.name += '_fw_w_' + str(i) else: layer_param_attr = param_attr if bias_attr is not None and bias_attr.name is not None: layer_bias_attr = copy.deepcopy(bias_attr) layer_bias_attr.name += '_fw_b_' + str(i) else: layer_bias_attr = bias_attr fw_unit_list.append( ConditionalGRUUnit(new_name, encode_hidden_size, hidden_size, layer_param_attr, layer_bias_attr, gate_activation, activation, dtype) ) if bidirectional: bw_unit_list = [] for i in range(num_layers): new_name = name + '_reverse_layers_' + str(i) if param_attr is not None and param_attr.name is not None: layer_param_attr = copy.deepcopy(param_attr) layer_param_attr.name += '_bw_w_' + str(i) else: layer_param_attr = param_attr if bias_attr is not None and bias_attr.name is not None: layer_bias_attr = copy.deepcopy(bias_attr) layer_bias_attr.name += '_bw_b_' + str(i) else: layer_bias_attr = bias_attr bw_unit_list.append( ConditionalGRUUnit(new_name, encode_hidden_size, hidden_size, layer_param_attr, layer_bias_attr, gate_activation, activation, dtype) ) if batch_first: input = layers.transpose(input, [1, 0, 2]) mask = None if sequence_length: max_seq_len = layers.shape(input)[0] mask = layers.sequence_mask( sequence_length, maxlen=max_seq_len, dtype='float32' ) mask = layers.transpose(mask, [1, 0]) direc_num = 1 if bidirectional: direc_num = 2 if init_hidden: init_hidden = layers.reshape( init_hidden, shape=[num_layers, direc_num, -1, hidden_size] ) def get_single_direction_output(rnn_input, encode_hidden, unit_list, mask=None, direc_index=0): rnn = StaticRNN() #print(rnn_input.shape) with rnn.step(): step_input = rnn.step_input(rnn_input) if mask: step_mask = rnn.step_input(mask) for i in range(num_layers): if init_hidden: pre_hidden = rnn.memory(init=init_hidden[i, direc_index]) else: pre_hidden = rnn.memory(batch_ref=rnn_input, shape=[-1, hidden_size], ref_batch_dim_idx=1) encode_h = encode_hidden[i] pre_encode_hidden = layers.concat([pre_hidden, encode_h], axis=1) new_hidden = unit_list[i](step_input, pre_encode_hidden) if mask: new_hidden = layers.elementwise_mul( new_hidden, step_mask, axis=0) - layers.elementwise_mul( pre_hidden, (step_mask - 1), axis=0) rnn.update_memory(pre_hidden, new_hidden) rnn.step_output(new_hidden) step_input = new_hidden if dropout_prob is not None and dropout_prob > 0.0: step_input = 
layers.dropout(step_input, dropout_prob=dropout_prob) rnn.step_output(step_input) rnn_out = rnn() last_hidden_array = [] all_hidden_array = [] # also collect every layer's hidden states, not only the last ones rnn_output = rnn_out[-1] for i in range(num_layers): last_hidden = rnn_out[i] all_hidden_array.append(last_hidden) last_hidden = last_hidden[-1] last_hidden_array.append(last_hidden) all_hidden_array = layers.concat(all_hidden_array, axis=0) all_hidden_array = layers.reshape(all_hidden_array, shape=[num_layers, input.shape[0], -1, hidden_size]) last_hidden_output = layers.concat(last_hidden_array, axis=0) last_hidden_output = layers.reshape(last_hidden_output, shape=[num_layers, -1, hidden_size]) return rnn_output, last_hidden_output, all_hidden_array fw_rnn_out, fw_last_hidden, fw_all_hidden = get_single_direction_output( input, encode_hidden, fw_unit_list, mask, direc_index=0) if bidirectional: bw_input = layers.reverse(input, axis=[0]) bw_mask = None if mask: bw_mask = layers.reverse(mask, axis=[0]) bw_rnn_out, bw_last_hidden, bw_all_hidden = get_single_direction_output( bw_input, bw_encode_hidden, bw_unit_list, bw_mask, direc_index=1) bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0]) rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2) last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1) all_hidden = layers.concat([fw_all_hidden, bw_all_hidden], axis=0) last_hidden = layers.reshape( last_hidden, shape=[num_layers * direc_num, -1, hidden_size]) if batch_first: rnn_out = layers.transpose(rnn_out, [1, 0, 2]) return rnn_out, last_hidden, all_hidden else: rnn_out = fw_rnn_out last_hidden = fw_last_hidden all_hidden = fw_all_hidden if batch_first: rnn_out = layers.transpose(rnn_out, [1, 0, 2]) return rnn_out, last_hidden, all_hidden
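One cell update of the conditional GRU described in the docstring can be written directly from those equations. The sketch below uses illustrative parameter names (p["W_u"], p["C"], ...) and folds the conditioning of the candidate state into a single C matrix; it is not the ConditionalGRUUnit implementation itself:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def conditional_gru_step(x, h_prev, h_enc, p):
    # x: [batch, input_size], h_prev: [batch, hidden], h_enc: [batch, encode_hidden]
    u = sigmoid(x @ p["W_u"] + h_prev @ p["U_u"] + h_enc @ p["C_u"] + p["b_u"])
    r = sigmoid(x @ p["W_r"] + h_prev @ p["U_r"] + h_enc @ p["C_r"] + p["b_r"])
    m = np.tanh(x @ p["W_c"] + (r * h_prev) @ p["U_c"] + h_enc @ p["C"] + p["b_c"])
    return u * h_prev + (1.0 - u) * m    # new hidden state h_t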
def basic_lstm(input, init_hidden, init_cell, hidden_size, num_layers=1, sequence_length=None, dropout_prob=0.0, bidirectional=False, batch_first=True, param_attr=None, bias_attr=None, gate_activation=None, activation=None, forget_bias=1.0, dtype='float32', name='basic_lstm'): r""" LSTM implementation using basic operators, supports multiple layers and bidirectional LSTM. .. math:: i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i) f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias ) o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o) \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c) c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} h_t &= o_t \odot tanh(c_t) Args: input (Variable): lstm input tensor, if batch_first = False, shape should be ( seq_len x batch_size x input_size ) if batch_first = True, shape should be ( batch_size x seq_len x hidden_size ) init_hidden(Variable|None): The initial hidden state of the LSTM This is a tensor with shape ( num_layers x batch_size x hidden_size) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) and can be reshaped to a tensor with shape ( num_layers x 2 x batch_size x hidden_size) to use. If it's None, it will be set to all 0. init_cell(Variable|None): The initial hidden state of the LSTM This is a tensor with shape ( num_layers x batch_size x hidden_size) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) and can be reshaped to a tensor with shape ( num_layers x 2 x batch_size x hidden_size) to use. If it's None, it will be set to all 0. hidden_size (int): Hidden size of the LSTM num_layers (int): The total number of layers of the LSTM sequence_length (Variabe|None): A tensor (shape [batch_size]) stores each real length of each instance, This tensor will be convert to a mask to mask the padding ids If it's None means NO padding ids dropout_prob(float|0.0): Dropout prob, dropout ONLY work after rnn output of each layers, NOT between time steps bidirectional (bool|False): If it is bidirectional batch_first (bool|True): The shape format of the input and output tensors. If true, the shape format should be :attr:`[batch_size, seq_len, hidden_size]`. If false, the shape format should be :attr:`[seq_len, batch_size, hidden_size]`. By default this function accepts input and emits output in batch-major form to be consistent with most of data format, though a bit less efficient because of extra transposes. param_attr(ParamAttr|None): The parameter attribute for the learnable weight matrix. Note: If it is set to None or one attribute of ParamAttr, lstm_unit will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|None): The parameter attribute for the bias of LSTM unit. If it is set to None or one attribute of ParamAttr, lstm_unit will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. gate_activation (function|None): The activation function for gates (actGate). Default: 'fluid.layers.sigmoid' activation (function|None): The activation function for cell (actNode). 
Default: 'fluid.layers.tanh' forget_bias (float|1.0) : Forget bias used to compute the forget gate dtype(string): Data type used in this unit name(string): Name used to identify parameters and biases Returns: rnn_out(Tensor), last_hidden(Tensor), last_cell(Tensor) - rnn_out is the result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \ if is_bidirec set to True, it's shape will be ( seq_len x batch_sze x hidden_size*2) - last_hidden is the hidden state of the last step of LSTM \ with shape ( num_layers x batch_size x hidden_size ) \ if is_bidirec set to True, it's shape will be ( num_layers*2 x batch_size x hidden_size), and can be reshaped to a tensor ( num_layers x 2 x batch_size x hidden_size) to use. - last_cell is the hidden state of the last step of LSTM \ with shape ( num_layers x batch_size x hidden_size ) \ if is_bidirec set to True, it's shape will be ( num_layers*2 x batch_size x hidden_size), and can be reshaped to a tensor ( num_layers x 2 x batch_size x hidden_size) to use. Examples: .. code-block:: python import paddle.fluid.layers as layers from paddle.fluid.contrib.layers import basic_lstm batch_size = 20 input_size = 128 hidden_size = 256 num_layers = 2 dropout = 0.5 bidirectional = True batch_first = False input = layers.data( name = "input", shape = [-1, batch_size, input_size], dtype='float32') pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32') pre_cell = layers.data( name = "pre_cell", shape=[-1, hidden_size], dtype='float32') sequence_length = layers.data( name="sequence_length", shape=[-1], dtype='int32') rnn_out, last_hidden, last_cell = basic_lstm( input, pre_hidden, pre_cell, \ hidden_size, num_layers = num_layers, \ sequence_length = sequence_length, dropout_prob=dropout, bidirectional = bidirectional, \ batch_first = batch_first) """ fw_unit_list = [] for i in range(num_layers): new_name = name + "_layers_" + str(i) if param_attr is not None and param_attr.name is not None: layer_param_attr = copy.deepcopy(param_attr) layer_param_attr.name += "_fw_w_" + str(i) else: layer_param_attr = param_attr if bias_attr is not None and bias_attr.name is not None: layer_bias_attr = copy.deepcopy(bias_attr) layer_bias_attr.name += "_fw_b_" + str(i) else: layer_bias_attr = bias_attr fw_unit_list.append( BasicLSTMUnit( new_name, hidden_size, param_attr=layer_param_attr, bias_attr=layer_bias_attr, gate_activation=gate_activation, activation=activation, forget_bias=forget_bias, dtype=dtype)) if bidirectional: bw_unit_list = [] for i in range(num_layers): new_name = name + "_reverse_layers_" + str(i) if param_attr is not None and param_attr.name is not None: layer_param_attr = copy.deepcopy(param_attr) layer_param_attr.name += "_bw_w_" + str(i) else: layer_param_attr = param_attr if bias_attr is not None and bias_attr.name is not None: layer_bias_attr = copy.deepcopy(bias_attr) layer_bias_attr.name += "_bw_b_" + str(i) else: layer_bias_attr = param_attr bw_unit_list.append( BasicLSTMUnit( new_name, hidden_size, param_attr=layer_param_attr, bias_attr=layer_bias_attr, gate_activation=gate_activation, activation=activation, forget_bias=forget_bias, dtype=dtype)) if batch_first: input = layers.transpose(input, [1, 0, 2]) mask = None if sequence_length: max_seq_len = layers.shape(input)[0] mask = layers.sequence_mask( sequence_length, maxlen=max_seq_len, dtype='float32') mask = layers.transpose(mask, [1, 0]) direc_num = 1 if bidirectional: direc_num = 2 # convert to [num_layers, 2, batch_size, hidden_size] if init_hidden: 
init_hidden = layers.reshape( init_hidden, shape=[num_layers, direc_num, -1, hidden_size]) init_cell = layers.reshape( init_cell, shape=[num_layers, direc_num, -1, hidden_size]) # forward direction def get_single_direction_output(rnn_input, unit_list, mask=None, direc_index=0): rnn = StaticRNN() with rnn.step(): step_input = rnn.step_input(rnn_input) if mask: step_mask = rnn.step_input(mask) for i in range(num_layers): if init_hidden: pre_hidden = rnn.memory(init=init_hidden[i, direc_index]) pre_cell = rnn.memory(init=init_cell[i, direc_index]) else: pre_hidden = rnn.memory( batch_ref=rnn_input, shape=[-1, hidden_size]) pre_cell = rnn.memory( batch_ref=rnn_input, shape=[-1, hidden_size]) new_hidden, new_cell = unit_list[i](step_input, pre_hidden, pre_cell) if mask: new_hidden = layers.elementwise_mul( new_hidden, step_mask, axis=0) - layers.elementwise_mul( pre_hidden, (step_mask - 1), axis=0) new_cell = layers.elementwise_mul( new_cell, step_mask, axis=0) - layers.elementwise_mul( pre_cell, (step_mask - 1), axis=0) rnn.update_memory(pre_hidden, new_hidden) rnn.update_memory(pre_cell, new_cell) rnn.step_output(new_hidden) rnn.step_output(new_cell) step_input = new_hidden if dropout_prob != None and dropout_prob > 0.0: step_input = layers.dropout( step_input, dropout_prob=dropout_prob, dropout_implementation='upscale_in_train') rnn.step_output(step_input) rnn_out = rnn() last_hidden_array = [] last_cell_array = [] rnn_output = rnn_out[-1] for i in range(num_layers): last_hidden = rnn_out[i * 2] last_hidden = last_hidden[-1] last_hidden_array.append(last_hidden) last_cell = rnn_out[i * 2 + 1] last_cell = last_cell[-1] last_cell_array.append(last_cell) last_hidden_output = layers.concat(last_hidden_array, axis=0) last_hidden_output = layers.reshape( last_hidden_output, shape=[num_layers, -1, hidden_size]) last_cell_output = layers.concat(last_cell_array, axis=0) last_cell_output = layers.reshape( last_cell_output, shape=[num_layers, -1, hidden_size]) return rnn_output, last_hidden_output, last_cell_output # seq_len, batch_size, hidden_size fw_rnn_out, fw_last_hidden, fw_last_cell = get_single_direction_output( input, fw_unit_list, mask, direc_index=0) if bidirectional: bw_input = layers.reverse(input, axis=[0]) bw_mask = None if mask: bw_mask = layers.reverse(mask, axis=[0]) bw_rnn_out, bw_last_hidden, bw_last_cell = get_single_direction_output( bw_input, bw_unit_list, bw_mask, direc_index=1) bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0]) rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2) last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1) last_hidden = layers.reshape( last_hidden, shape=[num_layers * direc_num, -1, hidden_size]) last_cell = layers.concat([fw_last_cell, bw_last_cell], axis=1) last_cell = layers.reshape( last_cell, shape=[num_layers * direc_num, -1, hidden_size]) if batch_first: rnn_out = layers.transpose(rnn_out, [1, 0, 2]) return rnn_out, last_hidden, last_cell else: rnn_out = fw_rnn_out last_hidden = fw_last_hidden last_cell = fw_last_cell if batch_first: rnn_out = layers.transpose(rnn_out, [1, 0, 2]) return rnn_out, last_hidden, last_cell
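Both RNN helpers above rely on the same masking identity inside the StaticRNN step: new_state * mask - pre_state * (mask - 1) equals new_state * mask + pre_state * (1 - mask), so finished (padded) time steps simply carry the previous state forward. A tiny numpy check:

import numpy as np

pre, new = np.array([1.0, 2.0]), np.array([5.0, 6.0])
mask = np.array([1.0, 0.0])              # second sequence already ended at this step
out = new * mask - pre * (mask - 1.0)
assert np.allclose(out, [5.0, 2.0])      # new state where mask=1, old state where mask=0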
def decode(context, is_sparse): init_state = context array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) # fill the first element with init_state state_array = pd.create_array('float32') pd.array_write(init_state, array=state_array, i=counter) # ids, scores as memory ids_array = pd.create_array('int64') scores_array = pd.create_array('float32') init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) init_scores = pd.data(name="init_scores", shape=[1], dtype="float32", lod_level=2) pd.array_write(init_ids, array=ids_array, i=counter) pd.array_write(init_scores, array=scores_array, i=counter) cond = pd.less_than(x=counter, y=array_len) while_op = pd.While(cond=cond) with while_op.block(): pre_ids = pd.array_read(array=ids_array, i=counter) pre_state = pd.array_read(array=state_array, i=counter) pre_score = pd.array_read(array=scores_array, i=counter) # expand the lod of pre_state to be the same with pre_score pre_state_expanded = pd.sequence_expand(pre_state, pre_score) pre_ids_emb = pd.embedding(input=pre_ids, size=[dict_size, word_dim], dtype='float32', is_sparse=is_sparse) # use rnn unit to update rnn current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb], size=decoder_size, act='tanh') current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score) # use score to do beam search current_score = pd.fc(input=current_state_with_lod, size=target_dict_dim, act='softmax') topk_scores, topk_indices = pd.topk(current_score, k=beam_size) # calculate accumulated scores after topk to reduce computation cost accu_scores = pd.elementwise_add(x=pd.log(topk_scores), y=pd.reshape(pre_score, shape=[-1]), axis=0) selected_ids, selected_scores = pd.beam_search(pre_ids, pre_score, topk_indices, accu_scores, beam_size, end_id=10, level=0) pd.increment(x=counter, value=1, in_place=True) # update the memories pd.array_write(current_state, array=state_array, i=counter) pd.array_write(selected_ids, array=ids_array, i=counter) pd.array_write(selected_scores, array=scores_array, i=counter) # update the break condition: up to the max length or all candidates of # source sentences have ended. length_cond = pd.less_than(x=counter, y=array_len) finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) pd.logical_and(x=length_cond, y=finish_cond, out=cond) translation_ids, translation_scores = pd.beam_search_decode( ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10) # return init_ids, init_scores return translation_ids, translation_scores
def relative_transformer(src_vocab_size, trg_vocab_size, max_length, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing, embedding_sharing, label_smooth_eps, use_py_reader=False, is_test=False, params_type="normal", all_data_inputs=None): """ transformer """ if embedding_sharing: assert src_vocab_size == trg_vocab_size, ( "Vocabularies in source and target should be same for weight sharing." ) data_input_names = encoder_data_input_fields + \ decoder_data_input_fields[:-1] + label_data_input_fields + dense_bias_input_fields if use_py_reader: all_inputs = all_data_inputs else: all_inputs = make_all_inputs(data_input_names) enc_inputs_len = len(encoder_data_input_fields) dec_inputs_len = len(decoder_data_input_fields[:-1]) enc_inputs = all_inputs[0:enc_inputs_len] dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len] real_label = all_inputs[enc_inputs_len + dec_inputs_len] weights = all_inputs[enc_inputs_len + dec_inputs_len + 1] reverse_label = all_inputs[enc_inputs_len + dec_inputs_len + 2] enc_output = wrap_encoder(src_vocab_size, max_length, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing, embedding_sharing, enc_inputs, params_type=params_type) predict = wrap_decoder(trg_vocab_size, max_length, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing, embedding_sharing, dec_inputs, enc_output, is_train=True if not is_test else False, params_type=params_type) # Padding index do not contribute to the total loss. The weights is used to # cancel padding index in calculating the loss. if label_smooth_eps: label = layers.one_hot(input=real_label, depth=trg_vocab_size) label = label * (1 - label_smooth_eps) + (1 - label) * ( label_smooth_eps / (trg_vocab_size - 1)) label.stop_gradient = True else: label = real_label cost = layers.softmax_with_cross_entropy( logits=predict, label=label, soft_label=True if label_smooth_eps else False) weighted_cost = cost * weights sum_cost = layers.reduce_sum(weighted_cost) sum_cost.persistable = True token_num = layers.reduce_sum(weights) token_num.persistable = True token_num.stop_gradient = True avg_cost = sum_cost / token_num sen_count = layers.shape(dec_inputs[0])[0] batch_predict = layers.reshape( predict, shape=[sen_count, -1, ModelHyperParams.trg_vocab_size]) batch_label = layers.reshape(real_label, shape=[sen_count, -1]) batch_weights = layers.reshape(weights, shape=[sen_count, -1, 1]) return sum_cost, avg_cost, token_num, batch_predict, cost, sum_cost, batch_label, batch_weights
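The label smoothing applied above keeps 1 - label_smooth_eps of the probability mass on the gold token and spreads the remaining eps uniformly over the other trg_vocab_size - 1 tokens. A quick numpy sketch of that transform:

import numpy as np

def smooth_labels(one_hot, eps):
    vocab = one_hot.shape[-1]
    return one_hot * (1.0 - eps) + (1.0 - one_hot) * (eps / (vocab - 1))

one_hot = np.eye(5)[[2]]                          # gold token id 2 in a toy vocab of 5
smoothed = smooth_labels(one_hot, eps=0.1)
assert np.allclose(smoothed.sum(axis=-1), 1.0)    # rows remain valid distributions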
def net(self, items_num, hidden_size, step, bs): stdv = 1.0 / math.sqrt(hidden_size) def embedding_layer(input, table_name, emb_dim, initializer_instance=None): emb = fluid.embedding( input=input, size=[items_num, emb_dim], param_attr=fluid.ParamAttr(name=table_name, initializer=initializer_instance), ) return emb sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv) items_emb = embedding_layer(self.items, "emb", hidden_size, sparse_initializer) pre_state = items_emb for i in range(step): pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size]) state_in = layers.fc( input=pre_state, name="state_in", size=hidden_size, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr(initializer=fluid.initializer. Uniform(low=-stdv, high=stdv)), bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, uniq_max, h] state_out = layers.fc( input=pre_state, name="state_out", size=hidden_size, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr(initializer=fluid.initializer. Uniform(low=-stdv, high=stdv)), bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, uniq_max, h] state_adj_in = layers.matmul(self.adj_in, state_in) # [batch_size, uniq_max, h] state_adj_out = layers.matmul( self.adj_out, state_out) # [batch_size, uniq_max, h] gru_input = layers.concat([state_adj_in, state_adj_out], axis=2) gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2]) gru_fc = layers.fc(input=gru_input, name="gru_fc", size=3 * hidden_size, bias_attr=False) pre_state, _, _ = fluid.layers.gru_unit( input=gru_fc, hidden=layers.reshape(x=pre_state, shape=[-1, hidden_size]), size=3 * hidden_size) final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size]) seq = layers.gather_nd(final_state, self.seq_index) last = layers.gather_nd(final_state, self.last_index) seq_fc = layers.fc( input=seq, name="seq_fc", size=hidden_size, bias_attr=False, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, seq_max, h] last_fc = layers.fc( input=last, name="last_fc", size=hidden_size, bias_attr=False, act=None, num_flatten_dims=1, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [bathc_size, h] seq_fc_t = layers.transpose(seq_fc, perm=[1, 0, 2]) # [seq_max, batch_size, h] add = layers.elementwise_add(seq_fc_t, last_fc) # [seq_max, batch_size, h] b = layers.create_parameter( shape=[hidden_size], dtype='float32', default_initializer=fluid.initializer.Constant(value=0.0)) # [h] add = layers.elementwise_add(add, b) # [seq_max, batch_size, h] add_sigmoid = layers.sigmoid(add) # [seq_max, batch_size, h] add_sigmoid = layers.transpose(add_sigmoid, perm=[1, 0, 2]) # [batch_size, seq_max, h] weight = layers.fc( input=add_sigmoid, name="weight_fc", size=1, act=None, num_flatten_dims=2, bias_attr=False, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, seq_max, 1] weight *= self.mask weight_mask = layers.elementwise_mul( seq, weight, axis=0) # [batch_size, seq_max, h] global_attention = layers.reduce_sum(weight_mask, dim=1) # [batch_size, h] final_attention = layers.concat([global_attention, last], axis=1) # [batch_size, 2*h] final_attention_fc = layers.fc( input=final_attention, name="final_attention_fc", size=hidden_size, bias_attr=False, act=None, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, 
high=stdv))) # [batch_size, h] # all_vocab = layers.create_global_var( # shape=[items_num - 1], # value=0, # dtype="int64", # persistable=True, # name="all_vocab") all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32') all_vocab = fluid.layers.cast(x=fluid.layers.assign(all_vocab), dtype='int64') all_emb = fluid.embedding( input=all_vocab, param_attr=fluid.ParamAttr(name="emb", initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), size=[items_num, hidden_size]) # [all_vocab, h] logits = layers.matmul(x=final_attention_fc, y=all_emb, transpose_y=True) # [batch_size, all_vocab] softmax = layers.softmax_with_cross_entropy( logits=logits, label=self.label) # [batch_size, 1] self.loss = layers.reduce_mean(softmax) # [1] self.acc = layers.accuracy(input=logits, label=self.label, k=20)
def wrap_decoder(trg_vocab_size, max_length, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing, dec_inputs=None, enc_output=None, caches=None, gather_idx=None): """ The wrapper assembles together all needed layers for the decoder. """ if dec_inputs is None: # This is used to implement independent decoder program in inference. trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output = \ make_all_inputs(decoder_data_input_fields) else: trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs dec_input = prepare_decoder(trg_word, trg_pos, trg_vocab_size, d_model, max_length, prepostprocess_dropout, word_emb_param_name=word_emb_param_names[0] if weight_sharing else word_emb_param_names[1]) dec_output = decoder(dec_input, enc_output, trg_slf_attn_bias, trg_src_attn_bias, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout, attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd, caches=caches, gather_idx=gather_idx) # Reshape to 2D tensor to use GEMM instead of BatchedGEMM dec_output = layers.reshape(dec_output, shape=[-1, dec_output.shape[-1]], inplace=True) if weight_sharing: predict = layers.matmul( x=dec_output, y=fluid.default_main_program().global_block().var( word_emb_param_names[0]), transpose_y=True) else: predict = layers.fc(input=dec_output, size=trg_vocab_size, bias_attr=False) if dec_inputs is None: # Return probs for independent decoder program. predict = layers.softmax(predict) return predict
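As the comment in wrap_decoder notes, the decoder output is flattened to 2-D so the pre-softmax projection becomes a single GEMM, and with weight sharing that GEMM multiplies by the transposed word embedding. A shape-only numpy sketch (toy sizes):

import numpy as np

bsz, trg_len, d_model, vocab = 4, 7, 16, 100
dec_output = np.random.rand(bsz, trg_len, d_model)
word_emb = np.random.rand(vocab, d_model)               # shared with the target embedding

logits = dec_output.reshape(-1, d_model) @ word_emb.T   # [bsz * trg_len, vocab]
assert logits.shape == (bsz * trg_len, vocab)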
def inference(self, model, inputs, outputs): """ Run inference. Args: inputs(dict): Its key is input name(str) and its value is a Variable. model(object): A generate model. Need to implement `_generation_network` and `_calc_logits`. Returns: dict(str:Variable): Its key is output name(str) and its value is a Variable. """ # prepare while loop max_len = layers.fill_constant(shape=[1], dtype="int64", value=self.max_dec_len, force_cpu=True) min_len = layers.fill_constant(shape=[1], dtype="int64", value=self.min_dec_len, force_cpu=True) step_idx = layers.fill_constant(shape=[1], dtype="int64", value=0, force_cpu=True) ids = layers.array_write(layers.reshape(inputs["tgt_ids"], (-1, 1)), step_idx) pos_biases = layers.array_write( layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx) scores = layers.array_write(inputs["init_score"], step_idx) tgt_generation_mask = layers.array_write(inputs["tgt_generation_mask"], step_idx) parent_idx = inputs["parent_idx"] if self.decoding_strategy == "beam_search": beam_size = self.beam_size else: beam_size = 1 eos_penalty = np.zeros(self.vocab_size, dtype="float32") eos_penalty[self.eos_id] = -1e9 eos_penalty = layers.assign(eos_penalty) token_penalty = np.zeros(self.vocab_size, dtype="float32") token_penalty[self.unk_id] = -1e9 if self.mask_id >= 0: token_penalty[self.mask_id] = -1e9 token_penalty = layers.assign(token_penalty) # start while loop cond = layers.less_than(x=step_idx, y=max_len) while_op = layers.While(cond) with while_op.block(): pre_ids = layers.array_read(array=ids, i=step_idx) pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) pre_scores = layers.array_read(array=scores, i=step_idx) pos_bias = layers.array_read(array=pos_biases, i=step_idx) pos_bias = layers.gather(input=pos_bias, index=parent_idx) tmp_tgt_generation_mask = layers.array_read(tgt_generation_mask, i=step_idx) dtype = tmp_tgt_generation_mask.dtype append_mask = layers.fill_constant_batch_size_like( input=pre_ids, value=1.0, shape=[-1, 1, 1], dtype=dtype) tmp_tgt_generation_mask = layers.concat( [tmp_tgt_generation_mask, append_mask], axis=2) pre_mask = tmp_tgt_generation_mask = layers.gather( input=tmp_tgt_generation_mask, index=parent_idx) pre_sent = layers.fill_constant_batch_size_like( input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype) if self.continuous_position: pre_pos = layers.elementwise_mul( x=layers.fill_constant_batch_size_like( input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype), y=step_idx, axis=0) + pos_bias else: pre_pos = layers.elementwise_mul( x=layers.fill_constant_batch_size_like( input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype), y=step_idx, axis=0) dec_out, _ = model._generation_network( token_ids=pre_ids, type_ids=pre_sent, pos_ids=pre_pos, generation_mask=tmp_tgt_generation_mask, gather_idx=parent_idx) logits = model._calc_logits(dec_out) # ignore unk and mask token if self.ignore_unk: logits = layers.elementwise_add(logits, token_penalty, axis=1) # min dec length min_len_cond = layers.less_than(x=step_idx, y=min_len) def min_len_penalty(): """Plus minimum length penalty.""" return layers.elementwise_add(logits, eos_penalty, axis=1) def no_penalty(): """No penalty.""" return logits logits = layers.case([(min_len_cond, min_len_penalty)], default=no_penalty) # get probs probs = layers.softmax(logits / self.temperature) if self.decoding_strategy == "beam_search": topk_scores, topk_indices = layers.topk(input=probs, k=beam_size) else: if self.decoding_strategy.startswith("sampling"): sampling_ids = 
layers.sampling_id(probs, dtype="int") elif self.decoding_strategy.startswith("topk_sampling"): topk_probs, _ = layers.topk(input=probs, k=self.topk) ge_cond = layers.cast( layers.greater_equal( probs, layers.unsqueeze(topk_probs[:, -1], [1])), "float32") old_probs = probs probs = probs * ge_cond / layers.reduce_sum( topk_probs, dim=-1, keep_dim=True) sampling_ids = layers.sampling_id(probs, dtype="int") probs = old_probs elif self.decoding_strategy.startswith("topp_sampling"): sorted_probs, sorted_idx = layers.argsort(probs, descending=True) cum_sorted_probs = layers.cumsum(sorted_probs, axis=1, exclusive=True) lt_cond = layers.cast( layers.less_than( cum_sorted_probs, layers.fill_constant_batch_size_like( cum_sorted_probs, cum_sorted_probs.shape, cum_sorted_probs.dtype, self.topp)), "float32") old_probs = probs candidate_probs = sorted_probs * lt_cond probs = candidate_probs / layers.reduce_sum( candidate_probs, dim=-1, keep_dim=True) sampling_ids = layers.sampling_id(probs, dtype="int") sampling_ids = layers.index_sample( sorted_idx, layers.unsqueeze(sampling_ids, [1])) sampling_ids = layers.squeeze(sampling_ids, [1]) probs = old_probs else: raise ValueError(self.decoding_strategy) sampling_scores = layers.one_hot( layers.unsqueeze(sampling_ids, [1]), probs.shape[1]) sampling_scores = sampling_scores * probs - ( 1 - sampling_scores) * 1e3 topk_scores, topk_indices = layers.topk(input=sampling_scores, k=1) pre_len = layers.cast(step_idx, "float32") layers.increment(x=step_idx, value=1.0, in_place=True) cur_len = layers.cast(step_idx, "float32") # update scores if self.length_average: accu_scores = layers.elementwise_add(x=layers.log(topk_scores), y=pre_scores * pre_len, axis=0) / cur_len elif self.length_penalty > 0: pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty) cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty) accu_scores = layers.elementwise_add(x=layers.log(topk_scores), y=pre_scores * pre_lp, axis=0) / cur_lp else: accu_scores = layers.elementwise_add(x=layers.log(topk_scores), y=pre_scores, axis=0) topk_indices = layers.lod_reset(topk_indices, pre_ids) accu_scores = layers.lod_reset(accu_scores, pre_ids) selected_ids, selected_scores, gather_idx = layers.beam_search( pre_ids=pre_ids, pre_scores=pre_scores, ids=topk_indices, scores=accu_scores, beam_size=beam_size, end_id=self.eos_id, return_parent_idx=True) layers.array_write(selected_ids, i=step_idx, array=ids) layers.array_write(selected_scores, i=step_idx, array=scores) layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask) layers.array_write(pos_bias, i=step_idx, array=pos_biases) layers.assign(gather_idx, parent_idx) length_cond = layers.less_than(x=step_idx, y=max_len) finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) layers.logical_and(x=length_cond, y=finish_cond, out=cond) finished_ids, finished_scores = layers.beam_search_decode( ids, scores, beam_size=beam_size, end_id=self.eos_id) predictions = { "finished_ids": finished_ids, "finished_scores": finished_scores, "token_ids": inputs["token_ids"], "data_id": inputs["data_id"] } return predictions
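The topk_sampling and topp_sampling branches above both filter the softmax distribution before sampling_id is called: top-k keeps only the k most likely tokens, while top-p keeps the smallest prefix of the sorted distribution whose exclusive cumulative mass stays below topp, then renormalizes. A single-row numpy sketch of both filters (illustrative only, not the graph code):

import numpy as np

rng = np.random.default_rng(0)

def topk_filter(probs, k):
    kth = np.sort(probs)[-k]                # k-th largest probability
    kept = probs * (probs >= kth)
    return kept / kept.sum()

def topp_sample(probs, p):
    order = np.argsort(-probs)
    sorted_probs = probs[order]
    cum = np.cumsum(sorted_probs) - sorted_probs    # exclusive cumsum, as in the code
    kept = sorted_probs * (cum < p)
    kept = kept / kept.sum()
    return order[rng.choice(len(probs), p=kept)]    # map back to the original token id

probs = np.array([0.5, 0.2, 0.15, 0.1, 0.05])
print(topk_filter(probs, k=3))      # only the three most likely tokens survive
print(topp_sample(probs, p=0.8))    # nucleus sampling over the top mass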
def _grammar_step(self, logits, next_cell_states, decode_states, actions, gmr_mask): """Run one decoding step under the grammar constraints. Args: logits (Variable): shape = [batch_size, beam_size, vocab_size] next_cell_states (Variable): NULL decode_states (StateWrapper): NULL Returns: TODO Raises: NULL """ # produce token logits that satisfy the grammar rules logits, valid_table_mask = self._output_layer( logits, actions, gmr_mask, decode_states.valid_table_mask) # initialize the vocab size self._vocab_size = logits.shape[-1] self._vocab_size_tensor = layers.fill_constant(shape=[1], dtype='int64', value=logits.shape[-1]) # compute log probs and mask out the finished beams step_log_probs = layers.log(layers.softmax(logits)) step_log_probs = self._mask_finished_probs(step_log_probs, decode_states.finished) scores = layers.reshape(step_log_probs, [-1, self._beam_size * self._vocab_size]) topk_scores, topk_indices = layers.topk(input=scores, k=self._beam_size) topk_scores = layers.reshape(topk_scores, shape=[-1]) topk_indices = layers.reshape(topk_indices, shape=[-1]) # beam index each top-k entry comes from beam_indices = layers.elementwise_floordiv(topk_indices, self._vocab_size_tensor) # token id of each top-k entry token_indices = layers.elementwise_mod(topk_indices, self._vocab_size_tensor) # re-gather step_log_probs according to where each top-k entry came from next_log_probs = nn_utils.batch_gather( layers.reshape(step_log_probs, [-1, self._beam_size * self._vocab_size]), topk_indices) def _beam_gather(x, beam_indices): """reshape x to beam dim, and gather each beam_indices Args: x (TYPE): NULL Returns: Variable """ x = self.split_batch_beams(x) return nn_utils.batch_gather(x, beam_indices) next_cell_states = layers.utils.map_structure( lambda x: _beam_gather(x, beam_indices), next_cell_states) next_finished = _beam_gather(decode_states.finished, beam_indices) next_lens = _beam_gather(decode_states.lengths, beam_indices) next_lens = layers.elementwise_add( next_lens, layers.cast(layers.logical_not(next_finished), next_lens.dtype)) next_finished = layers.logical_or( next_finished, layers.equal(token_indices, self._end_token_tensor)) decode_output = OutputWrapper(topk_scores, token_indices, beam_indices) decode_states = StateWrapper(next_cell_states, next_log_probs, next_finished, next_lens, valid_table_mask) return decode_output, decode_states
def transformer( src_vocab_size, trg_vocab_size, max_length, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, dropout_rate, src_pad_idx, trg_pad_idx, pos_pad_idx, ): file_obj = fluid.layers.open_recordio_file( filename='./wmt16.recordio', shapes=[ [batch_size * max_length, 1], [batch_size * max_length, 1], [batch_size * max_length, 1], [batch_size * max_length, 1], [batch_size, n_head, max_length, max_length], [batch_size, n_head, max_length, max_length], [batch_size, n_head, max_length, max_length], [batch_size * max_length, 1], [batch_size * max_length, 1], ], dtypes=[ 'int64', 'int64', 'int64', 'int64', 'float32', 'float32', 'float32', 'int64', 'float32', ], lod_levels=[0] * 9) src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, gold, weights = fluid.layers.read_file( file_obj) enc_input = prepare_encoder( src_word, src_pos, src_vocab_size, d_model, src_pad_idx, max_length, dropout_rate, ) enc_output = encoder( enc_input, src_slf_attn_bias, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, dropout_rate, ) dec_input = prepare_decoder( trg_word, trg_pos, trg_vocab_size, d_model, trg_pad_idx, max_length, dropout_rate, ) dec_output = decoder( dec_input, enc_output, trg_slf_attn_bias, trg_src_attn_bias, n_layer, n_head, d_key, d_value, d_model, d_inner_hid, dropout_rate, ) # TODO(guosheng): Share the weight matrix between the embedding layers and # the pre-softmax linear transformation. predict = layers.reshape( x=layers.fc(input=dec_output, size=trg_vocab_size, param_attr=fluid.initializer.Xavier(uniform=False), bias_attr=False, num_flatten_dims=2), shape=[-1, trg_vocab_size], act="softmax") cost = layers.cross_entropy(input=predict, label=gold) weighted_cost = cost * weights return layers.reduce_sum(weighted_cost)
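The final cost above is a padding-aware sum: the per-token cross entropy is multiplied by the weights read from the file, which are zero at padding positions, so padded tokens contribute nothing. For example:

import numpy as np

cost = np.array([1.2, 0.7, 2.3, 0.9])              # per-token cross entropy
weights = np.array([1.0, 1.0, 0.0, 0.0])           # zero for padding tokens
assert np.isclose((cost * weights).sum(), 1.9)     # only the real tokens are counted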
def beam_search_step(state, logits, eos_id, beam_width, is_first_step, length_penalty): """logits.shape == [B*W, V]""" beam_size, vocab_size = logits.shape # as batch size=1 in this hub module. the first dim means bsz * beam_size equals beam_size logits_np = logits.numpy() for i in range(beam_size): logits_np[i][17963] = 0 # make [UNK] prob = 0 logits = D.to_variable(logits_np) bsz, beam_width = state.log_probs.shape onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size), 'int64') #[1, V] probs = L.log(L.softmax(logits)) #[B*W, V] probs = mask_prob(probs, onehot_eos, state.finished) #[B*W, V] allprobs = L.reshape(state.log_probs, [-1, 1]) + probs #[B*W, V] not_finished = 1 - L.reshape(state.finished, [-1, 1]) #[B*W,1] not_eos = 1 - onehot_eos length_to_add = not_finished * not_eos #[B*W,V] alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size]) alllen = L.reshape(alllen, [-1, beam_width * vocab_size]) allscore = hyp_score(allprobs, alllen, length_penalty) if is_first_step: allscore = L.reshape( allscore, [bsz, beam_width, -1])[:, 0, :] # first step only consiter beam 0 scores, idx = L.topk(allscore, k=beam_width) #[B, W] next_beam_id = idx // vocab_size #[B, W] next_word_id = idx % vocab_size gather_idx = L.concat([L.where(idx != -1)[:, :1], L.reshape(idx, [-1, 1])], 1) next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape) next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape) gather_idx = L.concat( [L.where(next_beam_id != -1)[:, :1], L.reshape(next_beam_id, [-1, 1])], 1) next_finished = L.reshape( L.gather_nd(state.finished, gather_idx), state.finished.shape ) #[gather new beam state according to new beam id] next_finished += L.cast(next_word_id == eos_id, 'int64') next_finished = L.cast(next_finished > 0, 'int64') next_state = BeamSearchState(log_probs=next_probs, lengths=next_len, finished=next_finished) output = BeamSearchOutput(scores=scores, predicted_ids=next_word_id, beam_parent_ids=next_beam_id) return output, next_state
def build(self): args = self.args emb_size = args.embed_size proj_size = args.embed_size hidden_size = args.hidden_size batch_size = args.batch_size num_layers = args.num_layers num_steps = args.num_steps lstm_outputs = [] x_f = layers.data(name="x", shape=[1], dtype='int64', lod_level=1) y_f = layers.data(name="y", shape=[1], dtype='int64', lod_level=1) x_b = layers.data(name="x_r", shape=[1], dtype='int64', lod_level=1) y_b = layers.data(name="y_r", shape=[1], dtype='int64', lod_level=1) init_hiddens_ = layers.data( name="init_hiddens", shape=[1], dtype='float32') init_cells_ = layers.data( name="init_cells", shape=[1], dtype='float32') if args.debug: layers.Print(init_cells_, message='init_cells_', summarize=10) layers.Print(init_hiddens_, message='init_hiddens_', summarize=10) init_hiddens = layers.reshape( init_hiddens_, shape=[2 * num_layers, -1, proj_size]) init_cells = layers.reshape( init_cells_, shape=[2 * num_layers, -1, hidden_size]) init_hidden = layers.slice( init_hiddens, axes=[0], starts=[0], ends=[num_layers]) init_cell = layers.slice( init_cells, axes=[0], starts=[0], ends=[num_layers]) init_hidden_r = layers.slice( init_hiddens, axes=[0], starts=[num_layers], ends=[2 * num_layers]) init_cell_r = layers.slice( init_cells, axes=[0], starts=[num_layers], ends=[2 * num_layers]) if args.use_custom_samples: custom_samples = layers.data( name="custom_samples", shape=[args.n_negative_samples_batch + 1], dtype='int64', lod_level=1) custom_samples_r = layers.data( name="custom_samples_r", shape=[args.n_negative_samples_batch + 1], dtype='int64', lod_level=1) custom_probabilities = layers.data( name="custom_probabilities", shape=[args.n_negative_samples_batch + 1], dtype='float32', lod_level=1) else: custom_samples = None custom_samples_r = None custom_probabilities = None forward, fw_hiddens, fw_hiddens_ori, fw_cells, fw_projs = encoder( x_f, y_f, self.vocab_size, emb_size, init_hidden, init_cell, para_name='fw_', custom_samples=custom_samples, custom_probabilities=custom_probabilities, test_mode=self.test_mode, args=args) backward, bw_hiddens, bw_hiddens_ori, bw_cells, bw_projs = encoder( x_b, y_b, self.vocab_size, emb_size, init_hidden_r, init_cell_r, para_name='bw_', custom_samples=custom_samples_r, custom_probabilities=custom_probabilities, test_mode=self.test_mode, args=args) losses = layers.concat([forward[-1], backward[-1]]) self.loss = layers.reduce_mean(losses) self.loss.permissions = True self.loss.persistable = True if args.debug: x_emb, projection, loss = forward layers.Print(init_cells, message='init_cells', summarize=10) layers.Print(init_hiddens, message='init_hiddens', summarize=10) layers.Print(init_cell, message='init_cell', summarize=10) layers.Print(y_b, message='y_b', summarize=10) layers.Print(x_emb, message='x_emb', summarize=10) layers.Print(projection, message='projection', summarize=10) layers.Print(losses, message='losses', summarize=320) layers.Print(self.loss, message='loss', summarize=320) self.grad_vars = [x_f, y_f, x_b, y_b, self.loss] self.grad_vars_name = ['x', 'y', 'x_r', 'y_r', 'final_loss'] fw_vars_name = ['x_emb', 'proj', 'loss'] + [ 'init_hidden', 'init_cell' ] + ['rnn_out', 'rnn_out2', 'cell', 'cell2', 'xproj', 'xproj2'] bw_vars_name = ['x_emb_r', 'proj_r', 'loss_r'] + [ 'init_hidden_r', 'init_cell_r' ] + [ 'rnn_out_r', 'rnn_out2_r', 'cell_r', 'cell2_r', 'xproj_r', 'xproj2_r' ] fw_vars = forward + [init_hidden, init_cell ] + fw_hiddens + fw_cells + fw_projs bw_vars = backward + [init_hidden_r, init_cell_r ] + bw_hiddens + bw_cells + bw_projs 
for i in range(len(fw_vars_name)): self.grad_vars.append(fw_vars[i]) self.grad_vars.append(bw_vars[i]) self.grad_vars_name.append(fw_vars_name[i]) self.grad_vars_name.append(bw_vars_name[i]) if args.use_custom_samples: self.feed_order = [ 'x', 'y', 'x_r', 'y_r', 'custom_samples', 'custom_samples_r', 'custom_probabilities' ] else: self.feed_order = ['x', 'y', 'x_r', 'y_r'] self.last_hidden = [ fluid.layers.sequence_last_step(input=x) for x in fw_hiddens_ori + bw_hiddens_ori ] self.last_cell = [ fluid.layers.sequence_last_step(input=x) for x in fw_cells + bw_cells ] self.last_hidden = layers.concat(self.last_hidden, axis=0) self.last_hidden.persistable = True self.last_cell = layers.concat(self.last_cell, axis=0) self.last_cell.persistable = True if args.debug: layers.Print(self.last_cell, message='last_cell', summarize=10) layers.Print(self.last_hidden, message='last_hidden', summarize=10)
def beam_search(self, src_word, src_pos, src_slf_attn_bias, trg_word,
                trg_src_attn_bias, bos_id=0, eos_id=1, beam_size=4,
                max_len=256):
    def expand_to_beam_size(tensor, beam_size):
        tensor = layers.reshape(tensor,
                                [tensor.shape[0], 1] + list(tensor.shape[1:]))
        tile_dims = [1] * len(tensor.shape)
        tile_dims[1] = beam_size
        return layers.expand(tensor, tile_dims)

    def merge_batch_beams(tensor):
        var_dim_in_state = 2  # count in beam dim
        tensor = layers.transpose(
            tensor,
            list(range(var_dim_in_state, len(tensor.shape))) +
            list(range(0, var_dim_in_state)))
        tensor = layers.reshape(
            tensor, [0] * (len(tensor.shape) - var_dim_in_state) +
            [batch_size * beam_size])
        res = layers.transpose(
            tensor,
            list(
                range((len(tensor.shape) + 1 - var_dim_in_state),
                      len(tensor.shape))) +
            list(range(0, (len(tensor.shape) + 1 - var_dim_in_state))))
        return res

    def split_batch_beams(tensor):
        var_dim_in_state = 1
        tensor = layers.transpose(
            tensor,
            list(range(var_dim_in_state, len(tensor.shape))) +
            list(range(0, var_dim_in_state)))
        tensor = layers.reshape(
            tensor, [0] * (len(tensor.shape) - var_dim_in_state) +
            [batch_size, beam_size])
        res = layers.transpose(
            tensor,
            list(
                range((len(tensor.shape) - 1 - var_dim_in_state),
                      len(tensor.shape))) +
            list(range(0, (len(tensor.shape) - 1 - var_dim_in_state))))
        return res

    def mask_probs(probs, finished, noend_mask_tensor):
        finished = layers.cast(finished, dtype=probs.dtype)
        probs = layers.elementwise_mul(
            layers.expand(layers.unsqueeze(finished, [2]),
                          [1, 1, self.trg_vocab_size]),
            noend_mask_tensor,
            axis=-1) - layers.elementwise_mul(probs, (finished - 1), axis=0)
        return probs

    def gather(input, indices, batch_pos):
        topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
        return layers.gather_nd(input, topk_coordinates)

    # run encoder
    enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)
    batch_size = enc_output.shape[0]

    # constant number
    inf = float(1. * 1e7)
    max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
    vocab_size_tensor = layers.fill_constant(
        shape=[1], dtype="int64", value=self.trg_vocab_size)
    end_token_tensor = to_variable(
        np.full([batch_size, beam_size], eos_id, dtype="int64"))

    noend_array = [-inf] * self.trg_vocab_size
    noend_array[eos_id] = 0
    noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32"))
    batch_pos = layers.expand(
        layers.unsqueeze(
            to_variable(np.arange(0, batch_size, 1, dtype="int64")), [1]),
        [1, beam_size])

    predict_ids = []
    parent_ids = []
    ### initialize states of beam search ###
    log_probs = to_variable(
        np.array([[0.] + [-inf] * (beam_size - 1)] * batch_size,
                 dtype="float32"))
    finished = to_variable(
        np.full([batch_size, beam_size], 0, dtype="bool"))
    trg_word = layers.fill_constant(
        shape=[batch_size * beam_size, 1], dtype="int64", value=bos_id)
    trg_src_attn_bias = merge_batch_beams(
        expand_to_beam_size(trg_src_attn_bias, beam_size))
    enc_output = merge_batch_beams(
        expand_to_beam_size(enc_output, beam_size))
    # init states (caches) for transformer, need to be updated according to
    # selected beam
    caches = [{
        "k": layers.fill_constant(
            shape=[batch_size, beam_size, self.n_head, 0, self.d_key],
            dtype=enc_output.dtype,
            value=0),
        "v": layers.fill_constant(
            shape=[batch_size, beam_size, self.n_head, 0, self.d_value],
            dtype=enc_output.dtype,
            value=0),
    } for i in range(self.n_layer)]

    for i in range(max_len):
        trg_pos = layers.fill_constant(
            shape=trg_word.shape, dtype="int64", value=i)
        caches = map_structure(merge_batch_beams, caches)
        # TODO: modified for dygraph2static
        logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
                              enc_output, caches)
        caches = map_structure(split_batch_beams, caches)
        step_log_probs = split_batch_beams(
            fluid.layers.log(fluid.layers.softmax(logits)))
        step_log_probs = mask_probs(step_log_probs, finished,
                                    noend_mask_tensor)
        log_probs = layers.elementwise_add(
            x=step_log_probs, y=log_probs, axis=0)
        log_probs = layers.reshape(log_probs,
                                   [-1, beam_size * self.trg_vocab_size])
        scores = log_probs
        topk_scores, topk_indices = fluid.layers.topk(
            input=scores, k=beam_size)
        beam_indices = fluid.layers.elementwise_floordiv(
            topk_indices, vocab_size_tensor)
        token_indices = fluid.layers.elementwise_mod(
            topk_indices, vocab_size_tensor)

        # update states
        caches = map_structure(
            lambda x: gather(x, beam_indices, batch_pos), caches)
        log_probs = gather(log_probs, topk_indices, batch_pos)
        finished = gather(finished, beam_indices, batch_pos)
        finished = layers.logical_or(
            finished, layers.equal(token_indices, end_token_tensor))
        trg_word = layers.reshape(token_indices, [-1, 1])

        predict_ids.append(token_indices)
        parent_ids.append(beam_indices)

        if layers.reduce_all(finished).numpy():
            break

    predict_ids = layers.stack(predict_ids, axis=0)
    parent_ids = layers.stack(parent_ids, axis=0)
    finished_seq = layers.transpose(
        layers.gather_tree(predict_ids, parent_ids), [1, 2, 0])
    finished_scores = topk_scores

    return finished_seq, finished_scores
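# Toy worked example (numbers are illustrative, not from the source) of the
# index arithmetic inside the decoding loop above: top-k is taken over the
# flattened [beam_size * trg_vocab_size] scores, and each flat index is then
# decomposed with floordiv/mod into the parent beam and the chosen token.
import numpy as np

beam_size, vocab_size = 2, 5
flat_scores = np.array([[-0.1, -9.0, -0.3, -9.0, -9.0,    # beam 0
                         -9.0, -0.2, -9.0, -0.4, -9.0]])  # beam 1
topk_indices = np.argsort(-flat_scores, axis=-1)[:, :beam_size]  # [[0, 6]]
beam_indices = topk_indices // vocab_size    # [[0, 1]] -> beams to extend
token_indices = topk_indices % vocab_size    # [[0, 1]] -> tokens appended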
def grammar_output(inputs, actions, gmr_mask, last_col2tbl_mask, decode_vocab,
                   grammar, name=None, column2table=None):
    """output logits according to grammar

    Args:
        inputs (Variable): shape = [batch_size, max_len, hidden_size].
            max_len is always 1 at infer time.
        actions (Variable): shape = [batch_size, max_len].
            max_len is always 1 at infer time.
        gmr_mask (Variable): shape = [batch_size, max_len, grammar_size].
            max_len is always 1 at infer time.
        last_col2tbl_mask (Variable): shape = [batch_size, max_len, max_table].
            During decoding, when the previous step selected a column, this is
            the table mask corresponding to that column.
        decode_vocab (DecoderDynamicVocab): (table, table_len, column,
            column_len, value, value_len, column2table_mask). Here
            column2table_mask is the table mask aligned one-to-one with column.
        grammar (Grammar): NULL
        name (str): name prefix for the Variables, used to share parameters
            across calls. Defaults to None, meaning parameters are not shared.

    Returns: (Variable, Variable)
        output: output probabilities over the vocabulary
        valid_table_mask: only valid at inference time

    Raises: NULL
    """
    batch_size = layers.shape(inputs)[0]
    max_len = inputs.shape[1]
    vocab_size = grammar.vocab_size

    action_shape = [batch_size, max_len]
    act_apply_rule = tensor.fill_constant(
        shape=action_shape, value=grammar.ACTION_APPLY, dtype='int64')
    act_stop = tensor.fill_constant(
        shape=action_shape, value=grammar.ACTION_STOP, dtype='int64')
    act_select_t = tensor.fill_constant(
        shape=action_shape, value=grammar.ACTION_SELECT_T, dtype='int64')
    act_select_c = tensor.fill_constant(
        shape=action_shape, value=grammar.ACTION_SELECT_C, dtype='int64')
    act_select_v = tensor.fill_constant(
        shape=action_shape, value=grammar.ACTION_SELECT_V, dtype='int64')
    cond_apply_rule = layers.logical_or(
        layers.equal(actions, act_apply_rule),
        layers.equal(actions, act_stop))
    cond_select_t = layers.equal(actions, act_select_t)
    cond_select_c = layers.equal(actions, act_select_c)
    cond_select_v = layers.equal(actions, act_select_v)

    # expand vocab to [-1, max_len, ...]
    if max_len == 1:
        expand_to_seq_len = lambda x: layers.unsqueeze(x, [1])
    else:
        expand_to_seq_len = lambda x: layers.expand(
            layers.unsqueeze(x, [1]), [1, max_len] + [1] * (len(x.shape) - 1))
    table_enc = expand_to_seq_len(decode_vocab.table)
    table_len = expand_to_seq_len(decode_vocab.table_len)
    column_enc = expand_to_seq_len(decode_vocab.column)
    column_len = expand_to_seq_len(decode_vocab.column_len)
    value_enc = expand_to_seq_len(decode_vocab.value)
    value_len = expand_to_seq_len(decode_vocab.value_len)
    column2table_mask = expand_to_seq_len(decode_vocab.column2table_mask)

    # merge batch & seq_len dim
    inputs = nn_utils.merge_first_ndim(inputs, n=2)
    actions = nn_utils.merge_first_ndim(actions, n=2)
    gmr_mask = nn_utils.merge_first_ndim(gmr_mask, n=2)
    last_col2tbl_mask = nn_utils.merge_first_ndim(last_col2tbl_mask, n=2)
    table_enc = nn_utils.merge_first_ndim(table_enc, n=2)
    table_len = nn_utils.merge_first_ndim(table_len, n=2)
    column_enc = nn_utils.merge_first_ndim(column_enc, n=2)
    column_len = nn_utils.merge_first_ndim(column_len, n=2)
    value_enc = nn_utils.merge_first_ndim(value_enc, n=2)
    value_len = nn_utils.merge_first_ndim(value_len, n=2)
    column2table_mask = nn_utils.merge_first_ndim(column2table_mask, n=2)
    cond_apply_rule = nn_utils.merge_first_ndim(cond_apply_rule, n=2)
    cond_select_t = nn_utils.merge_first_ndim(cond_select_t, n=2)
    cond_select_c = nn_utils.merge_first_ndim(cond_select_c, n=2)
    cond_select_v = nn_utils.merge_first_ndim(cond_select_v, n=2)

    t_ptr_net = models.PointerNetwork(
        score_type="affine", name='gmr_output_t_ptr')
    c_ptr_net = models.PointerNetwork(
        score_type="affine", name='gmr_output_c_ptr')
    v_ptr_net = models.PointerNetwork(
        score_type="affine", name='gmr_output_v_ptr')

    ## core processing logic
    apply_rule_output = _apply_rule(
        cond_apply_rule, inputs, gmr_mask, grammar, name=name)
    select_t_output = _select_table(
        cond_select_t, inputs, table_enc, table_len, last_col2tbl_mask,
        t_ptr_net, grammar)
    select_c_output, valid_table_mask = _select_column(
        cond_select_c, inputs, column_enc, column_len, c_ptr_net, grammar,
        column2table_mask)
    select_v_output = _select_value(
        cond_select_v, inputs, value_enc, value_len, v_ptr_net, grammar)

    # The four branch outputs are masked by mutually exclusive action
    # conditions, so summing them keeps exactly the active branch per position.
    output = layers.sum(
        [apply_rule_output, select_t_output, select_c_output, select_v_output])
    output = layers.reshape(output, shape=[batch_size, max_len, vocab_size])

    return output, valid_table_mask
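# Toy numpy illustration (made-up numbers, not from the source) of why the four
# branch outputs in grammar_output can simply be summed: each branch is
# pre-masked by its mutually exclusive action condition, so at most one branch
# contributes non-zero logits at any position.
import numpy as np

vocab_size = 6
apply_rule_output = np.array([[0.7, 0.3, 0.0, 0.0, 0.0, 0.0]])  # active branch
select_t_output = np.zeros((1, vocab_size))                     # masked out
select_c_output = np.zeros((1, vocab_size))                     # masked out
select_v_output = np.zeros((1, vocab_size))                     # masked out
output = (apply_rule_output + select_t_output +
          select_c_output + select_v_output)   # identical to the active branch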
def expand_to_beam_size(tensor, beam_size):
    tensor = layers.reshape(tensor,
                            [tensor.shape[0], 1] + list(tensor.shape[1:]))
    tile_dims = [1] * len(tensor.shape)
    tile_dims[1] = beam_size
    return layers.expand(tensor, tile_dims)
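# Hedged usage sketch for expand_to_beam_size (shapes below are made up): a
# [batch, hidden] encoder state gains a beam axis after the batch dimension and
# is tiled so that every beam hypothesis starts from the same state.
import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    enc_state = fluid.dygraph.to_variable(
        np.ones([2, 3], dtype="float32"))            # [batch=2, hidden=3]
    tiled = expand_to_beam_size(enc_state, beam_size=4)
    # tiled.shape == [2, 4, 3]; rows are replicated along the new beam axis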
def multi_head_attention(queries, keys, values, attn_bias, d_key, d_value,
                         d_model, input_mask, n_head=1, dropout_rate=0.,
                         cache=None, param_initializer=None,
                         name='multi_head_att'):
    """
    Multi-Head Attention. Note that attn_bias is added to the logit before
    computing softmax activation to mask certain selected positions so that
    they will not be considered in attention weights.
    """
    keys = queries if keys is None else keys
    values = keys if values is None else values

    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Add linear projection to queries, keys, and values.
        """
        q = layers.fc(input=queries,
                      size=d_key * n_head,
                      num_flatten_dims=len(queries.shape) - 1,
                      param_attr=fluid.ParamAttr(
                          name=name + '_query_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_query_fc.b_0')
        k = layers.fc(input=keys,
                      size=d_key * n_head,
                      num_flatten_dims=len(keys.shape) - 1,
                      param_attr=fluid.ParamAttr(
                          name=name + '_key_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_key_fc.b_0')
        v = layers.fc(input=values,
                      size=d_value * n_head,
                      num_flatten_dims=len(values.shape) - 1,
                      param_attr=fluid.ParamAttr(
                          name=name + '_value_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_value_fc.b_0')
        return q, k, v

    def __split_heads(x, n_head):
        """
        Reshape the last dimension of input tensor x so that it becomes two
        dimensions and then transpose. Specifically, input a tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        hidden_size = x.shape[-1]
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        reshaped = layers.reshape(
            x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
        # permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of input tensor x
        so that it becomes one dimension, which is reverse to __split_heads.
        """
        if len(x.shape) == 3:
            return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")
        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        # trans_x.desc.set_shape((-1, 1, n_head, d_value))
        return layers.reshape(x=trans_x, shape=[0, 0, d_model], inplace=True)

    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
    q = to_3d(q)
    k = to_3d(k)
    v = to_3d(v)

    if cache is not None:  # use cache and concat time steps
        # Since the inplace reshape in __split_heads changes the shape of k and
        # v, which is the cache input for next time step, reshape the cache
        # input from the previous time step first.
        k = cache["k"] = layers.concat(
            [layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1)
        v = cache["v"] = layers.concat(
            [layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1)

    out, _ = sparse_scaled_dot_product_attention(q, k, v, input_mask,
                                                 dropout_rate, n_head, d_key,
                                                 d_value)

    out = to_2d(out)

    # Project back to the model size.
    proj_out = layers.fc(input=out,
                         size=d_model,
                         num_flatten_dims=len(out.shape) - 1,
                         param_attr=fluid.ParamAttr(
                             name=name + '_output_fc.w_0',
                             initializer=param_initializer),
                         bias_attr=name + '_output_fc.b_0')
    return proj_out, _
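# Minimal sketch (values are illustrative, assuming d_model == n_head * d_key)
# of the shape bookkeeping that __split_heads / __combine_heads perform above:
# [bs, seq, d_model] -> [bs, n_head, seq, d_key] and back again.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import layers

bs, seq, n_head, d_key = 2, 5, 4, 8
with fluid.dygraph.guard():
    x = fluid.dygraph.to_variable(
        np.random.rand(bs, seq, n_head * d_key).astype("float32"))
    split = layers.transpose(
        layers.reshape(x, shape=[bs, seq, n_head, d_key]), perm=[0, 2, 1, 3])
    merged = layers.reshape(
        layers.transpose(split, perm=[0, 2, 1, 3]),
        shape=[bs, seq, n_head * d_key])
    # split.shape == [2, 4, 5, 8]; merged.shape == [2, 5, 32] (same as input)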