Example #1
    def _mask_finished_probs(self, probs, finished):
        """mask finished beams. it makes
            1. all finished beams probs to be -inf, except end_token which is 0
            2. unfinished beams to remain unchanged

        Args:
            probs (Variable): with shape [batch_size, vocab_size]
            finished (Variable): with shape [batch_size]

        Returns: Variable

        Raises: NULL

        """
        # initialize the no-end mask
        noend_array = [-INF] * self._vocab_size
        noend_array[self._end_token] = 0
        self._noend_mask_tensor = layers.assign(np.array(noend_array, "float32"))

        finished = layers.cast(finished, dtype=probs.dtype)
        # finished --> 0; not finished --> -1
        not_finished = fluider.increment(finished, value=-1)
        # shape = [batch_size, vocab_size]
        finished_expanded = layers.expand(layers.unsqueeze(finished, [1]), [1, self._vocab_size])
        probs = layers.elementwise_mul(finished_expanded, self._noend_mask_tensor, axis=-1) - \
                layers.elementwise_mul(probs, not_finished, axis=0)
        return probs
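For reference, a minimal NumPy sketch of the select-by-mask identity used above: rows whose finished flag is 1 collapse to the no-end mask, while unfinished rows keep their original probs (names, shapes, and values below are illustrative assumptions, not part of the original code).

import numpy as np

INF = 1e9
vocab_size, end_token = 4, 3
noend_mask = np.full(vocab_size, -INF, dtype="float32")
noend_mask[end_token] = 0.0

probs = np.log(np.full((2, vocab_size), 0.25, dtype="float32"))
finished = np.array([1.0, 0.0], dtype="float32")   # row 0 finished, row 1 not

masked = finished[:, None] * noend_mask - probs * (finished[:, None] - 1.0)
print(masked[0])   # [-1e9, -1e9, -1e9, 0.]  -> only end_token survives
print(masked[1])   # identical to the original log-probs
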
def layer_norm(x,
               begin_norm_axis=1,
               epsilon=1e-12,
               param_attr=None,
               bias_attr=None):
    """
    Replace the built-in layer_norm op with this function
    """
    helper = LayerHelper('layer_norm', **locals())
    mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
    shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
    variance = layers.reduce_mean(
        layers.square(shift_x), dim=begin_norm_axis, keep_dim=True)
    r_stdev = layers.rsqrt(variance + epsilon)
    norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)

    param_shape = [reduce(lambda x, y: x * y, norm_x.shape[begin_norm_axis:])]
    param_dtype = norm_x.dtype
    scale = helper.create_parameter(
        attr=param_attr,
        shape=param_shape,
        dtype=param_dtype,
        default_initializer=fluid.initializer.Constant(1.))
    bias = helper.create_parameter(
        attr=bias_attr,
        shape=param_shape,
        dtype=param_dtype,
        is_bias=True,
        default_initializer=fluid.initializer.Constant(0.))

    out = layers.elementwise_mul(x=norm_x, y=scale, axis=-1)
    out = layers.elementwise_add(x=out, y=bias, axis=-1)

    return out
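As a sanity check, here is a rough NumPy equivalent of the normalization arithmetic above for a 2D input with begin_norm_axis=1 (parameter creation and other Paddle specifics are omitted; this is only an illustrative sketch).

import numpy as np

def layer_norm_np(x, epsilon=1e-12):
    # normalize each row of a [batch, hidden] matrix, then apply scale and bias
    mean = x.mean(axis=1, keepdims=True)
    variance = ((x - mean) ** 2).mean(axis=1, keepdims=True)
    norm_x = (x - mean) / np.sqrt(variance + epsilon)
    scale = np.ones(x.shape[1], dtype=x.dtype)    # initialized to 1, as above
    bias = np.zeros(x.shape[1], dtype=x.dtype)    # initialized to 0, as above
    return norm_x * scale + bias

x = np.random.rand(2, 8).astype("float32")
out = layer_norm_np(x)
print(out.mean(axis=1), out.std(axis=1))   # per-row mean ~0, std ~1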
Example #3
def attn_flow(q_enc, p_enc, p_ids_name, args):
    """Bidirectional Attention layer"""
    tag = p_ids_name + "__"
    drnn = layers.DynamicRNN()
    with drnn.block():
        h_cur = drnn.step_input(p_enc)
        u_all = drnn.static_input(q_enc)
        h_expd = layers.sequence_expand(x=h_cur, y=u_all)
        s_t_mul = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
        s_t_sum = layers.reduce_sum(input=s_t_mul, dim=1, keep_dim=True)
        s_t_re = layers.reshape(s_t_sum, shape=[-1, 0])
        s_t = layers.sequence_softmax(input=s_t_re)
        u_expr = layers.elementwise_mul(x=u_all, y=s_t, axis=0)
        u_expr = layers.sequence_pool(input=u_expr, pool_type='sum')

        b_t = layers.sequence_pool(input=s_t_sum, pool_type='max')
        drnn.output(u_expr, b_t)
    U_expr, b = drnn()
    b_norm = layers.sequence_softmax(input=b)
    h_expr = layers.elementwise_mul(x=p_enc, y=b_norm, axis=0)
    h_expr = layers.sequence_pool(input=h_expr, pool_type='sum')

    H_expr = layers.sequence_expand(x=h_expr, y=p_enc)
    H_expr = layers.lod_reset(x=H_expr, y=p_enc)
    h_u = layers.elementwise_mul(x=p_enc, y=U_expr, axis=0)
    h_h = layers.elementwise_mul(x=p_enc, y=H_expr, axis=0)

    g = layers.concat(input=[p_enc, U_expr, h_u, h_h], axis=1)
    return dropout(g, args)
Example #4
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size, para_name, args):
    """Util function for pointer network"""

    def linear(inputs, para_name, args):
        return layers.fc(input=inputs,
                         size=size,
                         param_attr=fluid.ParamAttr(name=para_name + '_w'),
                         bias_attr=fluid.ParamAttr(name=para_name + '_b'))

    input_cat = layers.concat([hidden_t_prev, x_t], axis=1)
    forget_gate = layers.sigmoid(x=linear(input_cat, para_name + '_lstm_f',
                                          args))
    input_gate = layers.sigmoid(x=linear(input_cat, para_name + '_lstm_i',
                                         args))
    output_gate = layers.sigmoid(x=linear(input_cat, para_name + '_lstm_o',
                                          args))
    cell_tilde = layers.tanh(x=linear(input_cat, para_name + '_lstm_c', args))

    cell_t = layers.sums(input=[
        layers.elementwise_mul(x=forget_gate, y=cell_t_prev),
        layers.elementwise_mul(x=input_gate, y=cell_tilde)
    ])

    hidden_t = layers.elementwise_mul(x=output_gate, y=layers.tanh(x=cell_t))

    return hidden_t, cell_t
    def _create_mask(self, input_mask, append_head=False, auto_regressive=False):
        """
        Create attention mask.

        @param : input_mask
        @type : Variable(shape: [batch_size, max_seq_len])

        @param : auto_regressive
        @type : bool
        """
        input_mask = fluid.layers.unsqueeze(input=input_mask, axes=[2])
        seq_len = input_mask.shape[1]

        input_mask = layers.cast(input_mask, self._dtype)
        mask1 = layers.expand(input_mask, [1, 1, seq_len])
        mask2 = layers.transpose(mask1, [0, 2, 1])
        mask = layers.elementwise_mul(mask1, mask2)

        if append_head:
            mask = layers.concat([mask[:, :1, :], mask], axis=1)
            mask = layers.concat([mask[:, :, :1], mask], axis=2)
            seq_len += 1

        if auto_regressive:
            seq_mask = self.sequence_mask[:seq_len, :seq_len]
            mask = layers.elementwise_mul(mask, seq_mask)

        mask = 1 - mask
        return mask
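A tiny NumPy illustration of how the pairwise validity mask and the auto-regressive constraint combine (seq_len=4, batch_size=1; the lower-triangular matrix stands in for self.sequence_mask, which is an assumption here).

import numpy as np

input_mask = np.array([[1, 1, 1, 0]], dtype="float32")   # last position is padding
mask1 = np.repeat(input_mask[:, :, None], 4, axis=2)     # expand over the key dim
mask2 = np.transpose(mask1, (0, 2, 1))
mask = mask1 * mask2                                     # query and key must both be valid

seq_mask = np.tril(np.ones((4, 4), dtype="float32"))     # auto-regressive constraint
mask = mask * seq_mask
print(1 - mask)   # 1 marks positions that must not be attended to
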
 def mask_probs(probs, finished, noend_mask_tensor):
     finished = layers.cast(finished, dtype=probs.dtype)
     probs = layers.elementwise_mul(
         layers.expand(layers.unsqueeze(finished, [2]),
                       [1, 1, self.trg_vocab_size]),
         noend_mask_tensor, axis=-1) - layers.elementwise_mul(
             probs, (finished - 1), axis=0)
     return probs
Example #7
    def get_single_direction_output(rnn_input,
                                    encode_hidden,
                                    unit_list,
                                    mask=None,
                                    direc_index=0):
        rnn = StaticRNN()
        #print(rnn_input.shape)
        with rnn.step():
            step_input = rnn.step_input(rnn_input)

            if mask:
                step_mask = rnn.step_input(mask)

            for i in range(num_layers):
                if init_hidden:
                    pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                else:
                    pre_hidden = rnn.memory(batch_ref=rnn_input,
                                            shape=[-1, hidden_size],
                                            ref_batch_dim_idx=1)
                encode_h = encode_hidden[i]
                pre_encode_hidden = layers.concat([pre_hidden, encode_h], axis=1)
                new_hidden = unit_list[i](step_input, pre_encode_hidden)

                if mask:
                    new_hidden = layers.elementwise_mul(
                        new_hidden, step_mask, axis=0) - layers.elementwise_mul(
                        pre_hidden, (step_mask - 1), axis=0)
                rnn.update_memory(pre_hidden, new_hidden)

                rnn.step_output(new_hidden)

                step_input = new_hidden
                if dropout_prob is not None and dropout_prob > 0.0:
                    step_input = layers.dropout(step_input, dropout_prob=dropout_prob, )

            rnn.step_output(step_input)

        rnn_out = rnn()

        last_hidden_array = []
        all_hidden_array = []  # added to collect all hidden states
        rnn_output = rnn_out[-1]

        for i in range(num_layers):
            last_hidden = rnn_out[i]
            all_hidden_array.append(last_hidden)
            last_hidden = last_hidden[-1]
            last_hidden_array.append(last_hidden)

        all_hidden_array = layers.concat(all_hidden_array, axis=0)
        all_hidden_array = layers.reshape(all_hidden_array, shape=[num_layers, input.shape[0], -1, hidden_size])
        last_hidden_output = layers.concat(last_hidden_array, axis=0)
        last_hidden_output = layers.reshape(last_hidden_output, shape=[num_layers, -1, hidden_size])

        return rnn_output, last_hidden_output, all_hidden_array
    def _birnn_encoder(self, inputs, input_len, name_lens, name_pos,
                       name_tok_len):
        """forward

        Args:
            inputs (Variable): shape=[batch_size, max_seq_len, hidden_size]
            input_len (Variable): shape=[batch_size]
            name_lens (Variable): shape=[batch_size]
            name_pos (Variable): shape=[batch_size, max_name_len, max_tokens]
            name_tok_len (Variable): shape=[batch_size, max_name_len]

        Returns: TODO

        Raises: NULL

        """
        rnn_output, rnn_final_state = self._rnn_encoder.forward(
            inputs, input_len)

        max_name_len = name_pos.shape[1]
        name_begin = name_pos[:, :, 0]

        name_repr_mask = layers.sequence_mask(name_lens,
                                              max_name_len,
                                              dtype=name_tok_len.dtype)
        len_delta = layers.elementwise_mul(name_tok_len - 1,
                                           name_repr_mask,
                                           axis=0)
        name_end = name_begin + len_delta

        if self._bidirectional:
            name_fwd_repr_gathered = nn_utils.batch_gather_2d(
                rnn_output, name_end)[:, :, :self._hidden_size]
            name_bwd_repr_gathered = nn_utils.batch_gather_2d(
                rnn_output, name_begin)[:, :, self._hidden_size:]
            name_repr_gathered = layers.concat(
                input=[name_fwd_repr_gathered, name_bwd_repr_gathered],
                axis=-1)
            new_hidden_size = self._hidden_size * 2
        else:
            name_repr_gathered = layers.gather_nd(rnn_output, name_end)
            new_hidden_size = self._hidden_size

        name_repr_tmp = layers.reshape(
            name_repr_gathered, shape=[-1, max_name_len, new_hidden_size])
        name_repr_mask = layers.cast(name_repr_mask, dtype=name_repr_tmp.dtype)
        name_repr = layers.elementwise_mul(name_repr_tmp,
                                           name_repr_mask,
                                           axis=0)

        return name_repr, None
Example #9
def sag_pool(gw, feature, ratio, graph_id, dataset, name, activation=L.tanh):
    """Implementation of self-attention graph pooling (SAGPool)

    This is an implementation of the paper SELF-ATTENTION GRAPH POOLING
    (https://arxiv.org/pdf/1904.08082.pdf)

    Args:
        gw: Graph wrapper object.

        feature: A tensor with shape (num_nodes, feature_size).

        ratio: The pooling ratio of nodes we want to select.

        graph_id: The graphs that the nodes belong to. 

        dataset: To differentiate FRANKENSTEIN dataset and other datasets.

        name: The name of SAGPool layer.
        
        activation: The activation function.

    Return:
        new_feature: A tensor with shape (num_nodes, feature_size); features of
                     unselected nodes are masked to zero.

        ratio_length: The selected node numbers of each graph.

    """
    if dataset == "FRANKENSTEIN":
        gcn_ = gcn
    else:
        gcn_ = norm_gcn

    score = gcn_(gw=gw,
                 feature=feature,
                 hidden_size=1,
                 activation=None,
                 norm=gw.node_feat["norm"],
                 name=name)
    score = L.squeeze(score, axes=[])
    perm, ratio_length = topk_pool(gw, score, graph_id, ratio)

    mask = L.zeros_like(score)
    mask = L.cast(mask, dtype="float32")
    updates = L.ones_like(perm)
    updates = L.cast(updates, dtype="float32")
    mask = L.scatter(mask, perm, updates)
    new_feature = L.elementwise_mul(feature, mask, axis=0)
    temp_score = activation(score)
    new_feature = L.elementwise_mul(new_feature, temp_score, axis=0)
    return new_feature, ratio_length
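The scatter-based node masking above can be mimicked in NumPy as follows (node count and selected indices are made-up values for illustration only).

import numpy as np

num_nodes, feat_size = 5, 3
feature = np.arange(num_nodes * feat_size, dtype="float32").reshape(num_nodes, feat_size)
perm = np.array([0, 2, 4])                 # nodes kept by topk_pool
mask = np.zeros(num_nodes, dtype="float32")
mask[perm] = 1.0                           # scatter ones at the selected nodes
print(feature * mask[:, None])             # unselected rows become all-zero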
Example #10
 def forward(self, x):
     """ Forward process of LayerNorm. """
     mean = layers.reduce_mean(x,
                               dim=list(range(self._begin_norm_axis, len(x.shape))),
                               keep_dim=True)
     shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
     variance = layers.reduce_mean(layers.square(shift_x),
                                   dim=list(range(self._begin_norm_axis, len(x.shape))),
                                   keep_dim=True)
     r_stdev = layers.rsqrt(variance + self._epsilon)
     norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)
     out = layers.elementwise_mul(x=norm_x, y=self._scale_w, axis=-1)
     out = layers.elementwise_add(x=out, y=self._bias_w, axis=-1)
     return out
Example #11
    def forward(self, input, pre_hidden, pre_cell):
        concat_input_hidden = layers.concat([input, pre_hidden], 1)
        gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)

        gate_input = layers.elementwise_add(gate_input, self._bias)
        i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
        new_cell = layers.elementwise_add(
            layers.elementwise_mul(
                pre_cell,
                layers.sigmoid(layers.elementwise_add(f, self._forget_bias))),
            layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)))
        new_hidden = layers.tanh(new_cell) * layers.sigmoid(o)

        return new_hidden, new_cell
Example #12
def _select_table(condition,
                  inputs,
                  table_enc,
                  table_len,
                  table_mask_by_col,
                  ptr_net,
                  grammar,
                  name=None):
    """select_table.

    Args:
        condition (TYPE): NULL
        inputs (Variable): shape = [batch_size, max_len, hidden_size]. During inference, max_len is always 1.
        table_enc (TYPE): NULL
        table_len (TYPE): NULL
        ptr_net (TYPE): NULL
        grammar (TYPE): NULL
        name (str):
        table_mask_by_col (Variable):

    Returns: TODO

    Raises: NULL
    """
    condition = layers.cast(condition, dtype='float32')

    table_mask_by_len = layers.sequence_mask(table_len,
                                             maxlen=grammar.MAX_TABLE,
                                             dtype='float32')
    table_mask_by_len = layers.reshape(table_mask_by_len,
                                       [-1, grammar.MAX_TABLE])
    table_mask_by_col = layers.reshape(table_mask_by_col,
                                       [-1, grammar.MAX_TABLE])
    table_mask = layers.elementwise_mul(table_mask_by_len, table_mask_by_col)
    predicts = ptr_net.forward(inputs, table_enc, table_mask)

    zeros_l = tensor.fill_constant_batch_size_like(
        predicts,
        shape=[-1, grammar.grammar_size],
        dtype='float32',
        value=-INF)
    zeros_r = tensor.fill_constant_batch_size_like(
        predicts,
        shape=[-1, grammar.MAX_COLUMN + grammar.MAX_VALUE],
        dtype='float32',
        value=-INF)
    final_output = tensor.concat([zeros_l, predicts, zeros_r], axis=-1)
    true_final_output = layers.elementwise_mul(final_output, condition, axis=0)
    return true_final_output
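A small NumPy illustration of how the table scores are padded into the full output vocabulary with -INF on both sides and then gated by condition (the segment sizes below are arbitrary assumptions).

import numpy as np

INF = 1e9
grammar_size, max_table, max_rest = 3, 2, 4              # arbitrary sizes
predicts = np.random.rand(1, max_table).astype("float32")
zeros_l = np.full((1, grammar_size), -INF, dtype="float32")
zeros_r = np.full((1, max_rest), -INF, dtype="float32")
final_output = np.concatenate([zeros_l, predicts, zeros_r], axis=-1)

condition = np.array([0.0], dtype="float32")             # this step is not a table selection
print(final_output * condition[:, None])                 # the whole row is gated to zero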
Example #13
def _select_column(condition,
                   inputs,
                   column_enc,
                   column_len,
                   ptr_net,
                   grammar,
                   column2table_mask,
                   name=None):
    """select_column.

    Args:
        condition (TYPE): NULL
        inputs (Variable): shape = [batch_size, max_len, hidden_size]. During inference, max_len is always 1.
        column_enc (TYPE): NULL
        column_len (TYPE): NULL
        ptr_net (TYPE): NULL
        grammar (TYPE): NULL
        column2table_mask (Variable):
        name (str):

    Returns: TODO

    Raises: NULL
    """
    condition = layers.cast(condition, dtype='float32')

    column_mask = layers.sequence_mask(column_len,
                                       maxlen=grammar.MAX_COLUMN,
                                       dtype='float32')
    column_mask = layers.reshape(column_mask, [-1, grammar.MAX_COLUMN])
    predicts = ptr_net.forward(inputs, column_enc, column_mask)

    pred_ids = layers.argmax(predicts, axis=-1)
    valid_table_mask = nn_utils.batch_gather(column2table_mask, pred_ids)

    ## concat zeros to vocab size
    zeros_l = tensor.fill_constant_batch_size_like(
        predicts,
        shape=[-1, grammar.grammar_size + grammar.MAX_TABLE],
        dtype='float32',
        value=-INF)
    zeros_r = tensor.fill_constant_batch_size_like(
        predicts, shape=[-1, grammar.MAX_VALUE], dtype='float32', value=-INF)
    final_output = tensor.concat([zeros_l, predicts, zeros_r], axis=-1)
    true_final_output = layers.elementwise_mul(final_output, condition, axis=0)
    true_valid_table_mask = layers.elementwise_mul(valid_table_mask,
                                                   condition,
                                                   axis=0)
    return true_final_output, true_valid_table_mask
Example #14
def _process_type_leaf(condition, decoder, grammar_stack, next_inputs,
                       finished):
    """Process when output type is LEAF

    Args:
        condition (TYPE): NULL
        decoder (TYPE): NULL
        grammar_stack (StackData): (gmr_stack_data, gmr_stack_pos)
        next_inputs (DecoderInputsWrapper): (input_var, action, grammar_mask)
        finished (TYPE): NULL

    Returns: None

    Raises: NULL
    """
    ## pop stack
    next_output, valid_pos, gmr_stack_tmp = data_structure.Stack.pop(
        grammar_stack, mask=True, in_place=False)
    valid_pos = fluider.squeeze(valid_pos, [1])

    ## update next grammar mask
    next_actions = layers.elementwise_mul(decoder.grammar_action(next_output),
                                          layers.cast(
                                              valid_pos,
                                              dtype=next_inputs.action.dtype),
                                          axis=0)
    next_gmr_mask = layers.elementwise_mul(
        decoder.grammar_mask(next_output),
        layers.cast(valid_pos, dtype=next_inputs.gmr_mask.dtype),
        axis=0)

    ## save result, while condition is True
    new_gmr_stack_data, new_gmr_stack_pos, new_actions, new_gmr_mask = nn_utils.ifelse(
        condition,
        [gmr_stack_tmp.data, gmr_stack_tmp.pos, next_actions, next_gmr_mask], [
            grammar_stack.data, grammar_stack.pos, next_inputs.action,
            next_inputs.gmr_mask
        ])

    layers.utils.map_structure(
        layers.assign,
        [new_gmr_stack_data, new_gmr_stack_pos, next_actions, new_gmr_mask], [
            grammar_stack.data, grammar_stack.pos, next_inputs.action,
            next_inputs.gmr_mask
        ])
    layers.logical_or(finished,
                      layers.logical_and(condition,
                                         layers.logical_not(valid_pos)),
                      out=finished)
Example #15
 def attention(self, hidden, encoder_output, encoder_output_proj,
               encoder_padding_mask):
      # attention used to compute the context c_i; Bahdanau attention is used here
     decoder_state_proj = layers.unsqueeze(
         layers.fc(hidden, size=self.hidden_size, bias_attr=False), [1])
      # combine one decoder state vector with all of the encoder outputs
     mixed_state = fluid.layers.elementwise_add(
         encoder_output_proj,
         layers.expand(decoder_state_proj,
                       [1, layers.shape(decoder_state_proj)[1], 1]))
      # after combining the decoder state with the encoder outputs, a fully connected layer maps each position to a scalar score
     attn_scores = layers.squeeze(
         layers.fc(input=mixed_state,
                   size=1,
                   num_flatten_dims=2,
                   bias_attr=False), [2])
     if encoder_padding_mask is not None:
         attn_scores = layers.elementwise_add(attn_scores,
                                              encoder_padding_mask)
      # softmax turns the scores into attention weights
     attn_scores = layers.softmax(attn_scores)
      # the weighted sum of encoder outputs is the context vector for this decoder state
     context = layers.reduce_sum(layers.elementwise_mul(encoder_output,
                                                        attn_scores,
                                                        axis=0),
                                 dim=1)
     return context
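The final context computation amounts to a weighted sum of encoder outputs over time; a NumPy sketch with assumed shapes:

import numpy as np

B, T, H = 2, 3, 4
encoder_output = np.random.rand(B, T, H).astype("float32")
attn_scores = np.random.rand(B, T).astype("float32")
attn_scores /= attn_scores.sum(axis=1, keepdims=True)              # stands in for softmax
context = (encoder_output * attn_scores[:, :, None]).sum(axis=1)   # [B, H]
print(context.shape)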
Example #16
        def custom_dynamic_rnn(p_vec, init_state, decoder_size):
            context = layers.fc(input=p_vec, size=decoder_size, act=None)

            drnn = layers.DynamicRNN()
            with drnn.block():
                H_s = drnn.step_input(p_vec)
                ctx = drnn.static_input(context)

                c_prev = drnn.memory(init=init_state, need_reorder=True)
                m_prev = drnn.memory(init=init_state, need_reorder=True)
                m_prev1 = layers.fc(input=m_prev, size=decoder_size, act=None)
                m_prev1 = layers.sequence_expand(x=m_prev1, y=ctx)

                Fk = ctx + m_prev1
                Fk = layers.fc(input=Fk, size=decoder_size, act='tanh')
                logits = layers.fc(input=Fk, size=1, act=None)

                scores = layers.sequence_softmax(input=logits)
                attn_ctx = layers.elementwise_mul(x=ctx, y=scores, axis=0)
                attn_ctx = layers.sequence_pool(input=attn_ctx, pool_type='sum')
                hidden_t, cell_t = lstm_step(attn_ctx, hidden_t_prev=m_prev1,
                                             cell_t_prev=c_prev, size=decoder_size)

                drnn.update_memory(ex_mem=m_prev, new_mem=hidden_t)
                drnn.update_memory(ex_mem=c_prev, new_mem=cell_t)

                drnn.output(scores)
            beta = drnn()
            return beta
def compute_position_embedding(radians, speaker_position_rate):
    """Compute sin/cos interleaved matrix from the radians.
    
    Args:
        radians (Variable): shape(n_vocab, embed_dim), dtype float32, the radians matrix.
        speaker_position_rate (Variable): shape(B, ), speaker positioning rate.
    
    Returns:
        Variable: shape(B, n_vocab, embed_dim), the sin, cos interleaved matrix.
    """
    _, embed_dim = radians.shape
    batch_size = speaker_position_rate.shape[0]
    scaled_radians = F.elementwise_mul(F.expand(F.unsqueeze(radians, [0]),
                                                [batch_size, 1, 1]),
                                       speaker_position_rate,
                                       axis=0)

    odd_mask = (np.arange(embed_dim) % 2).astype(np.float32)
    odd_mask = dg.to_variable(odd_mask)

    out = odd_mask * F.cos(scaled_radians) \
        + (1 - odd_mask) * F.sin(scaled_radians)
    out = F.concat(
        [F.zeros((batch_size, 1, embed_dim), radians.dtype), out[:, 1:, :]],
        axis=1)
    return out
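A hedged NumPy sketch of the scaling and sin/cos interleaving above (all sizes are assumptions; zeroing position 0 mirrors the concat with a zeros slice).

import numpy as np

n_vocab, embed_dim, batch_size = 3, 4, 2
radians = np.random.rand(n_vocab, embed_dim).astype("float32")
rate = np.array([1.0, 0.5], dtype="float32")
scaled = radians[None, :, :] * rate[:, None, None]       # [B, n_vocab, embed_dim]

odd_mask = (np.arange(embed_dim) % 2).astype("float32")
out = odd_mask * np.cos(scaled) + (1 - odd_mask) * np.sin(scaled)
out[:, 0, :] = 0.0                                       # position 0 is zeroed, as above
print(out.shape)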
Example #18
def input_true(x, condition, reverse=False):
    """input instances in x, while corrensponding condition is true

    Args:
        x (Variable): shape = [batch_size, ...]
        condition (Variable): shape = [batch_size, 1]
        reverse (bool): Default is False

    Returns: TODO

    Raises: NULL
    """
    x_dtype = x.dtype
    if x_dtype == PaddleVarType.bool:
        x = layers.cast(x, dtype='int32')

    if condition.dtype != x.dtype:
        condition = layers.cast(condition, dtype=x.dtype)

    if reverse:
        condition = 1.0 - condition

    output = layers.elementwise_mul(x, condition, axis=0)

    if x_dtype == PaddleVarType.bool:
        output = layers.cast(output, dtype=x_dtype)

    return output
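In NumPy terms, input_true reduces to a row-wise gate; a toy example with made-up values:

import numpy as np

x = np.array([[1., 2.], [3., 4.], [5., 6.]])
condition = np.array([[1.], [0.], [1.]])
print(x * condition)            # rows with condition 0 are zeroed
print(x * (1.0 - condition))    # reverse=True keeps the complementary rows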
Example #19
def compute_l2_normalized_weight(v, g, dim):
    shape = v.shape
    ndim = len(shape)

    if dim is None:
        v_normalized = v / (F.reduce_sum(F.square(v)) + 1e-12)
    elif dim == 0:
        param_matrix = F.reshape(v, (shape[0], np.prod(shape[1:])))
        v_normalized = F.l2_normalize(param_matrix, axis=1)
    elif dim == -1 or dim == ndim - 1:
        param_matrix = F.reshape(v, (np.prod(shape[:-1]), shape[-1]))
        v_normalized = F.l2_normalize(param_matrix, axis=0)
    else:
        perm = list(range(ndim))
        perm[0] = dim
        perm[dim] = 0
        transposed_param = F.transpose(v, perm)
        param_matrix = F.reshape(
            transposed_param,
            (transposed_param.shape[0], np.prod(transposed_param.shape[1:])))
        v_normalized = F.l2_normalize(param_matrix, axis=1)
        v_normalized = F.transpose(v_normalized, perm)
    v_normalized = F.reshape(v_normalized, shape)
    weight = F.elementwise_mul(v_normalized, g, axis=dim)
    return weight
def decoder_step(gru_unit,
                 cue_gru_unit,
                 step_in,
                 hidden,
                 input_size,
                 hidden_size,
                 memory,
                 memory_mask,
                 knowledge,
                 mask=None):
    """ decoder step """
    # get attention out
    # get hidden top layers
    top_hidden = layers.slice(hidden, axes=[0], starts=[0], ends=[1])
    top_hidden = layers.squeeze(top_hidden, axes=[0])
    top_hidden = layers.unsqueeze(top_hidden, axes=[1])

    weight_memory, attn = dot_attention(top_hidden, memory, memory_mask)

    step_in = layers.unsqueeze(step_in, axes=[1])
    rnn_input_list = [step_in, weight_memory]
    if weight_memory.shape[0] == -1:
        knowledge_1 = layers.reshape(knowledge, shape=weight_memory.shape)
    else:
        knowledge_1 = knowledge
    cue_input_list = [knowledge_1, weight_memory]
    output_list = [weight_memory]

    rnn_input = layers.concat(rnn_input_list, axis=2)

    rnn_input = layers.squeeze(rnn_input, axes=[1])
    rnn_output, rnn_last_hidden = gru_unit(rnn_input, hidden, mask)

    cue_input = layers.concat(cue_input_list, axis=2)
    cue_input = layers.squeeze(cue_input, axes=[1])
    cue_rnn_out, cue_rnn_last_hidden = cue_gru_unit(cue_input, hidden, mask)

    h_y = layers.tanh(
        fc(rnn_last_hidden, hidden_size, hidden_size, name="dec_fc1"))
    h_cue = layers.tanh(
        fc(cue_rnn_last_hidden, hidden_size, hidden_size, name="dec_fc2"))

    concate_y_cue = layers.concat([h_y, h_cue], axis=2)
    k = layers.sigmoid(fc(concate_y_cue, hidden_size * 2, 1, name='dec_fc3'))

    new_hidden = h_y * k - h_cue * (k - 1.0)

    new_hidden_tmp = layers.transpose(new_hidden, perm=[1, 0, 2])
    output_list.append(new_hidden_tmp)

    real_out = layers.concat(output_list, axis=2)

    if mask:
        mask_tmp = layers.unsqueeze(mask, axes=[0])
        new_hidden = layers.elementwise_mul((new_hidden - hidden),
                                            mask_tmp,
                                            axis=0)
        new_hidden += hidden

    return real_out, new_hidden
Example #21
def _weight_norm(v, g, dim):
    shape = v.shape
    ndims = len(shape)

    if dim is None:
        v_normalized = v / (F.sqrt(F.reduce_sum(F.square(v))) + 1e-12)
    elif dim == 0:
        p_matrix = F.reshape(v, (shape[0], -1))
        v_normalized = F.l2_normalize(p_matrix, axis=1)
        v_normalized = F.reshape(v_normalized, shape)
    elif dim == -1 or dim == ndims - 1:
        p_matrix = F.reshape(v, (-1, shape[-1]))
        v_normalized = F.l2_normalize(p_matrix, axis=0)
        v_normalized = F.reshape(v_normalized, shape)
    else:
        perm = list(range(ndims))
        perm[0] = dim
        perm[dim] = 0
        p_transposed = F.transpose(v, perm)
        transposed_shape = p_transposed.shape
        p_matrix = F.reshape(p_transposed, (p_transposed.shape[0], -1))
        v_normalized = F.l2_normalize(p_matrix, axis=1)
        v_normalized = F.reshape(v_normalized, transposed_shape)
        v_normalized = F.transpose(v_normalized, perm)
    weight = F.elementwise_mul(v_normalized,
                               g,
                               axis=dim if dim is not None else -1)
    return weight
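A quick NumPy check of the weight-norm decomposition for dim=0 (illustrative only): every slice of the resulting weight has L2 norm equal to the corresponding entry of g.

import numpy as np

v = np.random.rand(4, 3, 2).astype("float32")
g = np.array([1.0, 2.0, 3.0, 4.0], dtype="float32")
v_matrix = v.reshape(v.shape[0], -1)
v_normalized = v_matrix / np.linalg.norm(v_matrix, axis=1, keepdims=True)
weight = (v_normalized * g[:, None]).reshape(v.shape)
print(np.linalg.norm(weight.reshape(4, -1), axis=1))   # -> [1. 2. 3. 4.]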
Example #22
    def dot_product_pooling(k, v, attn_bias, dropout_rate):
        """
        Scaled Dot-Product Attention
        :param k:  (batch_size, n_head, key_len, 1)
        :param v:  (batch_size, n_head, key_len, dim_per_head)
        :param attn_bias:  (batch_size, n_head, key_len, key_len)
        :param dropout_rate:
        :param is_test:
        :return:
        """
        product = layers.squeeze(k, axes=[3])  # (batch_size, n_head, key_len)
        if attn_bias:
            # (batch_size, n_head, 1, key_len)
            attn_bias_sliced = fluid.layers.slice(attn_bias,
                                                  axes=[2],
                                                  starts=[0],
                                                  ends=[1])
            product += layers.squeeze(attn_bias_sliced,
                                      axes=[2
                                            ])  # (batch_size, n_head, key_len)

        weights = layers.softmax(product)  # (batch_size, n_head, key_len)
        if dropout_rate:
            weights = layers.dropout(weights,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     is_test=False)

        pooling_out = layers.elementwise_mul(
            x=v, y=weights,
            axis=0)  # (batch_size, n_head, key_len, dim_per_head)
        pooling_out = layers.reduce_sum(
            pooling_out, dim=[2])  # (batch_size, n_head, dim_per_head)

        return pooling_out
Example #23
def _apply_rule(condition, inputs, gmr_mask, grammar, name=None):
    """apply_rule.

    Args:
        condition (TYPE): NULL
        inputs (Variable): shape = [batch_size, max_len, hidden_size]. During inference, max_len is always 1.
        gmr_mask (TYPE): NULL
        grammar (TYPE): NULL

    Returns: TODO

    Raises: NULL
    """
    fc_name = None
    if name is not None:
        fc_name = name + '_apply_rule_fc'

    condition = layers.cast(condition, dtype='float32')
    gmr_output = layers.fc(inputs,
                           size=grammar.grammar_size,
                           **nn_utils.param_attr(fc_name,
                                                 INIT_SCALE,
                                                 need_bias=True))
    gmr_output_masked = layers.elementwise_add(gmr_output, gmr_mask)

    zeros = layers.fill_constant_batch_size_like(
        gmr_output_masked,
        shape=[-1, grammar.MAX_TABLE + grammar.MAX_COLUMN + grammar.MAX_VALUE],
        dtype='float32',
        value=-INF)
    final_output = tensor.concat([gmr_output_masked, zeros], axis=-1)
    true_final_output = layers.elementwise_mul(final_output, condition, axis=0)
    return true_final_output
    def func(self, place):
        # the shape of the input variable should be clearly specified and must not include -1.
        shape = [2, 3, 4, 5]
        eps = 0.005
        dtype = np.float64

        x = layers.data('x', shape, False, dtype)
        y = layers.data('y', shape, False, dtype)
        x.persistable = True
        y.persistable = True
        out = layers.elementwise_mul(x, y)
        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
        y_arr = np.random.uniform(-1, 1, shape).astype(dtype)

        gradient_checker.triple_grad_check([x, y],
                                           out,
                                           x_init=[x_arr, y_arr],
                                           place=place,
                                           eps=eps)
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
        gradient_checker.triple_grad_check_for_dygraph(self.multiply_wrapper,
                                                       [x, y],
                                                       out,
                                                       x_init=[x_arr, y_arr],
                                                       place=place)
        fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
Example #25
    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention
        attn_bias is a [max_len, max_len] matrix whose top-left [L, L] block
        is 0 and whose remaining entries are -inf.
        """
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product)
        ############################
        # added code: inspect the attention mask and re-weight
        layers.Print(attn_bias, message="The content of input layer:")

        attn_mask = attn_bias == 0
        attn_mask = layers.cast(attn_mask, 'float64')
        layers.Print(weights)
        weights = layers.elementwise_mul(attn_mask, weights)
        layers.Print(weights)

        #         weights = layers.elementwise_mul(weights, attn_mask)
        ############################
        if dropout_rate:
            weights = layers.dropout(weights,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     is_test=False)
        out = layers.matmul(weights, v)
        return out
Example #26
    def __init__(self, x, y, y_aux, cfg):
        self.program = fluid.default_main_program().clone()
        with fluid.program_guard(self.program):
            model = ACGAN(cfg.latent_size, cfg.num_classes)
            self.fake, self.aux = model.network_d(x, name='d')

            self.fake_loss = layers.sigmoid_cross_entropy_with_logits(
                x=self.fake, label=y)
            self.aux_loss = layers.softmax_with_cross_entropy(logits=self.aux,
                                                              label=y_aux)
            self.unweighted_loss = layers.reduce_sum(self.fake_loss +
                                                     self.aux_loss)
            self.infer_program = self.program.clone(for_test=True)

            # we don't want the discriminator to also maximize the classification
            # accuracy of the auxiliary classifier on generated images, so we
            # don't train discriminator to produce class labels for generated
            # images (see https://openreview.net/forum?id=rJXTf9Bxg).
            # To preserve sum of sample weights for the auxiliary classifier,
            # we assign sample weight of 2 to the real images.

            fake_loss_weight = layers.ones(shape=[cfg.batch_size * 2, 1],
                                           dtype='float32')
            aux_loss_weight_zeros = layers.zeros(shape=[cfg.batch_size, 1],
                                                 dtype='float32')
            aux_loss_weight_twos = layers.fill_constant(
                shape=[cfg.batch_size, 1], value=2.0, dtype='float32')
            aux_loss_weight = layers.concat(
                [aux_loss_weight_twos, aux_loss_weight_zeros])

            self.fake_loss = layers.elementwise_mul(self.fake_loss,
                                                    fake_loss_weight)
            self.aux_loss = layers.elementwise_mul(self.aux_loss,
                                                   aux_loss_weight)

            self.loss = layers.reduce_sum(self.fake_loss) + layers.reduce_sum(
                self.aux_loss)

            vars = []
            for var in self.program.list_vars():
                if fluid.io.is_parameter(var) and (var.name.startswith("d")):
                    vars.append(var.name)
            optimizer = fluid.optimizer.Adam(learning_rate=cfg.adam_lr,
                                             beta1=cfg.adam_beta_1,
                                             name="net_d")
            optimizer.minimize(self.loss, parameter_list=vars)
Example #27
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        # clip by value first
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(
                    p):
                params_and_grads.append((p, g))
                continue
            new_grad = layers.clip(x=g,
                                   min=-self.clip_value,
                                   max=self.clip_value)
            params_and_grads.append((p, new_grad))
        params_grads = params_and_grads

        # clip by global norm
        params_and_grads = []
        sum_square_list = []
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(
                    p):
                continue
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
            square = layers.square(merge_grad)
            sum_square = layers.reduce_sum(square)
            sum_square_list.append(sum_square)

        # all parameters have been filtered out
        if len(sum_square_list) == 0:
            return params_grads

        global_norm_var = layers.concat(sum_square_list)
        global_norm_var = layers.reduce_sum(global_norm_var)
        global_norm_var = layers.sqrt(global_norm_var)
        max_global_norm = layers.fill_constant(shape=[1],
                                               dtype='float32',
                                               value=self.clip_norm)
        clip_var = layers.elementwise_div(x=max_global_norm,
                                          y=layers.elementwise_max(
                                              x=global_norm_var,
                                              y=max_global_norm))
        for p, g in params_grads:
            if g is None:
                continue
            if self._need_clip_func is not None and not self._need_clip_func(
                    p):
                params_and_grads.append((p, g))
                continue
            new_grad = layers.elementwise_mul(x=g, y=clip_var)
            params_and_grads.append((p, new_grad))

        return params_and_grads
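The global-norm part of the clipping reduces to the following arithmetic; a hedged NumPy sketch that ignores the per-value clip and the selected-rows handling above:

import numpy as np

grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]       # toy gradients
clip_norm = 5.0
global_norm = np.sqrt(sum((g ** 2).sum() for g in grads))   # = 13
scale = clip_norm / max(global_norm, clip_norm)
clipped = [g * scale for g in grads]
print(np.sqrt(sum((g ** 2).sum() for g in clipped)))        # -> 5.0 (= clip_norm)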
Example #28
    def forward(self, input, state):
        #logging.info("input shape: {}".format(input.shape))
        pre_hidden, pre_cell = state
        #logging.info("pre hidden shape: {}".format(pre_hidden.shape))
        #logging.info("pre cell shape: {}".format(pre_cell.shape))
        # i, f, c, o each have the form Wx + Wh + b, i.e. W·[x, h] + b
        # therefore:
        # the actual computation is [x, h]·W + b
        # x and h are concatenated along the feature dim, shape [batch_size, input_size+hidden_size]
        # W has shape [input_size+hidden_size, 4*hidden_size]
        # b has shape [4*hidden_size]

        # concatenate x and h along the feature dim
        # shape: [batch_size, input_size+hidden_size]
        concat_input_hidden = L.concat([input, pre_hidden], axis=1)
        #logging.info("x concat h shape: {}".format(concat_input_hidden.shape))

        # compute Wx + Wh + b
        # shape: [batch_size, 4*hidden_size]
        gate_input = L.matmul(x=concat_input_hidden, y=self._weight)
        #logging.info("[x, b]·W shape: {}".format(gate_input.shape))

        # shape: [batch_size, 4*hidden_size]
        gate_input = L.elementwise_add(gate_input, self._bias)
        #logging.info("[x, b]·W+b shape: {}".format(gate_input.shape))

        # split i, f, c, o along the last dim, so each has last dim hidden_size
        i, f, c, o = L.split(gate_input, num_or_sections=4, dim=-1)

        # new_c = pre_c·sigmoid(f+forget_bias) + sigmoid(i)·tanh(c)
        # shape: [batch_size, hidden_size]
        new_cell = L.elementwise_add(
            L.elementwise_mul(
                pre_cell,
                L.sigmoid(L.elementwise_add(f, self._forget_bias))),
            L.elementwise_mul(L.sigmoid(i), L.tanh(c))
            )
        #logging.info("new_cell shape: {}".format(new_cell.shape))

        # new_h = tanh(new_c)*sigmoid(o)
        # shape: [batch_size, hidden_size]
        new_hidden = L.tanh(new_cell) * L.sigmoid(o)
        #logging.info("new_hidden shape: {}".format(new_hidden.shape))

        return new_hidden, [new_hidden, new_cell]
Example #29
    def build_model(self, enc_input, dec_input, tgt_label, label_weights):
        """Build the model with source encoding and target decoding"""

        enc_word_output, enc_sen_output = self.encode(enc_input)
        dec_output = self.decode(dec_input, enc_word_output, enc_sen_output)

        predict_token_idx = layers.argmax(dec_output, axis=-1)
        correct_token_idx = layers.cast(layers.equal(
            tgt_label, layers.reshape(predict_token_idx, shape=[-1, 1])),
                                        dtype='float32')
        weighted_correct = layers.elementwise_mul(x=correct_token_idx,
                                                  y=label_weights,
                                                  axis=0)
        sum_correct = layers.reduce_sum(weighted_correct)
        sum_correct.stop_gradient = True

        # Padding indices do not contribute to the total loss. The weights are used
        # to cancel padding indices when calculating the loss.
        if self._label_smooth_eps:
            # TODO: use fluid.input.one_hot after softmax_with_cross_entropy removing
            # the enforcement that the last dimension of label must be 1.
            tgt_label = layers.label_smooth(label=layers.one_hot(
                input=tgt_label, depth=self.voc_size),
                                            epsilon=self._label_smooth_eps)

        cost = layers.softmax_with_cross_entropy(
            logits=dec_output,
            label=tgt_label,
            soft_label=True if self._label_smooth_eps else False)

        weighted_cost = layers.elementwise_mul(x=cost, y=label_weights, axis=0)
        sum_cost = layers.reduce_sum(weighted_cost)
        token_num = layers.reduce_sum(label_weights)
        token_num.stop_gradient = True
        avg_cost = sum_cost / token_num

        graph_vars = {
            "loss": avg_cost,
            "sum_correct": sum_correct,
            "token_num": token_num,
        }
        for k, v in graph_vars.items():
            v.persistable = True

        return graph_vars
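The weighted-correct computation above boils down to masking out padding positions before averaging; a toy NumPy check with assumed values:

import numpy as np

tgt_label = np.array([[2], [0], [1]])
predict = np.array([2, 1, 1])
label_weights = np.array([[1.0], [0.0], [1.0]])        # 0 marks padding tokens
correct = (tgt_label.reshape(-1) == predict).astype("float32")
weighted_correct = correct * label_weights.reshape(-1)
print(weighted_correct.sum() / label_weights.sum())    # accuracy over real tokens only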
Example #30
 def body_func(step_idx, pre_ids, pre_scores, gather_idx, caches,
               trg_src_attn_bias):
     # gather cell states corresponding to selected parent
     pre_caches = map_structure(
         lambda x: layers.gather(x, index=gather_idx), caches)
     pre_src_attn_bias = layers.gather(trg_src_attn_bias,
                                       index=gather_idx)
     pre_pos = layers.elementwise_mul(
         x=layers.fill_constant_batch_size_like(
             input=pre_src_attn_bias,  # can't use lod tensor here
             value=1,
             shape=[-1, 1],
             dtype=pre_ids.dtype),
         y=step_idx,
         axis=0)
     logits = wrap_decoder((pre_ids, pre_pos, None, pre_src_attn_bias),
                           trg_vocab_size,
                           max_in_len,
                           n_layer,
                           n_head,
                           d_key,
                           d_value,
                           d_model,
                           d_inner_hid,
                           prepostprocess_dropout,
                           attention_dropout,
                           relu_dropout,
                           preprocess_cmd,
                           postprocess_cmd,
                           weight_sharing,
                           enc_output=enc_output,
                           caches=pre_caches,
                           bos_idx=bos_idx)
     # intra-beam topK
     topk_scores, topk_indices = layers.topk(
         input=layers.softmax(logits), k=beam_size)
     accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                          y=pre_scores,
                                          axis=0)
     # beam_search op uses lod to differentiate branches.
     accu_scores = layers.lod_reset(accu_scores, pre_ids)
     # topK reduction across beams, also contain special handle of
     # end beams and end sentences(batch reduction)
     selected_ids, selected_scores, gather_idx = layers.beam_search(
         pre_ids=pre_ids,
         pre_scores=pre_scores,
         ids=topk_indices,
         scores=accu_scores,
         beam_size=beam_size,
         end_id=eos_idx,
         return_parent_idx=True)
     step_idx = layers.increment(x=step_idx, value=1.0, in_place=False)
     layers.array_write(selected_ids, i=step_idx, array=ids)
     layers.array_write(selected_scores, i=step_idx, array=scores)
     return (step_idx, selected_ids, selected_scores, gather_idx,
             pre_caches, pre_src_attn_bias)