예제 #1
0
 def multiFunc(self, arg1):
     """Build a multi-bit binarization graph for a tensor shaped like ``arg1``.

     Each element is approximated by a sum of up to ``self.bit_map.max()``
     terms of the form ``mean * (+1 | -1)``; ``self.bit_map`` gives, per
     element, how many such terms (bits) it receives.

     Args:
         arg1: object exposing ``shape`` and ``dynamic_axes``; only these two
             attributes are read, to declare a matching input variable.

     Returns:
         tuple ``(approx, multiIn)``: the approximation graph and the input
         variable it was built on.
     """
     multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
     bit_map = C.constant(self.bit_map)
     # int() keeps range() valid when the bit map is stored with a float dtype
     max_bits = int(self.bit_map.max())
     # carry_over is the residual still to be binarized: the raw input for the
     # first bit, then the input minus the approximation built so far.
     carry_over = multiIn
     approx = C.element_times(multiIn, 0)
     # one pass per binarization level
     for i in range(max_bits):
         # elements that are assigned more than i bits
         hot_vals = C.greater(bit_map, i)
         # residual restricted to those elements (0 elsewhere)
         valid_vals = C.element_select(hot_vals, carry_over, 0)
         # flatten everything behind the leading axis so that the reduction
         # yields one sum per leading-axis slice (per kernel)
         abs_sum = C.reduce_sum(
             C.reshape(C.abs(valid_vals), (valid_vals.shape[0], -1)), axis=1)
         hot_count = C.reduce_sum(
             C.reshape(hot_vals, (hot_vals.shape[0], -1)), axis=1)
         mean = C.element_divide(abs_sum, hot_count)
         # append two singleton axes so the mean broadcasts against the input
         mean = C.reshape(mean, (mean.shape[0], mean.shape[1], 1, 1))
         # sign of the residual as +1 / -1, zeroed where this bit is inactive
         bits = C.greater(carry_over, 0)
         bits = C.element_select(bits, bits, -1)
         bits = C.element_select(hot_vals, bits, 0)
         # accumulate this level's contribution
         approx = C.plus(approx, C.element_times(mean, bits))
         # subtract it from the residual
         carry_over = C.plus(C.element_times(C.element_times(-1, bits), mean), carry_over)

     return approx, multiIn
예제 #2
0
 def new_attention(encoder_hidden_state, decoder_hidden_state):
     """Additive attention of each decoder step over the encoder sequence.

     The encoder sequence is unpacked into a static axis, projected, added to
     the projected decoder state, squashed with tanh and scored; padded
     encoder positions are masked out before the softmax.  The projections
     (``attn_proj_enc``, ``attn_proj_dec``, ``attn_proj_tanh``), the final
     stabilizer (``attn_final_stab``) and ``Label`` come from the enclosing
     scope, which is not visible in this block.

     The bracketed shape annotations below are the original author's notes.
     """
     # encode_hidden_state: [#, e] [h]
     # decoder_hidden_state: [#, d] [H]
     unpacked_encoder_hidden_state, valid_mask = C.sequence.unpack(encoder_hidden_state, padding_value=0).outputs
     # unpacked_encoder_hidden_state: [#] [*=e, h]
     # valid_mask: [#] [*=e]
     projected_encoder_hidden_state = C.sequence.broadcast_as(attn_proj_enc(unpacked_encoder_hidden_state), decoder_hidden_state)
     # projected_encoder_hidden_state: [#, d] [*=e, attention_dim]
     # the mask gets a trailing singleton axis before being broadcast along the decoder axis
     broadcast_valid_mask = C.sequence.broadcast_as(C.reshape(valid_mask, (1,), 1), decoder_hidden_state)
     # broadcast_valid_mask: [#, d] [*=e]
     projected_decoder_hidden_state = attn_proj_dec(decoder_hidden_state)
     # projected_decoder_hidden_state: [#, d] [attention_dim]
     tanh_output = C.tanh(projected_decoder_hidden_state + projected_encoder_hidden_state)
     # tanh_output: [#, d] [*=e, attention_dim]
     attention_logits = attn_proj_tanh(tanh_output)
     # attention_logits = [#, d] [*=e, 1]
     # large negative stand-in for -inf so masked positions get ~0 softmax weight
     minus_inf = C.constant(-1e+30)
     masked_attention_logits = C.element_select(broadcast_valid_mask, attention_logits, minus_inf)
     # masked_attention_logits = [#, d] [*=e]
     attention_weights = C.softmax(masked_attention_logits, axis=0)
     attention_weights = Label('attention_weights')(attention_weights)
     # attention_weights = [#, d] [*=e]
     # weighted sum of the (unprojected) encoder states over the unpacked axis
     attended_encoder_hidden_state = C.reduce_sum(attention_weights * C.sequence.broadcast_as(unpacked_encoder_hidden_state, attention_weights), axis=0)
     # attended_encoder_hidden_state = [#, d] [1, h]
     # drop the reduced leading axis, then stabilize
     output = attn_final_stab(C.reshape(attended_encoder_hidden_state, (), 0, 1))
     # output = [#, d], [h]
     return output
예제 #3
0
 def signFunc(self, arg):
     """Build a sign graph that maps positive inputs to 1 and all others to -1.

     Args:
         arg: object exposing ``shape`` and ``dynamic_axes``; used only to
             declare an input variable with the same layout.

     Returns:
         tuple ``(sign_graph, input_variable)``.
     """
     sign_input = C.input(shape=arg.shape, dynamic_axes=arg.dynamic_axes)
     # 1 where the input is strictly positive, 0 elsewhere
     is_positive = C.greater(sign_input, 0)
     # keep the 1s and turn every 0 into -1
     signed = C.element_select(is_positive, is_positive, -1)
     return signed, sign_input
예제 #4
0
파일: attention.py 프로젝트: haixpham/cntkx
    def attention(query, key, value):
        """Scaled dot-product attention of ``query`` over ``key`` / ``value``.

        Computes ``softmax(query . key^T / sqrt(d_k)) . value`` where ``d_k``
        is the static dimension of ``query``.  ``obey_sequence_order`` and
        ``max_seq_len`` are read from the enclosing scope; when the former is
        set, a lower-triangular mask removes connections to later key
        positions.

        Raises:
            ValueError: if ``obey_sequence_order`` is set without
                ``max_seq_len``.
        """
        if obey_sequence_order and not max_seq_len:
            raise ValueError("max_seq_len must be defined when obey_sequence_order is True")

        # Scale by sqrt(d_k), not d_k.  The element count is obtained by
        # summing ones; sequence.last cannot be used, it conflicts with recurrence.
        dk = C.sqrt(C.reduce_sum(C.ones_like(query)))
        # dk: [#, *] [1, ] and value = sqrt(dim_of_query)

        unpacked_key = C.sequence.unpack(key, padding_value=0, no_mask_output=True)  # [#] [-3, key_dim]
        unpacked_value = C.sequence.unpack(value, padding_value=0, no_mask_output=True)  # [#] [-3, value_dim]

        broadcasted_key = C.sequence.broadcast_as(unpacked_key, query)  # [#, *] [-3, key_dim]
        scaled = C.times_transpose(query, broadcasted_key) / dk
        # [#, *] [q_dim] @ [#, *] [key_dim, -3], assert q_dim == key_dim
        # scaled: [#, *] [-3, ] => for every key seq element, there is a corresponding score

        # mask out invalid temporal connections to obey the sequence order
        if obey_sequence_order:
            unpacked_scaled, _ = C.sequence.unpack(scaled, padding_value=0).outputs
            # unpacked_scaled: [#] [-3, -3]  <== upper-right triangle gets masked

            minus_inf = C.constant(-1e+30)
            valid_connections = C.Constant(np.tril(np.ones((max_seq_len, max_seq_len)), k=0))  # [] [max_seq, max_seq]
            valid_connections = C.reconcile_dynamic_axes(valid_connections, unpacked_scaled)  # [#] [max_seq, max_seq]
            # crop the fixed-size mask down to the actual score matrix
            valid_connections = C.crop_manual(valid_connections, unpacked_scaled, 0, 0)  # [#] [-3, -3]
            unpacked_scaled = C.element_select(valid_connections, unpacked_scaled, minus_inf)  # [#] [-3, -3]
            scaled = C.to_sequence_like(unpacked_scaled, query)  # [#, *] [-3]

        attended = C.times(C.softmax(scaled, axis=-1), C.sequence.broadcast_as(unpacked_value, query))  # [#, *] [value_dim,]
        return attended
예제 #5
0
    def scale_dot_product_attention_block(self, contextQ, contextV, contextK,
                                          name):
        """Wrap a scaled dot-product attention sub-graph into a CNTK block.

        Query, value and key are each projected by their own ``Dense(100)``
        layer; the key/value sequences are unpacked, scores are divided by
        ``sqrt(100)``, padded key positions are masked with -1e+30 before the
        softmax, and the weights are applied to the unpacked values.

        Args:
            contextQ, contextV, contextK: graph nodes bound to the query,
                value and key placeholders.
            name: suffix appended to ``'sdp_attention_block'`` for both the
                block op name and the block name.
        """
        in_shape = (2 * self.hidden_dim, )
        q_ph = C.placeholder(shape=in_shape,
                             dynamic_axes=[self.b_axis, self.q_axis])
        v_ph = C.placeholder(shape=in_shape,
                             dynamic_axes=[self.b_axis, self.q_axis])
        k_ph = C.placeholder(shape=in_shape,
                             dynamic_axes=[self.b_axis, self.q_axis])

        # projections are created in the order query, value, key
        q_proj = C.layers.Dense(100)(q_ph)
        v_proj = C.layers.Dense(100)(v_ph)
        k_proj = C.layers.Dense(100)(k_ph)

        keys, key_mask = C.sequence.unpack(k_proj, padding_value=0).outputs
        values, _ = C.sequence.unpack(v_proj, padding_value=0).outputs
        keys_t = C.swapaxes(keys)

        raw_scores = C.times(q_proj, keys_t) / math.sqrt(100)
        scores = C.reshape(raw_scores, -1)
        mask = C.sequence.broadcast_as(key_mask, q_proj)
        # padded key positions get a huge negative score -> ~0 weight
        masked_scores = C.element_select(mask, scores, C.constant(-1e+30))
        weights = C.softmax(masked_scores)
        attended = C.times(weights, values)

        block_name = 'sdp_attention_block' + name
        bindings = [(q_ph, contextQ), (v_ph, contextV), (k_ph, contextK)]
        return C.as_block(attended, bindings, block_name, block_name)
예제 #6
0
    def attention(encoded, network):
        """Gaussian-window attention of ``network`` steps over ``encoded``.

        ``network`` is passed through ``dense`` and split into the window
        coefficients a, b, k; ``window_weight`` turns them and the encoded
        positions into weights ``phi``, which are zeroed on padded positions
        and used for a weighted sum of the unpacked ``encoded`` sequence.
        ``dense``, ``gaussian_windows_attention_coefficients``, ``nb_mixtures``
        and ``window_weight`` come from the enclosing scope (not visible here).

        The bracketed shape annotations are the original author's notes.
        """
        abk = dense(network)
        a, b, k = gaussian_windows_attention_coefficients(abk, nb_mixtures)
        # print("abk shape:", a.shape, b.shape, k.shape)
        # a, b, k: [#, n] [nb_mixture, 1]
        # encoded: [#, c] [char_ohe]

        encoded_unpacked = C.sequence.unpack(encoded, padding_value=0, no_mask_output=True)
        # encoded_unpacked: [#] [*=c, char_ohe]
        u = Cx.sequence.position(encoded)  # position gives shape=(1, )
        # u: [#, c], [1]
        # padding value is irrelevant for the result: padded positions are zeroed via u_valid below
        u_values, u_valid = C.sequence.unpack(u, padding_value=999_999).outputs
        # u_values: [#] [*=c, 1]
        # u_valid: [#] [*=c]
        u_values_broadcast = C.swapaxes(C.sequence.broadcast_as(u_values, k))
        # u_values_broadcast: [#, n] [1, *=c]
        u_valid_broadcast = C.sequence.broadcast_as(C.reshape(u_valid, (1,), 1), k)
        # u_valid_broadcast: [#, n] [*=c, 1] ~ shape verified correct at this point

        # print("u_values_broadcast shape:", u_values_broadcast.shape)
        # print("abk shape:", a.shape, b.shape, k.shape)
        phi = window_weight(a, b, k, u_values_broadcast)
        # phi: [#, n] [*=c, 1]
        zero = C.constant(0)
        # window weight is forced to 0 on padded positions
        phi = C.element_select(u_valid_broadcast, phi, zero, name="phi")
        # phi: [#, n] [*=c, 1]
        attended = C.reduce_sum(phi * C.sequence.broadcast_as(encoded_unpacked, phi), axis=0)
        # [#, n] [1, char_ohe]
        # print("attended shape:", attended.shape)
        output = C.squeeze(attended, name="GaussianWindowAttention")
        # [#, n] [char_ohe]
        return output
예제 #7
0
    def simi_attention(self, input, memory):
        '''
        Additive (tanh) attention of ``input`` over ``memory``, wrapped as a block.

        Both sequences are projected to ``2 * self.hidden_dim`` without bias,
        a shared bias parameter is added, and a 1-unit dense layer scores each
        (input step, memory step) pair.  Padded memory positions are masked
        with -1e30 before the softmax.

        return:
        a block combining two outputs:
        memory weighted vectors over input [#,c][d]
        weight (the softmax attention weights, [#,c][*=q])
        '''
        input_ph = C.placeholder()  # [#,c][d]
        mem_ph = C.placeholder()  # [#,q][d]

        input_dense = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
        mem_dense = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
        # single bias shared by the sum of both projections
        bias = C.Parameter(shape=(2 * self.hidden_dim, ), init=0.0)
        weight_dense = Dense(1, bias=False, input_rank=1)

        proj_inp = input_dense(input_ph)  # [#,c][d]
        proj_mem = mem_dense(mem_ph)  # [#,q][d]
        # turn the memory sequence axis into a static axis (+ validity mask)
        unpack_memory, mem_mask = C.sequence.unpack(
            proj_mem, 0).outputs  # [#][*=q, d] [#][*=q]
        expand_mem = C.sequence.broadcast_as(unpack_memory,
                                             proj_inp)  # [#,c][*=q,d]
        expand_mask = C.sequence.broadcast_as(mem_mask, proj_inp)  # [#,c][*=q]
        matrix = C.reshape(weight_dense(C.tanh(proj_inp + expand_mem + bias)),
                           (-1, ))  # [#,c][*=q]
        # padded memory positions get a huge negative score -> ~0 weight
        matrix = C.element_select(expand_mask, matrix, -1e30)
        # note: despite the name, `logits` holds the normalized softmax weights
        logits = C.softmax(matrix, axis=0)  # [#,c][*=q]
        # weighted sum of the *projected* memory over the unpacked axis
        weight_mem = C.reduce_sum(C.reshape(logits, (-1, 1)) * expand_mem,
                                  axis=0)  # [#,c][d]
        weight_mem = C.reshape(weight_mem, (-1, ))

        return C.as_block(C.combine(weight_mem, logits), [(input_ph, input),
                                                          (mem_ph, memory)],
                          'simi_attention', 'simi_attention')
예제 #8
0
 def signFunc(self, arg):
     """Build a sign graph that maps positive inputs to 1 and all others to -1.

     Args:
         arg: object exposing ``shape`` and ``dynamic_axes``; used only to
             declare an input variable with the same layout.

     Returns:
         tuple ``(sign_graph, input_variable)``.
     """
     sign_input = C.input(shape=arg.shape, dynamic_axes=arg.dynamic_axes)
     # 1 where the input is strictly positive, 0 elsewhere
     is_positive = C.greater(sign_input, 0)
     # keep the 1s and turn every 0 into -1
     signed = C.element_select(is_positive, is_positive, -1)
     return signed, sign_input
예제 #9
0
    def multiFunc(self, arg1):
        """Build a multi-bit binarization graph for a tensor shaped like ``arg1``.

        Each element is approximated by a sum of up to ``self.bit_map.max()``
        terms ``mean * (+1 | -1)``; ``self.bit_map`` gives the number of terms
        per element.  The scaling mean of every level is a single scalar taken
        over all elements active at that level.

        Args:
            arg1: object exposing ``shape`` and ``dynamic_axes``; only used to
                declare a matching input variable.

        Returns:
            tuple ``(approx, multiIn)``: the approximation graph and the input
            variable it was built on.
        """
        multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
        bit_map = C.constant(self.bit_map)
        # int() keeps range() valid when the bit map is stored with a float dtype
        max_bits = int(self.bit_map.max())
        # residual still to be binarized; starts as the raw input
        carry_over = multiIn
        approx = C.element_times(multiIn, 0)
        for i in range(max_bits):
            # elements that are assigned more than i bits
            hot_vals = C.greater(bit_map, i)
            valid_vals = C.element_select(hot_vals, carry_over, 0)
            # mean absolute residual over the active elements
            mean = C.element_divide(C.reduce_sum(C.abs(valid_vals)),
                                    C.reduce_sum(hot_vals))
            # sign of the residual as +1 / -1, zeroed where this bit is inactive
            bits = C.greater(carry_over, 0)
            bits = C.element_select(bits, bits, -1)
            bits = C.element_select(hot_vals, bits, 0)
            approx = C.plus(approx, C.element_times(mean, bits))
            # remove this level's contribution from the residual
            carry_over = C.plus(
                C.element_times(C.element_times(-1, bits), mean), carry_over)

        return approx, multiIn
예제 #10
0
 def multiFunc(self, arg1):
     """Build a multi-bit binarization graph for a tensor shaped like ``arg1``.

     Each element is approximated by a sum of up to ``self.bit_map.max()``
     terms ``mean * (+1 | -1)``; ``self.bit_map`` gives the number of terms
     per element.  The scaling mean of every level is a single scalar taken
     over all elements active at that level.

     Args:
         arg1: object exposing ``shape`` and ``dynamic_axes``; only used to
             declare a matching input variable.

     Returns:
         tuple ``(approx, multiIn)``: the approximation graph and the input
         variable it was built on.
     """
     multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
     bit_map = C.constant(self.bit_map)
     # int() keeps range() valid when the bit map is stored with a float dtype
     max_bits = int(self.bit_map.max())
     # residual still to be binarized; starts as the raw input
     carry_over = multiIn
     approx = C.element_times(multiIn, 0)
     for i in range(max_bits):
         # elements that are assigned more than i bits
         hot_vals = C.greater(bit_map, i)
         valid_vals = C.element_select(hot_vals, carry_over, 0)
         # mean absolute residual over the active elements
         mean = C.element_divide(C.reduce_sum(C.abs(valid_vals)), C.reduce_sum(hot_vals))
         # sign of the residual as +1 / -1, zeroed where this bit is inactive
         bits = C.greater(carry_over, 0)
         bits = C.element_select(bits, bits, -1)
         bits = C.element_select(hot_vals, bits, 0)
         approx = C.plus(approx, C.element_times(mean, bits))
         # remove this level's contribution from the residual
         carry_over = C.plus(C.element_times(C.element_times(-1, bits), mean), carry_over)

     return approx, multiIn
예제 #11
0
    def inner_padded(x, y, p):
        """Run ``_inner(x, y)`` and optionally replace its zero padding with ``p``.

        ``_inner`` (defined in the enclosing scope) returns the padded result
        and a validity flag; when ``p`` is not None, every position whose flag
        is 0 is overwritten with ``p`` broadcast along the padded sequence.
        """
        padded, valid_x = _inner(x, y)
        if p is None:
            return padded  # [*, long_seq] [short_seq_dim, ]

        # swap the zero pad for the padding token
        padding_token = C.sequence.broadcast_as(p, padded)
        is_pad = 1 - valid_x
        return C.element_select(is_pad, padding_token, padded)  # [*, long_seq] [short_seq_dim, ]
예제 #12
0
    def inner(a):
        """Approximate erf(a) with Abramowitz & Stegun formula 7.1.26.

        The polynomial is only valid for non-negative arguments, so it is
        evaluated on ``|a|`` and the sign is restored afterwards using
        erf(-x) = -erf(x).  The coefficients ``p`` and ``a1``..``a5`` come
        from the enclosing scope.
        """
        # +1 for a >= 0, -1 otherwise
        not_negative = C.greater_equal(a, 0)
        sign = C.element_select(not_negative, not_negative, -1)

        abs_x = C.abs(a)

        # A&S formula 7.1.26 -- t must be built from |a|: using the signed
        # value gives wrong results for negative inputs and a pole at a = -1/p
        t = 1.0 / (1.0 + p * abs_x)
        y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * C.exp(
            -abs_x * abs_x)
        return C.element_times(sign, y)
예제 #13
0
def test_Select(flag, if_true, if_false, tmpdir):
    """Check ``C.element_select`` with zero inputs and with each operand as the sole input.

    The three arguments are converted to float32 arrays.  The model is first
    verified with all operands constant, then once per operand with that
    operand replaced by an input variable of the same shape.
    """
    flag = np.asarray(flag, dtype=np.float32)
    if_true = np.asarray(if_true, dtype=np.float32)
    if_false = np.asarray(if_false, dtype=np.float32)

    verify_no_input(C.element_select(flag, if_true, if_false), tmpdir,
                    'Select_0')

    flag_var = C.input_variable(np.shape(flag))
    if_true_var = C.input_variable(np.shape(if_true))
    if_false_var = C.input_variable(np.shape(if_false))

    # (operands, data fed to the single input, test label)
    single_input_cases = (
        ((flag_var, if_true, if_false), flag, 'Select_1_flag'),
        ((flag, if_true_var, if_false), if_true, 'Select_1_if_true'),
        ((flag, if_true, if_false_var), if_false, 'Select_1_if_false'),
    )
    for operands, data, label in single_input_cases:
        model = C.element_select(operands[0], operands[1], operands[2])
        verify_one_input(model, data, tmpdir, label)
예제 #14
0
def test_Select(flag, if_true, if_false, tmpdir):
    """Check ``C.element_select`` with zero inputs and with each operand as the sole input.

    The three arguments are converted to float32 arrays.  The model is first
    verified with all operands constant, then once per operand with that
    operand replaced by an input variable of the same shape.
    """
    flag = np.asarray(flag, dtype=np.float32)
    if_true = np.asarray(if_true, dtype=np.float32)
    if_false = np.asarray(if_false, dtype=np.float32)

    verify_no_input(C.element_select(flag, if_true, if_false), tmpdir,
                    'Select_0')

    flag_var = C.input_variable(np.shape(flag))
    if_true_var = C.input_variable(np.shape(if_true))
    if_false_var = C.input_variable(np.shape(if_false))

    # (operands, data fed to the single input, test label)
    single_input_cases = (
        ((flag_var, if_true, if_false), flag, 'Select_1_flag'),
        ((flag, if_true_var, if_false), if_true, 'Select_1_if_true'),
        ((flag, if_true, if_false_var), if_false, 'Select_1_if_false'),
    )
    for operands, data, label in single_input_cases:
        model = C.element_select(operands[0], operands[1], operands[2])
        verify_one_input(model, data, tmpdir, label)
예제 #15
0
def true_density(z):
    """Unnormalized density exp(-u) over the first two components of ``z``.

    ``u = 0.5 * ((z2 - sin(2*pi*z1/4)) / 0.4)^2`` where ``z1 <= 4``; elsewhere
    ``u`` is replaced by 1e7, which drives the density to (numerically) zero.
    """
    z1 = z[0]
    z2 = z[1]

    ridge = C.sin(2 * np.pi * z1 / 4)
    energy = 0.5 * C.square((z2 - ridge) / 0.4)
    # very large energy used outside the allowed region
    penalty = C.ones_like(energy) * 1e7

    in_region = C.less_equal(z1, 4)
    # equivalent to in_region * energy + (1 - in_region) * penalty
    energy = C.element_select(in_region, energy, penalty)

    return C.exp(-energy)
예제 #16
0
    def attention_layer(self, context, query, layer):
        """Build a gated additive-attention block of ``context`` over ``query``.

        The query sequence is unpacked to a static axis, each context step is
        scored against every query step through ``v^T tanh(Wq q + Wp p)``,
        padded query positions are masked before the softmax, and the attended
        query vector is spliced with the context step, gated, and fed through
        dropout plus a bidirectional ``OptimizedRnnStack``.

        Args:
            context: node bound to the ``p_processed`` placeholder.
            query: node bound to the ``q_processed`` placeholder.
            layer: name prefix for the RNN stack (``layer + '_attention_rnn'``).

        Returns:
            The sub-graph wrapped with ``C.as_block`` under the name
            ``'attention_layer'``.
        """

        q_processed = C.placeholder(shape=(2*self.hidden_dim,))
        p_processed = C.placeholder(shape=(2*self.hidden_dim,))

        # convert the query's sequence axis to a static axis (+ validity mask)
        qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

        # NOTE: parameters are created in this order; keep it, since the
        # initializers are drawn in creation order
        wq = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        wp = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        wg = C.parameter(shape=(8*self.hidden_dim, 8*self.hidden_dim), init=C.glorot_uniform())
        v = C.parameter(shape=(2*self.hidden_dim, 1), init=C.glorot_uniform())

        # seq[tensor[2d]] p_len x 2d
        wpt = C.reshape(C.times(p_processed, wp), (-1, 2*self.hidden_dim))

        # q_len x 2d
        wqt = C.reshape(C.times(qvw, wq), (-1, 2*self.hidden_dim))

        # seq[tensor[q_len]]
        S = C.reshape(C.times(C.tanh(C.sequence.broadcast_as(wqt, p_processed) + wpt), v), (-1))

        qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, p_processed)

        # seq[tensor[q_len]] -- padded query positions get -1e+30 so softmax gives them ~0 weight
        S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))

        # seq[tensor[q_len]]
        A = C.softmax(S, axis=0)

        # seq[tensor[2d]] -- attention-weighted sum of the query vectors
        swap_qvw = C.swapaxes(qvw)
        cq = C.reshape(C.reduce_sum(A * C.sequence.broadcast_as(swap_qvw, A), axis=1), (-1))

        # splice of four 2d-wide pieces; presumably 8*hidden_dim wide, matching wg's shape
        uc_concat = C.splice(p_processed, cq, p_processed * cq, cq * cq)

        # gate with the same width as uc_concat
        gt = C.tanh(C.times(uc_concat, wg))

        # gated features
        uc_concat_star = gt * uc_concat

        # dropout followed by a bidirectional RNN over the gated features
        vp = C.layers.Sequential([
            C.layers.Dropout(self.dropout),
            OptimizedRnnStack(self.hidden_dim, bidirectional=True, 
                use_cudnn=self.use_cudnn, name=layer+'_attention_rnn')])(uc_concat_star)

        return C.as_block(
            vp,
            [(p_processed, context), (q_processed, query)],
            'attention_layer',
            'attention_layer')
    def multiFunc(self, arg1):
        """Build a multi-bit binarization graph for a tensor shaped like ``arg1``.

        Each element is approximated by a sum of up to ``self.bit_map.max()``
        terms of the form ``mean * (+1 | -1)``; ``self.bit_map`` gives, per
        element, how many such terms (bits) it receives.

        Args:
            arg1: object exposing ``shape`` and ``dynamic_axes``; only these
                two attributes are read, to declare a matching input variable.

        Returns:
            tuple ``(approx, multiIn)``: the approximation graph and the input
            variable it was built on.
        """
        multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
        bit_map = C.constant(self.bit_map)
        # int() keeps range() valid when the bit map is stored with a float dtype
        max_bits = int(self.bit_map.max())
        # carry_over is the residual still to be binarized: the raw input for
        # the first bit, then the input minus the approximation built so far.
        carry_over = multiIn
        approx = C.element_times(multiIn, 0)
        # one pass per binarization level
        for i in range(max_bits):
            # elements that are assigned more than i bits
            hot_vals = C.greater(bit_map, i)
            # residual restricted to those elements (0 elsewhere)
            valid_vals = C.element_select(hot_vals, carry_over, 0)
            # flatten everything behind the leading axis so that the reduction
            # yields one sum per leading-axis slice (per kernel)
            abs_sum = C.reduce_sum(
                C.reshape(C.abs(valid_vals), (valid_vals.shape[0], -1)),
                axis=1)
            hot_count = C.reduce_sum(
                C.reshape(hot_vals, (hot_vals.shape[0], -1)), axis=1)
            mean = C.element_divide(abs_sum, hot_count)
            # append two singleton axes so the mean broadcasts against the input
            mean = C.reshape(mean, (mean.shape[0], mean.shape[1], 1, 1))
            # sign of the residual as +1 / -1, zeroed where this bit is inactive
            bits = C.greater(carry_over, 0)
            bits = C.element_select(bits, bits, -1)
            bits = C.element_select(hot_vals, bits, 0)
            # accumulate this level's contribution
            approx = C.plus(approx, C.element_times(mean, bits))
            # subtract it from the residual
            carry_over = C.plus(
                C.element_times(C.element_times(-1, bits), mean), carry_over)

        return approx, multiIn
예제 #18
0
    def attention_layer(self, context, query):
        """Build a bidirectional (context-to-query and query-to-context) attention block.

        Args:
            context: node bound to the ``c_processed`` placeholder.
            query: node bound to the ``q_processed`` placeholder.

        Returns:
            ``C.as_block`` over the splice
            ``[c, c2q, c * c2q, c * q2c]``, named ``'attention_layer'``.
        """
        q_processed = C.placeholder(shape=(2 * self.hidden_dim, ))
        c_processed = C.placeholder(shape=(2 * self.hidden_dim, ))

        #convert query's sequence axis to static
        qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

        # This part deserves some explanation
        # It is the attention layer
        # In the paper they use a 6 * dim dimensional vector
        # here we split it in three parts because the different parts
        # participate in very different operations
        # so W * [h; u; h.* u] becomes w1 * h + w2 * u + w3 * (h.*u)
        ws1 = C.parameter(shape=(2 * self.hidden_dim, 1),
                          init=C.glorot_uniform())
        ws2 = C.parameter(shape=(2 * self.hidden_dim, 1),
                          init=C.glorot_uniform())
        ws3 = C.parameter(shape=(1, 2 * self.hidden_dim),
                          init=C.glorot_uniform())
        att_bias = C.parameter(shape=(), init=0)

        # w1 * h: one score per context step
        wh = C.times(c_processed, ws1)
        # w2 * u: one score per query position (static axis)
        wu = C.reshape(C.times(qvw, ws2), (-1, ))
        # w3 * (h .* u): the query is scaled by ws3 first, then dotted with each context step
        whu = C.reshape(
            C.reduce_sum(c_processed *
                         C.sequence.broadcast_as(qvw * ws3, c_processed),
                         axis=1), (-1, ))
        # similarity of every context step with every query position
        S = wh + whu + C.sequence.broadcast_as(wu, c_processed) + att_bias
        # mask out values outside of Query, and fill in gaps with -1e+30 as neutral value for both reduce_log_sum_exp and reduce_max
        qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, c_processed)
        S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
        # context-to-query: softmax over the query positions, then weighted sum of query vectors
        q_attn = C.reshape(C.softmax(S), (-1, 1))
        #q_attn = print_node(q_attn)
        c2q = C.reshape(
            C.reduce_sum(C.sequence.broadcast_as(qvw, q_attn) * q_attn,
                         axis=0), (-1))

        # query-to-context: best query match per context step, softmaxed along the context sequence
        max_col = C.reduce_max(S)
        c_attn = C.sequence.softmax(max_col)

        # single attended context vector, broadcast back to every context step
        htilde = C.sequence.reduce_sum(c_processed * c_attn)
        q2c = C.sequence.broadcast_as(htilde, c_processed)
        q2c_out = c_processed * q2c

        att_context = C.splice(c_processed, c2q, c_processed * c2q, q2c_out)

        return C.as_block(att_context, [(c_processed, context),
                                        (q_processed, query)],
                          'attention_layer', 'attention_layer')
예제 #19
0
def attention_pooling(inputs, inputs_mask, inputs_weights, decode, decode_weights, keys):
    """Pool ``inputs`` with attention weights conditioned on ``decode``.

    Shapes as documented by the original author (not checked here):
        inputs: (n, dim)
        inputs_weights: (dim, dim)
        decode: (1, dec_dim)
        decode_weights: (dec_dim, dim)
        keys: (dim, 1)

    Returns the sum over axis 0 of ``inputs`` times the sequence softmax of
    the scores.

    NOTE(review): the mask (-1e+30) is applied to the tanh output *before* the
    projection by ``keys``, so masked scores depend on the sign of ``keys`` --
    confirm this is intended.
    """
    projected_inputs = C.times(inputs, inputs_weights)
    projected_decode = C.times(decode, decode_weights)
    decode_broadcast = C.sequence.broadcast_as(projected_decode, projected_inputs)
    scores = C.tanh(projected_inputs + decode_broadcast)
    scores = C.element_select(inputs_mask, scores, C.constant(-1e+30))
    scores = C.times(scores, keys)
    weights = C.ops.sequence.softmax(scores, name="softmax")
    return C.reduce_sum(inputs * weights, axis=0)
예제 #20
0
def attention_pooling(inputs, inputs_mask, inputs_weights, decode,
                      decode_weights, keys):
    """Pool ``inputs`` with attention weights conditioned on ``decode``.

    Shapes as documented by the original author (not checked here):
        inputs: (n, dim)
        inputs_weights: (dim, dim)
        decode: (1, dec_dim)
        decode_weights: (dec_dim, dim)
        keys: (dim, 1)

    Returns the sum over axis 0 of ``inputs`` times the sequence softmax of
    the scores.

    NOTE(review): the mask (-1e+30) is applied to the tanh output *before* the
    projection by ``keys``, so masked scores depend on the sign of ``keys`` --
    confirm this is intended.
    """
    projected_inputs = C.times(inputs, inputs_weights)
    projected_decode = C.times(decode, decode_weights)
    decode_broadcast = C.sequence.broadcast_as(projected_decode,
                                               projected_inputs)
    scores = C.tanh(projected_inputs + decode_broadcast)
    scores = C.element_select(inputs_mask, scores, C.constant(-1e+30))
    scores = C.times(scores, keys)
    weights = C.ops.sequence.softmax(scores, name="softmax")
    return C.reduce_sum(inputs * weights, axis=0)
예제 #21
0
    def attention_layer(self, context, query, dimc, dimq, common_dim):
        """Build a bidirectional attention block for inputs of different widths.

        Args:
            context: node bound to the ``c_processed`` placeholder (width ``dimc``).
            query: node bound to the ``q_processed`` placeholder (width ``dimq``).
            dimc: static dimension of the context vectors.
            dimq: static dimension of the query vectors.
            common_dim: number of leading features used wherever context and
                query vectors are multiplied element-wise.

        Returns:
            ``C.as_block`` over two combined outputs: the spliced attention
            features and the flattened context-to-query attention weights.
        """
        q_processed = C.placeholder(shape=(dimq, ))
        c_processed = C.placeholder(shape=(dimc, ))

        #convert query's sequence axis to static
        qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

        # so W * [h; u; h.* u] becomes w1 * h + w2 * u + w4 * (h.*u)
        ws1 = C.parameter(shape=(dimc, 1), init=C.glorot_uniform())
        ws2 = C.parameter(shape=(dimq, 1), init=C.glorot_uniform())
        ws4 = C.parameter(shape=(1, common_dim), init=C.glorot_uniform())
        att_bias = C.parameter(shape=(), init=0)

        wh = C.times(c_processed, ws1)  # [#,c][1]
        wu = C.reshape(C.times(qvw, ws2), (-1, ))  # [#][*]
        # qvw*ws4: [#][*,200], whu:[#,c][*]
        # NOTE(review): whu is computed but never added to S1 below, so the
        # h.*u term (and ws4) does not influence the output -- confirm intent
        whu = C.reshape(C.reduce_sum(
            c_processed[:common_dim] *\
            C.sequence.broadcast_as(qvw[:,:common_dim] * ws4, c_processed), axis=1), (-1,))
        S1 = wh + C.sequence.broadcast_as(wu,
                                          c_processed) + att_bias  # [#,c][*]
        qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, c_processed)
        # padded query positions get -1e+30 so softmax / max ignore them
        S1 = C.element_select(qvw_mask_expanded, S1, C.constant(-1e+30))
        q_attn = C.reshape(C.softmax(S1), (-1, 1))  # [#,c][*,1]
        c2q = C.reshape(
            C.reduce_sum(C.sequence.broadcast_as(qvw, q_attn) * q_attn,
                         axis=0), (-1))  # [#,c][200]

        max_col = C.reduce_max(S1)  # [#,c][1] max over the query words
        c_attn = C.sequence.softmax(max_col)  # [#,c][1] softmax over every word in c

        htilde = C.sequence.reduce_sum(c_processed * c_attn)  # [#][200]
        q2c = C.sequence.broadcast_as(htilde, c_processed)  # [#,c][200]
        q2c_out = c_processed[:common_dim] * q2c[:common_dim]

        # original document, question representation, passage key-point
        # representation, match-degree representation, passage context representation
        att_context_reg = C.splice(c_processed, c2q, q2c_out,
                                   c_processed[:common_dim] * c2q[:common_dim])
        res = C.combine(att_context_reg, C.reshape(q_attn, (-1, )))
        return \
        C.as_block(res,
            [(c_processed, context), (q_processed, query)],
            'attention_layer',
            'attention_layer')
예제 #22
0
    def dot_attention(self, inputs, memory, dim):
        '''
        Scaled dot-product attention of ``inputs`` over ``memory``, wrapped as a block.

        Both sequences go through their own bias-free, ReLU-activated dense
        projection of width ``dim``; similarity is the dot product of the
        projections divided by ``dim**0.5``.  The weighted sum is taken over
        the *unprojected* memory.

        @inputs: [#,c][d] a sequence need attention
        @memory: [#,q][d] a sequence used both to compute the similarity
                 (weights) and as the values of the weighted sum
        @dim: width of the two projections
        @output: block combining the attention vector [#,c][d] and the
                 attention weights [#,c][*=q,1]
        '''
        input_ph = C.placeholder()
        input_mem = C.placeholder()
        with C.layers.default_options(
                bias=False,
                activation=C.relu):  # all the projections have no bias
            attn_proj_enc = C.layers.Dense(dim,
                                           init=glorot_uniform(),
                                           input_rank=1,
                                           name="Wqu")
            attn_proj_dec = C.layers.Dense(dim,
                                           init=glorot_uniform(),
                                           input_rank=1)

        inputs_ = attn_proj_enc(input_ph)  # [#,c][d]
        memory_ = attn_proj_dec(input_mem)  # [#,q][d]
        # turn the memory sequence axis into a static axis (+ validity mask)
        unpack_memory, mem_mask = C.sequence.unpack(
            memory_, 0).outputs  # [#][*=q, d], [#][*=q]
        unpack_memory_expand = C.sequence.broadcast_as(unpack_memory,
                                                       inputs_)  # [#,c][*=q,d]

        matrix = C.times_transpose(inputs_, unpack_memory_expand) / (
            dim**0.5)  # [#,c][*=q]
        mem_mask_expand = C.sequence.broadcast_as(mem_mask,
                                                  inputs_)  # [#,c][*=q]
        # padded memory positions get -1e+30 so softmax gives them ~0 weight
        matrix = C.element_select(mem_mask_expand, matrix,
                                  C.constant(-1e+30))  # [#,c][*=q]
        # note: despite the name, `logits` holds the normalized softmax weights
        logits = C.reshape(C.softmax(matrix), (-1, 1))  # [#,c][*=q,1]
        # [#,c][*=q, d]
        memory_expand = C.sequence.broadcast_as(
            C.sequence.unpack(input_mem, 0, no_mask_output=True), input_ph)
        weighted_att = C.reshape(C.reduce_sum(logits * memory_expand, axis=0),
                                 (-1, ))  # [#,c][d]

        return C.as_block(C.combine(weighted_att,
                                    logits), [(input_ph, inputs),
                                              (input_mem, memory)],
                          'dot attention', 'dot attention')
예제 #23
0
파일: __init__.py 프로젝트: haixpham/cntkx
    def inner(a):
        """Sample once from ``a`` restricted to its ``k`` largest entries.

        Entries outside the top ``k`` (along ``axis``) are replaced by -1e+30
        before ``sample`` is called.  ``k``, ``axis``, ``num_classes``,
        ``sample`` and ``name`` come from the enclosing scope.  Because the
        masked product is also used as the selection flag, retained entries
        that are exactly 0 are replaced as well.
        """
        # a: [#, *] [static_axes, num_classes]
        _, top_indices = C.top_k(a, k=k, axis=axis).outputs
        # top_indices: [#, *] [static_axes, k]

        one_hot_top = C.one_hot(top_indices, num_classes)
        # one_hot_top: [#, *] [static_axes, k, num_classes]

        # collapse the k axis: 1 for classes inside the top k, 0 otherwise
        keep_mask = C.squeeze(C.reduce_sum(one_hot_top, axis=-2), axes=(-2, ))
        # keep_mask: [#, *] [static_axes, num_classes]

        # everything that is not retained becomes "-inf" and will not be sampled
        minus_inf = C.constant(-1e+30)
        retained = a * keep_mask
        masked = C.element_select(retained, retained, minus_inf)
        # masked: [#, *] [static_axes, num_classes]

        # one draw from the top-k distribution
        return sample(masked, axis=axis, name=name)
예제 #24
0
    def build(self):
        """Create the cell parameters and return the recurrent step function.

        The recurrent weight is a vector of length ``self._hidden_dim`` and is
        multiplied element-wise with the previous state.  It is optionally
        constrained:

        * ``self._recurrent_min_abs > 0``: magnitudes are raised to at least
          that value while the sign is kept (0 counts as positive);
        * ``self._recurrent_max_abs`` truthy: values are clipped to
          ``[-max_abs, max_abs]``.

        Returns:
            ``runit(h, x)``, a ``C.Function`` computing
            ``x @ input_kernel + bias + recur_kernel * h``.
        """
        input_kernel = C.Parameter(shape=(self._input_size, self._hidden_dim),
                                   init=self._input_initializer)
        recur_kernel = C.Parameter(shape=(self._hidden_dim, ),
                                   init=self._recurrent_initializer)
        bias = C.Parameter(shape=(self._hidden_dim), init=0)
        if self._recurrent_min_abs > 0:
            abs_kernel = C.abs(recur_kernel)
            min_abs_kernel = C.element_max(abs_kernel, self._recurrent_min_abs)
            # re-apply the original sign: +1 where the weight is >= 0, else -1
            recur_kernel = min_abs_kernel * C.element_select(
                C.greater_equal(recur_kernel, C.constant(0)), C.constant(1),
                C.constant(-1))
        if self._recurrent_max_abs:
            recur_kernel = C.clip(recur_kernel, -self._recurrent_max_abs,
                                  self._recurrent_max_abs)

        @C.Function
        def runit(h, x):
            # h: previous state, x: current input
            h_t = C.times(x, input_kernel) + bias + recur_kernel * h
            return h_t

        return runit
예제 #25
0
파일: __init__.py 프로젝트: junjieqian/CNTK
def element_select(flag, value_if_true, value_if_false, name=''):
    '''
    Element-wise choice between two tensors, driven by ``flag``.
    Wherever flag != 0 the result takes value_if_true, elsewhere it takes
    value_if_false. Behaves analogously to numpy.where(...).

    NOTE(review): the example below calls ``C.cond`` rather than
    ``element_select`` -- presumably an earlier name of this op; confirm.

    Example:
        >>> C.eval(C.cond([-10, -1, 0, 0.3, 100], [1, 10, 100, 1000, 10000], [ 2, 20, 200, 2000, 20000]))
        [array([[  1.00000000e+00,   1.00000000e+01,   2.00000000e+02,
                   1.00000000e+03,   1.00000000e+04]])]

    Args:
        flag: tensor
        value_if_true: tensor
        value_if_false: tensor
        name (str): the name of the node in the network
    Returns:
        :class:`cntk.Function`
    '''
    # Imported under an alias so the underlying op is not confused with this wrapper.
    from cntk import element_select as _select_op

    condition = sanitize_input(flag)
    on_true = sanitize_input(value_if_true)
    on_false = sanitize_input(value_if_false)
    node = _select_op(condition, on_true, on_false, name)
    return node.output()
예제 #26
0
    def attention_layer(self, context, query, dim):
        """Build a two-level attention block over ``context`` and ``query``.

        The graph is first assembled on free placeholders (``cln_mem_ph`` and
        ``cln_inp_ph``), then cloned twice with different substitutions, and
        finally wrapped with ``C.as_block`` under the name
        ``'attention_layer'``.

        Shape annotations in the trailing comments (``[#,q][d]`` etc.) are the
        original author's notation; they are not verified by this block.

        Args:
            context: bound to the ``input_ph`` placeholder of the block.
            query: bound to the ``input_mem`` placeholder of the block.
            dim: static size used for the shape ``(dim,)`` of both placeholders.

        Returns:
            The block function produced by ``C.as_block``; its root is the
            splice of ``input_ph``, ``weighted_q`` and ``c2c``.
        """
        input_ph = C.placeholder(shape=(dim, ))
        input_mem = C.placeholder(shape=(dim, ))
        # Two bias-free Dense projections with relu activation, each with
        # self.hidden_dim output units.
        with C.layers.default_options(bias=False, activation=C.relu):
            attn_proj_enc = C.layers.Dense(self.hidden_dim,
                                           init=glorot_uniform(),
                                           input_rank=1,
                                           name="Wqu")
            attn_proj_dec = C.layers.Dense(self.hidden_dim,
                                           init=glorot_uniform(),
                                           input_rank=1)

        inputs_ = attn_proj_enc(input_ph)  # [#,c][d]
        memory_ = attn_proj_dec(input_mem)  # [#,q][d]

        # Free placeholders: the attention graph below is written once against
        # these and later instantiated through clone() with concrete inputs.
        cln_mem_ph = C.placeholder()  # [#,q][?=d]
        cln_inp_ph = C.placeholder()  # [#,c][?=d]
        # Unpack the "input" sequence into a padded tensor (padding value 0)
        # plus its validity mask.
        unpack_inputs, inputs_mask = C.sequence.unpack(
            cln_inp_ph, 0).outputs  # [#][*=c,d] [#][*=c]
        # Repeat the unpacked inputs along the memory sequence axis.
        expand_inputs = C.sequence.broadcast_as(unpack_inputs,
                                                cln_mem_ph)  # [#,q][*=c,d]
        # Level 1 scores: dot product scaled by sqrt(hidden_dim).
        matrix = C.reshape(
            C.times_transpose(cln_mem_ph, expand_inputs) /
            (self.hidden_dim**0.5), (-1, ))  # [#,q][*=c]
        # Positions where the input mask is 0 get -1e30 so that softmax
        # gives them (effectively) zero weight.
        matrix = C.element_select(
            C.sequence.broadcast_as(inputs_mask, cln_mem_ph), matrix,
            C.constant(-1e30))
        # Despite the name, `logits` holds the softmax output (weights).
        logits = C.softmax(matrix, axis=0, name='level 1 weight')  # [#,q][*=c]
        trans_expand_inputs = C.transpose(expand_inputs,
                                          [1, 0])  # [#,q][d,*=c]
        # Weighted sum of the inputs, scaled once more by sqrt(hidden_dim).
        q_over_c = C.reshape(
            C.reduce_sum(logits * trans_expand_inputs, axis=1),
            (-1, )) / (self.hidden_dim**0.5)  # [#,q][d]
        new_q = C.splice(cln_mem_ph, q_over_c)  # [#,q][2*d]
        # Level 2: attention in the opposite direction, built from the
        # transposed level-1 score matrix.
        unpack_matrix, matrix_mask = C.sequence.unpack(
            matrix, 0).outputs  # [#][*=q,*=c] [#][*=q]
        # Turn the input mask into a sequence so it can drive
        # to_sequence_like / gather below.
        inputs_mask_s = C.to_sequence(C.reshape(inputs_mask,
                                                (-1, 1)))  # [#,c'][1]
        trans_matrix = C.to_sequence_like(C.transpose(unpack_matrix, [1, 0]),
                                          inputs_mask_s)  # [#,c'][*=q]
        # Keep only the steps selected by the mask sequence.
        trans_matrix = C.sequence.gather(trans_matrix,
                                         inputs_mask_s)  # [#,c2][*=q]
        # Same -1e30 masking as in level 1, now using the mask of `matrix`.
        trans_matrix = C.element_select(
            C.sequence.broadcast_as(matrix_mask, trans_matrix), trans_matrix,
            C.constant(-1e30))
        logits2 = C.softmax(trans_matrix, axis=0,
                            name='level 2 weight')  # [#,c2][*=c]
        # new_q_mask is not used afterwards.
        unpack_new_q, new_q_mask = C.sequence.unpack(
            new_q, 0).outputs  # [#][*=q,2*d] [#][*=q]
        expand_new_q = C.transpose(
            C.sequence.broadcast_as(unpack_new_q, trans_matrix),
            [1, 0])  # [#,c2][2d,*=q]
        # Weighted sum over new_q, scaled by sqrt(2 * hidden_dim).
        c_over_q = C.reshape(C.reduce_sum(logits2 * expand_new_q, axis=1),
                             (-1, )) / (2 * self.hidden_dim)**0.5  # [#,c2][2d]
        # Re-attach the dynamic axes of the input placeholder to the result.
        c_over_q = C.reconcile_dynamic_axes(c_over_q, cln_inp_ph)

        # Instantiate the graph: memory = projected query side, inputs =
        # projected context side.
        weighted_q = c_over_q.clone(C.CloneMethod.share, {
            cln_mem_ph: memory_,
            cln_inp_ph: inputs_
        })  # [#,c][2d]
        # Instantiate level 1 only, with the projected context in both roles.
        # NOTE(review): q_over_c is annotated [..][d] above while this clone is
        # annotated [..][2d] -- the annotations look inconsistent; confirm.
        c2c = q_over_c.clone(C.CloneMethod.share, {
            cln_mem_ph: inputs_,
            cln_inp_ph: inputs_
        })  # [#,c][2d]

        # NOTE(review): the "2d+2d+2d" size note is the original author's and
        # could not be confirmed from this block (input_ph has shape (dim,)).
        att_context = C.splice(input_ph, weighted_q, c2c)  # 2d+2d+2d

        return C.as_block(att_context, [(input_ph, context),
                                        (input_mem, query)], 'attention_layer',
                          'attention_layer')
예제 #27
0
 def word_level_drop(self, doc, drop_rate=0.08, seed=98052):
     """Zero out whole sequence elements of ``doc`` at random.

     One uniform random number is drawn per sequence element; the element is
     kept (multiplied by 1) when the draw exceeds ``drop_rate`` and zeroed
     otherwise. Kept elements are not rescaled.

     Args:
         doc: sequence tensor, annotated ``[#, c][d]`` by the original author.
         drop_rate: threshold compared against the uniform draw. Assuming the
             default [0, 1) range of ``uniform_like``, this is the drop
             probability -- confirm against the CNTK docs. Defaults to the
             previously hard-coded 0.08.
         seed: seed passed to the random op. Defaults to the previously
             hard-coded 98052.

     Returns:
         ``doc`` multiplied by the per-element 0/1 mask.
     """
     # doc [#, c][d]
     # is_first is used only as a template with one value per sequence element.
     per_step = C.sequence.is_first(doc)
     u = C.random.uniform_like(per_step, seed=seed)
     mask = C.element_select(C.greater(u, drop_rate), 1.0, 0)
     return doc * mask
예제 #28
0
def pad_ctc_labels(ctc_labels, network_output):
    """ Extend a ctc label sequence to the sequence length of the network output.

    Intended for cases where the output sequence length of the network is not
    known while the labels are pre-processed, so padding has to happen inside
    the graph at training time rather than in the data pipeline.

    The padding token is derived from the last element of `ctc_labels`, which
    should be a one-hot encoded vector sequence: every non-zero entry of that
    last element becomes 1 in the padding token, every zero stays 0.

    Example:
        # first example
        labels = C.sequence.input_variable(10)
        network_outputs = model(...)

        padded_labels = pad_ctc_labels(labels, network_outputs)


        # second example
        a = C.sequence.input_variable(3, sequence_axis=ax1)
        b = C.sequence.input_variable(6, sequence_axis=ax2)

        c = pad_ctc_labels(a, b)

        padding_token = np.array([0, 0, 1])
        n1 = [np.array([[0, 2, 0],
                        [2, 0, 0],
                        [0, 0, 2], ]).astype(np.float32), ]

        n2 = [np.random.random((20, 6)).astype(np.float32),
              np.random.random((22, 6)).astype(np.float32),
              np.random.random((24, 6)).astype(np.float32), ]

        n1 = n1 * len(n2)

        results = c.eval({a: n1, b: n2})

        for seq, result in zip(n2, results):

            # rows after the 3 original labels are padding
            for r in result[3:]:
                assert np.all(r == padding_token)

            assert result.shape[0] == seq.shape[0]

    Arguments:
        ctc_labels: one-hot-encoded ctc labels tensor
        network_output: output from model network

    Returns:
        :class:`~cntk.ops.functions.Function`
        the result of `pad_to`: per the original documentation, a sequence
        tensor sharing the sequence axis of network_output, ctc padded

    """
    # The final label element carries the value 2 (ctc training convention).
    final_token = C.sequence.last(ctc_labels)

    # Map every non-zero entry to 1 so the padding token is a plain one-hot vector.
    token = C.element_select(final_token, 1, 0)

    return pad_to(ctc_labels, network_output, padding_token=token)
def create_network(input_vocab_dim, label_vocab_dim):
    """Assemble a sequence-to-sequence encoder/decoder network.

    Args:
        input_vocab_dim: static shape of the source sequence input.
        label_vocab_dim: static shape of the label sequence input; also the
            output size of the final linear layer.

    Returns:
        dict with the keys ``'raw_input'`` and ``'raw_labels'`` (the two
        sequence inputs), ``'ce'`` (cross entropy with softmax), ``'pe'``
        (classification error), ``'ng'`` (clone of the output in which the
        decoder history is fed by the network's own hardmax output instead of
        the labels) and ``'output'`` (the linear output layer ``z``).
    """
    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs, each on its own dynamic sequence axis
    source_axis = Axis('inputAxis')
    target_axis = Axis('labelAxis')
    raw_input = sequence.input(
        shape=(input_vocab_dim), sequence_axis=source_axis, name='raw_input')
    raw_labels = sequence.input(
        shape=(label_vocab_dim), sequence_axis=target_axis, name='raw_labels')

    # Decoder training target: labels without the sentence start token
    # <s> A B C </s> --> A B C </s>
    label_sequence = sequence.slice(raw_labels, 1, 0)
    sentence_start = sequence.first(raw_labels)  # <s>

    # Place <s> at the first step of the label sequence
    is_first_label = sequence.is_first(label_sequence)  # <s> 0 0 0 ...
    sentence_start_scattered = sequence.scatter(sentence_start, is_first_label)

    # Encoder: stacked LSTMs using future_value as the recurrence hook
    enc_h = stabilize(raw_input)
    for _ in range(num_layers):
        enc_h, enc_c = LSTMP_component_with_self_stabilization(
            enc_h.output, hidden_dim, hidden_dim, future_value, future_value)

    # The first encoder step is the thought vector; repeat it along the
    # label sequence so the decoder can read it.
    thought_h = sequence.first(enc_h)
    thought_c = sequence.first(enc_c)
    thought_h_broadcast = sequence.broadcast_as(thought_h, label_sequence)
    thought_c_broadcast = sequence.broadcast_as(thought_c, label_sequence)

    # Decoder
    # alias of label_sequence, so that it can be swapped out in the clone below
    decoder_history_hook = alias(label_sequence, name='decoder_history_hook')

    # First step reads <s>, every later step reads the previous history element
    decoder_input = element_select(
        is_first_label,
        sentence_start_scattered,
        past_value(decoder_history_hook))

    dec_h = stabilize(decoder_input)
    for layer in range(num_layers):
        if layer > 0:
            hook_h = past_value
            hook_c = past_value
        else:
            # The bottom decoder layer starts from the thought vector.
            first_step = sequence.is_first(label_sequence)

            def hook_h(operand):
                return element_select(
                    first_step, thought_h_broadcast, past_value(operand))

            def hook_c(operand):
                return element_select(
                    first_step, thought_c_broadcast, past_value(operand))

        # The decoder cell state is not needed after the layer is built.
        dec_h, _ = LSTMP_component_with_self_stabilization(
            dec_h.output, hidden_dim, hidden_dim, hook_h, hook_c)

    # Linear output layer on top of the stabilized decoder output
    z = linear_layer(stabilize(dec_h), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(
        CloneMethod.share,
        {decoder_history_hook.output: net_output.output})

    return {
        'raw_input': raw_input,
        'raw_labels': raw_labels,
        'ce': ce,
        'pe': errs,
        'ng': ng,
        'output': z,
    }