Example #1
    def bi_sru_layer(self, sru_1, index):
        f_1_f = C.sigmoid(sru_1[0 * self.param2:1 * self.param2] +
                          self.list_bias[0 + index * 4])
        r_1_f = C.sigmoid(sru_1[1 * self.param2:2 * self.param2] +
                          self.list_bias[1 + index * 4])
        c_1_f_r = (1 - f_1_f) * sru_1[2 * self.param2:3 * self.param2]
        dec_c_1_f = C.layers.ForwardDeclaration('f_' + str(index))
        var_c_1_f = C.sequence.delay(dec_c_1_f, initial_state=0, time_step=1)
        nex_c_1_f = var_c_1_f * f_1_f + c_1_f_r
        dec_c_1_f.resolve_to(nex_c_1_f)
        h_1_f = r_1_f * C.tanh(nex_c_1_f) + (
            1 - r_1_f) * sru_1[3 * self.param2:4 * self.param2]

        f_1_b = C.sigmoid(sru_1[4 * self.param2:5 * self.param2] +
                          self.list_bias[2 + index * 4])
        r_1_b = C.sigmoid(sru_1[5 * self.param2:6 * self.param2] +
                          self.list_bias[3 + index * 4])
        c_1_b_r = (1 - f_1_b) * sru_1[6 * self.param2:7 * self.param2]
        dec_c_1_b = C.layers.ForwardDeclaration('b_' + str(index))
        var_c_1_b = C.sequence.delay(dec_c_1_b, time_step=-1)
        nex_c_1_b = var_c_1_b * f_1_b + c_1_b_r
        dec_c_1_b.resolve_to(nex_c_1_b)
        h_1_b = r_1_b * C.tanh(nex_c_1_b) + (
            1 - r_1_b) * sru_1[7 * self.param2:8 * self.param2]

        x = C.splice(h_1_f, h_1_b)
        return x
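
The forward and backward cell states above are threaded through time with CNTK's ForwardDeclaration / sequence.delay / resolve_to pattern. Below is a minimal, self-contained sketch of just that recurrence pattern, assuming CNTK 2.x; the running-sum computation is illustrative and not part of the SRU layer itself.

import cntk as C
import numpy as np

# Running sum over a sequence, built the same way as nex_c_1_f above:
# declare the recurrent value, delay it by one step, then resolve it.
x = C.sequence.input_variable(1)
dec = C.layers.ForwardDeclaration('running_sum')
prev = C.sequence.delay(dec, initial_state=0, time_step=1)  # value at t-1
running_sum = prev + x                                      # s_t = s_{t-1} + x_t
dec.resolve_to(running_sum)

seq = np.array([[1.], [2.], [3.]], dtype=np.float32)
print(running_sum.eval({x: [seq]}))  # cumulative sums 1, 3, 6
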
Example #2
    def grid_lstm_func(m_t_1_k, m_tk_1, c_t_1_k, c_tk_1, x_tk):
        common_11 = C.times(m_t_1_k, W_t_im) + C.times(
            m_tk_1, W_k_im) + C.times(c_t_1_k, W_t_ic) + C.times(
                c_tk_1, W_k_ic)
        i_t_tk = C.sigmoid(C.times(x_tk, W_t_ix) + common_11 + b_t_i)
        i_k_tk = C.sigmoid(C.times(x_tk, W_k_ix) + common_11 + b_k_i)

        common_12 = C.times(m_t_1_k, W_t_fm) + C.times(
            m_tk_1, W_k_fm) + C.times(c_t_1_k, W_t_fc) + C.times(
                c_tk_1, W_k_fc)
        f_t_tk = C.sigmoid(C.times(x_tk, W_t_fx) + common_12 + b_t_f)
        f_k_tk = C.sigmoid(C.times(x_tk, W_k_fx) + common_12 + b_k_f)

        c_t_tk = C.element_times(f_t_tk, c_t_1_k) + C.element_times(
            i_t_tk,
            C.tanh(
                C.times(x_tk, W_t_cx) + C.times(m_t_1_k, W_t_cm) +
                C.times(m_tk_1, W_k_cm) + b_t_c))  # (13)
        c_k_tk = C.element_times(f_k_tk, c_tk_1) + C.element_times(
            i_k_tk,
            C.tanh(
                C.times(x_tk, W_k_cx) + C.times(m_t_1_k, W_t_cm) +
                C.times(m_tk_1, W_k_cm) + b_k_c))  # (14)

        common_15 = C.times(m_t_1_k, W_t_om) + C.times(
            m_tk_1, W_k_om) + C.times(c_t_tk, W_t_oc) + C.times(
                c_k_tk, W_k_oc)
        o_t_tk = C.sigmoid(C.times(x_tk, W_t_ox) + common_15 + b_t_o)
        o_k_tk = C.sigmoid(C.times(x_tk, W_k_ox) + common_15 + b_k_o)

        m_t_tk = C.element_times(o_t_tk, C.tanh(c_t_tk))
        m_k_tk = C.element_times(o_k_tk, C.tanh(c_k_tk))

        return (m_t_tk, m_k_tk, c_t_tk, c_k_tk)
Example #3
    def attention_layer(self, context, query, layer):

        q_processed = C.placeholder(shape=(2*self.hidden_dim,))
        p_processed = C.placeholder(shape=(2*self.hidden_dim,))

        qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

        wq = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        wp = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        wg = C.parameter(shape=(8*self.hidden_dim, 8*self.hidden_dim), init=C.glorot_uniform())
        v = C.parameter(shape=(2*self.hidden_dim, 1), init=C.glorot_uniform())

        # seq[tensor[2d]] p_len x 2d
        wpt = C.reshape(C.times(p_processed, wp), (-1, 2*self.hidden_dim))

        # q_len x 2d
        wqt = C.reshape(C.times(qvw, wq), (-1, 2*self.hidden_dim))
        
        # seq[tensor[q_len]]
        S = C.reshape(C.times(C.tanh(C.sequence.broadcast_as(wqt, p_processed) + wpt), v), (-1))

        qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, p_processed)

        # seq[tensor[q_len]]
        S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
        
        # seq[tensor[q_len]]
        A = C.softmax(S, axis=0)

        # seq[tensor[2d]]
        swap_qvw = C.swapaxes(qvw)
        cq = C.reshape(C.reduce_sum(A * C.sequence.broadcast_as(swap_qvw, A), axis=1), (-1))

        # seq[tensor[8d]]
        uc_concat = C.splice(p_processed, cq, p_processed * cq, cq * cq)
        
        # seq[tensor[8d]]
        gt = C.tanh(C.times(uc_concat, wg))
        
        # seq[tensor[8d]]
        uc_concat_star = gt * uc_concat
 
        # seq[tensor[2d]]
        vp = C.layers.Sequential([
            C.layers.Dropout(self.dropout),
            OptimizedRnnStack(self.hidden_dim, bidirectional=True, 
                use_cudnn=self.use_cudnn, name=layer+'_attention_rnn')])(uc_concat_star)
        
        return C.as_block(
            vp,
            [(p_processed, context), (q_processed, query)],
            'attention_layer',
            'attention_layer')
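
The placeholder-plus-as_block wrapping used here (and in several other examples below) builds the layer body against placeholders and only binds the real inputs at the end. A minimal sketch of just that pattern, assuming CNTK 2.x; the plus_one name is illustrative.

import cntk as C
import numpy as np

def plus_one_block(x_var):
    x_ph = C.placeholder()   # stand-in for the real input
    body = x_ph + 1          # the "layer" body, built against the placeholder
    return C.as_block(body, [(x_ph, x_var)], 'plus_one', 'plus_one')

x = C.input_variable(3)
f = plus_one_block(x)
print(f.eval({x: np.array([[0., 1., 2.]], dtype=np.float32)}))  # [[1. 2. 3.]]
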
Example #4
 def attention(h_enc, h_dec):
     history_axis = h_dec  # we use history_axis wherever we pass this only for the sake of passing its axis
     # TODO: pull this apart so that we can compute the encoder window only once and apply it to multiple decoders
     # --- encoder state window
     (h_enc, h_enc_valid) = PastValueWindow(
         attention_span, axis=attention_axis,
         go_backwards=go_backwards)(h_enc).outputs
     h_enc_proj = attn_proj_enc(h_enc)
     # window must be broadcast to every decoder time step
     h_enc_proj = C.sequence.broadcast_as(h_enc_proj, history_axis)
     h_enc_valid = C.sequence.broadcast_as(h_enc_valid, history_axis)
     # --- decoder state
     # project decoder hidden state
     h_dec_proj = attn_proj_dec(h_dec)
     tanh_out = C.tanh(h_dec_proj +
                       h_enc_proj)  # (attention_span, attention_dim)
     u = attn_proj_tanh(tanh_out)  # (attention_span, 1)
     u_masked = u + (
         h_enc_valid - 1
     ) * 50  # logzero-out the unused elements for the softmax denominator  TODO: use a less arbitrary number than 50
     attention_weights = C.softmax(
         u_masked, axis=attention_axis)  #, name='attention_weights')
     attention_weights = Label('attention_weights')(attention_weights)
     # now take weighted sum over the encoder state vectors
     h_att = C.reduce_sum(C.element_times(h_enc_proj, attention_weights),
                          axis=attention_axis)
     h_att = attn_final_stab(h_att)
     return h_att
Example #5
def build_graph(self_attention,
                self_penalty,
                embeded_dim=60,
                h_dim=150,
                d_a=350,
                r=30):

    with C.layers.default_options(init=C.xavier()):
        embeded = C.layers.Embedding(embeded_dim)(x)
        embeded = C.layers.Stabilizer()(embeded)

        H = create_birnn(C.layers.GRU(h_dim), C.layers.GRU(h_dim))(embeded)

        if self_attention:
            Ws1 = C.parameter(shape=(d_a, 2 * h_dim), name="Ws1")
            Ws2 = C.parameter(shape=(r, d_a), name="Ws2")
            A = C.softmax(C.times(Ws2, C.tanh(C.times_transpose(Ws1, H))))
            H = C.times(A, H)  # the M in the paper

            if self_penalty:
                I = C.constant(np.eye(r), dtype=np.float32)
                P = C.times_transpose(A, A) - I  # r*r
                p = C.reduce_sum(C.abs(C.element_times(
                    P, P)))  # frobenius norm **2

        y_ = C.layers.Dense(200, activation=C.ops.relu)(H)

        # y_pre = C.layers.Dense(num_labels, activation = None)(y_)
        def selfAtt(x):
            y_pre = C.layers.Dense(num_labels, activation=None)(y_)
            return y_pre

        if self_penalty:
            selfAtt.p = p
        return selfAtt
Example #6
def attention_weight(h_enc, h_dec, inputs_dim):
    enc = C.layers.Dense(inputs_dim, name='out_start')(h_enc)
    dec = C.sequence.broadcast_as(
        C.layers.Dense(inputs_dim, name='out_start')(h_dec), enc)
    att_weight = C.layers.Dense(1, name='out_start')(C.tanh(enc + dec))
    att_weight = C.sequence.softmax(att_weight)
    return att_weight
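
A hedged usage sketch for this helper, assuming CNTK 2.x: h_enc is a sequence of encoder states while h_dec is a single (non-sequence) decoder state, so its Dense projection can be broadcast along the encoder's dynamic axis. The dimensions and input names below are illustrative.

import cntk as C
import numpy as np

hidden = 4
h_enc = C.sequence.input_variable(hidden)   # one state per encoder step
h_dec = C.input_variable(hidden)            # a single decoder state
weights = attention_weight(h_enc, h_dec, inputs_dim=hidden)

enc_states = np.random.randn(5, hidden).astype(np.float32)   # 5 encoder steps
dec_state = np.random.randn(hidden).astype(np.float32)
print(weights.eval({h_enc: [enc_states], h_dec: [dec_state]}))  # 5 weights summing to ~1
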
Example #7
    def simi_attention(self, input, memory):
        '''
        return:
        memory weighted vectors over input [#,c][d]
        weight
        '''
        input_ph = C.placeholder()  # [#,c][d]
        mem_ph = C.placeholder()  # [#,q][d]

        input_dense = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
        mem_dense = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
        bias = C.Parameter(shape=(2 * self.hidden_dim, ), init=0.0)
        weight_dense = Dense(1, bias=False, input_rank=1)

        proj_inp = input_dense(input_ph)  # [#,c][d]
        proj_mem = mem_dense(mem_ph)  # [#,q][d]
        unpack_memory, mem_mask = C.sequence.unpack(
            proj_mem, 0).outputs  # [#][*=q, d] [#][*=q]
        expand_mem = C.sequence.broadcast_as(unpack_memory,
                                             proj_inp)  # [#,c][*=q,d]
        expand_mask = C.sequence.broadcast_as(mem_mask, proj_inp)  # [#,c][*=q]
        matrix = C.reshape(weight_dense(C.tanh(proj_inp + expand_mem + bias)),
                           (-1, ))  # [#,c][*=q]
        matrix = C.element_select(expand_mask, matrix, -1e30)
        logits = C.softmax(matrix, axis=0)  # [#,c][*=q]
        weight_mem = C.reduce_sum(C.reshape(logits, (-1, 1)) * expand_mem,
                                  axis=0)  # [#,c][d]
        weight_mem = C.reshape(weight_mem, (-1, ))

        return C.as_block(C.combine(weight_mem, logits), [(input_ph, input),
                                                          (mem_ph, memory)],
                          'simi_attention', 'simi_attention')
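
The key step above is C.sequence.unpack, which converts a variable-length sequence into a padded tensor plus a validity mask that can then be broadcast onto another dynamic axis. A minimal sketch of just that call, assuming CNTK 2.x.

import cntk as C
import numpy as np

q = C.sequence.input_variable(2)
unpacked, mask = C.sequence.unpack(q, padding_value=0).outputs

data = np.array([[1., 2.], [3., 4.], [5., 6.]], dtype=np.float32)
result = C.combine([unpacked, mask]).eval({q: [data]})
print(result)   # padded values of shape (q_len, 2) plus a mask of ones
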
Example #8
 def new_attention(encoder_hidden_state, decoder_hidden_state):
     # encode_hidden_state: [#, e] [h]
     # decoder_hidden_state: [#, d] [H]
     unpacked_encoder_hidden_state, valid_mask = C.sequence.unpack(encoder_hidden_state, padding_value=0).outputs
     # unpacked_encoder_hidden_state: [#] [*=e, h]
     # valid_mask: [#] [*=e]
     projected_encoder_hidden_state = C.sequence.broadcast_as(attn_proj_enc(unpacked_encoder_hidden_state), decoder_hidden_state)
     # projected_encoder_hidden_state: [#, d] [*=e, attention_dim]
     broadcast_valid_mask = C.sequence.broadcast_as(C.reshape(valid_mask, (1,), 1), decoder_hidden_state)
     # broadcast_valid_mask: [#, d] [*=e]
     projected_decoder_hidden_state = attn_proj_dec(decoder_hidden_state)
     # projected_decoder_hidden_state: [#, d] [attention_dim]
     tanh_output = C.tanh(projected_decoder_hidden_state + projected_encoder_hidden_state)
     # tanh_output: [#, d] [*=e, attention_dim]
     attention_logits = attn_proj_tanh(tanh_output)
     # attention_logits = [#, d] [*=e, 1]
     minus_inf = C.constant(-1e+30)
     masked_attention_logits = C.element_select(broadcast_valid_mask, attention_logits, minus_inf)
     # masked_attention_logits = [#, d] [*=e]
     attention_weights = C.softmax(masked_attention_logits, axis=0)
     attention_weights = Label('attention_weights')(attention_weights)
     # attention_weights = [#, d] [*=e]
     attended_encoder_hidden_state = C.reduce_sum(attention_weights * C.sequence.broadcast_as(unpacked_encoder_hidden_state, attention_weights), axis=0)
     # attended_encoder_hidden_state = [#, d] [1, h]
     output = attn_final_stab(C.reshape(attended_encoder_hidden_state, (), 0, 1))
     # output = [#, d], [h]
     return output
Example #9
def test_tanh_2():
    cntk_op = C.tanh([0.])
    cntk_ret = cntk_op.eval()

    ng_op, _ = CNTKImporter().import_model(cntk_op)
    ng_ret = ng.transformers.make_transformer().computation(ng_op)()

    assert np.isclose(cntk_ret, ng_ret).all()
Example #10
    def rnet_output_layer(self, attention_context, query):

        att_context = C.placeholder(shape=(2*self.hidden_dim,))
        q_processed = C.placeholder(shape=(2*self.hidden_dim,))

        wuq = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        whp = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        wha = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        v = C.parameter(shape=(2*self.hidden_dim, 1), init=C.glorot_uniform())
        bias = C.parameter(shape=(2*self.hidden_dim), init=C.glorot_uniform())

        whp_end = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        wha_end = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        v_end = C.parameter(shape=(2*self.hidden_dim, 1), init=C.glorot_uniform())

        # sequence[tensor[1]] q_len x 1
        s0 = C.times(C.tanh(C.times(q_processed, wuq) + bias), v)
        a0 = C.sequence.softmax(s0)
        rQ = C.sequence.reduce_sum(a0 * q_processed)
        
        # sequence[tensor[1]] plen x 1 
        ts = C.reshape(C.times(C.tanh(
            C.times(att_context, whp) + C.times(C.sequence.broadcast_as(rQ, att_context), wha)), v), (-1))

        # sequence[tensor[1]]
        ta = C.sequence.softmax(ts)

        # sequence[2d] 1 x 2d
        c0 = C.reshape(C.sequence.reduce_sum(ta * att_context), (2*self.hidden_dim))
        
        # sequence[tensor[2d]]
        ha1 = C.layers.blocks.GRU(2*self.hidden_dim)(rQ, c0)

        # sequence[tensor[1]] plen x 1
        s1 = C.reshape(C.times(C.tanh(C.times(att_context, whp_end) + C.times(
            C.sequence.broadcast_as(ha1, att_context), wha_end)), v_end), (-1))

        # sequence[tensor[1]] plen x 1
        a1 = C.sequence.softmax(s1)

        return C.as_block(
            C.combine([ts, s1]),
            [(att_context, attention_context), (q_processed, query)],
            'output_layer',
            'output_layer')
Example #11
def test_tanh_3():
    cntk_op = C.tanh(
        [-0.9, -0.8, -0.7, -0.6, -0.5, -0.4, -0.3, -0.2, -0.1, 0.])
    cntk_ret = cntk_op.eval()

    ng_op, _ = CNTKImporter().import_model(cntk_op)
    ng_ret = ng.transformers.make_transformer().computation(ng_op)()

    assert np.isclose(cntk_ret, ng_ret).all()
Example #12
    def LSTMCell(x, y, dh, dc):
        '''LightLSTM Cell'''

        b = C.parameter(shape=(4 * cell_dim), init=0)
        W = C.parameter(shape=(input_dim, 4 * cell_dim), init=glorot_uniform())
        H = C.parameter(shape=(cell_dim, 4 * cell_dim), init=glorot_uniform())

        # projected contribution from input x, hidden, and bias
        proj4 = b + C.times(x, W) + C.times(dh, H)

        it_proj = C.slice(proj4, -1, 0 * cell_dim, 1 * cell_dim)
        bit_proj = C.slice(proj4, -1, 1 * cell_dim, 2 * cell_dim)
        ft_proj = C.slice(proj4, -1, 2 * cell_dim, 3 * cell_dim)
        ot_proj = C.slice(proj4, -1, 3 * cell_dim, 4 * cell_dim)

        it = C.sigmoid(it_proj)  # input gate
        bit = it * C.tanh(bit_proj)

        ft = C.sigmoid(ft_proj)  # forget gate
        bft = ft * dc

        ct = bft + bit
        ot = C.sigmoid(ot_proj)  # output gate
        ht = ot * C.tanh(ct)

        # projected contribution from input y, hidden, and bias
        proj4_2 = b + C.times(y, W) + C.times(ht, H)

        it_proj_2 = C.slice(proj4_2, -1, 0 * cell_dim, 1 * cell_dim)
        bit_proj_2 = C.slice(proj4_2, -1, 1 * cell_dim, 2 * cell_dim)
        ft_proj_2 = C.slice(proj4_2, -1, 2 * cell_dim, 3 * cell_dim)
        ot_proj_2 = C.slice(proj4_2, -1, 3 * cell_dim, 4 * cell_dim)

        it_2 = C.sigmoid(it_proj_2)  # input gate
        bit_2 = it_2 * C.tanh(bit_proj_2)

        ft_2 = C.sigmoid(ft_proj_2)  # forget gate
        bft_2 = ft_2 * ct

        ct2 = bft_2 + bit_2
        ot_2 = C.sigmoid(ot_proj_2)  # output gate
        ht2 = ot_2 * C.tanh(ct2)
        return (ht, ct, ht2, ct2)
Example #13
def lstm_func(output_dim, cell_dim, x, input_dim, prev_state_h, prev_state_c):

    # input gate (t)
    it_w = C.times(C.parameter((cell_dim, input_dim)), x)
    it_b = C.parameter((cell_dim))
    it_h = C.times(C.parameter((cell_dim, output_dim)), prev_state_h)
    it_c = C.parameter((cell_dim)) * prev_state_c
    it = C.sigmoid((it_w + it_b + it_h + it_c), name='it')

    # applied to tanh of input
    bit_w = C.times(C.parameter((cell_dim, input_dim)), x)
    bit_h = C.times(C.parameter((cell_dim, output_dim)), prev_state_h)
    bit_b = C.parameter((cell_dim))
    bit = it * C.tanh(bit_w + (bit_h + bit_b))

    # forget-me-not gate (t)
    ft_w = C.times(C.parameter((cell_dim, input_dim)), x)
    ft_b = C.parameter((cell_dim))
    ft_h = C.times(C.parameter((cell_dim, output_dim)), prev_state_h)
    ft_c = C.parameter((cell_dim)) * prev_state_c
    ft = C.sigmoid((ft_w + ft_b + ft_h + ft_c), name='ft')

    # applied to cell(t-1)
    bft = ft * prev_state_c

    # c(t) = sum of both
    ct = bft + bit

    # output gate
    ot_w = C.times(C.parameter((cell_dim, input_dim)), x)
    ot_b = C.parameter((cell_dim))
    ot_h = C.times(C.parameter((cell_dim, output_dim)), prev_state_h)
    ot_c = C.parameter((cell_dim)) * prev_state_c
    ot = C.sigmoid((ot_w + ot_b + ot_h + ot_c), name='ot')

    # applied to tanh(cell(t))
    ht = ot * C.tanh(ct)

    # return cell value and hidden state
    return ct, ht
Example #14
def lstm_func(output_dim, cell_dim, x, input_dim, prev_state_h, prev_state_c):
        
    # input gate (t)
    it_w = C.times(x,C.parameter((input_dim, cell_dim)))
    it_b = C.parameter((1,cell_dim))
    it_h = C.times(prev_state_h,C.parameter((output_dim, cell_dim)))
    it_c = C.parameter((1,cell_dim)) * prev_state_c        
    it = C.sigmoid((it_w + it_b + it_h + it_c), name='it')

    # applied to tanh of input    
    bit_w = C.times(x,C.parameter((input_dim,cell_dim)))
    bit_h = C.times(prev_state_h,C.parameter((output_dim,cell_dim)))
    bit_b = C.parameter((1,cell_dim))
    bit = it * C.tanh(bit_w + (bit_h + bit_b))
        
    # forget-me-not gate (t)
    ft_w = C.times(x, C.parameter((input_dim,cell_dim)))
    ft_b = C.parameter((1,cell_dim))
    ft_h = C.times(prev_state_h,C.parameter((output_dim,cell_dim)))
    ft_c = C.parameter((1,cell_dim)) * prev_state_c        
    ft = C.sigmoid((ft_w + ft_b + ft_h + ft_c), name='ft')

    # applied to cell(t-1)
    bft = ft * prev_state_c
        
    # c(t) = sum of both
    ct = bft + bit
        
    # output gate
    ot_w = C.times(x, C.parameter((input_dim,cell_dim)))
    ot_b = C.parameter((1,cell_dim))
    ot_h = C.times(prev_state_h,C.parameter((output_dim,cell_dim)))
    ot_c = C.parameter((1,cell_dim)) * prev_state_c        
    ot = C.sigmoid((ot_w + ot_b + ot_h + ot_c), name='ot')
       
    # applied to tanh(cell(t))
    ht = ot * C.tanh(ct)
        
    # return cell value and hidden state
    return ct, ht
Example #15
    def createNetwork(self, inputEmb, preHidden, preMem):
        WX = C.times(inputEmb, self.W) + self.Wb
        UH = C.times(preHidden, self.U) + self.Ub

        I = C.sigmoid(
            C.slice(WX, -1, 0, self.hiddenSize) +
            C.slice(UH, -1, 0, self.hiddenSize))
        O = C.sigmoid(
            C.slice(WX, -1, self.hiddenSize, self.hiddenSize * 2) +
            C.slice(UH, -1, self.hiddenSize, self.hiddenSize * 2))
        F = C.sigmoid(
            C.slice(WX, -1, self.hiddenSize * 2, self.hiddenSize * 3) +
            C.slice(UH, -1, self.hiddenSize * 2, self.hiddenSize * 3))
        N = C.tanh(
            C.slice(WX, -1, self.hiddenSize * 3, self.hiddenSize * 4) +
            C.slice(UH, -1, self.hiddenSize * 3, self.hiddenSize * 4))

        NI = C.element_times(N, I)
        FM = C.element_times(F, preMem)
        CurMem = NI + FM
        CurH = C.element_times(C.tanh(CurMem), O)
        return (CurH, CurMem)
Example #16
    def unit(dh, dc, x):
        ''' dh: out_dim, dc:4096, x:input_dim'''
        proj4 = b + C.times(x, W) + C.times(dh, H)
        it_proj  = proj4[0:1*stacked_dim]  # split along stack_axis
        bit_proj = proj4[1*stacked_dim: 2*stacked_dim]
        ft_proj  = proj4[2*stacked_dim: 3*stacked_dim]
        ot_proj  = proj4[3*stacked_dim: 4*stacked_dim]

        it = C.sigmoid(it_proj)        # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * C.tanh(bit_proj)              # applied to tanh of input network

        ft = C.sigmoid (ft_proj)        # forget-me-not gate(t)
        bft = ft * dc                                 # applied to cell(t-1)

        ct = bft + bit                                # c(t) is sum of both

        ot = C.sigmoid (ot_proj)    # output gate(t)
        ht = ot * C.tanh(ct)                     # applied to tanh(cell(t))

        c = ct                                        # cell value
        h = ht
        proj_h = C.times(h, proj_W) # out_dim
        return (proj_h, c) 
Example #17
    def createNetwork(self, inputEmb, preHidden, preMem=None):
        WrX = C.times(inputEmb, self.Wr) + self.Wrb
        UrH = C.times(preHidden, self.Ur)
        R = C.sigmoid(WrX + UrH)

        WzX = C.times(inputEmb, self.Wz) + self.Wzb
        UzH = C.times(preHidden, self.Uz)
        Z = C.sigmoid(WzX + UzH)

        UH = C.times(preHidden, self.U) + self.Ub
        UHR = C.element_times(UH, R)

        WX = C.times(inputEmb, self.W) + self.Wb
        HTilde = C.tanh(WX + UHR)

        CurH = C.element_times(HTilde, 1 - Z) + C.element_times(preHidden, Z)
        return (CurH, None)
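
For comparison, the same gated update is available as a built-in cell. A minimal sketch wiring C.layers.GRU through C.layers.Recurrence, assuming CNTK 2.x; this is an independent illustration, not the class wiring used above.

import cntk as C
import numpy as np

x = C.sequence.input_variable(4)
gru_seq = C.layers.Recurrence(C.layers.GRU(8))(x)   # hidden state for every step

data = np.random.randn(5, 4).astype(np.float32)     # a 5-step sequence
out = gru_seq.eval({x: [data]})
print(out[0].shape)   # (5, 8): one 8-dimensional hidden state per step
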
Example #18
def attention_pooling(inputs, inputs_mask, inputs_weights, decode, decode_weights, keys):
    """
    inputs: shape=(n, dim)
    inputs_weights: shape=(dim, dim)
    decode: shape=(1, dec_dim)
    decode_weights: shape=(dec_dim, dim)
    keys: shape=(dim, 1)
    
    """
    w_in = C.times(inputs, inputs_weights)  #shape=(n, dim)
    w_dec = C.times(decode, decode_weights) #shape=(1, dim)
    S = C.tanh(w_in + C.sequence.broadcast_as(w_dec, w_in)) #shape=(n, dim)
    S = C.element_select(inputs_mask, S, C.constant(-1e+30))
    S = C.times(S, keys) #shape=(n)
    S = C.ops.sequence.softmax(S, name="softmax")
    attention = C.reduce_sum(inputs * S, axis=0)
    return attention
Example #19
def attention_pooling(inputs, inputs_mask, inputs_weights, decode,
                      decode_weights, keys):
    """
    inputs: shape=(n, dim)
    inputs_weights: shape=(dim, dim)
    decode: shape=(1, dec_dim)
    decode_weights: shape=(dec_dim, dim)
    keys: shape=(dim, 1)
    
    """
    w_in = C.times(inputs, inputs_weights)  #shape=(n, dim)
    w_dec = C.times(decode, decode_weights)  #shape=(1, dim)
    S = C.tanh(w_in + C.sequence.broadcast_as(w_dec, w_in))  #shape=(n, dim)
    S = C.element_select(inputs_mask, S, C.constant(-1e+30))
    S = C.times(S, keys)  #shape=(n)
    S = C.ops.sequence.softmax(S, name="softmax")
    attention = C.reduce_sum(inputs * S, axis=0)
    return attention
Example #20
    def createNetwork(self, inputEmb, preHidden):
        WX = C.times(inputEmb, self.W) + self.Wb
        UH = C.times(preHidden, self.U) + self.Ub

        R = C.sigmoid(
            C.slice(WX, -1, 0, self.hiddenSize) +
            C.slice(UH, -1, 0, self.hiddenSize))
        Z = C.sigmoid(
            C.slice(WX, -1, self.hiddenSize, self.hiddenSize * 2) +
            C.slice(UH, -1, self.hiddenSize, self.hiddenSize * 2))

        UHR = C.element_times(
            C.slice(UH, -1, self.hiddenSize * 2, self.hiddenSize * 3), R)
        HTilde = C.tanh(
            C.slice(WX, -1, self.hiddenSize * 2, self.hiddenSize * 3) + UHR)

        CurH = C.element_times(HTilde, 1 - Z) + C.element_times(preHidden, Z)
        return CurH
Example #21
def tanh(x, name=''):
    '''
    Computes the element-wise tanh of `x`.

    The output tensor has the same shape as `x`.
    
    Example:
        >>> C.eval(C.tanh([[1,2],[3,4]]))
        [array([[[ 0.761594,  0.964028],
                 [ 0.995055,  0.999329]]])]
    
    Args:
        x: numpy array or any :class:`cntk.Function` that outputs a tensor
        name (str): the name of the node in the network
    Returns:
        :class:`cntk.Function`
    '''
    from cntk import tanh
    x = sanitize_input(x)
    return tanh(x, name).output()    
Example #23
 def func(x_var):
     x = C.placeholder()
     WT = C.Parameter((dim, dim), init=transform_weight_initializer,
                      name=name + '_WT')
     bT = C.Parameter(dim, init=transform_bias_initializer,
                      name=name + '_bT')
     WU = C.Parameter((dim, dim), init=update_weight_initializer,
                      name=name + '_WU')
     bU = C.parameter(dim, init=update_bias_initializer, name=name + '_bU')
     transform_gate = C.sigmoid(C.times(x, WT, name=name + '_T') + bT)
     update = C.tanh(C.times(x, WU, name=name + '_U') + bU)
     return C.as_block(update * transform_gate + (1 - transform_gate) * x,
                       [(x, x_var)], 'SingleInner', 'SingleInner' + name)
Example #24
    def createAttentionNet(self, hiddenSrc, curHiddenTrg, srcLength):
        srcHiddenSize = Config.SrcHiddenSize * 2
        hsw = C.times(hiddenSrc, self.Was)
        htw = C.times(curHiddenTrg, self.Wat)
        hst = C.reshape(
            hsw, shape=(srcLength, Config.BatchSize * Config.TrgHiddenSize)
        ) + C.reshape(htw, shape=(1, Config.BatchSize * Config.TrgHiddenSize))
        hstT = C.reshape(C.tanh(hst),
                         shape=(srcLength * Config.BatchSize,
                                Config.TrgHiddenSize))
        attScore = C.reshape(C.times(hstT, self.Wav),
                             shape=(srcLength, Config.BatchSize))
        maskOut = (C.slice(self.maskMatrixSrc, 0, 0, srcLength) - 1) * 99999999
        nAttScore = attScore + maskOut
        attProb = C.reshape(C.softmax(nAttScore, axis=0),
                            shape=(srcLength, Config.BatchSize, 1))
        attVector = hiddenSrc * attProb
        contextVector = C.reduce_sum(C.reshape(
            attVector, shape=(srcLength, Config.BatchSize * srcHiddenSize)),
                                     axis=0)
        contextVector = C.reshape(contextVector,
                                  shape=(1, Config.BatchSize, srcHiddenSize))

        return (contextVector, attProb)
Example #25
def test_Tanh(tmpdir):
    model = C.tanh([[1,2],[3,4]])
    verify_no_input(model, tmpdir, 'Tanh_0')
Example #26
def test_Tanh(tmpdir, dtype):
    with C.default_options(dtype=dtype):
        model = C.tanh(np.array([[1, 2], [3, 4]]).astype(dtype))
        verify_no_input(model, tmpdir, 'Tanh_0')
Example #27
def test_Tanh(tmpdir, dtype):
    with C.default_options(dtype = dtype):
        model = C.tanh(np.array([[1,2],[3,4]]).astype(dtype))
        verify_no_input(model, tmpdir, 'Tanh_0')
Example #28
#%%
def true_density(z):
    z1, z2 = z[0], z[1]

    w1 = lambda x: C.sin(2 * np.pi * x/4)
    u = 0.5 * C.square((z2 - w1(z1))/0.4)
    dummy = C.ones_like(u) * 1e7

    # u = C.element_select(C.less_equal(z1,4), u, dummy)
    cond = C.less_equal(z1,4)
    u = C.element_select(cond, u, dummy) # u = cond*u + (1-cond)*dummy

    return C.exp(-u)

#%%
h = lambda x: C.tanh(x)
h_prime = lambda x: 1 - C.square(C.tanh(x))

base_dist = MultivariateNormalDiag(loc=[0., 0.], scale_diag=[1., 1.])
z_0 = C.input_variable(base_dist.size(), name='sampled')
z_prev = z_0
sum_log_det_jacob = 0.

initializer = C.initializer.uniform(1)
for i in range(K):
    u = C.parameter((2), name='u', init=initializer)
    w = C.parameter((2), name='w', init=initializer)
    b = C.parameter((1), name='b', init=initializer)

    psi = h_prime(C.dot(w, z_prev)+b) * w
    det_jacob = C.abs(1 + C.dot(u, psi))
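
The loop sketches a planar normalizing flow: each step applies f(z) = z + u * h(w·z + b) and accumulates log|det J| = log|1 + u·psi(z)|, where psi(z) = h'(w·z + b) * w. Below is an independent numpy sketch of a single step under those standard definitions; it is not a continuation of the truncated CNTK loop above.

import numpy as np

def planar_step(z, u, w, b):
    a = w @ z + b                        # scalar pre-activation w.z + b
    f_z = z + u * np.tanh(a)             # transformed sample
    psi = (1.0 - np.tanh(a) ** 2) * w    # h'(a) * w
    log_det = np.log(np.abs(1.0 + u @ psi))
    return f_z, log_det

z = np.array([0.3, -1.2])
u = np.array([0.5, 0.1])
w = np.array([1.0, -0.4])
b = 0.2
print(planar_step(z, u, w, b))
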
Example #29
def LSTM(shape,
         _inf,
         cell_shape=None,
         use_peepholes=False,
         init=_default_initializer,
         init_bias=0,
         enable_self_stabilization=False):  # (x, (h, c))
    has_projection = cell_shape is not None
    has_aux = False

    if has_aux:
        UntestedBranchError("LSTM, has_aux option")
    if enable_self_stabilization:
        UntestedBranchError("LSTM, enable_self_stabilization option")

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape

    #stack_axis = -1  #
    stack_axis = 0  # BUGBUG: should be -1, i.e. the fastest-changing one, to match BS
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[0]
    cell_shape_list[stack_axis] = stacked_dim * 4
    cell_shape_stacked = tuple(
        cell_shape_list)  # patched dims with stack_axis duplicated 4 times

    # parameters
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')  # a bias
    W = Parameter(_inf.shape + cell_shape_stacked, init=init,
                  name='W')  # input
    A = Parameter(_inf.shape + cell_shape_stacked, init=init,
                  name='A') if has_aux else None  # aux input (optional)
    H = Parameter(shape + cell_shape_stacked, init=init,
                  name='H')  # hidden-to-hidden
    Ci = Parameter(
        cell_shape, init=init, name='Ci'
    ) if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Cf = Parameter(
        cell_shape, init=init, name='Cf'
    ) if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Co = Parameter(
        cell_shape, init=init, name='Co'
    ) if use_peepholes else None  # cell-to-hidden {note: applied elementwise}

    Wmr = ParameterTensor(
        cell_shape + shape, init=init, init_value_scale=init_value_scale
    ) if has_projection else None  # final projection

    Sdh = Stabilizer(_inf=_inf.with_shape(
        shape)) if enable_self_stabilization else Identity(
            _inf=_inf.with_shape(shape))
    Sdc = Stabilizer(_inf=_inf.with_shape(
        cell_shape)) if enable_self_stabilization else Identity(
            _inf=_inf.with_shape(cell_shape))
    Sct = Stabilizer(_inf=_inf.with_shape(
        cell_shape)) if enable_self_stabilization else Identity(
            _inf=_inf.with_shape(cell_shape))
    Sht = Stabilizer(_inf=_inf.with_shape(
        shape)) if enable_self_stabilization else Identity(
            _inf=_inf.with_shape(shape))

    def create_hc_placeholder():
        return (Placeholder(_inf=_inf.with_shape(shape), name='hPh'),
                Placeholder(_inf=_inf.with_shape(cell_shape),
                            name='cPh'))  # (h, c)

    # parameters to model function
    x = Placeholder(_inf=_inf, name='lstm_block_arg')
    prev_state = create_hc_placeholder()

    # formula of model function
    dh, dc = prev_state

    dhs = Sdh(dh)  # previous values, stabilized
    dcs = Sdc(dc)
    # note: input does not get a stabilizer here, user is meant to do that outside

    # projected contribution from input(s), hidden, and bias
    proj4 = b + times(x, W) + times(dhs, H) + times(aux, A) if has_aux else \
            b + times(x, W) + times(dhs, H)

    it_proj = slice(proj4, stack_axis, 0 * stacked_dim,
                    1 * stacked_dim)  # split along stack_axis
    bit_proj = slice(proj4, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
    ft_proj = slice(proj4, stack_axis, 2 * stacked_dim, 3 * stacked_dim)
    ot_proj = slice(proj4, stack_axis, 3 * stacked_dim, 4 * stacked_dim)

    # add peephole connection if requested
    def peep(x, c, C):
        return x + C * c if use_peepholes else x

    it = sigmoid(peep(it_proj, dcs, Ci))  # input gate(t)
    bit = it * tanh(bit_proj)  # applied to tanh of input network

    ft = sigmoid(peep(ft_proj, dcs, Cf))  # forget-me-not gate(t)
    bft = ft * dc  # applied to cell(t-1)

    ct = bft + bit  # c(t) is sum of both

    ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
    ht = ot * tanh(ct)  # applied to tanh(cell(t))

    c = ct  # cell value
    h = times(Sht(ht), Wmr) if has_projection else \
        ht

    _name_node(h, 'h')
    if _trace_layers:
        _log_node(h)  # this looks right
    _name_node(c, 'c')

    # TODO: figure out how to do scoping, and also rename all the apply... to expression
    apply_x_h_c = combine([h, c])
    # return to caller a helper function to create placeholders for recurrence
    apply_x_h_c.create_placeholder = create_hc_placeholder
    _name_and_extend_Function(apply_x_h_c, 'LSTM')
    return apply_x_h_c
Example #30
def test_Tanh(tmpdir):
    model = C.tanh([[1, 2], [3, 4]])
    verify_no_input(model, tmpdir, 'Tanh_0')
Example #31
def test_tanh():
    assert_cntk_ngraph_isclose(C.tanh([-2, -1., 0., 1., 2.]))
    assert_cntk_ngraph_isclose(C.tanh([0.]))
    assert_cntk_ngraph_isclose(
        C.tanh([-0.9, -0.8, -0.7, -0.6, -0.5, -0.4, -0.3, -0.2, -0.1, 0.]))
Example #32
def attention_weight(h_enc, h_dec, inputs_dim):
    enc = C.layers.Dense(inputs_dim, name='out_start')(h_enc)
    dec = C.sequence.broadcast_as(C.layers.Dense(inputs_dim, name='out_start')(h_dec), enc)
    att_weight = C.layers.Dense(1, name='out_start')(C.tanh(enc+dec))
    att_weight = C.sequence.softmax(att_weight)
    return att_weight      
Example #33
 def createDecoderInitNetwork(self, srcSentEmb):
     WIS = C.times(srcSentEmb, self.WI) + self.WIb
     return C.tanh(WIS)
Example #34
 def inner(a):
     return a * C.tanh(C.softplus(a))
Example #35
def dcgan_generator(h):
    with C.layers.default_options(init=C.normal(0.02),
                                  pad=True,
                                  bias=False,
                                  map_rank=1,
                                  use_cntk_engine=True):
        h = C.reshape(h, (-1, 1, 1))

        h = ConvolutionTranspose2D((4, 4),
                                   1024,
                                   pad=False,
                                   strides=1,
                                   output_shape=(4, 4))(h)
        h = BatchNormalization()(h)
        h = C.relu(h)

        h = ConvolutionTranspose2D(
            (5, 5),
            512,
            strides=2,
            output_shape=(img_height // 32, img_width // 32))(h)
        h = BatchNormalization()(h)
        h = C.relu(h)

        h = ConvolutionTranspose2D(
            (5, 5),
            256,
            strides=2,
            output_shape=(img_height // 16, img_width // 16))(h)
        h = BatchNormalization()(h)
        h = C.relu(h)

        h = ConvolutionTranspose2D(
            (5, 5),
            128,
            strides=2,
            output_shape=(img_height // 8, img_width // 8))(h)
        h = BatchNormalization()(h)
        h = C.relu(h)

        h = ConvolutionTranspose2D(
            (5, 5),
            64,
            strides=2,
            output_shape=(img_height // 4, img_width // 4))(h)
        h = BatchNormalization()(h)
        h = C.relu(h)

        h = ConvolutionTranspose2D(
            (5, 5),
            32,
            strides=2,
            output_shape=(img_height // 2, img_width // 2))(h)
        h = BatchNormalization()(h)
        h = C.relu(h)

        h = ConvolutionTranspose2D((5, 5),
                                   3,
                                   strides=2,
                                   bias=True,
                                   output_shape=(img_height, img_width))(h)
        h = C.tanh(h)

        return h
Example #36
def gelu(x):
    return 0.5 * x * (
        1 + C.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * C.pow(x, 3))))
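
A minimal sketch of applying this tanh-based GELU approximation to a CNTK input, assuming CNTK 2.x; the input shape and sample values are illustrative.

import cntk as C
import numpy as np

x = C.input_variable(1)
g = gelu(x)
print(g.eval({x: np.array([[-2.], [0.], [2.]], dtype=np.float32)}))
# roughly [-0.045, 0.0, 1.955]: negative inputs are squashed toward zero,
# positive inputs pass through almost unchanged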