def gated_attention_gru_layer(self, context, query):
        q_processed = C.placeholder(shape=(2*self.hidden_dim,))
        c_processed = C.placeholder(shape=(2*self.hidden_dim,))

        #gate weight
        Wg = C.parameter(shape=(4*self.hidden_dim, 4*self.hidden_dim))
        att_gru = C.layers.GRU(2*self.hidden_dim)
        attention_model = C.layers.AttentionModel(self.hidden_dim, name='attention_model')
        
        @C.Function
        def out_func0(att_input, enc_input):
            enc_input2 = enc_input
            @C.Function
            def gru_with_attention(dh, x):
                c_att = attention_model(att_input, x)
                x = C.splice(x, c_att)
                x = C.element_times(x, C.sigmoid(C.times(x, Wg)))
                return att_gru(dh, x)
            att_context = C.layers.Recurrence(gru_with_attention)(enc_input2)
            return att_context
        att_context = out_func0(q_processed, c_processed)
        return C.as_block(
            att_context,
            [(c_processed, context), (q_processed, query)],
            'gated_attention_gru_layer',
            'gated_attention_gru_layer')
Example #2
def lightlstm(input_dim, cell_dim):
    x = C.placeholder(name='x')
    dh = C.placeholder(name='dh')
    dc = C.placeholder(name='dc')
    x1 = C.slice(x, -1, input_dim * 0, input_dim * 1)
    x2 = C.slice(x, -1, input_dim * 1, input_dim * 2)

    def LSTMCell(x, y, dh, dc):
        '''LightLSTM Cell'''

        b = C.parameter(shape=(4 * cell_dim), init=0)
        W = C.parameter(shape=(input_dim, 4 * cell_dim), init=glorot_uniform())
        H = C.parameter(shape=(cell_dim, 4 * cell_dim), init=glorot_uniform())

        # projected contribution from input x, hidden, and bias
        proj4 = b + C.times(x, W) + C.times(dh, H)

        it_proj = C.slice(proj4, -1, 0 * cell_dim, 1 * cell_dim)
        bit_proj = C.slice(proj4, -1, 1 * cell_dim, 2 * cell_dim)
        ft_proj = C.slice(proj4, -1, 2 * cell_dim, 3 * cell_dim)
        ot_proj = C.slice(proj4, -1, 3 * cell_dim, 4 * cell_dim)

        it = C.sigmoid(it_proj)  # input gate
        bit = it * C.tanh(bit_proj)

        ft = C.sigmoid(ft_proj)  # forget gate
        bft = ft * dc

        ct = bft + bit
        ot = C.sigmoid(ot_proj)  # output gate
        ht = ot * C.tanh(ct)

        # projected contribution from input y, hidden, and bias
        proj4_2 = b + C.times(y, W) + C.times(ht, H)

        it_proj_2 = C.slice(proj4_2, -1, 0 * cell_dim, 1 * cell_dim)
        bit_proj_2 = C.slice(proj4_2, -1, 1 * cell_dim, 2 * cell_dim)
        ft_proj_2 = C.slice(proj4_2, -1, 2 * cell_dim, 3 * cell_dim)
        ot_proj_2 = C.slice(proj4_2, -1, 3 * cell_dim, 4 * cell_dim)

        it_2 = C.sigmoid(it_proj_2)  # input gate
        bit_2 = it_2 * C.tanh(bit_proj_2)

        ft_2 = C.sigmoid(ft_proj_2)  # forget gate
        bft_2 = ft_2 * ct

        ct2 = bft_2 + bit_2
        ot_2 = C.sigmoid(ot_proj_2)  # output gate
        ht2 = ot_2 * C.tanh(ct2)
        return (ht, ct, ht2, ct2)

    Cell = LSTMCell(x1, x2, dh, dc)

    actualDh = past_value(Cell[2])
    actualDc = past_value(Cell[3])

    Cell[0].replace_placeholders(
        {dh: actualDh.output, dc: actualDc.output})
    return C.splice(Cell[0], Cell[2], axis=-1)
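
A minimal usage sketch for the lightlstm block above, with illustrative dimensions. It assumes the example's implicit imports resolve and that only the x placeholder remains free after the internal replace_placeholders call:

import cntk as C

model = lightlstm(input_dim=4, cell_dim=3)
seq = C.sequence.input_variable(8)    # two interleaved streams of width input_dim each
out = model(seq)                      # binds the x placeholder; per-step output width is 2 * cell_dim
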
    def input_layer(self,cgw,cnw,cc,qgw,qnw,qc):
        cgw_ph = C.placeholder()
        cnw_ph = C.placeholder()
        cc_ph  = C.placeholder()
        qgw_ph = C.placeholder()
        qnw_ph = C.placeholder()
        qc_ph  = C.placeholder()

        input_chars = C.placeholder(shape=(1,self.word_size,self.c_dim))
        input_glove_words = C.placeholder(shape=(self.wg_dim,))
        input_nonglove_words = C.placeholder(shape=(self.wn_dim,))

        # we need to reshape because GlobalMaxPooling/reduce_max retains a trailing singleton dimension (see the short sketch after this method)
        # todo GlobalPooling/reduce_max should have a keepdims default to False
        embedded = C.splice(
            C.reshape(self.charcnn(input_chars), self.convs),
            self.embed()(input_glove_words, input_nonglove_words), name='splice_embed')
        processed = C.layers.Sequential([For(range(2), lambda: OptimizedRnnStack(self.hidden_dim, bidirectional=True, use_cudnn=self.use_cudnn, name='input_rnn'))])(embedded)
        
        qce = C.one_hot(qc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
        cce = C.one_hot(cc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
        
        q_processed = processed.clone(C.CloneMethod.share, {input_chars:qce, input_glove_words:qgw_ph, input_nonglove_words:qnw_ph})
        c_processed = processed.clone(C.CloneMethod.share, {input_chars:cce, input_glove_words:cgw_ph, input_nonglove_words:cnw_ph})
        return C.as_block(
            C.combine([c_processed, q_processed]),
            [(cgw_ph, cgw),(cnw_ph, cnw),(cc_ph, cc),(qgw_ph, qgw),(qnw_ph, qnw),(qc_ph, qc)],
            'input_layer',
            'input_layer')
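
The comment in the method above notes that GlobalMaxPooling/reduce_max keeps a trailing singleton dimension; the following standalone sketch (with illustrative shapes) shows that behavior and the reshape workaround:

import cntk as C

v = C.input_variable((3, 4))
m = C.reduce_max(v, axis=1)       # the reduced axis is kept as a singleton: static shape (3, 1)
assert m.shape == (3, 1)
m2 = C.reshape(m, (3,))           # drop the singleton explicitly
assert m2.shape == (3,)
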
 def output_layer(self, query, match_context):
     q_processed = C.placeholder(shape=(2*self.hidden_dim,))
     mat_context = C.placeholder(shape=(2*self.hidden_dim,))
     
     #output layer
     r_q = question_pooling(q_processed, 2*self.hidden_dim) #shape n*(2*self.hidden_dim)
     p1_logits = attention_weight(mat_context, r_q, 2*self.hidden_dim)
     attention_pool = C.sequence.reduce_sum(p1_logits * mat_context)
     state = C.layers.GRU(2*self.hidden_dim)(attention_pool, r_q)
     p2_logits = attention_weight(mat_context, state, 2*self.hidden_dim)
     
     @C.Function
     def start_ave_point(p1_logits, p2_logits, point):
         @C.Function
         def start_ave(last, now):
             now = now + last - last  # no-op that forces a dependency on the previous state so Recurrence can form the loop
             new_start = now * C.sequence.gather(p2_logits, point)
             point = C.sequence.future_value(point)
             return new_start
         start_logits_ave = C.layers.Recurrence(start_ave)(p1_logits)
         return start_logits_ave
     point = C.sequence.is_first(p1_logits)
     point = C.layers.Sequential([For(range(2), lambda: C.layers.Recurrence(C.plus))])(point)
     point = C.greater(C.constant(16), point)
     start_logits_ave = start_ave_point(p1_logits, p2_logits, point)
     
     @C.Function
     def end_ave_point(p1_logits, p2_logits, point):
         @C.Function
         def end_ave(last, now):
             now = now + last - last  # no-op that forces a dependency on the previous state so Recurrence can form the loop
             new_end = now * C.sequence.gather(p2_logits, point)
             point = C.sequence.past_value(point)
             return new_end
         end_logits_ave = C.layers.Recurrence(end_ave, go_backwards=True)(p2_logits)
         return end_logits_ave
     point = C.sequence.is_last(p1_logits)
     point = C.layers.Sequential([For(range(2), lambda: C.layers.Recurrence(C.plus, go_backwards=True))])(point)
     point = C.greater(C.constant(16),point)
     end_logits_ave = end_ave_point(p1_logits, p2_logits, point)
     
     start_logits = seq_hardmax(start_logits_ave)
     end_logits = seq_hardmax(end_logits_ave)
     '''
     start_logits = seq_hardmax(p1_logits)
     end_logits = seq_hardmax(p2_logits)
     '''
     return C.as_block(
         C.combine([start_logits, end_logits]),
         [(q_processed, query), (mat_context, match_context)],
         'output_layer',
         'output_layer')
Example #5
def test_get_data_type():
    pa32 = C.parameter(init=np.asarray(2, dtype=np.float32))
    pa64 = C.parameter(init=np.asarray(2, dtype=np.float64))
    pl = C.placeholder(shape=(2))
    c = C.constant(value=3.0)
    n32 = AA(1, dtype=np.float32)
    n64 = AA(1, dtype=np.float64)

    assert get_data_type(pa32) == np.float32
    assert get_data_type(pa32, n32) == np.float32
    assert get_data_type(n32, n32) == np.float32
    assert get_data_type(n32, n64) == np.float64
    assert get_data_type(pl, n64) == np.float64
    assert get_data_type(pl, n32) == np.float32
    assert get_data_type(pl, pl) is None
    # variable's type shall take precedence over provided data
    assert get_data_type(pa32, n64) == np.float32
    assert get_data_type(pa64, n64) == np.float64
    assert get_data_type(pa32, pl, n64) == np.float32
    assert get_data_type(pa64, pl, n64) == np.float64
    
    assert get_data_type(np.float64(1)) == np.float64
    assert get_data_type(np.float32(1)) == np.float32
    assert get_data_type(np.int64(1)) == np.float32  # special case for cntk
    assert get_data_type(1) == np.float32
    assert get_data_type(1.0) == np.float32
Example #6
def test_clone_with_slice():
    i1 = C.input_variable((2,2), name='i1')
    i2 = C.input_variable((2,2), name='i2')
    x = C.splice(i1, i2, axis=0)
    W = C.constant(1, (4,1), name='W')
    y = C.convolution(W, x)
    assert(y.shape == (4,2))

    from ..functions import CloneMethod
    x1 = C.input_variable((2,1), name='x1')
    x2 = C.input_variable((2,1), name='x2')
    p1 = C.placeholder()
    p2 = C.placeholder()
    y_cloned = y.clone('clone', {i1:p1, i2:p2})
    y2 = y_cloned(x1, x2)
    assert(y2.shape == (4,1))
Example #7
def test_op_sequence_reduce_sum(device_id, precision):
    a = C.sequence.input_variable(shape=(1,), dtype=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]), needs_gradient=True, name='a')

    sequence_sum_a_plus_sequence_sum_a = C.sequence.reduce_sum(a) + C.sequence.reduce_sum(a)

    a_data = [AA([[2]], dtype=PRECISION_TO_TYPE[precision]),
              AA([[2], [3]], dtype=PRECISION_TO_TYPE[precision]),
              AA([[2], [3], [4]], dtype=PRECISION_TO_TYPE[precision])]

    actual_grad = sequence_sum_a_plus_sequence_sum_a.grad({a: a_data}, [a])
    assert np.array_equal(actual_grad[0], np.asarray([[2.]]))
    assert np.array_equal(actual_grad[1], np.asarray([[2.], [2.]]))
    assert np.array_equal(actual_grad[2], np.asarray([[2.], [2.], [2.]]))

    res = sequence_sum_a_plus_sequence_sum_a.eval({a: a_data})
    assert np.array_equal(res[0], np.asarray([4.]))
    assert np.array_equal(res[1], np.asarray([10.]))
    assert np.array_equal(res[2], np.asarray([18.]))

    # Verify that calling sequence reduction on a placeholder with known
    # shape but unknown dynamic axes does not result in a problem
    p = C.placeholder(shape=(1,))
    r = C.sequence.reduce_sum(p)
    r.replace_placeholder(a)

    res = r.eval({a: a_data})
    assert np.array_equal(res[0], np.asarray([2.]))
    assert np.array_equal(res[1], np.asarray([5.]))
    assert np.array_equal(res[2], np.asarray([9.]))
Example #8
def test_block_with_unused_outputs():
    p1 = C.placeholder()
    p3 = C.placeholder()
    func1 = C.as_block(p1 + 1, [(p1, p3)], 'plus_func_1')
    p2 = C.placeholder()
    p4 = C.placeholder()
    func2 = C.as_block(p2 + 1, [(p2, p4)], 'plus_func_2')
    p5 = C.placeholder()
    func3 = C.as_block(C.combine([func2]), [(p4, p5)], 'empty_block')
    input_var1 = C.input_variable(shape=())
    input_var2 = C.input_variable(shape=())
    block = C.as_block(C.combine([func1, func3]), [(p3, input_var1), (p5, input_var2)], 'multi_output_block')

    eval_root = C.combine([block.outputs[0]])
    result = eval_root.eval({input_var1 : np.asarray([3], dtype=np.float32), input_var2 : np.asarray([-3], dtype=np.float32)})
    assert np.array_equal(result, [ 4.])
Example #9
def test_sequence_unpack_basic(device_id):
    dev = cntk_device(device_id)

    # Unpack a placeholder
    p = C.placeholder()
    p_unpacked_outputs = C.sequence.unpack(p, padding_value=0).outputs
    assert len(p_unpacked_outputs) == 2

    x = C.input_variable((C.FreeDimension, 2, 3), is_sparse=False)
    x_seq_lens = C.input_variable(())
    x_seq = C.to_sequence(x, x_seq_lens)
    x_seq_unpacked = C.sequence.unpack(x_seq, padding_value=-1000.0)
    x_seq_unpacked_value_output = x_seq_unpacked.outputs[0]
    x_seq_unpacked_mask_output = x_seq_unpacked.outputs[1]
    assert len(x_seq_unpacked_value_output.dynamic_axes) == 1
    assert x_seq_unpacked_value_output.shape == (C.FreeDimension, 2, 3)

    seq1_data = [[[0, 1, 1], [0, 1, 0]], [[1, 0, 0], [1, 0, 1]]]
    seq2_data = [[0, 1, 1], [1, 1, 0]]
    x_data = [np.asarray(seq1_data, dtype=np.float32), np.asarray([seq2_data, [[-100.0, -100.0, -100.0], [-100.0, -100.0, -100.0]]], dtype=np.float32)]
    x_seq_lens_data = np.asarray([2, 1], dtype=np.float32)
    result = x_seq_unpacked.eval({x : x_data, x_seq_lens : x_seq_lens_data}, device=dev)
    value = result[x_seq_unpacked_value_output]
    mask = result[x_seq_unpacked_mask_output]
    assert np.array_equal(value[0], seq1_data)
    assert np.array_equal(value[1], [seq2_data, [[-1000.0, -1000.0, -1000.0], [-1000.0, -1000.0, -1000.0]]])
    assert np.array_equal(mask, [[1, 1], [1, 0]])
 def matching_attention_layer(self, attention_context):
     att_context = C.placeholder(shape=(2*self.hidden_dim,))
     #matching layer
     matching_model = C.layers.AttentionModel(attention_dim=self.hidden_dim, name='attention_model')
     #gate weight
     Wg = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim))
     #gru
     att_gru = C.layers.GRU(self.hidden_dim)
     @C.Function
     def out_func1(att_input, enc_input):
         enc_input2 = enc_input
         @C.Function
         def bigru_with_match(dh, x):
             c_att = matching_model(att_input, dh)
             x = C.splice(x, c_att)
             x = C.element_times(x, C.sigmoid(C.times(x, Wg)))
             return att_gru(dh, x)
         return C.splice(C.layers.Recurrence(bigru_with_match)(enc_input2),
                     C.layers.Recurrence(bigru_with_match, go_backwards=True)(enc_input2),
                     name="bigru_with_match")
     match_context = out_func1(att_context, att_context)
     return C.as_block(
         match_context,
         [(att_context, attention_context)],
         'matching_attention_layer',
         'matching_attention_layer')
Example #11
def test_recurrance_with_udf_without_layers():
    name = "SimpleUdf"
    def udf(a):
        return C.user_function(SimpleUdf(a, name=name))

    # input variable and the data.
    x = C.sequence.input_variable(needs_gradient=True,shape=(2,))
    x0 = np.reshape(np.arange(16.0, dtype=np.float32),(2,4,2))
    print(x0)

    # creates a recurrent loop.
    p = C.placeholder(shape=(2,))
    past= C.sequence.past_value(p)
    z = udf(x) * udf(past)  + C.Parameter((2,), init=[1,1])
    z.replace_placeholders({p:z.outputs[0]})

    #C.logging.graph.plot(z, "recurrent.pdf")
    out = z.eval({x:x0})
    print(out)
    expected_out = [np.array([1,1,3,4,13,21,79,148], dtype=np.float32).reshape(4,2),np.array([1,1,11,12,133,157,1863,2356], dtype=np.float32).reshape(4,2)]
    assert np.array_equal(out, expected_out)

    gradient, result= z.grad({x: x0}, wrt=[x], outputs=[z.output])
    print(result)
    assert np.array_equal(result, expected_out)

    expected_grad = [np.array([0,0,29,41,21,32,13,21], dtype=np.float32).reshape(4,2),np.array([0,0,181,209,165,192,133,157], dtype=np.float32).reshape(4,2)]
    print(gradient)
    assert np.array_equal(gradient, expected_grad)
Example #12
def BinaryConvolution(operand,
                      filter_shape,
                      num_filters=1,
                      channels = 1,
                      init=C.glorot_uniform(),
                      pad=False,
                      strides=1,
                      bias=True,
                      init_bias=0,
                      op_name='BinaryConvolution', name=''):
    """ arguments:
            operand: tensor to convolve
            filter_shape: tuple indicating filter size
            num_filters: number of filters to use 
            channels: number of incoming channels
            init: type of initialization to use for weights
    """
    kernel_shape = (num_filters, channels) + filter_shape
    W = C.parameter(shape=kernel_shape, init=init, name="filter")

    binary_convolve_operand_p = C.placeholder(operand.shape, operand.dynamic_axes, name="operand")
    binary_convolve = C.convolution(CustomMultibit(W, 1), CustomMultibit(binary_convolve_operand_p, 1), auto_padding=[False, pad, pad], strides=[strides])
    r = C.as_block(binary_convolve, [(binary_convolve_operand_p, operand)], 'binary_convolve')

    bias_shape = (num_filters, 1, 1)
    b = C.parameter(shape=bias_shape, init=init_bias, name="bias")
    r = r + b

    # apply learnable param relu
    P = C.parameter(shape=r.shape, init=init, name="prelu")
    r = C.param_relu(P, r)
    return r
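
A hedged call sketch for the helper above. CustomMultibit is an external user function from the surrounding binary-net sample and is not defined here; the input shape and layout are illustrative only:

import cntk as C

img = C.input_variable((3, 32, 32), name='img')    # assumed channels x height x width layout
z = BinaryConvolution(img, filter_shape=(3, 3), num_filters=32, channels=3, pad=True, strides=1)
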
Example #13
def test_recurrence_shape_inference():
    i = C.sequence.input_variable((2,))
    p = C.placeholder()
    p_past = C.sequence.past_value(p)
    p_past_plus_i = p_past + i

    p_past_plus_i.replace_placeholder(p_past_plus_i.output)
    assert p_past_plus_i.output.shape == (2,)
Example #14
    def returnFunction():
        left_val = [[10,2]]
        right_val = [[2],[3]]

        p = placeholder(shape=(1,2))
        op = times(p, right_val)
        c = constant(left_val)

        return op.replace_placeholders({p:c})
Example #15
def create_model():
    x = C.placeholder()
    with C.layers.default_options(initial_state=0.1):
        e = C.layers.Embedding(emb_dim, name='embed')(x)
        negRnn = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=True)(e)
        posRnn = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)(e)
        h = C.splice(posRnn, negRnn)
        out = C.layers.Dense(num_labels, name='classify')(h)
        return out
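
A possible way to bind the model factory above to a real input; vocab_size is an assumption of this sketch, and emb_dim, hidden_dim and num_labels are expected to be defined by the surrounding script:

import cntk as C

vocab_size = 943                                     # illustrative vocabulary size
z = create_model()                                   # graph rooted at a placeholder
x = C.sequence.input_variable(vocab_size, is_sparse=True, name='query')
z = z(x)                                             # bind the placeholder to the input sequence
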
Example #16
    def convolution(operand):
        
        bcv_operand_p = C.placeholder(
            operand.shape, operand.dynamic_axes, name="operand")
        
        bcv = C.convolution(
                    CustomMultibit(W, 1), 
                    CustomMultibit(bcv_operand_p, 1), 
                    auto_padding=[False, pad, pad], 
                    strides=[strides])

        return  C.as_block(bcv, [(bcv_operand_p, operand)], name)
Example #17
def test_clone_with_function_in_substitution_map():
    input_dim = 1
    proj_dim = 2
    x = C.input_variable((input_dim,))
    w = C.parameter((input_dim, proj_dim))
    t = C.times(x, w)
    b = C.parameter((proj_dim))
    t_plus_b = t + b

    p = C.placeholder()
    just_b = t_plus_b.clone('clone', {t : p})
    t_plus_b_clone = just_b.clone('share', {p : t})
Example #18
def test_ext_eval_7_placeholder():
    dim = 4
    p = C.parameter(shape=(dim,), init=10, name='p')
    i = C.sequence.input_variable(dim, needs_gradient=True, name='i_var')
    pl = C.placeholder()
    m = C.user_function(MyPlus(pl, C.constant(3)))
    z = m + p
    z.replace_placeholder(i)

    input_data = np.random.rand(dim)
    result = z.eval([input_data])
    assert np.allclose(result[0][0], input_data + 3 + 10)
Example #19
def test_outputs():
    fwd_state = C.placeholder(name="placeholder")
    prev_state = C.sequence.past_value(fwd_state, name="prev_state")
    z = C.abs(prev_state, "abs")
    output = z.output
    z = z.replace_placeholders({fwd_state: z.output})

    fwd_state = None
    prev_state = None
    z = None

    for arg in output.owner.arguments:
        print("Argument name: {}, argument owner name {}".format(arg.name, arg.owner.name))
Example #20
def test_replace_save_restoreinplace_constant(tmpdir):
    from cntk import placeholder

    c1 = C.constant(value=0)
    c2 = C.constant(value=0)
    c3 = C.constant(value=0)
    p1 = placeholder(name="placeholder1")
    p2 = placeholder(name="placeholder2")
    result = (c1 * p1) * c2 + c3 + p2

    p3 = placeholder(name="placeholder3")
    p4 = placeholder(name="placeholder4")
    block = C.ops.as_block(result, [(p2, p4), (p1, p3)], "test_block")

    arg_map = { p3: C.constant(value=0) }
    block.replace_placeholders(arg_map)

    model_filename = str(tmpdir / 'simple_block.mod')
    block.save(model_filename)
    block.restore(model_filename)

    assert len(block.placeholders) == 1
def test_free_static_axis_in_recurrence():
    x = C.sequence.input_variable((C.FreeDimension, 2))
    out_placeholder = C.placeholder()
    out_past = C.sequence.past_value(out_placeholder)
    wh = C.parameter(init=np.asarray([[2, 5], [1, 3]], dtype=np.float32))
    wx = C.parameter(init=np.asarray([[1, 4], [2, 5]], dtype=np.float32))
    out = C.times(x, wx) + C.times(out_past, wh)
    out.replace_placeholders({out_placeholder : out})
    
    x_data = np.asarray([[0.5, 0.2], [-0.7, 1.2]], np.float32)
    w_grad, out_val = out.grad({x : x_data}, wrt=[wh, wx], outputs=[out])
    assert np.allclose(out_val, [[[[0.9, 3.], [1.7, 3.2]]]])
    assert np.allclose(w_grad[wx], [[-0.2, -0.2], [1.4, 1.4]])
def LocalResponseNormalization(k, n, alpha, beta, name=''):
    x = C.placeholder(name='lrn_arg')
    x2 = C.square(x)
    # reshape to insert a fake singleton reduction dimension after the 3rd axis (the channel axis); note that Python and BrainScript use reversed axis orders
    x2s = C.reshape(x2, (1, C.InferredDimension), 0, 1)
    W = C.constant(alpha/(2*n+1), (1,2*n+1,1,1), name='W')
    # 3D convolution with a filter that is larger than 1 only along the 3rd axis, and that does not reduce, since the reduction dimension is the fake singleton
    y = C.convolution(W, x2s)
    # reshape back to remove the fake singleton reduction dimension
    b = C.reshape(y, C.InferredDimension, 0, 2)
    den = C.exp(beta * C.log(k + b))
    apply_x = C.element_divide(x, den)
    return apply_x
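
A brief usage sketch for the function above, with an illustrative 3 x 32 x 32 input. The returned graph is rooted at the 'lrn_arg' placeholder, so applying it to an input binds that placeholder:

import cntk as C

lrn = LocalResponseNormalization(k=1.0, n=2, alpha=1e-4, beta=0.75)
img = C.input_variable((3, 32, 32), name='img')
y = lrn(img)    # replaces the 'lrn_arg' placeholder with img
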
Example #23
def test_replace_placeholder_s():
    left_val = [[10,2]]
    right_val = [[2],[3]]

    p = C.placeholder(shape=(1,2))
    c = C.constant(left_val)

    op = C.times(p, right_val)
    op.replace_placeholders({p:c})
    assert op.eval() == 26

    op = C.times(p, right_val)
    op.replace_placeholder(c)
    assert op.eval() == 26
Example #24
def test_squeeze(operand_shape, axis, device_id, precision):
    operand = np.arange(np.prod(operand_shape)).reshape(operand_shape).astype('f')
    expected = np.squeeze(operand, axis)

    expected_forward = [expected]
    expected_backward = {
        'arg': [np.ones_like(operand)],
    }

    from .. import squeeze, placeholder
    p = C.placeholder()
    squeeze_with_axis = C.squeeze(p, axis)
    _test_unary_op(precision, device_id, squeeze_with_axis, operand,
                   expected_forward, expected_backward)
def BNBiRecurrence(fwd, bwd, test_dual=True): # special version that calls one shared BN instance at two places, for testing BN param tying
    F = Recurrence(fwd)
    G = Recurrence(bwd, go_backwards=True)
    BN = BatchNormalization(normalization_time_constant=-1)
    x = placeholder()
    # The following code applies the same BN function object twice.
    # When running whole-corpus estimation of means/vars, this must lead to the same estimate
    # although it is estimated on twice the amount of data (each sample is used twice).
    # Hence, this is the test that proves that the parameter sharing works.
    x1 = BN(x)
    x2 = BN(x) if test_dual else x1
    # In double precision with corpus aggregation, these lead to the same result.
    apply_x = splice (F(x1), G(x2))
    return apply_x
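
A hedged usage sketch for the helper above; the LSTM cells and dimensions are assumptions. Both directions run through the single shared BatchNormalization instance, which is the point of the test:

import cntk as C
from cntk.layers import LSTM

hidden_dim = 150                                        # illustrative
layer = BNBiRecurrence(LSTM(hidden_dim), LSTM(hidden_dim), test_dual=True)
seq = C.sequence.input_variable(300)                    # illustrative feature width
h = layer(seq)                                          # per-step output of width 2 * hidden_dim
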
Example #26
def test_expand_dims(operand_shape, axis, device_id, precision):
    if axis is None or isinstance(axis, tuple):
        return
    operand = np.arange(np.prod(operand_shape)).reshape(operand_shape).astype('f')
    expected = np.expand_dims(operand, axis)

    expected_forward = [expected]
    expected_backward = {
        'arg': [np.ones_like(operand)],
    }

    from .. import expand_dims, placeholder
    p = C.placeholder()
    expand_dims_with_axis = C.expand_dims(p, axis)
    _test_unary_op(precision, device_id, expand_dims_with_axis, operand,
                   expected_forward, expected_backward)
Example #27
def test_op_as_block(input_shape, output_shape, expected_output_shape, device_id, precision):
    # We test using reshape as the operation that is encapsulated in a block

    dev = cntk_device(device_id)
    from cntk.internal import sanitize_dtype_cntk
    from .. import reshape, element_times, as_block

    num_tensor_elements = np.multiply.reduce(input_shape)
    input_tensor = np.arange(num_tensor_elements, dtype=PRECISION_TO_TYPE[precision]).reshape(input_shape)
    input_reshaped = input_tensor.reshape(expected_output_shape)

    a_placeholder = C.placeholder()
    a_reshaped = reshape(a_placeholder, output_shape)

    const_input_reshaped = constant(input_reshaped, device=dev)
    block_composite = element_times(a_reshaped, const_input_reshaped, name='element_times_inside_block')
    
    a = C.input_variable(shape=input_tensor.shape,
                dtype=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
                needs_gradient=True,
                name='a')

    input_op = as_block(block_composite, [(a_placeholder, a)], 'reshape_test_op', block_instance_name='reshape_test_op')

    # Test some basic methods related to blocks
    assert input_op.is_composite
    block_primitive = input_op.root_function.find_by_name('reshape_test_op')
    assert block_primitive.name == 'reshape_test_op'
    assert block_primitive.is_primitive
    assert block_primitive.is_block
    element_times_inside_block = block_primitive.block_root.find_by_name('element_times_inside_block')
    assert element_times_inside_block.name == 'element_times_inside_block'
    assert element_times_inside_block.is_primitive
    block_arguments_map = block_primitive.block_arguments_mapping
    assert len(block_arguments_map) == 1

    expected_forward = [input_reshaped**2]
    expected_backward = {a: input_tensor}

    # create batch
    input_tensor.shape = (1,) + input_tensor.shape

    forward_input = {a: input_tensor}

    unittest_helper(input_op,
                    forward_input, expected_forward, expected_backward,
                    device_id=device_id, precision=precision)
Example #28
def create_model(base_model_file, feature_node_name, last_hidden_node_name, num_classes, input_features, freeze=False):
    # Load the pretrained classification net and find nodes
    base_model   = load_model(base_model_file)
    feature_node = find_by_name(base_model, feature_node_name)
    last_node    = find_by_name(base_model, last_hidden_node_name)

    # Clone the desired layers with fixed weights
    cloned_layers = combine([last_node.owner]).clone(
        CloneMethod.freeze if freeze else CloneMethod.clone,
        {feature_node: placeholder(name='features')})

    # Add new dense layer for class prediction
    feat_norm  = input_features - Constant(114)
    cloned_out = cloned_layers(feat_norm)
    z          = Dense(num_classes, activation=None, name=new_output_node_name) (cloned_out)

    return z
Example #29
    def create_trainer(use_sparse, device):
        a = C.sequence.input_variable(shape=input_shape, is_sparse=use_sparse, name='input')
        w_i = C.parameter(init=w_init_i, device=dev)
        a_projection = times(a, w_i)

        p_o = C.placeholder()
        h = C.sequence.past_value(p_o)
        w_h = C.parameter(init=w_init_h, device=dev)
        h_projection = times(h, w_h)        
        z = a_projection + h_projection
        z = z.replace_placeholder(z)
        z = reshape(z, label_shape)

        l = C.sequence.input_variable(shape=label_shape, is_sparse=use_sparse, name='label')
        loss = cross_entropy_with_softmax(z, l, axis=-1)
        trainer = C.Trainer(z, (loss, None), C.sgd(z.parameters, lr=C.learning_rate_schedule(0.7, C.UnitType.sample)))
        return (a, l, w_i, w_h, trainer)
Example #30
def test_op_broadcast_as_in_loop(device_id):

    a_data = [AA([1]), AA([2]), AA([3])]
    b_data = [AA([[2]]), AA([[2], [3]]), AA([[2], [3], [4]])]

    a = C.input_variable(shape=(1,), name='a')
    b = C.sequence.input_variable(shape=(1,), name='b')

    out_placeholder = C.placeholder()
    out_delayed = C.sequence.past_value(out_placeholder, time_step=5)
    out_delayed_plus_b = out_delayed + b
    out = C.sequence.broadcast_as(a, out_delayed_plus_b)
    out.replace_placeholder(out)

    res = out.eval({a: a_data, b: b_data})
    assert np.array_equal(res[0], np.asarray([[1.]]))
    assert np.array_equal(res[1], np.asarray([[2.], [2.]]))
    assert np.array_equal(res[2], np.asarray([[3.], [3.], [3.]]))
def ForwardDeclaration(name='forward_declaration'):
    '''
    Helper for recurrent network declarations.
    Returns a placeholder variable with an added method ``resolve_to()`` to be called
    at the end to close the loop.
    This is used for explicit graph building with recurrent connections.

    Example:
     >>> # create a graph with a recurrent loop to compute the length of an input sequence
     >>> from cntk.layers.typing import *
     >>> x = C.input_variable(**Sequence[Tensor[2]])
     >>> ones_like_input = C.sequence.broadcast_as(1, x)  # sequence of scalar ones of same length as input
     >>> out_fwd = ForwardDeclaration()  # placeholder for the state variables
     >>> out = C.sequence.past_value(out_fwd, initial_state=0) + ones_like_input
     >>> out_fwd.resolve_to(out)
     >>> length = C.sequence.last(out)
     >>> x0 = np.reshape(np.arange(6,dtype=np.float32),(1,3,2))
     >>> x0
         array([[[ 0.,  1.],
                 [ 2.,  3.],
                 [ 4.,  5.]]], dtype=float32)
     >>> length(x0)
         array([ 3.], dtype=float32)

    Returns:
        :class:`~cntk.variables.Variable`: a placeholder variable with a method ``resolve_to()`` that resolves it to another variable
    '''
    var_fwd = placeholder(name=name)

    def resolve_to(var):
        #from cntk import cntk_py
        #if isinstance(var, cntk_py.Function):
        #    var.replace_placeholders({var_fwd: var.output})  # resolves var_fwd := var
        #else:
        # TODO: ^^ should no longer be needed; delete once confirmed
        var.owner.replace_placeholders({var_fwd:
                                        var})  # resolves var_fwd := var

    var_fwd.resolve_to = resolve_to
    return var_fwd
def BinaryConvolution(operand,
                      filter_shape,
                      num_filters=1,
                      channels=1,
                      init=C.glorot_uniform(),
                      pad=False,
                      strides=1,
                      bias=True,
                      init_bias=0,
                      op_name='BinaryConvolution',
                      name=''):
    """ arguments:
            operand: tensor to convolve
            filter_shape: tuple indicating filter size
            num_filters: number of filters to use 
            channels: number of incoming channels
            init: type of initialization to use for weights
    """
    kernel_shape = (num_filters, channels) + filter_shape
    W = C.parameter(shape=kernel_shape, init=init, name="filter")

    binary_convolve_operand_p = C.placeholder(operand.shape,
                                              operand.dynamic_axes,
                                              name="operand")
    binary_convolve = C.convolution(CustomMultibit(W, 1),
                                    CustomMultibit(binary_convolve_operand_p,
                                                   1),
                                    auto_padding=[False, pad, pad],
                                    strides=[strides])
    r = C.as_block(binary_convolve, [(binary_convolve_operand_p, operand)],
                   'binary_convolve')

    bias_shape = (num_filters, 1, 1)
    b = C.parameter(shape=bias_shape, init=init_bias, name="bias")
    r = r + b

    # apply learnable param relu
    P = C.parameter(shape=r.shape, init=init, name="prelu")
    r = C.param_relu(P, r)
    return r
Example #33
def test_recurrance_with_udf_without_layers():
    name = "SimpleUdf"

    def udf(a):
        return C.user_function(SimpleUdf(a, name=name))

    # input variable and the data.
    x = C.sequence.input_variable(needs_gradient=True, shape=(2, ))
    x0 = np.reshape(np.arange(16.0, dtype=np.float32), (2, 4, 2))
    print(x0)

    # creates a recurrent loop.
    p = C.placeholder(shape=(2, ))
    past = C.sequence.past_value(p)
    z = udf(x) * udf(past) + C.Parameter((2, ), init=[1, 1])
    z.replace_placeholders({p: z.outputs[0]})

    #C.logging.graph.plot(z, "recurrent.pdf")
    out = z.eval({x: x0})
    print(out)
    expected_out = [
        np.array([1, 1, 3, 4, 13, 21, 79, 148],
                 dtype=np.float32).reshape(4, 2),
        np.array([1, 1, 11, 12, 133, 157, 1863, 2356],
                 dtype=np.float32).reshape(4, 2)
    ]
    assert np.array_equal(out, expected_out)

    gradient, result = z.grad({x: x0}, wrt=[x], outputs=[z.output])
    print(result)
    assert np.array_equal(result, expected_out)

    expected_grad = [
        np.array([0, 0, 29, 41, 21, 32, 13, 21],
                 dtype=np.float32).reshape(4, 2),
        np.array([0, 0, 181, 209, 165, 192, 133, 157],
                 dtype=np.float32).reshape(4, 2)
    ]
    print(gradient)
    assert np.array_equal(gradient, expected_grad)
Example #34
 def func(x_var):
     x = C.placeholder()
     WT = C.Parameter((
         dim,
         dim,
     ),
                      init=transform_weight_initializer,
                      name=name + '_WT')
     bT = C.Parameter(dim,
                      init=transform_bias_initializer,
                      name=name + '_bT')
     WU = C.Parameter((
         dim,
         dim,
     ),
                      init=update_weight_initializer,
                      name=name + '_WU')
     bU = C.parameter(dim, init=update_bias_initializer, name=name + '_bU')
     transform_gate = C.sigmoid(C.times(x, WT, name=name + '_T') + bT)
     update = C.tanh(C.times(x, WU, name=name + '_U') + bU)
     return C.as_block(update * transform_gate + (1 - transform_gate) * x,
                       [(x, x_var)], 'SingleInner', 'SingleInner' + name)
 def func(x_var):
     x = C.placeholder()
     WT = C.Parameter((
         dim,
         dim,
     ),
                      init=transform_weight_initializer,
                      name=name + '_WT')
     bT = C.Parameter(dim,
                      init=transform_bias_initializer,
                      name=name + '_bT')
     WU = C.Parameter((
         dim,
         dim,
     ),
                      init=update_weight_initializer,
                      name=name + '_WU')
     bU = C.Parameter(dim, init=update_bias_initializer, name=name + '_bU')
     transform_gate = C.sigmoid(C.times(x, WT, name=name + '_T') + bT)
     update = C.relu(C.times(x, WU, name=name + '_U') + bU)
     return C.as_block(x + transform_gate * (update - x), [(x, x_var)],
                       'HighwayBlock', 'HighwayBlock' + name)
Example #36
def test_topk_backward(device_id, precision):
    def check_grad_last_axis(input, root, indices, output):
        d = input.shape[-1]
        k = indices.shape[-1]
        expected_output = np.zeros_like(input).reshape(-1,d)
        ind = np.reshape(indices, (-1,k))
        r = np.reshape(root,(-1,k))
        assert ind.shape[0] == r.shape[0] == expected_output.shape[0]
        for i in range(expected_output.shape[0]):
            for j in range(k):
                expected_output[i,int(ind[i,j])] = r[i,j]
        expected_output = expected_output.reshape(input.shape)
        assert np.allclose(output, expected_output)

    dt = PRECISION_TO_TYPE[precision]
    dev = cntk_device(device_id)

    axis=-1
    h = C.placeholder()
    p = C.parameter((4, 5, 6))
    p.value = p.value + np.random.randn(*p.shape)
    y = C.top_k(h, 3, axis=axis)
    y.replace_placeholder(p)
    dy, top = y.forward({}, y.outputs, set([y.outputs[0]]))
    indices = top[y.outputs[1]]
    root = np.ones_like(indices)
    root = root + np.arange(np.prod(root.shape)).reshape(*root.shape)
    cg = y.backward(dy, {y.outputs[0]:root}, set([p]))[p]
    check_grad_last_axis(p.value, root, indices, cg)

    q = C.sequence.input_variable((5,6), needs_gradient=True)
    q0 = [np.random.randn(4-i,5,6).astype(dt) for i in range(2)]
    y = C.top_k(q, 3, axis=axis)
    dy, top = y.forward({q:q0}, y.outputs, set([y.outputs[0]]), device=dev)
    indices = top[y.outputs[1]]
    root = [np.ones_like(i) + 100 * k + np.arange(np.prod(i.shape)).reshape(*i.shape) for k,i in enumerate(indices)]
    cg = y.backward(dy, {y.outputs[0]:root}, set([q]))[q]
    for i in range(2):
        check_grad_last_axis(q0[i], root[i], indices[i], cg[i])
Example #37
def create_network():
    input_var = cntk.sequence.input_variable((num_channels, frame_height, frame_width), name='input_var')
    target_var = cntk.input_variable((num_classes,), is_sparse=True, name='target_var')

    with cntk.layers.default_options(enable_self_stabilization=True):
        model = Sequential([
            resnet_model(cntk.placeholder()), Label('resnet'),
            Dense(hidden_dim, name='cnn_fc'),
            cntk.layers.Stabilizer(),
            bidirectional_recurrence(LSTM(hidden_dim // 2), LSTM(hidden_dim // 2)),
            cntk.sequence.last,
            BatchNormalization(),
            Dense(num_classes)
        ])(input_var)

    return {
        'input': input_var,
        'target': target_var,
        'model': model,
        'loss': cntk.cross_entropy_with_softmax(model, target_var),
        'metric': cntk.classification_error(model, target_var)
    }
def test_placeholder(device_id, precision):
    dt = PRECISION_TO_TYPE[precision]
    dev = cntk_device(device_id)

    import cntk.random as cr
    p = C.placeholder()
    u = cr.uniform_like(p)
    x = C.sequence.input_variable((4, 5))

    x1 = np.ones((2, 3, 4, 5), dtype=dt)
    f = u + p
    f.replace_placeholders({p: x})
    fx0, fx1 = f.eval({x: x1})

    assert fx0.shape == (3, 4, 5)
    assert fx1.shape == (3, 4, 5)

    assert fx0.min() >= 1
    assert fx0.max() < 2

    assert fx1.min() >= 1
    assert fx1.max() < 2
Example #39
def test_cloning():
    p = C.placeholder(shape=(1, ), name='p')
    i = C.input_variable(shape=(1, ), needs_gradient=True, name='i')
    res = p + i

    with pytest.raises(ValueError):
        res.clone(2)

    from ..functions import CloneMethod

    # Test freeze
    cloned = res.clone(CloneMethod.freeze)
    assert cloned.inputs[0].name == 'p'
    assert cloned.inputs[0].uid != p.uid
    assert cloned.inputs[1].name == 'i'
    assert cloned.inputs[1].uid != i.uid

    cloned = res.clone('freeze')
    assert cloned.inputs[0].name == 'p'
    assert cloned.inputs[0].uid != p.uid
    assert cloned.inputs[1].name == 'i'
    assert cloned.inputs[1].uid != i.uid
Example #40
def test_sequence_unpack_basic(device_id):
    dev = cntk_device(device_id)

    # Unpack a placeholder
    p = C.placeholder()
    p_unpacked_outputs = C.sequence.unpack(p, padding_value=0).outputs
    assert len(p_unpacked_outputs) == 2

    x = C.input((C.FreeDimension, 2, 3), is_sparse=False)
    x_seq_lens = C.input(())
    x_seq = C.to_sequence(x, x_seq_lens)
    x_seq_unpacked = C.sequence.unpack(x_seq, padding_value=-1000.0)
    x_seq_unpacked_value_output = x_seq_unpacked.outputs[0]
    x_seq_unpacked_mask_output = x_seq_unpacked.outputs[1]
    assert len(x_seq_unpacked_value_output.dynamic_axes) == 1
    assert x_seq_unpacked_value_output.shape == (C.FreeDimension, 2, 3)

    seq1_data = [[[0, 1, 1], [0, 1, 0]], [[1, 0, 0], [1, 0, 1]]]
    seq2_data = [[0, 1, 1], [1, 1, 0]]
    x_data = [
        np.asarray(seq1_data, dtype=np.float32),
        np.asarray(
            [seq2_data, [[-100.0, -100.0, -100.0], [-100.0, -100.0, -100.0]]],
            dtype=np.float32)
    ]
    x_seq_lens_data = np.asarray([2, 1], dtype=np.float32)
    result = x_seq_unpacked.eval({
        x: x_data,
        x_seq_lens: x_seq_lens_data
    },
                                 device=dev)
    value = result[x_seq_unpacked_value_output]
    mask = result[x_seq_unpacked_mask_output]
    assert np.array_equal(value[0], seq1_data)
    assert np.array_equal(value[1], [
        seq2_data, [[-1000.0, -1000.0, -1000.0], [-1000.0, -1000.0, -1000.0]]
    ])
    assert np.array_equal(mask, [[1, 1], [1, 0]])
Example #41
def create_model(base_model_file,
                 feature_node_name,
                 last_hidden_node_name,
                 num_classes,
                 input_features,
                 freeze=False):
    # Load the pretrained classification net and find nodes
    base_model = load_model(base_model_file)
    feature_node = find_by_name(base_model, feature_node_name)
    last_node = find_by_name(base_model, last_hidden_node_name)

    # Clone the desired layers with fixed weights
    cloned_layers = combine([last_node.owner]).clone(
        CloneMethod.freeze if freeze else CloneMethod.clone,
        {feature_node: placeholder(name='features')})

    # Add new dense layer for class prediction
    feat_norm = input_features - Constant(114)
    cloned_out = cloned_layers(feat_norm)
    z = Dense(num_classes, activation=None,
              name=new_output_node_name)(cloned_out)

    return z
Example #42
def test_op_sequence_reduce_sum(device_id, precision):
    from .. import sequence

    a = sequence.input(shape=(1, ),
                       dtype=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
                       needs_gradient=True,
                       name='a')

    sequence_sum_a_plus_sequence_sum_a = sequence.reduce_sum(
        a) + sequence.reduce_sum(a)

    a_data = [
        AA([[2]], dtype=PRECISION_TO_TYPE[precision]),
        AA([[2], [3]], dtype=PRECISION_TO_TYPE[precision]),
        AA([[2], [3], [4]], dtype=PRECISION_TO_TYPE[precision])
    ]

    actual_grad = sequence_sum_a_plus_sequence_sum_a.grad({a: a_data}, [a])
    assert np.array_equal(actual_grad[0], np.asarray([[2.]]))
    assert np.array_equal(actual_grad[1], np.asarray([[2.], [2.]]))
    assert np.array_equal(actual_grad[2], np.asarray([[2.], [2.], [2.]]))

    res = sequence_sum_a_plus_sequence_sum_a.eval({a: a_data})
    assert np.array_equal(res[0], np.asarray([4.]))
    assert np.array_equal(res[1], np.asarray([10.]))
    assert np.array_equal(res[2], np.asarray([18.]))

    # Verify that calling sequence reduction on a placeholder with known
    # shape but unknown dynamic axes does not result in a problem
    p = C.placeholder(shape=(1, ))
    r = sequence.reduce_sum(p)
    r.replace_placeholder(a)

    res = r.eval({a: a_data})
    assert np.array_equal(res[0], np.asarray([2.]))
    assert np.array_equal(res[1], np.asarray([5.]))
    assert np.array_equal(res[2], np.asarray([9.]))
    def create_trainer(use_sparse, device):
        a = C.sequence.input_variable(shape=input_shape,
                                      is_sparse=use_sparse,
                                      name='input')
        w_i = C.parameter(init=w_init_i, device=dev)
        a_projection = times(a, w_i)

        p_o = C.placeholder()
        h = C.sequence.past_value(p_o)
        w_h = C.parameter(init=w_init_h, device=dev)
        h_projection = times(h, w_h)
        z = a_projection + h_projection
        z = z.replace_placeholder(z)
        z = reshape(z, label_shape)

        l = C.sequence.input_variable(shape=label_shape,
                                      is_sparse=use_sparse,
                                      name='label')
        loss = cross_entropy_with_softmax(z, l, axis=-1)
        trainer = C.Trainer(
            z, (loss, None),
            C.sgd(z.parameters,
                  lr=C.learning_rate_schedule(0.7, C.UnitType.sample)))
        return (a, l, w_i, w_h, trainer)
Example #44
def gpt2_block(token_dims: int,
               head_dims: int,
               as_block: bool = False,
               name: str = 'gpt2_block'):
    X = C.placeholder(token_dims,
                      dynamic_axes=(C.Axis.default_batch_axis(),
                                    C.Axis.default_dynamic_axis()),
                      name=name)

    sa_layer = gpt2_self_attention(token_dims, head_dims)
    ff_layer = feed_forward_layer(4 * token_dims, token_dims)

    sa = sa_layer(layer_normalization(X))

    sa = X + sa
    ff = ff_layer(layer_normalization(sa))
    ff = X + ff

    result = ff

    if as_block:
        return C.as_block(result, [(X, X)], 'gpt2_block', 'gpt2_block')

    return result
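
A hedged application sketch for the block factory above. gpt2_self_attention, feed_forward_layer and layer_normalization come from the surrounding model code and are not defined here; the dimensions are illustrative:

import cntk as C

token_dims, head_dims = 768, 12                     # illustrative sizes
x = C.sequence.input_variable(token_dims, name='tokens')
block = gpt2_block(token_dims, head_dims, as_block=False)
y = block(x)                                        # binds the X placeholder to the input sequence
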
def BiRecurrence(fwd, bwd):
    F = Recurrence(fwd)
    G = Recurrence(bwd, go_backwards=True)
    x = placeholder()
    apply_x = splice(F(x), G(x))
    return apply_x
Example #46
    def sample(self, batchSize):
        z = self.prior.sample(batchSize).astype(np.float32)
        logp = C.log(self.prior.pdf(z))
        x = self.reverse(z)
        return x

    def parameters(self):
        return self.forward.parameters


if __name__ == '__main__':
    nets = lambda: C.layers.Sequential([
        C.layers.Dense(256, activation=C.leaky_relu),
        C.layers.Dense(256, activation=C.leaky_relu),
        C.layers.Dense(2, activation=C.tanh)
    ])(C.placeholder(2))
    nett = lambda: C.layers.Sequential([
        C.layers.Dense(256, activation=C.leaky_relu),
        C.layers.Dense(256, activation=C.leaky_relu),
        C.layers.Dense(2)
    ])(C.placeholder(2))
    masks = C.Constant(np.array([[0, 1], [1, 0]] * 3).astype(np.float32),
                       name='mask')
    prior = MultivariateNormalDiag(loc=[0., 0.], scale_diag=[1., 1.])
    flow = RealNVP(nets, nett, masks, prior)

    loss = -C.reduce_mean(flow.log_prob)

    learner = C.adam(loss.parameters, C.learning_parameter_schedule(1e-1),
                     C.momentum_schedule(0.9))
    trainer = C.Trainer(flow.forward, (loss, None), learner)
Example #47
    def rnet_output_layer(self, attention_context, query):

        att_context = C.placeholder(shape=(2 * self.hidden_dim, ))
        q_processed = C.placeholder(shape=(2 * self.hidden_dim, ))

        wuq = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim),
                          init=C.glorot_uniform())
        whp = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim),
                          init=C.glorot_uniform())
        wha = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim),
                          init=C.glorot_uniform())
        v = C.parameter(shape=(2 * self.hidden_dim, 1),
                        init=C.glorot_uniform())
        bias = C.parameter(shape=(2 * self.hidden_dim),
                           init=C.glorot_uniform())

        whp_end = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim),
                              init=C.glorot_uniform())
        wha_end = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim),
                              init=C.glorot_uniform())
        v_end = C.parameter(shape=(2 * self.hidden_dim, 1),
                            init=C.glorot_uniform())

        # sequence[tensor[1]] q_len x 1
        s0 = C.times(C.tanh(C.times(q_processed, wuq) + bias), v)
        a0 = C.sequence.softmax(s0)
        rQ = C.sequence.reduce_sum(a0 * q_processed)

        # sequence[tensor[1]] plen x 1
        ts = C.reshape(
            C.times(
                C.tanh(
                    C.times(att_context, whp) +
                    C.times(C.sequence.broadcast_as(rQ, att_context), wha)),
                v), (-1))

        # sequence[tensor[1]]
        ta = C.sequence.softmax(ts)

        # sequence[2d] 1 x 2d
        c0 = C.reshape(C.sequence.reduce_sum(ta * att_context),
                       (2 * self.hidden_dim))

        # sequence[tensor[2d]]
        ha1 = C.layers.blocks.GRU(2 * self.hidden_dim)(rQ, c0)

        # sequence[tensor[1]] plen x 1
        s1 = C.reshape(
            C.times(
                C.tanh(
                    C.times(att_context, whp_end) +
                    C.times(C.sequence.broadcast_as(ha1, att_context), wha_end)
                ), v_end), (-1))

        # sequence[tensor[1]] plen x 1
        a1 = C.sequence.softmax(s1)

        return C.as_block(C.combine([ts,
                                     s1]), [(att_context, attention_context),
                                            (q_processed, query)],
                          'output_layer', 'output_layer')
Example #48
    def attention_layer(self, context, query, layer):

        q_processed = C.placeholder(shape=(2 * self.hidden_dim, ))
        p_processed = C.placeholder(shape=(2 * self.hidden_dim, ))

        qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

        wq = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim),
                         init=C.glorot_uniform())
        wp = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim),
                         init=C.glorot_uniform())
        wg = C.parameter(shape=(8 * self.hidden_dim, 8 * self.hidden_dim),
                         init=C.glorot_uniform())
        v = C.parameter(shape=(2 * self.hidden_dim, 1),
                        init=C.glorot_uniform())

        # seq[tensor[2d]] p_len x 2d
        wpt = C.reshape(C.times(p_processed, wp), (-1, 2 * self.hidden_dim))

        # q_len x 2d
        wqt = C.reshape(C.times(qvw, wq), (-1, 2 * self.hidden_dim))

        # seq[tensor[q_len]]
        S = C.reshape(
            C.times(C.tanh(C.sequence.broadcast_as(wqt, p_processed) + wpt),
                    v), (-1))

        qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, p_processed)

        # seq[tensor[q_len]]
        S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))

        # seq[tensor[q_len]]
        A = C.softmax(S, axis=0)

        # seq[tensor[2d]]
        swap_qvw = C.swapaxes(qvw)
        cq = C.reshape(
            C.reduce_sum(A * C.sequence.broadcast_as(swap_qvw, A), axis=1),
            (-1))

        # seq[tensor[4d]]
        uc_concat = C.splice(p_processed, cq, p_processed * cq, cq * cq)

        # seq[tensor[4d]]
        gt = C.tanh(C.times(uc_concat, wg))

        # seq[tensor[4d]]
        uc_concat_star = gt * uc_concat

        # seq[tensor[4d]]
        vp = C.layers.Sequential([
            C.layers.Dropout(self.dropout),
            OptimizedRnnStack(self.hidden_dim,
                              bidirectional=True,
                              use_cudnn=self.use_cudnn,
                              name=layer + '_attention_rnn')
        ])(uc_concat_star)

        return C.as_block(vp, [(p_processed, context), (q_processed, query)],
                          'attention_layer', 'attention_layer')
Example #49
def OneWordLookahead():
    x = C.placeholder()
    apply_x = C.splice(x, C.sequence.future_value(x))
    return apply_x
Example #50
def create_rpn(conv_out,
               scaled_gt_boxes,
               im_info,
               cfg,
               add_loss_functions=True):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         A CNTK variable or constant containing
                         (pad_width, pad_height, scaled_image_width, scaled_image_height, orig_img_width, orig_img_height)
                         e.g. (1000, 1000, 1000, 600, 500, 300) for an original image of 600x300 that is scaled and padded to 1000x1000
        cfg:             The configuration dictionary
        add_loss_functions: If set to True, rpn_losses will be returned; otherwise None is returned for the losses

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    # RPN network
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    num_channels = cfg["MODEL"].RPN_NUM_CHANNELS
    rpn_conv_3x3 = Convolution((3, 3),
                               num_channels,
                               activation=relu,
                               pad=True,
                               strides=1,
                               init=normal(scale=0.01),
                               init_bias=0.0)(conv_out)
    rpn_cls_score = Convolution(
        (1, 1),
        18,
        activation=None,
        name="rpn_cls_score",
        init=normal(scale=0.01),
        init_bias=0.0)(rpn_conv_3x3)  # 2(bg/fg)  * 9(anchors)
    rpn_bbox_pred = Convolution(
        (1, 1),
        36,
        activation=None,
        name="rpn_bbox_pred",
        init=normal(scale=0.01),
        init_bias=0.0)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    num_predictions = int(rpn_cls_score.shape[0] / 2)
    rpn_cls_score_rshp = reshape(
        rpn_cls_score,
        (2, num_predictions, rpn_cls_score.shape[1], rpn_cls_score.shape[2]),
        name="rpn_cls_score_rshp")
    p_rpn_cls_score_rshp = cntk.placeholder()
    rpn_cls_sm = softmax(p_rpn_cls_score_rshp, axis=0)
    rpn_cls_prob = cntk.as_block(rpn_cls_sm,
                                 [(p_rpn_cls_score_rshp, rpn_cls_score_rshp)],
                                 'Softmax', 'rpn_cls_prob')
    rpn_cls_prob_reshape = reshape(rpn_cls_prob,
                                   rpn_cls_score.shape,
                                   name="rpn_cls_prob_reshape")

    # proposal layer
    rpn_rois = create_proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred,
                                     im_info, cfg)

    rpn_losses = None
    if (add_loss_functions):
        # RPN targets
        # Comment: rpn_cls_score is only passed   vvv   to get width and height of the conv feature map ...
        proposal_layer_params = "'feat_stride': {}\n'scales':\n - {}". \
            format(cfg["MODEL"].FEATURE_STRIDE, "\n - ".join([str(v) for v in cfg["DATA"].PROPOSAL_LAYER_SCALES]))
        atl = user_function(
            AnchorTargetLayer(
                rpn_cls_score,
                scaled_gt_boxes,
                im_info,
                rpn_batch_size=cfg["TRAIN"].RPN_BATCHSIZE,
                rpn_fg_fraction=cfg["TRAIN"].RPN_FG_FRACTION,
                clobber_positives=cfg["TRAIN"].RPN_CLOBBER_POSITIVES,
                positive_overlap=cfg["TRAIN"].RPN_POSITIVE_OVERLAP,
                negative_overlap=cfg["TRAIN"].RPN_NEGATIVE_OVERLAP,
                param_str=proposal_layer_params))
        rpn_labels = atl.outputs[0]
        rpn_bbox_targets = atl.outputs[1]
        rpn_bbox_inside_weights = atl.outputs[2]

        # classification loss
        p_rpn_labels = cntk.placeholder()
        p_rpn_cls_score_rshp = cntk.placeholder()

        keeps = cntk.greater_equal(p_rpn_labels, 0.0)
        fg_labels = element_times(p_rpn_labels, keeps, name="fg_targets")
        bg_labels = minus(1, fg_labels, name="bg_targets")
        rpn_labels_ignore = splice(bg_labels, fg_labels, axis=0)
        rpn_ce = cross_entropy_with_softmax(p_rpn_cls_score_rshp,
                                            rpn_labels_ignore,
                                            axis=0)
        rpn_loss_cls = element_times(rpn_ce, keeps)

        # The terms that are accounted for in the cls loss are those that have a label >= 0
        cls_num_terms = reduce_sum(keeps)
        cls_normalization_factor = 1.0 / cls_num_terms
        normalized_rpn_cls_loss = reduce_sum(
            rpn_loss_cls) * cls_normalization_factor

        reduced_rpn_loss_cls = cntk.as_block(
            normalized_rpn_cls_loss,
            [(p_rpn_labels, rpn_labels),
             (p_rpn_cls_score_rshp, rpn_cls_score_rshp)], 'CE_with_ignore',
            'norm_rpn_cls_loss')

        # regression loss
        p_rpn_bbox_pred = cntk.placeholder()
        p_rpn_bbox_targets = cntk.placeholder()
        p_rpn_bbox_inside_weights = cntk.placeholder()
        rpn_loss_bbox = SmoothL1Loss(cfg.SIGMA_RPN_L1, p_rpn_bbox_pred,
                                     p_rpn_bbox_targets,
                                     p_rpn_bbox_inside_weights, 1.0)
        # The bbox loss is normalized by the rpn batch size
        bbox_normalization_factor = 1.0 / cfg["TRAIN"].RPN_BATCHSIZE
        normalized_rpn_bbox_loss = reduce_sum(
            rpn_loss_bbox) * bbox_normalization_factor

        reduced_rpn_loss_bbox = cntk.as_block(
            normalized_rpn_bbox_loss,
            [(p_rpn_bbox_pred, rpn_bbox_pred),
             (p_rpn_bbox_targets, rpn_bbox_targets),
             (p_rpn_bbox_inside_weights, rpn_bbox_inside_weights)],
            'SmoothL1Loss', 'norm_rpn_bbox_loss')

        rpn_losses = plus(reduced_rpn_loss_cls,
                          reduced_rpn_loss_bbox,
                          name="rpn_losses")

    return rpn_rois, rpn_losses
Example #51
def BiRecurrence(fwd, bwd):
    F = C.layers.Recurrence(fwd)
    G = C.layers.Recurrence(bwd, go_backwards=True)
    x = C.placeholder()
    apply_x = C.splice(F(x), G(x))  # concatenate the tensors
    return apply_x
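A minimal usage sketch (the embedding size, hidden size and label count are placeholder values, not from the original): the returned composite can be dropped into a Sequential model like any other layer.

import cntk as C

# hypothetical sequence-tagging model built around BiRecurrence
model = C.layers.Sequential([
    C.layers.Embedding(150),
    BiRecurrence(C.layers.LSTM(300), C.layers.LSTM(300)),
    C.layers.Dense(129)
])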
Exemple #52
0
def flow_forward(input_dim: int, act_func_pair: tuple = (None, None), batch_norm: bool = False):
    chunk = {}
    log_det_J = 0

    chunk['input_dim'] = input_dim
    _ph = C.placeholder(input_dim, name='place_holder')
    _out = _ph

    if batch_norm:
        # _bn = C.layers.BatchNormalization(name='batch_norm')(_ph)
        # chunk['scale'] = _bn.parameters[0]
        # chunk['bias'] = _bn.parameters[1]

        chunk['mu'] = C.Constant(np.zeros(shape=input_dim))
        chunk['var'] = C.Constant(np.ones(shape=input_dim))

        _eps = C.Constant(1e-7)
        _mu = C.reduce_mean(_ph, axis=C.Axis.default_batch_axis())
        _var = C.reduce_mean(C.square(_ph-_mu), axis=C.Axis.default_batch_axis())

        chunk['muB'] = _mu
        chunk['varB'] = _var

        # _bn = (_ph-chunk['mu'])/C.sqrt(chunk['var']+_eps)
        _bn = C.sqrt(chunk['var']+_eps)*_ph + chunk['mu']
        _ph = _bn

        log_det_J += -0.5*C.reduce_sum(C.log((_var+_eps)))
        # log_det_J += C.reduce_sum(C.log())

    chunk['W_rot_mat'] = _W = C.parameter((input_dim, input_dim))
    _W.value = random_rotation_matrix = special_ortho_group.rvs(input_dim)
    # _W.value = np.roll(np.eye(input_dim),input_dim//2,axis=0)
    _out = _ph@_W
    log_det_J += C.log(C.abs(C.det(_W))) # or # log_det_J += C.slogdet(_W)[1]
    
    _half_dim = input_dim//2
    _x1 = _out[:_half_dim]
    _x2 = _out[_half_dim:]

    _log_s_func, _t_func = act_func_pair
    if _log_s_func is None: # basic network
        _log_s_func = C.layers.Sequential([
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(_half_dim, C.tanh),
        ])#(C.placeholder(input_dim, name='place_holder'))
    if _t_func is None: # basic network
        _t_func = C.layers.Sequential([
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(_half_dim),
        ])#(C.placeholder(input_dim, name='place_holder'))

    chunk['log_s_func'] = _log_s_func
    chunk['t_func'] = _t_func

    _log_s, _t = _log_s_func(_x2), _t_func(_x2)

    _s = C.exp(_log_s)

    _y1 = _s*_x1 + _t
    _y2 = _x2

    _Y = C.splice(_y1, _y2)
    chunk['output'] = _Y

    log_det_J += C.reduce_sum(_log_s)

    return _Y, log_det_J, chunk
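To make the role of log_det_J concrete, here is a hedged sketch of how the flow output might be turned into a negative log-likelihood objective under a standard-normal base density; the variable names, the toy dimension and the single-block setup are assumptions, not part of the original code.

import numpy as np
import cntk as C

input_dim = 4  # assumed even toy dimension
y, log_det_J, chunk = flow_forward(input_dim)

# log-density of the transformed sample under a standard normal base distribution
base_log_prob = -0.5 * (C.reduce_sum(C.square(y)) + input_dim * np.log(2 * np.pi))

# change of variables: maximize base_log_prob + log_det_J, i.e. minimize its negative
nll = -(base_log_prob + log_det_J)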
def create_criterion_function(model):
    labels = C.placeholder(name='labels')
    ce = C.cross_entropy_with_softmax(model, labels)
    errs = C.classification_error(model, labels)
    return C.combine([ce, errs])  # (features, labels) -> (loss, metric)
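A brief usage sketch (the label variable and class count are assumptions): the returned composite still carries a labels placeholder, which is bound to a real input variable before training.

import cntk as C

num_labels = 10  # assumed number of classes
label_var = C.input_variable(num_labels, is_sparse=True)
criterion = create_criterion_function(model)  # model: a pre-built CNTK network
criterion.replace_placeholders({criterion.placeholders[0]: label_var})
loss, metric = criterion.outputs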
def _convert_optimized_rnnstack(root_func, map_param_to_func):
    '''
    Internal implementation that converts a root_func containing cudnn optimized_rnnstack nodes
    to use non-cudnn functions, so the graph can be used in a non-CUDA environment.

    Args:
        root_func: a root function of a graph that contains optimized_rnnstacks
        map_param_to_func: a mapping of converted rnn functions for parameter sharing
    Returns:
        the converted root_func, based on a GEMM implementation of the rnn, which can be used on CPU
    '''
    # recursively convert for blocks in root_func
    blocks = C.logging.graph.depth_first_search(
        root_func,
        lambda x: type(x) == C.Function and x.root_function.is_block,
        depth=0)
    for i in range(len(blocks)):
        # search for blocks again in case block input/output has been modified
        blocks1 = C.logging.graph.depth_first_search(
            root_func,
            lambda x: type(x) == C.Function and x.root_function.is_block,
            depth=0)
        # assuming the depth_first_search order is stable, so the old index can be reused on the new search results
        block = blocks1[i]
        block_root = C.as_composite(block.block_root)
        new_block_root = _convert_optimized_rnnstack(block_root,
                                                     map_param_to_func)
        if new_block_root != block_root:
            block_arguments_mapping = dict(block.block_arguments_mapping)
            new_block_arguments_mapping = []
            for arg, new_arg in zip(block_root.arguments,
                                    new_block_root.arguments):
                new_block_arguments_mapping += [(new_arg,
                                                 block_arguments_mapping[arg])]
            new_block = C.as_block(new_block_root, new_block_arguments_mapping,
                                   block.op_name, block.name)
            if all([x not in root_func.outputs for x in block.outputs]) or all(
                [x in block.outputs for x in root_func.outputs]):
                root_func = root_func.clone(
                    C.CloneMethod.share,
                    dict(zip(block.outputs, new_block.outputs)))
            else:
                new_outputs = [
                    new_block.outputs[block.outputs.index(x)]
                    if x in block.outputs else None for x in root_func.outputs
                ]
                root_func_nonreplaced = C.combine(
                    [x for x in root_func.outputs if x not in block.outputs])
                root_func_nonreplaced_clone = root_func_nonreplaced.clone(
                    C.CloneMethod.share,
                    dict(zip(block.outputs, new_block.outputs)))
                idx = 0
                for nonreplaced_output in root_func_nonreplaced_clone.outputs:
                    while new_outputs[idx]:
                        idx += 1
                    new_outputs[idx] = nonreplaced_output
                root_func = C.combine(new_outputs)

    # replace all optimized_rnnstack instances in root_func
    cudnn_rnns = C.logging.graph.depth_first_search(
        root_func,
        lambda x: type(x) == C.Function and x.root_function.op_name ==
        'OptimizedRNNStack',
        depth=0)
    for cudnn_rnn in cudnn_rnns:
        param = cudnn_rnn.parameters[0]
        if map_param_to_func[param]:
            #shared parameter, clone
            converted = map_param_to_func[param][0].clone(
                C.CloneMethod.share, {
                    map_param_to_func[param][1]: cudnn_rnn.inputs[0],
                    map_param_to_func[param][2]: C.placeholder()
                })
        else:
            #unique or first parameter, convert
            converted = _from_optimized_rnnstack(cudnn_rnn)
            map_param_to_func[param] = (
                converted,
                cudnn_rnn.inputs[0],
                cudnn_rnn.output,
            )

        if cudnn_rnn.output not in root_func.outputs:
            root_func = root_func.clone(C.CloneMethod.share,
                                        {cudnn_rnn.output: converted.output})
        else:
            # if cudnn_rnn output is the root_func output, just use converted as root_func and no clone needed
            if len(root_func.outputs) > 1:
                root_func = C.combine([
                    converted if x == cudnn_rnn.output else x
                    for x in root_func.outputs
                ])
            else:
                root_func = converted

    return root_func
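The function above is an internal helper; a hedged sketch of a public entry point that seeds map_param_to_func (the defaultdict trick is an assumption, the real CNTK wrapper may pre-populate the mapping differently):

import collections

def convert_optimized_rnnstack_sketch(root_func):
    # map each cudnn rnn parameter to its converted function; unseen parameters default to None
    map_param_to_func = collections.defaultdict(lambda: None)
    return _convert_optimized_rnnstack(root_func, map_param_to_func)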
Exemple #55
0
def create_transfer_learning_model(input, num_classes, model_file, freeze=False):

    base_model = load_model(model_file)
    base_model = C.as_composite(base_model[3].owner)

    # Load the pretrained classification net and find nodes
    feature_node = C.logging.find_by_name(base_model, feature_node_name)
    last_node = C.logging.find_by_name(base_model, last_hidden_node_name)
    
    base_model = C.combine([last_node.owner]).clone(C.CloneMethod.freeze if freeze else C.CloneMethod.clone, {feature_node: C.placeholder(name='features')})
    base_model = base_model(C.input_variable((num_channels, image_height, image_width)))

    r1 = C.logging.find_by_name(base_model, "z.x.x.r")
    r2_2 = C.logging.find_by_name(base_model, "z.x.x.x.x.r")
    r3_2 = C.logging.find_by_name(base_model, "z.x.x.x.x.x.x.r")
    r4_2 = C.logging.find_by_name(base_model, "z.x.x.x.x.x.x.x.x.r")

    up_r1 = OneByOneConvAndUpSample(r1, 3, num_classes)
    up_r2_2 = OneByOneConvAndUpSample(r2_2, 2, num_classes)
    up_r3_2 = OneByOneConvAndUpSample(r3_2, 1, num_classes)
    up_r4_2 = OneByOneConvAndUpSample(r4_2, 0, num_classes)
    
    merged = C.splice(up_r1, up_r3_2, up_r2_2, axis=0)

    resnet_fcn_out = Convolution((1, 1), num_classes, init=he_normal(), activation=sigmoid, pad=True)(merged)

    z = UpSampling2DPower(resnet_fcn_out,2)
    
    return z
def with_lookahead():
    x = placeholder()
    future_x = sequence.future_value(x)
    apply_x = splice(x, future_x)
    return apply_x
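A minimal usage sketch, assuming a slot-tagging style pipeline (layer sizes are placeholders): the lookahead composite simply widens each step's input with the next step's value before the recurrence.

import cntk as C

# hypothetical model using with_lookahead(); sizes are assumptions
model = C.layers.Sequential([
    C.layers.Embedding(150),
    with_lookahead(),
    C.layers.Recurrence(C.layers.LSTM(300)),
    C.layers.Dense(129)
])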
Exemple #57
0
    def output_layer(self, embed, attention_context, model_context, aw, q_processed, c_processed,cw):
        cw_ph=C.placeholder()
        att_context = C.placeholder(shape=(8*self.hidden_dim,))
        query_processed = C.placeholder(shape=(2*self.hidden_dim,))
        context_processed = C.placeholder(shape=(2*self.hidden_dim,))
        mod_context = C.placeholder(shape=(2*self.hidden_dim))
        a_onehot = C.placeholder(shape=(self.vocab_size+1,))

        start_logits = C.layers.Dense(1, name='out_start')(C.dropout(C.splice(mod_context, att_context), self.dropout))
        start_hardmax = seq_hardmax(start_logits)
        att_mod_ctx = C.sequence.last(C.sequence.gather(mod_context, start_hardmax))
        att_mod_ctx_expanded = C.sequence.broadcast_as(att_mod_ctx, att_context)
        end_input = C.splice(att_context, mod_context, att_mod_ctx_expanded, mod_context * att_mod_ctx_expanded)
        m2 = OptimizedRnnStack(self.hidden_dim, bidirectional=True, use_cudnn=self.use_cudnn, name='output_rnn')(end_input)
        end_logits = C.layers.Dense(1, name='out_end')(C.dropout(C.splice(m2, att_context), self.dropout))

        start_flag = C.hardmax(start_logits)
        end_flag = C.hardmax(end_logits)
     
        def create_model():
            # Encoder: (input*) --> (h0, c0)
            # Create multiple layers of LSTMs by passing the output of the i-th layer
            # to the (i+1)th layer as its input
            with C.layers.default_options(enable_self_stabilization=True, go_backwards=False):
                LastRecurrence = C.layers.Recurrence
                encode = C.layers.Sequential([
                    C.layers.Stabilizer(),
                    OptimizedRnnStack(self.hidden_dim, return_full_state=True),
                ])

                encode_c = C.layers.Sequential([
                    C.layers.Stabilizer(),
                    OptimizedRnnStack(self.hidden_dim, return_full_state=True),
                ])
            
            # Decoder: (history*, input*) --> unnormalized_word_logp*
            # where history is one of these, delayed by 1 step and <s> prepended:
            #  - training: labels
            #  - testing:  its own output hardmax(z) (greedy decoder)
            with C.layers.default_options(enable_self_stabilization=True):
                # sub-layers
                stab_in = C.layers.Stabilizer()
                rec_blocks = [C.layers.LSTM(self.hidden_dim) for i in range(self.num_layers)]
                stab_out = C.layers.Stabilizer()
                proj_out = C.layers.Dense(self.vocab_size+1, name='out_proj')
                # attention model
                attention_model = C.layers.AttentionModel(self.attention_dim, 
                                                              name='attention_model') # :: (h_enc*, h_dec) -> (h_dec augmented)
                hstate_dense = C.layers.Dense(self.hidden_dim, activation=C.tanh, input_rank=1)
                cstate_dense = C.layers.Dense(self.hidden_dim, activation=C.tanh, input_rank=1)
                W_dense = C.layers.Dense(2*self.hidden_dim, input_rank=1)
                U_dense = C.layers.Dense(2*self.hidden_dim, input_rank=1)
                V_dense = C.layers.Dense(2*self.hidden_dim, input_rank=1)
                maxout  = C.layers.MaxPooling((2,), strides=2)
                # layer function
                @C.Function
                def decode(history, q, c, start_logits, end_logits):
                    q = encode(q)
                    c = encode_c(C.splice(c, start_logits, end_logits, axis=0))
                    r = history
                    r = stab_in(r)

                    q_last_h = C.sequence.last(q.outputs[0])
                    q_last_c = C.sequence.last(q.outputs[1])
                    c_last_h = C.sequence.last(c.outputs[0])
                    c_last_c = C.sequence.last(c.outputs[1])
                    initial_hstate = hstate_dense(C.splice(q_last_h, c_last_h))
                    initial_cstate = cstate_dense(C.splice(q_last_c, c_last_c))

                    rec_block = rec_blocks[0]   # LSTM(hidden_dim)  # :: (dh, dc, x) -> (h, c)
                    
                    @C.Function
                    def find_embed(x):
                        gx, ngx = C.slice(x, 0, 0, self.wg_dim), C.slice(x, 0, self.wg_dim, self.vocab_size)
                        return embed(gx, ngx) 

                    @C.Function
                    def lstm_with_attention(dh, dc, r, x):
                        history_embed = find_embed(x)
                        h_att = attention_model(c.outputs[0], dh)
                        q_att = attention_model(q.outputs[0], dh)
                        att = C.splice(h_att, q_att)
                        x = C.splice(x, att)
                        x, dc = rec_block(dh, dc, x).outputs
          
                        # 0*r is a workaround: CNTK raises an error if r is left unused in the recurrence.
                        r = U_dense(att) + W_dense(history_embed) + V_dense(x) + 0*r
                        # note: adding the W_dense term first triggers a CNTK bug, so keep this ordering
                        # r = W_dense(embed(gx, ngx)) + U_dense(att) + V_dense(x) + 0*r
                        return x, dc, r
                    _, _, r = C.layers.RecurrenceFrom(lstm_with_attention, return_full_state=True)(
                        initial_hstate, initial_cstate, C.Constant(np.zeros(2*self.hidden_dim)), r).outputs
        
                    r = maxout(r)
                    r = stab_out(r)
                    r = proj_out(r)
                    #r = C.softmax(r)
                    r = C.layers.Label('out_proj_out')(r)
                    return r
            return decode

        def create_model_train(s2smodel):
            # model used in training (history is known from labels)
            # note: the labels must NOT contain the initial <s>
            @C.Function
            def model_train(labels, q, c, start_logits, end_logits): # (input*, labels*) --> (word_logp*)

                # The input to the decoder always starts with the special label sequence start token.
                # Then, use the previous value of the label sequence (for training) or the output (for execution).
                past_labels = C.layers.Delay(initial_state=self.sentence_start)(labels)
    
                return s2smodel(past_labels, q, c, start_logits, end_logits)
            return model_train

        def create_model_greedy(s2smodel):
            # model used in (greedy) decoding (inferencing) (history is decoder's own output)
            @C.Function
            def model_greedy(q, c, start_logits, end_logits): # (input*) --> (word_sequence*)
                # Decoding is an unfold() operation starting from sentence_start.
                # We must transform s2smodel (history*, input* -> word_logp*) into a generator (history* -> output*)
                # which holds 'input' in its closure.
                unfold = C.layers.UnfoldFrom(\
                                    lambda history: s2smodel(history, q, c, start_logits, end_logits) >> C.hardmax,
                                    # stop once sentence_end_index was max-scoring output
                                    until_predicate=lambda w: w[...,self.sentence_end_index],
                                    length_increase=self.sentence_max_length)
                return unfold(initial_state=self.sentence_start, dynamic_axes_like=c)
            return model_greedy
       
        s2smodel = create_model()
      
        model_train = create_model_train(s2smodel)(a_onehot, query_processed, context_processed, start_logits, end_logits)
        model_greedy_raw = create_model_greedy(s2smodel)(query_processed, context_processed, start_logits, end_logits)
        model_greedy = C.argmax(model_greedy_raw, 0)
        context = C.argmax(cw_ph,0)

        return C.as_block(
            C.combine((model_train, model_greedy, start_logits, end_logits,context)),
            [(att_context, attention_context), (mod_context, model_context), (a_onehot, aw), (query_processed, q_processed), (context_processed, c_processed),(cw_ph,cw)],
            'attention_layer',
            'attention_layer')
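seq_hardmax used above comes from elsewhere in the codebase; a hedged sketch of what such a helper typically does (pick the max-scoring position along the sequence axis) is shown below. The name seq_hardmax_sketch and the exact tie handling are assumptions.

import cntk as C

def seq_hardmax_sketch(logits):
    # mark the max-scoring sequence position(s) with 1, everything else with 0
    seq_max = C.sequence.broadcast_as(C.sequence.reduce_max(logits), logits)
    return C.equal(logits, seq_max)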
Exemple #58
0
def run_experiment_cntk():
    if os.path.isfile('x_train_imdb.bin'):
        print('Loading from .bin files')
        x_train, y_train, x_test, y_test = load_from_files(x_shape=(25000,
                                                                    500),
                                                           y_shape=(25000, ))
    else:
        print('Loading data...')
        (x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(
            num_words=Constants.max_words)
        print(len(x_train), 'train sequences')
        print(len(x_test), 'test sequences')

        print('Pad sequences (samples x time)')
        x_train = keras.preprocessing.sequence.pad_sequences(
            x_train, maxlen=Constants.maxlen)
        x_test = keras.preprocessing.sequence.pad_sequences(
            x_test, maxlen=Constants.maxlen)
        print('x_train shape:', x_train.shape)
        print('x_test shape:', x_test.shape)
        print('Saving to .bin files')
        save_to_files(x_train, y_train, x_test, y_test)

    x = cntk.sequence.input_variable(shape=(), dtype=np.float32)
    y = cntk.input_variable(shape=(), dtype=np.float32)
    x_placeholder = cntk.placeholder(shape=(),
                                     dynamic_axes=[
                                         cntk.Axis.default_batch_axis(),
                                         cntk.Axis.default_dynamic_axis()
                                     ])

    model = cntk.one_hot(x_placeholder,
                         num_classes=Constants.max_words,
                         sparse_output=True)
    model = cntk.layers.Embedding(Constants.embedding_dim)(model)
    model = cntk.layers.Recurrence(cntk.layers.LSTM(32))(model)
    model = cntk.sequence.last(model)
    model = cntk.layers.Dense(1, activation=cntk.sigmoid)(model)
    model.save('ch6-2.cntk.model')
    model = None
    model = cntk.load_model('ch6-2.cntk.model')
    model.replace_placeholders({model.placeholders[0]: x})

    loss_function = cntk.binary_cross_entropy(model.output, y)
    round_predictions = cntk.round(model.output)
    equal_elements = cntk.equal(round_predictions, y)
    accuracy_function = cntk.reduce_mean(equal_elements,
                                         axis=cntk.Axis.all_static_axes())

    max_epochs = 10
    batch_size = 128
    learner = cntk.adam(model.parameters,
                        cntk.learning_parameter_schedule_per_sample(0.01),
                        cntk.learning_parameter_schedule_per_sample(0.9))
    progress_printer = cntk.logging.ProgressPrinter(tag='Training',
                                                    num_epochs=max_epochs)
    trainer = cntk.Trainer(model, (loss_function, accuracy_function),
                           [learner], progress_printer)
    evaluator = cntk.Evaluator(accuracy_function)

    cntk_train(x, y, x_train, y_train, max_epochs, batch_size, trainer,
               evaluator)
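cntk_train is referenced but not shown here; a hedged sketch of such a minibatch loop (the batching details are assumptions, and evaluation on the test set is omitted) might look like:

import numpy as np

def cntk_train_sketch(x, y, x_train, y_train, max_epochs, batch_size, trainer, evaluator):
    for epoch in range(max_epochs):
        for start in range(0, len(x_train), batch_size):
            # each padded review is a sequence of scalar word ids; labels are scalars
            x_batch = [seq.astype(np.float32) for seq in x_train[start:start + batch_size]]
            y_batch = y_train[start:start + batch_size].astype(np.float32).reshape(-1, 1)
            trainer.train_minibatch({x: x_batch, y: y_batch})
        trainer.summarize_training_progress()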
    def converter(self, cudnn_rnn):
        param = cudnn_rnn.parameters[0]
        if self.map_param_to_func[param]:
            # shared parameter, clone
            converted = self.map_param_to_func[param][0].clone(
                C.CloneMethod.share, {
                    self.map_param_to_func[param][1]: cudnn_rnn.inputs[0],
                    self.map_param_to_func[param][2]: C.placeholder()
                })
        else:
            # unique or first parameter, convert
            converted = _from_optimized_rnnstack(cudnn_rnn)
            self.map_param_to_func[param] = (
                converted,
                cudnn_rnn.inputs[0],
                cudnn_rnn.output,
            )

        return converted
Exemple #60
0
    def load_model(self):
        if self.__model:
            raise Exception("Model already loaded")

        trained_frcnn_model = load_model(self.__model_path)
        self.__is_python_model = len(trained_frcnn_model.arguments) < 3

        if self.__is_python_model:
            self.__args_indices = {"features": 0, "rois": 1}
            self.__nr_rois = trained_frcnn_model.arguments[
                self.__args_indices["rois"]].shape[0]
            self.__resize_width = trained_frcnn_model.arguments[
                self.__args_indices["features"]].shape[1]
            self.__resize_height = trained_frcnn_model.arguments[
                self.__args_indices["features"]].shape[2]
            self.labels_count = trained_frcnn_model.arguments[
                self.__args_indices["rois"]].shape[1]
            self.__model = trained_frcnn_model

        else:
            # cache indices of the model arguments
            args_indices = {}
            for i, arg in enumerate(trained_frcnn_model.arguments):
                args_indices[arg.name] = i

            self.__nr_rois = trained_frcnn_model.arguments[
                args_indices["rois"]].shape[0]
            self.__resize_width = trained_frcnn_model.arguments[
                args_indices["features"]].shape[1]
            self.__resize_height = trained_frcnn_model.arguments[
                args_indices["features"]].shape[2]
            self.labels_count = trained_frcnn_model.arguments[
                args_indices["roiLabels"]].shape[1]

            # next, we clone the model and create input nodes just for the features (image) and ROIs
            # This will make sure that only the calculations that are needed for evaluating images are performed
            # during test time
            #
            # find the original features and rois input nodes
            features_node = find_by_name(trained_frcnn_model, "features")
            rois_node = find_by_name(trained_frcnn_model, "rois")

            #  find the output "z" node
            z_node = find_by_name(trained_frcnn_model, 'z')

            # define new input nodes for the features (image) and rois
            image_input = input_variable(features_node.shape, name='features')
            roi_input = input_variable(rois_node.shape, name='rois')

            # Clone the desired layers with fixed weights and place holder for the new input nodes
            cloned_nodes = combine([z_node.owner]).clone(
                CloneMethod.freeze, {
                    features_node: placeholder(name='features'),
                    rois_node: placeholder(name='rois')
                })

            # apply the cloned nodes to the input nodes to obtain the model for evaluation
            self.__model = cloned_nodes(image_input, roi_input)

            # cache the indices of the input nodes
            self.__args_indices = {}

            for i, arg in enumerate(self.__model.arguments):
                self.__args_indices[arg.name] = i
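Once load_model has run, a hedged sketch of an additional method on the same class that uses the cached argument indices to evaluate one image plus its ROIs (the method name, shapes and preprocessing are assumptions):

    # hedged sketch; assumes numpy imported as np
    def score_image(self, img, rois):
        # img: (num_channels, resize_height, resize_width), rois: (nr_rois, 4)
        args = self.__model.arguments
        feed = {
            args[self.__args_indices["features"]]: [np.ascontiguousarray(img, dtype=np.float32)],
            args[self.__args_indices["rois"]]: [np.ascontiguousarray(rois, dtype=np.float32)],
        }
        return self.__model.eval(feed)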