Example 1
    def gru(dh, x):

        dhs = Sdh(dh)  # previous value, stabilized
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        projx3 = b + times(x, W)
        projh2  = times(dhs, H)

        zt_proj = slice (projx3, stack_axis, 0*stacked_dim, 1*stacked_dim) + slice (projh2, stack_axis, 0*stacked_dim, 1*stacked_dim)
        rt_proj = slice (projx3, stack_axis, 1*stacked_dim, 2*stacked_dim) + slice (projh2, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ct_proj = slice (projx3, stack_axis, 2*stacked_dim, 3*stacked_dim)

        zt = sigmoid (zt_proj)        # update gate z(t)

        rt = sigmoid (rt_proj)        # reset gate r(t)

        rs = dhs * rt        # "cell" c
        ct = activation (ct_proj + times(rs, H1))

        ht = (1 - zt) * ct + zt * dhs # hidden state ht / output

        # for comparison: CUDNN_GRU
        # i(t) = sigmoid(W_i x(t) +          R_i h(t-1)  + b_Wi + b_Ru)
        # r(t) = sigmoid(W_r x(t) +          R_r h(t-1)  + b_Wr + b_Rr)   --same up to here
        # h'(t) =   tanh(W_h x(t) + r(t) .* (R_h h(t-1)) + b_Wh + b_Rh)   --r applied after projection? Would make life easier!
        # h(t) = (1 - i(t) .* h'(t)) + i(t) .* h(t-1)                     --TODO: need to confirm bracketing with NVIDIA

        h = times(Sht(ht), Wmr) if has_projection else \
            ht

        # returns the new state as a named output
        return Function.NamedOutput(h=h)
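
For reference, the update computed by this step function corresponds to the standard GRU equations below (a sketch: activation is assumed to be tanh, the Sdh stabilizer and the optional Wmr output projection are omitted, and W_z/W_r/W_c and H_z/H_r denote the slices of the stacked W and H matrices):

    z_t = \sigma(x_t W_z + h_{t-1} H_z + b_z)
    r_t = \sigma(x_t W_r + h_{t-1} H_r + b_r)
    \tilde{c}_t = \tanh(x_t W_c + (r_t \odot h_{t-1}) H_1 + b_c)
    h_t = (1 - z_t) \odot \tilde{c}_t + z_t \odot h_{t-1}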
Example 2
    def lstm(dh, dc, x):
        # projected contribution from input(s), hidden, and bias

        dropped_H = dropout(H) if weight_drop_rate is not None else H
        proj4 = b + times(x, W) + times(dh, dropped_H)

        # slicing layout different from cntk's implementation
        it_proj = slice(proj4, stack_axis, 0 * stacked_dim,
                        1 * stacked_dim)  # split along stack_axis
        ft_proj = slice(proj4, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        bit_proj = slice(proj4, stack_axis, 2 * stacked_dim,
                         3 * stacked_dim)  # g gate
        ot_proj = slice(proj4, stack_axis, 3 * stacked_dim, 4 * stacked_dim)

        it = sigmoid(it_proj)  # input gate(t)
        bit = it * activation(bit_proj)  # applied to tanh of input network

        ft = sigmoid(ft_proj)  # forget-me-not gate(t)
        bft = ft * dc  # applied to cell(t-1)

        ct = bft + bit  # c(t) is sum of both

        ot = sigmoid(ot_proj)  # output gate(t)
        ht = ot * activation(ct)  # applied to tanh(cell(t))
        return ht, ct
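
Written out, the four slices of proj4 implement the standard LSTM cell update (a sketch: activation is assumed to be tanh, the optional weight drop on H is ignored, and W_i/W_f/W_g/W_o and H_i/H_f/H_g/H_o denote the slices of the stacked W and H):

    i_t = \sigma(x_t W_i + h_{t-1} H_i + b_i)
    f_t = \sigma(x_t W_f + h_{t-1} H_f + b_f)
    g_t = \tanh(x_t W_g + h_{t-1} H_g + b_g)
    o_t = \sigma(x_t W_o + h_{t-1} H_o + b_o)
    c_t = f_t \odot c_{t-1} + i_t \odot g_t
    h_t = o_t \odot \tanh(c_t)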
Example 3
 def rnn(dh, x):
     dhs = Sdh(dh)  # previous value, stabilized
     ht = activation(times(x, W) + times(dhs, H) + b)
     h = times(Sht(ht), Wmr) if has_projection else \
         ht
     #return Function.NamedOutput(h=h)
     return h
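
In equation form, this plain RNN step computes (stabilizer and optional Wmr projection omitted):

    h_t = \mathrm{activation}(x_t W + h_{t-1} H + b)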
Example 4
    def weight_dropped_lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dhs, dropout(H))

        it_proj  = slice (proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
        bit_proj = slice (proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ft_proj  = slice (proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
        ot_proj  = slice (proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid(peep(it_proj, dcs, Ci))  # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)  # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))  # forget-me-not gate(t)
        bft = ft * dc  # applied to cell(t-1)

        ct = bft + bit  # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)  # applied to tanh(cell(t))

        c = ct  # cell value
        h = times(Sht(ht), Wmr) if has_projection else ht

        return h, c
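
Compared with the plain LSTM of Example 2, this variant adds self-stabilizers, dropout on the recurrent matrix H, an optional output projection Wmr, and optional peephole terms. With use_peepholes enabled the gates become (a sketch; the Sdh/Sdc/Sct stabilizers are omitted):

    i_t = \sigma(x_t W_i + h_{t-1} H_i + b_i + C_i \odot c_{t-1})
    f_t = \sigma(x_t W_f + h_{t-1} H_f + b_f + C_f \odot c_{t-1})
    o_t = \sigma(x_t W_o + h_{t-1} H_o + b_o + C_o \odot c_t)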
Example 5
    def gru(dh, x):

        dhs = Sdh(dh)  # previous value, stabilized
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        projx3 = b + times(x, W)
        projh2  = times(dhs, H)

        zt_proj = slice (projx3, stack_axis, 0*stacked_dim, 1*stacked_dim) + slice (projh2, stack_axis, 0*stacked_dim, 1*stacked_dim)
        rt_proj = slice (projx3, stack_axis, 1*stacked_dim, 2*stacked_dim) + slice (projh2, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ct_proj = slice (projx3, stack_axis, 2*stacked_dim, 3*stacked_dim)

        zt = sigmoid (zt_proj)        # update gate z(t)

        rt = sigmoid (rt_proj)        # reset gate r(t)

        rs = dhs * rt        # "cell" c
        ct = activation (ct_proj + times(rs, H1))

        ht = (1 - zt) * ct + zt * dhs # hidden state ht / output

        # for comparison: CUDNN_GRU
        # i(t) = sigmoid(W_i x(t) +          R_i h(t-1)  + b_Wi + b_Ru)
        # r(t) = sigmoid(W_r x(t) +          R_r h(t-1)  + b_Wr + b_Rr)   --same up to here
        # h'(t) =   tanh(W_h x(t) + r(t) .* (R_h h(t-1)) + b_Wh + b_Rh)   --r applied after projection? Would make life easier!
        # h(t) = (1 - i(t) .* h'(t)) + i(t) .* h(t-1)                     --TODO: need to confirm bracketing with NVIDIA

        h = times(Sht(ht), Wmr) if has_projection else \
            ht

        # returns the new state as a named output
        return Function.NamedOutput(h=h)
Example 6
 def dense(x):
     r = times(x, W1)
     r = times(r, W2)
     if b:
         r = r + b
     if activation is not None:
         r = activation(r)
     return r
Example 7
 def dense(x):
     r = times(x, W1)
     r = times(r, W2)
     if b:
         r = r + b
     if activation is not None:
         r = activation(r)
     return r
Example 8
    def lstm(dh, dc, sv, x):

        # projected contribution from input(s), hidden, and bias
        proj3 = b + times(x, W) + times(dh, H) + times(sv, Hsv)

        it_proj = slice(proj3, stack_axis, 0 * stacked_dim, 1 * stacked_dim)
        ft_proj = slice(proj3, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        ot_proj = slice(proj3, stack_axis, 2 * stacked_dim, 3 * stacked_dim)

        it = sigmoid(it_proj)  # input gate(t)
        ft = sigmoid(ft_proj)  # forget-me-not gate(t)
        ot = sigmoid(ot_proj)  # output gate(t)

        # the following is the reading gate
        proj3rg = sigmoid(
            times(x, Wrg) + times(dh, Hrg) + times(sv, Hsvrg) + brg)
        v = proj3rg * sv

        cx_t = tanh(times(x, Wcx) + times(dh, Hcx))

        # TODO: is stabilization needed here?
        # update memory cell
        c = it * cx_t + ft * dc + tanh(times(v, Wfc))

        h = ot * tanh(c)

        return (h, c, v)
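
This variant augments the LSTM with an extra state vector sv and a reading gate. A sketch of what the step computes (with i_t, f_t, o_t taken from the three slices of proj3):

    g_t = \sigma(x_t W_{rg} + h_{t-1} H_{rg} + sv\,H_{svrg} + b_{rg})        (reading gate)
    v_t = g_t \odot sv
    \tilde{c}_t = \tanh(x_t W_{cx} + h_{t-1} H_{cx})
    c_t = i_t \odot \tilde{c}_t + f_t \odot c_{t-1} + \tanh(v_t W_{fc})
    h_t = o_t \odot \tanh(c_t)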
Example 9
def project_cosine_sim(att_dim, init=glorot_uniform(), name=''):
    """
  Compute the project cosine similarity of two input sequences, where each of the input will be projected to a new dimention space (att_dim) via Wi/Wm
  """
    Wi = Parameter(_INFERRED + tuple((att_dim, )), init=init, name='Wi')
    Wm = Parameter(_INFERRED + tuple((att_dim, )), init=init, name='Wm')
    status = placeholder_variable(name='status')
    memory = placeholder_variable(name='memory')
    projected_status = times(status, Wi, name='projected_status')
    projected_memory = times(memory, Wm, name='projected_memory')
    sim = cosine_similarity(projected_status,
                            projected_memory,
                            name=name + '_sim')
    return seq_softmax(sim, name=name)
Example 10
def project_cosine(project_dim, init = glorot_uniform(), name=''):
  """
  Compute the project cosine similarity of two input sequences, 
  where each of the input will be projected to a new dimention space (project_dim) via Wi/Wm
  """
  Wi = Parameter(_INFERRED + (project_dim,), init = init, name='Wi')
  Wm = Parameter(_INFERRED + (project_dim,), init = init, name='Wm')

  status = placeholder(name='status')
  memory = placeholder(name='memory')

  projected_status = times(status, Wi, name = 'projected_status')   
  projected_memory = times(memory, Wm, name = 'projected_memory')
  status_br = sequence.broadcast_as(projected_status, projected_memory, name='status_broadcast')
  sim = cosine_distance(status_br, projected_memory, name= name)
  return sim
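
Both of the preceding helpers compute a projected cosine similarity: the two inputs are mapped into a common space by the learned matrices Wi and Wm and then compared,

    \mathrm{sim}(s, m) = \cos(s W_i,\; m W_m) = \frac{(s W_i) \cdot (m W_m)}{\lVert s W_i \rVert\,\lVert m W_m \rVert}

Example 9 additionally normalizes the similarities over the sequence with seq_softmax, while this version returns the raw cosine values.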
Example 11
def frcn_predictor(features, rois, n_classes):
    # Load the pretrained classification net and find nodes
    loaded_model = load_model(model_file)
    feature_node = find_by_name(loaded_model, feature_node_name)
    conv_node = find_by_name(loaded_model, last_conv_node_name)
    pool_node = find_by_name(loaded_model, pool_node_name)
    last_node = find_by_name(loaded_model, last_hidden_node_name)

    # Clone the conv layers and the fully connected layers of the network
    conv_layers = combine([conv_node.owner
                           ]).clone(CloneMethod.freeze,
                                    {feature_node: Placeholder()})
    fc_layers = combine([last_node.owner]).clone(CloneMethod.clone,
                                                 {pool_node: Placeholder()})

    # Create the Fast R-CNN model
    feat_norm = features - Constant(114)
    conv_out = conv_layers(feat_norm)
    roi_out = roipooling(conv_out, rois, (roi_dim, roi_dim))
    fc_out = fc_layers(roi_out)

    # z = Dense(rois[0], num_classes, map_rank=1)(fc_out)  # --> map_rank=1 is not yet supported
    W = parameter(shape=(4096, n_classes), init=glorot_uniform())
    b = parameter(shape=n_classes, init=0)
    z = times(fc_out, W) + b

    return z
Example 12
def Embedding(shape=None, init=None, weights=None):
    if init is not None and weights is not None:
        raise ValueError('Embedding: init and weights options are mutually exclusive')

    # parameters bound to this Function:
    # no weights given: learn the embedding
    if weights is None:
        if shape is None:
            raise ValueError('Embedding: output shape must be specified')
        if init is None:
            init = init_default_or_glorot_uniform
        shape = _as_tuple(shape)
        weight_shape = _INFERRED + shape
        E = Parameter(weight_shape, init=init, name='E')
    # weights given: use them as constant
    else:
        UntestedBranchError("Embedding, from constant")
        import numpy as np
        if not isinstance(weights, np.ndarray):
            UntestedBranchError("Embedding, from constant that is not an array")
            # TODO: can 'weights' be a CNTK object? Then how to do this?
            raise ValueError('Embedding: weights must be a numpy array')
        weight_shape = np.shape(weights)
        if shape is not None: # user may give shape, then it must match
            if len(shape) >= len(weight_shape) or weight_shape[-len(shape):] != shape:
                raise ValueError('Embedding: shape parameter must match weights')
        E = Constant(weights, name='E')

    # expression
    x = Placeholder(name='embedding_arg')
    apply_x = times(x, E)
    return Block(apply_x, 'Embedding', Record(E=E))
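
A minimal usage sketch (hypothetical input x and matrix pretrained_weights; the returned Block multiplies its argument by the embedding matrix E):

    # learn a 300-dimensional embedding; the vocabulary size is inferred from the input
    embed = Embedding(shape=300)
    y = embed(x)                          # y = times(x, E)

    # use a fixed, pre-computed embedding matrix (rows indexed by vocabulary entry)
    embed_fixed = Embedding(weights=pretrained_weights)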
Example 13
def Embedding(shape=None, init=None, weights=None):
    if init is not None and weights is not None:
        raise ValueError('Embedding: init and weights options are mutually exclusive')

    # parameters bound to this Function:
    # no weights given: learn the embedding
    if weights is None:
        if shape is None:
            raise ValueError('Embedding: output shape must be specified')
        if init is None:
            init = init_default_or_glorot_uniform
        shape = _as_tuple(shape)
        weight_shape = _INFERRED + shape
        E = Parameter(weight_shape, init=init, name='E')
    # weights given: use them as constant
    else:
        UntestedBranchError("Embedding, from constant")
        import numpy as np
        if not isinstance(weights, np.ndarray):
            UntestedBranchError("Embedding, from constant that is not an array")
            # TODO: can 'weights' be a CNTK object? Then how to do this?
            raise ValueError('Embedding: weights must be a numpy array')
        weight_shape = np.shape(weights)
        if shape is not None: # user may give shape, then it must match
            if len(shape) >= len(weight_shape) or weight_shape[-len(shape):] != shape:
                raise ValueError('Embedding: shape parameter must match weights')
        E = Constant(weights, name='E')

    # expression
    x = Placeholder(name='embedding_arg')
    apply_x = times(x, E)
    return Block(apply_x, 'Embedding', Record(E=E))
Example 14
def test_trainer_with_some_params_not_learned():
    input_dim = 2
    proj_dim = 2
    x = input_variable(shape=(input_dim, ))
    W = parameter(shape=(input_dim, proj_dim), init=glorot_uniform())
    B = parameter(shape=(proj_dim, ), init=glorot_uniform())
    t = times(x, W)
    z = t + B

    W_orig_value = W.value
    B_orig_value = B.value

    labels = input_variable(shape=(proj_dim, ))
    ce = cross_entropy_with_softmax(z, labels)
    pe = classification_error(z, labels)

    lr_per_sample = learning_rate_schedule(0.1, UnitType.sample)
    trainer = Trainer(z, (ce, pe), sgd([W], lr_per_sample))

    x_value = [[1, 1], [2, 2]]
    label_value = [[0, 1], [1, 0]]
    arguments = {x: x_value, labels: label_value}

    num_iters = 3
    for i in range(num_iters):
        trainer.train_minibatch(arguments)

        assert np.array_equal(B.value, B_orig_value)
        assert not np.array_equal(W.value, W_orig_value)
        W_orig_value = W.value

    trainer.test_minibatch(arguments)
Example 15
def test_trainer_with_some_params_not_learned():
    input_dim = 2
    proj_dim = 2
    x = input_variable(shape=(input_dim,))
    W = parameter(shape=(input_dim, proj_dim), init=glorot_uniform())
    B = parameter(shape=(proj_dim,), init=glorot_uniform())
    t = times(x, W)
    z = t + B

    W_orig_value = W.value
    B_orig_value = B.value

    labels = input_variable(shape=(proj_dim,))
    ce = cross_entropy_with_softmax(z, labels)
    pe = classification_error(z, labels)

    lr_per_sample = learning_rate_schedule(0.1, UnitType.sample)
    trainer = Trainer(z, (ce, pe), sgd([W], lr_per_sample))

    x_value = [[1, 1],[2, 2]]
    label_value = [[0, 1], [1, 0]]
    arguments = {x: x_value, labels: label_value}

    num_iters = 3
    for i in range(num_iters):
        trainer.train_minibatch(arguments)

        assert np.array_equal(B.value, B_orig_value)
        assert not np.array_equal(W.value, W_orig_value)
        W_orig_value = W.value

    trainer.test_minibatch(arguments)
Example 16
def fully_connected_layer(input, output_dim, device_id, nonlinearity):
    input_dim = input.shape()[0]
    times_param = parameter(shape=(input_dim, output_dim))
    t = times(input, times_param)
    plus_param = parameter(shape=(output_dim,))
    p = plus(plus_param, t.output())
    return nonlinearity(p.output())
Example 17
def resnet_classifer(input, num_classes):
    conv_w_scale = 7.07
    conv_b_value = 0

    fc1_w_scale = 0.4
    fc1_b_value = 0

    sc_value = 1
    bn_time_const = 4096

    kernel_width = 3
    kernel_height = 3

    conv1_w_scale = 0.26
    c_map1 = 16

    conv1 = conv_bn_relu_layer(input, c_map1, kernel_width, kernel_height, 1,
                               1, conv1_w_scale, conv_b_value, sc_value,
                               bn_time_const)
    rn1_1 = resnet_node2(conv1, c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    rn1_2 = resnet_node2(rn1_1, c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    rn1_3 = resnet_node2(rn1_2, c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)

    c_map2 = 32
    rn2_1_wProj = get_projection_map(c_map2, c_map1)
    rn2_1 = resnet_node2_inc(rn1_3, c_map2, kernel_width, kernel_height,
                             conv1_w_scale, conv_b_value, sc_value,
                             bn_time_const, rn2_1_wProj)
    rn2_2 = resnet_node2(rn2_1, c_map2, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    rn2_3 = resnet_node2(rn2_2, c_map2, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)

    c_map3 = 64
    rn3_1_wProj = get_projection_map(c_map3, c_map2)
    rn3_1 = resnet_node2_inc(rn2_3, c_map3, kernel_width, kernel_height,
                             conv1_w_scale, conv_b_value, sc_value,
                             bn_time_const, rn3_1_wProj)
    rn3_2 = resnet_node2(rn3_1, c_map3, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    rn3_3 = resnet_node2(rn3_2, c_map3, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)

    # Global average pooling
    poolw = 8
    poolh = 8
    poolh_stride = 1
    poolv_stride = 1

    pool = pooling(rn3_3, AVG_POOLING, (1, poolh, poolw),
                   (1, poolv_stride, poolh_stride))
    out_times_params = parameter(shape=(c_map3, 1, 1, num_classes),
                                 init=glorot_uniform())
    out_bias_params = parameter(shape=(num_classes), init=0)
    t = times(pool, out_times_params)
    return t + out_bias_params
Example 18
def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.device import cpu, gpu, set_default_device
    from cntk.ops import input_variable, times
    from scipy.sparse import csr_matrix

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features  = StreamDef(field='S0', shape=input_vocab_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=label_vocab_dim,  is_sparse=True)
    )), randomize=False, epoch_size = 2)

    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=input_vocab_dim, dynamic_axes=input_dynamic_axes,
        name='raw_input', is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100, 
            input_map={raw_input : mbs.streams.features})

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid)

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [
            [3, 4, 5, 4, 7, 12, 1], 
            [60, 61]
            ]
    data = [csr_matrix(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in
            one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = one_hot(one_hot_data, num_classes=input_vocab_dim)
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_hot)])
Example 19
def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import input_variable, times
    from scipy.sparse import csr_matrix as csr

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features  = StreamDef(field='S0', shape=input_vocab_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=label_vocab_dim,  is_sparse=True)
    )), randomize=False, epoch_size = 2)

    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=input_vocab_dim, dynamic_axes=input_dynamic_axes,
        name='raw_input', is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
            input_map={raw_input : mbs.streams.features},
            device=cntk_device(device_id))

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [
            [3, 4, 5, 4, 7, 12, 1],
            [60, 61]
            ]
    data = [csr(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in
            one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = one_hot(one_hot_data, num_classes=input_vocab_dim, device=cntk_device(device_id))
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_hot)])
Example 20
def Dense(shape,
          init=init_default_or_glorot_uniform,
          activation=activation_default_or_None,
          input_rank=None,
          map_rank=None,
          bias=bias_default_or_True,
          init_bias=init_bias_default_or_0):
    activation = _resolve_activation(activation)
    bias = bias if _is_given(bias) else _current_default_options.bias
    output_shape = _as_tuple(shape)

    if input_rank is not None and map_rank is not None:
        raise ValueError(
            "Dense: input_rank and map_rank cannot be specified at the same time."
        )

    # determine meaning of axes
    # W gets dimension (input_shape + shape)
    # where input_shape is determined as:
    #  - by default, equal to the dimensions of the input passed to Dense()
    #  - if input_rank is given, then the last 'input_rank' dimensions of the input (all others are not reduced over)
    #  - if map_rank is given, then all but the first 'map_rank' dimensions of the input (those are not reduced over)
    # where input_rank and map_rank are mutually exclusive.

    #output_rank = -len(output_shape)   # support outputs with tensor layouts
    # BUGBUG: Should this be a negative number now, since output is the last axis in Python?
    output_rank = len(output_shape)  # support outputs with tensor layouts

    # If input_rank not given then pass a single _INFERRED; map_rank if given will determine the input_rank.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED * (input_rank if input_rank is not None else 1)

    if input_rank is not None:
        UntestedBranchError("Dense, input_rank option not implemented")
        infer_input_rank_to_map = -1  # means map_rank is not specified; input_rank rules
    elif map_rank is None:
        infer_input_rank_to_map = 0  # neither given: default to 'infer W to use all input dims'
    else:
        UntestedBranchError("Dense, map_rank option not implemented")
        infer_input_rank_to_map = map_rank  # infer W to use all input dims except the first static 'map_rank' ones

    # parameters bound to this Function
    init_weights = _initializer_for(init, Record(output_rank=output_rank))
    W = Parameter(input_shape + output_shape, init=init_weights, name='W')
    b = Parameter(output_shape, init=init_bias, name='b') if bias else None

    # expression of this function
    x = Placeholder(name='dense_arg')
    apply_x = times(x,
                    W,
                    output_rank=output_rank,
                    infer_input_rank_to_map=infer_input_rank_to_map)
    if b:
        apply_x = apply_x + b
    apply_x = apply_x >> activation
    return Block(apply_x, 'Dense', Record(W=W, b=b))
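
A usage sketch for this factory (hypothetical input features; relu is assumed to come from cntk.ops): the returned Block computes activation(times(x, W) + b) when applied to an input.

    dense = Dense(1024, activation=relu)   # fully connected layer with 1024 output units
    h = dense(features)                    # h = relu(times(features, W) + b)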
Example 21
def _sparse_to_dense_network_cache(input_shape, is_sequence, device):
    from cntk.ops import times, input, sequence

    if is_sequence:
        temp_input = sequence.input(input_shape, is_sparse=True)
    else:
        temp_input = input(input_shape, is_sparse=True)

    eye_shape = input_shape[-1]
    return times(temp_input, np.eye(eye_shape))
Example 22
def fully_connected_classifier_net(input, num_output_classes, hidden_layer_dim, num_hidden_layers, device, nonlinearity):
    classifier_root = fully_connected_layer(input, hidden_layer_dim, device, nonlinearity)
    for i in range(1, num_hidden_layers):
        classifier_root = fully_connected_layer(classifier_root.output(), hidden_layer_dim, device, nonlinearity)
    
    output_times_param = parameter(shape=(hidden_layer_dim, num_output_classes))
    output_plus_param = parameter(shape=(num_output_classes,))
    t = times(classifier_root.output(), output_times_param)
    classifier_root = plus(output_plus_param, t.output())
    return classifier_root
Example 23
def resnet_classifer(input, num_classes):
    conv_w_scale = 7.07
    conv_b_value = 0

    fc1_w_scale = 0.4
    fc1_b_value = 0

    sc_value = 1
    bn_time_const = 4096

    kernel_width = 3
    kernel_height = 3

    conv1_w_scale = 0.26
    c_map1 = 16

    conv1 = conv_bn_relu_layer(input, c_map1, kernel_width, kernel_height,
                               1, 1, conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    rn1_1 = resnet_node2(conv1, c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    rn1_2 = resnet_node2(rn1_1, c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    rn1_3 = resnet_node2(rn1_2, c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)

    c_map2 = 32
    rn2_1_wProj = get_projection_map(c_map2, c_map1)
    rn2_1 = resnet_node2_inc(rn1_3, c_map2, kernel_width, kernel_height,
                             conv1_w_scale, conv_b_value, sc_value, bn_time_const, rn2_1_wProj)
    rn2_2 = resnet_node2(rn2_1, c_map2, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    rn2_3 = resnet_node2(rn2_2, c_map2, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)

    c_map3 = 64
    rn3_1_wProj = get_projection_map(c_map3, c_map2)
    rn3_1 = resnet_node2_inc(rn2_3, c_map3, kernel_width, kernel_height,
                             conv1_w_scale, conv_b_value, sc_value, bn_time_const, rn3_1_wProj)
    rn3_2 = resnet_node2(rn3_1, c_map3, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    rn3_3 = resnet_node2(rn3_2, c_map3, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)

    # Global average pooling
    poolw = 8
    poolh = 8
    poolh_stride = 1
    poolv_stride = 1

    pool = pooling(rn3_3, AVG_POOLING, (1, poolh, poolw), (1, poolv_stride, poolh_stride))
    out_times_params = parameter(shape=(c_map3, 1, 1, num_classes), init=glorot_uniform())
    out_bias_params = parameter(shape=(num_classes), init=0)
    t = times(pool, out_times_params)
    return t + out_bias_params
Example 24
def test_eval_sparse_no_seq(batch_index_data, device_id):
    dim = 10
    multiplier = 2
    for var_is_sparse in [True, False]:
        in1 = input_variable(shape=(dim,), is_sparse=var_is_sparse)
        z = times(in1, multiplier*np.eye(dim))
        batch = np.eye(dim)[batch_index_data]
        expected = batch * multiplier
        sparse_val = csr(batch.astype('f'))
        result = z.eval({in1: [sparse_val]}, device=cntk_device(device_id))
        assert np.allclose(result, [expected])
Example 25
def test_eval_sparse_no_seq(batch_index_data, device_id):
    dim = 10
    multiplier = 2
    for var_is_sparse in [True, False]:
        in1 = input_variable(shape=(dim, ), is_sparse=var_is_sparse)
        z = times(in1, multiplier * np.eye(dim))
        batch = np.eye(dim)[batch_index_data]
        expected = batch * multiplier
        sparse_val = csr(batch.astype('f'))
        result = z.eval({in1: [sparse_val]}, device=cntk_device(device_id))
        assert np.allclose(result, [expected])
Example 26
def test_disallow_seq_starts_with_Value_objects():
    one_hot_batch = [[2,5], [0,1,6]]
    dim = 10

    in1 = input_variable(shape=(dim,), is_sparse=True)
    z = times(in1, np.eye(dim))
    batch = one_hot(one_hot_batch, num_classes=dim)

    with pytest.raises(ValueError):
        result = z.eval(({in1: batch}, len(batch)*[True]))

    with pytest.raises(ValueError):
        result = z.eval({in1: (batch, len(batch)*[True])})
Example 27
def test_eval_one_hot_seq(one_hot_batch, device_id):
    dim = 10
    multiplier = 2

    for var_is_sparse in [True, False]:
        in1 = input_variable(shape=(dim,), is_sparse=var_is_sparse)
        # Convert CNTK node value to dense so that we can compare it later
        z = times(in1, np.eye(dim)*multiplier)
        # Convert expectation to dense
        expected = [np.eye(dim)[seq]*multiplier for seq in one_hot_batch]
        batch = one_hot(one_hot_batch, num_classes=dim, device=cntk_device(device_id))
        result = z.eval({in1: batch}, device=cntk_device(device_id))
        assert np.all([np.allclose(a,b) for a,b in zip(result, expected)])
Example 28
def test_disallow_seq_starts_with_Value_objects():
    one_hot_batch = [[2, 5], [0, 1, 6]]
    dim = 10

    in1 = input_variable(shape=(dim, ), is_sparse=True)
    z = times(in1, np.eye(dim))
    batch = one_hot(one_hot_batch, num_classes=dim)

    with pytest.raises(ValueError):
        result = z.eval(({in1: batch}, len(batch) * [True]))

    with pytest.raises(ValueError):
        result = z.eval({in1: (batch, len(batch) * [True])})
Example 29
def test_eval_one_hot_seq(one_hot_batch, device_id):
    dim = 10
    multiplier = 2

    for var_is_sparse in [True, False]:
        in1 = input_variable(shape=(dim,), is_sparse=var_is_sparse)
        # Convert CNTK node value to dense so that we can compare it later
        z = times(in1, np.eye(dim)*multiplier)
        # Convert expectation to dense
        expected = [np.eye(dim)[seq]*multiplier for seq in one_hot_batch]
        batch = one_hot(one_hot_batch, num_classes=dim, device=cntk_device(device_id))
        result = z.eval({in1: batch}, device=cntk_device(device_id))
        assert np.all([np.allclose(a,b) for a,b in zip(result, expected)])
Example 30
    def lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dhs, H)

        it_proj  = slice (proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
        bit_proj = slice (proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ft_proj  = slice (proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
        ot_proj  = slice (proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid (peep (it_proj, dcs, Ci))        # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation (bit_proj)              # applied to tanh of input network

        ft = sigmoid (peep (ft_proj, dcs, Cf))        # forget-me-not gate(t)
        bft = ft * dc                                 # applied to cell(t-1)

        ct = bft + bit                                # c(t) is sum of both

        ot = sigmoid (peep (ot_proj, Sct(ct), Co))    # output gate(t)
        ht = ot * activation (ct)                     # applied to tanh(cell(t))

        c = ct                                        # cell value
        h = times(Sht(ht), Wmr) if has_projection else \
            ht

        # returns the new state as a tuple with names but order matters
        return (Function.NamedOutput(h=h), Function.NamedOutput(c=c))
Example 31
def test_eval_sparse_seq_1(batch, device_id):
    dim = 4
    multiplier = 2
    for var_is_sparse in [True, False]:
        in1 = input_variable(shape=(dim,), is_sparse=var_is_sparse)
        z = times(in1, multiplier*np.eye(dim))
        if isinstance(batch[0], list):
            expected = [np.vstack([m.todense() * multiplier for m in seq]) for seq in
                    batch]
        else:
            expected = [seq.todense() * multiplier for seq in batch]
        result = z.eval({in1: batch}, device=cntk_device(device_id))

        assert np.all([np.allclose(a,b) for a,b in zip(result, expected)]), \
                "%s != %s"%(result,expected)
Example 32
def test_eval_sparse_seq_1(batch, device_id):
    dim = 4
    multiplier = 2
    for var_is_sparse in [True, False]:
        in1 = input_variable(shape=(dim,), is_sparse=var_is_sparse)
        z = times(in1, multiplier*np.eye(dim))
        if isinstance(batch[0], list):
            expected = [np.vstack([m.todense() * multiplier for m in seq]) for seq in
                    batch]
        else:
            expected = [seq.todense() * multiplier for seq in batch]
        result = z.eval({in1: batch}, device=cntk_device(device_id))

        assert np.all([np.allclose(a,b) for a,b in zip(result, expected)]), \
                "%s != %s"%(result,expected)
Example 33
def Dense(shape, init=init_default_or_glorot_uniform, activation=activation_default_or_None,
          input_rank=None, map_rank=None,
          bias=bias_default_or_True, init_bias=init_bias_default_or_0):
    activation = _resolve_activation(activation)
    bias       = bias if _is_given(bias) else _current_default_options.bias
    output_shape = _as_tuple(shape)

    if input_rank is not None and map_rank is not None:
        raise ValueError("Dense: input_rank and map_rank cannot be specified at the same time.")

    # determine meaning of axes
    # W gets dimension (input_shape + shape)
    # where input_shape is determined as:
    #  - by default, equal to the dimensions of the input passed to Dense()
    #  - if input_rank is given, then the last 'input_rank' dimensions of the input (all others are not reduced over)
    #  - if map_rank is given, then all but the first 'map_rank' dimensions of the input (those are not reduced over)
    # where input_rank and map_rank are mutually exclusive.

    #output_rank = -len(output_shape)   # support outputs with tensor layouts
    # BUGBUG: Should this be a negative number now, since output is the last axis in Python?
    output_rank = len(output_shape)   # support outputs with tensor layouts

    # If input_rank not given then pass a single _INFERRED; map_rank if given will determine the input_rank.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED * (input_rank if input_rank is not None else 1)

    if input_rank is not None:
        UntestedBranchError("Dense, input_rank option not implemented")
        infer_input_rank_to_map = -1 # means map_rank is not specified; input_rank rules
    elif map_rank is None:
        infer_input_rank_to_map = 0  # neither given: default to 'infer W to use all input dims'
    else:
        UntestedBranchError("Dense, map_rank option not implemented")
        infer_input_rank_to_map = map_rank  # infer W to use all input dims except the first static 'map_rank' ones

    # parameters bound to this Function
    init_weights = _initializer_for(init, Record(output_rank=output_rank))
    W = Parameter(input_shape + output_shape, init=init_weights, name='W')
    b = Parameter(              output_shape, init=init_bias,    name='b') if bias else None

    # expression of this function
    x = Placeholder(name='dense_arg')
    apply_x = times(x, W, output_rank=output_rank, infer_input_rank_to_map=infer_input_rank_to_map)
    if b:
        apply_x = apply_x + b
    apply_x = apply_x >> activation
    return Block(apply_x, 'Dense', Record(W=W, b=b))
Example 34
def test_model_one_output_of_multi_output_function():
    input_dim = 2
    proj_dim = 11
    x = input_variable((input_dim,))

    x_placeholder = placeholder_variable()
    w = parameter((input_dim, proj_dim))
    b = parameter((proj_dim,))
    proj = times(x_placeholder, w)
    proj_plus_bias = proj + b
    combined_model = as_block(combine([proj, proj_plus_bias]), [(x_placeholder, x)], 'dense_op')

    labels = input_variable((proj_dim,))
    lr_schedule = learning_rate_schedule(0.003, UnitType.sample)
    ce = cross_entropy_with_softmax(combined_model.outputs[0], labels)
    pe = classification_error(combined_model.outputs[0], labels)
    trainer_multitask = Trainer(combined_model.outputs[0], (ce, pe), sgd(ce.parameters, lr=lr_schedule))
Example 35
def test_model_one_output_of_multi_output_function():
    input_dim = 2
    proj_dim = 11
    x = input_variable((input_dim, ))

    x_placeholder = placeholder_variable()
    w = parameter((input_dim, proj_dim))
    b = parameter((proj_dim, ))
    proj = times(x_placeholder, w)
    proj_plus_bias = proj + b
    combined_model = as_block(combine([proj, proj_plus_bias]),
                              [(x_placeholder, x)], 'dense_op')

    labels = input_variable((proj_dim, ))
    lr_schedule = learning_rate_schedule(0.003, UnitType.sample)
    ce = cross_entropy_with_softmax(combined_model.outputs[0], labels)
    pe = classification_error(combined_model.outputs[0], labels)
    trainer_multitask = Trainer(combined_model.outputs[0], (ce, pe),
                                sgd(ce.parameters, lr=lr_schedule))
Example 36
def frcn_predictor(features, rois, n_classes, base_path):
    # model specific variables for AlexNet
    model_file = base_path + "/../../../resources/cntk/AlexNet.model"
    roi_dim = 6
    feature_node_name = "features"
    last_conv_node_name = "conv5.y"
    pool_node_name = "pool3"
    last_hidden_node_name = "h2_d"

    # Load the pretrained classification net and find nodes
    print("Loading pre-trained model...")
    loaded_model = load_model(model_file)
    print("Loading pre-trained model... DONE.")
    feature_node = find_by_name(loaded_model, feature_node_name)
    conv_node = find_by_name(loaded_model, last_conv_node_name)
    pool_node = find_by_name(loaded_model, pool_node_name)
    last_node = find_by_name(loaded_model, last_hidden_node_name)

    # Clone the conv layers and the fully connected layers of the network
    conv_layers = combine([conv_node.owner
                           ]).clone(CloneMethod.freeze,
                                    {feature_node: placeholder()})
    fc_layers = combine([last_node.owner]).clone(CloneMethod.clone,
                                                 {pool_node: placeholder()})

    # Create the Fast R-CNN model
    feat_norm = features - constant(114)
    conv_out = conv_layers(feat_norm)
    roi_out = roipooling(conv_out, rois, (roi_dim, roi_dim))
    fc_out = fc_layers(roi_out)
    #fc_out.set_name("fc_out")

    # z = Dense(rois[0], num_classes, map_rank=1)(fc_out)  # --> map_rank=1 is not yet supported
    W = parameter(shape=(4096, n_classes), init=glorot_uniform())
    b = parameter(shape=n_classes, init=0)
    z = times(fc_out, W) + b
    return z, fc_out
Example 37
def frcn_predictor(features, rois, n_classes):
    # Load the pretrained classification net and find nodes
    loaded_model = load_model(model_file)
    feature_node = find_by_name(loaded_model, feature_node_name)
    conv_node    = find_by_name(loaded_model, last_conv_node_name)
    pool_node    = find_by_name(loaded_model, pool_node_name)
    last_node    = find_by_name(loaded_model, last_hidden_node_name)

    # Clone the conv layers and the fully connected layers of the network
    conv_layers = combine([conv_node.owner]).clone(CloneMethod.freeze, {feature_node: Placeholder()})
    fc_layers = combine([last_node.owner]).clone(CloneMethod.clone, {pool_node: Placeholder()})

    # Create the Fast R-CNN model
    feat_norm = features - Constant(114)
    conv_out  = conv_layers(feat_norm)
    roi_out   = roipooling(conv_out, rois, (roi_dim, roi_dim))
    fc_out    = fc_layers(roi_out)

    # z = Dense(rois[0], num_classes, map_rank=1)(fc_out)  # --> map_rank=1 is not yet supported
    W = parameter(shape=(4096, n_classes), init=glorot_uniform())
    b = parameter(shape=n_classes, init=0)
    z = times(fc_out, W) + b

    return z
Example 38
def gru_cell(shape, init=glorot_uniform(), name=''):  # (x, (h,c))
    """ GRU cell function
  """
    shape = _as_tuple(shape)

    if len(shape) != 1:
        raise ValueError("gru_cell: shape must be vectors (rank-1 tensors)")

    # determine stacking dimensions
    cell_shape_stacked = shape * 2  # patched dims with stack_axis duplicated 2 times

    # parameters
    Wz = Parameter(cell_shape_stacked, init=init, name='Wz')
    Wr = Parameter(cell_shape_stacked, init=init, name='Wr')
    Wh = Parameter(cell_shape_stacked, init=init, name='Wh')
    Uz = Parameter(_INFERRED + shape, init=init, name='Uz')
    Ur = Parameter(_INFERRED + shape, init=init, name='Ur')
    Uh = Parameter(_INFERRED + shape, init=init, name='Uh')

    def create_s_placeholder():
        # we pass the known dimensions here, which makes dimension inference easier
        return Placeholder(shape=shape, name='S')  # (h, c)

    # parameters to model function
    x = Placeholder(name='gru_block_arg')
    prev_status = create_s_placeholder()

    # formula of model function
    Sn_1 = prev_status

    z = sigmoid(times(x, Uz, name='x*Uz') + times(Sn_1, Wz, name='Sprev*Wz'),
                name='z')
    r = sigmoid(times(x, Ur, name='x*Ur') + times(Sn_1, Wr, name='Sprev*Wr'),
                name='r')
    h = tanh(times(x, Uh, name='x*Uh') +
             times(element_times(Sn_1, r, name='Sprev*r'), Wh),
             name='h')
    s = plus(element_times((1 - z), h, name='(1-z)*h'),
             element_times(z, Sn_1, name='z*SPrev'),
             name=name)
    apply_x_s = combine([s])
    apply_x_s.create_placeholder = create_s_placeholder
    return apply_x_s
Example 39
def resnet_classifer(input, num_classes, device, output_name):
    conv_w_scale = 7.07
    conv_b_value = 0

    fc1_w_scale = 0.4
    fc1_b_value = 0

    sc_value = 1
    bn_time_const = 4096

    kernel_width = 3
    kernel_height = 3

    conv1_w_scale = 0.26
    c_map1 = 16

    conv1 = conv_bn_relu_layer(input, c_map1, kernel_width, kernel_height, 1,
                               1, conv1_w_scale, conv_b_value, sc_value,
                               bn_time_const, device)
    rn1_1 = resnet_node2(conv1.output(), c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const,
                         device)
    rn1_2 = resnet_node2(rn1_1.output(), c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const,
                         device)
    rn1_3 = resnet_node2(rn1_2.output(), c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const,
                         device)

    c_map2 = 32
    rn2_1_wProj = get_projection_map(c_map2, c_map1, device)
    rn2_1 = resnet_node2_inc(rn1_3.output(), c_map2, kernel_width,
                             kernel_height, conv1_w_scale, conv_b_value,
                             sc_value, bn_time_const, rn2_1_wProj, device)
    rn2_2 = resnet_node2(rn2_1.output(), c_map2, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const,
                         device)
    rn2_3 = resnet_node2(rn2_2.output(), c_map2, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const,
                         device)

    c_map3 = 64
    rn3_1_wProj = get_projection_map(c_map3, c_map2, device)
    rn3_1 = resnet_node2_inc(rn2_3.output(), c_map3, kernel_width,
                             kernel_height, conv1_w_scale, conv_b_value,
                             sc_value, bn_time_const, rn3_1_wProj, device)
    rn3_2 = resnet_node2(rn3_1.output(), c_map3, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const,
                         device)
    rn3_3 = resnet_node2(rn3_2.output(), c_map3, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const,
                         device)

    # Global average pooling
    poolw = 8
    poolh = 8
    poolh_stride = 1
    poolv_stride = 1

    pool = pooling(rn3_3.output(), AVG_POOLING, (1, poolh, poolw),
                   (1, poolv_stride, poolh_stride))
    out_times_params = parameter(shape=(c_map3, 1, 1, num_classes),
                                 device_id=device)
    out_bias_params = parameter(shape=(num_classes, ), device_id=device)
    t = times(pool.output(), out_times_params)
    return plus(t.output(), out_bias_params, output_name)
Example 40
def LSTM(shape, cell_shape=None, use_peepholes=use_peepholes_default_or_False,
         init=init_default_or_glorot_uniform, init_bias=init_bias_default_or_0,
         enable_self_stabilization=enable_self_stabilization_default_or_False): # (x, (h, c))

    use_peepholes             = use_peepholes             if _is_given(use_peepholes)             else _current_default_options.use_peepholes
    enable_self_stabilization = enable_self_stabilization if _is_given(enable_self_stabilization) else _current_default_options.enable_self_stabilization
    has_projection = cell_shape is not None
    has_aux = False

    if has_aux:
        UntestedBranchError("LSTM, has_aux option")

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError("LSTM: shape and cell_shape must be vectors (rank-1 tensors)")
        # otherwise we'd need to fix slicing and Param initializers

    stack_axis = -1  # stacking along the fastest-changing one, to match BS
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[0]
    cell_shape_list[stack_axis] = stacked_dim*4
    cell_shape_stacked = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times

    # parameters
    b  = Parameter(            cell_shape_stacked, init=init_bias, name='b')                              # a bias
    W  = Parameter(_INFERRED + cell_shape_stacked, init=init,      name='W')                              # input
    A  = Parameter(_INFERRED + cell_shape_stacked, init=init,      name='A') if has_aux else None         # aux input (optional)
    H  = Parameter(shape     + cell_shape_stacked, init=init,      name='H')                              # hidden-to-hidden
    Ci = Parameter(            cell_shape,         init=init,      name='Ci') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Cf = Parameter(            cell_shape,         init=init,      name='Cf') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Co = Parameter(            cell_shape,         init=init,      name='Co') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}

    Wmr = Parameter(cell_shape + shape, init=init) if has_projection else None  # final projection

    Sdh = Stabilizer() if enable_self_stabilization else identity
    Sdc = Stabilizer() if enable_self_stabilization else identity
    Sct = Stabilizer() if enable_self_stabilization else identity
    Sht = Stabilizer() if enable_self_stabilization else identity

    def create_hc_placeholder():
        # we pass the known dimensions here, which makes dimension inference easier
        return (Placeholder(shape=shape, name='hPh'), Placeholder(shape=cell_shape, name='cPh')) # (h, c)

    # parameters to model function
    x = Placeholder(name='lstm_block_arg')
    prev_state = create_hc_placeholder()

    # formula of model function
    dh, dc = prev_state

    dhs = Sdh(dh)  # previous values, stabilized
    dcs = Sdc(dc)
    # note: input does not get a stabilizer here, user is meant to do that outside

    # projected contribution from input(s), hidden, and bias
    proj4 = b + times(x, W) + times(dhs, H) + times(aux, A) if has_aux else \
            b + times(x, W) + times(dhs, H)

    it_proj  = slice (proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
    bit_proj = slice (proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
    ft_proj  = slice (proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
    ot_proj  = slice (proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

    # add peephole connection if requested
    def peep(x, c, C):
        return x + C * c if use_peepholes else x

    it = sigmoid (peep (it_proj, dcs, Ci))        # input gate(t)
    bit = it * tanh (bit_proj)                    # applied to tanh of input network

    ft = sigmoid (peep (ft_proj, dcs, Cf))        # forget-me-not gate(t)
    bft = ft * dc                                 # applied to cell(t-1)

    ct = bft + bit                                # c(t) is sum of both

    ot = sigmoid (peep (ot_proj, Sct(ct), Co))    # output gate(t)
    ht = ot * tanh (ct)                           # applied to tanh(cell(t))

    c = ct                                        # cell value
    h = times(Sht(ht), Wmr) if has_projection else \
        ht

    _name_node(h, 'h')
    if _trace_layers:
        _log_node(h)  # this looks right
    _name_node(c, 'c')

    # TODO: figure out how to do scoping, and also rename all the apply... to expression
    apply_x_h_c = combine ([h, c])
    # return to caller a helper function to create placeholders for recurrence
    # Note that this function will only exist in the object returned here, but not any cloned version of it.
    apply_x_h_c.create_placeholder = create_hc_placeholder
    #return Block(apply_x_h_c, 'LSTM') # BUGBUG: fails with "RuntimeError: A Function instance with more than one output cannot be implicitly converted to a Variable"
    return apply_x_h_c
Example 41
 def no_op(input):
     return times(input, I)
Example 42
def test_op_times(left_operand, right_operand, device_id, precision,
        left_matrix_type, right_matrix_type):
    if right_matrix_type == 'sparse':
        pytest.skip('second operator of times() has to be dense')

    dt = PRECISION_TO_TYPE[precision]
    # Forward pass test
    #==================
    # we compute the expected output for the forward pass
    # we need two surrounding brackets
    # the first for sequences (length=1, since we have dynamic_axis='')
    # the second for batch of one sample
    expected = [[np.dot(AA(left_operand, dtype=dt), AA(right_operand, dtype=dt))]]
    
    if left_matrix_type == 'sparse':
        a = SI(*batch_dense_to_sparse([left_operand]))
    else:
        a = I([left_operand])

    b = I([right_operand])

    from cntk.ops import times, constant
    left_as_input = times(a, constant(right_operand))
    right_as_input = times(constant(left_operand), b)

    unittest_helper(left_as_input, None, expected, device_id=device_id,
                    precision=precision, clean_up=True, backward_pass=False)

    unittest_helper(right_as_input, None, expected, device_id=device_id,
                    precision=precision, clean_up=True, backward_pass=False)

    unittest_helper(times(a, b), None, expected, device_id=device_id,
                    precision=precision, clean_up=True, backward_pass=False)


    # Backward pass test
    #==================

    def op_grad(A, B):
        '''
        Compute derivative of A with respect to B. For simplicity, assume A
        and B to be matrices.
        Let A be 2x2 and B be 2x1, then we have
        [a11 a12] [b11]  = [ a11 b11 + a12 b21 ]
        [a21 a22] [b21]    [ a21 b11 + a22 b21 ]

        The derivative for A with respect to B is
        [b11 b21]
        [b11 b21]

        The derivative for B with respect to A:
        [a11 + a12]
        [a21 + a22]
        '''
        assert len(A.shape) == len(B.shape) == 2
        D = np.zeros_like(A)
        D[:,:] = B.sum(axis=1)
        
        return D

    if 'sparse' not in [left_matrix_type, right_matrix_type]:
        # FIXME: disabling until the Pass node supports sparse 
        expected_left = [[op_grad(AA(left_operand, dtype=dt), AA(right_operand, dtype=dt))]]
        expected_right = [[op_grad(AA(right_operand, dtype=dt).T, AA(left_operand, dtype=dt).T).T]]

        unittest_helper(left_as_input, None, expected_left, device_id=device_id,
                        precision=precision, clean_up=True, backward_pass=True, input_node=a)
        # BUG: Fails because of Pass node?
        unittest_helper(right_as_input, None, expected_right, device_id=device_id,
                        precision=precision, clean_up=True, backward_pass=True, input_node=b)
Example 43
def termination_gate(init=glorot_uniform(), name=''):
    Wt = Parameter(_INFERRED + tuple((1, )), init=init, name='Wt')
    status = placeholder_variable(name='status')
    return sigmoid(times(status, Wt), name=name)
Example 44
 def _linear(x):
     apply_x = ops.times(x, sc)
     apply_x += b
     return apply_x
Example 45
    else:
        recurrence_hook_h = lambda operand: element_select(
            is_first_label, thought_vector_broadcast_h, past_value(operand))
        recurrence_hook_c = lambda operand: element_select(
            is_first_label, thought_vector_broadcast_c, past_value(operand))

    (decoder_output_h,
     decoder_output_c) = LSTM_layer(decoder_output_h.output, hidden_dim,
                                    recurrence_hook_h, recurrence_hook_c)
# 1. Add the linear layer

W = parameter(shape=(decoder_output_h.shape[0], label_vocab_dim),
              init=glorot_uniform())
B = parameter(shape=(label_vocab_dim), init=0)
z = plus(B, times(decoder_output_h, W))


def create_model():

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim),
                               dynamic_axes=input_dynamic_axes,
                               name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
Example 46
def create_model():

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(input_vocab_dim),
                               dynamic_axes=input_dynamic_axes,
                               name='raw_input')

    label_dynamic_axes = [batch_axis, label_seq_axis]
    raw_labels = input_variable(shape=(label_vocab_dim),
                                dynamic_axes=label_dynamic_axes,
                                name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = sequence.slice(
        raw_labels, 1, 0,
        name='label_sequence')  # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)  # <s>

    # Setup primer for decoder
    is_first_label = sequence.is_first(label_sequence)  # 1 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(label_sentence_start,
                                                      is_first_label)

    # Encoder
    stabilize = Stabilizer()
    encoder_output_h = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_output_h,
         encoder_output_c) = LSTM_layer(encoder_output_h.output, hidden_dim,
                                        future_value, future_value)

    # Prepare encoder output to be used in decoder
    thought_vector_h = sequence.first(encoder_output_h)
    thought_vector_c = sequence.first(encoder_output_c)

    thought_vector_broadcast_h = sequence.broadcast_as(thought_vector_h,
                                                       label_sequence)
    thought_vector_broadcast_c = sequence.broadcast_as(thought_vector_c,
                                                       label_sequence)

    # Decoder
    decoder_history_hook = alias(
        label_sequence, name='decoder_history_hook')  # copy label_sequence

    decoder_input = element_select(is_first_label,
                                   label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_output_h = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hook_h = past_value
            recurrence_hook_c = past_value
        else:
            recurrence_hook_h = lambda operand: element_select(
                is_first_label, thought_vector_broadcast_h, past_value(operand)
            )
            recurrence_hook_c = lambda operand: element_select(
                is_first_label, thought_vector_broadcast_c, past_value(operand)
            )

        (decoder_output_h,
         decoder_output_c) = LSTM_layer(decoder_output_h.output, hidden_dim,
                                        recurrence_hook_h, recurrence_hook_c)

    # Linear output layer
    W = parameter(shape=(decoder_output_h.shape[0], label_vocab_dim),
                  init=glorot_uniform())
    B = parameter(shape=(label_vocab_dim), init=0)
    z = plus(B, times(stabilize(decoder_output_h), W))

    return z
Example 47
 def rnn(dh, x):
     dhs = Sdh(dh)  # previous value, stabilized
     ht = activation (times(x, W) + times(dhs, H) + b)
     h = times(Sht(ht), Wmr) if has_projection else \
         ht
     return Function.NamedOutput(h=h)
Example 48
def _sparse_to_dense_network_cache(input_shape):
    from cntk.ops import times, input_variable

    temp_input = input_variable(input_shape)
    eye_shape = input_shape[-1]
    return times(temp_input, np.eye(eye_shape))
Example 49
 def rnn_step(dh, x):
     dhs = Sdh(dh)  # previous value, stabilized
     ht = activation(times(x, W) + dhs * H + b)
     h = times(Sht(ht), Wmr) if has_projection else \
         ht
     return h