Example #1
def seqcla():

    # LSTM params
    input_dim = 50
    output_dim = 128
    cell_dim = 128
    
    # model
    num_labels = 5
    vocab = 2000
    embed_dim = 50    

    t = C.dynamic_axis(name='t')
    features = C.sparse_input(vocab, dynamic_axis=t, name='features')    
    labels = C.input(num_labels, name='labels')
   
    train_reader = C.CNTKTextFormatReader(train_file)

    # setup embedding matrix
    embedding = C.parameter((embed_dim, vocab), learning_rate_multiplier=0.0, 
                             init_from_file_path=embedding_file)

    # get the vector representing the word
    sequence = C.times(embedding, features, name='sequence')
    
    # add an LSTM layer
    L = lstm_layer(output_dim, cell_dim, sequence, input_dim)
    
    # add a softmax layer on top
    w = C.parameter((num_labels, output_dim), name='w')
    b = C.parameter((num_labels), name='b')
    z = C.times(w, L) + b
    z.name='z'
    z.tag = "output"
    
    # and reconcile the shared dynamic axis
    pred = C.reconcile_dynamic_axis(z, labels, name='pred')    
    
    ce = C.cross_entropy_with_softmax(labels, pred)
    ce.tag = "criterion"
    
    my_sgd = C.SGDParams(epoch_size=0, minibatch_size=10, learning_rates_per_mb=0.1, max_epochs=3)    
    
    with C.LocalExecutionContext('seqcla') as ctx:
        # train the model
        ctx.train(root_nodes=[ce], training_params=my_sgd, input_map=train_reader.map(
                  features, alias='x', dim=vocab, format='Sparse').map(
                  labels, alias='y', dim=num_labels, format='Dense'))        
        
        # write out the predictions
        ctx.write(input_map=train_reader.map(
                  features, alias='x', dim=vocab, format='Sparse').map(
                  labels, alias='y', dim=num_labels, format='Dense'))
                  
        # do some manual accuracy testing
        acc = calc_accuracy(train_file, ctx.output_filename_base)
        
        # and test for the same number...
        TOLERANCE_ABSOLUTE = 1E-02
        assert np.allclose(acc, 0.6006415396952687, atol=TOLERANCE_ABSOLUTE)
Example #2
def test_op_times_reduce_sequence_axis(device_id, precision):
    dt_precision = PRECISION_TO_TYPE[precision]

    from cntk import times, Value, TIMES_REDUCE_SEQUENCE_AXIS_WITHOUT_INFERRED_INPUT_RANK
    from cntk import sequence
    dim = 10
    seq = [[0,1,2], [3], [4,5,6,7,8,9]]
    right_data = Value.one_hot(seq, dim, dtype=dt_precision)
    right_var = sequence.input_variable(shape=(dim), is_sparse=True, dtype=dt_precision)
    left_data = [AA([1,1,1],dtype=dt_precision), AA([1],dtype=dt_precision), AA([1,1,1,1,1,1],dtype=dt_precision)]
    left_var = sequence.input_variable(shape=(1), dtype=dt_precision)

    func = times(left_var, right_var, infer_input_rank_to_map=TIMES_REDUCE_SEQUENCE_AXIS_WITHOUT_INFERRED_INPUT_RANK)
    func2 = sequence.reduce_sum(times(left_var, right_var))

    assert func.dynamic_axes == func2.dynamic_axes

    _, forward_output = func.forward({left_var:left_data, right_var:right_data})
    
    actual_forward = forward_output[func.output]

    expected_forward = AA([[[1,1,1,0,0,0,0,0,0,0]],
                           [[0,0,0,1,0,0,0,0,0,0]],
                           [[0,0,0,0,1,1,1,1,1,1]]])
    
    assert np.allclose(actual_forward, expected_forward)
Example #3
def test_validation_before_eval():
    w = C.parameter((4,C.InferredDimension))
    v = C.parameter((C.InferredDimension,5))
    wv = C.times(w,v)

    p = C.input((4,1))
    wp = C.times(w,p)

    q = C.input((1,5))
    qv = C.times(q,v)

    with pytest.raises(ValueError):
        wv.eval()
Example #4
def test_free_static_axis_in_recurrence():
    x = C.sequence.input_variable((C.FreeDimension, 2))
    out_placeholder = C.placeholder()
    out_past = C.sequence.past_value(out_placeholder)
    wh = C.parameter(init=np.asarray([[2, 5], [1, 3]], dtype=np.float32))
    wx = C.parameter(init=np.asarray([[1, 4], [2, 5]], dtype=np.float32))
    out = C.times(x, wx) + C.times(out_past, wh)
    out.replace_placeholders({out_placeholder : out})
    
    x_data = np.asarray([[0.5, 0.2], [-0.7, 1.2]], np.float32)
    w_grad, out_val = out.grad({x : x_data}, wrt=[wh, wx], outputs=[out])
    assert np.allclose(out_val, [[[[0.9, 3.], [1.7, 3.2]]]])
    assert np.allclose(w_grad[wx], [[-0.2, -0.2], [1.4, 1.4]])
Example #5
def cross_entropy_with_sampled_softmax(
    hidden_vector,           # Node providing the output of the recurrent layers
    target_vector,           # Node providing the expected labels (as sparse vectors)
    vocab_dim,               # Vocabulary size
    hidden_dim,              # Dimension of the hidden vector
    num_samples,             # Number of samples to use for sampled softmax
    sampling_weights,        # Node providing weights to be used for the weighted sampling
    allow_duplicates = False # Boolean flag to control whether to use sampling with replacement (allow_duplicates == True) or without replacement.
    ):
    bias = C.Parameter(shape = (vocab_dim, 1), init = 0)
    weights = C.Parameter(shape = (vocab_dim, hidden_dim), init = C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(sampling_weights, num_samples, allow_duplicates) # sparse matrix [num_samples * vocab_size]
    if use_sparse:
        sample_selector = sample_selector_sparse
    else:
        # Note: Sampled softmax with dense data is only supported for debugging purposes.
        # It might easily run into memory issues as the matrix 'I' below might be quite large.
        # If we want a dense representation for all data, we have to convert the sample selector.
        I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
        sample_selector = C.times(sample_selector_sparse, I)

    inclusion_probs = C.random_sample_inclusion_frequency(sampling_weights, num_samples, allow_duplicates) # dense row [1 * vocab_size]
    log_prior = C.log(inclusion_probs) # dense row [1 * vocab_dim]


    print("hidden_vector: "+str(hidden_vector.shape))
    wS = C.times(sample_selector, weights, name='wS') # [num_samples * hidden_dim]
    print("ws:"+str(wS.shape))
    zS = C.times_transpose(wS, hidden_vector, name='zS1') + C.times(sample_selector, bias, name='zS2') - C.times_transpose(sample_selector, log_prior, name='zS3')  # [num_samples]

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(target_vector, weights, name='wT') # [1 * hidden_dim]
    zT = C.times_transpose(wT, hidden_vector, name='zT1') + C.times(target_vector, bias, name='zT2') - C.times_transpose(target_vector, log_prior, name='zT3') # [1]


    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    # We don't check whether any of the classes in the random samples coincides with the true label, so it might happen that the true class is counted
    # twice in the normalizing denominator of sampled softmax.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape = (vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)
    return (z, cross_entropy_on_samples, error_on_samples)
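A minimal hedged sketch of calling the function above (all dimensions and the uniform sampling weights are made-up placeholders; the function also reads `use_sparse` as a free variable from the enclosing scope, so it is stubbed here):

import numpy as np
import cntk as C

use_sparse = True   # free variable expected by cross_entropy_with_sampled_softmax
vocab_dim, hidden_dim, num_samples = 100, 16, 5

hidden = C.sequence.input_variable(hidden_dim)                 # stand-in for the recurrent output
target = C.sequence.input_variable(vocab_dim, is_sparse=True)  # one-hot target labels
# uniform sampling weights of shape (1, vocab_dim), as consumed by C.random_sample
sampling_weights = C.Constant(np.full((1, vocab_dim), 1.0 / vocab_dim, dtype=np.float32))

z, ce, err = cross_entropy_with_sampled_softmax(
    hidden, target, vocab_dim, hidden_dim, num_samples, sampling_weights)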
Example #6
def test_replace_placeholder_s():
    left_val = [[10,2]]
    right_val = [[2],[3]]

    p = C.placeholder(shape=(1,2))
    c = C.constant(left_val)

    op = C.times(p, right_val)
    op.replace_placeholders({p:c})
    assert op.eval() == 26

    op = C.times(p, right_val)
    op.replace_placeholder(c)
    assert op.eval() == 26
Example #7
def test_clone_freeze():
    inputs = 3
    outputs = 5

    features = C.input_variable((inputs), np.float32)
    label = C.input_variable((outputs), np.float32)
    weights = C.parameter((inputs, outputs))
    const_weights = C.constant(weights.value)
    z = C.times(features, weights)
    c = C.times(features, const_weights)
    z_clone = z.clone('freeze')
    c_clone = c.clone('freeze')

    # check that z and z_clone are the same
    for p, q in zip(z.parameters, z_clone.constants):
        assert np.array_equal(p.value, q.value)

    # check that c and c_clone are the same
    for p, q in zip(c.constants, c_clone.constants):
        assert np.array_equal(p.value, q.value)

    # keep copies of the old values
    z_copies = [q.value for q in z_clone.constants]
    c_copies = [q.value for q in c_clone.constants]

    # update z
    trainer = C.Trainer(z, C.squared_error(z, label),  C.sgd(z.parameters, C.learning_rate_schedule(1.0, C.UnitType.minibatch)))
    x = np.random.randn(16,3).astype('f')
    y = np.random.randn(16,5).astype('f')
    trainer.train_minibatch({features: x, label: y})
    # update c
    for cc in c.constants:
        cc.value = np.random.randn(*cc.value.shape).astype('f')

    # check that z changed
    for p, q in zip(z.parameters, z_clone.constants):
        assert not np.array_equal(p.value, q.value)

    # check that z_clone did not change
    for p, q in zip(z_copies, z_clone.constants):
        assert np.array_equal(p, q.value)

    # check that c changed
    for p, q in zip(c.constants, c_clone.constants):
        assert not np.array_equal(p.value, q.value)

    # check that c_clone did not change
    for p, q in zip(c_copies, c_clone.constants):
        assert np.array_equal(p, q.value)
Example #8
def _graph_dict():
    # This function creates a graph that has no real meaning other than
    # providing something to traverse.
    d = {}

    d['i1'] = C.sequence.input_variable(shape=(2, 3), sequence_axis=Axis('ia'), name='i1')
    d['c1'] = C.constant(shape=(2, 3), value=6, name='c1')
    d['p1'] = C.parameter(shape=(3, 2), init=7, name='p1')

    d['op1'] = C.plus(d['i1'], d['c1'], name='op1')
    d['op2'] = C.times(d['op1'], d['p1'], name='op2')

    #d['slice'] = slice(d['c1'], Axis.default_dynamic_axis(), 0, 3)
    #label_sentence_start = sequence.first(raw_labels)

    # no name
    d['p2'] = C.parameter(shape=(2, 2))

    # duplicate names
    d['op3a'] = C.plus(d['op2'], d['p2'], name='op3')
    d['op3b'] = C.plus(d['op3a'], d['p2'], name='op3')

    d['first'] = C.sequence.first(d['op3b'], name='past')

    d['root'] = d['first']

    return d
Example #9
def test_op_gather_sparse(device_id):
    input_sparse_indices = [[1, 3, 5, 5], [2, 4], [0, 2]]
    vocab_size = 6
    input_data = Value.one_hot(input_sparse_indices, vocab_size)

    a = C.sequence.input_variable(shape=(vocab_size,), is_sparse=True, name='a')

    a_last = C.sequence.last(a)
    a_last_dense = C.times(a_last, np.eye(vocab_size))
    res = a_last_dense.eval({a : input_data})
    assert np.array_equal(res, [[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0]])

    a_last_2 = C.sequence.slice(a, -2, 0)
    a_last_2_dense = C.times(a_last_2, np.eye(vocab_size))
    res = a_last_2_dense.eval({a : input_data})
    assert np.array_equal(res, [[[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1]], [[0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0]], [[1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0]]])
Example #10
def test_trainer_with_some_params_not_learned():
    input_dim = 2
    proj_dim = 2
    x = input_variable(shape=(input_dim,))
    W = parameter(shape=(input_dim, proj_dim), init=glorot_uniform())
    B = parameter(shape=(proj_dim,), init=glorot_uniform())
    t = times(x, W)
    z = t + B

    W_orig_value = W.value
    B_orig_value = B.value

    labels = input_variable(shape=(proj_dim,))
    ce = cross_entropy_with_softmax(z, labels)
    pe = classification_error(z, labels)

    lr_per_sample = learning_rate_schedule(0.1, UnitType.sample)
    trainer = Trainer(z, (ce, pe), sgd([W], lr_per_sample))

    x_value = [[1, 1],[2, 2]]
    label_value = [[0, 1], [1, 0]]
    arguments = {x: x_value, labels: label_value}

    num_iters = 3
    for i in range(num_iters):
        trainer.train_minibatch(arguments)

        assert np.array_equal(B.value, B_orig_value)
        assert not np.array_equal(W.value, W_orig_value)
        W_orig_value = W.value

    trainer.test_minibatch(arguments)
Example #11
def test_op_batch_times_grad_with_beta_equals_to_one(left_operand, right_operand, device_id, precision):
    dt_precision = PRECISION_TO_TYPE[precision]
    a = AA(left_operand, dtype=dt_precision)
    b = AA(right_operand, dtype=dt_precision)
    
    root_gradient = np.ones_like(a)
    
    input1 = C.input_variable((2,2), needs_gradient=True)
    input2 = C.input_variable((2,2), needs_gradient=True)
    z = input1 + input2 + C.times(input1, input2)
    state, actual_forward = z.forward({input1: a, input2: b}, [z.output], {z.output}, cntk_device(device_id))
    actual_backwards = z.backward(state, {z.output: root_gradient}, [input1, input2])
    
    k = a.shape[0]
    left_backward = np.ones_like(a)
    for x in range(k):
        left_backward[x, ...] += b[x].sum(axis=-1)
    right_backward = np.ones_like(b)
    for x in range(k):
        transpose_axes = list(np.roll(np.arange(len(b.shape[1:])), -1))
        sum_axes = tuple(np.arange(0, len(a.shape) - len(b.shape) + 1))
        right_backward[x, ...] += np.transpose(
            AA([a[x].sum(axis=sum_axes)]), axes=transpose_axes)

    assert np.allclose(actual_backwards[input1], left_backward)
    assert np.allclose(actual_backwards[input2], right_backward)
Example #12
def create_fast_rcnn_predictor(conv_out, rois, fc_layers):
    # RCNN
    roi_out = roipooling(conv_out, rois, cntk.MAX_POOLING, (roi_dim, roi_dim), spatial_scale=1/16.0)
    fc_out = fc_layers(roi_out)

    # prediction head
    W_pred = parameter(shape=(4096, globalvars['num_classes']), init=normal(scale=0.01), name="cls_score.W")
    b_pred = parameter(shape=globalvars['num_classes'], init=0, name="cls_score.b")
    cls_score = plus(times(fc_out, W_pred), b_pred, name='cls_score')

    # regression head
    W_regr = parameter(shape=(4096, globalvars['num_classes']*4), init=normal(scale=0.001), name="bbox_regr.W")
    b_regr = parameter(shape=globalvars['num_classes']*4, init=0, name="bbox_regr.b")
    bbox_pred = plus(times(fc_out, W_regr), b_regr, name='bbox_regr')

    return cls_score, bbox_pred
Example #13
def test_data_type_inference():
    x_float = C.input_variable((1,), dtype = np.float64)
    param1 = C.parameter((C.InferredDimension, 1), init = C.glorot_uniform(), dtype = C.cntk_py.DataType_Unknown)
    assert (param1.get_data_type() == C.cntk_py.DataType_Unknown)

    x_times_param1 = C.times(x_float, param1)
    assert (param1.dtype == np.float64)
Example #14
    def session(is_sparse):
        x = C.input_variable((200,), is_sparse=is_sparse)
        w = C.parameter((200, 100))
        y = C.times(x, w)

        z = [0] * 100 + [1] * 100
        for i in range(200):
            j = (3 * i * i + 5 * i + 1) % 200  # just a random looking index
            z[i], z[j] = z[j], z[i]

        import scipy.sparse
        x11 = scipy.sparse.csr_matrix(np.array([1] * 200).astype('f'))
        x01 = scipy.sparse.csr_matrix(np.array(z).astype('f'))

        t = C.Trainer(y, y, learner(y.parameters))

        w.value = 0 * w.value
        t.train_minibatch({x: [x11]})
        t.train_minibatch({x: [x01]})
        t.train_minibatch({x: [x01]})
        if checkpoint:
            t.save_checkpoint(str(tmpdir.join('checkpoint')))
            t.train_minibatch({x: [x11]})
            t.train_minibatch({x: [x01]})
            t.train_minibatch({x: [x01]})
            t.restore_from_checkpoint(str(tmpdir.join('checkpoint')))
        t.train_minibatch({x: [x01]})
        t.train_minibatch({x: [x01]})
        t.train_minibatch({x: [x11]})
        return w.value
Example #15
def create_fast_rcnn_predictor(conv_out, rois, fc_layers, cfg):
    # RCNN
    roi_out = roipooling(conv_out, rois, cntk.MAX_POOLING, (cfg["MODEL"].ROI_DIM, cfg["MODEL"].ROI_DIM), spatial_scale=1/16.0)
    fc_out = fc_layers(roi_out)

    # prediction head
    W_pred = parameter(shape=(4096, cfg["DATA"].NUM_CLASSES), init=normal(scale=0.01), name="cls_score.W")
    b_pred = parameter(shape=cfg["DATA"].NUM_CLASSES, init=0, name="cls_score.b")
    cls_score = plus(times(fc_out, W_pred), b_pred, name='cls_score')

    # regression head
    W_regr = parameter(shape=(4096, cfg["DATA"].NUM_CLASSES*4), init=normal(scale=0.001), name="bbox_regr.W")
    b_regr = parameter(shape=cfg["DATA"].NUM_CLASSES*4, init=0, name="bbox_regr.b")
    bbox_pred = plus(times(fc_out, W_regr), b_regr, name='bbox_regr')

    return cls_score, bbox_pred
Example #16
 def create_model(self):
     self.input_dim = 1000
     self.embed_dim = 30
     i = C.input_variable((self.input_dim,), is_sparse=True)
     self.p = C.parameter(shape=(self.input_dim, self.embed_dim), init=1)
     o = C.times(i, self.p)
     self.z = C.reduce_sum(o)
Example #17
def test_large_model_serialization_double(tmpdir):
    import os

    two_gb = 2**31
    type_size = np.dtype(np.float64).itemsize
    size = two_gb // type_size + 10

    assert size * type_size > two_gb

    device = C.device.cpu()
    i = C.sequence.input(size, dtype=np.float64)
    w = C.Parameter((size,), dtype=np.float64, 
        init=C.uniform(3.0, seed=12345), device=device)
    z = C.times(i, w)

    filename = str(tmpdir / 'test_large_model_serialization_double.out')
    z.save(filename)

    assert os.path.getsize(filename) > two_gb

    y = C.Function.load(filename, device=device)

    assert (len(z.parameters) == len(y.parameters))

    for param_pair in zip(z.parameters, y.parameters):
        assert param_pair[0].shape == param_pair[1].shape
        assert np.allclose(param_pair[0].value, param_pair[1].value)
Example #18
def linear_layer(input_var, output_dim):
    input_dim = input_var.shape[0]
    times_param = C.parameter(shape=(input_dim, output_dim))
    bias_param = C.parameter(shape=(output_dim))

    t = C.times(input_var, times_param)
    return bias_param + t
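A quick hedged usage sketch for the linear_layer helper above (the 4-dimensional input and the all-ones minibatch are arbitrary):

import numpy as np
import cntk as C

features = C.input_variable((4,))
model = linear_layer(features, output_dim=2)
# prints a 1x2 array (all zeros here, since freshly created parameters default to zero)
print(model.eval({features: np.ones((1, 4), dtype=np.float32)}))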
Example #19
def attention_pooling(inputs, inputs_mask, inputs_weights, decode, decode_weights, keys):
    """
    inputs: shape=(n, dim)
    inputs_weights: shape=(dim, dim)
    decode: shape=(1, dec_dim)
    decode_weights: shape=(dec_dim, dim)
    keys: shape=(dim, 1)
    
    """
    w_in = C.times(inputs, inputs_weights)  #shape=(n, dim)
    w_dec = C.times(decode, decode_weights) #shape=(1, dim)
    S = C.tanh(w_in + C.sequence.broadcast_as(w_dec, w_in)) #shape=(n, dim)
    S = C.element_select(inputs_mask, S, C.constant(-1e+30))
    S = C.times(S, keys) #shape=(n)
    S = C.ops.sequence.softmax(S, name="softmax")
    attention = C.reduce_sum(inputs * S, axis=0)
    return attention
Example #20
def _sparse_to_dense_network_cache(input_shape, is_sequence, device):
    if is_sequence:
        temp_input = C.sequence.input_variable(input_shape, is_sparse=True)
    else:
        temp_input = C.input_variable(input_shape, is_sparse=True)

    eye_shape = input_shape[-1]
    return C.times(temp_input, np.eye(eye_shape))
Example #21
def _to_dense(val, is_sequence=False):
    if is_sequence:
        x = C.sequence.input_variable(val.shape[2:], is_sparse=True)
    else:
        x = C.input_variable(val.shape[1:], is_sparse=True)

    dense = C.times(x, C.constant(value=np.eye(val.shape[-1], dtype=np.float32)))
    return dense.eval({x : val}, device=val.device)
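A hedged usage sketch for _to_dense above, converting a sparse one-hot sequence batch to dense arrays (the toy indices and the vocabulary size of 5 are arbitrary):

import cntk as C

val = C.Value.one_hot([[1, 3], [0]], 5)      # two sequences of sparse one-hot vectors
dense = _to_dense(val, is_sequence=True)     # list with one dense NumPy array per sequence
# e.g. dense[0][0] is the dense vector [0, 1, 0, 0, 0]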
Example #22
def linear_layer(input_var, output_dim):
    input_dim = input_var.shape[0]
    weight_param = C.parameter(shape=(input_dim, output_dim))
    bias_param = C.parameter(shape=(output_dim))

    param_dict['w'], param_dict['b'] = weight_param, bias_param

    return C.times(input_var, weight_param) + bias_param
Example #23
def test_nce_backward_indices(classes, xdim, batch, expected_value, device_id, precision):
    """
    Simple test that makes sure that the derivatives have the correct sparsity pattern
    """

    # ignore precision, only sparsity pattern matters for this test
    dt = np.float32

    from cntk.losses import nce_loss
    import scipy
    trials = 10

    # Establish baseline
    expected_count = np.zeros(classes)
    I = C.constant(np.eye(classes, dtype=dt))
    q = np.arange(classes, dtype=dt) + 1
    z = C.reduce_sum(C.times(C.random_sample(q, 32, True, seed=98052), I), axis=0)
    for i in range(trials):
        expected_count[np.nonzero(z.eval().ravel())] += 1

    # Set things up to measure the same thing with nce_loss

    x = C.input_variable(xdim, needs_gradient=True)
    y = C.input_variable(classes, is_sparse=True)

    x0 = np.arange(batch * xdim, dtype=dt).reshape((batch, xdim))/(batch * xdim)
    data = np.ones(batch, dtype=dt)
    indices = list(range(10,10*batch+1,10))
    indptr = list(range(batch+1))
    y0 = scipy.sparse.csr_matrix((data, indices, indptr), shape=(batch, classes))

    b = C.parameter((classes, 1))
    W = C.parameter((classes, C.InferredDimension))

    gb = np.zeros(classes)
    vb = C.input_variable((classes, 1), dtype=dt)
    Ib = C.constant(np.eye(1, dtype=dt))
    zb = C.times(vb, Ib)

    loss = C.nce_loss(W, b, x, y, q, seed=98052)
    for i in range(trials):
        v = loss.grad({x: x0, y: y0}, wrt=loss.parameters, as_numpy=False)
        gb[np.nonzero(zb.eval({vb: v[b]}).ravel())] += 1
    for i in range(classes):
        assert gb[i] == expected_count[i] or (i in indices and gb[i] == trials)
Example #24
def test_2d_sparse_csr_batch_input(device_id):
    dev = cntk_device(device_id)
    features = C.input_variable((2, 3), is_sparse=True)
    w = C.parameter(init=np.asarray([[0.5, 1], [-.5, 2], [1., 1.5]], dtype=np.float32), device=dev)
    t = C.times(features, w)
    features_data = [sp.sparse.csr_matrix(np.asarray([[1.,0.,0.], [0.,1.,0.]], dtype=np.float32)),
                     sp.sparse.csr_matrix(np.asarray([[0.,0.,1.], [1.,0.,0.]], dtype=np.float32))]
    result = t.eval({features : features_data}, device=dev)
    assert np.array_equal(result, [[[.5, 1], [-.5, 2]], [[1, 1.5], [.5, 1]]])
Example #25
    def returnFunction():
        left_val = [[10,2]]
        right_val = [[2],[3]]

        p = placeholder(shape=(1,2))
        op = times(p, right_val)
        c = constant(left_val)

        return op.replace_placeholders({p:c})
Example #26
def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import input_variable, times

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features  = StreamDef(field='S0', shape=input_vocab_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=label_vocab_dim,  is_sparse=True)
    )), randomize=False, epoch_size = 2)

    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=input_vocab_dim, dynamic_axes=input_dynamic_axes,
        name='raw_input', is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
            input_map={raw_input : mbs.streams.features},
            device=cntk_device(device_id))

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [
            [3, 4, 5, 4, 7, 12, 1],
            [60, 61]
            ]
    data = [csr(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in
            one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = Value.one_hot(one_hot_data, num_classes=input_vocab_dim, device=cntk_device(device_id))
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_hot)])
Example #27
def test_op_gather_grad(device_id):
    dim = 10
    ii = C.sequence.input_variable(())
    param = C.parameter((dim, 1), init=np.reshape(np.arange(dim), (dim,1)).astype(np.float32))
    ss = C.gather(param, ii)
    data = [[0], [0,1,2], [1,2,3,4,5, 6]]
    grad1 = ss.grad(data, wrt=[param])
    ss2 = C.times(C.one_hot(ii, num_classes=dim, sparse_output=False), param)
    grad2 = ss2.grad(data, wrt=[param])
    assert np.array_equal(grad1, grad2)
Example #28
def test_ext_eval_5_times():
    dim = 2
    p_init = 10
    p = C.parameter(shape=(dim,), init=p_init, name='p')
    m = C.user_function(MyPlus(p, C.constant(3)))
    z = C.times(m, C.parameter(shape=(2, 50), init=2))

    result = z.eval()
    # No batch dimension since we have no input
    assert np.allclose(result, ((p_init * np.ones_like(result)) + 3) * 2 * 2)
Example #29
    def LSTMCell(x, y, dh, dc):
        '''LightLSTM Cell'''

        b = C.parameter(shape=(4 * cell_dim), init=0)
        W = C.parameter(shape=(input_dim, 4 * cell_dim), init=glorot_uniform())
        H = C.parameter(shape=(cell_dim, 4 * cell_dim), init=glorot_uniform())

        # projected contribution from input x, hidden, and bias
        proj4 = b + C.times(x, W) + C.times(dh, H)

        it_proj = C.slice(proj4, -1, 0 * cell_dim, 1 * cell_dim)
        bit_proj = C.slice(proj4, -1, 1 * cell_dim, 2 * cell_dim)
        ft_proj = C.slice(proj4, -1, 2 * cell_dim, 3 * cell_dim)
        ot_proj = C.slice(proj4, -1, 3 * cell_dim, 4 * cell_dim)

        it = C.sigmoid(it_proj)  # input gate
        bit = it * C.tanh(bit_proj)

        ft = C.sigmoid(ft_proj)  # forget gate
        bft = ft * dc

        ct = bft + bit
        ot = C.sigmoid(ot_proj)  # output gate
        ht = ot * C.tanh(ct)

        # projected contribution from input y, hidden, and bias
        proj4_2 = b + C.times(y, W) + C.times(ht, H)

        it_proj_2 = C.slice(proj4_2, -1, 0 * cell_dim, 1 * cell_dim)
        bit_proj_2 = C.slice(proj4_2, -1, 1 * cell_dim, 2 * cell_dim)
        ft_proj_2 = C.slice(proj4_2, -1, 2 * cell_dim, 3 * cell_dim)
        ot_proj_2 = C.slice(proj4_2, -1, 3 * cell_dim, 4 * cell_dim)

        it_2 = C.sigmoid(it_proj_2)  # input gate
        bit_2 = it_2 * C.tanh(bit_proj_2)

        ft_2 = C.sigmoid(ft_proj_2)  # forget gate
        bft_2 = ft_2 * ct

        ct2 = bft_2 + bit_2
        ot_2 = C.sigmoid(ot_proj_2)  # output gate
        ht2 = ot_2 * C.tanh(ct2)
        return (ht, ct, ht2, ct2)
Example #30
def test_eval_sparse_no_seq(batch_index_data, device_id):
    dim = 10
    multiplier = 2
    for var_is_sparse in [True, False]:
        in1 = input_variable(shape=(dim,), is_sparse=var_is_sparse)
        z = times(in1, multiplier*np.eye(dim))
        batch = np.eye(dim)[batch_index_data]
        expected = batch * multiplier
        sparse_val = csr(batch.astype('f'))
        result = z.eval({in1: [sparse_val]}, device=cntk_device(device_id))
        assert np.allclose(result, [expected])
Example #31
def test_op_scatter_sparse(device_id):
    input_sparse_indices = [[1, 3, 5, 5], [2, 4], [0, 2]]
    vocab_size = 6
    input_data = Value.one_hot(input_sparse_indices, vocab_size)

    a = C.sequence.input_variable(shape=(vocab_size,), is_sparse=True, name='a')

    a_last_scatter = C.sequence.scatter(C.sequence.last(a), C.sequence.is_first(a))
    a_last_scatter_dense = C.times(a_last_scatter, np.eye(vocab_size))
    res = a_last_scatter_dense.eval({a : input_data})
    assert np.array_equal(res[0], np.asarray([[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]))
    assert np.array_equal(res[1], np.asarray([[0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0]]))
    assert np.array_equal(res[2], np.asarray([[0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0]]))
Example #32
 def func(x_var):
     x = C.placeholder()
     WT = C.Parameter((
         dim,
         dim,
     ),
                      init=transform_weight_initializer,
                      name=name + '_WT')
     bT = C.Parameter(dim,
                      init=transform_bias_initializer,
                      name=name + '_bT')
     WU = C.Parameter((
         dim,
         dim,
     ),
                      init=update_weight_initializer,
                      name=name + '_WU')
     bU = C.parameter(dim, init=update_bias_initializer, name=name + '_bU')
     transform_gate = C.sigmoid(C.times(x, WT, name=name + '_T') + bT)
     update = C.tanh(C.times(x, WU, name=name + '_U') + bU)
     return C.as_block(update * transform_gate + (1 - transform_gate) * x,
                       [(x, x_var)], 'SingleInner', 'SingleInner' + name)
Example #33
def cross_entropy_with_sampled_softmax(
    hidden_vector,          
    label_vector,           
    vocab_dim,              
    hidden_dim,             
    num_samples,            
    sampling_weights,       
    allow_duplicates = False 
    ):

	bias = C.layers.Parameter(shape = (vocab_dim, 1), init = 0)
	weights = C.layers.Parameter(shape = (vocab_dim, hidden_dim), init = C.initializer.glorot_uniform())

	sample_selector_sparse = C.random_sample(sampling_weights, num_samples, allow_duplicates)
	sample_selector = sample_selector_sparse

	inclusion_probs = C.random_sample_inclusion_frequency(sampling_weights, num_samples, allow_duplicates)
	log_prior = C.log(inclusion_probs)

	wS = C.times(sample_selector, weights, name='wS')
	zS = C.times_transpose(wS, hidden_vector, name='zS1') + C.times(sample_selector, bias, name='zS2') - C.times_transpose (sample_selector, log_prior, name='zS3')

	# Getting the weight vector for the true label. Dimension hidden_dim
	wT = C.times(label_vector, weights, name='wT')
	zT = C.times_transpose(wT, hidden_vector, name='zT1') + C.times(label_vector, bias, name='zT2') - C.times_transpose(label_vector, log_prior, name='zT3')

	zSReduced = C.reduce_log_sum_exp(zS)

	# Compute the cross entropy that is used for training.
	cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

	# For applying the model we also output a node providing the input for the full softmax
	z = C.times_transpose(weights, hidden_vector) + bias
	z = C.reshape(z, shape = (vocab_dim))

	zSMax = C.reduce_max(zS)
	error_on_samples = C.less(zT, zSMax)

	return (z, cross_entropy_on_samples, error_on_samples)
Example #34
def test_eval_one_hot_seq(one_hot_batch, device_id):
    dim = 10
    multiplier = 2

    for var_is_sparse in [True, False]:
        in1 = sequence.input_variable(shape=(dim,), is_sparse=var_is_sparse)
        # Convert CNTK node value to dense so that we can compare it later
        z = times(in1, np.eye(dim)*multiplier)
        # Convert expectation to dense
        expected = [np.eye(dim)[seq]*multiplier for seq in one_hot_batch]
        batch = Value.one_hot(one_hot_batch, num_classes=dim, device=cntk_device(device_id))
        result = z.eval({in1: batch}, device=cntk_device(device_id))
        assert np.all([np.allclose(a,b) for a,b in zip(result, expected)])
Example #35
    def createNetwork(self, inputEmb, preHidden, preMem):
        WX = C.times(inputEmb, self.W) + self.Wb
        UH = C.times(preHidden, self.U) + self.Ub

        I = C.sigmoid(
            C.slice(WX, -1, 0, self.hiddenSize) +
            C.slice(UH, -1, 0, self.hiddenSize))
        O = C.sigmoid(
            C.slice(WX, -1, self.hiddenSize, self.hiddenSize * 2) +
            C.slice(UH, -1, self.hiddenSize, self.hiddenSize * 2))
        F = C.sigmoid(
            C.slice(WX, -1, self.hiddenSize * 2, self.hiddenSize * 3) +
            C.slice(UH, -1, self.hiddenSize * 2, self.hiddenSize * 3))
        N = C.tanh(
            C.slice(WX, -1, self.hiddenSize * 3, self.hiddenSize * 4) +
            C.slice(UH, -1, self.hiddenSize * 3, self.hiddenSize * 4))

        NI = C.element_times(N, I)
        FM = C.element_times(F, preMem)
        CurMem = NI + FM
        CurH = C.element_times(C.tanh(CurMem), O)
        return (CurH, CurMem)
Example #36
def test_disallow_seq_starts_with_Value_objects():
    one_hot_batch = [[2, 5], [0, 1, 6]]
    dim = 10

    in1 = input(shape=(dim, ), is_sparse=True)
    z = times(in1, np.eye(dim))
    batch = Value.one_hot(one_hot_batch, num_classes=dim)

    with pytest.raises(ValueError):
        result = z.eval(({in1: batch}, len(batch) * [True]))

    with pytest.raises(ValueError):
        result = z.eval({in1: (batch, len(batch) * [True])})
Example #37
 def func(x_var):
     x = C.placeholder()
     WT = C.Parameter((
         dim,
         dim,
     ),
                      init=transform_weight_initializer,
                      name=name + '_WT')
     bT = C.Parameter(dim,
                      init=transform_bias_initializer,
                      name=name + '_bT')
     WU = C.Parameter((
         dim,
         dim,
     ),
                      init=update_weight_initializer,
                      name=name + '_WU')
     bU = C.Parameter(dim, init=update_bias_initializer, name=name + '_bU')
     transform_gate = C.sigmoid(C.times(x, WT, name=name + '_T') + bT)
     update = C.relu(C.times(x, WU, name=name + '_U') + bU)
     return C.as_block(x + transform_gate * (update - x), [(x, x_var)],
                       'HighwayBlock', 'HighwayBlock' + name)
Example #38
def test_gather_implementation_using_one_hot_and_times():
    num_classes = 4

    w_init = np.asarray([[0, 1], [2, 3], [4, 5], [6, 7]]).astype(np.float32)
    w = C.parameter(init=w_init)

    x = C.input_variable((2, ))
    sparse_one_hot = C.one_hot(x, num_classes, sparse_output=True)
    t = C.times(sparse_one_hot, w)
    indices = np.asarray([[0, 3], [2, 1]], dtype=np.float32)
    result = t.eval({x: indices})
    expected_result = np.asarray([[[0., 1.], [6., 7.]], [[4., 5.], [2., 3.]]])
    assert np.array_equal(result, expected_result)
Example #39
def test_op_times_reduce_sequence_axis(device_id, precision):
    dt_precision = PRECISION_TO_TYPE[precision]

    from cntk import times, Value, TIMES_REDUCE_SEQUENCE_AXIS_WITHOUT_INFERRED_INPUT_RANK
    from cntk import sequence
    dim = 10
    seq = [[0, 1, 2], [3], [4, 5, 6, 7, 8, 9]]
    right_data = Value.one_hot(seq, dim, dtype=dt_precision)
    right_var = sequence.input_variable(shape=(dim),
                                        is_sparse=True,
                                        dtype=dt_precision)
    left_data = [
        AA([1, 1, 1], dtype=dt_precision),
        AA([1], dtype=dt_precision),
        AA([1, 1, 1, 1, 1, 1], dtype=dt_precision)
    ]
    left_var = sequence.input_variable(shape=(1), dtype=dt_precision)

    func = times(left_var,
                 right_var,
                 infer_input_rank_to_map=
                 TIMES_REDUCE_SEQUENCE_AXIS_WITHOUT_INFERRED_INPUT_RANK)
    func2 = sequence.reduce_sum(times(left_var, right_var))

    assert func.dynamic_axes == func2.dynamic_axes

    _, forward_output = func.forward({
        left_var: left_data,
        right_var: right_data
    })

    actual_forward = forward_output[func.output]

    expected_forward = AA([[[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]],
                           [[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]],
                           [[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]])

    assert np.allclose(actual_forward, expected_forward)
Example #40
def test_debug_multi_output():
    input_dim = 2
    num_output_classes = 2

    f_input = input_variable(input_dim,
                             np.float32,
                             needs_gradient=True,
                             name='features')

    p = parameter(shape=(input_dim, ), init=10, name='p')

    comb = combine([f_input, p])

    ins = InStream(['n', 'n', 'n', 'n', 'n'])
    outs = OutStream()

    z = times(comb.outputs[0], comb.outputs[1], name='z')
    z = debug_model(z, ins, outs)

    l_input = input_variable(num_output_classes, np.float32, name='labels')
    loss = cross_entropy_with_softmax(z, l_input)
    eval_error = classification_error(z, l_input)

    _train(z, loss, eval_error, loss.find_by_name('features'),
           loss.find_by_name('labels'), num_output_classes, 1)

    # outs.written contains something like
    # =================================== forward  ===================================
    # Parameter('p', [], [2]) with uid 'Parameter4'
    # Input('features', [#, *], [2]) with uid 'Input3'
    # Times: Output('UserDefinedFunction12_Output_0', [#, *], [2]), Output('UserDefinedFunction15_Output_0', [], [2]) -> Output('z', [#, *], [2 x 2]) with uid 'Times21'
    # =================================== backward ===================================
    # Times: Output('UserDefinedFunction12_Output_0', [#, *], [2]), Output('UserDefinedFunction15_Output_0', [], [2]) -> Output('z', [#, *], [2 x 2]) with uid 'Times21'
    # Input('features', [#, *], [2]) with uid 'Input3'
    # Parameter('p', [], [2]) with uid 'Parameter4'
    #   assert outs.written == out_stuff

    assert len(outs.written) == 8

    v_p = "Parameter('p', "
    v_i = "Input('features'"
    v_t = 'Times: '

    assert outs.written[0].startswith('=') and 'forward' in outs.written[0]
    line_1, line_2, line_3 = outs.written[1:4]

    assert outs.written[4].startswith('=') and 'backward' in outs.written[4]
    line_5, line_6, line_7 = outs.written[5:8]
    assert line_5.startswith(v_t)
    assert line_6.startswith(v_p) and line_7.startswith(v_i) or \
           line_6.startswith(v_i) and line_7.startswith(v_p)
Example #41
def test_to_sequence_backprop(device_id):
    dev = cntk_device(device_id)
    input_vocab_size=3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2
    x_seq_input = C.sequence.input_variable(input_vocab_size, is_sparse=True, name='features')
    with C.default_options(initial_state=0.1):
        model = C.layers.Embedding(emb_dim, name='embed')(x_seq_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)(model)
        model = C.layers.Dense(num_labels, name='classify')(model)

    z = model
    label_seq_input = C.sequence.input_variable(num_labels, is_sparse=True, name='labels')
    ce = C.cross_entropy_with_softmax(z, label_seq_input)

    seq1_data = [[0, 1, 1], [0, 1, 0], [1, 0, 0]]
    seq2_data = [[0, 0, 1], [0, 1, 1]]
    seq1_label_data = [[0, 1], [0, 1], [1, 0]]
    seq2_label_data = [[1, 0], [0, 1]]
    label_seq_data = [_to_csr(seq1_label_data), _to_csr(seq2_label_data)]
    param_grads_1, loss_result_1 = ce.grad({x_seq_input : [_to_csr(seq1_data), _to_csr(seq2_data)], label_seq_input : label_seq_data},
                                           wrt=ce.parameters, outputs=[ce], as_numpy=False)

    # Create a clone of the model that uses a non-sequence input
    # and converts it to a sequence using to_sequence
    x_non_seq_input = C.input_variable((C.FreeDimension, input_vocab_size), is_sparse=True, name='non_seq_features')
    x_seq_lens = C.input_variable((), name='sequence_lengths')
    x_seq = C.to_sequence(x_non_seq_input, x_seq_lens)
    x_seq = C.reconcile_dynamic_axes(C.times(x_seq, np.eye(input_vocab_size, dtype=np.float32)), label_seq_input)
    ce_clone = ce.clone('share', {x_seq_input : x_seq})

    x_non_seq_data = C.NDArrayView.from_csr(_to_csr([seq1_data, seq2_data + [[0, 0, 0]]]), shape=(2, 3, 3))
    x_seq_lens_data = np.asarray([3, 2], dtype=np.float32)

    x_non_seq_input = next(argument for argument in ce_clone.arguments if argument.name == 'non_seq_features')
    label_seq_input = next(argument for argument in ce_clone.arguments if argument.name == 'labels')
    x_seq_lens = next(argument for argument in ce_clone.arguments if argument.name == 'sequence_lengths')
    param_grads_2, loss_result_2 = ce_clone.grad({x_non_seq_input : x_non_seq_data, x_seq_lens : x_seq_lens_data, label_seq_input : label_seq_data},
                                                 wrt=ce_clone.parameters, outputs=[ce_clone], as_numpy=False)


    assert np.array_equal(loss_result_1.as_sequences()[0], loss_result_2.as_sequences()[0])
    assert np.array_equal(loss_result_1.as_sequences()[1], loss_result_2.as_sequences()[1])

    for param in param_grads_1:
        if not param_grads_1[param].is_sparse:
            reference_grad_value = param_grads_1[param].asarray()
            grad_value = param_grads_2[param].asarray()
            assert np.array_equal(reference_grad_value, grad_value)
Example #42
def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import times

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0	|S0 3:1 |# <s>	|S1 3:1 |# <s>
0	|S0 4:1 |# A	|S1 32:1 |# ~AH
0	|S0 5:1 |# B	|S1 36:1 |# ~B
0	|S0 4:1 |# A	|S1 31:1 |# ~AE
0	|S0 7:1 |# D	|S1 38:1 |# ~D
0	|S0 12:1 |# I	|S1 47:1 |# ~IY
0	|S0 1:1 |# </s>	|S1 1:1 |# </s>
2	|S0 60:1 |# <s>	|S1 3:1 |# <s>
2	|S0 61:1 |# A	|S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features  = StreamDef(field='S0', shape=input_vocab_dim,  is_sparse=True),
        labels    = StreamDef(field='S1', shape=label_vocab_dim,  is_sparse=True)
    )), randomize=False, max_samples = 2)

    raw_input = sequence.input_variable(shape=input_vocab_dim, sequence_axis=Axis('inputAxis'), name='raw_input', is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
            input_map={raw_input : mbs.streams.features},
            device=cntk_device(device_id))

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [
            [3, 4, 5, 4, 7, 12, 1],
            [60, 61]
            ]
    data = [csr(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in
            one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = Value.one_hot(one_hot_data, num_classes=input_vocab_dim, device=cntk_device(device_id))
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_hot)])
Example #43
def GenMatMul_1k():
    feature = C.input_variable(
        (
            1024,
            1024,
        ),
        np.float32,
    )
    model = C.times(feature, C.parameter((1024, 1024),
                                         init=C.glorot_uniform()))

    data_feature = np.random.rand(1, *feature.shape).astype(np.float32)
    data_output = model.eval(data_feature)
    Save("test_MatMul_1k", model, data_feature, data_output)
Example #44
 def __init__(self):
     self.input_dim = 40000
     self.embed_dim = 100
     self.batch_size = 20
     i = C.input_variable((self.input_dim, ), is_sparse=True)
     self.p = C.parameter(shape=(self.input_dim, self.embed_dim), init=1)
     o = C.times(i, self.p)
     z = C.reduce_sum(o)
     learner = C.data_parallel_distributed_learner(
         C.sgd(
             z.parameters,
             C.learning_rate_schedule(0.01,
                                      unit=C.learners.UnitType.sample)))
     self.trainer = C.Trainer(z, (z, None), learner, [])
Example #45
    def createNetwork(self, length):
        networkHiddenTrg = {}
        networkMemTrg = {}
        inputTrg = C.reshape(self.inputMatrixTrg,
                             shape=(Config.TrgMaxLength, Config.BatchSize,
                                    Config.TrgVocabSize))
        tce = 0
        for i in range(0, length - 1, 1):
            if (i == 0):
                networkHiddenTrg[i] = self.firstHidden
                networkMemTrg[i] = networkHiddenTrg[i]
            else:
                (networkHiddenTrg[i],
                 networkMemTrg[i]) = self.Decoder.createNetwork(
                     self.Emb(inputTrg[i]), networkHiddenTrg[i - 1],
                     networkMemTrg[i - 1])

            preSoftmax = C.times(networkHiddenTrg[i], self.Wt) + self.Wtb
            ce = C.cross_entropy_with_softmax(preSoftmax, inputTrg[i + 1], 2)
            tce += C.times(
                C.reshape(ce, shape=(1, Config.BatchSize)),
                C.reshape(self.maskMatrixTrg[i], shape=(Config.BatchSize, 1)))
        return tce
Example #46
def train_eval_logistic_regression_from_file(criterion_name=None,
                                             eval_name=None,
                                             device_id=-1):
    cur_dir = os.path.dirname(__file__)

    # Using data from https://github.com/Microsoft/CNTK/wiki/Tutorial
    train_file = os.path.join(cur_dir, "Train-3Classes.txt")
    test_file = os.path.join(cur_dir, "Test-3Classes.txt")

    X = C.input(2)
    y = C.input(3)

    W = C.parameter(value=np.zeros(shape=(3, 2)))
    b = C.parameter(value=np.zeros(shape=(3, 1)))

    out = C.times(W, X) + b
    out.tag = 'output'
    ce = C.cross_entropy_with_softmax(y, out)
    ce.name = criterion_name
    ce.tag = 'criterion'
    eval = C.ops.square_error(y, out)
    eval.tag = 'eval'
    eval.name = eval_name

    # training data readers
    train_reader = C.CNTKTextFormatReader(train_file, randomize=None)

    # testing data readers
    test_reader = C.CNTKTextFormatReader(test_file, randomize=None)

    my_sgd = C.SGDParams(epoch_size=0,
                         minibatch_size=25,
                         learning_rates_per_mb=0.1,
                         max_epochs=3)

    with C.LocalExecutionContext('logreg') as ctx:
        ctx.device_id = device_id

        ctx.train(root_nodes=[ce, eval],
                  training_params=my_sgd,
                  input_map=train_reader.map(X, alias='I',
                                             dim=2).map(y, alias='L', dim=3))

        result = ctx.test(root_nodes=[ce, eval],
                          input_map=test_reader.map(X, alias='I',
                                                    dim=2).map(y,
                                                               alias='L',
                                                               dim=3))

        return result
Example #47
def linear_units(input_var, output_dim):
    input_dim = input_var.shape[0]
    # Introduce model parameters
    weight_param = C.parameter(shape=(output_dim, input_dim), name="weights")
    bias_param = C.parameter(shape=(output_dim, 1), name="biases")
    # Reshape to facilitate matrix multiplication
    input_reshaped = C.reshape(input_var, (input_dim, 1))
    # Weighted sums
    params['w'], params['b'] = weight_param, bias_param
    part1 = C.times(weight_param, input_reshaped)
    # Add biases
    part2 = part1 + bias_param
    # Return 1-D representation
    return C.reshape(part2, (num_classes))
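A hedged sketch of calling linear_units above; the function writes into a `params` dict and reads `num_classes` from the enclosing scope, so both are stubbed here with made-up values:

import numpy as np
import cntk as C

params = {}
num_classes = 3                     # free variable read inside linear_units
x = C.input_variable((5,))
logits = linear_units(x, num_classes)
# shape (1, 3); all zeros here since the parameters are zero-initialized by default
print(logits.eval({x: np.ones((1, 5), dtype=np.float32)}))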
Example #48
def test_input_without_dynamic_axes():
    x = C.input_variable(shape=(2,), dynamic_axes=[], needs_gradient=True, name='x')
    assert len(x.dynamic_axes) == 0

    op = x * .01 + 3.0
    grad_result, eval_result = op.grad({x : np.asarray([.6, -.8], dtype=np.float32)}, outputs=[op], wrt=[x])
    assert np.allclose(eval_result, [3.006, 2.992])
    assert np.allclose(grad_result, [.01, .01])

    w = C.parameter(init=np.asarray([[0.5], [-1.5]], dtype=np.float32))
    op = C.times(x, w) + 3.0
    grad_result, eval_result = op.grad({x : np.asarray([.6, -.8], dtype=np.float32)}, outputs=[op], wrt=[w])
    assert np.allclose(eval_result, [4.5])
    assert np.allclose(grad_result, [[.6], [-.8]])
Example #49
def test_gather_2D_using_one_hot_and_times():
    i = C.sequence.input_variable((1, ))
    indices = [[2, 0], [1]]
    sparse_one_hot = C.one_hot(i, num_classes=3, sparse_output=True)
    w = C.parameter((-1, 2, 3), init=C.glorot_uniform())
    t = C.times(sparse_one_hot, w, output_rank=2)
    result = t.eval({i: indices})
    w_value = w.value
    expected_result = [
        np.stack(
            [np.expand_dims(np.asarray(w_value[idx]), axis=0) for idx in seq])
        for seq in indices
    ]
    assert np.array_equal(result[0], expected_result[0])
    assert np.array_equal(result[1], expected_result[1])
Example #50
def test_eval_sparse_seq_1(batch, device_id):
    dim = 4
    multiplier = 2
    for var_is_sparse in [True, False]:
        in1 = sequence.input_variable(shape=(dim,), is_sparse=var_is_sparse)
        z = times(in1, multiplier*np.eye(dim))
        if isinstance(batch[0], list):
            expected = [np.vstack([m.todense() * multiplier for m in seq]) for seq in
                    batch]
        else:
            expected = [seq.todense() * multiplier for seq in batch]
        result = z.eval({in1: batch}, device=cntk_device(device_id))

        assert np.all([np.allclose(a,b) for a,b in zip(result, expected)]), \
                "%s != %s"%(result,expected)
Example #51
    def createAttentionNet(self, hiddenSrc, curHiddenTrg, srcLength):
        srcHiddenSize = Config.SrcHiddenSize * 2
        hsw = C.times(hiddenSrc, self.Was)
        htw = C.times(curHiddenTrg, self.Wat)
        hst = C.reshape(
            hsw, shape=(srcLength, Config.BatchSize * Config.TrgHiddenSize)
        ) + C.reshape(htw, shape=(1, Config.BatchSize * Config.TrgHiddenSize))
        hstT = C.reshape(C.tanh(hst),
                         shape=(srcLength * Config.BatchSize,
                                Config.TrgHiddenSize))
        attScore = C.reshape(C.times(hstT, self.Wav),
                             shape=(srcLength, Config.BatchSize))
        maskOut = (C.slice(self.maskMatrixSrc, 0, 0, srcLength) - 1) * 99999999
        nAttScore = attScore + maskOut
        attProb = C.reshape(C.softmax(nAttScore, axis=0),
                            shape=(srcLength, Config.BatchSize, 1))
        attVector = hiddenSrc * attProb
        contextVector = C.reduce_sum(C.reshape(
            attVector, shape=(srcLength, Config.BatchSize * srcHiddenSize)),
                                     axis=0)
        contextVector = C.reshape(contextVector,
                                  shape=(1, Config.BatchSize, srcHiddenSize))

        return (contextVector, attProb)
Example #52
def test_as_composite():
    input_dim = 1
    proj_dim = 2
    x = C.input_variable((input_dim, ))
    b = C.parameter((proj_dim))
    w = C.parameter((input_dim, proj_dim))
    func_name = 't_plus_b'
    t_plus_b = C.plus(C.times(x, w), b, name=func_name)
    assert (t_plus_b.root_function.name == func_name)
    composite = C.as_composite(t_plus_b.root_function)
    assert (composite.root_function.name == func_name)
    composite = C.as_composite(composite)
    assert (composite.root_function.name == func_name)
    composite = C.as_composite(t_plus_b)
    assert (composite.root_function.name == func_name)
Example #53
def _simple_dict():
    d = {}

    d['i1'] = C.input_variable(shape=(2, 3), name='i1')
    d['c1'] = C.constant(shape=(2, 3), value=6, name='c1')
    d['p1'] = C.parameter(shape=(3, 2), init=7, name='p1')
    d['op1'] = C.plus(d['i1'], d['c1'], name='op1')
    d['op2'] = C.times(d['op1'], d['p1'], name='op2')
    d['root'] = d['op2']

    d['target'] = C.input_variable((), name='label')
    d['all'] = C.combine([d['root'], C.minus(
        d['target'], C.constant(1, name='c2'), name='minus')], name='all')

    return d
Example #54
def test_2d_sparse_csr_batch_input(device_id):
    dev = cntk_device(device_id)
    features = C.input_variable((2, 3), is_sparse=True)
    w = C.parameter(init=np.asarray([[0.5, 1], [-.5, 2], [1., 1.5]],
                                    dtype=np.float32),
                    device=dev)
    t = C.times(features, w)
    features_data = [
        sp.sparse.csr_matrix(
            np.asarray([[1., 0., 0.], [0., 1., 0.]], dtype=np.float32)),
        sp.sparse.csr_matrix(
            np.asarray([[0., 0., 1.], [1., 0., 0.]], dtype=np.float32))
    ]
    result = t.eval({features: features_data}, device=dev)
    assert np.array_equal(result, [[[.5, 1], [-.5, 2]], [[1, 1.5], [.5, 1]]])
Example #55
def test_free_static_axis_times_free_static_axis(output_rank, x_input_shape,
                                                 x_data, y_input_shape,
                                                 y_data):
    x = C.input_variable(x_input_shape)
    y = C.input_variable(y_input_shape)
    t = C.times(x, y, output_rank=output_rank)
    cntk_result = t.eval({x: x_data, y: y_data})[0]
    np_result = []
    for x_item, y_item in zip(x_data, y_data):  #zip over the batch axis
        item_res = np.tensordot(x_item,
                                y_item,
                                axes=len(x_item.shape) - output_rank)
        np_result.append(item_res)
    np_result = np.vstack(np_result)
    np.testing.assert_allclose(np_result, cntk_result)
Example #56
def cumsum(x, axis=0):
    # cumulative sum along `axis`, implemented as multiplication with an
    # upper-triangular matrix of ones
    dim = x.shape[axis]
    U = C.constant(np.triu(np.ones((dim, dim))).astype(x.dtype))
    if axis != -1:
        x = C.swapaxes(x, -1, axis)
    out = C.times(x, U)
    if axis != -1:
        out = C.swapaxes(out, -1, axis)
    return out
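A hedged usage sketch for cumsum above; axis=-1 keeps it on the simple path without the axis swap (the 4-element input is arbitrary):

import numpy as np
import cntk as C

x = C.input_variable((4,))
cs = cumsum(x, axis=-1)
print(cs.eval({x: np.array([[1, 2, 3, 4]], dtype=np.float32)}))   # -> [[1., 3., 6., 10.]]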
Example #57
def test_unpack_axis_times_transpose_unpack_axis(output_rank, x_input_shape,
                                                 x_data, y_input_shape,
                                                 y_data):
    #test free axis times from unpack batch
    x = C.input_variable(x_input_shape)
    y = C.input_variable(y_input_shape)
    xx = C.unpack_batch(x)
    yy = C.unpack_batch(y)
    yyy = C.transpose(yy, range(len(yy.shape))[::-1])
    t = C.times(xx, yyy, output_rank=output_rank)
    cntk_result = t.eval({x: x_data, y: y_data})
    np_result = np.tensordot(x_data,
                             np.transpose(y_data),
                             axes=len(x_data.shape) - output_rank)
    np.testing.assert_allclose(np_result, cntk_result)
Example #58
def test_op_times_sparse_grad(device_id, precision):
    dt_precision = PRECISION_TO_TYPE[precision]

    from cntk import times, times_transpose, parameter, reshape, one_hot
    dim = 5
    num_sequences = 2
    seq = [i for i in range(dim)]
    identity = np.identity(dim, dtype=np.float32)
    input_data = one_hot([seq] * num_sequences, dim)
    input_var = I(shape=(dim), is_sparse=True, needs_gradient=False)
    e = parameter(shape=(dim, dim), init=identity)
    z = reshape(times_transpose(e, times(input_var, e)), dim)
    e_grad = z.grad({input_var: input_data}, [e])

    assert np.allclose(e_grad, np.ones((dim, dim)) * 4)
Example #59
    def scale_dot_product_attention_block(self, contextQ, contextV, contextK, name):

        Q = C.placeholder(shape=(2*self.hidden_dim,), dynamic_axes=[self.b_axis, self.q_axis])
        V = C.placeholder(shape=(2*self.hidden_dim,), dynamic_axes=[self.b_axis, self.q_axis])
        K = C.placeholder(shape=(2*self.hidden_dim,), dynamic_axes=[self.b_axis, self.q_axis])

        Ql = C.layers.Dense(100)(Q)
        Vl = C.layers.Dense(100)(V)
        Kl = C.layers.Dense(100)(K)

        kvw, kvw_mask = C.sequence.unpack(Kl, padding_value=0).outputs
        vvw, _ = C.sequence.unpack(Vl, padding_value=0).outputs
        KT = C.swapaxes(kvw)

        S = C.reshape(C.times(Ql, KT)/math.sqrt(100), -1) 
        kvw_mask_expanded = C.sequence.broadcast_as(kvw_mask, Ql)
        S = C.softmax(C.element_select(kvw_mask_expanded, S, C.constant(-1e+30)))
        att = C.times(S, vvw)

        return C.as_block(
            att,
            [(Q, contextQ), (V, contextV), (K, contextK)],
            'sdp_attention_block' + name,
            'sdp_attention_block' + name)
Example #60
    def rnet_output_layer(self, attention_context, query):

        att_context = C.placeholder(shape=(2*self.hidden_dim,))
        q_processed = C.placeholder(shape=(2*self.hidden_dim,))

        wuq = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        whp = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        wha = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        v = C.parameter(shape=(2*self.hidden_dim, 1), init=C.glorot_uniform())
        bias = C.parameter(shape=(2*self.hidden_dim), init=C.glorot_uniform())

        whp_end = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        wha_end = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        v_end = C.parameter(shape=(2*self.hidden_dim, 1), init=C.glorot_uniform())

        # sequence[tensor[1]] q_len x 1
        s0 = C.times(C.tanh(C.times(q_processed, wuq) + bias), v)
        a0 = C.sequence.softmax(s0)
        rQ = C.sequence.reduce_sum(a0 * q_processed)
        
        # sequence[tensor[1]] plen x 1 
        ts = C.reshape(C.times(C.tanh(
            C.times(att_context, whp) + C.times(C.sequence.broadcast_as(rQ, att_context), wha)), v), (-1))

        # sequence[tensor[1]]
        ta = C.sequence.softmax(ts)

        # sequence[2d] 1 x 2d
        c0 = C.reshape(C.sequence.reduce_sum(ta * att_context), (2*self.hidden_dim))
        
        # sequence[tensor[2d]]
        ha1 = C.layers.blocks.GRU(2*self.hidden_dim)(rQ, c0)

        # sequence[tensor[1]] plen x 1
        s1 = C.reshape(C.times(C.tanh(C.times(att_context, whp_end) + C.times(
            C.sequence.broadcast_as(ha1, att_context), wha_end)), v_end), (-1))

        # sequence[tensor[1]] plen x 1
        a1 = C.sequence.softmax(s1)

        return C.as_block(
            C.combine([ts, s1]),
            [(att_context, attention_context), (q_processed, query)],
            'output_layer',
            'output_layer')