def test_trainer_with_some_params_not_learned():
    # Verify that when only a subset of the parameters (W) is handed to the
    # learner, the excluded parameter (B) is never modified by training,
    # while W changes on every minibatch.
    input_dim = 2
    proj_dim = 2
    x = input_variable(shape=(input_dim,))
    W = parameter(shape=(input_dim, proj_dim), init=glorot_uniform())
    B = parameter(shape=(proj_dim,), init=glorot_uniform())
    t = times(x, W)
    z = t + B

    W_orig_value = W.value
    B_orig_value = B.value

    labels = input_variable(shape=(proj_dim,))
    ce = cross_entropy_with_softmax(z, labels)
    pe = classification_error(z, labels)

    lr_per_sample = learning_rate_schedule(0.1, UnitType.sample)
    # only W is given to sgd, so B must stay frozen
    trainer = Trainer(z, (ce, pe), sgd([W], lr_per_sample))

    x_value = [[1, 1],[2, 2]]
    label_value = [[0, 1], [1, 0]]
    arguments = {x: x_value, labels: label_value}

    num_iters = 3
    for i in range(num_iters):
        trainer.train_minibatch(arguments)

        # B untouched; W updated every iteration
        assert np.array_equal(B.value, B_orig_value)
        assert not np.array_equal(W.value, W_orig_value)
        W_orig_value = W.value

    trainer.test_minibatch(arguments)
def test_op_batch_normalization_spatial_shape_inference(channels, input_size, device_id, precision):
    # Spatial batch normalization with InferredDimension parameters: after the
    # op is built, scale/bias/running-mean/running-var must all be inferred as
    # (channels,) -- one statistic per channel.
    dtype = PRECISION_TO_TYPE[precision]
    dev = cntk_device(device_id)

    spatial = True
    epsilon = 0.01

    # arbitrary distinct initial values so the parameters are distinguishable
    init_scale = 1
    init_bias = 2
    init_mean = 3
    init_var = 4
    init_count = 2

    shape = (channels, input_size, input_size)
    param_shape = (C.InferredDimension,)

    i = C.input_variable(shape, dtype=dtype)
    scale = C.parameter(param_shape, init=init_scale, dtype=dtype, device=dev)
    bias = C.parameter(param_shape, init=init_bias, dtype=dtype, device=dev)
    run_mean = C.constant(init_mean, shape=param_shape, dtype=dtype, device=dev)
    run_var = C.constant(init_var, shape=param_shape, dtype=dtype, device=dev)
    run_count = C.constant(init_count, shape=(), dtype=dtype, device=dev)

    bn = C.batch_normalization(i, scale, bias, run_mean, run_var, spatial,
                               normalization_time_constant=-1,
                               epsilon=epsilon,
                               running_count = run_count)

    # building the op must have resolved the inferred dimension to `channels`
    for param in [scale, bias, run_mean, run_var]:
        assert(param.shape == (channels,))
def test_convert_optimized_rnnstack(num_layers, bidirectional, recurrent_op, device_id):
    # Convert a cuDNN optimized_rnnstack model into the generic (non-cuDNN)
    # implementation and check both produce identical outputs, including the
    # shared-parameter (W2) and BlockFunction-wrapped cases.
    if device_id == -1:
        pytest.skip('only runs on GPU')
    input_dim = 5
    hidden_dim = 3
    # variable-length sequences to exercise the dynamic axis
    data = [np.random.random((20,input_dim)).astype(np.float32),
            np.random.random((10,input_dim)).astype(np.float32),
            np.random.random((40,input_dim)).astype(np.float32)]
    input_var = C.sequence.input_variable(shape=(input_dim,))
    W1 = C.parameter((-1,1), init = C.glorot_uniform())
    W2 = C.parameter((-1,1), init = C.glorot_uniform())
    cudnn_rnn1 = C.optimized_rnnstack(input_var, W1, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, recurrent_op=recurrent_op)
    dense1 = C.layers.Dense(hidden_dim)(cudnn_rnn1)
    cudnn_rnn2 = C.optimized_rnnstack(dense1, W2, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, recurrent_op=recurrent_op)
    dense2 = C.layers.Dense(hidden_dim)(cudnn_rnn2)
    # test shared parameter W2
    cudnn_rnn3 = C.optimized_rnnstack(dense2, W2, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, recurrent_op=recurrent_op)

    def blocked(d):
        # rnnstack wrapped in a BlockFunction, so conversion must descend
        # into blocks as well
        blocked_W = C.parameter((-1,d), init = C.glorot_uniform())
        @C.layers.BlockFunction('', '')
        def func(x):
            return C.optimized_rnnstack(x, blocked_W, d, 1, recurrent_op='lstm')
        return func

    cudnn_model = C.layers.Sequential([blocked(hidden_dim), blocked(2*hidden_dim), blocked(3*hidden_dim)])(cudnn_rnn3)
    cudnn_out = cudnn_model.eval({input_var:data})

    model = C.misc.convert_optimized_rnnstack(cudnn_model)

    # make sure original cudnn model is intact
    cudnn_out2 = cudnn_model.eval({input_var:data})
    assert all(np.allclose(cudnn_out[i], cudnn_out2[i]) for i in range(len(cudnn_out)))

    # converted model must reproduce the cudnn outputs
    model_out = model.eval({model.arguments[0]:data})
    assert all(np.allclose(cudnn_out[i], model_out[i]) for i in range(len(cudnn_out)))
def seqcla():
    # Sequence classification: frozen word embeddings -> LSTM -> softmax,
    # trained and evaluated with the deprecated CNTK v1 execution-context API.
    # Asserts the final accuracy against a known baseline.

    # LSTM params
    input_dim = 50
    output_dim = 128
    cell_dim = 128

    # model
    num_labels = 5
    vocab = 2000
    embed_dim = 50

    t = C.dynamic_axis(name='t')
    features = C.sparse_input(vocab, dynamic_axis=t, name='features')
    labels = C.input(num_labels, name='labels')
    train_reader = C.CNTKTextFormatReader(train_file)

    # setup embedding matrix; learning_rate_multiplier=0.0 keeps it frozen
    embedding = C.parameter((embed_dim, vocab), learning_rate_multiplier=0.0,
                             init_from_file_path=embedding_file)

    # get the vector representing the word
    sequence = C.times(embedding, features, name='sequence')

    # add an LSTM layer
    L = lstm_layer(output_dim, cell_dim, sequence, input_dim)

    # add a softmax layer on top
    w = C.parameter((num_labels, output_dim), name='w')
    b = C.parameter((num_labels), name='b')
    z = C.times(w, L) + b
    z.name='z'
    z.tag = "output"

    # and reconcile the shared dynamic axis
    pred = C.reconcile_dynamic_axis(z, labels, name='pred')

    ce = C.cross_entropy_with_softmax(labels, pred)
    ce.tag = "criterion"

    my_sgd = C.SGDParams(epoch_size=0, minibatch_size=10, learning_rates_per_mb=0.1, max_epochs=3)

    with C.LocalExecutionContext('seqcla') as ctx:
        # train the model
        ctx.train(root_nodes=[ce], training_params=my_sgd, input_map=train_reader.map(
            features, alias='x', dim=vocab, format='Sparse').map(
            labels, alias='y', dim=num_labels, format='Dense'))

        # write out the predictions
        ctx.write(input_map=train_reader.map(
            features, alias='x', dim=vocab, format='Sparse').map(
            labels, alias='y', dim=num_labels, format='Dense'))

        # do some manual accuracy testing
        acc = calc_accuracy(train_file, ctx.output_filename_base)

        # and test for the same number...
        TOLERANCE_ABSOLUTE = 1E-02
        assert np.allclose(acc, 0.6006415396952687, atol=TOLERANCE_ABSOLUTE)
def test_cntk_basic():
    """Round-trip parameters through the crosstalk instance: watch them,
    assign saved values back, and assign explicit values."""
    # FIX: was a bare `except:` which also swallowed KeyboardInterrupt and
    # unrelated failures; catch only the import failure.
    try:
        import tensorflow
        has_tensorflow = True
    except ImportError:
        has_tensorflow = False

    # generate the baseline with whichever framework is available
    if has_tensorflow:
        tf_baseline_basic()
    else:
        cntk_baseline_basic()

    import cntk as C
    import cntk.contrib.crosstalk.crosstalk_cntk as crct
    ci = crct.instance
    ci.set_workdir(workdir)

    p1 = C.parameter(shape1)
    p2 = C.parameter(shape2)
    ci.watch(p1, 'p1')
    ci.watch({'param1':p1, 'param2':p2}, 'p1_p2', var_type=crct.DictParameterType)

    # load the previously saved values and verify they round-trip
    ci.assign('p1', load=True)
    assert np.isclose(p1.value, param1).all()

    ci.assign('p1_p2', load=True)
    assert np.isclose(p1.value, param1).all() and np.isclose(p2.value, param2).all()

    # test assign with value
    ci.assign('p1', value=param1)
    ci.assign('p1_p2', value={'param1':param1, 'param2':param2})

    ci.reset()
def test_get_data_type():
    """Checks how get_data_type resolves precision across parameters,
    placeholders, constants, numpy arrays and plain scalars."""
    param_f32 = C.parameter(init=np.asarray(2, dtype=np.float32))
    param_f64 = C.parameter(init=np.asarray(2, dtype=np.float64))
    ph = C.placeholder(shape=(2))
    const = C.constant(value=3.0)
    arr_f32 = AA(1, dtype=np.float32)
    arr_f64 = AA(1, dtype=np.float64)

    # parameters and plain arrays resolve to their own dtype
    assert get_data_type(param_f32) == np.float32
    assert get_data_type(param_f32, arr_f32) == np.float32
    assert get_data_type(arr_f32, arr_f32) == np.float32
    assert get_data_type(arr_f32, arr_f64) == np.float64

    # a placeholder defers to the supplied data; alone it yields None
    assert get_data_type(ph, arr_f64) == np.float64
    assert get_data_type(ph, arr_f32) == np.float32
    assert get_data_type(ph, ph) is None

    # variable's type shall take precedence over provided data
    assert get_data_type(param_f32, arr_f64) == np.float32
    assert get_data_type(param_f64, arr_f64) == np.float64
    assert get_data_type(param_f32, ph, arr_f64) == np.float32
    assert get_data_type(param_f64, ph, arr_f64) == np.float64

    # numpy scalars map to their dtype; integers and python numbers to
    # float32 (special case for cntk)
    assert get_data_type(np.float64(1)) == np.float64
    assert get_data_type(np.float32(1)) == np.float32
    assert get_data_type(np.int64(1)) == np.float32
    assert get_data_type(1) == np.float32
    assert get_data_type(1.0) == np.float32
def test_noise_injection_with_checkpointing():
    # Three learners with gaussian noise injection over identically-seeded
    # parameters. Restoring learner3 from learner1's checkpoint must make w3
    # track w1 exactly, while w2 (its own learner state, presumably its own
    # noise stream -- see asserts) diverges from w1.
    from cntk import initializer
    shape = (100,100)

    w1 = parameter(shape=shape, init=initializer.glorot_uniform(seed=123))
    w2 = parameter(shape=shape, init=initializer.glorot_uniform(seed=123))
    w3 = parameter(shape=shape, init=initializer.glorot_uniform(seed=123))

    lr=learning_rate_schedule(0.5, UnitType.sample)
    m=C.momentum_schedule(0.99)

    learner1 = C.momentum_sgd([w1], lr, m, gaussian_noise_injection_std_dev=0.5)
    learner2 = C.momentum_sgd([w2], lr, m, gaussian_noise_injection_std_dev=0.5)
    learner3 = C.momentum_sgd([w3], lr, m, gaussian_noise_injection_std_dev=0.5)

    # same seed => identical starting values
    assert np.allclose(w1.value, w2.value) and np.allclose(w1.value, w3.value)

    for i in range(10):
        checkpoint = learner1.create_checkpoint()

        v = np.float32(np.random.rand(100,100))

        learner1.update({w1: v}, 1)
        learner2.update({w2: v}, 1)

        # identical gradient, but w1 and w2 diverge
        assert not np.allclose(w1.value, w2.value)

        # restoring learner1's state into learner3 reproduces w1's update
        learner3.restore_from_checkpoint(checkpoint)
        learner3.update({w3: v}, 1)
        assert np.allclose(w1.value, w3.value)
def _graph_dict():
    # This function creates a graph that has no real meaning other than
    # providing something to traverse.
    d = {}

    # leaves: a sequence input, a constant and a parameter
    d['i1'] = C.sequence.input_variable(shape=(2, 3), sequence_axis=Axis('ia'), name='i1')
    d['c1'] = C.constant(shape=(2, 3), value=6, name='c1')
    d['p1'] = C.parameter(shape=(3, 2), init=7, name='p1')

    d['op1'] = C.plus(d['i1'], d['c1'], name='op1')
    d['op2'] = C.times(d['op1'], d['p1'], name='op2')

    #d['slice'] = slice(d['c1'], Axis.default_dynamic_axis(), 0, 3)
    #label_sentence_start = sequence.first(raw_labels)

    # no name
    d['p2'] = C.parameter(shape=(2, 2))

    # duplicate names
    d['op3a'] = C.plus(d['op2'], d['p2'], name='op3')
    d['op3b'] = C.plus(d['op3a'], d['p2'], name='op3')

    d['first'] = C.sequence.first(d['op3b'], name='past')

    d['root'] = d['first']

    return d
def BinaryConvolution(operand,
                      filter_shape,
                      num_filters=1,
                      channels = 1,
                      init=C.glorot_uniform(),
                      pad=False,
                      strides=1,
                      bias=True,
                      init_bias=0,
                      op_name='BinaryConvolution', name=''):
    """
    Creates a binary convolution layer: both the weights and the operand are
    quantized with CustomMultibit(_, 1) before the convolution, followed by a
    learnable bias and a parametric ReLU.

    arguments:
        operand: tensor to convolve
        filter_shape: tuple indicating filter size
        num_filters: number of filters to use
        channels: number of incoming channels
        init: type of initialization to use for weights
        pad: whether to auto-pad the two spatial dimensions
        strides: convolution stride
        bias, op_name, name: not used in the body -- kept for interface
            compatibility (NOTE(review): confirm before removing)
    """
    kernel_shape = (num_filters, channels) + filter_shape
    W = C.parameter(shape=kernel_shape, init=init, name="filter")

    # quantize weights and operand to 1 bit, then convolve; wrapped in a
    # block so the graph shows a single 'binary_convolve' node
    binary_convolve_operand_p = C.placeholder(operand.shape, operand.dynamic_axes, name="operand")
    binary_convolve = C.convolution(CustomMultibit(W, 1), CustomMultibit(binary_convolve_operand_p, 1), auto_padding=[False, pad, pad], strides=[strides])
    r = C.as_block(binary_convolve, [(binary_convolve_operand_p, operand)], 'binary_convolve')

    # per-filter bias, broadcast over the spatial dims
    bias_shape = (num_filters, 1, 1)
    b = C.parameter(shape=bias_shape, init=init_bias, name="bias")
    r = r + b

    # apply learnable param relu
    P = C.parameter(shape=r.shape, init=init, name="prelu")
    r = C.param_relu(P, r)
    return r
def test_nce_loss(classes, xdim, batch, expected_value, device_id, precision):
    # Noise-contrastive estimation loss: gradients w.r.t. the parameters must
    # come back sparse, and the sampled loss must average to expected_value.
    dt = PRECISION_TO_TYPE[precision]

    from cntk.losses import nce_loss
    import scipy

    x = C.input_variable(xdim, needs_gradient=True)
    y = C.input_variable(classes, is_sparse=True)

    x0 = np.arange(batch * xdim, dtype=dt).reshape((batch, xdim))/(batch * xdim)
    # sparse labels: one nonzero per row, row i selecting class 10*(i+1)
    data = np.ones(batch, dtype=dt)
    indices = list(range(10,10*batch+1,10))
    indptr = list(range(batch+1))
    y0 = scipy.sparse.csr_matrix((data, indices, indptr), shape=(batch, classes))

    # noise-sampling weights, one per class
    q = np.arange(classes, dtype=dt) + 1

    b = C.parameter((classes, 1), init=-np.log(classes))
    W = C.parameter((classes, C.InferredDimension), init=C.glorot_uniform(seed=98052))

    loss = C.nce_loss(W, b, x, y, q, seed=98052)
    v = loss.grad({x:x0, y:y0}, wrt=loss.parameters, as_numpy=False)
    for key in v:
        assert v[key].is_sparse, "gradient of nce_loss with respect to %s is not sparse"%key

    # average over 100 stochastic evaluations to smooth out the sampling noise
    losses = np.zeros((100,batch))
    for i in range(100):
        losses[i,:] = loss.eval({x:x0, y:y0})
    assert np.allclose(np.mean(losses, axis=0), AA(expected_value))
def linear_layer(input_var, output_dim):
    """Affine transform of ``input_var`` to ``output_dim`` outputs."""
    in_dim = input_var.shape[0]

    weights = C.parameter(shape=(in_dim, output_dim))
    bias = C.parameter(shape=(output_dim))

    return bias + C.times(input_var, weights)
def test_eval_again_with_prev_outputs_live(device_id):
    # Results returned by eval()/grad() as numpy stay valid after later calls;
    # results returned as NDArrayView (as_numpy=False) are temporaries that are
    # invalidated by the next eval/grad call and must raise on access.
    x = C.input_variable(2)
    dev = cntk_device(device_id)
    w1 = C.parameter(init=np.asarray([1], dtype=np.float32), device=dev)
    w2 = C.parameter(init=np.asarray([-1], dtype=np.float32), device=dev)
    out1 = x + w1
    out2 = x + w2
    op = C.combine([out1, out2])

    result1 = op.eval({x : np.asarray([2, 5], dtype=np.float32)}, device=dev)
    assert np.array_equal(result1[out1.output], [[3, 6]])
    assert np.array_equal(result1[out2.output], [[1, 4]])

    result2 = op.eval({x : np.asarray([[-1, 4], [-4, 7]], dtype=np.float32)}, device=dev)
    assert np.array_equal(result2[out1.output], [[0, 5], [-3, 8]])
    assert np.array_equal(result2[out2.output], [[-2, 3], [-5, 6]])

    # result1 should still be valid
    assert np.array_equal(result1[out1.output], [[3, 6]])
    assert np.array_equal(result1[out2.output], [[1, 4]])

    # now the same with as_numpy=False: values are views, not copies
    result1 = op.eval({x : np.asarray([2, 5], dtype=np.float32)}, device=dev, as_numpy=False)
    assert np.array_equal(result1[out1.output].asarray(), [[3, 6]])
    assert np.array_equal(result1[out2.output].asarray(), [[1, 4]])

    result2 = op.eval({x : np.asarray([[-1, 4], [-4, 7]], dtype=np.float32)}, device=dev, as_numpy=False)
    assert np.array_equal(result2[out1.output].asarray(), [[0, 5], [-3, 8]])
    assert np.array_equal(result2[out2.output].asarray(), [[-2, 3], [-5, 6]])

    # Accessing result1 now will cause an error since it was a temporary that
    # is now erased, due to the subsequent eval call
    with pytest.raises(RuntimeError):
        assert np.array_equal(result1[out1.output].asarray(), [[3, 6]])

    # same pattern for gradients
    grad_op = out1 + out2
    grad1 = grad_op.grad({x : np.asarray([2, 5], dtype=np.float32)}, wrt=[w1, w2], device=dev)
    assert np.array_equal(grad1[w1], [2])
    assert np.array_equal(grad1[w2], [2])

    grad2 = grad_op.grad({x : np.asarray([[-1, 4], [-4, 7]], dtype=np.float32)}, wrt=[w1, w2], device=dev)
    assert np.array_equal(grad2[w1], [4])
    assert np.array_equal(grad2[w2], [4])

    # grad1 should still be valid
    assert np.array_equal(grad1[w1], [2])
    assert np.array_equal(grad1[w2], [2])

    grad1 = grad_op.grad({x : np.asarray([2, 5], dtype=np.float32)}, wrt=[w1, w2], device=dev, as_numpy=False)
    assert np.array_equal(grad1[w1].asarray(), [2])
    assert np.array_equal(grad1[w2].asarray(), [2])

    grad2 = grad_op.grad({x : np.asarray([[-1, 4], [-4, 7]], dtype=np.float32)}, wrt=[w1, w2], device=dev, as_numpy=False)
    assert np.array_equal(grad2[w1].asarray(), [4])
    assert np.array_equal(grad2[w2].asarray(), [4])

    # Accessing grad1 now will cause an error since it was a temporary that
    # is now erased, due to the subsequent grad call
    with pytest.raises(RuntimeError):
        assert np.array_equal(grad1[w1].asarray(), [2])
def linear_layer(input_var, output_dim):
    """Affine layer that also registers its parameters in the global
    ``param_dict`` under 'w' and 'b'."""
    in_dim = input_var.shape[0]

    weights = C.parameter(shape=(in_dim, output_dim))
    bias = C.parameter(shape=(output_dim))

    # expose the learnables for later inspection
    param_dict['w'], param_dict['b'] = weights, bias

    return C.times(input_var, weights) + bias
def test_gather_op(device_id, precision):
    """gather: forward selection, gradients w.r.t. the table (not the
    indices), and memory reuse of the gather node in a training loop."""
    a_data = [AA([[0],[1]], dtype=PRECISION_TO_TYPE[precision]),
              AA([[3],[4]], dtype=PRECISION_TO_TYPE[precision])]
    a = C.input_variable((2,1))
    r_data = np.arange(12).reshape(6,2).astype('f')
    # FIX: shape must come from r_data.shape; previously r_data.data (the raw
    # buffer/memoryview) was passed where a shape tuple is expected.
    r = C.parameter(shape=r_data.shape, init=r_data)
    res = C.gather(r, a).eval({a:a_data})
    expected = np.asarray([[[[0., 1.]],[[2., 3.]]],[[[6., 7.]],[[8.,9.]]]])
    assert np.array_equal(res, expected)

    # gradient w.r.t. the table: 1 for every gathered row, 0 elsewhere
    grads = C.gather(r, a).grad({a:a_data}, [r])
    expected_grad = np.asarray([[1,1],[1,1],[0,0],[1,1],[1,1],[0,0]], dtype=np.float32)
    assert np.array_equal(grads, expected_grad)

    # gather with indices from learning parameter (no gradients should be
    # passed through the indices -- 0s should be passed)
    indices_params = C.parameter(shape=(1,), init=1.0)
    grads = C.gather(r, (indices_params * a)).grad({a:a_data}, [r, indices_params])
    assert np.array_equal(grads[r], expected_grad)
    assert np.array_equal(grads[indices_params], np.asarray([0.0], dtype=np.float32))

    b_data = [AA([[0,2],[1,3]], dtype=PRECISION_TO_TYPE[precision]),
              AA([[2,4],[3,5]], dtype=PRECISION_TO_TYPE[precision])]
    b = C.input_variable((2,2))
    res2 = C.gather(r, b).eval({b:b_data})
    expected2 = np.asarray([[[[0., 1.],[4.,5.]],[[2., 3.],[6., 7.]]],
                            [[[4., 5.],[8.,9.]],[[6., 7.], [10., 11.]]]])
    assert np.array_equal(res2, expected2)

    # the following small model is to test the memory reuse issue of the
    # gather node
    x = C.input((3, 4))
    x1 = C.to_sequence(x)
    w = C.parameter((5, 6), init=1)
    z = C.gather(w, x1)
    assert z.shape == (4, 6)
    # need the unpack node to trigger memory reuse
    f = C.sequence.unpack(z, 0, no_mask_output=True)
    y = C.input((3, 4, 6))
    loss = C.reduce_mean(C.square(f - y), axis=-1)
    loss = C.reduce_mean(loss, axis=C.Axis.all_axes())

    g = C.constant(0, shape=w.shape)
    u = C.assign(w, g + 1)
    learner = C.cntk_py.universal_learner([w], [g], u)
    trainer = C.trainer.Trainer(loss, [loss], [learner])
    indices = np.asarray([[[1, 2, 1, 2]]])
    input = np.repeat(np.repeat(indices, 3, axis=1), 10, axis=0)
    label = np.full((10, 3, 4, 6), 2)
    trainer.train_minibatch({x: input, y: label})

    # the 2nd and 3rd rows should be updated by gradients
    assert np.mean(w.value[1, :]) < 1
    assert np.mean(w.value[2, :]) < 1
    # the other three rows should stay at 1
    assert np.isclose(np.mean(w.value[0, :]), 1)
    assert np.isclose(np.mean(w.value[3, :]), 1)
    assert np.isclose(np.mean(w.value[4, :]), 1)
def test_ext_eval_5_times():
    """Feed a user function's output into times() and evaluate without any
    inputs: (p + 3) @ W where W is all twos of shape (2, 50)."""
    dim = 2
    init_val = 10
    p = C.parameter(shape=(dim,), init=init_val, name='p')
    plus_fn = C.user_function(MyPlus(p, C.constant(3)))
    z = C.times(plus_fn, C.parameter(shape=(2, 50), init=2))

    result = z.eval()
    # No batch dimension since we have no input
    assert np.allclose(result, ((init_val * np.ones_like(result)) + 3) * 2 * 2)
def test_h_softmax_for_sequence():
    # Two-level hierarchical softmax over sequence inputs: checks the target
    # probability, the class probabilities and the per-class output
    # probabilities against precomputed baselines (seeded initializers).
    input_dim = 2
    num_output_classes = 4
    minibatch_size = 3
    seq_size = 2
    # number of first-level classes ~ sqrt(number of outputs)
    n_classes = int(ceil(sqrt(num_output_classes)))
    n_outputs_per_class = n_classes

    w1 = C.parameter(shape=(input_dim, n_classes), init=C.glorot_normal(seed=2), name='w1')
    b1 = C.parameter(shape=(n_classes), init=C.glorot_normal(seed=3), name='b1')
    w2s = C.parameter(shape=(n_classes, input_dim, n_outputs_per_class), init=C.glorot_normal(seed=4), name='w2s')
    b2s = C.parameter(shape=(n_classes, n_outputs_per_class), init=C.glorot_normal(seed=5), name='b2s')

    # neural network structure for hierarchical softmax
    h_input = C.sequence.input_variable(input_dim)
    h_target_class = C.sequence.input_variable([1])
    h_target_output_in_class = C.sequence.input_variable([1])
    h_z, class_probs, all_probs = C.hierarchical_softmax_layer_for_sequence(h_input, num_output_classes, h_target_class, h_target_output_in_class, minibatch_size, w1, b1, w2s, b2s)

    # deterministic inputs and labels; split each label into (class, in-class)
    a = np.reshape(np.arange(seq_size * minibatch_size * input_dim, dtype = np.float32), (seq_size, minibatch_size, input_dim))
    labels = np.reshape(np.arange(seq_size * minibatch_size, dtype = np.float32), (seq_size, minibatch_size, 1)) % num_output_classes
    target_labels = labels // n_outputs_per_class
    target_output_in_labels = labels % n_outputs_per_class

    val_z = h_z.eval({h_input: a, h_target_class: target_labels, h_target_output_in_class: target_output_in_labels})
    val_class_probs = class_probs.eval({h_input: a, h_target_class: target_labels, h_target_output_in_class: target_output_in_labels})
    val_all_probs = [x.eval({h_input: a, h_target_class: target_labels, h_target_output_in_class: target_output_in_labels}) for x in all_probs]

    # precomputed baselines for the seeds used above
    expected_z = [[[ 0.16448107], [ 0.00597861], [ 0.99322051]], [[ 8.59128195e-04], [ 3.77086673e-09], [ 3.42400197e-12]]]
    expected_class_probs = [[[ 5.81252098e-01, 4.18747932e-01], [ 1.03938626e-02, 9.89606142e-01], [ 7.94661901e-05, 9.99920487e-01]], [[ 6.01340048e-07, 9.99999404e-01], [ 4.55011762e-09, 1.00000000e+00], [ 3.44291574e-11, 1.00000000e+00]]]
    expected_all_probs = [[[[ 1.64481074e-01, 4.16771024e-01], [ 4.41524992e-03, 5.97861316e-03], [ 4.61043091e-05, 3.33618809e-05]], [[ 4.33648694e-07, 1.67691354e-07], [ 3.77086673e-09, 7.79251219e-10], [ 3.10051568e-11, 3.42400197e-12]]], [[[ 0.29590073, 0.12284722], [ 0.93986785, 0.04973821], [ 0.99322051, 0.00669997]], [[ 9.99140263e-01, 8.59128195e-04], [ 9.99890447e-01, 1.09594235e-04], [ 9.99986053e-01, 1.39711719e-05]]]]

    assert np.allclose(expected_z, val_z)
    assert np.allclose(expected_class_probs, val_class_probs)
    assert np.allclose(expected_all_probs, val_all_probs)
def hierarchical_softmax_layer(input_var, label_index, label_dim, label_classes=None):
    '''
    A two layers hierarchical softmax function:

    Args:
        input_var: Variable with shape: [#,*](dim_x)
        label_index: index of label's category: [#,*](1)
        label_dim: number of the label categories
        label_classes: number of classes of the label categories

    Returns:
        output_prob: the probability of the given label [#,*](1)
        class_probs: the probability of all the label classes [#,*](label_classes)
        all_probs: the probability of all label classes
    '''
    input_dim = input_var.shape[0]

    # default: roughly sqrt(label_dim) classes so both levels stay small
    if not label_classes:
        label_classes = int(np.ceil(np.sqrt(float(label_dim))))

    n_outputs_per_class = int(np.ceil(label_dim / label_classes))

    # split the flat label index into (class, index within class)
    target_class = C.floor((label_index + 0.5) / n_outputs_per_class)
    target_output_in_class = C.round(label_index - target_class * n_outputs_per_class)

    w1 = parameter(shape=(input_dim, label_classes), init=C.glorot_normal(), name='hsoftmax_w1')
    b1 = parameter(shape=(label_classes), init=C.glorot_normal(), name='hsoftmax_b1')
    w2s = parameter(shape=(label_classes, input_dim, n_outputs_per_class,), init=C.glorot_normal(), name='hsoftmax_w2s')
    b2s = parameter(shape=(label_classes, n_outputs_per_class,), init=C.glorot_normal(), name='hsoftmax_b2s')

    # first level: probability of each class
    class_probs = softmax(b1 + times(input_var, w1))

    # TODO: fix the bug in backprop for sparse, and use sparse embedding to accelerate
    target_class_one_hot = C.one_hot(target_class, num_classes=label_classes, sparse_output=False)
    # select the second-level weights/bias for the target class
    w2 = C.reshape(C.times(target_class_one_hot, w2s, output_rank=2), [input_dim, -1])
    b2 = C.reshape(times(target_class_one_hot, b2s, output_rank=1), [-1])
    # second level: probability within the selected class
    probs_in_class = softmax(b2 + times(input_var, w2))
    prob_in_class = C.times_transpose(C.one_hot(target_output_in_class, num_classes=n_outputs_per_class, sparse_output=False), probs_in_class)
    class_prob = C.times_transpose(C.one_hot(target_class, num_classes=label_classes, sparse_output=False), class_probs)
    # P(label) = P(class) * P(label | class)
    output_prob = prob_in_class * class_prob

    # this is for calculating all the outputs' probabilities
    all_probs = []
    for i in range(label_classes):
        ci = C.constant(i)
        ci_one_hot = C.one_hot(ci, num_classes=label_classes, sparse_output=False)
        w2a = C.times(ci_one_hot, w2s, output_rank=2)
        b2a = C.times(ci_one_hot, b2s, output_rank=1)
        probs_in_classa = C.softmax(b2a + times(input_var, w2a))
        class_proba = C.times_transpose(ci_one_hot, class_probs)
        output_proba = probs_in_classa * class_proba
        all_probs.append(output_proba)

    return output_prob, class_probs, all_probs
def test_clone_with_function_in_substitution_map():
    """clone() must accept substitution maps whose values are Functions,
    both placeholder-for-node and node-for-placeholder directions."""
    input_dim = 1
    proj_dim = 2

    x = C.input_variable((input_dim,))
    w = C.parameter((input_dim, proj_dim))
    projection = C.times(x, w)
    b = C.parameter((proj_dim))
    affine = projection + b

    hole = C.placeholder()
    # carve the times-node out, then substitute it back in
    without_projection = affine.clone('clone', {projection: hole})
    rebuilt = without_projection.clone('share', {hole: projection})
def test_assign_to_param(input_data, device_id, precision):
    """C.assign copies the source parameter's value into the destination
    parameter, and the op's own result equals the assigned data."""
    dtype = PRECISION_TO_TYPE[precision]
    data = AA(input_data, dtype=dtype)

    source = C.parameter(init=data)
    dest = C.parameter(shape=data.shape, dtype=dtype)

    result = C.assign(dest, source).eval()

    assert np.array_equal(dest.asarray(), data)
    assert np.array_equal(result, data)
def cntk_baseline_basic():
    """Create baseline parameters and persist them via the crosstalk
    instance for later comparison."""
    import cntk as C
    import cntk.contrib.crosstalk.crosstalk_cntk as crct
    ci = crct.instance

    first = C.parameter(shape1, init=param1)
    second = C.parameter(shape2, init=param2)

    # register a single parameter and a dict of parameters
    ci.watch(first, 'p1')
    ci.watch({'param1': first, 'param2': second}, 'p1_p2',
             var_type=crct.DictParameterType)

    ci.set_workdir(workdir)
    ci.fetch('p1', save=True)
    ci.fetch('p1_p2', save=True)
    ci.reset()
def test_0d_1d_parameter_set_value():
    """Parameter.value can be assigned from the NDArrayView produced by
    grad(as_numpy=False), for both scalar and vector parameters."""
    x = C.input_variable(2)

    # scalar (0-d) parameter: gradient of x + w sums over both elements
    scalar_param = C.parameter(())
    op = x + scalar_param
    scalar_grad = op.grad({x: np.asarray([1, 2], dtype=np.float32)},
                          wrt=[scalar_param], as_numpy=False)
    scalar_param.value = scalar_grad.data
    assert scalar_param.value == 2.

    # vector (1-d) parameter: gradient is one per element
    vec_param = C.parameter(shape=2)
    op = x + vec_param
    vec_grad = op.grad({x: np.asarray([1, 2], dtype=np.float32)},
                       wrt=[vec_param], as_numpy=False)
    vec_param.value = vec_grad.data
    assert np.array_equal(vec_param.value, [1., 1.])
def test_free_static_axis_in_recurrence():
    # A recurrence whose input carries a free static axis (C.FreeDimension)
    # must still evaluate and produce gradients.
    x = C.sequence.input_variable((C.FreeDimension, 2))
    out_placeholder = C.placeholder()
    out_past = C.sequence.past_value(out_placeholder)
    wh = C.parameter(init=np.asarray([[2, 5], [1, 3]], dtype=np.float32))
    wx = C.parameter(init=np.asarray([[1, 4], [2, 5]], dtype=np.float32))
    out = C.times(x, wx) + C.times(out_past, wh)
    # close the recurrence loop
    out.replace_placeholders({out_placeholder : out})

    x_data = np.asarray([[0.5, 0.2], [-0.7, 1.2]], np.float32)
    w_grad, out_val = out.grad({x : x_data}, wrt=[wh, wx], outputs=[out])
    assert np.allclose(out_val, [[[[0.9, 3.], [1.7, 3.2]]]])
    assert np.allclose(w_grad[wx], [[-0.2, -0.2], [1.4, 1.4]])
def test_validation_before_eval():
    """Evaluating a times() whose InferredDimension was never resolved by
    any input must raise ValueError."""
    w = C.parameter((4, C.InferredDimension))
    v = C.parameter((C.InferredDimension, 5))
    wv = C.times(w, v)

    # attaching inputs resolves the inferred dimensions of w and v ...
    p = C.input((4, 1))
    wp = C.times(w, p)
    q = C.input((1, 5))
    qv = C.times(q, v)

    # ... but wv itself was built before resolution and cannot be evaluated
    with pytest.raises(ValueError):
        wv.eval()
def test_assign_dependency(input_data, device_id, precision):
    """An assign op combined with a node that reads the destination: the
    reader sees the value from before the assignment within the same
    evaluation; afterwards the destination holds the new value."""
    dtype = PRECISION_TO_TYPE[precision]
    data = AA(input_data, dtype=dtype)

    source = C.parameter(init=data)
    dest = C.parameter(shape=data.shape, dtype=dtype)

    assign_op = C.assign(dest, source)
    reader = dest + source

    result = C.combine([reader, assign_op]).eval()
    assert np.array_equal(result[reader.output], data)
    assert np.array_equal(dest.asarray(), data)
    # after the assignment dest == data, so re-evaluating gives 2 * data
    assert np.array_equal(reader.eval(), data + data)
def test_as_composite():
    """as_composite preserves the root function's name, whether applied to
    the root function, to an existing composite, or to the function itself."""
    input_dim = 1
    proj_dim = 2
    x = C.input_variable((input_dim,))
    b = C.parameter((proj_dim))
    w = C.parameter((input_dim, proj_dim))

    func_name = 't_plus_b'
    t_plus_b = C.plus(C.times(x, w), b, name=func_name)
    assert t_plus_b.root_function.name == func_name

    # wrap the root function
    wrapped = C.as_composite(t_plus_b.root_function)
    assert wrapped.root_function.name == func_name

    # idempotent on an existing composite
    wrapped = C.as_composite(wrapped)
    assert wrapped.root_function.name == func_name

    # works on the function itself too
    wrapped = C.as_composite(t_plus_b)
    assert wrapped.root_function.name == func_name
def matching_attention_layer(self, attention_context): att_context = C.placeholder(shape=(2*self.hidden_dim,)) #matching layer matching_model = C.layers.AttentionModel(attention_dim=self.hidden_dim, name='attention_model') #gate weight Wg = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim)) #gru att_gru = C.layers.GRU(self.hidden_dim) @C.Function def out_func1(att_input, enc_input): enc_input2 = enc_input @C.Function def bigru_with_match(dh, x): c_att = matching_model(att_input, dh) x = C.splice(x, c_att) x = C.element_times(x, C.sigmoid(C.times(x, Wg))) return att_gru(dh, x) return C.splice(C.layers.Recurrence(bigru_with_match)(enc_input2), C.layers.Recurrence(bigru_with_match, go_backwards=True)(enc_input2), name="bigru_with_match") match_context = out_func1(att_context, att_context) return C.as_block( match_context, [(att_context, attention_context)], 'matching_attention_layer', 'matching_attention_layer')
def test_times_2d_sparse_operand(device_id):
    # times() with a sparse one-hot 2d left operand, with and without a
    # sequence axis; the identity weight matrix makes the expected result a
    # plain row selection.
    from .. import times

    dev = cntk_device(device_id)

    vocab_size = 6
    sample_shape = (2, vocab_size)
    input_sparse_indices = [[1, 3], [2, 4], [0, 2]]
    input_data = C.Value.one_hot(input_sparse_indices, sample_shape, device=dev)

    a = C.input_variable(shape=sample_shape, is_sparse=True, needs_gradient=True, name='a')
    w_init = np.eye(vocab_size, dtype=np.float32)
    w = C.parameter(init=w_init, device=dev)
    a_dense = times(a, w)

    # TODO: Also test the results from grad
    grad = a_dense.grad({a : input_data}, [w, a], as_numpy=False, device=dev)

    res = a_dense.eval({a : input_data}, device=dev)
    assert np.array_equal(res, [[w_init[input_sparse_indices[0]]], [w_init[input_sparse_indices[1]]], [w_init[input_sparse_indices[2]]]])

    # batch-only (no sequence axis) variant of the same product
    a_no_sequence = C.input_variable(shape=sample_shape, is_sparse=True, name='a', dynamic_axes=[C.Axis.default_batch_axis()])
    a_no_sequence_dense = times(a_no_sequence, w)
    res = a_no_sequence_dense.eval({a_no_sequence : input_data}, device=dev)
    assert np.array_equal(res, [w_init[input_sparse_indices[0]], w_init[input_sparse_indices[1]], w_init[input_sparse_indices[2]]])
def test_gather_op(device_id, precision):
    """Basic gather: forward row selection and the gradient w.r.t. the
    gathered table."""
    a_data = [
        AA([[0], [1]], dtype=PRECISION_TO_TYPE[precision]),
        AA([[3], [4]], dtype=PRECISION_TO_TYPE[precision])
    ]
    a = C.input_variable((2, 1))
    r_data = np.arange(12).reshape(6, 2).astype('f')
    # FIX: pass the actual shape; previously shape=r_data.data handed the raw
    # buffer (memoryview) where a shape tuple is expected.
    r = C.parameter(shape=r_data.shape, init=r_data)

    res = C.gather(r, a).eval({a: a_data})
    expected = np.asarray([[[[0., 1.]], [[2., 3.]]],
                           [[[6., 7.]], [[8., 9.]]]])
    assert np.array_equal(res, expected)

    # each gathered row contributes a gradient of 1; untouched rows stay 0
    grads = C.gather(r, a).grad({a: a_data}, [r])
    expected_grad = np.asarray([[1, 1], [1, 1], [0, 0], [1, 1], [1, 1], [0, 0]],
                               dtype=np.float32)
    assert np.array_equal(grads, expected_grad)

    b_data = [
        AA([[0, 2], [1, 3]], dtype=PRECISION_TO_TYPE[precision]),
        AA([[2, 4], [3, 5]], dtype=PRECISION_TO_TYPE[precision])
    ]
    b = C.input_variable((2, 2))
    res2 = C.gather(r, b).eval({b: b_data})
    expected2 = np.asarray([[[[0., 1.], [4., 5.]], [[2., 3.], [6., 7.]]],
                            [[[4., 5.], [8., 9.]], [[6., 7.], [10., 11.]]]])
    assert np.array_equal(res2, expected2)
def test_ext_backpropstate(payload):
    # A UserFunction may return an arbitrary state object from forward();
    # verify that exactly the same object is handed back to backward().

    class TestBackPropState(UserFunction):
        def __init__(self, arg, payload, name='f1'):
            # remember the payload so backward() can check it round-trips
            self.payload = payload
            super(TestBackPropState, self).__init__([arg])

        def infer_outputs(self):
            # identity: output mirrors the single input
            return [C.output_variable(self.inputs[0].shape, self.inputs[0].dtype, self.inputs[0].dynamic_axes)]

        def forward(self, argument, device=None, outputs_to_retain=None):
            # return (state, output); the state travels to backward()
            return self.payload, argument

        def backward(self, state, root_gradients):
            assert state == self.payload
            return root_gradients

    dim = 4

    p = C.parameter(shape=(dim,), init=10)
    in1 = C.input_variable(dim, needs_gradient=True, name='i_var')
    m = C.user_function(TestBackPropState(in1, payload))
    z = m + p

    lr_per_sample = C.learning_parameter_schedule(0.007, minibatch_size=1)
    trainer = C.Trainer(None, (z), [C.sgd(z.parameters, lr_per_sample)])

    # each minibatch exercises one forward/backward round trip of the state
    for i in range(100):
        input_data = np.random.rand(dim)
        trainer.train_minibatch({in1: [input_data]})
def test_output_subset_evaluation(device_id):
    """forward() restricted to a subset of outputs must not evaluate the
    excluded outputs -- here op2 depends on a parameter deliberately placed
    on the wrong device, which would error if touched."""
    try:
        gpu_device = C.gpu(0)
    except ValueError:
        pytest.skip('Test only runs when GPU available')
    device = cntk_device(device_id)

    x1 = C.input_variable(shape=())
    op1 = C.constant(value=1, shape=(1), device=device) + (C.constant(value=1, shape=(1), device=device) + x1)

    x2 = C.input_variable(shape=(1))

    # Deliberately locate the parameter on a different device
    # instead of the actual compute target device, so that
    # if we try to use this parameter, it results in an error
    parameter_device = gpu_device if device.type() == 0 else C.cpu()
    p = C.parameter(shape=(1), init=C.glorot_uniform(), device=parameter_device)
    op2 = (x2 - C.constant(value=10, shape=(1), device=device)) - p

    combined = C.combine([op1, op2])
    # request only op1: the evaluation must succeed and give x1 + 2
    _, result = combined.forward({x1 : np.asarray([1, 2, 3])}, [op1], device=device)
    assert np.array_equal(result[op1], np.asarray([[3], [4], [5]]))
def test_cntk_cudnn():
    """Round-trip a cuDNN bidirectional LSTM through crosstalk: load the
    baseline weights into the cudnn model and compare outputs, then save and
    reload under the cudnn name and compare again."""
    # FIX: was a bare `except:` which also swallowed KeyboardInterrupt and
    # unrelated failures; catch only the import failure.
    try:
        import tensorflow
        has_tensorflow = True
    except ImportError:
        has_tensorflow = False

    # produce the baseline with whichever framework is available
    if has_tensorflow:
        tf_baseline_lstm()
    else:
        cntk_baseline_lstm()

    import cntk as C
    import cntk.contrib.crosstalk.crosstalk_cntk as crct
    ci = crct.instance

    input_var = C.sequence.input(shape=(in_dim))
    data = {input_var:data_cntk}
    ci.set_data(data)
    ci.set_workdir(workdir)

    W = C.parameter((-1,dim,), init=C.glorot_uniform())
    cudnn_fwbw = C.optimized_rnnstack(input_var, W, dim, 1, bidirectional=True, recurrent_op='lstm')
    ci.watch(cudnn_fwbw, 'cntk_birnn_cudnn', var_type=cstk.RnnAttr,
          attr=cstk.RnnAttr(bidirectional=True, op_type='lstm', input_dim=in_dim, hidden_dim=dim, forget_bias=0))
    ci.watch(cudnn_fwbw, 'cntk_birnn_cudnn_out')

    # load baseline weights and check the outputs agree
    ci.assign('cntk_birnn_cudnn', load=True, load_name='cntk_birnn')
    assert ci.compare('cntk_birnn_cudnn_out', compare_name='cntk_birnn_out')

    # save under the cudnn name, reload, and check again
    ci.fetch('cntk_birnn_cudnn', save=True)
    ci.assign('cntk_birnn_cudnn', load=True)
    assert ci.compare('cntk_birnn_cudnn_out', compare_name='cntk_birnn_out')

    ci.reset()
def test_trainer(tmpdir, no_eval_function):
    # End-to-end Trainer smoke test: build a tiny model, train one minibatch,
    # checkpoint with arbitrary external state, restore it, and verify the
    # Python-facing types are not leaked Swig raw types.
    in1 = C.input_variable(shape=(1,))
    labels = C.input_variable(shape=(1,))
    p = parameter(shape=(2,), init=10)
    z = plus(in1, reduce_sum(p), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    # the evaluation function is optional
    if no_eval_function:
        errs = None
    else:
        errs = classification_error(z, labels)

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_parameter_schedule(0.007, minibatch_size =1)
    trainer = C.Trainer(z, (ce, errs),
            [C.momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)])
    in1_value = [[1],[2]]
    label_value = [[0], [1]]
    arguments = {in1: in1_value, labels: label_value}
    z_output = z.output
    updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output])

    # checkpoint round-trip must preserve arbitrary picklable external state
    p = str(tmpdir / 'checkpoint.dat')
    external_state = {"additional external state":math.pi, "nested dict":{"a":"b"}, "list":[1,2,3]}
    trainer.save_checkpoint(p, external_state)
    restored_state = trainer.restore_from_checkpoint(p)

    assert external_state == restored_state

    assert trainer.model.name == 'z'

    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
    assert isinstance(trainer.parameter_learners[0], C.Learner)
def test_learner_logging():
    """Per-minibatch learning-rate/momentum values must reach the progress writer."""
    from cntk import Trainer
    from cntk.logging import ProgressPrinter
    from cntk import cross_entropy_with_softmax, classification_error

    # Tiny one-parameter model: z = features * w.
    features = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w = parameter(shape=(1,), init=1)
    z = features * w
    labels = C.input_variable(shape=(1,), name='b')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    writer = TestProgressWriter()
    lr_values = [0.3, 0.2, 0.1, 0]
    m_values = [0.6, 0.7, 0.8]
    learner = C.momentum_sgd(z.parameters,
                             learning_rate_schedule(lr_values, UnitType.sample, 1),
                             C.momentum_schedule(m_values, 1))
    trainer = Trainer(z, (ce, errs), [learner], writer)

    for _ in range(10):
        trainer.train_minibatch({features: [[2.]], labels: [[1.]]})

    # One log entry per scheduled lr value and per momentum value.
    assert len(writer.log_output) == len(lr_values + m_values)
    # Entries interleave (lr, momentum) pairs; the final lr value 0 closes it.
    expected = [v for pair in zip(lr_values, m_values) for v in pair] + [0]
    for idx, want in enumerate(expected):
        assert writer.log_output[idx] == want
def create_fast_rcnn_predictor(conv_out, rois, fc_layers, cfg):
    """Attach the Fast R-CNN classification and bbox-regression heads.

    Returns (cls_score, bbox_pred) built on top of the ROI-pooled, fully
    connected features.
    """
    roi_dim = cfg["MODEL"].ROI_DIM
    num_classes = cfg["DATA"].NUM_CLASSES

    roi_out = roipooling(conv_out, rois, cntk.MAX_POOLING,
                         (roi_dim, roi_dim), spatial_scale=1/16.0)
    fc_out = fc_layers(roi_out)

    # Classification head.
    # NOTE(review): assumes fc_layers emits 4096-dim features — confirm.
    W_pred = parameter(shape=(4096, num_classes), init=normal(scale=0.01),
                       name="cls_score.W")
    b_pred = parameter(shape=num_classes, init=0, name="cls_score.b")
    cls_score = plus(times(fc_out, W_pred), b_pred, name='cls_score')

    # Bounding-box regression head: 4 coordinates per class.
    W_regr = parameter(shape=(4096, num_classes * 4), init=normal(scale=0.001),
                       name="bbox_regr.W")
    b_regr = parameter(shape=num_classes * 4, init=0, name="bbox_regr.b")
    bbox_pred = plus(times(fc_out, W_regr), b_regr, name='bbox_regr')

    return cls_score, bbox_pred
def embed(self):
    """Build the word-embedding function.

    Rows for in-vocabulary words combine a fixed GloVe vector (first 300
    dims) with the averaged ELMo embedding; out-of-glove words use a
    trainable matrix. Returns a function ``(wg, wn) -> embedding`` where
    wg/wn are one-hot over the glove / non-glove vocab respectively.
    """
    npglove = np.zeros((self.wg_dim, self.elmo_dim + self.hidden_dim), dtype=np.float32)
    hf = h5py.File(os.path.join(self.abs_path, self.data_config['elmo_embedding']), 'r')
    with open(os.path.join(self.abs_path, self.data_config['glove_embedding']), encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            word = parts[0].lower()
            if word in self.vocab:
                try:
                    if len(parts) == 301:
                        # First 300 dims: the GloVe vector itself.
                        npglove[self.vocab[word], :300] = np.asarray(
                            [float(p) for p in parts[-300:]])
                    # Remaining dims: mean ELMo embedding of the word.
                    npglove[self.vocab[word], 300:] = np.average(hf[word][:], axis=0)
                # BUGFIX: narrowed from a bare `except:` (which also swallowed
                # KeyboardInterrupt/SystemExit). Word missing from the ELMo
                # file or a malformed line falls back to the <UNK> embedding.
                except Exception:
                    npglove[self.vocab[word], 300:] = np.average(hf['<UNK>'][:], axis=0)
    glove = C.constant(npglove)
    nonglove = C.parameter(shape=(self.wn_dim, self.elmo_dim + self.hidden_dim),
                           init=C.glorot_uniform(), name='TrainableE')

    def func(wg, wn):
        return C.times(wg, glove) + C.times(wn, nonglove)
    return func
def test_empty_minibatch():
    """Training on an empty feed dict must be a no-op, not a crash."""
    scalar = C.input_variable((1,), dtype=np.float32, name='tscalar')
    model = scalar + parameter(init=np.asarray([1]), dtype=np.float32)
    schedule = C.learning_parameter_schedule(0.1, minibatch_size=1)
    trainer = C.Trainer(model, (model, None), C.sgd(model.parameters, schedule))
    trainer.train_minibatch({})
def test_trainer(tmpdir, no_eval_function):
    """Round-trip a trainer checkpoint and sanity-check the wrapped model."""
    in1 = input(shape=(1,))
    labels = input(shape=(1,))
    weights = parameter(shape=(2,), init=10)
    z = plus(in1, reduce_sum(weights), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = None if no_eval_function else classification_error(z, labels)

    learner = momentum_sgd(z.parameters,
                           learning_rate_schedule(0.007, UnitType.sample),
                           momentum_as_time_constant_schedule(1100),
                           True)
    trainer = Trainer(z, (ce, errs), [learner])

    feed = {in1: [[1], [2]], labels: [[0], [1]]}
    updated, var_map = trainer.train_minibatch(feed, outputs=[z.output])

    ckpt_path = str(tmpdir / 'checkpoint.dat')
    trainer.save_checkpoint(ckpt_path)
    trainer.restore_from_checkpoint(ckpt_path)

    assert trainer.model.name == 'z'
    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
    assert isinstance(trainer.parameter_learners[0], Learner)
def test_rnn(device_id):
    # Exercises float16 recurrence on GPU two ways — a Recurrence(LSTM) layer
    # and optimized_rnnstack — checking that gradients can be computed
    # end-to-end without error (no value assertions).
    if device_id == -1:
        pytest.skip('Test only runs on GPU')
    batch_size = 8
    sequence_len = 100
    vocab_dim = 20
    embed_dim = 10
    hidden_dim = 7
    # Scalar token-id sequence, cast to fp16 to feed the fp16 model below.
    input = C.cast(C.sequence.input_variable(()), np.float16)
    with C.default_options(dtype=np.float16):
        embed = C.layers.Embedding(embed_dim)(C.one_hot(input, num_classes=vocab_dim, sparse_output=False))
        z = C.layers.Recurrence(C.layers.LSTM(hidden_dim))(embed)
    # Random integer token ids in [0, vocab_dim-1).
    feed = np.floor(np.random.rand(batch_size, sequence_len).astype(np.float32) * (vocab_dim - 1))
    z.grad(feed, wrt=z.parameters)
    num_layers = 2
    W = C.parameter((C.InferredDimension, embed_dim), init=C.glorot_uniform(), dtype=np.float16)
    with C.default_options(dtype=np.float16):
        z = C.optimized_rnnstack(embed, W, hidden_dim, num_layers)
    feed = np.floor(np.random.rand(batch_size, sequence_len).astype(np.float32) * (vocab_dim - 1))
    z.grad(feed, wrt=z.parameters)
def test_OptimizedRNNStack(bidirectional, num_layers, input_size, hidden_size, recurrent_op, tmpdir, device_id):
    # Exports an optimized_rnnstack model with deterministic weights and
    # verifies it round-trips through verify_one_input.
    if device_id == -1:
        pytest.skip('Test only runs on GPU')
    dev = cntk_device(device_id)
    from _cntk_py import constant_initializer

    # Encode the configuration in the model filename so each parameterization
    # gets its own artifact.
    model_filename = 'optimized_rnn_stack_' + ('bi' if bidirectional else 'uni') + \
        '_layers' + str(num_layers) + '_inp' + str(input_size) + '_hid' + str(hidden_size)

    W = C.parameter((C.InferredDimension, input_size), constant_initializer(0.1), device=dev)
    x = C.sequence.input_variable(shape=(input_size,))
    s = np.asarray(np.random.uniform(-1, 1, (5, input_size)), dtype=np.float32)
    f = C.optimized_rnnstack(x, W, hidden_size, num_layers,
                             bidirectional=bidirectional,
                             recurrent_op=recurrent_op, name='MyRnnStack')
    # Overwrite the packed weight blob with a deterministic ramp so the
    # exported model is reproducible.
    f.parameters[0].value = np.reshape(
        np.arange(np.prod(f.parameters[0].value.shape), dtype=np.float32),
        f.parameters[0].value.shape)
    verify_one_input(f, s, tmpdir, model_filename)
def create_fast_rcnn_predictor(conv_out, rois, fc_layers):
    """Attach Fast R-CNN classification and bbox-regression heads.

    Returns (cls_score, bbox_pred) built on the ROI-pooled FC features.
    """
    num_classes = globalvars['num_classes']

    roi_out = roipooling(conv_out, rois, cntk.MAX_POOLING,
                         (roi_dim, roi_dim), spatial_scale=1/16.0)
    fc_out = fc_layers(roi_out)

    # Classification head.
    # NOTE(review): assumes fc_layers emits 4096-dim features — confirm.
    W_pred = parameter(shape=(4096, num_classes), init=normal(scale=0.01),
                       name="cls_score.W")
    b_pred = parameter(shape=num_classes, init=0, name="cls_score.b")
    cls_score = plus(times(fc_out, W_pred), b_pred, name='cls_score')

    # Bounding-box regression head: 4 coordinates per class.
    W_regr = parameter(shape=(4096, num_classes * 4), init=normal(scale=0.001),
                       name="bbox_regr.W")
    b_regr = parameter(shape=num_classes * 4, init=0, name="bbox_regr.b")
    bbox_pred = plus(times(fc_out, W_regr), b_regr, name='bbox_regr')

    return cls_score, bbox_pred
def test_free_and_inferred_static_dimension():
    # Verifies that times() with a (FreeDimension, inferred) input resolves
    # the inferred dim from the parameter and re-binds the free dim per call.
    x = C.input_variable((C.FreeDimension, -1))
    w = C.parameter(init=np.asarray([[2, 5], [1, 3]], dtype=np.float32))
    t = C.times(x, w)

    # 1x2 input: free dimension resolves to 1.
    x_data = np.asarray([[0.5, 0.2]], np.float32)
    w_grad, t_val = t.grad({x: x_data}, wrt=[w], outputs=[t])
    assert np.array_equal(t_val, np.asarray([[[1.2, 3.1]]], dtype=np.float32))
    assert np.array_equal(w_grad, np.asarray([[0.5, .5], [.2, .2]], dtype=np.float32))

    # 2x2 input: free dimension re-resolves to 2 on the same graph.
    x_data = np.asarray([[0.5, 0.2], [0.1, .6]], np.float32)
    w_grad, t_val = t.grad({x: x_data}, wrt=[w], outputs=[t])
    assert np.allclose(t_val, np.asarray([[[1.2, 3.1], [0.8, 2.3]]], dtype=np.float32))
    assert np.array_equal(w_grad, np.asarray([[0.6, .6], [.8, .8]], dtype=np.float32))

    # Back to 1x2: the free dimension can shrink again.
    x_data = np.asarray([[0.5, 0.2]], np.float32)
    w_grad, t_val = t.grad({x: x_data}, wrt=[w], outputs=[t])
    assert np.array_equal(t_val, np.asarray([[[1.2, 3.1]]], dtype=np.float32))
    assert np.array_equal(w_grad, np.asarray([[0.5, .5], [.2, .2]], dtype=np.float32))

    # The inferred (contraction) dimension is now fixed at 2, so a 3-wide
    # input must be rejected.
    x_data = np.asarray([[0.5, 0.2, 0.9]], np.float32)
    with pytest.raises(ValueError):
        w_grad, t_val = t.grad({x: x_data}, wrt=[w], outputs=[t])
def test_learner_logging():
    """The writer must receive every scheduled lr/momentum value during training."""
    from cntk import Trainer
    from cntk.logging import ProgressPrinter
    from cntk import cross_entropy_with_softmax, classification_error

    # One-weight model z = a * w.
    features = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w = parameter(shape=(1,), init=1)
    z = features * w
    labels = C.input_variable(shape=(1,), name='b')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    writer = TestProgressWriter()
    lr_values = [0.3, 0.2, 0.1, 0]
    m_values = [0.6, 0.7, 0.8]
    learner = C.momentum_sgd(
        z.parameters,
        learning_rate_schedule(lr_values, UnitType.sample, 1),
        C.momentum_schedule(m_values, 1))
    trainer = Trainer(z, (ce, errs), [learner], writer)

    for _ in range(10):
        trainer.train_minibatch({features: [[2.]], labels: [[1.]]})

    assert len(writer.log_output) == len(lr_values + m_values)
    # Interleaved (lr, momentum) pairs, then the trailing lr value 0.
    expected = [v for pair in zip(lr_values, m_values) for v in pair] + [0]
    for got, want in zip(writer.log_output, expected):
        assert got == want
def test_times_2d_sparse_operand(device_id):
    # times() with a sparse one-hot operand, both as a sequence input and as a
    # plain (non-sequence) input. With an identity weight matrix the product
    # simply reproduces the one-hot rows.
    from .. import times
    dev = cntk_device(device_id)

    vocab_size = 6
    sample_shape = (2, vocab_size)
    input_sparse_indices = [[1, 3], [2, 4], [0, 2]]
    input_data = C.Value.one_hot(input_sparse_indices, sample_shape, device=dev)

    a = C.sequence.input(shape=sample_shape, is_sparse=True, needs_gradient=True, name='a')
    # Identity weights make the expected output equal to the selected rows.
    w_init = np.eye(vocab_size, dtype=np.float32)
    w = C.parameter(init=w_init, device=dev)
    a_dense = times(a, w)

    # TODO: Also test the results from grad
    grad = a_dense.grad({a : input_data}, [w, a], as_numpy=False, device=dev)
    res = a_dense.eval({a : input_data}, device=dev)
    assert np.array_equal(res, [[w_init[input_sparse_indices[0]]],
                                [w_init[input_sparse_indices[1]]],
                                [w_init[input_sparse_indices[2]]]])

    # Same product with a non-sequence sparse input; output loses the
    # per-sequence nesting level.
    a_no_sequence = C.input(shape=sample_shape, is_sparse=True, name='a')
    a_no_sequence_dense = times(a_no_sequence, w)
    res = a_no_sequence_dense.eval({a_no_sequence : input_data}, device=dev)
    assert np.array_equal(res, [w_init[input_sparse_indices[0]],
                                w_init[input_sparse_indices[1]],
                                w_init[input_sparse_indices[2]]])
def session(is_sparse):
    """Train a fixed schedule of minibatches and return the final weights.

    NOTE(review): `learner`, `checkpoint` and `tmpdir` come from the
    enclosing test scope.
    """
    x = C.input_variable((200,), is_sparse=is_sparse)
    w = C.parameter((200, 100))
    y = C.times(x, w)

    # Deterministic shuffle of 100 zeros and 100 ones.
    z = [0] * 100 + [1] * 100
    for i in range(200):
        j = (3 * i * i + 5 * i + 1) % 200  # just a random looking index
        z[i], z[j] = z[j], z[i]

    import scipy.sparse
    x11 = scipy.sparse.csr_matrix(np.array([1] * 200).astype('f'))
    x01 = scipy.sparse.csr_matrix(np.array(z).astype('f'))

    t = C.Trainer(y, y, learner(y.parameters))
    w.value = 0 * w.value
    for mb in (x11, x01, x01):
        t.train_minibatch({x: [mb]})
    if checkpoint:
        # Train past the checkpoint, then roll the trainer back to it.
        t.save_checkpoint(str(tmpdir.join('checkpoint')))
        for mb in (x11, x01, x01):
            t.train_minibatch({x: [mb]})
        t.restore_from_checkpoint(str(tmpdir.join('checkpoint')))
    for mb in (x01, x01, x11):
        t.train_minibatch({x: [mb]})
    return w.value
def __init__(self, n_in, n_out, init_lr, momentum):
    """Build the DNN sub-layers, loss and momentum-SGD trainer.

    Args:
        n_in: input feature dimension (coerced to int).
        n_out: output dimension (coerced to int).
        init_lr: per-sample learning rate.
        momentum: momentum value for momentum_sgd.
    """
    self.param1 = 512
    self.param2 = 256
    self.n_in = int(n_in)
    self.n_out = int(n_out)
    self.input = C.sequence.input_variable(shape=(self.n_in,))
    self.label = C.sequence.input_variable(shape=(self.n_out,))
    self.three_dnn = C.layers.Sequential([
        C.layers.Dense(self.param1, activation=C.tanh, name='dnn_three_1'),
        C.layers.Dense(self.param1, activation=C.tanh, name='dnn_three_2'),
        C.layers.Dense(self.param1, activation=C.tanh, name='dnn_three_3')])
    self.final_dnn = C.layers.Dense(self.n_out, name='dnn_final')
    self.dnn_1 = C.layers.Dense(8 * self.param2, bias=False, name='dnn_1')
    self.dnn_2 = C.layers.Dense(8 * self.param2, bias=False, name='dnn_2')
    self.dnn_3 = C.layers.Dense(8 * self.param2, bias=False, name='dnn_3')
    self.dnn_4 = C.layers.Dense(8 * self.param2, bias=False, name='dnn_4')
    # BUGFIX: `xrange` is Python 2 only and raises NameError on Python 3
    # (the rest of this file uses `range`).
    self.list_bias = [C.parameter(shape=(self.param2,), name='bias_' + str(i))
                      for i in range(16)]
    self.output = self.model(self.input)
    self.loss = loss_fun(self.output, self.label)
    # NOTE(review): the eval metric reuses the training loss — confirm intended.
    self.eval_err = loss_fun(self.output, self.label)
    self.lr_s = C.learning_rate_schedule(init_lr, C.UnitType.sample)
    self.mom_s = C.momentum_schedule(momentum)
    self.learner = C.momentum_sgd(self.output.parameters, lr=self.lr_s, momentum=self.mom_s)
    self.trainer = C.Trainer(self.output, (self.loss, self.eval_err), [self.learner])
def session(is_sparse):
    # Trains a fixed schedule of minibatches (identical for sparse and dense
    # variants) and returns the resulting weight matrix for comparison.
    # NOTE(review): `learner`, `checkpoint` and `tmpdir` come from the
    # enclosing test scope.
    x = C.input_variable((200, ), is_sparse=is_sparse)
    w = C.parameter((200, 100))
    y = C.times(x, w)
    # Deterministic shuffle of 100 zeros and 100 ones.
    z = [0] * 100 + [1] * 100
    for i in range(200):
        j = (3 * i * i + 5 * i + 1) % 200  # just a random looking index
        z[i], z[j] = z[j], z[i]
    import scipy.sparse
    x11 = scipy.sparse.csr_matrix(np.array([1] * 200).astype('f'))
    x01 = scipy.sparse.csr_matrix(np.array(z).astype('f'))
    t = C.Trainer(y, y, learner(y.parameters))
    w.value = 0 * w.value
    t.train_minibatch({x: [x11]})
    t.train_minibatch({x: [x01]})
    t.train_minibatch({x: [x01]})
    if checkpoint:
        # Train past the checkpoint, then roll the trainer back to it.
        t.save_checkpoint(str(tmpdir.join('checkpoint')))
        t.train_minibatch({x: [x11]})
        t.train_minibatch({x: [x01]})
        t.train_minibatch({x: [x01]})
        t.restore_from_checkpoint(str(tmpdir.join('checkpoint')))
    t.train_minibatch({x: [x01]})
    t.train_minibatch({x: [x01]})
    t.train_minibatch({x: [x11]})
    return w.value
def test_empty_minibatch():
    """train_minibatch with an empty argument dict must not crash."""
    scalar = input((1,), dtype=np.float32, name='tscalar')
    model = scalar + parameter(init=np.asarray([1]), dtype=np.float32)
    schedule = learning_rate_schedule(0.1, UnitType.sample)
    trainer = Trainer(model, (model, None), sgd(model.parameters, schedule))
    trainer.train_minibatch({})
def test_cntk_embed():
    """Cross-check a CNTK embedding parameter against a baseline via crosstalk.

    The baseline comes from TensorFlow when installed, otherwise from CNTK.
    """
    try:
        import tensorflow
        has_tensorflow = True
    # BUGFIX: narrowed from a bare `except:` — only a missing package should
    # trigger the CNTK fallback, not e.g. KeyboardInterrupt/SystemExit.
    except ImportError:
        has_tensorflow = False

    if has_tensorflow:
        tf_baseline_embed()
    else:
        cntk_baseline_embed()

    import cntk as C
    import cntk.contrib.crosstalk.crosstalk_cntk as crct
    ci = crct.instance
    ci.set_workdir(workdir)

    embed = C.parameter((input_dim, emb_dim,))
    ci.watch(embed, 'embed', var_type=cstk.EmbedAttr,
             attr=cstk.EmbedAttr(dict=dict2, input_dim=input_dim))

    # Load the baseline embedding and verify the values match.
    ci.assign('embed', load=True)
    assert np.isclose(emb2, embed.value).all()

    # test assign with value
    ci.assign('embed', value={'a': emb1[0], 'b': emb1[1], 'c': emb1[2]})
    ci.reset()
def OptimizedRnnStack(hidden_dim, num_layers=1, recurrent_op='gru', bidirectional=False, use_cudnn=True, name=''):
    """Factory for a recurrent layer.

    With use_cudnn=True returns a cuDNN optimized_rnnstack; otherwise a
    bidirectional GRU built from Recurrence layers (the non-cudnn branch
    ignores num_layers/recurrent_op/bidirectional).
    """
    if use_cudnn:
        W = C.parameter(_INFERRED + (hidden_dim,), init=C.glorot_uniform())

        def func(x):
            return C.optimized_rnnstack(x, W, hidden_dim, num_layers,
                                        bidirectional,
                                        recurrent_op=recurrent_op, name=name)
    else:
        def func(x):
            forward = C.layers.Recurrence(C.layers.GRU(hidden_dim))(x)
            backward = C.layers.Recurrence(C.layers.GRU(hidden_dim),
                                           go_backwards=True)(x)
            return C.splice(forward, backward, name=name)
    return func
def word_glove(self):
    # Builds (and caches to disk as 'glove300.model') the word-embedding
    # function: fixed GloVe vectors for in-glove-vocab words, a trainable
    # matrix for the remaining vocab entries.
    # load glove
    if os.path.isfile('glove300.model'):
        print('[BUILD] load glove300.model')
        return C.load_model('glove300.model')
    npglove = np.zeros((self.wg_dim, self.word_emb_dim), dtype=np.float32)
    with open(os.path.join(self.abs_path, self.word_embed_file), encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            word = parts[0].lower()
            # Only fill rows for words whose index lies in the glove range.
            if self.vocab.get(word, self.wg_dim) < self.wg_dim:
                npglove[self.vocab[word], :] = np.asarray(
                    [float(p) for p in parts[-300:]])
    glove = C.constant(npglove)  # frozen glove embedding matrix
    nonglove = C.parameter(shape=(len(self.vocab) - self.wg_dim, self.word_emb_dim),
                           init=C.glorot_uniform(), name='TrainableE')

    @C.Function
    def func(wg, wn):
        # wg/wn: one-hot over the glove / non-glove vocab respectively.
        return C.times(wg, glove) + C.times(wn, nonglove)
    func.save('glove300.model')
    print('[BUILD] save glove300.model')
    return func
def test_scalar_input():
    """A (1,)-shaped input accepts a (batch, 1) numpy array as a minibatch."""
    scalar = C.input_variable((1,), dtype=np.float32, name='tscalar')
    model = scalar + parameter(init=np.asarray([1]), dtype=np.float32)
    schedule = C.learning_rate_schedule(0.1, C.UnitType.sample)
    trainer = C.Trainer(model, (model, None), C.sgd(model.parameters, schedule))
    trainer.train_minibatch({scalar: np.zeros((2, 1), dtype=np.float32)})
def create_model(self):
    """Build a sparse-input embedding model: z = reduce_sum(input @ p)."""
    self.input_dim = 1000
    self.embed_dim = 30
    sparse_input = C.input_variable((self.input_dim,), is_sparse=True)
    self.p = C.parameter(shape=(self.input_dim, self.embed_dim), init=1)
    projection = C.times(sparse_input, self.p)
    self.z = C.reduce_sum(projection)
def test_data_type_inference():
    """A parameter created with unknown dtype inherits float64 from its operand."""
    x_float = C.input_variable((1,), dtype=np.float64)
    param1 = C.parameter((C.InferredDimension, 1),
                         init=C.glorot_uniform(),
                         dtype=C.cntk_py.DataType_Unknown)
    # Before being combined with anything, the dtype stays unknown.
    assert param1.get_data_type() == C.cntk_py.DataType_Unknown
    # Combining with a float64 operand triggers dtype inference.
    x_times_param1 = C.times(x_float, param1)
    assert param1.dtype == np.float64
def test_trainer(tmpdir, no_eval_function):
    # Builds a tiny model, trains one minibatch, then round-trips a trainer
    # checkpoint (variant without external state).
    in1 = input_variable(shape=(1,))
    labels = input_variable(shape=(1,))
    p = parameter(shape=(2,), init=10)
    z = plus(in1, reduce_sum(p), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    if no_eval_function:
        errs = None
    else:
        errs = classification_error(z, labels)
    momentum_time_constant = momentum_as_time_constant_schedule(1100)
    lr_per_sample = learning_rate_schedule(0.007, UnitType.sample)
    trainer = Trainer(z, (ce, errs),
                      [momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)])
    in1_value = [[1], [2]]
    label_value = [[0], [1]]
    arguments = {in1: in1_value, labels: label_value}
    z_output = z.output
    updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output])
    # `p` is re-bound to the checkpoint path here (shadows the parameter above).
    p = str(tmpdir / 'checkpoint.dat')
    trainer.save_checkpoint(p)
    trainer.restore_from_checkpoint(p)
    assert trainer.model.name == 'z'
    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
    assert isinstance(trainer.parameter_learners[0], Learner)
def test_usermbsource_training(tmpdir):
    # End-to-end training from a user-defined minibatch source; the session
    # must stop after exactly max_samples samples.
    input_dim = 1000
    num_output_classes = 5
    mbs = MyDataSource(input_dim, num_output_classes)
    from cntk import sequence, parameter, plus, cross_entropy_with_softmax, \
        classification_error, learning_rate_schedule, sgd, Trainer, \
        training_session, times, UnitType, input

    feature = sequence.input(shape=(input_dim,))
    label = input(shape=(num_output_classes,))
    p = parameter(shape=(input_dim, num_output_classes), init=10)
    z = times(sequence.reduce_sum(feature), p, name='z')
    ce = cross_entropy_with_softmax(z, label)
    errs = classification_error(z, label)

    # Learning-rate schedule steps down per epoch of the schedule.
    lr_per_sample = learning_rate_schedule([0.3, 0.2, 0.1, 0.0], UnitType.sample)
    learner = sgd(z.parameters, lr_per_sample)
    trainer = Trainer(z, (ce, errs), [learner])
    # Map model inputs to the user source's stream infos.
    input_map = {feature: mbs.fsi, label: mbs.lsi}
    session = training_session(trainer=trainer, mb_source=mbs,
                               model_inputs_to_streams=input_map,
                               mb_size=4, max_samples=20)
    session.train()
    assert trainer.total_number_of_samples_seen == 20
def gated_attention_gru_layer(self, context, query):
    # Gated attention over the query: at each step the GRU input is the
    # current context vector spliced with its query attention, scaled by a
    # learned sigmoid gate, and the whole thing is wrapped in as_block so the
    # placeholders bind to (context, query).
    q_processed = C.placeholder(shape=(2*self.hidden_dim,))
    c_processed = C.placeholder(shape=(2*self.hidden_dim,))

    # gate weight
    Wg = C.parameter(shape=(4*self.hidden_dim, 4*self.hidden_dim))
    att_gru = C.layers.GRU(2*self.hidden_dim)
    attention_model = C.layers.AttentionModel(self.hidden_dim, name='attention_model')

    @C.Function
    def out_func0(att_input, enc_input):
        enc_input2 = enc_input

        @C.Function
        def gru_with_attentioin(dh, x):  # NOTE(review): typo'd name kept — local only
            c_att = attention_model(att_input, x)
            x = C.splice(x, c_att)
            # Gate the spliced input before feeding the GRU step.
            x = C.element_times(x, C.sigmoid(C.times(x, Wg)))
            return att_gru(dh, x)
        att_context = Recurrence(gru_with_attentioin)(enc_input2)
        return att_context

    att_context = out_func0(q_processed, c_processed)
    return C.as_block(att_context,
                      [(c_processed, context), (q_processed, query)],
                      'gated_attention_gru_layer', 'gated_attention_gru_layer')
def create_trainer(use_sparse, device):
    # Builds a one-step recurrent softmax model and an SGD trainer over it.
    # NOTE(review): relies on enclosing-scope names (input_shape, label_shape,
    # w_init_i, w_init_h, dev) defined by the surrounding test.
    a = C.sequence.input(shape=input_shape, is_sparse=use_sparse, name='input')
    w_i = C.parameter(init=w_init_i, device=dev)
    a_projection = times(a, w_i)

    p_o = C.placeholder()
    h = C.sequence.past_value(p_o)
    w_h = C.parameter(init=w_init_h, device=dev)
    h_projection = times(h, w_h)

    z = a_projection + h_projection
    z = z.replace_placeholder(z)  # close the recurrence loop: p_o := z
    z = reshape(z, label_shape)

    l = C.sequence.input(shape=label_shape, is_sparse=use_sparse, name='label')
    loss = cross_entropy_with_softmax(z, l, axis=-1)

    trainer = C.Trainer(z, (loss, None),
                        C.sgd(z.parameters, lr=C.learning_rate_schedule(0.7, C.UnitType.sample)))
    return (a, l, w_i, w_h, trainer)
def create_fast_rcnn_predictor(conv_out, rois, fc_layers, cfg):
    # Builds the two Fast R-CNN output heads on top of the shared FC features:
    # per-class scores and per-class (4-coordinate) bbox regression targets.
    # NOTE(review): assumes fc_layers emits 4096-dim features — confirm.
    # RCNN
    roi_out = roipooling(conv_out, rois, cntk.MAX_POOLING,
                         (cfg["MODEL"].ROI_DIM, cfg["MODEL"].ROI_DIM),
                         spatial_scale=1 / 16.0)
    fc_out = fc_layers(roi_out)

    # prediction head
    W_pred = parameter(shape=(4096, cfg["DATA"].NUM_CLASSES), init=normal(scale=0.01), name="cls_score.W")
    b_pred = parameter(shape=cfg["DATA"].NUM_CLASSES, init=0, name="cls_score.b")
    cls_score = plus(times(fc_out, W_pred), b_pred, name='cls_score')

    # regression head
    W_regr = parameter(shape=(4096, cfg["DATA"].NUM_CLASSES * 4), init=normal(scale=0.001), name="bbox_regr.W")
    b_regr = parameter(shape=cfg["DATA"].NUM_CLASSES * 4, init=0, name="bbox_regr.b")
    bbox_pred = plus(times(fc_out, W_regr), b_regr, name='bbox_regr')

    return cls_score, bbox_pred
def GenSimpleScan():
    """Generate and save a simple scan (recurrence) model with reference I/O."""
    feature = C.sequence.input_variable((128,), np.float32)
    param = C.parameter(shape=(1,), dtype=np.float32)
    # Running sum over the sequence, offset by a learnable scalar each step.
    accumulated = C.layers.Recurrence(lambda h, x: x + h + param)(feature)
    model = C.sequence.reduce_sum(accumulated)

    data_feature = np.random.rand(1, 64, 128).astype(np.float32)
    data_output = np.asarray(model.eval(data_feature), dtype=np.float32)
    Save('test_SimpleScan', model, data_feature, data_output)
def test_0d_1d_parameter_set_value():
    """Parameter.value accepts an NDArrayView gradient for 0-d and 1-d shapes."""
    x = C.input_variable(2)
    data = np.asarray([1, 2], dtype=np.float32)

    # 0-d (scalar) parameter: gradient of x + w sums over both elements.
    scalar_param = C.parameter(())
    scalar_op = x + scalar_param
    grad = scalar_op.grad({x: data}, wrt=[scalar_param], as_numpy=False)
    scalar_param.value = grad.data
    assert scalar_param.value == 2.

    # 1-d parameter: gradient is one per element.
    vec_param = C.parameter(shape=2)
    vec_op = x + vec_param
    grad = vec_op.grad({x: data}, wrt=[vec_param], as_numpy=False)
    vec_param.value = grad.data
    assert np.array_equal(vec_param.value, [1., 1.])