def test_slice_with_inferred_static_axis():
    """Splice a constant with an input that has inferred static axes.

    The constant padding is spliced along axis 0 once in front of and once
    behind the input; in both cases the resulting shape must be (-1, -1, 3).
    """
    inferred = C.InferredDimension
    x = C.input_variable(shape=(inferred, inferred, 3))
    padding_shape = (3, inferred, 3)
    expected_shape = (-1, -1, 3)

    # Padding in front of the input.
    padded_front = C.splice(C.constant(value=0, shape=padding_shape), x, axis=0)
    assert padded_front.shape == expected_shape

    # Padding behind the input.
    padded_back = C.splice(x, C.constant(value=0, shape=padding_shape), axis=0)
    assert padded_back.shape == expected_shape
def test_batchnorm(device_id):
    """Compute batch-normalization gradients for a float16 input with float parameters.

    The test is skipped when ``device_id`` is -1. It contains no assertion:
    it passes as long as the ``grad`` call completes.
    """
    if device_id == -1:
        pytest.skip('Test only runs on GPU')

    shape = (3, )
    stats_dtype = 'float'

    i = C.input_variable(shape, dtype='float16')
    scale = C.parameter(shape, init=1, dtype=stats_dtype)
    bias = C.parameter(shape, init=2, dtype=stats_dtype)
    run_mean = C.constant(3, shape=shape, dtype=stats_dtype)
    run_variance = C.constant(4, shape=shape, dtype=stats_dtype)
    run_count = C.constant(0, shape=(), dtype=stats_dtype)

    bn = C.batch_normalization(
        i, scale, bias, run_mean, run_variance,
        running_count=run_count,
        spatial=False,
        normalization_time_constant=5000,
        blend_time_constant=0,
        epsilon=0.00001,
        use_cudnn_engine=True,
        disable_regularization=True)

    data = AA([[1, 2, 3]]).astype(np.float16)
    # Gradient with respect to the float parameters, fed with float16 data.
    bn.grad(data, wrt=[scale, bias])
def BatchNormalizationTester(map_rank=1, init_scale=1, init_bias=0, normalization_time_constant=5000, blend_time_constant=0, epsilon=0.00001, use_cntk_engine=True, norm_shape=(), init_mean=None, init_variance=None, name=''):
    """Build a batch normalization layer for tests with settable running statistics.

    The scale/bias parameters and the running mean, variance and count are
    created once here and captured by the returned function. ``name`` is
    accepted but not used by this body.

    Returns:
        A one-argument callable that applies ``batch_normalization`` to its
        input using the captured tensors.
    """
    # Learnable parameters bound to the returned function.
    scale = parameter(shape=norm_shape, init=init_scale, name='scale')
    bias = parameter(shape=norm_shape, init=init_bias, name='bias')

    # Running statistics, initialised from the caller-supplied values.
    run_mean = constant(shape=norm_shape, value=init_mean, name='aggregate_mean')
    run_variance = constant(shape=norm_shape, value=init_variance, name='aggregate_variance')
    run_count = constant(0, shape=(), name='aggregate_count')

    def batch_normalize(x):
        return batch_normalization(
            x, scale, bias, run_mean, run_variance,
            running_count=run_count,
            spatial=(map_rank == 1),
            normalization_time_constant=normalization_time_constant,
            blend_time_constant=blend_time_constant,
            epsilon=epsilon,
            use_cudnn_engine=(not use_cntk_engine))

    return batch_normalize
def test_output_subset_evaluation(device_id):
    """Evaluate only one output of a combined function.

    ``op2`` depends on a parameter placed on a different device than the
    compute device, so the forward pass only succeeds if the requested subset
    (``op1``) is evaluated without touching ``op2``.
    """
    try:
        gpu_device = C.gpu(0)
    except ValueError:
        pytest.skip('Test only runs when GPU available')

    device = cntk_device(device_id)

    def one():
        return C.constant(value=1, shape=(1), device=device)

    x1 = C.input_variable(shape=())
    op1 = one() + (one() + x1)

    x2 = C.input_variable(shape=(1))

    # Deliberately locate the parameter on a different device than the
    # actual compute target device, so that any attempt to use this
    # parameter results in an error.
    parameter_device = gpu_device if device.type() == 0 else C.cpu()
    p = C.parameter(shape=(1), init=C.glorot_uniform(), device=parameter_device)
    op2 = (x2 - C.constant(value=10, shape=(1), device=device)) - p

    combined = C.combine([op1, op2])

    _, result = combined.forward({x1: np.asarray([1, 2, 3])}, [op1], device=device)
    expected = np.asarray([[3], [4], [5]])
    assert np.array_equal(result[op1], expected)
def test_conv_with_freedim_model(tmpdir):
    """Round-trip a conv/relu/max-pool model with free spatial dimensions through ONNX.

    The model is saved as ONNX, reloaded, and compared with the original on a
    random 3x32x32 image. The reloaded model is then saved again as both ONNX
    and CNTKv2.
    """
    def conv_relu_pool(kernel_shape, operand):
        # Kernel values are 0, 1, 2, ... laid out in kernel_shape.
        kernel_values = np.arange(np.prod(kernel_shape), dtype=np.float32).reshape(kernel_shape)
        conv = C.convolution(C.constant(value=kernel_values), operand, auto_padding=(False, True, True))
        return C.pooling(C.relu(conv), C.MAX_POOLING, (2, 2), (2, 2))

    img_shape = (3, 32, 32)
    img = np.asarray(np.random.uniform(-1, 1, img_shape), dtype=np.float32)

    x = C.input_variable((3, C.FreeDimension, C.FreeDimension))
    first_stage = conv_relu_pool((32, 3, 5, 5), x)
    root_node = conv_relu_pool((64, 32, 3, 3), first_stage)

    out_dir = str(tmpdir)
    filename = os.path.join(out_dir, R'conv_with_freedim.onnx')
    root_node.save(filename, format=C.ModelFormat.ONNX)

    loaded_node = C.Function.load(filename, format=C.ModelFormat.ONNX)
    assert root_node.shape == loaded_node.shape

    x_ = loaded_node.arguments[0]
    assert np.allclose(loaded_node.eval({x_: img}), root_node.eval({x: img}))

    # Additional check: the reloaded model can be saved again in both formats.
    resave_targets = (
        (R'conv_with_freedim2.onnx', C.ModelFormat.ONNX),
        (R'conv_with_freedim2.cntkmodel', C.ModelFormat.CNTKv2),
    )
    for out_name, model_format in resave_targets:
        loaded_node.save(os.path.join(out_dir, out_name), format=model_format)
def test_output_subset_evaluation(device_id):
    """Check that requesting a subset of outputs evaluates only that subset.

    The second branch of the combined function uses a parameter stored on a
    device other than the compute device; forwarding only ``op1`` must
    therefore work without that parameter being used.
    """
    try:
        gpu_device = C.gpu(0)
    except ValueError:
        pytest.skip('Test only runs when GPU available')

    device = cntk_device(device_id)

    # First branch: 1 + (1 + x1), with both constants on the compute device.
    x1 = C.input_variable(shape=())
    outer_one = C.constant(value=1, shape=(1), device=device)
    inner_sum = C.constant(value=1, shape=(1), device=device) + x1
    op1 = outer_one + inner_sum

    # Second branch: (x2 - 10) - p, where p lives on the "wrong" device.
    x2 = C.input_variable(shape=(1))

    # Deliberately locate the parameter on a different device than the
    # actual compute target device, so that using it results in an error.
    if device.type() == 0:
        parameter_device = gpu_device
    else:
        parameter_device = C.cpu()
    p = C.parameter(shape=(1), init=C.glorot_uniform(), device=parameter_device)
    ten = C.constant(value=10, shape=(1), device=device)
    op2 = (x2 - ten) - p

    op = C.combine([op1, op2])

    feed = {x1: np.asarray([1, 2, 3])}
    _, result = op.forward(feed, [op1], device=device)
    assert np.array_equal(result[op1], np.asarray([[3], [4], [5]]))
def test_op_batch_normalization_spatial_shape_inference(channels, input_size, device_id, precision):
    """Check that spatial batch normalization infers the parameter shapes.

    Scale, bias and the running mean/variance are created with an inferred
    dimension; after the batch normalization node is built, each of them must
    have shape ``(channels,)``.
    """
    dtype = PRECISION_TO_TYPE[precision]
    dev = cntk_device(device_id)
    placement = dict(dtype=dtype, device=dev)

    spatial = True
    epsilon = 0.01

    input_shape = (channels, input_size, input_size)
    param_shape = (C.InferredDimension,)

    i = C.input_variable(input_shape, dtype=dtype)
    scale = C.parameter(param_shape, init=1, **placement)
    bias = C.parameter(param_shape, init=2, **placement)
    run_mean = C.constant(3, shape=param_shape, **placement)
    run_var = C.constant(4, shape=param_shape, **placement)
    run_count = C.constant(2, shape=(), **placement)

    # The node itself is not evaluated; constructing it triggers the shape
    # inference that the assertions below check.
    C.batch_normalization(i, scale, bias, run_mean, run_var, spatial,
                          normalization_time_constant=-1, epsilon=epsilon,
                          running_count=run_count)

    for param in (scale, bias, run_mean, run_var):
        assert param.shape == (channels,)
def vggblock(x, arrays, layer_map, name):
    """Apply a constant-weight convolution, bias and ReLU to ``x``.

    Args:
        x: Input operand of the convolution.
        arrays: Sequence whose first element is the convolution kernel and
            whose second element is the bias.
        layer_map: Mapping that receives the resulting node under ``name``.
        name: Key under which the output is stored in ``layer_map``.

    Returns:
        The ReLU output node.
    """
    kernel = C.constant(value=arrays[0])
    # The bias is reshaped to (-1, 1, 1) before being added to the
    # convolution output.
    bias = C.constant(value=np.reshape(arrays[1], (-1, 1, 1)))

    conv = C.convolution(kernel, x, auto_padding=[False, True, True])
    activated = C.relu(conv + bias)

    layer_map[name] = activated
    return activated
def _test_eval_plus_two_constants():
    """Evaluate the sum of two constant vectors and compare with the expected values."""
    TOLERANCE_ABSOLUTE = 1E-06

    lhs = cntk.constant([1., 2., 3., 4.])
    rhs = cntk.constant([1., 1., 0., 0.])
    result = cntk.eval(cntk.plus(lhs, rhs))

    expected = np.asarray([2., 3., 3., 4.])
    assert np.allclose(result, expected, atol=TOLERANCE_ABSOLUTE)
def test_floor_division():
    """Compare ``floor_division`` on constants against Python's ``//`` operator."""
    numerators = [-3, 1, 2, 3, 4, 5.2]
    denominators = [2, 2, 2, 2, 2, 2]

    # Reference values computed by Python: [-2, 0, 1, 1, 2, 2]
    desired = [n // d for n, d in zip(numerators, denominators)]

    quotient = floor_division(C.constant(numerators), C.constant(denominators))
    result = quotient.eval().tolist()

    assert result == desired
def test_gather_op_with_axis(device_id, precision):
    """Gather columns 0 and 2 of a 3x3 constant along axis 1 and check the result.

    Args:
        device_id: Identifier passed to ``cntk_device`` to select the
            evaluation device.
        precision: Key into ``PRECISION_TO_TYPE`` selecting the numpy dtype
            used for the data, the indices and the expected output.
    """
    dtype = PRECISION_TO_TYPE[precision]

    data = np.array([
        [1.0, 1.2, 1.9],
        [2.3, 3.4, 3.9],
        [4.5, 5.7, 5.9],
    ]).astype(dtype)

    # A single cast is enough; casting to the same dtype twice is a no-op.
    indices = np.array([0, 2]).astype(dtype)

    output = np.array([
        [1.0, 1.9],
        [2.3, 3.9],
        [4.5, 5.9],
    ]).astype(dtype)

    x = C.constant(data)
    i = C.constant(indices)
    y = C.gather(x, i, axis=1)
    z = y.eval({}, device=cntk_device(device_id))

    assert np.allclose(output, z)
def test_remainder():
    """Compare ``remainder`` on constants against Python's ``%`` operator."""
    dividends = [-3, 1, 2, 3, 4, 3, 5.123]
    divisors = [2, 2, 2, 2, 2, -2, -1.234]

    # Reference values computed by Python: [1, 1, 0, 1, 0, -1, ...]
    desired = [n % d for n, d in zip(dividends, divisors)]

    result = remainder(C.constant(dividends), C.constant(divisors)).eval().tolist()

    # Approximate comparison, since the operands include non-integer floats.
    assert pytest.approx(result) == desired
def total_variation_loss(x):
    """Return the average of the mean squared differences along the last two axes of ``x``.

    A leading axis of size 1 is added to ``x``, which is then convolved
    (without padding) with the two-tap kernel [-1, 1] laid out along the last
    axis and along the second-to-last axis. The result is
    ``0.5 * (mean(dv**2) + mean(dh**2))``.
    """
    expanded = C.reshape(x, (1,) + x.shape)

    diff_taps = np.array([-1, 1], dtype=np.float32)
    horizontal_kernel = C.constant(value=diff_taps.reshape(1, 1, 1, 1, 2))
    vertical_kernel = C.constant(value=diff_taps.reshape(1, 1, 1, 2, 1))

    dh = C.convolution(horizontal_kernel, expanded, auto_padding=[False])
    dv = C.convolution(vertical_kernel, expanded, auto_padding=[False])

    vertical_energy = C.reduce_mean(C.square(dv))
    horizontal_energy = C.reduce_mean(C.square(dh))
    return 0.5 * (vertical_energy + horizontal_energy)
def _load_proj(self):
    """Load the 'CNN_proj' weight and bias from ``self.weight_file`` into ``self.proj``.

    ``self.proj`` becomes a CNTK function computing
    ``relu(times(x, W_proj) + b_proj)`` with both tensors held as constants.
    """
    with h5py.File(self.weight_file, 'r') as fin:
        # `[...]` reads the full dataset, so the values remain available
        # after the file is closed.
        weight = fin['CNN_proj']['W_proj'][...]
        bias = fin['CNN_proj']['b_proj'][...]

    W_proj = C.constant(weight)
    b_proj = C.constant(bias)

    @C.Function
    def dense(x):
        projected = C.times(x, W_proj) + b_proj
        return C.relu(projected)

    self.proj = dense
def __init__(self):
    """Create the embeddings, recurrent cells, weights, constants and inputs of the model.

    All sizes come from ``Config``. The embedding tables, the standalone
    weights and the parameters of the three recurrent cells are collected in
    ``self.Parameters``.
    """
    emb_size = Config.EmbeddingSize
    src_hidden = Config.SrcHiddenSize
    trg_hidden = Config.TrgHiddenSize
    src_vocab = Config.SrcVocabSize
    trg_vocab = Config.TrgVocabSize
    batch = Config.BatchSize

    def weight(shape):
        # Each weight gets its own freshly created initializer.
        return C.parameter(shape=shape, init=Config.defaultInit())

    # Embedding layers for the source and the target side.
    self.EmbSrc = C.layers.Embedding(emb_size, init=Config.defaultInit())
    self.EmbTrg = C.layers.Embedding(emb_size, init=Config.defaultInit())

    # Two encoder cells and one decoder cell.
    self.EncoderL2R = RNN.GRUN(emb_size, src_hidden)
    self.EncoderR2L = RNN.GRUN(emb_size, src_hidden)
    self.Decoder = RNN.GRUN(emb_size + src_hidden * 2, trg_hidden)

    # Standalone weights, created in the same order as before.
    self.Wt = weight((trg_hidden + emb_size, trg_vocab))
    self.Wtb = weight((trg_vocab))
    self.WI = weight((src_hidden, trg_hidden))
    self.WIb = weight((trg_hidden))
    self.Was = weight((src_hidden * 2, trg_hidden))
    self.Wat = weight((trg_hidden, trg_hidden))
    self.Wav = weight((trg_hidden, 1))

    # Zero-valued constants.
    self.firstHidden = C.constant(0, shape=(batch, src_hidden))
    self.initTrgEmb = C.constant(0, shape=(1, batch, emb_size))

    # Sparse token inputs and dense mask inputs.
    self.inputMatrixSrc = C.input_variable(
        shape=(Config.SrcMaxLength * batch, src_vocab), is_sparse=True)
    self.inputMatrixTrg = C.input_variable(
        shape=(Config.TrgMaxLength * batch, trg_vocab), is_sparse=True)
    self.maskMatrixSrc = C.input_variable(shape=(Config.SrcMaxLength, batch))
    self.maskMatrixTrg = C.input_variable(shape=(Config.TrgMaxLength, batch))

    self.Parameters = [
        self.EmbSrc.E, self.EmbTrg.E,
        self.Wt, self.Wtb,
        self.WI, self.WIb,
        self.Was, self.Wat, self.Wav,
    ]
    for cell in (self.EncoderL2R, self.EncoderR2L, self.Decoder):
        self.Parameters.extend(cell.Parameters)
def __init__(self, loc, scale_diag):
    """Store a float32 location vector and diagonal scale matrix and build the pdf nodes.

    Args:
        loc: Array-like location; its first dimension sets the size of the
            scale matrix.
        scale_diag: Array-like values multiplied with an identity matrix to
            form the scale matrix.
    """
    loc_array = np.array(loc)
    scale_matrix = np.array(scale_diag) * np.eye(loc_array.shape[0])

    self.loc = loc_array.astype(np.float32)
    self.scale = scale_matrix.astype(np.float32)
    self.shape = self.loc.shape

    # Each node gets its own pair of constants.
    self.mvn_pdf = C.mvn_pdf(
        C.constant(self.loc, name='loc'),
        C.constant(self.scale, name='scale'))
    self.mvn_log_prob = C.mvn_log_prob(
        C.constant(self.loc, name='loc'),
        C.constant(self.scale, name='scale'))
def output_layer(self, query, match_context):
    """Build the output block producing start and end predictions.

    The graph is built on two placeholders of shape ``(2*self.hidden_dim,)``
    which are bound to ``query`` and ``match_context`` by ``C.as_block``.

    Returns:
        A block named 'output_layer' combining ``start_logits`` and
        ``end_logits`` (both produced by ``seq_hardmax``).
    """
    q_processed = C.placeholder(shape=(2*self.hidden_dim,))
    mat_context = C.placeholder(shape=(2*self.hidden_dim,))

    # Output layer: pool the question, attend over the context, update the
    # state with a GRU and attend a second time.
    r_q = question_pooling(q_processed, 2*self.hidden_dim) #shape n*(2*self.hidden_dim)
    p1_logits = attention_weight(mat_context, r_q, 2*self.hidden_dim)
    attention_pool = C.sequence.reduce_sum(p1_logits * mat_context)
    state = C.layers.GRU(2*self.hidden_dim)(attention_pool, r_q)
    p2_logits = attention_weight(mat_context, state, 2*self.hidden_dim)

    @C.Function
    def start_ave_point(p1_logits, p2_logits, point):
        @C.Function
        def start_ave(last, now):
            # Adding and subtracting `last` leaves the value unchanged while
            # making the expression depend on `last`.
            now = now + last - last
            # NOTE(review): `point` is assigned on the next line, which makes
            # it a local name of start_ave; reading it here looks like it
            # would raise UnboundLocalError when this body runs — confirm.
            new_start = now * C.sequence.gather(p2_logits, point)
            point = C.sequence.future_value(point)
            return new_start
        start_logits_ave = C.layers.Recurrence(start_ave)(p1_logits)
        return start_logits_ave

    # Forward pass: is_first run through two stacked Recurrence(plus) layers,
    # then compared against the constant 16.
    point = C.sequence.is_first(p1_logits)
    point = C.layers.Sequential([For(range(2), lambda: C.layers.Recurrence(C.plus))])(point)
    point = C.greater(C.constant(16), point)
    start_logits_ave = start_ave_point(p1_logits, p2_logits, point)

    @C.Function
    def end_ave_point(p1_logits, p2_logits, point):
        @C.Function
        def end_ave(last, now):
            now = now + last - last
            # NOTE(review): same pattern as in start_ave — `point` is read
            # before its local assignment on the next line; confirm.
            new_end = now * C.sequence.gather(p2_logits, point)
            point = C.sequence.past_value(point)
            return new_end
        end_logits_ave = C.layers.Recurrence(end_ave, go_backwards=True)(p2_logits)
        return end_logits_ave

    # Backward counterpart: is_last with backward recurrences.
    point = C.sequence.is_last(p1_logits)
    point = C.layers.Sequential([For(range(2), lambda: C.layers.Recurrence(C.plus, go_backwards=True))])(point)
    point = C.greater(C.constant(16),point)
    end_logits_ave = end_ave_point(p1_logits, p2_logits, point)

    start_logits = seq_hardmax(start_logits_ave)
    end_logits = seq_hardmax(end_logits_ave)
    # The bare string below is a disabled alternative that applies
    # seq_hardmax to the raw logits instead of the averaged ones.
    ''' start_logits = seq_hardmax(p1_logits) end_logits = seq_hardmax(p2_logits) '''
    return C.as_block(
        C.combine([start_logits, end_logits]),
        [(q_processed, query), (mat_context, match_context)],
        'output_layer',
        'output_layer')
def var(array,W=_W,B=None,square=0,sqrt=0,V=False,sizz=0):
    """Reduce the (optionally squared) deviation of ``array*W`` from its windowed mean.

    Args:
        array: Operand multiplied element-wise with ``W``.
        W: Weights; defaults to the module-level ``_W``.
        B: Optional bias. When None, a zero constant of shape ``W.shape[0:2]``
            is used. It is reshaped with trailing singleton axes up to the
            rank of ``W``.
        square: When truthy, the deviation from the mean is squared.
        sqrt: When truthy, the square root of the reduced result is taken.
        V: Verbosity flag; truthy values print intermediate results, and
            ``V == 2`` additionally prints ``i`` and ``di``.
        sizz: When 1, the mean divides by the sum of ``W`` over its last two
            axes; otherwise by the product of ``W``'s last two dimensions.

    Returns:
        The reduced result, reshaped to its first four dimensions with axes 3
        and 1 swapped. Its shape is asserted to be
        ``(array.shape[0], W.shape[0], array.shape[1], array.shape[2])``.
    """
    #W=tf.transpose(W, [0,2,3,1])
    arrs=array.shape
    ashp=W.shape
    sb=(W.shape[1],1,1)
    WV=W.shape[-2:]
    xi=(-2,-1)
    x2=(-2,-1,-3)
    if V:
        print(W.eval())
        # NOTE(review): assumed to be guarded by V like the print above — confirm.
        print(arrs,ashp)
    mul=(array*W)
    if V:
        print('Wsamp',W[-1,-1].eval())
        # NOTE(review): assumed to be guarded by V like the print above — confirm.
        print('array*w',(mul.eval())[0,-1])
    size=C.reduce_sum(W,axis=xi)#shape=(outputs, channel)
    if V:
        print("sizesamp",size.shape,size.eval())
    if B is None:
        B=C.constant(0,shape=W.shape[0:2],dtype=np.float32)#channel
    # NOTE(review): assumed to run for caller-supplied B as well — confirm.
    B=C.reshape(B,(*B.shape,*[1 for _ in range(len(ashp)-len(B.shape))]))
    if sizz==1:
        mean=C.reduce_sum(mul,axis=xi)/size
    else:
        mean=C.reduce_sum(mul,axis=xi)/C.constant(value=WV[0]*WV[1],shape=sb,dtype=np.float32)
    if V:
        print("meansamp",mean.eval()[0,-1])
    if square:
        i=(C.square(mul-mean)+B)
    else:
        i=(((mul)-mean)+B)
    # di is only consumed by the V==2 debug print below.
    di=i/size
    if V==2:
        print("i",i.eval(),"i")
        print("di",di.eval(),"di")
    if V:
        print('isamp',i.shape,i.eval()[-1,-1,])
    # NOTE(review): B is already part of i, so it is added a second time
    # here — confirm this is intended.
    out=C.reduce_sum(i+B,axis=x2)
    #out=np.rollaxis(np.sum(i+B,axis=x2),-1,1)
    # The two shape prints below run regardless of V.
    print(out.shape)
    if sqrt:
        out=C.sqrt(out)
    out=C.swapaxes(C.reshape(out,out.shape[:4]), 3, 1)
    print(out.shape)
    assert out.shape==(arrs[0],ashp[0],arrs[1],arrs[2])
    return(out)
def test_nce_backward_indices(classes, xdim, batch, expected_value, device_id, precision):
    """
    Simple test that makes sure that the derivatives have the correct sparsity pattern

    A baseline count of which classes get sampled is established with
    ``random_sample`` under a fixed seed; the gradient of ``nce_loss`` with
    respect to the bias, computed with the same seed, must be non-zero for
    the same classes (or for every trial, for the label indices).
    ``expected_value``, ``device_id`` and ``precision`` are accepted but not
    used by this body.
    """
    # csr_matrix lives in the scipy.sparse submodule, which a bare
    # ``import scipy`` is not guaranteed to load.
    import scipy.sparse

    # ignore precision, only sparsity pattern matters for this test
    dt = np.float32
    trials = 10

    # Establish baseline
    expected_count = np.zeros(classes)
    I = C.constant(np.eye(classes, dtype=dt))
    q = np.arange(classes, dtype=dt) + 1
    z = C.reduce_sum(C.times(C.random_sample(q, 32, True, seed=98052), I), axis=0)
    for _ in range(trials):
        expected_count[np.nonzero(z.eval().ravel())] += 1

    # Set things up to measure the same thing with nce_loss
    x = C.input_variable(xdim, needs_gradient=True)
    y = C.input_variable(classes, is_sparse=True)

    x0 = np.arange(batch * xdim, dtype=dt).reshape((batch, xdim)) / (batch * xdim)
    data = np.ones(batch, dtype=dt)
    indices = list(range(10, 10 * batch + 1, 10))
    indptr = list(range(batch + 1))
    y0 = scipy.sparse.csr_matrix((data, indices, indptr), shape=(batch, classes))

    b = C.parameter((classes, 1))
    W = C.parameter((classes, C.InferredDimension))

    gb = np.zeros(classes)
    # Multiplying by a 1x1 identity lets the gradient value (requested with
    # as_numpy=False) be fed in and read back through eval().
    vb = C.input_variable((classes, 1), dtype=dt)
    Ib = C.constant(np.eye(1, dtype=dt))
    zb = C.times(vb, Ib)

    loss = C.nce_loss(W, b, x, y, q, seed=98052)
    for _ in range(trials):
        v = loss.grad({x: x0, y: y0}, wrt=loss.parameters, as_numpy=False)
        gb[np.nonzero(zb.eval({vb: v[b]}).ravel())] += 1

    for i in range(classes):
        assert gb[i] == expected_count[i] or (i in indices and gb[i] == trials)
def test_override_serialize(tmpdir):
    """Save and reload a nested ``MyPlusPlus`` user function and compare evaluations."""
    dev = C.cpu()
    a, b = 1.2322341, -0.29084

    # Innermost node works on the two constants; each further level feeds the
    # previous node into both inputs.
    op = MyPlusPlus([C.constant(a), C.constant(b)], '++')
    for level_name in ('+++', '++++'):
        op = MyPlusPlus([op, op], level_name)
    op = C.user_function(op)

    result1 = op.eval({}, device=dev)

    filepath = str(tmpdir / 'test_udf_with_renamed_deserialize.dat')
    op.save(filepath)
    op_reloaded = Function.load(filepath, device=dev)

    result2 = op_reloaded.eval({}, device=dev)
    assert result1 == result2
def _simple_dict():
    """Build a small named graph and return its nodes in a dict.

    Keys: 'i1', 'c1', 'p1', 'op1', 'op2', 'root' (same node as 'op2'),
    'target' and 'all' (a combine of the root and ``target - 1``).
    """
    i1 = C.input_variable(shape=(2, 3), name='i1')
    c1 = C.constant(shape=(2, 3), value=6, name='c1')
    p1 = C.parameter(shape=(3, 2), init=7, name='p1')

    op1 = C.plus(i1, c1, name='op1')
    op2 = C.times(op1, p1, name='op2')

    target = C.input_variable((), name='label')
    minus = C.minus(target, C.constant(1, name='c2'), name='minus')
    combined = C.combine([op2, minus], name='all')

    return {
        'i1': i1,
        'c1': c1,
        'p1': p1,
        'op1': op1,
        'op2': op2,
        'root': op2,
        'target': target,
        'all': combined,
    }
def test_convolution_attributes():
    """Check the attribute dictionary of convolution nodes for two auto_padding settings."""
    x = C.input_variable((1, 5, 5))
    kernel_values = np.reshape(np.array([2, -1, -1, 2], dtype=np.float32), (1, 2, 2))
    kernel = C.constant(value=kernel_values)

    def expected_attributes(auto_padding):
        # Only 'autoPadding' differs between the two cases.
        return {
            'autoPadding': auto_padding,
            'sharing': [True, True, True],
            'strides': (1, 1, 1),
            'maxTempMemSizeInSamples': 0,
            'upperPad': (0, 0, 0),
            'lowerPad': (0, 0, 0),
            'transpose': False,
            'outputShape': (0,),
        }

    # (auto_padding passed in, autoPadding expected in the attributes)
    cases = (
        ([False], [False, False, False]),
        ([False, True], [False, False, True]),
    )
    for requested, reported in cases:
        f = C.convolution(kernel, x, auto_padding=requested)
        d = f.root_function.attributes
        _check(expected_attributes(reported), d)
def multiFunc(self, arg1):
    """Build a multi-level binarized approximation over an input shaped like ``arg1``.

    For each level up to ``self.bit_map.max()``, the elements whose bit-map
    entry exceeds the level are selected, a mean of their absolute residual
    is computed, and ``mean * sign`` is added to the approximation while the
    same quantity is subtracted from the residual.

    Returns:
        tuple: ``(approx, multiIn)`` — the approximation expression and the
        input variable it is built on.
    """
    # Input with the shape and dynamic axes of arg1.
    multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
    bit_map = C.constant(self.bit_map)
    max_bits = self.bit_map.max()
    C.reshape(multiIn, (-1,))  # node is created but not used further below

    # The residual is the part of the value that still has to be binarized:
    # the input itself for the first level, and afterwards the difference
    # between the approximation so far and the true value.
    residual = multiIn
    approx = C.element_times(multiIn, 0)

    # One pass per binarization level.
    for level in range(max_bits):
        # Entries that are binarized to more than `level` bits.
        active = C.greater(bit_map, level)
        # Keep the residual only where it still has to be binarized.
        selected = C.element_select(active, residual, 0)

        # Mean magnitude per leading index: both operands are reshaped to
        # (shape[0], -1) and summed over axis 1.
        magnitude_sum = C.reduce_sum(C.reshape(C.abs(selected), (selected.shape[0], -1)), axis=1)
        active_count = C.reduce_sum(C.reshape(active, (active.shape[0], -1)), axis=1)
        mean = C.element_divide(magnitude_sum, active_count)
        # Reshape the mean so it matches the dimensionality of the input.
        mean = C.reshape(mean, (mean.shape[0], mean.shape[1], 1, 1))

        # Sign of the residual: 1 where positive, -1 elsewhere, 0 where inactive.
        signs = C.greater(residual, 0)
        signs = C.element_select(signs, signs, -1)
        signs = C.element_select(active, signs, 0)

        # Add this level's contribution and remove it from the residual.
        approx = C.plus(approx, C.element_times(mean, signs))
        residual = C.plus(C.element_times(C.element_times(-1, signs), mean), residual)

    return approx, multiIn
def test_get_data_type():
    """Check ``get_data_type`` for parameters, placeholders, numpy arrays and scalars."""
    pa32 = C.parameter(init=np.asarray(2, dtype=np.float32))
    pa64 = C.parameter(init=np.asarray(2, dtype=np.float64))
    pl = C.placeholder(shape=(2))
    C.constant(value=3.0)  # created as before; not used by the checks
    n32 = AA(1, dtype=np.float32)
    n64 = AA(1, dtype=np.float64)

    def check_all(cases):
        for args, expected in cases:
            assert get_data_type(*args) == expected

    check_all([
        ((pa32,), np.float32),
        ((pa32, n32), np.float32),
        ((n32, n32), np.float32),
        ((n32, n64), np.float64),
        ((pl, n64), np.float64),
        ((pl, n32), np.float32),
    ])

    # Placeholders alone carry no type information.
    assert get_data_type(pl, pl) is None

    # variable's type shall take precedence over provided data
    check_all([
        ((pa32, n64), np.float32),
        ((pa64, n64), np.float64),
        ((pa32, pl, n64), np.float32),
        ((pa64, pl, n64), np.float64),
    ])

    # Scalars: everything except float64 maps to float32.
    check_all([
        ((np.float64(1),), np.float64),
        ((np.float32(1),), np.float32),
        ((np.int64(1),), np.float32),  # special case for cntk
        ((1,), np.float32),
        ((1.0,), np.float32),
    ])
def test_proposal_layer():
    """Run the CNTK and Caffe proposal layers on the same random inputs and compare them."""
    cls_prob_shape_cntk = (18, 61, 61)
    cls_prob_shape_caffe = (18, 61, 61)
    rpn_bbox_shape = (36, 61, 61)
    im_info = [1000, 1000, 1]

    # Random input tensors shared by both implementations.
    cls_prob = np.random.random_sample(cls_prob_shape_cntk).astype(np.float32)
    rpn_bbox_pred = np.random.random_sample(rpn_bbox_shape).astype(np.float32)

    # CNTK layer: build and run forward.
    cls_prob_var = input_variable(cls_prob_shape_cntk)
    rpn_bbox_var = input_variable(rpn_bbox_shape)
    im_info_const = cntk.constant(im_info, (3,))
    cntk_layer = user_function(CntkProposalLayer(cls_prob_var, rpn_bbox_var, im_info_const))
    feed = {cls_prob_var: [cls_prob], rpn_bbox_var: [rpn_bbox_pred]}
    state, cntk_output = cntk_layer.forward(feed)
    first_output_key = next(iter(cntk_output))
    cntk_proposals = cntk_output[first_output_key][0]

    # Caffe layer: build and run forward.
    cls_prob_caffe = cls_prob.reshape(cls_prob_shape_caffe)
    bottom = [np.array([cls_prob_caffe]), np.array([rpn_bbox_pred]), np.array([im_info])]
    top = None  # handled through return statement in caffe layer for unit testing

    param_str = "'feat_stride': 16"
    caffe_layer = CaffeProposalLayer()
    caffe_layer.set_param_str(param_str)
    caffe_layer.setup(bottom, top)
    caffe_output = caffe_layer.forward(bottom, top)
    # The first column of the Caffe output is dropped before comparing.
    caffe_proposals = caffe_output[:, 1:]

    # Results must be exactly the same (zero tolerances).
    assert cntk_proposals.shape == caffe_proposals.shape
    assert np.allclose(cntk_proposals, caffe_proposals, rtol=0.0, atol=0.0)
    print("Verified ProposalLayer")
def test_constant_data_type_mismatch():
    """Evaluating a constant built from default-dtype numpy data with float32 input must raise ValueError."""
    upper_triangle = np.triu(np.ones(5))
    a = C.constant(upper_triangle, shape=(5, 5))
    i = C.input_variable(shape=(5, 5))
    product = a * i

    float32_batch = [[np.asarray(np.random.rand(5, 5), dtype=np.float32)]]
    with pytest.raises(ValueError):
        product.eval({i: float32_batch})
def cumsum(x, axis=-1):
    """Cumulative sum along the last axis, computed as a product with an upper-triangular matrix of ones.

    Args:
        x: Operand; ``x.shape[-1]`` sets the size of the triangular matrix.
        axis: Must be -1 or ``K.ndim(x) - 1``.

    Raises:
        ValueError: If ``axis`` does not refer to the last axis.
    """
    is_last_axis = axis == -1 or axis == K.ndim(x) - 1
    if not is_last_axis:
        raise ValueError('Only the last axis could be used, found: {}'.format(axis))

    dim = x.shape[-1]
    upper_ones = np.triu(np.ones((dim, dim))).astype(x.dtype)
    return C.times(x, C.constant(upper_ones))
def scale_dot_product_attention_block(self, contextQ, contextV, contextK, name):
    """Build a scaled dot-product attention block over three sequence inputs.

    Query, value and key are each projected by their own ``Dense(100)``
    layer. Scores are ``Q·Kᵀ / sqrt(100)``, positions outside the unpacked
    key mask are replaced with -1e30 before the softmax, and the result
    weights the unpacked values.

    Returns:
        A block named ``'sdp_attention_block' + name`` binding the internal
        placeholders to ``contextQ``, ``contextV`` and ``contextK``.
    """
    projection_dim = 100

    def new_placeholder():
        return C.placeholder(shape=(2 * self.hidden_dim, ),
                             dynamic_axes=[self.b_axis, self.q_axis])

    Q = new_placeholder()
    V = new_placeholder()
    K = new_placeholder()

    # Independent projections, created in Q, V, K order.
    Ql = C.layers.Dense(projection_dim)(Q)
    Vl = C.layers.Dense(projection_dim)(V)
    Kl = C.layers.Dense(projection_dim)(K)

    # Unpack keys (with their mask) and values into padded tensors.
    kvw, kvw_mask = C.sequence.unpack(Kl, padding_value=0).outputs
    vvw, _ = C.sequence.unpack(Vl, padding_value=0).outputs
    KT = C.swapaxes(kvw)

    scores = C.reshape(C.times(Ql, KT) / math.sqrt(projection_dim), -1)
    kvw_mask_expanded = C.sequence.broadcast_as(kvw_mask, Ql)
    masked_scores = C.element_select(kvw_mask_expanded, scores, C.constant(-1e+30))
    weights = C.softmax(masked_scores)

    att = C.times(weights, vvw)

    block_name = 'sdp_attention_block' + name
    return C.as_block(att, [(Q, contextQ), (V, contextV), (K, contextK)],
                      block_name, block_name)
def attention(encoded, network):
    """Attend over ``encoded`` with window weights derived from ``network``.

    ``network`` is projected by ``dense`` and split into the coefficients
    ``a``, ``b`` and ``k``; ``window_weight`` turns them and the positions of
    ``encoded`` into weights, padded positions are zeroed, and the weighted
    sum over axis 0 of the unpacked ``encoded`` is returned. The shape notes
    below are carried over from the original author.
    """
    coefficients = dense(network)
    a, b, k = gaussian_windows_attention_coefficients(coefficients, nb_mixtures)
    # a, b, k: [#, n] [nb_mixture, 1]
    # context: [#, c] [char_ohe]

    encoded_unpacked = C.sequence.unpack(encoded, padding_value=0, no_mask_output=True)
    # context_unpacked: [#] [*=c, char_ohe]

    positions = Cx.sequence.position(encoded)  # position gives shape=(1, )
    # u: [#, c], [1]
    unpacked_positions = C.sequence.unpack(positions, padding_value=999_999)
    position_values, position_valid = unpacked_positions.outputs
    # u_values: [#] [*=c, 1]
    # u_valid: [#] [*=c]

    values_broadcast = C.swapaxes(C.sequence.broadcast_as(position_values, k))
    # u_values_broadcast: [#, n] [1, *=c]
    valid_broadcast = C.sequence.broadcast_as(C.reshape(position_valid, (1,), 1), k)
    # u_valid_broadcast: [#, n] [*=c, 1] ~ shape verified correct at this point

    window = window_weight(a, b, k, values_broadcast)
    # phi: [#, n] [*=c, 1]

    # Zero out the weights of padded positions.
    window = C.element_select(valid_broadcast, window, C.constant(0), name="phi")
    # phi: [#, n] [*=c, 1]

    weighted = window * C.sequence.broadcast_as(encoded_unpacked, window)
    attended = C.reduce_sum(weighted, axis=0)
    # [#, n] [1, char_ohe]

    return C.squeeze(attended, name="GaussianWindowAttention")  # [#, n] [char_ohe]
def _graph_dict():
    """Create a graph that has no real meaning other than providing something to traverse.

    Returns:
        dict: The created nodes keyed by 'i1', 'c1', 'p1', 'op1', 'op2',
        'p2', 'op3a', 'op3b', 'first' and 'root' (same node as 'first').
    """
    i1 = C.sequence.input_variable(shape=(2, 3), sequence_axis=Axis('ia'), name='i1')
    c1 = C.constant(shape=(2, 3), value=6, name='c1')
    p1 = C.parameter(shape=(3, 2), init=7, name='p1')

    op1 = C.plus(i1, c1, name='op1')
    op2 = C.times(op1, p1, name='op2')

    # A parameter without a name.
    p2 = C.parameter(shape=(2, 2))

    # Two nodes that deliberately share the name 'op3'.
    op3a = C.plus(op2, p2, name='op3')
    op3b = C.plus(op3a, p2, name='op3')

    first = C.sequence.first(op3b, name='past')

    return {
        'i1': i1,
        'c1': c1,
        'p1': p1,
        'op1': op1,
        'op2': op2,
        'p2': p2,
        'op3a': op3a,
        'op3b': op3b,
        'first': first,
        'root': first,
    }
def test_convolution_attributes():
    """Verify the attributes reported by convolution nodes for two auto_padding inputs."""
    x = C.input((1, 5, 5))
    weights = np.reshape(np.array([2, -1, -1, 2], dtype=np.float32), (1, 2, 2))
    kernel = C.constant(value=weights)

    # Attributes that are identical in both cases.
    common = {
        'sharing': [True, True, True],
        'strides': (1, 1, 1),
        'maxTempMemSizeInSamples': 0,
        'upperPad': (0, 0, 0),
        'lowerPad': (0, 0, 0),
        'transpose': False,
        'outputShape': (0, ),
    }

    # First case: auto_padding=[False]
    f = C.convolution(kernel, x, auto_padding=[False])
    expected = dict(common, autoPadding=[False, False, False])
    _check(expected, f.root_function.attributes)

    # Second case: auto_padding=[False, True]
    f = C.convolution(kernel, x, auto_padding=[False, True])
    expected = dict(common, autoPadding=[False, False, True])
    _check(expected, f.root_function.attributes)
def test_Gather(tmpdir):
    """Verify a gather model that indexes a constant 6x2 table with a (2, 1) input."""
    indices = np.asarray([[[0], [1]], [[4], [5]]]).astype('f')
    table = np.arange(12).reshape(6, 2).astype('f')

    x = C.input_variable((2, 1))
    y = C.constant(table)
    model = C.gather(y, x)

    verify_one_input(model, indices, tmpdir, 'Gather_1')
def embed(self):
    """Build the embedding function from GloVe vectors and averaged ELMo vectors.

    A ``(self.wg_dim, 1324)`` table is filled for every word of
    ``self.vocab`` found in the GloVe text file: columns 0-299 take the GloVe
    vector (only when the line splits into exactly 301 fields) and columns
    300+ take the ELMo entry averaged over axis 0. If anything fails for a
    word, the ELMo part falls back to the '<UNK>' entry.

    Returns:
        A function ``func(wg, wn)`` computing
        ``times(wg, glove) + times(wn, nonglove)``, where ``glove`` is the
        constant table and ``nonglove`` a trainable parameter of shape
        ``(self.wn_dim, 1324)``.
    """
    npglove = np.zeros((self.wg_dim, 1024 + 300), dtype=np.float32)
    elmo_path = os.path.join(self.abs_path, '../data/elmo_embedding.bin')
    glove_path = os.path.join(self.abs_path, '../data/glove.840B.300d.txt')

    # Both files are only needed while the table is filled, so they are
    # closed as soon as the loop is done.
    with h5py.File(elmo_path, 'r') as hf, open(glove_path, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line: nothing to look up.
                continue
            word = parts[0].lower()
            if word not in self.vocab:
                continue
            row = self.vocab[word]
            try:
                if len(parts) == 301:
                    npglove[row, :300] = np.asarray(
                        [float(p) for p in parts[-300:]])
                # NOTE(review): assumed to run for every vocabulary word,
                # not only when the line has 301 fields — confirm.
                npglove[row, 300:] = np.average(hf[word][:], axis=0)
            except Exception:
                # Best-effort fallback (e.g. the word is missing from the
                # ELMo file); unlike a bare except this lets
                # KeyboardInterrupt and SystemExit propagate.
                npglove[row, 300:] = np.average(hf['<UNK>'][:], axis=0)

    glove = C.constant(npglove)
    nonglove = C.parameter(shape=(self.wn_dim, 1024 + 300),
                           init=C.glorot_uniform(),
                           name='TrainableE')

    def func(wg, wn):
        return C.times(wg, glove) + C.times(wn, nonglove)

    return func
def test_ConvTranspose(tmpdir, dtype, device_id):
    """Verify a strided transposed convolution with an explicit output shape."""
    running_on_cpu = device_id == -1
    if running_on_cpu and dtype == np.float16:
        pytest.skip('Test is skipped on CPU with float16 data')

    device = cntk_device(device_id)
    with C.default_options(dtype=dtype):
        # Keep the shapes below as they are, because this tests an earlier bug.
        input_shape = (48, 16, 16)
        # For convolution_transpose the shape is (I x O x W x H)
        kernel_shape = (48, 32, 3, 3)

        element_count = np.prod(input_shape)
        img = np.reshape(np.arange(element_count, dtype=dtype), input_shape)

        x = C.input_variable(input_shape)
        kernel = C.constant(value=np.ones(shape=(kernel_shape), dtype=dtype))

        conv_trans_model = C.convolution_transpose(
            kernel, x,
            strides=(2, 2),
            output_shape=(32, 32, 32),
            auto_padding=[False, True, True])

        verify_one_input(conv_trans_model, img, tmpdir, 'ConvTranspose_0', device)
def test_constant_data_type_mismatch():
    """A constant created from default-dtype numpy data must reject float32 input with ValueError."""
    a = C.constant(np.triu(np.ones(5)), shape=(5, 5))
    i = C.input_variable(shape=(5, 5))
    b = a * i

    with pytest.raises(ValueError):
        sample = np.asarray(np.random.rand(5, 5), dtype=np.float32)
        b.eval({i: [[sample]]})
def attention(query, key, value):
    """Attend over ``value`` with scores from ``query`` and ``key``.

    Scores are ``times_transpose(query, key) / dk`` with ``dk`` the sum of
    ``ones_like(query)``. When the enclosing scope sets
    ``obey_sequence_order``, scores at positions where a lower-triangular
    ``max_seq_len`` x ``max_seq_len`` matrix is zero are replaced with -1e30
    before the softmax. The shape notes below are carried over from the
    original author.

    Raises:
        ValueError: If ``obey_sequence_order`` is truthy and ``max_seq_len``
            is not.
    """
    # cannot use sequence.last, will conflict with recurrence
    # dk: [#, *] [1, ] and value = int(dim_of_query)
    dk = C.reduce_sum(C.ones_like(query))

    unpacked_key = C.sequence.unpack(key, padding_value=0, no_mask_output=True)  # [#] [-3, key_dim]
    unpacked_value = C.sequence.unpack(value, padding_value=0, no_mask_output=True)  # [#] [-3, value_dim]

    broadcasted_key = C.sequence.broadcast_as(unpacked_key, query)  # [#, *] [-3, key_dim]

    # [#, *] [q_dim] @ [#, *] [key_dim, -3], assert q_dim == key_dim
    # scaled: [#, *] [-3, ] => one score per key sequence element
    scaled = C.times_transpose(query, broadcasted_key) / dk

    # Mask out temporal connections that would violate the sequence order.
    if obey_sequence_order:
        if not max_seq_len:
            raise ValueError("max_seq_len must be defined when obey_sequence_order is True")

        # unpacked_scaled: [#] [-3, -3]; scaled_mask: [#] [-3,]
        unpacked_scaled, scaled_mask = C.sequence.unpack(scaled, padding_value=0).outputs

        minus_inf = C.constant(-1e+30)
        # [] [max_seq, max_seq], ones on and below the diagonal
        allowed = C.Constant(np.tril(np.ones((max_seq_len, max_seq_len)), k=0))
        allowed = C.reconcile_dynamic_axes(allowed, unpacked_scaled)  # [#] [max_seq, max_seq]
        allowed = C.crop_manual(allowed, unpacked_scaled, 0, 0)  # [#] [-3, -3]

        unpacked_scaled = C.element_select(allowed, unpacked_scaled, minus_inf)  # [#] [-3, -3]
        scaled = C.to_sequence_like(unpacked_scaled, query)  # [#, *] [-3]

    weights = C.softmax(scaled, axis=-1)
    attended = C.times(weights, C.sequence.broadcast_as(unpacked_value, query))  # [#, *] [value_dim,]
    return attended
def _to_dense(val, is_sequence=False):
    """Convert a sparse value to dense by multiplying it with an identity matrix.

    Args:
        val: Sparse value exposing ``shape`` and ``device``.
        is_sequence: When True, a sequence input is used and the first two
            entries of ``val.shape`` are dropped from the sample shape;
            otherwise only the first entry is dropped.

    Returns:
        The result of evaluating ``times(x, identity)`` on ``val.device``.
    """
    if is_sequence:
        make_input, leading_axes = C.sequence.input_variable, 2
    else:
        make_input, leading_axes = C.input_variable, 1

    x = make_input(val.shape[leading_axes:], is_sparse=True)
    identity = C.constant(value=np.eye(val.shape[-1], dtype=np.float32))

    dense = C.times(x, identity)
    return dense.eval({x: val}, device=val.device)
def build_test_function():
    """Build ``MyPlus(NativeUserTimesOp(w, MyPlus(x, c1)), c2)`` on the CPU.

    The native 'NativeUserTimesOp' is registered first if that has not been
    done yet.

    Returns:
        tuple: ``(dev, w_value, c1_value, c2_value, function)``.
    """
    native_op_name = 'NativeUserTimesOp'

    dev = C.cpu()
    w_value = np.asarray([[0.5, 2], [-0.5, 1.5]]).astype(np.float32)
    c1_value = 2.718
    c2_value = -3.141

    already_registered = C.cntk_py.is_native_user_function_registered(native_op_name)
    if not already_registered:
        # The library name carries the CNTK version without trailing '+'.
        library_name = 'Cntk.ExtensibilityExamples-' + C.__version__.rstrip('+')
        C.ops.register_native_user_function(native_op_name, library_name, 'CreateUserTimesFunction')

    x = C.input_variable((2))
    w = C.parameter((2, 2), init=w_value, device=dev)

    inner_plus = C.user_function(MyPlus(x, C.constant(c1_value)))
    native_times = C.ops.native_user_function(
        native_op_name, [w, inner_plus], user_function_instance_name='my_times')
    outer_plus = C.user_function(MyPlus(native_times, C.constant(c2_value)))

    return dev, w_value, c1_value, c2_value, outer_plus
def test_nce_backward_indices(classes, xdim, batch, expected_value, device_id, precision):
    """
    Simple test that makes sure that the derivatives have the correct sparsity pattern

    The classes hit by ``random_sample`` under a fixed seed are counted over
    several trials; the bias gradient of ``nce_loss`` under the same seed
    must be non-zero for the same classes, or in every trial for the label
    indices. ``expected_value``, ``device_id`` and ``precision`` are accepted
    but not used by this body.
    """
    # Import the submodule explicitly: a bare ``import scipy`` is not
    # guaranteed to make ``scipy.sparse`` available.
    import scipy.sparse

    # ignore precision, only sparsity pattern matters for this test
    dt = np.float32
    trials = 10

    # Establish baseline
    expected_count = np.zeros(classes)
    I = C.constant(np.eye(classes, dtype=dt))
    q = np.arange(classes, dtype=dt) + 1
    z = C.reduce_sum(C.times(C.random_sample(q, 32, True, seed=98052), I), axis=0)
    for _ in range(trials):
        expected_count[np.nonzero(z.eval().ravel())] += 1

    # Set things up to measure the same thing with nce_loss
    x = C.input_variable(xdim, needs_gradient=True)
    y = C.input_variable(classes, is_sparse=True)

    x0 = np.arange(batch * xdim, dtype=dt).reshape((batch, xdim)) / (batch * xdim)
    data = np.ones(batch, dtype=dt)
    indices = list(range(10, 10 * batch + 1, 10))
    indptr = list(range(batch + 1))
    y0 = scipy.sparse.csr_matrix((data, indices, indptr), shape=(batch, classes))

    b = C.parameter((classes, 1))
    W = C.parameter((classes, C.InferredDimension))

    gb = np.zeros(classes)
    # The gradient is requested with as_numpy=False; passing it through a
    # times-by-identity node allows reading it back via eval().
    vb = C.input_variable((classes, 1), dtype=dt)
    Ib = C.constant(np.eye(1, dtype=dt))
    zb = C.times(vb, Ib)

    loss = C.nce_loss(W, b, x, y, q, seed=98052)
    for _ in range(trials):
        v = loss.grad({x: x0, y: y0}, wrt=loss.parameters, as_numpy=False)
        gb[np.nonzero(zb.eval({vb: v[b]}).ravel())] += 1

    for i in range(classes):
        assert gb[i] == expected_count[i] or (i in indices and gb[i] == trials)
def test_ext_eval_3_no_input():
    """Evaluate a MyPlus user function that has no input variables."""
    dim = 4
    param = C.parameter(shape=(dim,), init=10, name='p')
    plus_three = C.user_function(MyPlus(param, C.constant(3)))
    graph = plus_three + 0

    # Without any input variable the result carries no batch dimension.
    actual = graph.eval()
    expected = np.zeros_like(param) + 10 + 3
    assert np.allclose(actual, expected)
def test_sequence_unpack_with_convolution(device_id, precision):
    """Feed an unpacked sequence into a convolution and check the output shape.

    ``device_id`` and ``precision`` are accepted for the test parametrization
    but are not used by the body.
    """
    seq_input = C.sequence.input((20, 20))
    unpacked = C.sequence.unpack(seq_input, 0, no_mask_output=True)
    stacked = C.reshape(unpacked, (3, 20, 20))

    filters = C.constant(1.0, (4, 3, 3, 3))
    conv = C.convolution(filters, stacked, auto_padding=[False, True, True])

    batch = np.random.random((2, 3, 20, 20)).astype(np.float32)
    output = conv.eval({seq_input: batch})
    assert np.array_equal(output.shape, (2, 4, 20, 20))
def returnFunction():
    """Return ``times(left, right)`` built via a placeholder substitution.

    The product is first expressed against a (1, 2) placeholder, which is
    then replaced by a constant holding the left operand.
    """
    left_operand = [[10,2]]
    right_operand = [[2],[3]]

    left_slot = placeholder(shape=(1,2))
    product = times(left_slot, right_operand)
    left_constant = constant(left_operand)
    return product.replace_placeholders({left_slot: left_constant})
def test_ext_eval_4_b_inside_graph():
    """Evaluate ``p * MyPlus(p, 3)`` wrapped as a user function."""
    dim = 4
    p_init = 10
    param = C.parameter(shape=(dim,), init=p_init, name='p')
    graph = C.user_function(param * MyPlus(param, C.constant(3)))

    # Without any input variable the result carries no batch dimension.
    actual = graph.eval()
    expected = ((p_init * np.ones_like(actual)) + 3) * p_init
    assert np.allclose(actual, expected)
def test_gather_op(device_id, precision):
    """Exercise ``C.gather``: forward values, gradients, and memory reuse.

    Covers four scenarios:
      1. gather with a (2, 1) index input, checking forward result and the
         gradient w.r.t. the reference parameter;
      2. gather whose indices are scaled by a learnable parameter, checking
         that the gradient reaching that parameter is zero;
      3. gather with a (2, 2) index input, checking the forward result;
      4. a small trained model (gather followed by sequence unpack) checking
         that only the gathered rows of the parameter get updated.

    ``device_id`` is accepted for the test parametrization but is not used.
    """
    dt = PRECISION_TO_TYPE[precision]

    a_data = [AA([[0],[1]], dtype=dt),
              AA([[3],[4]], dtype=dt)]
    a = C.input_variable((2,1))
    r_data = np.arange(12).reshape(6,2).astype('f')
    # The shape must be the array's shape tuple; the original passed
    # `r_data.data` (the raw memory buffer) here.
    r = C.parameter(shape=r_data.shape, init=r_data)
    res = C.gather(r, a).eval({a:a_data})
    expectd = np.asarray([[[[0., 1.]],[[2., 3.]]],[[[6., 7.]],[[8.,9.]]]])
    assert np.array_equal(res, expectd)

    grads = C.gather(r, a).grad({a:a_data}, [r])
    expectd_grad = np.asarray([[1,1],[1,1],[0,0],[1,1],[1,1],[0,0]], dtype=np.float32)
    assert np.array_equal(grads, expectd_grad)

    # gather with indices from learning parameter (no gradients should be
    # passed through the indices -- 0s should be passed)
    indices_params = C.parameter(shape=(1,), init=1.0)
    grads = C.gather(r, (indices_params * a)).grad({a:a_data}, [r, indices_params])
    assert np.array_equal(grads[r], expectd_grad)
    assert np.array_equal(grads[indices_params], np.asarray([0.0], dtype=np.float32))

    b_data = [AA([[0,2],[1,3]], dtype=dt),
              AA([[2,4],[3,5]], dtype=dt)]
    b = C.input_variable((2,2))
    res2 = C.gather(r, b).eval({b:b_data})

    expectd2 = np.asarray([[[[0., 1.],[4.,5.]],[[2., 3.],[6., 7.]]],[[[4., 5.],[8.,9.]],[[6., 7.], [10., 11.]]]])
    assert np.array_equal(res2, expectd2)

    # the following small model is to test the memory reuse issue of gather node.
    x = C.input((3, 4))
    x1 = C.to_sequence(x)
    w = C.parameter((5, 6), init=1)
    z = C.gather(w, x1)
    assert z.shape == (4, 6)
    # need the unpack node to trigger memory reuse.
    f = C.sequence.unpack(z, 0, no_mask_output=True)
    y = C.input((3, 4, 6))
    loss = C.reduce_mean(C.square(f - y), axis=-1)
    loss = C.reduce_mean(loss, axis=C.Axis.all_axes())

    g = C.constant(0, shape=w.shape)
    u = C.assign(w, g + 1)
    learner = C.cntk_py.universal_learner([w], [g], u)
    trainer = C.trainer.Trainer(loss, [loss], [learner])
    indices = np.asarray([[[1, 2, 1, 2]]])
    # Tile the index pattern to 10 samples of shape (3, 4).
    index_batch = np.repeat(np.repeat(indices, 3, axis=1), 10, axis=0)
    labels = np.full((10, 3, 4, 6), 2)
    trainer.train_minibatch({x: index_batch, y: labels})
    # the 2nd and 3rd rows should be updated by gradients.
    assert np.mean(w.value[1, :]) < 1
    assert np.mean(w.value[2, :]) < 1
    # the other three rows should keep as 1
    assert np.isclose(np.mean(w.value[0, :]), 1)
    assert np.isclose(np.mean(w.value[3, :]), 1)
    assert np.isclose(np.mean(w.value[4, :]), 1)
def test_ext_eval_1():
    """Evaluate ``MyPlus(i, 3) + p`` on one random sequence input."""
    dim = 4
    param = C.parameter(shape=(dim,), init=10, name='p')
    seq_in = C.sequence.input_variable(dim, needs_gradient=True, name='i_var')
    plus_three = C.user_function(MyPlus(seq_in, C.constant(3)))
    graph = plus_three + param

    sample = np.random.rand(dim)
    output = graph.eval([sample])
    expected = sample + 3 + 10
    assert np.allclose(output[0][0], expected)
def test_Concat(tmpdir):
    """Verify ``C.splice`` with constant-only and with one-input graphs."""
    # First operand: array of shape (1, 2, 2)
    first = np.asarray([[[1, 2], [4, 5]]], dtype=np.float32)
    x = C.constant(value=first)

    # Second operand: array of shape (1, 3, 2)
    second = np.asarray([[[10, 20],
                          [30, 40],
                          [50, 60]]], dtype=np.float32)
    y = C.constant(value=second)

    # Splicing both operands along axis=1 stacks the 2 and 3 rows
    spliced_constants = C.splice(x, y, axis=1)
    verify_no_input(spliced_constants, tmpdir, 'Concat_0')

    # Same splice, but with the first operand supplied as an input variable
    x = C.input_variable(first.shape)
    spliced_with_input = C.splice(x, y, axis=1)
    verify_one_input(spliced_with_input, first, tmpdir, 'Concat__1')
def test_ext_eval_5_times():
    """Evaluate a MyPlus user function multiplied by a (2, 50) parameter."""
    dim = 2
    p_init = 10
    param = C.parameter(shape=(dim,), init=p_init, name='p')
    plus_three = C.user_function(MyPlus(param, C.constant(3)))
    projection = C.parameter(shape=(2, 50), init=2)
    graph = C.times(plus_three, projection)

    # Without any input variable the result carries no batch dimension.
    actual = graph.eval()
    expected = ((p_init * np.ones_like(actual)) + 3) * 2 * 2
    assert np.allclose(actual, expected)
def test_Gather(tmpdir, dtype):
    """Verify a ``C.gather`` model with one input; float16 is skipped."""
    if dtype == np.float16:
        pytest.skip("TO BE FIXED")

    with C.default_options(dtype = dtype):
        index_values = np.asarray([[[0],[1]],[[4],[5]]]).astype(dtype)
        x = C.input_variable((2,1))

        reference = np.arange(12).reshape(6,2).astype(dtype)
        y = C.constant(reference)

        model = C.gather(y, x)
        verify_one_input(model, index_values, tmpdir, 'Gather_1')
def create_binary_convolution_model():
    """Build a CNN whose later convolutions are binary, plus loss and metric.

    Reads the module-level ``num_channels``, ``image_height``,
    ``image_width`` and ``num_classes``.

    Returns:
        A combined function with three outputs: the scaled network output,
        the cross-entropy loss (including the binary regularizer) and the
        classification error.
    """
    # Input variables denoting the features and label data
    feature_var = C.input((num_channels, image_height, image_width))
    label_var = C.input((num_classes))

    # Scale the raw features by 1/256 before feeding them to the network
    scaled_input = C.element_times(C.constant(0.00390625), feature_var)

    # The first convolution stays full precision
    net = C.layers.Convolution((3, 3), 32, pad=True, activation=C.relu)(scaled_input)
    net = C.layers.MaxPooling((3,3), strides=(2,2))(net)
    net = C.layers.BatchNormalization(map_rank=1)(net)

    net = BinaryConvolution(net, (3,3), 128, channels=32, pad=True)
    net = C.layers.MaxPooling((3,3), strides=(2,2))(net)
    net = C.layers.BatchNormalization(map_rank=1)(net)

    net = BinaryConvolution(net, (3,3), 128, channels=128, pad=True)
    net = C.layers.MaxPooling((3,3), strides=(2,2))(net)
    net = C.layers.BatchNormalization(map_rank=1)(net)

    net = BinaryConvolution(net, (1,1), num_classes, channels=128, pad=True)
    net = C.layers.AveragePooling((net.shape[1], net.shape[2]))(net)
    net = C.reshape(net, (num_classes,))

    # Binary regularization (ala Gang Hua): accumulate sum(1 - w^2) over
    # every parameter named "filter"
    weight_sum = C.constant(0)
    for param in net.parameters:
        if param.name == "filter":
            penalty = C.reduce_sum(C.minus(1, C.square(param)))
            weight_sum = C.plus(weight_sum, penalty)
    bin_reg = C.element_times(.000005, weight_sum)

    # After the last layer, apply a learnable scale
    learnable_scale = C.parameter(shape=net.shape, init=0.001)
    net = C.element_times(net, learnable_scale)

    # loss and metric
    loss = C.cross_entropy_with_softmax(net, label_var)
    loss = C.plus(loss, bin_reg)
    metric = C.classification_error(net, label_var)

    return C.combine([net, loss, metric])
def lrn(x, depth_radius, bias, alpha, beta, name=''):
    """Local response normalization expressed with a convolution.

    Computes ``x / (bias + conv(W, x^2)) ** beta`` where ``W`` is a constant
    filter of length ``2 * depth_radius + 1`` filled with
    ``alpha / (2 * depth_radius + 1)``.

    Reads the module-level ``dtype`` for the filter constant. The ``name``
    argument is accepted but not used by the body.
    """
    window = 2 * depth_radius + 1
    squared = C.square(x)

    # Reshape to insert a fake singleton reduction dimension after the 3rd
    # axis (channel axis). Note Python axis order and BrainScript are reversed.
    squared_expanded = C.reshape(squared, (1, C.InferredDimension), 0, 1)

    W = C.constant(alpha/window, shape=(1, window, 1, 1), dtype=dtype, name='W')

    # 3D convolution with a filter that has a non 1-size only in the 3rd
    # axis, and does not reduce since the reduction dimension is fake and 1
    summed = C.convolution(W, squared_expanded)

    # Reshape back to remove the fake singleton reduction dimension
    summed_squeezed = C.reshape(summed, C.InferredDimension, 0, 2)

    # (bias + sum) ** beta, written via exp/log
    denominator = C.exp(beta * C.log(bias + summed_squeezed))
    return C.element_divide(x, denominator)
def train_eval_mnist_onelayer_from_file(criterion_name=None, eval_name=None):
    """Train and test a one-hidden-layer MNIST network from text data files.

    Args:
        criterion_name: name assigned to the cross-entropy criterion node.
        eval_name: name assigned to the square-error evaluation node.

    Returns:
        Whatever ``ctx.test`` returns for the criterion and evaluation nodes.
    """
    # Network dimensions
    feat_dim = 784
    label_dim = 10
    hidden_dim = 200

    data_dir = os.path.join(os.path.dirname(__file__), "Data")
    training_filename = os.path.join(data_dir, "Train-28x28_text.txt")
    test_filename = os.path.join(data_dir, "Test-28x28_text.txt")

    features = C.input(feat_dim)
    features.name = 'features'

    feat_scale = C.constant(0.00390625)
    feats_scaled = C.element_times(features, feat_scale)

    labels = C.input(label_dim)
    labels.tag = 'label'
    labels.name = 'labels'

    training_reader = C.CNTKTextFormatReader(training_filename)
    test_reader = C.CNTKTextFormatReader(test_filename)

    hidden = add_dnn_sigmoid_layer(feat_dim, hidden_dim, feats_scaled, 1)
    out = add_dnn_layer(hidden_dim, label_dim, hidden, 1)
    out.tag = 'output'

    criterion = C.cross_entropy_with_softmax(labels, out)
    criterion.name = criterion_name
    criterion.tag = 'criterion'

    eval_node = C.ops.square_error(labels, out)
    eval_node.name = eval_name
    eval_node.tag = 'eval'

    # Specify the training parameters (settings are scaled down)
    my_sgd = C.SGDParams(epoch_size=600, minibatch_size=32,
                         learning_rates_per_mb=0.1, max_epochs=5,
                         momentum_per_mb=0)

    # Create a context or re-use if already there
    with C.LocalExecutionContext('mnist_one_layer', clean_up=True) as ctx:
        # CNTK actions
        train_inputs = training_reader.map(
            labels, alias='labels', dim=label_dim).map(
            features, alias='features', dim=feat_dim)
        ctx.train(
            root_nodes=[criterion, eval_node],
            training_params=my_sgd,
            input_map=train_inputs)

        test_inputs = test_reader.map(
            labels, alias='labels', dim=label_dim).map(
            features, alias='features', dim=feat_dim)
        result = ctx.test(
            root_nodes=[criterion, eval_node],
            input_map=test_inputs)
        return result
def hierarchical_softmax_layer(input_var, label_index, label_dim, label_classes=None):
    """A two-layer hierarchical softmax function.

    Labels are partitioned into ``label_classes`` groups of
    ``n_outputs_per_class`` consecutive labels each. The probability of a
    label is the product of the probability of its group and the
    probability of the label within that group.

    Args:
        input_var: Variable with shape: [#,*](dim_x)
        label_index: index of label's category: [#,*](1)
        label_dim: number of the label categories
        label_classes: number of classes of the label categories; defaults
            to ``ceil(sqrt(label_dim))`` when not given (or 0)

    Returns:
        output_prob: the probability of the given label [#,*](1)
        class_probs: the probability of all the label classes [#,*](label_classes)
        all_probs: list with one entry per label class, each holding the
            probabilities of that class's outputs
    """
    input_dim = input_var.shape[0]

    if not label_classes:
        label_classes = int(np.ceil(np.sqrt(float(label_dim))))

    # Force true division: under Python 2 an int / int division would floor
    # before the ceil and yield too few outputs per class.
    n_outputs_per_class = int(np.ceil(float(label_dim) / label_classes))

    # Split the flat label index into (class, position inside the class).
    target_class = C.floor((label_index + 0.5) / n_outputs_per_class)
    target_output_in_class = C.round(label_index - target_class * n_outputs_per_class)

    w1 = parameter(shape=(input_dim, label_classes), init=C.glorot_normal(), name='hsoftmax_w1')
    b1 = parameter(shape=(label_classes), init=C.glorot_normal(), name='hsoftmax_b1')
    w2s = parameter(shape=(label_classes, input_dim, n_outputs_per_class,), init=C.glorot_normal(), name='hsoftmax_w2s')
    b2s = parameter(shape=(label_classes, n_outputs_per_class,), init=C.glorot_normal(), name='hsoftmax_b2s')

    class_probs = softmax(b1 + times(input_var, w1))

    # TODO: fix the bug in backprop for sparse, and use sparse embedding to accelerate
    target_class_one_hot = C.one_hot(target_class, num_classes=label_classes, sparse_output=False)
    # Select the second-layer weights and bias belonging to the target class.
    w2 = C.reshape(C.times(target_class_one_hot, w2s, output_rank=2), [input_dim, -1])
    b2 = C.reshape(times(target_class_one_hot, b2s, output_rank=1), [-1])
    probs_in_class = softmax(b2 + times(input_var, w2))

    prob_in_class = C.times_transpose(
        C.one_hot(target_output_in_class, num_classes=n_outputs_per_class, sparse_output=False),
        probs_in_class)
    class_prob = C.times_transpose(
        C.one_hot(target_class, num_classes=label_classes, sparse_output=False),
        class_probs)
    output_prob = prob_in_class * class_prob

    # this is for calculating all the outputs' probabilities
    all_probs = []
    for i in range(label_classes):
        ci = C.constant(i)
        ci_one_hot = C.one_hot(ci, num_classes=label_classes, sparse_output=False)
        w2a = C.times(ci_one_hot, w2s, output_rank=2)
        b2a = C.times(ci_one_hot, b2s, output_rank=1)
        probs_in_classa = C.softmax(b2a + times(input_var, w2a))
        class_proba = C.times_transpose(ci_one_hot, class_probs)
        output_proba = probs_in_classa * class_proba
        all_probs.append(output_proba)

    return output_prob, class_probs, all_probs
def test_udf_clone():
    """Clone a graph containing a MyPlus user function and evaluate the clone."""
    dim = 4
    seq_in = C.sequence.input_variable(dim, needs_gradient=True, name='i_var')
    plus_three = C.user_function(MyPlus(seq_in, C.constant(3)))
    param = C.parameter(shape=(dim,), init=10, name='p')
    graph = plus_three + param

    cloned = graph.clone('share')

    sample = np.random.rand(dim)
    output = cloned.eval([sample])
    expected = sample + 3 + 10
    assert np.allclose(output[0][0], expected)