def gru(dh, x):
    """Compute one GRU step from the previous hidden state and the input.

    The weights (W, H, H1, Wmr), bias b, stabilizers (Sdh, Sht), `activation`,
    `stack_axis`, `stacked_dim` and `has_projection` are not defined here and
    must be supplied by the surrounding scope.

    Args:
        dh: previous hidden state; it is passed through the Sdh stabilizer.
        x: current input. It is not stabilized here; the caller is meant to
            do that outside.

    Returns:
        Function.NamedOutput(h=h), where h is the new hidden state, projected
        through Wmr when has_projection is set.
    """
    prev_h = Sdh(dh)  # previous value, stabilized

    # projected contribution from input(s) plus bias, and from the hidden state
    x_proj = b + times(x, W)
    h_proj = times(prev_h, H)

    def section(stacked, k):
        # k-th section of width stacked_dim along stack_axis
        return slice(stacked, stack_axis, k * stacked_dim, (k + 1) * stacked_dim)

    update_in = section(x_proj, 0) + section(h_proj, 0)
    reset_in = section(x_proj, 1) + section(h_proj, 1)
    cand_in = section(x_proj, 2)  # only the input projection has a third section

    zt = sigmoid(update_in)  # update gate z(t)
    rt = sigmoid(reset_in)   # reset gate r(t)

    gated_prev = prev_h * rt
    ct = activation(cand_in + times(gated_prev, H1))  # "cell" c

    ht = (1 - zt) * ct + zt * prev_h  # hidden state ht / output

    # For comparison, the CUDNN_GRU formulation:
    #   i(t)  = sigmoid(W_i x(t) + R_i h(t-1) + b_Wi + b_Ru)
    #   r(t)  = sigmoid(W_r x(t) + R_r h(t-1) + b_Wr + b_Rr)   -- same up to here
    #   h'(t) = tanh(W_h x(t) + r(t) .* (R_h h(t-1)) + b_Wh + b_Rh)
    #           -- r applied after projection? Would make life easier!
    #   h(t)  = (1 - i(t) .* h'(t)) + i(t) .* h(t-1)
    #           -- TODO: need to confirm bracketing with NVIDIA

    if has_projection:
        h = times(Sht(ht), Wmr)
    else:
        h = ht

    # the new state is returned as a named output
    return Function.NamedOutput(h=h)
def lstm(dh, dc, x):
    """Compute one LSTM step, optionally applying dropout to the recurrent weights.

    W, H, b, `weight_drop_rate`, `activation`, `stack_axis` and `stacked_dim`
    are taken from the surrounding scope.

    Args:
        dh: previous hidden state.
        dc: previous cell state.
        x: current input.

    Returns:
        The pair (ht, ct): new hidden state and new cell state.
    """
    # the recurrent weights go through dropout only when a drop rate is configured
    recurrent_weights = H if weight_drop_rate is None else dropout(H)

    # projected contribution from input(s), hidden, and bias
    proj4 = b + times(x, W) + times(dh, recurrent_weights)

    def section(k):
        # k-th section of width stacked_dim along stack_axis
        return slice(proj4, stack_axis, k * stacked_dim, (k + 1) * stacked_dim)

    # slicing layout different from cntk's implementation
    input_gate_in = section(0)
    forget_gate_in = section(1)
    candidate_in = section(2)   # g gate
    output_gate_in = section(3)

    it = sigmoid(input_gate_in)               # input gate(t)
    gated_candidate = it * activation(candidate_in)

    ft = sigmoid(forget_gate_in)              # forget gate(t)
    kept_cell = ft * dc                       # applied to cell(t-1)

    ct = kept_cell + gated_candidate          # c(t) is the sum of both

    ot = sigmoid(output_gate_in)              # output gate(t)
    ht = ot * activation(ct)                  # applied to activation(cell(t))

    return ht, ct
def rnn(dh, x):
    """Compute one step of a plain recurrent unit.

    W, H, b, Wmr, the stabilizers Sdh/Sht, `activation` and `has_projection`
    come from the surrounding scope.

    Args:
        dh: previous hidden state; it is passed through the Sdh stabilizer.
        x: current input.

    Returns:
        The new hidden state, projected through Wmr when has_projection is set.
    """
    prev_h = Sdh(dh)  # previous value, stabilized
    ht = activation(times(x, W) + times(prev_h, H) + b)
    if has_projection:
        return times(Sht(ht), Wmr)
    return ht
def weight_dropped_lstm(dh, dc, x):
    """Compute one LSTM step whose recurrent weights H always go through dropout.

    W, H, b, Wmr, the peephole weights Ci/Cf/Co, the stabilizers
    Sdh/Sdc/Sct/Sht, `activation`, `use_peepholes`, `has_projection`,
    `stack_axis` and `stacked_dim` come from the surrounding scope.

    Args:
        dh: previous hidden state; it is passed through the Sdh stabilizer.
        dc: previous cell state. The stabilized copy is used only for the
            peepholes; the raw value feeds the forget path.
        x: current input. It is not stabilized here; the caller is meant to
            do that outside.

    Returns:
        The pair (h, c): new hidden state (projected through Wmr when
        has_projection is set) and new cell state.
    """
    prev_h = Sdh(dh)  # previous values, stabilized
    prev_c = Sdc(dc)

    # projected contribution from input(s), hidden, and bias
    proj4 = b + times(x, W) + times(prev_h, dropout(H))

    def section(k):
        # k-th section of width stacked_dim along stack_axis
        return slice(proj4, stack_axis, k * stacked_dim, (k + 1) * stacked_dim)

    input_gate_in = section(0)
    candidate_in = section(1)
    forget_gate_in = section(2)
    output_gate_in = section(3)

    def peep(x, c, C):
        # inject the peephole connection if requested
        if use_peepholes:
            return x + C * c
        return x

    it = sigmoid(peep(input_gate_in, prev_c, Ci))     # input gate(t)
    # TODO: should both activations be replaced?
    gated_candidate = it * activation(candidate_in)

    ft = sigmoid(peep(forget_gate_in, prev_c, Cf))    # forget gate(t)
    kept_cell = ft * dc                               # applied to cell(t-1)

    ct = kept_cell + gated_candidate                  # c(t) is the sum of both

    ot = sigmoid(peep(output_gate_in, Sct(ct), Co))   # output gate(t)
    ht = ot * activation(ct)                          # applied to activation(cell(t))

    c = ct  # cell value
    if has_projection:
        h = times(Sht(ht), Wmr)
    else:
        h = ht
    return h, c
def dense(x):
    """Apply two successive projections, an optional bias and an optional activation.

    W1, W2, b and `activation` come from the surrounding scope.

    Args:
        x: input to project.

    Returns:
        times(times(x, W1), W2), plus b when b is truthy, passed through
        `activation` when that is not None.
    """
    out = times(times(x, W1), W2)
    if b:
        out = out + b
    return out if activation is None else activation(out)
def lstm(dh, dc, sv, x):
    """Compute one LSTM step that carries an extra state `sv` gated by a reading gate.

    All weight and bias names (W, H, Hsv, b, Wrg, Hrg, Hsvrg, brg, Wcx, Hcx,
    Wfc) plus `stack_axis` and `stacked_dim` come from the surrounding scope.

    Args:
        dh: previous hidden state.
        dc: previous cell state.
        sv: previous value of the extra state.
        x: current input.

    Returns:
        The triple (h, c, v): new hidden state, new cell state, and `sv`
        scaled by the reading gate.
    """
    # projected contribution from input(s), hidden, extra state, and bias
    proj3 = b + times(x, W) + times(dh, H) + times(sv, Hsv)

    def section(k):
        # k-th section of width stacked_dim along stack_axis
        return slice(proj3, stack_axis, k * stacked_dim, (k + 1) * stacked_dim)

    input_gate_in = section(0)
    forget_gate_in = section(1)
    output_gate_in = section(2)

    it = sigmoid(input_gate_in)    # input gate(t)
    ft = sigmoid(forget_gate_in)   # forget gate(t)
    ot = sigmoid(output_gate_in)   # output gate(t)

    # reading gate: decides how much of sv is passed on
    reading_gate = sigmoid(
        times(x, Wrg) + times(dh, Hrg) + times(sv, Hsvrg) + brg)
    v = reading_gate * sv

    # candidate cell content (open question from the author: does this need stabilization?)
    cx_t = tanh(times(x, Wcx) + times(dh, Hcx))

    # update memory cell
    c = it * cx_t + ft * dc + tanh(times(v, Wfc))
    h = ot * tanh(c)

    return (h, c, v)
def project_cosine_sim(att_dim, init=glorot_uniform(), name=''):
    """
    Build the projected cosine similarity of two inputs, normalized with seq_softmax.

    Both inputs are placeholders named 'status' and 'memory'. Each one is
    projected to a new dimension space (att_dim) via its own weight, Wi or Wm,
    before the cosine similarity is taken.

    Args:
        att_dim: size of the projection space.
        init: initializer used for both projection weights.
        name: name of the returned node; the similarity node is named
            name + '_sim'.
    """
    weight_shape = _INFERRED + (att_dim,)
    Wi = Parameter(weight_shape, init=init, name='Wi')
    Wm = Parameter(weight_shape, init=init, name='Wm')

    status = placeholder_variable(name='status')
    memory = placeholder_variable(name='memory')

    projected_status = times(status, Wi, name='projected_status')
    projected_memory = times(memory, Wm, name='projected_memory')

    similarity = cosine_similarity(projected_status, projected_memory,
                                   name=name + '_sim')
    return seq_softmax(similarity, name=name)
def project_cosine(project_dim, init = glorot_uniform(), name=''):
    """
    Build the projected cosine distance between two placeholder inputs.

    The placeholders are named 'status' and 'memory'. Each is projected to a
    new dimension space (project_dim) via its own weight, Wi or Wm. The
    projected status is then broadcast along the projected memory with
    sequence.broadcast_as before cosine_distance is applied.

    Args:
        project_dim: size of the projection space.
        init: initializer used for both projection weights.
        name: name given to the returned cosine_distance node.
    """
    weight_shape = _INFERRED + (project_dim,)
    Wi = Parameter(weight_shape, init=init, name='Wi')
    Wm = Parameter(weight_shape, init=init, name='Wm')

    status = placeholder(name='status')
    memory = placeholder(name='memory')

    projected_status = times(status, Wi, name='projected_status')
    projected_memory = times(memory, Wm, name='projected_memory')

    broadcast_status = sequence.broadcast_as(
        projected_status, projected_memory, name='status_broadcast')
    return cosine_distance(broadcast_status, projected_memory, name=name)
def frcn_predictor(features, rois, n_classes):
    """Build a Fast R-CNN prediction network on top of a pretrained model.

    `model_file`, the four node-name variables and `roi_dim` are read from
    the surrounding scope.

    Args:
        features: image input; the constant 114 is subtracted before the
            convolutional layers.
        rois: regions of interest handed to roipooling.
        n_classes: number of output classes.

    Returns:
        The output node z = times(fc_out, W) + b.
    """
    # Load the pretrained classification net and find nodes
    loaded_model = load_model(model_file)

    def lookup(node_name):
        return find_by_name(loaded_model, node_name)

    feature_node = lookup(feature_node_name)
    conv_node = lookup(last_conv_node_name)
    pool_node = lookup(pool_node_name)
    last_node = lookup(last_hidden_node_name)

    # Clone the conv layers (frozen) and the fully connected layers (cloned)
    conv_layers = combine([conv_node.owner]).clone(
        CloneMethod.freeze, {feature_node: Placeholder()})
    fc_layers = combine([last_node.owner]).clone(
        CloneMethod.clone, {pool_node: Placeholder()})

    # Create the Fast R-CNN model
    normalized = features - Constant(114)
    roi_out = roipooling(conv_layers(normalized), rois, (roi_dim, roi_dim))
    fc_out = fc_layers(roi_out)

    # A Dense layer with map_rank=1 would replace the lines below,
    # but map_rank=1 is not yet supported.
    W = parameter(shape=(4096, n_classes), init=glorot_uniform())
    b = parameter(shape=n_classes, init=0)
    return times(fc_out, W) + b
def Embedding(shape=None, init=None, weights=None):
    """Create an embedding layer, either learned or from a fixed weight array.

    Args:
        shape: output shape of the embedding. Required when `weights` is not
            given; when `weights` is given it is optional and must then match
            the trailing dimensions of the weights.
        init: initializer for a learned embedding. Defaults to
            init_default_or_glorot_uniform. Cannot be combined with `weights`.
        weights: numpy array used as a constant embedding table. Cannot be
            combined with `init`.

    Returns:
        A Block named 'Embedding' that computes times(x, E), with E exposed
        in its Record.

    Raises:
        ValueError: if both `init` and `weights` are given, if neither
            `weights` nor `shape` is given, if `weights` is not a numpy
            array, or if `shape` does not match `weights`.
    """
    # The two options exclude each other only when BOTH are supplied; giving
    # just one of them is the normal way to call this function.
    if init is not None and weights is not None:
        raise ValueError('Embedding: init and weights options are mutually exclusive')

    # parameters bound to this Function:
    if weights is None:
        # no weights given: learn the embedding
        if shape is None:
            raise ValueError('Embedding: output shape must be specified')
        if init is None:
            init = init_default_or_glorot_uniform
        shape = _as_tuple(shape)
        weight_shape = _INFERRED + shape
        E = Parameter(weight_shape, init=init, name='E')
    else:
        # weights given: use them as constant
        # NOTE(review): UntestedBranchError is called, not raised; presumably it
        # raises by itself to flag an untested path. Confirm against its definition.
        UntestedBranchError("Embedding, from constant")
        import numpy as np
        if not isinstance(weights, np.ndarray):
            UntestedBranchError("Embedding, from constant that is not an array")
            # TODO: can 'weights' be a CNTK object? Then how to do this?
            raise ValueError('Embedding: weights must be a numpy array')
        weight_shape = np.shape(weights)
        if shape is not None:
            # user may give shape, then it must match the trailing weight dims;
            # normalise first so an int or list compares correctly with a tuple
            shape = _as_tuple(shape)
            if len(shape) >= len(weight_shape) or weight_shape[-len(shape):] != shape:
                raise ValueError('Embedding: shape parameter must match weights')
        E = Constant(weights, name='E')

    # expression
    x = Placeholder(name='embedding_arg')
    apply_x = times(x, E)
    return Block(apply_x, 'Embedding', Record(E=E))
def test_trainer_with_some_params_not_learned():
    """Check that a parameter left out of the learner stays untouched by training.

    Only W is handed to sgd, so after every minibatch B must still equal its
    initial value while W must differ from its value before that minibatch.
    """
    input_dim = 2
    proj_dim = 2

    x = input_variable(shape=(input_dim,))
    W = parameter(shape=(input_dim, proj_dim), init=glorot_uniform())
    B = parameter(shape=(proj_dim,), init=glorot_uniform())
    z = times(x, W) + B

    previous_W = W.value
    initial_B = B.value

    labels = input_variable(shape=(proj_dim,))
    ce = cross_entropy_with_softmax(z, labels)
    pe = classification_error(z, labels)

    lr_per_sample = learning_rate_schedule(0.1, UnitType.sample)
    trainer = Trainer(z, (ce, pe), sgd([W], lr_per_sample))

    arguments = {
        x: [[1, 1], [2, 2]],
        labels: [[0, 1], [1, 0]],
    }

    num_iters = 3
    for _ in range(num_iters):
        trainer.train_minibatch(arguments)

        assert np.array_equal(B.value, initial_B)
        assert not np.array_equal(W.value, previous_W)
        previous_W = W.value

    trainer.test_minibatch(arguments)
def test_trainer_with_some_params_not_learned():
    """Train with a learner that covers W only and verify B never changes.

    After each of the three minibatches, B must equal its starting value and
    W must have moved away from its value before that minibatch.
    """
    input_dim = 2
    proj_dim = 2

    x = input_variable(shape=(input_dim,))
    labels = input_variable(shape=(proj_dim,))

    W = parameter(shape=(input_dim, proj_dim), init=glorot_uniform())
    B = parameter(shape=(proj_dim,), init=glorot_uniform())
    z = times(x, W) + B

    last_seen_W = W.value
    frozen_B = B.value

    ce = cross_entropy_with_softmax(z, labels)
    pe = classification_error(z, labels)

    # B is deliberately absent from the learner's parameter list
    lr_per_sample = learning_rate_schedule(0.1, UnitType.sample)
    trainer = Trainer(z, (ce, pe), sgd([W], lr_per_sample))

    x_value = [[1, 1], [2, 2]]
    label_value = [[0, 1], [1, 0]]
    arguments = {x: x_value, labels: label_value}

    for _ in range(3):
        trainer.train_minibatch(arguments)

        assert np.array_equal(B.value, frozen_B)
        assert not np.array_equal(W.value, last_seen_W)
        last_seen_W = W.value

    trainer.test_minibatch(arguments)
def fully_connected_layer(input, output_dim, device_id, nonlinearity):
    """Build one fully connected layer: nonlinearity(bias + input * weights).

    Args:
        input: layer input; its first shape dimension sets the weight's
            input size.
        output_dim: number of output units.
        device_id: accepted but not used in this function.
        nonlinearity: callable applied to the biased projection's output.

    Returns:
        Whatever `nonlinearity` returns for the biased projection.
    """
    input_dim = input.shape()[0]

    weights = parameter(shape=(input_dim, output_dim))
    projection = times(input, weights)

    bias = parameter(shape=(output_dim,))
    biased = plus(bias, projection.output())

    return nonlinearity(biased.output())
def resnet_classifer(input, num_classes):
    """Build a residual classifier: a conv layer, three stages of three blocks, pooling, linear output.

    The stages use 16, 32 and 64 feature maps. The second and third stages
    start with a resnet_node2_inc block fed by a projection map from
    get_projection_map.

    Args:
        input: network input node.
        num_classes: number of output classes.

    Returns:
        The output node times(pool, weights) + bias.
    """
    # hyper-parameters (conv_w_scale, fc1_w_scale and fc1_b_value are not used below)
    conv_w_scale = 7.07
    conv_b_value = 0
    fc1_w_scale = 0.4
    fc1_b_value = 0
    sc_value = 1
    bn_time_const = 4096

    kernel_width = 3
    kernel_height = 3

    conv1_w_scale = 0.26

    def plain_block(node, n_maps):
        # residual block that keeps the number of feature maps
        return resnet_node2(node, n_maps, kernel_width, kernel_height,
                            conv1_w_scale, conv_b_value, sc_value, bn_time_const)

    def widening_block(node, n_maps, prev_maps):
        # residual block that changes the feature-map count via a projection map
        projection = get_projection_map(n_maps, prev_maps)
        return resnet_node2_inc(node, n_maps, kernel_width, kernel_height,
                                conv1_w_scale, conv_b_value, sc_value,
                                bn_time_const, projection)

    c_map1 = 16
    net = conv_bn_relu_layer(input, c_map1, kernel_width, kernel_height, 1, 1,
                             conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    for _ in range(3):
        net = plain_block(net, c_map1)

    c_map2 = 32
    net = widening_block(net, c_map2, c_map1)
    for _ in range(2):
        net = plain_block(net, c_map2)

    c_map3 = 64
    net = widening_block(net, c_map3, c_map2)
    for _ in range(2):
        net = plain_block(net, c_map3)

    # Global average pooling
    poolw = 8
    poolh = 8
    poolh_stride = 1
    poolv_stride = 1

    pool = pooling(net, AVG_POOLING, (1, poolh, poolw),
                   (1, poolv_stride, poolh_stride))

    out_times_params = parameter(shape=(c_map3, 1, 1, num_classes),
                                 init=glorot_uniform())
    out_bias_params = parameter(shape=(num_classes), init=0)
    return times(pool, out_times_params) + out_bias_params
def test_eval_sparse_dense(tmpdir, device_id):
    """Check that sparse data evaluates identically from a reader, CSR matrices and one_hot.

    A small CTF file is written to `tmpdir` and read through a
    MinibatchSource. The same index sequences are then fed as scipy CSR
    matrices and as the result of one_hot, and all three evaluations of
    times(raw_input, identity) must agree.
    """
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.device import cpu, gpu, set_default_device
    from cntk.ops import input_variable, times
    from scipy.sparse import csr_matrix

    input_vocab_dim = label_vocab_dim = 69

    # Two sequences (ids 0 and 2), one sample per line. The S0 indices are the
    # ones repeated in one_hot_data further down.
    # NOTE(review): fields are separated by single spaces here; confirm against
    # the original fixture if the exact separator matters.
    ctf_data = '''\
0 |S0 3:1 |# <s> |S1 3:1 |# <s>
0 |S0 4:1 |# A |S1 32:1 |# ~AH
0 |S0 5:1 |# B |S1 36:1 |# ~B
0 |S0 4:1 |# A |S1 31:1 |# ~AE
0 |S0 7:1 |# D |S1 38:1 |# ~D
0 |S0 12:1 |# I |S1 47:1 |# ~IY
0 |S0 1:1 |# </s> |S1 1:1 |# </s>
2 |S0 60:1 |# <s> |S1 3:1 |# <s>
2 |S0 61:1 |# A |S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    # reader over the file just written; both streams are declared sparse
    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features = StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels = StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True)
    )), randomize=False, epoch_size = 2)

    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')  # created but not used below

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=input_vocab_dim, dynamic_axes=input_dynamic_axes,
        name='raw_input', is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
        input_map={raw_input : mbs.streams.features})

    # multiplying by the identity turns the sparse input into a dense result
    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid)

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [
        [3, 4, 5, 4, 7, 12, 1],
        [60, 61]
    ]
    data = [csr_matrix(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = one_hot(one_hot_data, num_classes=input_vocab_dim)
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_hot)])
def test_eval_sparse_dense(tmpdir, device_id):
    """Check that sparse data evaluates identically from a reader, CSR matrices and one_hot.

    A small CTF file is written to `tmpdir` and read through a
    MinibatchSource. The same index sequences are then fed as CSR matrices
    and as the result of one_hot, and all three evaluations of
    times(raw_input, identity) must agree. Every step is run on the device
    selected by `device_id`.
    """
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import input_variable, times

    input_vocab_dim = label_vocab_dim = 69

    # Two sequences (ids 0 and 2), one sample per line. The S0 indices are the
    # ones repeated in one_hot_data further down.
    # NOTE(review): fields are separated by single spaces here; confirm against
    # the original fixture if the exact separator matters.
    ctf_data = '''\
0 |S0 3:1 |# <s> |S1 3:1 |# <s>
0 |S0 4:1 |# A |S1 32:1 |# ~AH
0 |S0 5:1 |# B |S1 36:1 |# ~B
0 |S0 4:1 |# A |S1 31:1 |# ~AE
0 |S0 7:1 |# D |S1 38:1 |# ~D
0 |S0 12:1 |# I |S1 47:1 |# ~IY
0 |S0 1:1 |# </s> |S1 1:1 |# </s>
2 |S0 60:1 |# <s> |S1 3:1 |# <s>
2 |S0 61:1 |# A |S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    # reader over the file just written; both streams are declared sparse
    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features = StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels = StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True)
    )), randomize=False, epoch_size = 2)

    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')  # created but not used below

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=input_vocab_dim, dynamic_axes=input_dynamic_axes,
        name='raw_input', is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
        input_map={raw_input : mbs.streams.features},
        device=cntk_device(device_id))

    # multiplying by the identity turns the sparse input into a dense result
    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [
        [3, 4, 5, 4, 7, 12, 1],
        [60, 61]
    ]
    data = [csr(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = one_hot(one_hot_data, num_classes=input_vocab_dim, device=cntk_device(device_id))
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_hot)])
def Dense(shape, init=init_default_or_glorot_uniform, activation=activation_default_or_None,
          input_rank=None, map_rank=None, bias=bias_default_or_True, init_bias=init_bias_default_or_0):
    """Create a fully connected layer block: activation(times(x, W) + b).

    W has shape (input_shape + shape), where input_shape is determined as:
     - by default, the dimensions of the input passed to the layer;
     - if input_rank is given, the last 'input_rank' dimensions of the input
       (all others are not reduced over);
     - if map_rank is given, all but the first 'map_rank' dimensions of the
       input (those are not reduced over).

    Args:
        shape: output shape of the layer.
        init: initializer for W.
        activation: activation, resolved through _resolve_activation.
        input_rank: see above; mutually exclusive with map_rank.
        map_rank: see above; mutually exclusive with input_rank.
        bias: whether to add a bias b; falls back to the current default
            options when not given.
        init_bias: initializer for b.

    Returns:
        A Block named 'Dense' with W and b (None when there is no bias) in
        its Record.

    Raises:
        ValueError: if both input_rank and map_rank are specified.
    """
    activation = _resolve_activation(activation)
    if not _is_given(bias):
        bias = _current_default_options.bias
    output_shape = _as_tuple(shape)

    if input_rank is not None and map_rank is not None:
        raise ValueError("Dense: input_rank and map_rank cannot be specified at the same time.")

    # support outputs with tensor layouts
    # BUGBUG (from the original author): should this be -len(output_shape),
    # since output is the last axis in Python?
    output_rank = len(output_shape)

    # If input_rank is not given, pass a single _INFERRED; map_rank, if given,
    # will determine the input rank. The dimension inference may still create
    # multiple axes.
    inferred_axes = 1 if input_rank is None else input_rank
    input_shape = _INFERRED * inferred_axes

    if input_rank is None and map_rank is None:
        # neither given: default to 'infer W to use all input dims'
        infer_input_rank_to_map = 0
    elif input_rank is not None:
        UntestedBranchError("Dense, input_rank option not implemented")
        # -1 means map_rank is not specified; input_rank rules
        infer_input_rank_to_map = -1
    else:
        UntestedBranchError("Dense, map_rank option not implemented")
        # infer W to use all input dims except the first static 'map_rank' ones
        infer_input_rank_to_map = map_rank

    # parameters bound to this Function
    init_weights = _initializer_for(init, Record(output_rank=output_rank))
    W = Parameter(input_shape + output_shape, init=init_weights, name='W')
    if bias:
        b = Parameter(output_shape, init=init_bias, name='b')
    else:
        b = None

    # expression of this function
    x = Placeholder(name='dense_arg')
    apply_x = times(x, W, output_rank=output_rank,
                    infer_input_rank_to_map=infer_input_rank_to_map)
    if b:
        apply_x = apply_x + b
    apply_x = apply_x >> activation
    return Block(apply_x, 'Dense', Record(W=W, b=b))
def _sparse_to_dense_network_cache(input_shape, is_sequence, device):
    """Build a network that multiplies a sparse input by an identity matrix.

    Args:
        input_shape: shape of the sparse input; its last entry sets the size
            of the identity matrix.
        is_sequence: when true the input is created with sequence.input,
            otherwise with input.
        device: accepted but not used in this function.

    Returns:
        The times node of the sparse input and np.eye(input_shape[-1]).
    """
    from cntk.ops import times, input, sequence

    make_input = sequence.input if is_sequence else input
    sparse_input = make_input(input_shape, is_sparse=True)

    identity_size = input_shape[-1]
    return times(sparse_input, np.eye(identity_size))
def fully_connected_classifier_net(input, num_output_classes, hidden_layer_dim, num_hidden_layers, device, nonlinearity):
    """Build a stack of fully connected hidden layers followed by a linear output layer.

    Args:
        input: network input.
        num_output_classes: size of the output layer.
        hidden_layer_dim: size of every hidden layer.
        num_hidden_layers: number of hidden layers; the first one is always
            built, the loop adds the remaining num_hidden_layers - 1.
        device: forwarded to fully_connected_layer.
        nonlinearity: forwarded to fully_connected_layer.

    Returns:
        The plus node adding the output bias to the output projection.
    """
    hidden = fully_connected_layer(input, hidden_layer_dim, device, nonlinearity)
    for _ in range(1, num_hidden_layers):
        hidden = fully_connected_layer(hidden.output(), hidden_layer_dim,
                                       device, nonlinearity)

    # linear output layer: no nonlinearity is applied here
    output_weights = parameter(shape=(hidden_layer_dim, num_output_classes))
    output_bias = parameter(shape=(num_output_classes,))
    projection = times(hidden.output(), output_weights)
    return plus(output_bias, projection.output())
def test_eval_sparse_no_seq(batch_index_data, device_id):
    """Evaluate a CSR batch against both a sparse and a dense input variable.

    Rows of the identity matrix selected by `batch_index_data` are fed as one
    CSR matrix; the result must equal those rows times the multiplier.
    """
    dim = 10
    multiplier = 2
    scaled_identity = multiplier * np.eye(dim)

    for var_is_sparse in (True, False):
        in1 = input_variable(shape=(dim,), is_sparse=var_is_sparse)
        z = times(in1, scaled_identity)

        batch = np.eye(dim)[batch_index_data]
        expected = batch * multiplier
        sparse_val = csr(batch.astype('f'))

        result = z.eval({in1: [sparse_val]}, device=cntk_device(device_id))
        assert np.allclose(result, [expected])
def test_eval_sparse_no_seq(batch_index_data, device_id):
    """Feed one CSR matrix to a sparse and to a dense input and compare with numpy.

    The batch consists of the identity-matrix rows picked by
    `batch_index_data`; the network scales them by the multiplier.
    """
    dim = 10
    multiplier = 2
    device = cntk_device

    for var_is_sparse in [True, False]:
        in1 = input_variable(shape=(dim,), is_sparse=var_is_sparse)
        z = times(in1, multiplier * np.eye(dim))

        dense_batch = np.eye(dim)[batch_index_data]
        expected = dense_batch * multiplier

        sparse_batch = csr(dense_batch.astype('f'))
        result = z.eval({in1: [sparse_batch]}, device=device(device_id))

        assert np.allclose(result, [expected])
def test_disallow_seq_starts_with_Value_objects():
    """Passing sequence-start flags together with a one_hot batch must raise ValueError.

    Both ways of attaching the flags are covered: as a second element next
    to the argument map, and as a tuple inside the argument map.
    """
    one_hot_batch = [[2, 5], [0, 1, 6]]
    dim = 10

    in1 = input_variable(shape=(dim,), is_sparse=True)
    z = times(in1, np.eye(dim))
    batch = one_hot(one_hot_batch, num_classes=dim)
    seq_starts = len(batch) * [True]

    with pytest.raises(ValueError):
        result = z.eval(({in1: batch}, seq_starts))

    with pytest.raises(ValueError):
        result = z.eval({in1: (batch, seq_starts)})
def test_eval_one_hot_seq(one_hot_batch, device_id):
    """Evaluate a one_hot batch against both a sparse and a dense input variable.

    Each sequence of indices in `one_hot_batch` must evaluate to the matching
    rows of the identity matrix times the multiplier.
    """
    dim = 10
    multiplier = 2
    scaled_identity = np.eye(dim) * multiplier

    for var_is_sparse in (True, False):
        in1 = input_variable(shape=(dim,), is_sparse=var_is_sparse)

        # Multiplying by a dense matrix converts the CNTK node value to dense
        # so that it can be compared later
        z = times(in1, scaled_identity)

        # Dense expectation, one array per sequence
        expected = [np.eye(dim)[seq] * multiplier for seq in one_hot_batch]

        batch = one_hot(one_hot_batch, num_classes=dim,
                        device=cntk_device(device_id))
        result = z.eval({in1: batch}, device=cntk_device(device_id))

        matches = [np.allclose(a, b) for a, b in zip(result, expected)]
        assert np.all(matches)
def test_disallow_seq_starts_with_Value_objects():
    """Evaluating a one_hot batch together with sequence-start flags must raise ValueError.

    The flags are attached in two ways: as a second tuple element beside the
    argument map, and as a (batch, flags) tuple inside the argument map.
    """
    dim = 10
    index_sequences = [[2, 5], [0, 1, 6]]

    in1 = input_variable(shape=(dim,), is_sparse=True)
    z = times(in1, np.eye(dim))

    batch = one_hot(index_sequences, num_classes=dim)
    all_start = len(batch) * [True]

    flags_beside_map = ({in1: batch}, all_start)
    with pytest.raises(ValueError):
        result = z.eval(flags_beside_map)

    flags_inside_map = {in1: (batch, all_start)}
    with pytest.raises(ValueError):
        result = z.eval(flags_inside_map)
def lstm(dh, dc, x):
    """Compute one LSTM step with optional peephole connections and projection.

    W, H, b, Wmr, the peephole weights Ci/Cf/Co, the stabilizers
    Sdh/Sdc/Sct/Sht, `activation`, `use_peepholes`, `has_projection`,
    `stack_axis` and `stacked_dim` come from the surrounding scope.

    Args:
        dh: previous hidden state; it is passed through the Sdh stabilizer.
        dc: previous cell state. The stabilized copy is used only for the
            peepholes; the raw value feeds the forget path.
        x: current input. It is not stabilized here; the caller is meant to
            do that outside.

    Returns:
        A tuple (Function.NamedOutput(h=h), Function.NamedOutput(c=c)).
        The order of the two elements matters.
    """
    prev_h = Sdh(dh)  # previous values, stabilized
    prev_c = Sdc(dc)

    # projected contribution from input(s), hidden, and bias
    proj4 = b + times(x, W) + times(prev_h, H)

    def section(k):
        # k-th section of width stacked_dim along stack_axis
        return slice(proj4, stack_axis, k * stacked_dim, (k + 1) * stacked_dim)

    input_gate_in = section(0)
    candidate_in = section(1)
    forget_gate_in = section(2)
    output_gate_in = section(3)

    def peep(x, c, C):
        # inject the peephole connection if requested
        if use_peepholes:
            return x + C * c
        return x

    it = sigmoid(peep(input_gate_in, prev_c, Ci))     # input gate(t)
    # TODO: should both activations be replaced?
    gated_candidate = it * activation(candidate_in)

    ft = sigmoid(peep(forget_gate_in, prev_c, Cf))    # forget gate(t)
    kept_cell = ft * dc                               # applied to cell(t-1)

    ct = kept_cell + gated_candidate                  # c(t) is the sum of both

    ot = sigmoid(peep(output_gate_in, Sct(ct), Co))   # output gate(t)
    ht = ot * activation(ct)                          # applied to activation(cell(t))

    c = ct  # cell value
    if has_projection:
        h = times(Sht(ht), Wmr)
    else:
        h = ht

    # the new state is returned as a tuple with names, but order matters
    return (Function.NamedOutput(h=h), Function.NamedOutput(c=c))
def test_eval_sparse_seq_1(batch, device_id):
    """Evaluate a batch of sparse sequences against a sparse and a dense input variable.

    `batch` holds either lists of sparse matrices or single sparse matrices
    (decided by the type of its first element). Each sequence must evaluate
    to its dense form times the multiplier.
    """
    dim = 4
    multiplier = 2

    def scaled_dense(seq):
        # dense, scaled version of one sparse matrix
        return seq.todense() * multiplier

    for var_is_sparse in (True, False):
        in1 = input_variable(shape=(dim,), is_sparse=var_is_sparse)
        z = times(in1, multiplier * np.eye(dim))

        if isinstance(batch[0], list):
            # sequences given as lists of matrices: stack them per sequence
            expected = [np.vstack([scaled_dense(m) for m in seq])
                        for seq in batch]
        else:
            expected = [scaled_dense(seq) for seq in batch]

        result = z.eval({in1: batch}, device=cntk_device(device_id))

        matches = [np.allclose(a, b) for a, b in zip(result, expected)]
        assert np.all(matches), "%s != %s" % (result, expected)
def Dense(shape, init=init_default_or_glorot_uniform, activation=activation_default_or_None,
          input_rank=None, map_rank=None, bias=bias_default_or_True, init_bias=init_bias_default_or_0):
    """Create a fully connected layer block computing activation(times(x, W) + b).

    The weight W gets dimension (input_shape + shape). input_shape is:
     - by default, equal to the dimensions of the input passed to the layer;
     - with input_rank, the last 'input_rank' dimensions of the input (the
       others are not reduced over);
     - with map_rank, all but the first 'map_rank' dimensions of the input
       (those are not reduced over).
    input_rank and map_rank are mutually exclusive.

    Args:
        shape: output shape of the layer.
        init: initializer for W.
        activation: activation, resolved through _resolve_activation.
        input_rank: number of trailing input dimensions to reduce over.
        map_rank: number of leading input dimensions to keep.
        bias: whether a bias is added; taken from the current default
            options when not given.
        init_bias: initializer for the bias.

    Returns:
        A Block named 'Dense'; its Record holds W and b (b is None when no
        bias is used).

    Raises:
        ValueError: if input_rank and map_rank are both specified.
    """
    both_ranks_given = input_rank is not None and map_rank is not None

    activation = _resolve_activation(activation)
    bias = bias if _is_given(bias) else _current_default_options.bias
    output_shape = _as_tuple(shape)

    if both_ranks_given:
        raise ValueError("Dense: input_rank and map_rank cannot be specified at the same time.")

    # support outputs with tensor layouts
    # BUGBUG (from the original author): should this be a negative number now,
    # since output is the last axis in Python?
    output_rank = len(output_shape)

    # Without input_rank a single _INFERRED is passed; map_rank, if given,
    # determines the input rank. Dimension inference may still create
    # multiple axes.
    if input_rank is not None:
        input_shape = _INFERRED * input_rank
        UntestedBranchError("Dense, input_rank option not implemented")
        infer_input_rank_to_map = -1       # map_rank not specified; input_rank rules
    else:
        input_shape = _INFERRED * 1
        if map_rank is None:
            infer_input_rank_to_map = 0    # neither given: infer W to use all input dims
        else:
            UntestedBranchError("Dense, map_rank option not implemented")
            # infer W to use all input dims except the first static 'map_rank' ones
            infer_input_rank_to_map = map_rank

    # parameters bound to this Function
    init_weights = _initializer_for(init, Record(output_rank=output_rank))
    W = Parameter(input_shape + output_shape, init=init_weights, name='W')
    b = None
    if bias:
        b = Parameter(output_shape, init=init_bias, name='b')

    # expression of this function
    x = Placeholder(name='dense_arg')
    projected = times(x, W, output_rank=output_rank,
                      infer_input_rank_to_map=infer_input_rank_to_map)
    if b:
        projected = projected + b
    apply_x = projected >> activation
    return Block(apply_x, 'Dense', Record(W=W, b=b))
def test_model_one_output_of_multi_output_function():
    """A Trainer can be built on a single output of a block that has two outputs.

    The block combines a projection and the same projection plus bias; the
    loss, the metric and the trainer all use the block's first output.
    """
    input_dim = 2
    proj_dim = 11

    x = input_variable((input_dim,))
    labels = input_variable((proj_dim,))

    # the block is built on a placeholder that is bound to x by as_block
    x_placeholder = placeholder_variable()
    w = parameter((input_dim, proj_dim))
    b = parameter((proj_dim,))
    proj = times(x_placeholder, w)
    proj_plus_bias = proj + b
    combined_model = as_block(combine([proj, proj_plus_bias]),
                              [(x_placeholder, x)], 'dense_op')

    lr_schedule = learning_rate_schedule(0.003, UnitType.sample)

    first_output = combined_model.outputs[0]
    ce = cross_entropy_with_softmax(first_output, labels)
    pe = classification_error(combined_model.outputs[0], labels)

    trainer_multitask = Trainer(combined_model.outputs[0], (ce, pe),
                                sgd(ce.parameters, lr=lr_schedule))
def test_model_one_output_of_multi_output_function():
    """Constructing a Trainer on one output of a two-output block must succeed.

    The block exposes a projection and that projection plus a bias. Only the
    first output is used for the loss, the error metric and the trainer.
    """
    input_dim = 2
    proj_dim = 11

    x = input_variable((input_dim,))
    x_placeholder = placeholder_variable()

    w = parameter((input_dim, proj_dim))
    b = parameter((proj_dim,))

    proj = times(x_placeholder, w)
    proj_plus_bias = proj + b
    both_outputs = combine([proj, proj_plus_bias])
    combined_model = as_block(both_outputs, [(x_placeholder, x)], 'dense_op')

    labels = input_variable((proj_dim,))
    lr_schedule = learning_rate_schedule(0.003, UnitType.sample)

    ce = cross_entropy_with_softmax(combined_model.outputs[0], labels)
    pe = classification_error(combined_model.outputs[0], labels)
    learner = sgd(ce.parameters, lr=lr_schedule)

    trainer_multitask = Trainer(combined_model.outputs[0], (ce, pe), learner)
def frcn_predictor(features, rois, n_classes, base_path):
    """Build a Fast R-CNN prediction network on top of a pretrained AlexNet model.

    Args:
        features: image input; the constant 114 is subtracted before the
            convolutional layers.
        rois: regions of interest handed to roipooling.
        n_classes: number of output classes.
        base_path: directory the relative model path is appended to.

    Returns:
        The pair (z, fc_out): the class-score node and the output of the
        cloned fully connected layers.
    """
    # model specific variables for AlexNet
    model_file = base_path + "/../../../resources/cntk/AlexNet.model"
    roi_dim = 6
    feature_node_name = "features"
    last_conv_node_name = "conv5.y"
    pool_node_name = "pool3"
    last_hidden_node_name = "h2_d"

    # Load the pretrained classification net and find nodes
    print("Loading pre-trained model...")
    loaded_model = load_model(model_file)
    print("Loading pre-trained model... DONE.")

    def lookup(node_name):
        return find_by_name(loaded_model, node_name)

    feature_node = lookup(feature_node_name)
    conv_node = lookup(last_conv_node_name)
    pool_node = lookup(pool_node_name)
    last_node = lookup(last_hidden_node_name)

    # Clone the conv layers (frozen) and the fully connected layers (cloned)
    conv_layers = combine([conv_node.owner]).clone(
        CloneMethod.freeze, {feature_node: placeholder()})
    fc_layers = combine([last_node.owner]).clone(
        CloneMethod.clone, {pool_node: placeholder()})

    # Create the Fast R-CNN model
    normalized = features - constant(114)
    roi_out = roipooling(conv_layers(normalized), rois, (roi_dim, roi_dim))
    fc_out = fc_layers(roi_out)

    # A Dense layer with map_rank=1 would replace the lines below,
    # but map_rank=1 is not yet supported.
    W = parameter(shape=(4096, n_classes), init=glorot_uniform())
    b = parameter(shape=n_classes, init=0)
    z = times(fc_out, W) + b

    return z, fc_out
def frcn_predictor(features, rois, n_classes):
    """Assemble a Fast R-CNN predictor from a pretrained classification network.

    `model_file`, `feature_node_name`, `last_conv_node_name`,
    `pool_node_name`, `last_hidden_node_name` and `roi_dim` are read from
    the surrounding scope.

    Args:
        features: image input; 114 is subtracted from it before the
            convolutional layers are applied.
        rois: regions of interest for the ROI pooling step.
        n_classes: number of classes of the new output layer.

    Returns:
        The class-score node times(fc_out, W) + b.
    """
    # Load the pretrained classification net and locate the nodes to cut at
    loaded_model = load_model(model_file)
    wanted = (feature_node_name, last_conv_node_name,
              pool_node_name, last_hidden_node_name)
    feature_node, conv_node, pool_node, last_node = [
        find_by_name(loaded_model, node_name) for node_name in wanted]

    # The conv part is frozen, the fully connected part is cloned;
    # each gets a fresh placeholder as its input.
    conv_part = combine([conv_node.owner])
    conv_layers = conv_part.clone(CloneMethod.freeze, {feature_node: Placeholder()})
    fc_part = combine([last_node.owner])
    fc_layers = fc_part.clone(CloneMethod.clone, {pool_node: Placeholder()})

    # Create the Fast R-CNN model
    feat_norm = features - Constant(114)
    conv_out = conv_layers(feat_norm)
    pooled_rois = roipooling(conv_out, rois, (roi_dim, roi_dim))
    fc_out = fc_layers(pooled_rois)

    # Explicit linear layer: a Dense layer with map_rank=1 is not yet supported
    W = parameter(shape=(4096, n_classes), init=glorot_uniform())
    b = parameter(shape=n_classes, init=0)
    z = times(fc_out, W) + b
    return z
def gru_cell(shape, init=glorot_uniform(), name=''):
    """
    GRU cell function.

    Builds the update gate z, the reset gate r and the candidate h from the
    input placeholder and the previous-state placeholder, and combines them
    as (1 - z) * h + z * previous_state.

    Args:
        shape: shape of the state; must be rank 1.
        init: initializer shared by all six weight parameters.
        name: name given to the final plus node.

    Returns:
        combine([s]) for the new state s. The returned function carries a
        `create_placeholder` attribute that creates a state placeholder of
        the right shape.

    Raises:
        ValueError: if `shape` is not rank 1.
    """
    shape = _as_tuple(shape)
    if len(shape) != 1:
        raise ValueError("gru_cell: shape must be vectors (rank-1 tensors)")

    # recurrent weights: the single state dimension repeated twice
    recurrent_shape = shape * 2
    # input weights: input dimension inferred, output dimension is the state
    input_shape = _INFERRED + shape

    # parameters
    Wz = Parameter(recurrent_shape, init=init, name='Wz')
    Wr = Parameter(recurrent_shape, init=init, name='Wr')
    Wh = Parameter(recurrent_shape, init=init, name='Wh')
    Uz = Parameter(input_shape, init=init, name='Uz')
    Ur = Parameter(input_shape, init=init, name='Ur')
    Uh = Parameter(input_shape, init=init, name='Uh')

    def create_s_placeholder():
        # we pass the known dimensions here, which makes dimension inference easier
        return Placeholder(shape=shape, name='S')

    # parameters to model function
    x = Placeholder(name='gru_block_arg')
    prev_state = create_s_placeholder()

    # update gate
    z_in = times(x, Uz, name='x*Uz') + times(prev_state, Wz, name='Sprev*Wz')
    z = sigmoid(z_in, name='z')

    # reset gate
    r_in = times(x, Ur, name='x*Ur') + times(prev_state, Wr, name='Sprev*Wr')
    r = sigmoid(r_in, name='r')

    # candidate state, computed from the reset-gated previous state
    h_in = times(x, Uh, name='x*Uh') + times(
        element_times(prev_state, r, name='Sprev*r'), Wh)
    h = tanh(h_in, name='h')

    # blend the candidate and the previous state with the update gate
    fresh_part = element_times((1 - z), h, name='(1-z)*h')
    kept_part = element_times(z, prev_state, name='z*SPrev')
    s = plus(fresh_part, kept_part, name=name)

    apply_x_s = combine([s])
    apply_x_s.create_placeholder = create_s_placeholder
    return apply_x_s
def resnet_classifer(input, num_classes, device, output_name):
    """Build a three-stage residual network classifier.

    The network consists of one conv/batch-norm/ReLU layer, followed by three
    stages of three residual nodes each (16, 32 and 64 feature maps). The
    second and third stages start with a map-increasing node that uses a
    projection map. An 8x8 average pooling and a linear output layer follow.

    Args:
        input: Input node of the network.
        num_classes: Size of the output layer.
        device: Device id forwarded to every layer and parameter.
        output_name: Name passed to the final ``plus`` node.

    Returns:
        The ``plus`` node adding the output bias to the final projection.
    """
    # Hyper-parameters shared by every convolutional node.
    conv_b_value = 0
    sc_value = 1
    bn_time_const = 4096
    kernel_width = 3
    kernel_height = 3
    conv1_w_scale = 0.26

    def _plain_nodes(node, n_maps, count):
        # Chain `count` residual nodes that keep the number of feature maps.
        for _ in range(count):
            node = resnet_node2(node.output(), n_maps, kernel_width,
                                kernel_height, conv1_w_scale, conv_b_value,
                                sc_value, bn_time_const, device)
        return node

    def _widening_node(node, n_maps_out, n_maps_in):
        # Residual node that raises the map count via a projection map.
        w_proj = get_projection_map(n_maps_out, n_maps_in, device)
        return resnet_node2_inc(node.output(), n_maps_out, kernel_width,
                                kernel_height, conv1_w_scale, conv_b_value,
                                sc_value, bn_time_const, w_proj, device)

    c_map1 = 16
    c_map2 = 32
    c_map3 = 64

    # Stage 1: initial convolution plus three plain residual nodes.
    conv1 = conv_bn_relu_layer(input, c_map1, kernel_width, kernel_height,
                               1, 1, conv1_w_scale, conv_b_value, sc_value,
                               bn_time_const, device)
    stage1 = _plain_nodes(conv1, c_map1, 3)

    # Stages 2 and 3: one widening node followed by two plain nodes each.
    stage2 = _plain_nodes(_widening_node(stage1, c_map2, c_map1), c_map2, 2)
    stage3 = _plain_nodes(_widening_node(stage2, c_map3, c_map2), c_map3, 2)

    # Global average pooling.
    pool_width = 8
    pool_height = 8
    stride_vertical = 1
    stride_horizontal = 1
    pool = pooling(stage3.output(), AVG_POOLING,
                   (1, pool_height, pool_width),
                   (1, stride_vertical, stride_horizontal))

    # Linear output layer.
    out_times_params = parameter(shape=(c_map3, 1, 1, num_classes),
                                 device_id=device)
    out_bias_params = parameter(shape=(num_classes, ), device_id=device)
    t = times(pool.output(), out_times_params)
    return plus(t.output(), out_bias_params, output_name)
def LSTM(shape, cell_shape=None, use_peepholes=use_peepholes_default_or_False,
         init=init_default_or_glorot_uniform,
         init_bias=init_bias_default_or_0,
         enable_self_stabilization=enable_self_stabilization_default_or_False):
    """Create an LSTM block mapping (x, (h, c)) to a new (h, c).

    Args:
        shape: Dimension of the output ``h``; must be rank 1.
        cell_shape: Dimension of the cell ``c``; must be rank 1. When given,
            a final projection from cell dimension to ``shape`` is added.
            When omitted, ``shape`` is used for the cell as well.
        use_peepholes: Whether the gates also see the cell value. Falls back
            to the current default options when not given.
        init: Initializer for the weight parameters.
        init_bias: Initializer for the bias parameter.
        enable_self_stabilization: Whether to wrap the recurrent values in
            ``Stabilizer`` instances. Falls back to the current default
            options when not given.

    Returns:
        ``combine([h, c])``. The returned object carries a
        ``create_placeholder`` attribute that creates the (h, c) placeholder
        pair for recurrence; as noted in the original code, it exists only
        on this object, not on cloned versions of it.

    Raises:
        ValueError: If ``shape`` or ``cell_shape`` is not rank 1.
    """
    if not _is_given(use_peepholes):
        use_peepholes = _current_default_options.use_peepholes
    if not _is_given(enable_self_stabilization):
        enable_self_stabilization = \
            _current_default_options.enable_self_stabilization

    has_projection = cell_shape is not None
    has_aux = False  # aux input is currently hard-wired off

    if has_aux:
        UntestedBranchError("LSTM, has_aux option")

    shape = _as_tuple(shape)
    cell_shape = _as_tuple(cell_shape) if has_projection else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        # otherwise slicing and parameter initializers would need fixing
        raise ValueError("LSTM: shape and cell_shape must be vectors (rank-1 tensors)")

    # Stack the four gate projections along the fastest-changing axis.
    stack_axis = -1
    stacked_dim = cell_shape[0]
    cell_shape_stacked = cell_shape[:stack_axis] + (stacked_dim * 4,)

    # Parameters.
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')        # bias
    W = Parameter(_INFERRED + cell_shape_stacked, init=init, name='W')  # input
    if has_aux:
        A = Parameter(_INFERRED + cell_shape_stacked, init=init, name='A')
    else:
        A = None
    H = Parameter(shape + cell_shape_stacked, init=init, name='H')  # hidden-to-hidden
    # Peephole weights are applied elementwise to the cell value.
    if use_peepholes:
        Ci = Parameter(cell_shape, init=init, name='Ci')
        Cf = Parameter(cell_shape, init=init, name='Cf')
        Co = Parameter(cell_shape, init=init, name='Co')
    else:
        Ci = Cf = Co = None
    # Final projection from cell dimension to output dimension.
    Wmr = Parameter(cell_shape + shape, init=init) if has_projection else None

    def _new_stabilizer():
        return Stabilizer() if enable_self_stabilization else identity

    Sdh = _new_stabilizer()
    Sdc = _new_stabilizer()
    Sct = _new_stabilizer()
    Sht = _new_stabilizer()

    def create_hc_placeholder():
        # Passing the known dimensions makes dimension inference easier.
        h_placeholder = Placeholder(shape=shape, name='hPh')
        c_placeholder = Placeholder(shape=cell_shape, name='cPh')
        return (h_placeholder, c_placeholder)

    # Arguments of the model function.
    x = Placeholder(name='lstm_block_arg')
    prev_state = create_hc_placeholder()

    # Previous values, stabilized. The input x gets no stabilizer here;
    # the caller is meant to apply one outside.
    dh, dc = prev_state
    dhs = Sdh(dh)
    dcs = Sdc(dc)

    # Projected contribution from input, hidden state and bias.
    proj4 = b + times(x, W) + times(dhs, H)
    if has_aux:
        # Dead while has_aux is False; `aux` is not defined in this function.
        proj4 = proj4 + times(aux, A)

    def _gate_slice(index):
        # Cut the index-th quarter out of the stacked projection.
        return slice(proj4, stack_axis,
                     index * stacked_dim, (index + 1) * stacked_dim)

    it_proj = _gate_slice(0)
    bit_proj = _gate_slice(1)
    ft_proj = _gate_slice(2)
    ot_proj = _gate_slice(3)

    def peep(x, c, C):
        # Add the peephole connection if requested.
        if use_peepholes:
            return x + C * c
        return x

    it = sigmoid(peep(it_proj, dcs, Ci))       # input gate(t)
    bit = it * tanh(bit_proj)                  # gated tanh of input network
    ft = sigmoid(peep(ft_proj, dcs, Cf))       # forget gate(t)
    bft = ft * dc                              # gated cell(t-1)
    ct = bft + bit                             # c(t) is the sum of both
    ot = sigmoid(peep(ot_proj, Sct(ct), Co))   # output gate(t)
    ht = ot * tanh(ct)                         # gated tanh(cell(t))

    c = ct
    if has_projection:
        h = times(Sht(ht), Wmr)
    else:
        h = ht

    _name_node(h, 'h')
    if _trace_layers:
        _log_node(h)
    _name_node(c, 'c')

    apply_x_h_c = combine([h, c])
    # Hand the caller a helper that creates the placeholders for recurrence.
    apply_x_h_c.create_placeholder = create_hc_placeholder
    # Wrapping the result in Block(apply_x_h_c, 'LSTM') was noted to fail
    # with "A Function instance with more than one output cannot be
    # implicitly converted to a Variable", so the raw function is returned.
    return apply_x_h_c
def no_op(input):
    """Return ``times(input, I)``.

    ``I`` is taken from the enclosing scope; given the function name it is
    presumably an identity matrix - confirm at the definition site.
    """
    product = times(input, I)
    return product
def test_op_times(left_operand, right_operand, device_id, precision,
                  left_matrix_type, right_matrix_type):
    """Check the forward and backward pass of ``times``.

    The test is skipped when the right operand is sparse. The backward pass
    is only checked when neither operand is sparse.
    """
    if right_matrix_type == 'sparse':
        pytest.skip('second operator of times() has to be dense')

    dt = PRECISION_TO_TYPE[precision]

    # Keyword arguments shared by every unittest_helper call below.
    common = dict(device_id=device_id, precision=precision, clean_up=True)

    # ---- Forward pass ----
    # Two surrounding brackets are needed around the expected value:
    # one for the sequence (length 1) and one for the batch of one sample.
    product = np.dot(AA(left_operand, dtype=dt), AA(right_operand, dtype=dt))
    expected = [[product]]

    if left_matrix_type == 'sparse':
        a = SI(*batch_dense_to_sparse([left_operand]))
    else:
        a = I([left_operand])
    b = I([right_operand])

    from cntk.ops import times, constant

    left_as_input = times(a, constant(right_operand))
    right_as_input = times(constant(left_operand), b)

    def check_forward(node):
        unittest_helper(node, None, expected, backward_pass=False, **common)

    check_forward(left_as_input)
    check_forward(right_as_input)
    check_forward(times(a, b))

    # ---- Backward pass ----
    def op_grad(A, B):
        '''
        Compute the expected gradient for the product of two matrices.

        Let A be 2x2 and B be 2x1, then we have
        [a11 a12] [b11]  =  [ a11 b11 + a12 b21 ]
        [a21 a22] [b21]     [ a21 b11 + a22 b21 ]

        The derivative for A with respect to B is
        [b11 b21]
        [b11 b21]

        The derivative for B with respect to A:
        [a11 + a12]
        [a21 + a22]
        '''
        assert len(A.shape) == len(B.shape) == 2
        grad = np.zeros_like(A)
        # Every row of the result is the vector of row sums of B.
        grad[:, :] = B.sum(axis=1)
        return grad

    # FIXME (from the original): disabled for sparse operands until the
    # Pass node supports sparse.
    if 'sparse' in (left_matrix_type, right_matrix_type):
        return

    expected_left = [[op_grad(AA(left_operand, dtype=dt),
                              AA(right_operand, dtype=dt))]]
    expected_right = [[op_grad(AA(right_operand, dtype=dt).T,
                               AA(left_operand, dtype=dt).T).T]]

    unittest_helper(left_as_input, None, expected_left,
                    backward_pass=True, input_node=a, **common)
    # The original carries the remark "BUG: Fails because of Pass node?"
    # at this point.
    unittest_helper(right_as_input, None, expected_right,
                    backward_pass=True, input_node=b, **common)
def termination_gate(init=glorot_uniform(), name=''):
    """Create a sigmoid gate over a ``status`` placeholder.

    Args:
        init: Initializer for the weight ``Wt``.
        name: Name given to the resulting sigmoid node.

    Returns:
        ``sigmoid(times(status, Wt))`` where ``status`` is a fresh
        placeholder variable and ``Wt`` has an inferred input dimension and
        an output dimension of 1.
    """
    weight_shape = _INFERRED + (1,)
    Wt = Parameter(weight_shape, init=init, name='Wt')
    status = placeholder_variable(name='status')
    projected = times(status, Wt)
    return sigmoid(projected, name=name)
def _linear(x):
    """Apply ``ops.times(x, sc)`` and then add ``b``.

    ``sc`` and ``b`` are taken from the enclosing scope.
    """
    result = ops.times(x, sc)
    result += b
    return result
else: recurrence_hook_h = lambda operand: element_select( is_first_label, thought_vector_broadcast_h, past_value(operand)) recurrence_hook_c = lambda operand: element_select( is_first_label, thought_vector_broadcast_c, past_value(operand)) (decoder_output_h, decoder_output_c) = LSTM_layer(decoder_output_h.output, hidden_dim, recurrence_hook_h, recurrence_hook_c) # 1. # Add the linear layer W = parameter(shape=(decoder_output_h.shape[0], label_vocab_dim), init=glorot_uniform()) B = parameter(shape=(label_vocab_dim), init=0) z = plus(B, times(decoder_output_h, W)) def create_model(): # Source and target inputs to the model batch_axis = Axis.default_batch_axis() input_seq_axis = Axis('inputAxis') label_seq_axis = Axis('labelAxis') input_dynamic_axes = [batch_axis, input_seq_axis] raw_input = input_variable(shape=(input_vocab_dim), dynamic_axes=input_dynamic_axes, name='raw_input') label_dynamic_axes = [batch_axis, label_seq_axis]
def create_model():
    """Build the sequence-to-sequence translation network.

    An encoder stack of ``num_layers`` LSTM layers runs over the input
    sequence with ``future_value`` as its recurrence. Its first-step
    outputs (h and c) are broadcast along the label sequence and prime the
    first decoder layer. The decoder stack is followed by a linear output
    layer.

    Returns:
        The output node ``z`` of the linear layer.
    """
    # Source and target inputs to the model.
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    raw_input = input_variable(
        shape=(input_vocab_dim),
        dynamic_axes=[batch_axis, input_seq_axis],
        name='raw_input')
    raw_labels = input_variable(
        shape=(label_vocab_dim),
        dynamic_axes=[batch_axis, label_seq_axis],
        name='raw_labels')

    input_sequence = raw_input

    # Drop the sentence start token from the labels for decoder training:
    # <s> A B C </s>  -->  A B C </s>
    label_sequence = sequence.slice(raw_labels, 1, 0, name='label_sequence')
    label_sentence_start = sequence.first(raw_labels)  # <s>

    # Primer for the decoder.
    is_first_label = sequence.is_first(label_sequence)  # 1 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(
        label_sentence_start, is_first_label)

    # ---- Encoder ----
    stabilize = Stabilizer()
    encoder_output_h = stabilize(input_sequence)
    for _ in range(num_layers):
        encoder_output_h, encoder_output_c = LSTM_layer(
            encoder_output_h.output, hidden_dim, future_value, future_value)

    # Prepare the encoder output for use in the decoder.
    thought_vector_h = sequence.first(encoder_output_h)
    thought_vector_c = sequence.first(encoder_output_c)
    thought_vector_broadcast_h = sequence.broadcast_as(
        thought_vector_h, label_sequence)
    thought_vector_broadcast_c = sequence.broadcast_as(
        thought_vector_c, label_sequence)

    # ---- Decoder ----
    decoder_history_hook = alias(
        label_sequence, name='decoder_history_hook')  # copy of label_sequence
    decoder_input = element_select(
        is_first_label,
        label_sentence_start_scattered,
        past_value(decoder_history_hook))

    def primed_past_h(operand):
        # At the first label use the encoder's h, otherwise the past value.
        return element_select(
            is_first_label, thought_vector_broadcast_h, past_value(operand))

    def primed_past_c(operand):
        # At the first label use the encoder's c, otherwise the past value.
        return element_select(
            is_first_label, thought_vector_broadcast_c, past_value(operand))

    decoder_output_h = stabilize(decoder_input)
    for layer_index in range(num_layers):
        if layer_index == 0:
            # Only the first decoder layer is primed with the encoder state.
            recurrence_hook_h = primed_past_h
            recurrence_hook_c = primed_past_c
        else:
            recurrence_hook_h = past_value
            recurrence_hook_c = past_value

        decoder_output_h, decoder_output_c = LSTM_layer(
            decoder_output_h.output, hidden_dim,
            recurrence_hook_h, recurrence_hook_c)

    # ---- Linear output layer ----
    W = parameter(shape=(decoder_output_h.shape[0], label_vocab_dim),
                  init=glorot_uniform())
    B = parameter(shape=(label_vocab_dim), init=0)
    z = plus(B, times(stabilize(decoder_output_h), W))
    return z
def rnn(dh, x):
    """Compute one step of a simple recurrent unit.

    Uses ``Sdh``, ``Sht``, ``W``, ``H``, ``b``, ``Wmr``, ``activation`` and
    ``has_projection`` from the enclosing scope.

    Args:
        dh: Previous hidden state.
        x: Current input.

    Returns:
        ``Function.NamedOutput`` with the new state under the name ``h``.
    """
    dhs = Sdh(dh)  # previous value, stabilized

    from_input = times(x, W)
    from_hidden = times(dhs, H)
    ht = activation(from_input + from_hidden + b)

    if has_projection:
        h = times(Sht(ht), Wmr)
    else:
        h = ht
    return Function.NamedOutput(h=h)
def _sparse_to_dense_network_cache(input_shape):
    """Build a network multiplying an input of ``input_shape`` by an identity matrix.

    The identity matrix is square, sized by the last dimension of
    ``input_shape``. Judging by the function name this is used to turn
    sparse input into dense output - confirm against the callers.

    Args:
        input_shape: Shape of the input variable to create.

    Returns:
        The ``times`` node of the input variable and ``np.eye(n)``.
    """
    from cntk.ops import input_variable, times

    identity_size = input_shape[-1]
    network_input = input_variable(input_shape)
    return times(network_input, np.eye(identity_size))
def rnn_step(dh, x):
    """Compute one step of a recurrent unit with an elementwise recurrent term.

    Uses ``Sdh``, ``Sht``, ``W``, ``H``, ``b``, ``Wmr``, ``activation`` and
    ``has_projection`` from the enclosing scope.

    Args:
        dh: Previous hidden state.
        x: Current input.

    Returns:
        The new hidden state, projected through ``Wmr`` when
        ``has_projection`` is set.
    """
    dhs = Sdh(dh)  # previous value, stabilized

    from_input = times(x, W)
    # NOTE(review): the recurrent term is the elementwise product dhs * H,
    # not times(dhs, H) - confirm this is intended.
    from_hidden = dhs * H
    ht = activation(from_input + from_hidden + b)

    if has_projection:
        return times(Sht(ht), Wmr)
    return ht