def gaussian_mdn_coeff(x, nmix: int, ndim: int):
    """
    Split a flat tensor into the coefficients of a gaussian mixture density network.

    The dimensions of each gaussian are treated as independent, and each
    mixture component carries a single variance shared by all its dimensions.

    Example:
        ndim, nmix = 1, 3
        a = C.input_variable(ndim)
        prediction = Dense((ndim + 2) * nmix)(a)
        coeffs = C.combine(gaussian_mdn_coeff(prediction, nmix=nmix, ndim=ndim)).eval({a: x})

        alpha, mu, sigma = coeffs.values()

    Arguments:
        x: input tensor; must be 1d. Entries ``[0, nmix)`` feed alpha,
            ``[nmix, 2 * nmix)`` feed sigma and
            ``[2 * nmix, (ndim + 2) * nmix)`` feed mu.
        nmix (int): number of mixture
        ndim (int): number of dimension of gaussian

    Returns:
        tuple: ``(alpha, mu, sigma)`` where alpha is the softmax of its
        slice, mu is reshaped to ``(nmix, ndim)`` and sigma is the
        exponential of its slice.

    Raises:
        ValueError: if ``x`` is not a 1d tensor.
    """
    if len(x.shape) != 1:
        message = "Must be a 1d tensor, but input has shape {0}".format(x.shape)
        raise ValueError(message)

    # Boundaries of the three consecutive segments along axis 0.
    alpha_end = nmix
    sigma_end = 2 * nmix
    mu_end = (ndim + 2) * nmix

    raw_alpha = C.slice(x, 0, 0, alpha_end)
    alpha = C.softmax(raw_alpha, name='alpha')

    # common variance for all components in single gaussian kernel
    raw_sigma = C.slice(x, 0, alpha_end, sigma_end)
    sigma = C.exp(raw_sigma, name='sigma')

    raw_mu = C.slice(x, 0, sigma_end, mu_end)
    mu = C.reshape(raw_mu, shape=(nmix, ndim), name='mu')

    return alpha, mu, sigma
def inner(query, key, value):
    """Apply multi-head attention to ``query``, ``key`` and ``value``.

    Each input is projected by its own linear layer, cut into ``num_heads``
    chunks of ``head_dim`` along axis 0, attended per head, spliced back
    together and passed through the final linear layer.
    """
    # presumably every projection yields model_dim features on axis 0 -- confirm
    projected_q = query_linear(query)
    projected_k = key_linear(key)
    projected_v = value_linear(value)

    def split_heads(projected):
        # Consecutive, non-overlapping head_dim-wide chunks along axis 0.
        return [
            C.slice(projected, 0, head * head_dim, (head + 1) * head_dim)
            for head in range(num_heads)
        ]

    # TODO: re-implement `ScaledDotProductAttention` when cntk has BatchMatMul
    # so there's no need to slice here
    per_head_q = split_heads(projected_q)
    per_head_k = split_heads(projected_k)
    per_head_v = split_heads(projected_v)

    # One attention output per head, kept in head order.
    attention_outputs = []
    for q, k, v in zip(per_head_q, per_head_k, per_head_v):
        attention_outputs.append(scaled_dot_product_attention(q, k, v))

    merged = C.splice(*attention_outputs)
    return multihead_liner(merged)
def create_model(input_dim):
    """Build the two-stream (row/col) LightLSTM model.

    Args:
        input_dim: shape of the row/col inputs and labels, and the output
            size of both prediction heads.

    Returns:
        dict with the input variables (``'row'``, ``'col'``), the label
        variables (``'row_label'``, ``'col_label'``) and ``'model'``, the
        combination of the row and column predictions.
    """
    def embedding_stack():
        return Sequential([Embedding(opt.embed), Stabilizer(), Dropout(opt.dropout)])

    def prediction_head():
        return Sequential([Dropout(opt.dropout), Dense(input_dim)])

    row = sequence.input_variable(shape=input_dim)
    col = sequence.input_variable(shape=input_dim)

    # Each stream gets its own embedding stack; the results are joined on the last axis.
    row_embedded = embedding_stack()(row)
    col_embedded = embedding_stack()(col)
    hidden = C.splice(row_embedded, col_embedded, axis=-1)

    # First layer maps embeddings to hidden size, the remaining layers keep it.
    hidden = lightlstm(opt.embed, opt.nhid)(hidden)
    hidden = For(range(opt.layer-1), lambda: lightlstm(opt.nhid, opt.nhid))(hidden)

    # Split the last axis back into its row half and its column half.
    row_hidden = C.slice(hidden, -1, opt.nhid * 0, opt.nhid * 1)
    col_hidden = C.slice(hidden, -1, opt.nhid * 1, opt.nhid * 2)

    row_predict = prediction_head()(row_hidden)
    col_predict = prediction_head()(col_hidden)

    # variable : row label and col label
    row_label = sequence.input_variable(shape=input_dim)
    col_label = sequence.input_variable(shape=input_dim)

    model = C.combine([row_predict, col_predict])

    return {
        'row': row,
        'col': col,
        'row_label': row_label,
        'col_label': col_label,
        'model': model,
    }
def create_network():
    """Build the frame-difference classifier together with its loss and metric.

    Returns:
        dict with keys ``'input'``, ``'target'``, ``'model'``, ``'loss'``
        and ``'metric'``.
    """
    # Create the input and target variables
    input_shape = (sequence_length, frame_height, frame_width)
    input_var = cntk.input_variable(input_shape, name='input_var')
    target_var = cntk.input_variable((num_classes, ),
                                     is_sparse=True,
                                     name='target_var')

    # Entries [0, 19) and [1, 20) along axis 0; subtracting them gives the
    # difference between each entry and its predecessor.
    # NOTE(review): 19/20 are hard-coded -- presumably sequence_length is 20; confirm.
    leading = cntk.slice(input_var, axis=0, begin_index=0, end_index=19)
    trailing = cntk.slice(input_var, axis=0, begin_index=1, end_index=20)
    frame_delta = trailing - leading

    stack = [
        resnet_model(cntk.placeholder()),
        Label('resnet'),
        Dense(num_classes, name='output'),
    ]
    model = Sequential(stack)(frame_delta)

    loss = cntk.cross_entropy_with_softmax(model, target_var)
    metric = cntk.classification_error(model, target_var)

    return {
        'input': input_var,
        'target': target_var,
        'model': model,
        'loss': loss,
        'metric': metric,
    }
def createDecoderNetwork(self, networkHiddenSrc, srcLength, trgLength):
    """Unroll the decoder for ``trgLength`` steps and sum the masked cross entropy.

    Args:
        networkHiddenSrc: encoder hidden states; index 0 on axis 0 seeds
            the decoder.
        srcLength: source length forwarded to the decoder RNN step.
        trgLength: number of decoder steps to unroll.

    Returns:
        The accumulated masked cross entropy (the integer ``0`` when
        ``trgLength`` is 0).
    """
    # The second SrcHiddenSize-wide half of the first source hidden state
    # initialises the decoder.
    firstSrcHidden = C.slice(networkHiddenSrc, 0, 0, 1)
    srcSentEmb = C.slice(firstSrcHidden, -1, Config.SrcHiddenSize,
                         Config.SrcHiddenSize * 2)

    targetShape = (Config.TrgMaxLength, Config.BatchSize, Config.TrgVocabSize)
    inputTrg = C.reshape(self.inputMatrixTrg, shape=targetShape)

    hiddenPerStep = []
    attProbAll = []  # collected attention probabilities (not returned)
    totalCrossEntropy = 0

    for step in range(trgLength):
        if step == 0:
            preTrgEmb = self.initTrgEmb
            curHidden = self.createDecoderInitNetwork(srcSentEmb)
        else:
            preTrgEmb = self.EmbTrg(inputTrg[step - 1])
            curHidden, attProb = self.createDecoderRNNNetwork(
                networkHiddenSrc, preTrgEmb, hiddenPerStep[step - 1],
                srcLength)
            if step == 1:
                attProbAll = attProb
            else:
                attProbAll = C.splice(attProbAll, attProb, axis=0)
        hiddenPerStep.append(curHidden)

        preSoftmax = self.createReadOutNetwork(curHidden, preTrgEmb)
        stepLoss = C.cross_entropy_with_softmax(preSoftmax, inputTrg[step], 2)
        stepLoss = C.reshape(stepLoss, shape=(1, Config.BatchSize))
        # Weight this step's loss by the target mask before accumulating.
        totalCrossEntropy += C.times_transpose(stepLoss,
                                               self.maskMatrixTrg[step])

    return totalCrossEntropy
def lightlstm(input_dim, cell_dim):
    """Build a recurrent LightLSTM block over a placeholder input.

    The placeholder input is cut along its last axis into two consecutive
    ``input_dim``-wide halves. One LSTM cell (a single shared set of
    parameters) is applied to the first half and then to the second half,
    chaining the hidden/cell state from the first application into the second.
    The state after the second application is fed back, delayed through
    ``past_value``, as the previous state of the first application.

    Args:
        input_dim: width of each of the two input halves.
        cell_dim: width of each gate projection (the parameters are
            ``4 * cell_dim`` wide).

    Returns:
        The splice, along the last axis, of the hidden outputs of the first
        and second cell applications.
    """
    x = C.placeholder(name='x')
    dh = C.placeholder(name='dh')
    dc = C.placeholder(name='dc')
    # First and second input_dim-wide halves of the last axis.
    x1 = C.slice(x, -1, input_dim * 0, input_dim * 1)
    x2 = C.slice(x, -1, input_dim * 1, input_dim * 2)

    def LSTMCell(x, y, dh, dc):
        '''LightLSTM Cell

        Runs the same LSTM step twice with shared parameters: first on
        ``x`` with previous state ``(dh, dc)``, then on ``y`` with the state
        produced by the first step.

        Returns:
            tuple ``(ht, ct, ht2, ct2)``: hidden and cell state after the
            first step, then after the second step.
        '''
        # Parameters shared by both steps.
        b = C.parameter(shape=(4 * cell_dim), init=0)
        W = C.parameter(shape=(input_dim, 4 * cell_dim), init=glorot_uniform())
        H = C.parameter(shape=(cell_dim, 4 * cell_dim), init=glorot_uniform())

        # projected contribution from input x, hidden, and bias
        proj4 = b + C.times(x, W) + C.times(dh, H)

        # Four consecutive cell_dim-wide chunks of the stacked projection.
        it_proj = C.slice(proj4, -1, 0 * cell_dim, 1 * cell_dim)
        bit_proj = C.slice(proj4, -1, 1 * cell_dim, 2 * cell_dim)
        ft_proj = C.slice(proj4, -1, 2 * cell_dim, 3 * cell_dim)
        ot_proj = C.slice(proj4, -1, 3 * cell_dim, 4 * cell_dim)

        it = C.sigmoid(it_proj)  # input gate
        bit = it * C.tanh(bit_proj)

        ft = C.sigmoid(ft_proj)  # forget gate
        bft = ft * dc

        ct = bft + bit
        ot = C.sigmoid(ot_proj)  # output gate
        ht = ot * C.tanh(ct)

        # projected contribution from input y, hidden, and bias
        # (second step: previous state is (ht, ct) from the first step)
        proj4_2 = b + C.times(y, W) + C.times(ht, H)

        it_proj_2 = C.slice(proj4_2, -1, 0 * cell_dim, 1 * cell_dim)
        bit_proj_2 = C.slice(proj4_2, -1, 1 * cell_dim, 2 * cell_dim)
        ft_proj_2 = C.slice(proj4_2, -1, 2 * cell_dim, 3 * cell_dim)
        ot_proj_2 = C.slice(proj4_2, -1, 3 * cell_dim, 4 * cell_dim)

        it_2 = C.sigmoid(it_proj_2)  # input gate
        bit_2 = it_2 * C.tanh(bit_proj_2)

        ft_2 = C.sigmoid(ft_proj_2)  # forget gate
        bft_2 = ft_2 * ct

        ct2 = bft_2 + bit_2
        ot_2 = C.sigmoid(ot_proj_2)  # output gate
        ht2 = ot_2 * C.tanh(ct2)
        return (ht, ct, ht2, ct2)

    Cell = LSTMCell(x1, x2, dh, dc)

    # Close the recurrence: the state after the second step (Cell[2], Cell[3]),
    # delayed by past_value, replaces the dh/dc placeholders.
    actualDh = past_value(Cell[2])
    actualDc = past_value(Cell[3])

    # NOTE(review): replacement is invoked on Cell[0] only; presumably this
    # rewires the graph shared with Cell[2] as well -- confirm.
    Cell[0].replace_placeholders(
        {dh: actualDh.output, dc: actualDc.output})
    return C.splice(Cell[0], Cell[2], axis=-1)
def inner(a):
    """Return sequence ``a`` with its elements in reverse order."""
    # Unpack into a padded tensor plus a mask tensor (padding value 0).
    unpacked = C.sequence.unpack(a, padding_value=0)
    padded, mask = unpacked.outputs

    # Stride -1 along axis 0 flips both the data and its mask.
    flipped_data = C.slice(padded, 0, 0, 0, -1)
    flipped_mask = C.slice(mask, 0, 0, 0, -1)

    data_seq = C.to_sequence(flipped_data)
    mask_seq = C.to_sequence(C.expand_dims(flipped_mask, axis=-1))

    # Gather the steps selected by the flipped mask.
    return C.sequence.gather(data_seq, mask_seq)
def test_slice_attributes():
    """Check the attribute dictionary of single- and multi-axis strided slices."""
    x = C.input_variable((2, 3))

    # Single axis, default stride.
    single = C.slice(x, 0, 1, 2)
    actual = single.root_function.attributes
    _check(
        {
            'endIndex': 2,
            'beginIndex': 1,
            'axis': ('ordered', 'static', 1),
            'sliceStrides': 1,
        },
        actual,
    )

    # Two axes with explicit strides.
    multi = C.slice(x, [0, 1], [1, 0], [2, 2], [-1, 1])
    actual = multi.root_function.attributes
    _check(
        {
            'endIndexVec': [2, 2],
            'beginIndexVec': [1, 0],
            'axisVec': [('ordered', 'static', 1), ('ordered', 'static', 0)],
            'sliceStridesVec': [-1, 1],
        },
        actual,
    )
def gaussian_windows_attention_coefficients(abk, nb_mixtures):
    """
    Split ``abk`` along axis 0 into the a, b and k attention coefficients.

    ``a`` takes entries ``[0, nb_mixtures)``, ``b`` takes
    ``[nb_mixtures, 2 * nb_mixtures)`` and ``k`` takes everything from
    ``2 * nb_mixtures`` to the end. ``k`` is additionally passed through
    ``Recurrence(C.plus)``. Each result gets a trailing axis appended.

    Returns:
        tuple ``(a, b, k)``
    """
    first_cut = nb_mixtures
    second_cut = 2 * nb_mixtures

    a_part = C.slice(abk, 0, 0, first_cut)
    b_part = C.slice(abk, 0, first_cut, second_cut)
    # End index 0 means "up to the end of the axis".
    k_part = C.slice(abk, 0, second_cut, 0)

    k_part = Recurrence(C.plus)(k_part)

    a_part = C.expand_dims(a_part, axis=-1)
    b_part = C.expand_dims(b_part, axis=-1)
    k_part = C.expand_dims(k_part, axis=-1)
    return a_part, b_part, k_part
def test_Slice(tmpdir):
    """Verify slice on a constant array, on one axis, and on two axes at once."""
    data = np.asarray([[1, 2, -3], [4, 5, 6]], dtype=np.float32)
    x1 = C.input_variable((2, 3))

    # Slice applied directly to the NumPy data (no model input).
    constant_slice = C.slice(data, 0, 1, 2)
    verify_no_input(constant_slice, tmpdir, 'Slice_0')

    # Same slice applied to an input variable.
    single_axis = C.slice(x1, 0, 1, 2)
    verify_one_input(single_axis, data, tmpdir, 'Slice_1')

    # Slice over both axes in one call.
    two_axes = C.slice(x1, [0, 1], [1, 0], [2, 1])
    verify_one_input(two_axes, data, tmpdir, 'Slice2_1')
def test_Slice(tmpdir, dtype):
    """Verify slice for the given ``dtype``: constant data, one axis, two axes."""
    with C.default_options(dtype=dtype):
        rows = [[1, 2, -3], [4, 5, 6]]
        data = np.asarray(rows, dtype=dtype)
        x1 = C.input_variable((2, 3))

        # Slice applied directly to the NumPy data (no model input).
        verify_no_input(C.slice(data, 0, 1, 2), tmpdir, 'Slice_0')

        # Same slice applied to an input variable.
        verify_one_input(C.slice(x1, 0, 1, 2), data, tmpdir, 'Slice_1')

        # Slice over both axes in one call.
        verify_one_input(C.slice(x1, [0, 1], [1, 0], [2, 1]), data, tmpdir,
                         'Slice2_1')
def test_Slice(tmpdir, dtype):
    """Run the slice checks under ``dtype`` as the default option."""
    with C.default_options(dtype=dtype):
        data = np.asarray([[1, 2, -3], [4, 5, 6]], dtype=dtype)
        x1 = C.input_variable((2, 3))

        # Constant input: the slice operates on the NumPy array itself.
        on_constant = C.slice(data, 0, 1, 2)
        verify_no_input(on_constant, tmpdir, 'Slice_0')

        # Variable input, single axis.
        on_variable = C.slice(x1, 0, 1, 2)
        verify_one_input(on_variable, data, tmpdir, 'Slice_1')

        # Variable input, axes 0 and 1 sliced together.
        on_two_axes = C.slice(x1, [0, 1], [1, 0], [2, 1])
        verify_one_input(on_two_axes, data, tmpdir, 'Slice2_1')
def test_op_slice_sequence(input_data, slice_params, expected_result,
                           device_id, precision):
    """Check forward and backward results of slicing along a dynamic axis."""
    np_dtype = PRECISION_TO_TYPE[precision]
    input_data = AA(input_data, dtype=np_dtype)

    t = Axis.new_unique_dynamic_axis('t')
    # The leading dimension of the data is the sequence; the rest is the sample.
    sample_shape = input_data.shape[1:]
    a = I(shape=sample_shape,
          data_type=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
          needs_gradient=True,
          dynamic_axes=[Axis.default_batch_axis(), t],
          name='a')

    begin = slice_params[0]
    end = slice_params[1]
    result = C.slice(a, axis=t, begin_index=begin, end_index=end)

    def grad_slice(x, beg_index, end_index):
        # 1 where an element was taken by the slice, 0 where it was dropped.
        mask = np.zeros_like(x)
        mask[beg_index:end_index] = 1
        return mask

    expected_gradient = grad_slice(np.asarray(input_data), *slice_params)

    expected_forward = AA([expected_result], dtype=np_dtype)
    expected_backward = {a: [expected_gradient]}

    # create batch
    input_data.shape = (1,) + input_data.shape

    forward_input = {a: input_data}
    unittest_helper(result,
                    forward_input,
                    expected_forward,
                    expected_backward,
                    device_id=device_id,
                    precision=precision)
def test_op_slice_sequence(input_data, slice_params, expected_result,
                           device_id, precision):
    """Check forward and backward results of slicing along dynamic axis 't'."""
    common = {'device_id': device_id, 'precision': precision}

    # Forward pass test
    # =================
    # The input is wrapped in one extra pair of brackets and bound to the
    # named dynamic axis.
    t = C.dynamic_axis(name='t')
    a = I([input_data], dynamic_axis=t)

    # slice using the operator
    begin, end = slice_params[0], slice_params[1]
    sliced = C.slice(a, begin, end, axis='t')
    result = C.identity(sliced)  # required hack because Slice doesn't propagate tag

    unittest_helper(result, None, [expected_result],
                    clean_up=False, backward_pass=False, **common)

    # Backward pass test
    # ==================
    # The gradient of the slice operator is a tensor of the same shape as the
    # input tensor, having 1 for elements that were taken and 0 for elements
    # that were dropped.
    def grad_slice(x, beg_index, end_index):
        mask = np.zeros_like(x)
        mask[beg_index:end_index] = 1
        return mask

    expected_gradient = grad_slice(np.asarray(input_data), *slice_params)

    unittest_helper(result, None, [expected_gradient],
                    clean_up=True, backward_pass=True, input_node=a, **common)
def process_history(hist, inp):
    """Embed the history, run the seq2seq model and return its flattened hardmax."""
    # Split the history at wg_dim: the leading part via C.slice, the
    # remainder via index slicing.
    head = C.slice(hist, 0, 0, myConfig['wg_dim'])
    tail = hist[myConfig['wg_dim']:]

    embedded_history = embed_layer(head, tail)
    logits = s2smodel(embedded_history, inp)

    one_hot = C.hardmax(logits)
    return C.reshape(one_hot, (-1, ))
def test_op_slice(input_data, slice_params, expected_result, device_id, precision):
    """Check C.slice and the equivalent indexing overload, forward and backward.

    ``slice_params`` is unpacked positionally as
    ``(beg_index, end_index, axis)`` by the helpers below.
    """
    # Forward pass test
    #==================
    # We compute the expected output for the forward pass.
    # We need two surrounding brackets:
    # The first for sequences (length=1, since we have dynamic_axis='').
    # The second for batch of one sample.
    a = I([input_data])

    # NOTE(review): op_slice is never called within this test.
    def op_slice(x, beg_index, end_index, axis):
        return x[beg_index:end_index]

    def _ax_slices(x, beg_index, end_index, axis):
        '''
        Creates a NumPy slicing array from slice operator's arguments

        Every axis other than ``axis`` gets ``slice(None)``; ``axis`` gets a
        list holding ``beg_index`` and, when ``end_index`` is below the axis
        length, ``end_index`` as well.
        '''
        ax_slices = []
        for i in range(0, len(x.shape)):
            if i==axis:
                if end_index >= x.shape[i]:
                    ax_slices.append([beg_index,])
                else:
                    ax_slices.append([beg_index,end_index])
            else:
                ax_slices.append(slice(None)) # corresponds to ':'
        return ax_slices

    # slice using the operator
    result = C.slice(a, *slice_params)

    unittest_helper(result, None, [[expected_result]], device_id=device_id,
                precision=precision, clean_up=True, backward_pass=False)

    # slice using the overload
    ax_slices = _ax_slices(a, *slice_params)
    result = a[ax_slices]

    unittest_helper(result, None, [[expected_result]], device_id=device_id,
                precision=precision, clean_up=True, backward_pass=False)

    # Backward pass test
    # ==================
    # The gradient of the slice operator is a tensor of the same shape as the
    # input tensor, having 1 for elements that were taken and 0 for elements
    # that were dropped.
    def grad_slice(x, beg_index, end_index, axis):
        res = np.zeros_like(x)
        ax_slices = _ax_slices(x, beg_index, end_index, axis)
        # NOTE(review): indexing an ndarray with a *list* of slices/lists
        # depends on legacy NumPy index handling -- confirm against the NumPy
        # version in use.
        res[ax_slices] = x[ax_slices]
        # Copy the selected values, then turn every non-zero entry into 1.
        # NOTE(review): a selected element whose value is 0 stays 0 here.
        res[res!=0] = 1
        return res

    expected_gradient = grad_slice(np.asarray(input_data), *slice_params)

    unittest_helper(result, None, [[expected_gradient]], device_id = device_id,
                    precision=precision, clean_up=True, backward_pass=True, input_node=a)
def test_slice_attributes():
    """Check the attribute dictionary of single- and multi-axis slices."""
    x = C.input((2, 3))

    # Single axis.
    single = C.slice(x, 0, 1, 2)
    actual = single.root_function.attributes
    _check(
        {
            'endIndex': 2,
            'beginIndex': 1,
            'axis': ('ordered', 'static', 1),
        },
        actual,
    )

    # Two axes in one call.
    multi = C.slice(x, [0, 1], [1, 0], [2, 2])
    actual = multi.root_function.attributes
    _check(
        {
            'endIndexVec': [2, 2],
            'beginIndexVec': [1, 0],
            'axisVec': [('ordered', 'static', 1), ('ordered', 'static', 0)],
        },
        actual,
    )
def test_slice_attributes():
    """Check the attributes recorded for a single-axis slice."""
    source = C.input_variable((2, 3))
    sliced = C.slice(source, 0, 1, 2)

    actual = sliced.root_function.attributes
    wanted = dict(
        endIndex=2,
        beginIndex=1,
        axis=('ordered', 'static', 1),
    )
    _check(wanted, actual)
def createDecodingNetworks(self, srcHiddenStates, trgPreWord, trgPreHidden,
                           srcLength):
    """Build the single-step decoding network.

    Args:
        srcHiddenStates: encoder hidden states; only the first
            ``srcLength`` entries on axis 0 are used.
        trgPreWord: previous target word, fed to the target embedding.
        trgPreHidden: previous decoder hidden state.
        srcLength: number of source positions to attend over.

    Returns:
        tuple ``(net, outputs)`` where ``net`` combines the decoder hidden
        state with the prediction, and ``outputs`` is the list
        ``[hidden.output, prediction.output]``.
    """
    preTrgEmb = self.EmbTrg(trgPreWord)

    # Restrict the source states to the actual source length.
    usedSrcStates = C.slice(srcHiddenStates, 0, 0, srcLength)
    rnnOutputs = self.createDecoderRNNNetwork(usedSrcStates, preTrgEmb,
                                              trgPreHidden, srcLength)
    decoderHidden, attProb = rnnOutputs  # attention probabilities are not used here

    preSoftmax = self.createReadOutNetwork(decoderHidden, preTrgEmb)
    decoderPredict = self.createPredictionNetwork(preSoftmax)

    decoderPredictNet = C.combine(decoderHidden, decoderPredict)
    outputs = [decoderHidden.output, decoderPredict.output]
    return (decoderPredictNet, outputs)
def criteria(label, output, block_size, c_classes, weights):
    ''' Define the loss function and metric

    Args:
        label: target tensor; its shape must have exactly three entries.
        output: network output; softmax is taken over axis 0.
        block_size: not used by this function.
        c_classes: not used by this function.
        weights: left operand of the ``times`` that weights the per-class
            cross entropy.

    Returns:
        tuple ``(mean_ce, pe)``: the mean weighted cross entropy, and the
        classification error minus the share of the label mass in index 0
        of axis 0.
    '''
    probs = cntk.softmax(output, axis=0)
    log_probs = cntk.log(probs)

    # Weighted cross entropy, averaged over all elements.
    negative_log_likelihood = -cntk.element_times(log_probs, label)
    ce = cntk.times(weights, negative_log_likelihood, output_rank=2)
    mean_ce = cntk.reduce_mean(ce)

    # Unpacking enforces that the label has exactly three dimensions.
    _, _width, _height = label.shape

    error_rate = cntk.classification_error(probs, label, axis=0)
    first_index_mass = cntk.reduce_sum(cntk.slice(label, 0, 0, 1))
    total_mass = cntk.reduce_sum(label)
    pe = error_rate - first_index_mass / total_mass

    return (mean_ce, pe)
def cnwindow(mna,window):
    """Collect sliding windows over the last two axes of ``mna``.

    For every offset ``(R, H)`` inside ``window`` a shifted view of the last
    two axes is sliced out; the views are spliced along axis 1 and then
    reshaped/transposed into the final layout.

    Args:
        mna: input tensor.
            assumes rank 4 (the transpose below uses a 6-entry permutation)
            -- TODO confirm
        window: pair ``(rows, cols)`` giving the window size on the last two
            axes.

    Returns:
        The reshaped tensor of windows.
    """
    mnas=mna.shape
    # Leading dims, window dims, then the number of window positions along
    # each of the last two axes (size - window + 1).
    mnout=(*mnas[:-2],*window,((mnas[-2]-window[-2])+1),((mnas[-1]-window[-1])+1))
    mne2=None
    for R in range(window[0]):
        # End index (exclusive) on axis -2 for a view starting at row R.
        j_lim = R + mnout[-2]
        for H in range(window[1]):
            # View of the last two axes starting at offset (R, H).
            tdata=C.slice(mna,[-2,-1], [R,H], [j_lim,(H + mnout[-1])])
            if mne2 is None:
                mne2=tdata
            else:
                mne2=C.splice(mne2,tdata,axis=1)
    # Reshape to mnout, reverse the order of axes 1..5, then reshape again
    # with an extra axis of size 1 inserted.
    return(C.reshape(C.transpose(C.reshape(mne2, shape=mnout),(0,5,4,3,2,1)), (mnout[0],*mnout[5:3:-1],1,*mnout[3:0:-1])))
def resu_model(input, num_stack_layers, c_map, num_classes, block_size):
    """Build a ResNet-style encoder with a transposed-convolution decoder.

    The first four entries of axis 0 of ``input`` are taken as separate
    channels. The first three are centred by subtracting their mean; the
    fourth is passed through unchanged. Four residual stages encode the
    input, and three transposed convolutions upsample it again, each merged
    with the encoder stage of matching resolution.

    Args:
        input: input tensor with at least four entries on axis 0.
        num_stack_layers: number of residual blocks in the first stage; the
            later stages use ``num_stack_layers - 1`` after their
            ``resnet_basic_inc`` block.
        c_map: sequence of four feature-map counts, one per stage.
        num_classes: number of feature maps of the final convolution.
        block_size: spatial size of the full-resolution output. The
            intermediate upsampling targets are ``block_size // 4`` and
            ``block_size // 2``.

    Returns:
        The output of the final 3x3 convolution with ``relu`` activation.
    """
    r = cntk.slice(input, 0, 0, 1)
    g = cntk.slice(input, 0, 1, 2)
    b = cntk.slice(input, 0, 2, 3)
    i = cntk.slice(input, 0, 3, 4)

    r -= reduce_mean(r)
    g -= reduce_mean(g)
    b -= reduce_mean(b)
    # NOTE: the fourth channel is deliberately left un-centred.

    input_do = splice(splice(splice(r, g, axis=0), b, axis=0), i, axis=0)

    # Encoder.
    conv = conv_bn(input_do, (3, 3), c_map[0])
    r1 = resnet_basic_stack(conv, num_stack_layers, c_map[0])

    r2_1 = resnet_basic_inc(r1, c_map[1])
    r2_2 = resnet_basic_stack(r2_1, num_stack_layers-1, c_map[1])

    r3_1 = resnet_basic_inc(r2_2, c_map[2])
    r3_2 = resnet_basic_stack(r3_1, num_stack_layers-1, c_map[2])

    r4_1 = resnet_basic_inc(r3_2, c_map[3])
    r4_2 = resnet_basic_stack(r4_1, num_stack_layers-1, c_map[3])

    # Decoder. Floor division keeps output_shape integral under Python 3,
    # where `/` would yield floats.
    quarter_size = block_size // 4
    half_size = block_size // 2

    r4_us = layers.ConvolutionTranspose((3, 3), c_map[3], strides=2,
                                        output_shape=(quarter_size, quarter_size),
                                        pad=True, bias=False,
                                        init=bilinear(3, 3))(r4_2)

    o3 = relu(layers.Convolution((1, 1), c_map[2])(r3_2) +
              layers.Convolution((1, 1), c_map[2])(r4_us))
    o3_us = layers.ConvolutionTranspose((3, 3), c_map[2], strides=2,
                                        output_shape=(half_size, half_size),
                                        pad=True, bias=False,
                                        init=bilinear(3, 3))(o3)

    o2 = relu(layers.Convolution((1, 1), c_map[1])(r2_2) +
              layers.Convolution((1, 1), c_map[1])(o3_us))
    o2_us = layers.ConvolutionTranspose((3, 3), c_map[1], strides=2,
                                        output_shape=(block_size, block_size),
                                        pad=True, bias=False,
                                        init=bilinear(3, 3))(o2)

    # Full-resolution merge of the pre-processed input, the first stage and
    # the upsampled decoder path.
    o1 = relu(layers.Convolution((3, 3), c_map[0], pad=True)(input_do) +
              layers.Convolution((1, 1), c_map[0])(r1) +
              layers.Convolution((1, 1), c_map[0])(o2_us))

    return layers.Convolution((3, 3), num_classes, pad=True, activation=relu)(o1)
def test_op_slice_sequence(input_data, slice_params, expected_result,
                           device_id, precision):
    """Check forward and backward results of slicing along dynamic axis 't'."""
    # Forward pass test
    # =================
    # The input is wrapped in one extra pair of brackets and bound to the
    # named dynamic axis.
    seq_axis = C.dynamic_axis(name='t')
    a = I([input_data], dynamic_axis=seq_axis)

    # slice using the operator
    start = slice_params[0]
    stop = slice_params[1]
    result = C.slice(a, start, stop, axis='t')
    # required hack because Slice doesn't propagate tag
    result = C.identity(result)

    forward_expected = [expected_result]
    unittest_helper(result, None, forward_expected,
                    device_id=device_id, precision=precision,
                    clean_up=False, backward_pass=False)

    # Backward pass test
    # ==================
    # The gradient of the slice operator is a tensor of the same shape as the
    # input tensor, having 1 for elements that were taken and 0 for elements
    # that were dropped.
    def grad_slice(x, beg_index, end_index):
        taken = np.zeros_like(x)
        taken[beg_index:end_index] = 1
        return taken

    expected_gradient = grad_slice(np.asarray(input_data), *slice_params)

    backward_expected = [expected_gradient]
    unittest_helper(result, None, backward_expected,
                    device_id=device_id, precision=precision,
                    clean_up=True, backward_pass=True, input_node=a)
def LSTMCell(x, y, dh, dc):
    '''LightLSTM Cell

    Runs the same LSTM step twice with one shared set of parameters: first
    on ``x`` with previous state ``(dh, dc)``, then on ``y`` with the state
    produced by the first step. ``cell_dim`` and ``input_dim`` are taken from
    the enclosing scope.

    Returns:
        tuple ``(ht, ct, ht2, ct2)``: hidden and cell state after the first
        step, then after the second step.
    '''
    # Parameters shared by both steps.
    b = C.parameter(shape=(4 * cell_dim), init=0)
    W = C.parameter(shape=(input_dim, 4 * cell_dim), init=glorot_uniform())
    H = C.parameter(shape=(cell_dim, 4 * cell_dim), init=glorot_uniform())

    # projected contribution from input x, hidden, and bias
    proj4 = b + C.times(x, W) + C.times(dh, H)

    # Four consecutive cell_dim-wide chunks of the stacked projection.
    it_proj = C.slice(proj4, -1, 0 * cell_dim, 1 * cell_dim)
    bit_proj = C.slice(proj4, -1, 1 * cell_dim, 2 * cell_dim)
    ft_proj = C.slice(proj4, -1, 2 * cell_dim, 3 * cell_dim)
    ot_proj = C.slice(proj4, -1, 3 * cell_dim, 4 * cell_dim)

    it = C.sigmoid(it_proj)  # input gate
    bit = it * C.tanh(bit_proj)

    ft = C.sigmoid(ft_proj)  # forget gate
    bft = ft * dc

    ct = bft + bit
    ot = C.sigmoid(ot_proj)  # output gate
    ht = ot * C.tanh(ct)

    # projected contribution from input y, hidden, and bias
    # (second step: previous state is (ht, ct) from the first step)
    proj4_2 = b + C.times(y, W) + C.times(ht, H)

    it_proj_2 = C.slice(proj4_2, -1, 0 * cell_dim, 1 * cell_dim)
    bit_proj_2 = C.slice(proj4_2, -1, 1 * cell_dim, 2 * cell_dim)
    ft_proj_2 = C.slice(proj4_2, -1, 2 * cell_dim, 3 * cell_dim)
    ot_proj_2 = C.slice(proj4_2, -1, 3 * cell_dim, 4 * cell_dim)

    it_2 = C.sigmoid(it_proj_2)  # input gate
    bit_2 = it_2 * C.tanh(bit_proj_2)

    ft_2 = C.sigmoid(ft_proj_2)  # forget gate
    bft_2 = ft_2 * ct

    ct2 = bft_2 + bit_2
    ot_2 = C.sigmoid(ot_proj_2)  # output gate
    ht2 = ot_2 * C.tanh(ct2)
    return (ht, ct, ht2, ct2)
def slice(x, axis, begin_index, end_index, name=''):
    '''
    Take the range from `begin_index` up to (not including) `end_index` of
    the input along one axis.

    Examples:
        >>> # create 2x3 matrix in a sequence of length 1 in a batch of one sample
        >>> data = np.asarray([[[1, 2, -3],
        ...                     [4, 5,  6]]])
        >>> x = C.input_numpy(data)
        >>> # slice index 1 (second) at first axis
        >>> C.eval(C.slice(x, 0, 1, 2))
        [array([[[ 4.,  5.,  6.]]])]
        >>> # slice index 0 (first) at second axis
        >>> C.eval(C.slice(x, 1, 0, 1))
        [array([[[ 1.],
                 [ 4.]]])]

    NumPy-style indexing is supported as well:

    Examples:
        >>> C.eval(x[1])
        [array([[[ 4.,  5.,  6.]]])]
        >>> C.eval(x[:,:2,:])
        [array([[[ 1.,  2.],
                 [ 4.,  5.]]])]

    Args:
        x: input tensor
        axis (:class:`cntk.Axis`): axis along which `begin_index` and
         `end_index` will be used.
        begin_index (int): the index along axis where the slicing starts
        end_index (int): the index along axis where the slicing ends
        name (str): the name of the node in the network

    See also:
        Indexing in NumPy: http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html

    Returns:
        :class:`cntk.Function`
    '''
    # Imported locally under an alias: this wrapper shares the primitive's name.
    from cntk import slice as _slice_primitive
    operand = sanitize_input(x)
    node = _slice_primitive(operand, axis, begin_index, end_index, name)
    return node.output()
def createNetwork(self, inputEmb, preHidden, preMem):
    """Build one LSTM step.

    Args:
        inputEmb: input embedding for this step.
        preHidden: hidden state of the previous step.
        preMem: memory (cell) state of the previous step.

    Returns:
        tuple ``(CurH, CurMem)``: the new hidden state and memory state.
    """
    size = self.hiddenSize
    inputProj = C.times(inputEmb, self.W) + self.Wb
    hiddenProj = C.times(preHidden, self.U) + self.Ub

    def gatePreActivation(index):
        # Sum of the index-th hiddenSize-wide chunk of both projections.
        begin = size * index
        end = size * (index + 1)
        return (C.slice(inputProj, -1, begin, end) +
                C.slice(hiddenProj, -1, begin, end))

    # Chunk order along the last axis: input, output, forget, candidate.
    inputGate = C.sigmoid(gatePreActivation(0))
    outputGate = C.sigmoid(gatePreActivation(1))
    forgetGate = C.sigmoid(gatePreActivation(2))
    candidate = C.tanh(gatePreActivation(3))

    gatedCandidate = C.element_times(candidate, inputGate)
    retainedMem = C.element_times(forgetGate, preMem)
    CurMem = gatedCandidate + retainedMem
    CurH = C.element_times(C.tanh(CurMem), outputGate)
    return (CurH, CurMem)
def test_op_slice_sequence(input_data, slice_params, expected_result,
                           device_id, precision):
    """Check forward and backward results of slicing along a dynamic axis."""
    dtype = PRECISION_TO_TYPE[precision]
    input_data = AA(input_data, dtype=dtype)

    seq_axis = C.Axis.new_unique_dynamic_axis('t')
    # The leading dimension of the data is the sequence; the rest is the sample.
    sample_shape = input_data.shape[1:]
    a = I(shape=sample_shape,
          data_type=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
          needs_gradient=True,
          dynamic_axes=[C.Axis.default_batch_axis(), seq_axis],
          name='a')

    result = C.slice(a,
                     axis=seq_axis,
                     begin_index=slice_params[0],
                     end_index=slice_params[1])

    def grad_slice(x, beg_index, end_index):
        # 1 for the elements the slice keeps, 0 for everything else.
        taken = np.zeros_like(x)
        taken[beg_index:end_index] = 1
        return taken

    expected_gradient = grad_slice(np.asarray(input_data), *slice_params)

    expected_forward = AA([expected_result], dtype=dtype)
    expected_backward = {a: [expected_gradient]}

    # create batch
    input_data.shape = (1, ) + input_data.shape

    unittest_helper(result,
                    {a: input_data},
                    expected_forward,
                    expected_backward,
                    device_id=device_id,
                    precision=precision)
def createNetwork(self, inputEmb, preHidden):
    """Build one GRU step.

    Args:
        inputEmb: input embedding for this step.
        preHidden: hidden state of the previous step.

    Returns:
        The new hidden state.
    """
    size = self.hiddenSize
    inputProj = C.times(inputEmb, self.W) + self.Wb
    hiddenProj = C.times(preHidden, self.U) + self.Ub

    # Chunk 0 of the last axis: reset gate.
    resetGate = C.sigmoid(
        C.slice(inputProj, -1, 0, size) + C.slice(hiddenProj, -1, 0, size))

    # Chunk 1: update gate.
    updateGate = C.sigmoid(
        C.slice(inputProj, -1, size, size * 2) +
        C.slice(hiddenProj, -1, size, size * 2))

    # Chunk 2: candidate state; the hidden contribution is scaled by the
    # reset gate before being added.
    resetHidden = C.element_times(
        C.slice(hiddenProj, -1, size * 2, size * 3), resetGate)
    candidate = C.tanh(C.slice(inputProj, -1, size * 2, size * 3) + resetHidden)

    # Blend the candidate and the previous state using the update gate.
    fromCandidate = C.element_times(candidate, 1 - updateGate)
    fromPrevious = C.element_times(preHidden, updateGate)
    return fromCandidate + fromPrevious
def createAttentionNet(self, hiddenSrc, curHiddenTrg, srcLength):
    """Build the attention sub-network for one decoder step.

    Args:
        hiddenSrc: source hidden states.
        curHiddenTrg: current target hidden state.
        srcLength: number of source positions.

    Returns:
        tuple ``(contextVector, attProb)``: the attention-weighted sum of
        the source states, reshaped to
        ``(1, BatchSize, 2 * SrcHiddenSize)``, and the attention
        probabilities of shape ``(srcLength, BatchSize, 1)``.
    """
    srcHiddenSize = Config.SrcHiddenSize * 2
    flatTrgWidth = Config.BatchSize * Config.TrgHiddenSize

    # Project both sides, then add them; the single target row is combined
    # with every source row.
    srcProj = C.times(hiddenSrc, self.Was)
    trgProj = C.times(curHiddenTrg, self.Wat)
    combined = (C.reshape(srcProj, shape=(srcLength, flatTrgWidth)) +
                C.reshape(trgProj, shape=(1, flatTrgWidth)))

    activated = C.reshape(
        C.tanh(combined),
        shape=(srcLength * Config.BatchSize, Config.TrgHiddenSize))
    attScore = C.reshape(C.times(activated, self.Wav),
                         shape=(srcLength, Config.BatchSize))

    # Mask value 1 adds 0 to the score; mask value 0 adds -99999999, which
    # removes that position from the softmax.
    usedMask = C.slice(self.maskMatrixSrc, 0, 0, srcLength)
    maskPenalty = (usedMask - 1) * 99999999
    maskedScore = attScore + maskPenalty

    attProb = C.reshape(C.softmax(maskedScore, axis=0),
                        shape=(srcLength, Config.BatchSize, 1))

    # Weighted sum of the source states over the source axis.
    weighted = hiddenSrc * attProb
    flatWeighted = C.reshape(
        weighted, shape=(srcLength, Config.BatchSize * srcHiddenSize))
    contextVector = C.reduce_sum(flatWeighted, axis=0)
    contextVector = C.reshape(contextVector,
                              shape=(1, Config.BatchSize, srcHiddenSize))
    return (contextVector, attProb)
def inner(a):
    """Flatten ``a`` to one dimension and return its first element as a length-1 slice."""
    flattened = C.reshape(a, (-1, ))
    first = C.slice(flattened, 0, 0, 1)
    return first
def test_Slice(tmpdir):
    """Verify a single-axis slice on one input variable."""
    rows = [[[1, 2, -3], [4, 5, 6]]]
    data = np.asarray(rows, dtype=np.float32)

    x1 = C.input_variable((2, 3))
    sliced = C.slice(x1, 0, 1, 2)

    verify_one_input(sliced, data, tmpdir, 'Slice_0')
def freq_grid(input, output_dim, slice_size=10, slice_overlap=5):
    """Apply a grid LSTM cell over successive slices of axis 0 of ``input``.

    Slices are ``slice_size`` wide and start ``slice_overlap`` entries apart.
    One cell application is created per slice; each application after the
    first also receives outputs 1 and 3 of the previous application. The
    first and third placeholder arguments of every application are later
    replaced by ``past_value`` of that application's outputs 0 and 2.

    Args:
        input: input tensor; ``input.shape[0]`` is the dimension being sliced.
        output_dim: shape of the placeholders and of the zero constants fed to
            the first slice.
        slice_size: width of each slice.
        slice_overlap: distance between the starts of consecutive slices.

    Returns:
        The splice of outputs 0 and 1 of the cell applications.

    Raises:
        AssertionError: if the slices do not end exactly at ``input_dim``.
        IndexError: if ``input_dim < slice_size`` (no slice is created).
    """
    # slice the input vector along frequency
    input_dim = input.shape[0]
    right_ind = slice_size

    # array with freq outputs at the prev time step
    m_t_1_k_list = []
    c_t_1_k_list = []

    # One pair of placeholders per slice, named after the slice's right edge.
    while (right_ind <= input_dim):
        name1 = 'm_t_1_k' + str(right_ind)
        m_t_1_k_list.append(
            C.placeholder(shape=(output_dim),
                          dynamic_axes=input.dynamic_axes,
                          name=name1))
        name1 = 'c_t_1_k' + str(right_ind)
        c_t_1_k_list.append(
            C.placeholder(shape=(output_dim),
                          dynamic_axes=input.dynamic_axes,
                          name=name1))
        right_ind = right_ind + slice_overlap

    left_ind = 0
    right_ind = slice_size
    k_ind = 0
    GLSTM_cell_list = []
    # The same cell object is applied to every slice.
    GLSTM_cell = grid_lstm_factory(slice_size, output_dim)

    while (right_ind <= input_dim):
        freq_slice = C.slice(input, 0, left_ind, right_ind)
        if k_ind == 0:
            # First slice: zero constants stand in for the previous slice's outputs.
            f_x_h_c = GLSTM_cell(m_t_1_k_list[k_ind],
                                 C.Constant(0, (output_dim)),
                                 c_t_1_k_list[0],
                                 C.Constant(0, (output_dim)), freq_slice)
        else:
            # Later slices: chain outputs 1 and 3 of the previous slice's cell.
            f_x_h_c = GLSTM_cell(m_t_1_k_list[k_ind],
                                 GLSTM_cell_list[k_ind - 1].outputs[1],
                                 c_t_1_k_list[k_ind],
                                 GLSTM_cell_list[k_ind - 1].outputs[3],
                                 freq_slice)
        GLSTM_cell_list.append(f_x_h_c)
        right_ind = right_ind + slice_overlap
        left_ind = left_ind + slice_overlap
        k_ind = k_ind + 1

    result = C.splice(C.combine([GLSTM_cell_list[0].outputs[0]]),
                      C.combine([GLSTM_cell_list[0].outputs[1]]))
    # NOTE(review): the loop below starts at i = 0, so outputs 0 and 1 of the
    # first cell are spliced into `result` a second time -- confirm this
    # duplication is intended before changing it (it affects the output size).
    i = 0
    while i < k_ind:
        # Replace the time-recurrence placeholders of cell i with its own
        # delayed outputs 0 and 2.
        replacements = {
            m_t_1_k_list[i]:
            C.sequence.past_value(GLSTM_cell_list[i].outputs[0]).output,
            c_t_1_k_list[i]:
            C.sequence.past_value(GLSTM_cell_list[i].outputs[2]).output
        }
        GLSTM_cell_list[i].replace_placeholders(replacements)
        result = C.splice(result, C.combine([GLSTM_cell_list[i].outputs[0]]),
                          C.combine([GLSTM_cell_list[i].outputs[1]]))
        i = i + 1

    # The last slice must end exactly at the end of the input.
    assert ((right_ind - slice_overlap) == input_dim)
    return result
def inner(x):
    """Flatten ``x`` to one dimension and keep only its first element (length-1 slice)."""
    as_vector = C.reshape(x, (-1, ))
    return C.slice(as_vector, 0, 0, 1)
def test_slice_attributes():
    """Check the attributes recorded for a single-axis slice."""
    x = C.input_variable((2, 3))
    attributes = C.slice(x, 0, 1, 2).root_function.attributes

    expected = {}
    expected['endIndex'] = 2
    expected['beginIndex'] = 1
    expected['axis'] = ('ordered', 'static', 1)

    _check(expected, attributes)
def find_embed(x):
    """Split ``x`` at ``self.wg_dim`` along axis 0 and embed the two parts."""
    # Leading part: entries [0, wg_dim).
    leading = C.slice(x, 0, 0, self.wg_dim)
    # Remaining part: entries [wg_dim, vocab_size).
    remaining = C.slice(x, 0, self.wg_dim, self.vocab_size)
    return embed(leading, remaining)
def test_op_slice(input_data, slice_params, expected_result, device_id, precision):
    """Check C.slice forward and backward via the unary-op test helper.

    ``slice_params`` is used as ``(begin_index, end_index, axis)``.
    """
    input_data = AA(input_data, dtype=PRECISION_TO_TYPE[precision])
    a = I(
        shape=input_data.shape,
        data_type=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
        needs_gradient=True,
        name="a",
    )

    def _ax_slices(x, beg_index, end_index, axis):
        """
        Creates a NumPy slicing array from slice operator's arguments

        Every axis other than ``axis`` gets ``slice(None)``; ``axis`` gets a
        list holding ``beg_index`` and, when ``end_index`` is below the axis
        length, ``end_index`` as well.
        """
        ax_slices = []
        for i in range(0, len(x.shape)):
            if i == axis:
                if end_index >= x.shape[i]:
                    ax_slices.append([beg_index])
                else:
                    ax_slices.append([beg_index, end_index])
            else:
                ax_slices.append(slice(None))  # corresponds to ':'
        return ax_slices

    # slice using the overload
    # Disabled: the constant False guard means this block never executes.
    if False:  # FIXME remove ones the overloads are in place
        # slice using the operator
        result = C.slice(a, *slice_params)
        ax_slices = _ax_slices(a, *slice_params)
        result = a[ax_slices]

        unittest_helper(
            result,
            None,
            [[expected_result]],
            device_id=device_id,
            precision=precision,
            clean_up=True,
            backward_pass=False,
        )

    # Backward pass test
    # ==================
    # The gradient of the slice operator is a tensor of the same shape as the
    # input tensor, having 1 for elements that were taken and 0 for elements
    # that were dropped.
    def grad_slice(x, beg_index, end_index, axis):
        res = np.zeros_like(x)
        ax_slices = _ax_slices(x, beg_index, end_index, axis)
        # NOTE(review): indexing an ndarray with a *list* of slices/lists
        # depends on legacy NumPy index handling -- confirm against the NumPy
        # version in use.
        res[ax_slices] = x[ax_slices]
        # Copy the selected values, then turn every non-zero entry into 1.
        # NOTE(review): a selected element whose value is 0 stays 0 here.
        res[res != 0] = 1
        return res

    expected_forward = [AA([expected_result], dtype=PRECISION_TO_TYPE[precision])]
    expected_backward = {"arg": [[grad_slice(np.asarray(input_data), *slice_params)]]}

    # The operator itself is exercised here, with its arguments passed by keyword.
    _test_unary_op(
        precision,
        device_id,
        C.slice,
        input_data,
        expected_forward,
        expected_backward,
        {"begin_index": slice_params[0], "end_index": slice_params[1], "axis": slice_params[2]},
    )
def LSTM(shape, _inf, cell_shape=None, use_peepholes=False, init=_default_initializer, init_bias=0, enable_self_stabilization=False): # (x, (h, c))
    """Create an LSTM step function over placeholders for ``(x, (h, c))``.

    Args:
        shape: shape of the hidden output ``h``.
        _inf: inference descriptor of the input; its ``shape`` sizes the
            input weights and ``with_shape`` derives descriptors for the
            state placeholders.
        cell_shape: shape of the cell state; defaults to ``shape``. Passing
            it enables the final projection from cell to output shape.
        use_peepholes: add cell-to-gate (peephole) connections.
        init: initializer for the weight parameters.
        init_bias: initializer for the bias.
        enable_self_stabilization: wrap states in ``Stabilizer`` instead of
            ``Identity``.

    Returns:
        The combined function of ``[h, c]``, carrying a
        ``create_placeholder`` attribute that creates a fresh ``(h, c)``
        placeholder pair.
    """
    has_projection = cell_shape is not None
    has_aux = False

    # NOTE(review): UntestedBranchError is called without `raise`;
    # presumably it raises internally -- confirm.
    if has_aux:
        UntestedBranchError("LSTM, has_aux option")

    if enable_self_stabilization:
        UntestedBranchError("LSTM, enable_self_stabilization option")

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape

    #stack_axis = -1
    stack_axis = 0  # BUGBUG: should be -1, i.e. the fastest-changing one, to match BS
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    # NOTE(review): the stacked size is always read from index 0, which matches
    # stack_axis only while stack_axis is 0.
    stacked_dim = cell_shape_list[0]
    cell_shape_list[stack_axis] = stacked_dim * 4
    cell_shape_stacked = tuple(
        cell_shape_list)  # patched dims with stack_axis duplicated 4 times

    # parameters
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')  # a bias
    W = Parameter(_inf.shape + cell_shape_stacked, init=init, name='W')  # input
    A = Parameter(_inf.shape + cell_shape_stacked, init=init,
                  name='A') if has_aux else None  # aux input (optional)
    H = Parameter(shape + cell_shape_stacked, init=init,
                  name='H')  # hidden-to-hidden
    Ci = Parameter(
        cell_shape, init=init, name='Ci'
    ) if use_peepholes else None  # cell-to-hiddden {note: applied elementwise}
    Cf = Parameter(
        cell_shape, init=init, name='Cf'
    ) if use_peepholes else None  # cell-to-hiddden {note: applied elementwise}
    Co = Parameter(
        cell_shape, init=init, name='Co'
    ) if use_peepholes else None  # cell-to-hiddden {note: applied elementwise}

    # NOTE(review): init_value_scale is not defined in this function; it must
    # come from an enclosing/module scope or the has_projection path fails --
    # confirm.
    Wmr = ParameterTensor(
        cell_shape + shape, init=init, init_value_scale=init_value_scale
    ) if has_projection else None  # final projection

    # Stabilizers (or pass-through Identity) for the previous states (Sdh, Sdc),
    # the new cell state (Sct) and the new hidden output (Sht).
    Sdh = Stabilizer(_inf=_inf.with_shape(
        shape)) if enable_self_stabilization else Identity(
            _inf=_inf.with_shape(shape))
    Sdc = Stabilizer(_inf=_inf.with_shape(
        cell_shape)) if enable_self_stabilization else Identity(
            _inf=_inf.with_shape(cell_shape))
    Sct = Stabilizer(_inf=_inf.with_shape(
        cell_shape)) if enable_self_stabilization else Identity(
            _inf=_inf.with_shape(cell_shape))
    Sht = Stabilizer(_inf=_inf.with_shape(
        shape)) if enable_self_stabilization else Identity(
            _inf=_inf.with_shape(shape))

    def create_hc_placeholder():
        # Fresh placeholder pair for the previous hidden and cell state.
        return (Placeholder(_inf=_inf.with_shape(shape), name='hPh'),
                Placeholder(_inf=_inf.with_shape(cell_shape),
                            name='cPh'))  # (h, c)

    # parameters to model function
    x = Placeholder(_inf=_inf, name='lstm_block_arg')
    prev_state = create_hc_placeholder()

    # formula of model function
    dh, dc = prev_state

    dhs = Sdh(dh)  # previous values, stabilized
    dcs = Sdc(dc)
    # note: input does not get a stabilizer here, user is meant to do that outside

    # projected contribution from input(s), hidden, and bias
    # NOTE(review): `aux` is not defined in this function; the has_aux branch
    # cannot be taken as written (has_aux is hard-coded to False above).
    proj4 = b + times(x, W) + times(dhs, H) + times(aux, A) if has_aux else \
            b + times(x, W) + times(dhs, H)

    it_proj = slice(proj4, stack_axis, 0 * stacked_dim,
                    1 * stacked_dim)  # split along stack_axis
    bit_proj = slice(proj4, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
    ft_proj = slice(proj4, stack_axis, 2 * stacked_dim, 3 * stacked_dim)
    ot_proj = slice(proj4, stack_axis, 3 * stacked_dim, 4 * stacked_dim)

    # add peephole connection if requested
    def peep(x, c, C):
        return x + C * c if use_peepholes else x

    it = sigmoid(peep(it_proj, dcs, Ci))  # input gate(t)
    bit = it * tanh(bit_proj)  # applied to tanh of input network

    ft = sigmoid(peep(ft_proj, dcs, Cf))  # forget-me-not gate(t)
    # Note: uses the raw dc here, while the peepholes use the stabilized dcs.
    bft = ft * dc  # applied to cell(t-1)

    ct = bft + bit  # c(t) is sum of both

    ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
    ht = ot * tanh(ct)  # applied to tanh(cell(t))

    c = ct  # cell value
    h = times(Sht(ht), Wmr) if has_projection else \
        ht

    _name_node(h, 'h')
    if _trace_layers:
        _log_node(h)  # this looks right
    _name_node(c, 'c')

    # TODO: figure out how to do scoping, and also rename all the apply... to expression
    apply_x_h_c = combine([h, c])
    # return to caller a helper function to create placeholders for recurrence
    apply_x_h_c.create_placeholder = create_hc_placeholder
    _name_and_extend_Function(apply_x_h_c, 'LSTM')
    return apply_x_h_c
def center_square(output, block_size, padding):
    """Crop ``padding`` entries from both ends of axes 1 and 2 of ``output``.

    Args:
        output: tensor to crop.
        block_size: extent of axes 1 and 2 before cropping.
        padding: number of entries removed at each end.

    Returns:
        The slice ``[padding, block_size - padding)`` taken along axis 1 and
        then along axis 2.
    """
    cropped_axis1 = cntk.slice(output, 1, padding, block_size - padding)
    cropped_both = cntk.slice(cropped_axis1, 2, padding, block_size - padding)
    return cropped_both
def test_op_slice(input_data, slice_params, expected_result, device_id, precision):
    """Check C.slice forward and backward via the unary-op test helper.

    ``slice_params`` is used as ``(begin_index, end_index, axis)``.
    """
    input_data = AA(input_data, dtype=PRECISION_TO_TYPE[precision])
    a = I(shape=input_data.shape,
          data_type=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
          needs_gradient=True,
          name='a')

    def _ax_slices(x, beg_index, end_index, axis):
        '''
        Creates a NumPy slicing array from slice operator's arguments

        Every axis other than ``axis`` gets ``slice(None)``; ``axis`` gets a
        list holding ``beg_index`` and, when ``end_index`` is below the axis
        length, ``end_index`` as well.
        '''
        ax_slices = []
        for i in range(0, len(x.shape)):
            if i == axis:
                if end_index >= x.shape[i]:
                    ax_slices.append([
                        beg_index,
                    ])
                else:
                    ax_slices.append([beg_index, end_index])
            else:
                ax_slices.append(slice(None))  # corresponds to ':'
        return ax_slices

    # slice using the overload
    # Disabled: the constant False guard means this block never executes.
    if False:  # FIXME remove ones the overloads are in place
        # slice using the operator
        result = C.slice(a, *slice_params)
        ax_slices = _ax_slices(a, *slice_params)
        result = a[ax_slices]

        unittest_helper(result,
                        None, [[expected_result]],
                        device_id=device_id,
                        precision=precision,
                        clean_up=True,
                        backward_pass=False)

    # Backward pass test
    # ==================
    # The gradient of the slice operator is a tensor of the same shape as the
    # input tensor, having 1 for elements that were taken and 0 for elements
    # that were dropped.
    def grad_slice(x, beg_index, end_index, axis):
        res = np.zeros_like(x)
        ax_slices = _ax_slices(x, beg_index, end_index, axis)
        # NOTE(review): indexing an ndarray with a *list* of slices/lists
        # depends on legacy NumPy index handling -- confirm against the NumPy
        # version in use.
        res[ax_slices] = x[ax_slices]
        # Copy the selected values, then turn every non-zero entry into 1.
        # NOTE(review): a selected element whose value is 0 stays 0 here.
        res[res != 0] = 1
        return res

    expected_forward = [
        AA([expected_result], dtype=PRECISION_TO_TYPE[precision])
    ]
    expected_backward = {
        'arg': [[grad_slice(np.asarray(input_data), *slice_params)]]
    }

    # The operator itself is exercised here, with its arguments passed by keyword.
    _test_unary_op(
        precision, device_id, C.slice, input_data, expected_forward,
        expected_backward, {
            'begin_index': slice_params[0],
            'end_index': slice_params[1],
            'axis': slice_params[2]
        })