def build_model(self):
    phmap = self.get_inputs()
    df = phmap['df']
    qf = phmap['qf']
    ab = phmap['ab']
    ae = phmap['ae']

    # input layer
    cc = C.reshape(phmap['cc'], (1, -1))
    qc = C.reshape(phmap['qc'], (1, -1))
    c_processed, q_processed = self.input_layer(
        phmap['cgw'], phmap['cnw'], cc,
        phmap['qgw'], phmap['qnw'], qc).outputs
    c_processed = C.splice(c_processed, df)
    q_processed = C.splice(q_processed, qf)

    # attention layer, output: [#,c][8*hidden_dim]
    att_context, wei1 = self.attention_layer(
        c_processed, q_processed,
        dimc=2 * self.hidden_dim + 3,
        dimq=2 * self.hidden_dim + 1,
        common_dim=2 * self.hidden_dim).outputs

    # modeling layer, output: [#][1] [#,c][2*hidden_dim]
    mod_context_reg = self.modeling_layer(att_context)

    # output layer
    mod_context_reg = C.splice(mod_context_reg, df)
    start_logits, end_logits = self.output_layer(att_context, mod_context_reg).outputs

    # loss
    new_loss = all_spans_loss(start_logits, ab, end_logits, ae)
    res = C.combine([start_logits, end_logits])
    self._model = res
    self._loss = new_loss
    return self._model, self._loss, self._input_phs
def simi_attention(self, input, memory):
    '''
    return:
        memory-weighted vectors over input [#,c][d]
        weight
    '''
    input_ph = C.placeholder()  # [#,c][d]
    mem_ph = C.placeholder()    # [#,q][d]

    input_dense = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
    mem_dense = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
    bias = C.Parameter(shape=(2 * self.hidden_dim,), init=0.0)
    weight_dense = Dense(1, bias=False, input_rank=1)

    proj_inp = input_dense(input_ph)  # [#,c][d]
    proj_mem = mem_dense(mem_ph)      # [#,q][d]
    unpack_memory, mem_mask = C.sequence.unpack(proj_mem, 0).outputs  # [#][*=q, d], [#][*=q]
    expand_mem = C.sequence.broadcast_as(unpack_memory, proj_inp)     # [#,c][*=q, d]
    expand_mask = C.sequence.broadcast_as(mem_mask, proj_inp)         # [#,c][*=q]
    matrix = C.reshape(weight_dense(C.tanh(proj_inp + expand_mem + bias)), (-1,))  # [#,c][*=q]
    matrix = C.element_select(expand_mask, matrix, -1e30)
    logits = C.softmax(matrix, axis=0)  # [#,c][*=q]
    weight_mem = C.reduce_sum(C.reshape(logits, (-1, 1)) * expand_mem, axis=0)  # [#,c][d]
    weight_mem = C.reshape(weight_mem, (-1,))

    return C.as_block(C.combine(weight_mem, logits),
                      [(input_ph, input), (mem_ph, memory)],
                      'simi_attention', 'simi_attention')
def UpSampling(x):
    xr = C.reshape(x, (x.shape[0], x.shape[1], 1, x.shape[2], 1))
    xx = C.splice(xr, xr, axis=-1)
    xy = C.splice(xx, xx, axis=-3)
    r = C.reshape(xy, (x.shape[0], x.shape[1] * 2, x.shape[2] * 2))
    return r
def UpSampling2D(x):
    xr = C.reshape(x, (x.shape[0], x.shape[1], 1, x.shape[2], 1))
    xx = C.splice(xr, xr, axis=-1)  # axis=-1 refers to the last axis
    xy = C.splice(xx, xx, axis=-3)  # axis=-3 refers to the middle axis
    r = C.reshape(xy, (x.shape[0], x.shape[1] * 2, x.shape[2] * 2))
    return r
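# Usage sketch (not from the original source): UpSampling2D doubles each
# spatial dimension by duplicating values, i.e. nearest-neighbor upsampling.
# The input shape and values below are illustrative only.
import numpy as np
import cntk as C

x = C.input_variable((1, 2, 2))
y = UpSampling2D(x)  # -> shape (1, 4, 4)
sample = np.array([[[1., 2.], [3., 4.]]], dtype=np.float32)
print(y.eval({x: [sample]}))  # each input value appears in a 2x2 block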
def createDecoderNetwork(self, networkHiddenSrc, srcLength, trgLength):
    timeZeroHidden = C.slice(networkHiddenSrc, 0, 0, 1)
    srcSentEmb = C.slice(timeZeroHidden, -1, Config.SrcHiddenSize, Config.SrcHiddenSize * 2)
    networkHiddenTrg = {}
    inputTrg = C.reshape(self.inputMatrixTrg,
                         shape=(Config.TrgMaxLength, Config.BatchSize, Config.TrgVocabSize))
    attProbAll = []
    tce = 0
    for i in range(0, trgLength, 1):
        preTrgEmb = self.initTrgEmb if i == 0 else self.EmbTrg(inputTrg[i - 1])

        if i == 0:
            networkHiddenTrg[i] = self.createDecoderInitNetwork(srcSentEmb)
        else:
            (networkHiddenTrg[i], attProb) = self.createDecoderRNNNetwork(
                networkHiddenSrc, preTrgEmb, networkHiddenTrg[i - 1], srcLength)
            attProbAll = attProb if i == 1 else C.splice(attProbAll, attProb, axis=0)

        preSoftmax = self.createReadOutNetwork(networkHiddenTrg[i], preTrgEmb)
        ce = C.cross_entropy_with_softmax(preSoftmax, inputTrg[i], 2)
        ce = C.reshape(ce, shape=(1, Config.BatchSize))
        tce += C.times_transpose(ce, self.maskMatrixTrg[i])

    return tce
def test_depth_to_space(image_shape, num_channels, block_size, device_id, precision):
    dev = cntk_device(device_id)
    from cntk.internal import sanitize_dtype_cntk

    input_val = np.array(np.reshape(range(num_channels), (num_channels, 1, 1)),
                         dtype=PRECISION_TO_TYPE[precision])
    input_val = np.tile(input_val, (1,) + image_shape)
    img = C.input_variable((num_channels,) + image_shape,
                           dtype=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]))

    # Result from depth_to_space node.
    depth_to_space_op = C.depth_to_space(img, block_size)
    output_test = depth_to_space_op.eval({img: input_val})

    # Reference result from simulating depth_to_space with other CNTK ops.
    h, w = image_shape
    reshape_node = C.reshape(img, (block_size, block_size,
                                   num_channels // (block_size**2), h, w))
    transpose_node = C.transpose(reshape_node, [2, 3, 0, 4, 1])
    depth_to_space_sim_op = C.reshape(transpose_node,
                                      (num_channels // (block_size**2),
                                       h * block_size, w * block_size))
    output_ref = depth_to_space_sim_op.eval({img: input_val})

    assert np.array_equal(output_test, output_ref)
def create_word2vec_cbow_model(word_one_hot, context_one_hots, negative_one_hots):
    # shared_embedding_layer = Embedding(G.embedding_dimension, uniform(scale=1.0/2.0/G.embedding_dimension))
    shared_embedding_layer = Embedding(G.embedding_dimension)

    word_embedding = shared_embedding_layer(word_one_hot)
    context_embeddings = [shared_embedding_layer(x) for x in context_one_hots]
    negative_embeddings = [shared_embedding_layer(x) for x in negative_one_hots]
    print(word_embedding.shape)

    word_embedding_reshaped = C.reshape(word_embedding, shape=(1, G.embedding_dimension))
    print(word_embedding_reshaped.shape)

    context_embeddings_all = C.reshape(C.splice(*context_embeddings),
                                       shape=(context_size, G.embedding_dimension))
    negative_embeddings_all = C.reshape(C.splice(*negative_embeddings),
                                        shape=(G.negative, G.embedding_dimension))
    print(context_embeddings_all.shape)
    print(negative_embeddings_all.shape)

    cbow = C.reshape(C.reduce_mean(context_embeddings_all, 0),
                     shape=(G.embedding_dimension))
    print(cbow.shape)

    # word_context_product = C.times_transpose(word_embedding_reshaped, cbow)
    word_context_product = C.times_transpose(word_embedding, cbow)
    print(word_context_product.shape)
    negative_context_product = C.reshape(C.times_transpose(negative_embeddings_all, cbow),
                                         shape=(G.negative))
    print(negative_context_product.shape)

    word_negative_context_product = C.splice(word_context_product, negative_context_product)
    print(word_negative_context_product.shape)

    # return model and shared embedding layer
    return word_negative_context_product, shared_embedding_layer
def multiFunc(self, arg1):
    # load or create the inputs we need
    multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
    bit_map = C.constant(self.bit_map)
    max_bits = self.bit_map.max()
    shape = multiIn.shape
    reformed = C.reshape(multiIn, (-1,))
    # let's compute the means we need.
    # carry_over represents the remaining value that still needs to be binarized.
    # For a single bit, this is just the input; for more bits, it is the difference
    # between the previous bits' approximation and the true value.
    carry_over = multiIn
    approx = C.element_times(multiIn, 0)
    # iterate through the maximum number of bits specified by the bit maps,
    # basically computing each level of binarization
    for i in range(max_bits):
        # determine which values of the input should be binarized to i bits or more
        hot_vals = C.greater(bit_map, i)
        # select only the values which we need to binarize
        valid_vals = C.element_select(hot_vals, carry_over, 0)
        # compute mean on a per-kernel basis; reshaping is done to allow for
        # sum reduction along only axis 0 (the kernels)
        mean = C.element_divide(
            C.reduce_sum(C.reshape(C.abs(valid_vals), (valid_vals.shape[0], -1)), axis=1),
            C.reduce_sum(C.reshape(hot_vals, (hot_vals.shape[0], -1)), axis=1))
        # reshape the mean to match the dimensionality of the input
        mean = C.reshape(mean, (mean.shape[0], mean.shape[1], 1, 1))
        # binarize the carry over
        bits = C.greater(carry_over, 0)
        bits = C.element_select(bits, bits, -1)
        bits = C.element_select(hot_vals, bits, 0)
        # add in the equivalent binary representation to the approximation
        approx = C.plus(approx, C.element_times(mean, bits))
        # compute the new carry over
        carry_over = C.plus(C.element_times(C.element_times(-1, bits), mean), carry_over)

    return approx, multiIn
def new_attention(encoder_hidden_state, decoder_hidden_state):
    # encoder_hidden_state: [#, e] [h]
    # decoder_hidden_state: [#, d] [H]
    unpacked_encoder_hidden_state, valid_mask = C.sequence.unpack(
        encoder_hidden_state, padding_value=0).outputs
    # unpacked_encoder_hidden_state: [#] [*=e, h]
    # valid_mask: [#] [*=e]
    projected_encoder_hidden_state = C.sequence.broadcast_as(
        attn_proj_enc(unpacked_encoder_hidden_state), decoder_hidden_state)
    # projected_encoder_hidden_state: [#, d] [*=e, attention_dim]
    broadcast_valid_mask = C.sequence.broadcast_as(
        C.reshape(valid_mask, (1,), 1), decoder_hidden_state)
    # broadcast_valid_mask: [#, d] [*=e]
    projected_decoder_hidden_state = attn_proj_dec(decoder_hidden_state)
    # projected_decoder_hidden_state: [#, d] [attention_dim]
    tanh_output = C.tanh(projected_decoder_hidden_state + projected_encoder_hidden_state)
    # tanh_output: [#, d] [*=e, attention_dim]
    attention_logits = attn_proj_tanh(tanh_output)
    # attention_logits: [#, d] [*=e, 1]
    minus_inf = C.constant(-1e+30)
    masked_attention_logits = C.element_select(
        broadcast_valid_mask, attention_logits, minus_inf)
    # masked_attention_logits: [#, d] [*=e]
    attention_weights = C.softmax(masked_attention_logits, axis=0)
    attention_weights = Label('attention_weights')(attention_weights)
    # attention_weights: [#, d] [*=e]
    attended_encoder_hidden_state = C.reduce_sum(
        attention_weights * C.sequence.broadcast_as(unpacked_encoder_hidden_state,
                                                    attention_weights),
        axis=0)
    # attended_encoder_hidden_state: [#, d] [1, h]
    output = attn_final_stab(C.reshape(attended_encoder_hidden_state, (), 0, 1))
    # output: [#, d] [h]
    return output
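# Hedged wiring sketch: attn_proj_enc, attn_proj_dec, attn_proj_tanh and
# attn_final_stab are free variables in new_attention above. One plausible
# definition (an assumption, mirroring the projections in CNTK's AttentionModel):
from cntk.layers import Dense, Stabilizer

attention_dim = 128  # illustrative value
attn_proj_enc = Dense(attention_dim, input_rank=1, bias=False)
attn_proj_dec = Dense(attention_dim, bias=False)
attn_proj_tanh = Dense(1, input_rank=1, bias=False)
attn_final_stab = Stabilizer()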
def criterion(self):
    # hyperparameters
    lambda_val = 0.5

    # Margin loss
    left = ct.square(ct.relu(0.9 - self.length))
    right = ct.square(ct.relu(self.length - 0.1))
    left = ct.reshape(left, (-1))
    right = ct.reshape(right, (-1))
    lc = self.labels * left + lambda_val * (1 - self.labels) * right
    margin_loss = ct.reduce_sum(lc, axis=0)
    margin_loss = ct.reduce_mean(margin_loss, axis=ct.axis.Axis.default_batch_axis())

    # classification_error
    predict = ct.softmax(self.length, axis=0)
    error = ct.classification_error(ct.reshape(predict, (10)), self.labels)

    total_loss = margin_loss
    reconstruction_err = 0
    if self.use_reconstruction:
        features = ct.reshape(self.features, shape=(-1,))
        encoder = ct.reshape(self.training_model, shape=(-1,))
        squared = ct.square(encoder - features)
        reconstruction_err = ct.reduce_mean(squared, axis=0)
        reconstruction_err = ct.reduce_mean(reconstruction_err,
                                            axis=ct.axis.Axis.default_batch_axis())
        total_loss = margin_loss + (0.0005 * 784) * reconstruction_err

    return total_loss, error
def test_reshape_free_static_axis():
    x = C.input((C.FreeDimension, 2, 3))
    x_reshaped = C.reshape(x, (-1), 0, 2)
    assert x_reshaped.shape == (C.FreeDimension, 3)
    x_data = np.arange(12).reshape(2, 2, 3)
    result = x_reshaped.eval({x: x_data})
    assert np.array_equal(result[0], x_data.reshape(4, 3))
    x_data = np.arange(18).reshape(3, 2, 3)
    result = x_reshaped.eval({x: x_data})
    assert np.array_equal(result[0], x_data.reshape(6, 3))

    x_reshaped = C.reshape(x, (-1), 1, 3)
    assert x_reshaped.shape == (C.FreeDimension, 6)
    x_data = np.arange(12).reshape(2, 2, 3)
    result = x_reshaped.eval({x: x_data})
    assert np.array_equal(result[0], x_data.reshape(2, 6))

    x_reshaped = C.reshape(x, (4), 0, 2)
    assert x_reshaped.shape == (4, 3)
    x_data = np.arange(12).reshape(2, 2, 3)
    result = x_reshaped.eval({x: x_data})
    assert np.array_equal(result[0], x_data.reshape(4, 3))
    x_data = np.arange(6).reshape(1, 2, 3)
    with pytest.raises(ValueError):
        result = x_reshaped.eval({x: x_data})
def test_reshape_free_static_axis():
    x = C.input_variable((C.FreeDimension, 2, 3))
    x_reshaped = C.reshape(x, (-1), 0, 2)
    assert x_reshaped.shape == (C.FreeDimension, 3)
    x_data = np.arange(12).reshape(2, 2, 3)
    result = x_reshaped.eval({x: x_data})
    assert np.array_equal(result[0], x_data.reshape(4, 3))
    x_data = np.arange(18).reshape(3, 2, 3)
    result = x_reshaped.eval({x: x_data})
    assert np.array_equal(result[0], x_data.reshape(6, 3))

    x_reshaped = C.reshape(x, (-1), 1, 3)
    assert x_reshaped.shape == (C.FreeDimension, 6)
    x_data = np.arange(12).reshape(2, 2, 3)
    result = x_reshaped.eval({x: x_data})
    assert np.array_equal(result[0], x_data.reshape(2, 6))

    x_reshaped = C.reshape(x, (4), 0, 2)
    assert x_reshaped.shape == (4, 3)
    x_data = np.arange(12).reshape(2, 2, 3)
    result = x_reshaped.eval({x: x_data})
    assert np.array_equal(result[0], x_data.reshape(4, 3))
    x_data = np.arange(6).reshape(1, 2, 3)
    with pytest.raises(ValueError):
        result = x_reshaped.eval({x: x_data})
def instance_normalization(x):
    mean = C.reduce_mean(x, axis=(1, 2))
    x0 = x - mean
    std = C.sqrt(C.reduce_mean(x0 * x0, axis=(1, 2)))
    if epsilon != 0:
        std += epsilon
    x_hat = x0 / std
    return x_hat * C.reshape(scale, (-1, 1, 1)) + C.reshape(bias, (-1, 1, 1))
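# Hedged sketch of how the free variables above (scale, bias, epsilon) might be
# bound per channel; the factory below is my own illustration, not part of the
# original source.
import cntk as C

def make_instance_norm(num_channels, epsilon=1e-5):
    scale = C.parameter(shape=(num_channels,), init=1.0)
    bias = C.parameter(shape=(num_channels,), init=0.0)

    def instance_normalization(x):
        # normalize each channel over its spatial dimensions (H, W)
        mean = C.reduce_mean(x, axis=(1, 2))
        x0 = x - mean
        std = C.sqrt(C.reduce_mean(x0 * x0, axis=(1, 2))) + epsilon
        return (x0 / std) * C.reshape(scale, (-1, 1, 1)) + C.reshape(bias, (-1, 1, 1))

    return instance_normalization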
def __local_response_normalization(self, k, n, alpha, beta, name=''):
    x = cntk.placeholder(name='lrn_arg')
    x2 = cntk.square(x)
    x2s = cntk.reshape(x2, (1, cntk.InferredDimension), 0, 1)
    W = cntk.constant(alpha / (2 * n + 1), (1, 2 * n + 1, 1, 1), name='W')
    y = cntk.convolution(W, x2s)
    b = cntk.reshape(y, cntk.InferredDimension, 0, 2)
    den = cntk.exp(beta * cntk.log(k + b))
    apply_x = cntk.element_divide(x, den)
    return apply_x
def lrn(x, depth_radius, bias, alpha, beta, name=''):
    x2 = C.square(x)
    # reshape to insert a fake singleton reduction dimension after the 3rd axis
    # (the channel axis). Note: Python and BrainScript axis orders are reversed.
    x2s = C.reshape(x2, (1, C.InferredDimension), 0, 1)
    W = C.constant(alpha / (2 * depth_radius + 1),
                   shape=(1, 2 * depth_radius + 1, 1, 1), dtype=dtype, name='W')
    # 3D convolution with a filter that has a non-1 size only in the 3rd axis,
    # and does not reduce, since the reduction dimension is fake and of size 1
    y = C.convolution(W, x2s)
    # reshape back to remove the fake singleton reduction dimension
    b = C.reshape(y, C.InferredDimension, 0, 2)
    den = C.exp(beta * C.log(bias + b))
    return C.element_divide(x, den)
def LocalResponseNormalization(k, n, alpha, beta, name=''):
    x = C.placeholder(name='lrn_arg')
    x2 = C.square(x)
    x2s = C.reshape(x2, (1, C.InferredDimension), 0, 1)
    W = C.constant(alpha / (2 * n + 1), (1, 2 * n + 1, 1, 1), name='W')
    y = C.convolution(W, x2s)
    b = C.reshape(y, C.InferredDimension, 0, 2)
    den = C.exp(beta * C.log(k + b))
    apply_x = C.element_divide(x, den)
    return apply_x
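# Minimal usage sketch (assumed setup): the placeholder-based LRN function can
# be applied like a layer, which binds the 'lrn_arg' placeholder to the given
# input. The hyperparameter values below are illustrative only.
import cntk as C

feat = C.input_variable((32, 15, 15))  # (channels, H, W)
lrn = LocalResponseNormalization(k=1.0, n=2, alpha=1e-4, beta=0.75)
normalized = lrn(feat)
print(normalized.shape)  # (32, 15, 15): the shape is preserved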
def hierarchical_softmax_layer(input_var, label_index, label_dim, label_classes=None):
    '''
    A two-layer hierarchical softmax function:

    Args:
        input_var: Variable with shape: [#,*](dim_x)
        label_index: index of the label's category: [#,*](1)
        label_dim: number of label categories
        label_classes: number of classes of the label categories
    Returns:
        output_prob: the probability of the given label [#,*](1)
        class_probs: the probability of all the label classes [#,*](label_classes)
        all_probs: the probability of all label classes
    '''
    input_dim = input_var.shape[0]

    if not label_classes:
        label_classes = int(np.ceil(np.sqrt(float(label_dim))))

    n_outputs_per_class = int(np.ceil(label_dim / label_classes))

    target_class = C.floor((label_index + 0.5) / n_outputs_per_class)
    target_output_in_class = C.round(label_index - target_class * n_outputs_per_class)

    w1 = parameter(shape=(input_dim, label_classes), init=C.glorot_normal(), name='hsoftmax_w1')
    b1 = parameter(shape=(label_classes), init=C.glorot_normal(), name='hsoftmax_b1')
    w2s = parameter(shape=(label_classes, input_dim, n_outputs_per_class,),
                    init=C.glorot_normal(), name='hsoftmax_w2s')
    b2s = parameter(shape=(label_classes, n_outputs_per_class,),
                    init=C.glorot_normal(), name='hsoftmax_b2s')

    class_probs = softmax(b1 + times(input_var, w1))

    # TODO: fix the bug in backprop for sparse, and use sparse embedding to accelerate
    target_class_one_hot = C.one_hot(target_class, num_classes=label_classes, sparse_output=False)
    w2 = C.reshape(C.times(target_class_one_hot, w2s, output_rank=2), [input_dim, -1])
    b2 = C.reshape(times(target_class_one_hot, b2s, output_rank=1), [-1])
    probs_in_class = softmax(b2 + times(input_var, w2))

    prob_in_class = C.times_transpose(
        C.one_hot(target_output_in_class, num_classes=n_outputs_per_class, sparse_output=False),
        probs_in_class)
    class_prob = C.times_transpose(
        C.one_hot(target_class, num_classes=label_classes, sparse_output=False),
        class_probs)
    output_prob = prob_in_class * class_prob

    # this is for calculating all the outputs' probabilities
    all_probs = []
    for i in range(label_classes):
        ci = C.constant(i)
        ci_one_hot = C.one_hot(ci, num_classes=label_classes, sparse_output=False)
        w2a = C.times(ci_one_hot, w2s, output_rank=2)
        b2a = C.times(ci_one_hot, b2s, output_rank=1)
        probs_in_classa = C.softmax(b2a + times(input_var, w2a))
        class_proba = C.times_transpose(ci_one_hot, class_probs)
        output_proba = probs_in_classa * class_proba
        all_probs.append(output_proba)

    return output_prob, class_probs, all_probs
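# Hedged usage sketch for hierarchical_softmax_layer (my own wiring, assuming
# `parameter`, `times` and `softmax` are imported from cntk.ops as the function
# above expects). Dimensions and values are illustrative only.
import numpy as np
import cntk as C

input_dim, label_dim = 4, 10
x = C.input_variable((input_dim,))
label_idx = C.input_variable((1,))
output_prob, class_probs, all_probs = hierarchical_softmax_layer(x, label_idx, label_dim)
# label_classes = ceil(sqrt(10)) = 4, so 4 classes of up to 3 outputs each
print(class_probs.shape, len(all_probs))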
def masking(input, labels):
    if not is_onehot_encoded:
        mask = ct.reshape(ct.one_hot(
            ct.reshape(ct.argmax(labels, axis=0), shape=(-1,)), 10), shape=(10, 1, 1))
        mask = ct.stop_gradient(mask)
    else:
        mask = ct.reshape(labels, shape=(10, 1, 1))
    mask = ct.splice(*([mask] * 16), axis=1)
    return ct.reshape(ct.element_times(input, mask), shape=(-1,))
def attention_layer(self, context, query, layer):
    q_processed = C.placeholder(shape=(2 * self.hidden_dim,))
    p_processed = C.placeholder(shape=(2 * self.hidden_dim,))

    qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

    wq = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim), init=C.glorot_uniform())
    wp = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim), init=C.glorot_uniform())
    wg = C.parameter(shape=(8 * self.hidden_dim, 8 * self.hidden_dim), init=C.glorot_uniform())
    v = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())

    # seq[tensor[2d]] p_len x 2d
    wpt = C.reshape(C.times(p_processed, wp), (-1, 2 * self.hidden_dim))
    # q_len x 2d
    wqt = C.reshape(C.times(qvw, wq), (-1, 2 * self.hidden_dim))
    # seq[tensor[q_len]]
    S = C.reshape(C.times(C.tanh(C.sequence.broadcast_as(wqt, p_processed) + wpt), v), (-1))
    qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, p_processed)
    # seq[tensor[q_len]]
    S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
    # seq[tensor[q_len]]
    A = C.softmax(S, axis=0)
    # seq[tensor[2d]]
    swap_qvw = C.swapaxes(qvw)
    cq = C.reshape(C.reduce_sum(A * C.sequence.broadcast_as(swap_qvw, A), axis=1), (-1))
    # seq[tensor[4d]]
    uc_concat = C.splice(p_processed, cq, p_processed * cq, cq * cq)
    # seq[tensor[4d]]
    gt = C.tanh(C.times(uc_concat, wg))
    # seq[tensor[4d]]
    uc_concat_star = gt * uc_concat
    # seq[tensor[4d]]
    vp = C.layers.Sequential([
        C.layers.Dropout(self.dropout),
        OptimizedRnnStack(self.hidden_dim, bidirectional=True,
                          use_cudnn=self.use_cudnn,
                          name=layer + '_attention_rnn')])(uc_concat_star)

    return C.as_block(
        vp,
        [(p_processed, context), (q_processed, query)],
        'attention_layer',
        'attention_layer')
def LocalResponseNormalization(k, n, alpha, beta, name=''):
    x = C.placeholder(name='lrn_arg')
    x2 = C.square(x)
    # reshape to insert a fake singleton reduction dimension after the 3rd axis
    # (the channel axis). Note: Python and BrainScript axis orders are reversed.
    x2s = C.reshape(x2, (1, C.InferredDimension), 0, 1)
    W = C.constant(alpha / (2 * n + 1), (1, 2 * n + 1, 1, 1), name='W')
    # 3D convolution with a filter that has a non-1 size only in the 3rd axis,
    # and does not reduce, since the reduction dimension is fake and of size 1
    y = C.convolution(W, x2s)
    # reshape back to remove the fake singleton reduction dimension
    b = C.reshape(y, C.InferredDimension, 0, 2)
    den = C.exp(beta * C.log(k + b))
    apply_x = C.element_divide(x, den)
    return apply_x
def cnwindow(mna, window):
    mnas = mna.shape
    mnout = (*mnas[:-2], *window,
             ((mnas[-2] - window[-2]) + 1), ((mnas[-1] - window[-1]) + 1))
    mne2 = None
    for R in range(window[0]):
        j_lim = R + mnout[-2]
        for H in range(window[1]):
            tdata = C.slice(mna, [-2, -1], [R, H], [j_lim, (H + mnout[-1])])
            if mne2 is None:
                mne2 = tdata
            else:
                mne2 = C.splice(mne2, tdata, axis=1)
    return C.reshape(
        C.transpose(C.reshape(mne2, shape=mnout), (0, 5, 4, 3, 2, 1)),
        (mnout[0], *mnout[5:3:-1], 1, *mnout[3:0:-1]))
def cgan_discriminator(x, y):
    with C.layers.default_options(init=C.normal(scale=0.02), map_rank=1, use_cntk_engine=True):
        hx = C.reshape(x, (1, 28, 28))
        hy = C.ones_like(hx) * C.reshape(y, (label_dim, 1, 1))
        h = C.splice(hx, hy, axis=0)

        h = C.leaky_relu((Convolution2D((5, 5), 1, strides=(2, 2))(h)), alpha=0.2)
        h = C.leaky_relu(BatchNormalization()(Convolution2D((5, 5), 64, strides=(2, 2))(h)), alpha=0.2)
        h = C.leaky_relu(BatchNormalization()(Dense(1024)(h)), alpha=0.2)

        h = Dense(1, activation=C.sigmoid)(h)

    return h
def UpSampling2D(x):
    xr = C.reshape(x, (x.shape[0], x.shape[1], 1, x.shape[2], 1))
    xx = C.splice(xr, xr, axis=-1)  # axis=-1 refers to the last axis
    xy = C.splice(xx, xx, axis=-3)  # axis=-3 refers to the middle axis
    r = C.reshape(xy, (x.shape[0], x.shape[1] * 2, x.shape[2] * 2))
    '''
    print("upsampling")
    print(xr.shape)
    print(xx.shape)
    print(xy.shape)
    print(r.shape)
    '''
    return r
def LocalResponseNormalization(k, n, alpha, beta, name=''):
    x = C.placeholder(name='lrn_arg')
    x2 = C.square(x)
    # reshape to insert a fake singleton reduction dimension after the 3rd axis
    # (the channel axis). Note: Python and BrainScript axis orders are reversed.
    x2s = C.reshape(x2, (1, C.InferredDimension), 0, 1)
    W = C.constant(alpha / (2 * n + 1), (1, 2 * n + 1, 1, 1), name='W')
    # 3D convolution with a filter that has a non-1 size only in the 3rd axis,
    # and does not reduce, since the reduction dimension is fake and of size 1
    y = C.convolution(W, x2s)
    # reshape back to remove the fake singleton reduction dimension
    b = C.reshape(y, C.InferredDimension, 0, 2)
    den = C.exp(beta * C.log(k + b))
    apply_x = C.element_divide(x, den)
    return apply_x
def var(array, W=_W, B=None, square=0, sqrt=0, V=False, sizz=0):
    # W = tf.transpose(W, [0,2,3,1])
    arrs = array.shape
    ashp = W.shape
    sb = (W.shape[1], 1, 1)
    WV = W.shape[-2:]
    xi = (-2, -1)
    x2 = (-2, -1, -3)
    if V:
        print(W.eval())
        print(arrs, ashp)
    mul = (array * W)
    if V:
        print('Wsamp', W[-1, -1].eval())
        print('array*w', (mul.eval())[0, -1])
    size = C.reduce_sum(W, axis=xi)  # shape = (outputs, channel)
    if V:
        print("sizesamp", size.shape, size.eval())
    if B is None:
        B = C.constant(0, shape=W.shape[0:2], dtype=np.float32)  # channel
    B = C.reshape(B, (*B.shape, *[1 for _ in range(len(ashp) - len(B.shape))]))
    if sizz == 1:
        mean = C.reduce_sum(mul, axis=xi) / size
    else:
        mean = C.reduce_sum(mul, axis=xi) / C.constant(value=WV[0] * WV[1], shape=sb, dtype=np.float32)
    if V:
        print("meansamp", mean.eval()[0, -1])
    if square:
        i = (C.square(mul - mean) + B)
    else:
        i = (((mul) - mean) + B)
    di = i / size
    if V == 2:
        print("i", i.eval(), "i")
        print("di", di.eval(), "di")
    if V:
        print('isamp', i.shape, i.eval()[-1, -1, ])
    out = C.reduce_sum(i + B, axis=x2)
    # out = np.rollaxis(np.sum(i + B, axis=x2), -1, 1)
    print(out.shape)
    if sqrt:
        out = C.sqrt(out)
    out = C.swapaxes(C.reshape(out, out.shape[:4]), 3, 1)
    print(out.shape)
    assert out.shape == (arrs[0], ashp[0], arrs[1], arrs[2])
    return out
def linear_units(input_var, output_dim):
    input_dim = input_var.shape[0]
    # Introduce model parameters
    weight_param = C.parameter(shape=(output_dim, input_dim), name="weights")
    bias_param = C.parameter(shape=(output_dim, 1), name="biases")
    # Reshape to facilitate matrix multiplication
    input_reshaped = C.reshape(input_var, (input_dim, 1))
    # Weighted sums
    params['w'], params['b'] = weight_param, bias_param
    part1 = C.times(weight_param, input_reshaped)
    # Add biases
    part2 = part1 + bias_param
    # Return 1-D representation
    return C.reshape(part2, (num_classes))
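# Usage sketch (assumption: `params` and `num_classes` are module-level globals
# in the original context; they are defined here only to make the example
# self-contained and are illustrative).
import numpy as np
import cntk as C

params = {}
num_classes = 3
x = C.input_variable((5,))
z = linear_units(x, num_classes)
print(z.eval({x: np.ones((1, 5), dtype=np.float32)}))  # three weighted sums plus biases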
def attention_layer(self, context, query):
    q_processed = C.placeholder(shape=(2 * self.hidden_dim,))
    c_processed = C.placeholder(shape=(2 * self.hidden_dim,))

    # convert query's sequence axis to static
    qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

    # This part deserves some explanation.
    # It is the attention layer.
    # In the paper they use a 6 * dim dimensional vector;
    # here we split it in three parts because the different parts
    # participate in very different operations,
    # so W * [h; u; h.*u] becomes w1 * h + w2 * u + w3 * (h.*u)
    ws1 = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())
    ws2 = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())
    ws3 = C.parameter(shape=(1, 2 * self.hidden_dim), init=C.glorot_uniform())
    att_bias = C.parameter(shape=(), init=0)

    wh = C.times(c_processed, ws1)
    wu = C.reshape(C.times(qvw, ws2), (-1,))
    whu = C.reshape(C.reduce_sum(
        c_processed * C.sequence.broadcast_as(qvw * ws3, c_processed), axis=1), (-1,))
    S = wh + whu + C.sequence.broadcast_as(wu, c_processed) + att_bias
    # mask out values outside of Query, and fill in gaps with -1e+30 as a neutral
    # value for both reduce_log_sum_exp and reduce_max
    qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, c_processed)
    S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
    q_attn = C.reshape(C.softmax(S), (-1, 1))
    # q_attn = print_node(q_attn)
    c2q = C.reshape(C.reduce_sum(
        C.sequence.broadcast_as(qvw, q_attn) * q_attn, axis=0), (-1))

    max_col = C.reduce_max(S)
    c_attn = C.sequence.softmax(max_col)
    htilde = C.sequence.reduce_sum(c_processed * c_attn)
    q2c = C.sequence.broadcast_as(htilde, c_processed)
    q2c_out = c_processed * q2c

    att_context = C.splice(c_processed, c2q, c_processed * c2q, q2c_out)

    return C.as_block(
        att_context,
        [(c_processed, context), (q_processed, query)],
        'attention_layer', 'attention_layer')
def test_op_reshape(input_shape, output_shape, expected_output_shape, device_id, precision):
    # Forward pass test
    # ==================
    # We compute the expected output for the forward pass.
    # We need two surrounding brackets:
    # the first for sequences (length=1, since we have dynamic_axis=''),
    # the second for the batch of one sample.
    num_tensor_elements = np.multiply.reduce(input_shape)
    input_tensor = np.arange(num_tensor_elements).reshape(input_shape)
    expected_tensor = input_tensor.reshape(expected_output_shape, order='C')

    a = I([input_tensor])

    # reshape into output shape
    reshaped_input = C.reshape(a, output_shape)

    unittest_helper(reshaped_input, None, [[expected_tensor]],
                    device_id=device_id, precision=precision,
                    clean_up=True, backward_pass=False)

    # Backward pass test
    # ==================
    # Reshaping is just moving the input values to different indexes of the result tensor.
    # If we computed the gradients on the unmodified tensor, reshape would get 1 for all inputs.
    # For testing the gradients we want to have different gradients for each input index;
    # otherwise we can't test if they get wrongly permuted during the test.
    # To this end we multiply the reshaping result with itself.
    # The expected gradient is identical to the input tensor.
    a = I([input_tensor])

    # reshape into output shape
    reshaped_input = C.reshape(a, output_shape)

    output = reshaped_input * expected_tensor

    unittest_helper(output, None, [[input_tensor]],
                    device_id=device_id, precision=precision,
                    clean_up=True, backward_pass=True, input_node=a)
def MyMeanVarNorm(feature_mean_file, feature_inv_stddev_file):
    m = C.reshape(load_ascii_vector(feature_mean_file, 'feature_mean'),
                  shape=(1, feature_dim))
    s = C.reshape(load_ascii_vector(feature_inv_stddev_file, 'feature_invstddev'),
                  shape=(1, feature_dim))

    def _func(operand):
        return C.reshape(
            C.element_times(
                C.reshape(operand, shape=(1 + context[0] + context[1], feature_dim)) - m,
                s),
            shape=operand.shape)

    return _func
def scale_dot_product_attention_block(self, contextQ, contextV, contextK, name):
    Q = C.placeholder(shape=(2 * self.hidden_dim,), dynamic_axes=[self.b_axis, self.q_axis])
    V = C.placeholder(shape=(2 * self.hidden_dim,), dynamic_axes=[self.b_axis, self.q_axis])
    K = C.placeholder(shape=(2 * self.hidden_dim,), dynamic_axes=[self.b_axis, self.q_axis])

    Ql = C.layers.Dense(100)(Q)
    Vl = C.layers.Dense(100)(V)
    Kl = C.layers.Dense(100)(K)

    kvw, kvw_mask = C.sequence.unpack(Kl, padding_value=0).outputs
    vvw, _ = C.sequence.unpack(Vl, padding_value=0).outputs
    KT = C.swapaxes(kvw)

    S = C.reshape(C.times(Ql, KT) / math.sqrt(100), -1)
    kvw_mask_expanded = C.sequence.broadcast_as(kvw_mask, Ql)
    S = C.softmax(C.element_select(kvw_mask_expanded, S, C.constant(-1e+30)))
    att = C.times(S, vvw)

    return C.as_block(
        att,
        [(Q, contextQ), (V, contextV), (K, contextK)],
        'sdp_attention_block' + name, 'sdp_attention_block' + name)
def attention(encoded, network):
    abk = dense(network)
    a, b, k = gaussian_windows_attention_coefficients(abk, nb_mixtures)
    # print("abk shape:", a.shape, b.shape, k.shape)
    # a, b, k: [#, n] [nb_mixture, 1]
    # context: [#, c] [char_ohe]

    encoded_unpacked = C.sequence.unpack(encoded, padding_value=0, no_mask_output=True)
    # encoded_unpacked: [#] [*=c, char_ohe]

    u = Cx.sequence.position(encoded)  # position gives shape=(1, )
    # u: [#, c], [1]
    u_values, u_valid = C.sequence.unpack(u, padding_value=999_999).outputs
    # u_values: [#] [*=c, 1]
    # u_valid: [#] [*=c]
    u_values_broadcast = C.swapaxes(C.sequence.broadcast_as(u_values, k))
    # u_values_broadcast: [#, n] [1, *=c]
    u_valid_broadcast = C.sequence.broadcast_as(C.reshape(u_valid, (1,), 1), k)
    # u_valid_broadcast: [#, n] [*=c, 1] ~ shape verified correct at this point

    # print("u_values_broadcast shape:", u_values_broadcast.shape)
    # print("abk shape:", a.shape, b.shape, k.shape)
    phi = window_weight(a, b, k, u_values_broadcast)
    # phi: [#, n] [*=c, 1]
    zero = C.constant(0)
    phi = C.element_select(u_valid_broadcast, phi, zero, name="phi")
    # phi: [#, n] [*=c, 1]
    attended = C.reduce_sum(phi * C.sequence.broadcast_as(encoded_unpacked, phi), axis=0)
    # attended: [#, n] [1, char_ohe]
    # print("attended_context shape:", attended_context.shape)
    output = C.squeeze(attended, name="GaussianWindowAttention")
    # output: [#, n] [char_ohe]
    return output
def test_cntk_conv2d():
    try:
        import tensorflow
        has_tensorflow = True
    except:
        has_tensorflow = False

    if has_tensorflow:
        tf_baseline_conv2d()
    else:
        cntk_baseline_conv2d()

    import cntk as C
    import cntk.contrib.crosstalk.crosstalk_cntk as crct
    ci = crct.instance

    input_var = C.input_variable(shape=sample_shape)
    input_reshaped = C.reshape(input_var, (1,) + sample_shape)
    conv_out = C.layers.Convolution2D(filter_shape, num_filters, activation=None)(input_reshaped)

    ci.watch(conv_out, 'conv2d', var_type=cstk.Conv2DAttr,
             attr=cstk.Conv2DAttr(filter_shape=filter_shape, num_filters=num_filters))
    ci.watch(conv_out, 'conv2d_out')

    data = {input_var: input_data}
    ci.set_data(data)
    ci.set_workdir(workdir)

    conv_out.eval(data)

    # load parameters from crosstalk and verify results are the same
    ci.assign('conv2d', load=True)
    assert ci.compare('conv2d_out', rtol=1e-4, atol=1e-6)

    ci.reset()
def test_sequential_convolution_without_reduction_dim():
    c = Convolution(3, init=np.array([4., 2., 1.], dtype=np.float32),
                    sequential=True, pad=False, reduction_rank=0, bias=False)
    c.update_signature(Sequence[Tensor[()]])  # input is a sequence of scalars
    data = [np.array([2., 6., 4., 8., 6.])]   # like a short audio sequence, in the dynamic dimension
    out = c(data)
    exp = [[24., 40., 38.]]
    np.testing.assert_array_equal(out, exp,
        err_msg='Error in sequential convolution without reduction dimension')

    c = Convolution(3, init=np.array([4., 2., 1.], dtype=np.float32),
                    sequential=True, pad=False, reduction_rank=0, bias=False)
    c.update_signature(Sequence[Tensor[1]])  # input is a sequence of dim-1 vectors
    data = [np.array([[2.], [6], [4.], [8.], [6.]])]
    out = c(data)
    exp = [[[24.], [40.], [38]]]  # not reducing; hence, output is also a sequence of dim-1 vectors
    np.testing.assert_array_equal(out, exp,
        err_msg='Error in sequential convolution without reduction dimension')

    # these cases failed before
    emb_dim = 10
    x = C.input_variable(**Sequence[Tensor[20]])
    m = Embedding(emb_dim)(x)
    m = Convolution(filter_shape=3, sequential=True)(m)

    # this one still fails:
    # Reshape: Operand (sub-)dimensions '[3]' incompatible with desired replacement
    # (sub-)dimensions '[]'. Number of elements must be the same.
    m = Embedding(emb_dim)(x)
    m = reshape(m, (emb_dim, 1))
    m = Convolution(filter_shape=(3, 1), num_filters=13, pad=True, sequential=True)(m)

    m = Embedding(emb_dim)(x)
    m = Convolution(filter_shape=3, pad=True, sequential=True)(m)
def create_model(input_sequence, label_sequence, vocab_dim, hidden_dim):
    # Create the rnn that computes the latent representation for the next token.
    rnn_with_latent_output = Sequential([
        C.layers.Embedding(hidden_dim),
        For(range(num_layers), lambda: Sequential([
            Stabilizer(),
            Recurrence(LSTM(hidden_dim), go_backwards=False)
        ])),
    ])

    # Apply it to the input sequence.
    latent_vector = rnn_with_latent_output(input_sequence)

    # Connect the latent output to (sampled/full) softmax.
    if use_sampled_softmax:
        weights = load_sampling_weights(token_frequencies_file_path)
        smoothed_weights = np.float32(np.power(weights, alpha))
        sampling_weights = C.reshape(C.Constant(smoothed_weights), shape=(1, vocab_dim))
        z, ce, errs = cross_entropy_with_sampled_softmax(
            latent_vector, label_sequence, vocab_dim, hidden_dim,
            softmax_sample_size, sampling_weights)
    else:
        z, ce, errs = cross_entropy_with_full_softmax(
            latent_vector, label_sequence, vocab_dim, hidden_dim)

    return z, ce, errs
def input_layer(self, embed, cgw, cnw, cc, qgw, qnw, qc):
    cgw_ph = C.placeholder()
    cnw_ph = C.placeholder()
    cc_ph = C.placeholder()
    qgw_ph = C.placeholder()
    qnw_ph = C.placeholder()
    qc_ph = C.placeholder()

    input_chars = C.placeholder(shape=(1, self.word_size, self.c_dim))
    input_glove_words = C.placeholder(shape=(self.wg_dim,))
    input_nonglove_words = C.placeholder(shape=(self.wn_dim,))

    # we need to reshape because GlobalMaxPooling/reduce_max is retaining a
    # trailing singleton dimension
    # todo GlobalPooling/reduce_max should have a keepdims default to False
    embedded = C.splice(
        C.reshape(self.charcnn(input_chars), self.convs),
        embed(input_glove_words, input_nonglove_words),
        name='splice_embed')
    highway = HighwayNetwork(dim=2 * self.hidden_dim,
                             highway_layers=self.highway_layers)(embedded)
    highway_drop = C.layers.Dropout(self.dropout)(highway)
    processed = OptimizedRnnStack(self.hidden_dim, bidirectional=True,
                                  use_cudnn=self.use_cudnn, name='input_rnn')(highway_drop)

    qce = C.one_hot(qc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
    cce = C.one_hot(cc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
    # ace = C.one_hot(ac_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)

    q_processed = processed.clone(C.CloneMethod.share,
        {input_chars: qce, input_glove_words: qgw_ph, input_nonglove_words: qnw_ph})
    c_processed = processed.clone(C.CloneMethod.share,
        {input_chars: cce, input_glove_words: cgw_ph, input_nonglove_words: cnw_ph})

    return C.as_block(
        C.combine([c_processed, q_processed]),
        [(cgw_ph, cgw), (cnw_ph, cnw), (cc_ph, cc), (qgw_ph, qgw), (qnw_ph, qnw), (qc_ph, qc)],
        'input_layer', 'input_layer')
def input_layer(self, cgw, cnw, cc, qgw, qnw, qc):
    cgw_ph = C.placeholder()
    cnw_ph = C.placeholder()
    cc_ph = C.placeholder()
    qgw_ph = C.placeholder()
    qnw_ph = C.placeholder()
    qc_ph = C.placeholder()

    input_chars = C.placeholder(shape=(1, self.word_size, self.c_dim))
    input_glove_words = C.placeholder(shape=(self.wg_dim,))
    input_nonglove_words = C.placeholder(shape=(self.wn_dim,))

    # we need to reshape because GlobalMaxPooling/reduce_max is retaining a
    # trailing singleton dimension
    # todo GlobalPooling/reduce_max should have a keepdims default to False
    embedded = C.splice(
        C.reshape(self.charcnn(input_chars), self.convs),
        self.embed()(input_glove_words, input_nonglove_words),
        name='splice_embed')
    processed = C.layers.Sequential([
        For(range(2), lambda: OptimizedRnnStack(self.hidden_dim, bidirectional=True,
                                                use_cudnn=self.use_cudnn,
                                                name='input_rnn'))])(embedded)

    qce = C.one_hot(qc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
    cce = C.one_hot(cc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)

    q_processed = processed.clone(C.CloneMethod.share,
        {input_chars: qce, input_glove_words: qgw_ph, input_nonglove_words: qnw_ph})
    c_processed = processed.clone(C.CloneMethod.share,
        {input_chars: cce, input_glove_words: cgw_ph, input_nonglove_words: cnw_ph})

    return C.as_block(
        C.combine([c_processed, q_processed]),
        [(cgw_ph, cgw), (cnw_ph, cnw), (cc_ph, cc), (qgw_ph, qgw), (qnw_ph, qnw), (qc_ph, qc)],
        'input_layer', 'input_layer')
def test_sequence_unpack_with_convolution(device_id, precision):
    x = C.sequence.input((20, 20))
    y = C.sequence.unpack(x, 0, no_mask_output=True)
    z = C.reshape(y, (3, 20, 20))
    kernel = C.constant(1.0, (4, 3, 3, 3))
    t = C.convolution(kernel, z, auto_padding=[False, True, True])
    val = np.random.random((2, 3, 20, 20)).astype(np.float32)
    result = t.eval({x: val})
    assert np.array_equal(result.shape, (2, 4, 20, 20))
def test_op_reshape(inputShape, outputShape, expectedOutputShape, device_id, precision):
    # Forward pass test
    # ==================
    # We compute the expected output for the forward pass.
    # We need two surrounding brackets:
    # the first for sequences (length=1, since we have dynamic_axis=''),
    # the second for the batch of one sample.
    num_tensor_elements = np.multiply.reduce(inputShape)
    input_tensor = np.arange(num_tensor_elements).reshape(inputShape)
    expected_tensor = input_tensor.reshape(expectedOutputShape, order='F')

    a = I([input_tensor])

    # reshape into output shape
    reshaped_input = C.reshape(a, outputShape)

    unittest_helper(reshaped_input, None, [[expected_tensor]],
                    device_id=device_id, precision=precision,
                    clean_up=True, backward_pass=False)

    # Backward pass test
    # ==================
    # Reshaping is just moving the input values to different indexes of the result tensor.
    # If we computed the gradients on the unmodified reshape we would get 1 for all inputs.
    # For testing the gradients we want to have different gradients for each input index;
    # otherwise we can't test if they get wrongly permuted during the test. To this end we
    # multiply the reshaping result with some weight tensor. For convenience we choose
    # '100 * expected_tensor' as the weight.
    # The expected gradient is identical to this weight tensor reshaped according to the input shape.
    a = I([input_tensor])

    # reshape into output shape
    reshaped_input = C.reshape(a, outputShape)

    some_factor = 100
    weight = expected_tensor * some_factor

    output = reshaped_input * weight
    expected_gradient = input_tensor * some_factor

    unittest_helper(output, None, [[expected_gradient]],
                    device_id=device_id, precision=precision,
                    clean_up=True, backward_pass=True, input_node=a)
def test_op_reshape_free_dimension(device_id):
    dev = cntk_device(device_id)
    x = C.input_variable((C.FreeDimension, 2, 2))

    x_reshaped_1 = C.reshape(x, (-1,), 0, 2)
    data = [[[1, 2], [3, 4]]]
    result = x_reshaped_1.eval({x: np.asarray(data, dtype=np.float32)})
    assert np.array_equal(result[0], data[0])
    data = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
    result = x_reshaped_1.eval({x: np.asarray(data, dtype=np.float32)})
    assert np.array_equal(result[0], np.reshape(data, (4, 2)))

    x_reshaped_2 = C.reshape(x, (-1,), 1, 3)
    data = [[[1, 2], [3, 4]]]
    result = x_reshaped_2.eval({x: np.asarray(data, dtype=np.float32)})
    assert np.array_equal(result[0], np.reshape(data, (1, 4)))
    data = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
    result = x_reshaped_2.eval({x: np.asarray(data, dtype=np.float32)})
    assert np.array_equal(result[0], np.reshape(data, (2, 4)))
def test_depth_to_space(image_shape, num_channels, block_size, device_id, precision):
    dev = cntk_device(device_id)
    from cntk.internal import sanitize_dtype_cntk

    input_val = np.array(np.reshape(range(num_channels), (num_channels, 1, 1)),
                         dtype=PRECISION_TO_TYPE[precision])
    input_val = np.tile(input_val, (1,) + image_shape)
    img = C.input_variable((num_channels,) + image_shape,
                           dtype=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]))

    # Result from depth_to_space node.
    depth_to_space_op = C.depth_to_space(img, block_size)
    output_test = depth_to_space_op.eval({img: input_val})

    # Reference result from simulating depth_to_space with other CNTK ops.
    h, w = image_shape
    reshape_node = C.reshape(img, (block_size, block_size,
                                   num_channels // (block_size**2), h, w))
    transpose_node = C.transpose(reshape_node, [2, 3, 0, 4, 1])
    depth_to_space_sim_op = C.reshape(transpose_node,
                                      (num_channels // (block_size**2),
                                       h * block_size, w * block_size))
    output_ref = depth_to_space_sim_op.eval({img: input_val})

    assert np.array_equal(output_test, output_ref)
def test_sequence_unpack_with_convolution(device_id, precision):
    dt = PRECISION_TO_TYPE[precision]
    dev = cntk_device(device_id)
    x = C.sequence.input((20, 20), dtype=dt)
    y = C.sequence.unpack(x, 0, no_mask_output=True)
    z = C.reshape(y, (3, 20, 20))
    kernel = C.constant(1.0, (4, 3, 3, 3), device=dev)
    t = C.convolution(kernel, z, auto_padding=[False, True, True])
    val = np.random.random((2, 3, 20, 20)).astype(dt)
    result = t.eval({x: val}, device=dev)
    assert np.array_equal(result.shape, (2, 4, 20, 20))
def cross_entropy_with_sampled_softmax(
        hidden_vector,          # Node providing the output of the recurrent layers
        target_vector,          # Node providing the expected labels (as sparse vectors)
        vocab_dim,              # Vocabulary size
        hidden_dim,             # Dimension of the hidden vector
        num_samples,            # Number of samples to use for sampled softmax
        sampling_weights,       # Node providing weights to be used for the weighted sampling
        allow_duplicates=False  # Boolean flag to control whether to use sampling with replacement
                                # (allow_duplicates == True) or without replacement.
):
    bias = C.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(sampling_weights, num_samples, allow_duplicates)  # sparse matrix [num_samples * vocab_size]
    if use_sparse:
        sample_selector = sample_selector_sparse
    else:
        # Note: Sampled softmax with dense data is only supported for debugging purposes.
        # It might easily run into memory issues as the matrix 'I' below might be quite large.
        # In case we want a dense representation for all data we have to convert the sample selector.
        I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
        sample_selector = C.times(sample_selector_sparse, I)

    inclusion_probs = C.random_sample_inclusion_frequency(sampling_weights, num_samples, allow_duplicates)  # dense row [1 * vocab_size]
    log_prior = C.log(inclusion_probs)  # dense row [1 * vocab_dim]

    print("hidden_vector: " + str(hidden_vector.shape))
    wS = C.times(sample_selector, weights, name='wS')  # [num_samples * hidden_dim]
    print("wS: " + str(wS.shape))
    zS = C.times_transpose(wS, hidden_vector, name='zS1') \
        + C.times(sample_selector, bias, name='zS2') \
        - C.times_transpose(sample_selector, log_prior, name='zS3')  # [num_samples]

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(target_vector, weights, name='wT')  # [1 * hidden_dim]
    zT = C.times_transpose(wT, hidden_vector, name='zT1') \
        + C.times(target_vector, bias, name='zT2') \
        - C.times_transpose(target_vector, log_prior, name='zT3')  # [1]

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    # We don't check whether any of the classes in the random samples coincides with the true
    # label, so it might happen that the true class is counted twice in the normalizing
    # denominator of sampled softmax.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape=(vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)

    return (z, cross_entropy_on_samples, error_on_samples)
def CustomMultibitKernel(input, bit_map, mean_bits=None):
    if mean_bits:
        bit_map = np.asarray(
            np.maximum(np.round(np.random.normal(mean_bits, 1, input.shape)), 1),
            dtype=np.int32)
        print("Mean Bits: ", np.mean(bit_map))
    else:
        if type(bit_map) == int:
            length = C.reshape(input, (-1))
            bit_map = [bit_map] * length.shape[0]
            bit_map = np.asarray(bit_map)
            bit_map = bit_map.reshape(input.shape)
        else:
            bit_map = np.asarray(bit_map)
    assert bit_map.shape == input.shape
    return user_function(MultibitKernel(input, bit_map))
def test_convert_dynamic_axis():
    # test fixed batch size
    batch_size = 4
    a = C.parameter(shape=(batch_size, 2, 3), init=1)
    dynamic_a = C.to_batch(a)
    assert len(dynamic_a.dynamic_axes) == 1
    assert dynamic_a.shape == (2, 3)

    x = C.input_variable((2, 3))
    y = x * dynamic_a

    # test grad
    data = np.arange(batch_size * 2 * 3).reshape(batch_size, 2, 3).astype('f')
    assert np.array_equal(y.grad({x: data}, [a]), data)

    const_a = C.unpack_batch(y)
    assert len(const_a.dynamic_axes) == 0
    assert const_a.shape == (C.FreeDimension, 2, 3)

    f = C.assign(a, const_a)
    f.eval({x: data})
    assert np.array_equal(a.value, data)

    # test reshape with batch axis
    x = C.input_variable((2, 3))
    const_x = C.unpack_batch(x)
    assert len(const_x.dynamic_axes) == 0
    assert const_x.shape == (C.FreeDimension, 2, 3)

    const_y = C.reshape(const_x, (-1, 3))
    assert const_y.shape == (C.FreeDimension, 3)
    y = C.to_batch(const_y)
    assert len(y.dynamic_axes) == 1
    assert y.shape == (3,)

    z = y * 2
    expected = data.reshape((8, 3)) * 2
    assert np.array_equal(z.eval({x: data}), expected)

    # test inferred dimension
    x = C.input_variable((C.InferredDimension, 3))
    const_x = C.unpack_batch(x)
    assert len(const_x.dynamic_axes) == 0
    assert const_x.shape == (C.FreeDimension, C.InferredDimension, 3)

    const_y = const_x * 2
    y = C.to_batch(const_y)
    assert len(y.dynamic_axes) == 1
    assert y.shape == (C.InferredDimension, 3)
def cross_entropy_with_full_softmax(
        hidden_vector,  # Node providing the output of the recurrent layers
        target_vector,  # Node providing the expected labels (as sparse vectors)
        vocab_dim,      # Vocabulary size
        hidden_dim      # Dimension of the hidden vector
):
    bias = C.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    z = C.reshape(C.times_transpose(weights, hidden_vector) + bias, (1, vocab_dim))
    zT = C.times_transpose(z, target_vector)
    ce = C.reduce_log_sum_exp(z) - zT
    zMax = C.reduce_max(z)
    error_on_samples = C.less(zT, zMax)

    return (z, ce, error_on_samples)
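# A small usage sketch (my own wiring, not from the source): score one hidden
# vector against a one-hot target. Dimensions and values are illustrative only.
import numpy as np
import cntk as C

vocab_dim, hidden_dim = 6, 4
hidden = C.input_variable((hidden_dim,))
target = C.input_variable((vocab_dim,))
z, ce, err = cross_entropy_with_full_softmax(hidden, target, vocab_dim, hidden_dim)

h = np.random.randn(1, hidden_dim).astype(np.float32)
t = np.eye(vocab_dim, dtype=np.float32)[[2]]  # one-hot row for class 2
print(ce.eval({hidden: h, target: t}))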
def test_op_times_sparse_grad(device_id, precision):
    dt_precision = PRECISION_TO_TYPE[precision]

    from cntk import times, times_transpose, parameter, reshape, Value, sequence

    dim = 5
    num_sequences = 2
    seq = [i for i in range(dim)]
    identity = np.identity(dim, dtype=dt_precision)
    input_data = Value.one_hot([seq] * num_sequences, dim, dtype=dt_precision)
    input_var = sequence.input_variable(shape=(dim), is_sparse=True,
                                        needs_gradient=False, dtype=dt_precision)
    e = parameter(shape=(dim, dim), init=identity, dtype=dt_precision)
    z = reshape(times_transpose(e, times(input_var, e)), dim)
    e_grad = z.grad({input_var: input_data}, [e])

    assert np.allclose(e_grad, np.ones((dim, dim)) * 4)
def cntk_baseline_conv2d():
    import cntk as C
    import cntk.contrib.crosstalk.crosstalk_cntk as crct
    ci = crct.instance

    input_var = C.input_variable(shape=sample_shape)
    input_reshaped = C.reshape(input_var, (1,) + sample_shape)
    conv_out = C.layers.Convolution2D(filter_shape, num_filters,
                                      init_bias=C.glorot_uniform())(input_reshaped)

    ci.watch(conv_out, 'conv2d', var_type=cstk.Conv2DAttr,
             attr=cstk.Conv2DAttr(filter_shape=filter_shape, num_filters=num_filters))
    ci.watch(conv_out, 'conv2d_out')

    data = {input_var: input_data}
    ci.set_data(data)
    ci.set_workdir(workdir)
    ci.fetch('conv2d', save=True)
    ci.fetch('conv2d_out', save=True)
    ci.reset()
def test_data_resize():
    batch_size = 8
    w = C.parameter(shape=(3, 2), name='w1')
    x = C.input_variable(shape=[3], name='x')
    y = C.softmax(C.times(x, w))
    y = C.unpack_batch(y)
    y = C.reshape(y, [batch_size * 2])
    loss = C.reduce_mean(-C.log(y))

    learning_rate = 0.01
    lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
    learner = C.sgd(y.parameters, lr_schedule, gradient_clipping_threshold_per_sample=1.0)
    trainer = C.Trainer(y, (loss), [learner])

    features = np.random.randn(batch_size, 3)
    trainer.train_minibatch({x: features})
def test_conv_free_static_with_sequence_unpack(num_features, sequence_len, filter_size,
                                               num_output_channels, batch_size,
                                               device_id, precision):
    dt = PRECISION_TO_TYPE[precision]
    dev = cntk_device(device_id)

    x_ref = C.input_variable((1, sequence_len, num_features), dtype=dt)
    conv_map_ref = C.constant(
        np.random.randn(num_output_channels, 1, filter_size[0], filter_size[1]).astype(dt),
        device=dev)
    w2_ref = C.convolution(conv_map_ref, x_ref, auto_padding=[False])
    x0_ref = np.arange(batch_size * 1 * sequence_len * num_features).astype(dt).reshape(
        batch_size, 1, sequence_len, num_features)
    output_ref = w2_ref.eval({x_ref: x0_ref}, device=dev)

    x_test = C.sequence.input_variable(num_features, dtype=dt)
    y_test, mask_test = C.sequence.unpack(x_test, 0).outputs
    z_test = C.reshape(y_test, (1,), 0, 0)
    w2_test = C.convolution(conv_map_ref, z_test, auto_padding=[False])
    output_test = w2_test.eval({x_test: np.squeeze(x0_ref)}, device=dev)

    assert np.allclose(output_test, output_ref, atol=1e-4)
def multiFunc(self, arg1):
    multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
    bit_map = C.constant(self.bit_map)
    max_bits = self.bit_map.max()
    shape = multiIn.shape
    reformed = C.reshape(multiIn, (-1,))
    carry_over = multiIn
    approx = C.element_times(multiIn, 0)
    for i in range(max_bits):
        hot_vals = C.greater(bit_map, i)
        valid_vals = C.element_select(hot_vals, carry_over, 0)
        mean = C.element_divide(C.reduce_sum(C.abs(valid_vals)), C.reduce_sum(hot_vals))
        bits = C.greater(carry_over, 0)
        bits = C.element_select(bits, bits, -1)
        bits = C.element_select(hot_vals, bits, 0)
        approx = C.plus(approx, C.element_times(mean, bits))
        carry_over = C.plus(C.element_times(C.element_times(-1, bits), mean), carry_over)

    return approx, multiIn
def create_binary_convolution_model():
    # Input variables denoting the features and label data
    feature_var = C.input((num_channels, image_height, image_width))
    label_var = C.input((num_classes))

    # apply model to input
    scaled_input = C.element_times(C.constant(0.00390625), feature_var)

    # first layer is ok to be full precision
    z = C.layers.Convolution((3, 3), 32, pad=True, activation=C.relu)(scaled_input)
    z = C.layers.MaxPooling((3, 3), strides=(2, 2))(z)

    z = C.layers.BatchNormalization(map_rank=1)(z)
    z = BinaryConvolution(z, (3, 3), 128, channels=32, pad=True)
    z = C.layers.MaxPooling((3, 3), strides=(2, 2))(z)

    z = C.layers.BatchNormalization(map_rank=1)(z)
    z = BinaryConvolution(z, (3, 3), 128, channels=128, pad=True)
    z = C.layers.MaxPooling((3, 3), strides=(2, 2))(z)

    z = C.layers.BatchNormalization(map_rank=1)(z)
    z = BinaryConvolution(z, (1, 1), num_classes, channels=128, pad=True)
    z = C.layers.AveragePooling((z.shape[1], z.shape[2]))(z)
    z = C.reshape(z, (num_classes,))

    # Add binary regularization (ala Gang Hua)
    weight_sum = C.constant(0)
    for p in z.parameters:
        if p.name == "filter":
            weight_sum = C.plus(weight_sum, C.reduce_sum(C.minus(1, C.square(p))))
    bin_reg = C.element_times(.000005, weight_sum)

    # After the last layer, we need to apply a learnable scale
    SP = C.parameter(shape=z.shape, init=0.001)
    z = C.element_times(z, SP)

    # loss and metric
    ce = C.cross_entropy_with_softmax(z, label_var)
    ce = C.plus(ce, bin_reg)
    pe = C.classification_error(z, label_var)

    return C.combine([z, ce, pe])
def create_model(input_sequence, label_sequence, vocab_dim, hidden_dim):
    # Create the rnn that computes the latent representation for the next token.
    rnn_with_latent_output = Sequential([
        C.Embedding(hidden_dim),
        For(range(num_layers), lambda: Sequential([
            Stabilizer(),
            Recurrence(LSTM(hidden_dim), go_backwards=False)
        ])),
    ])

    # Apply it to the input sequence.
    latent_vector = rnn_with_latent_output(input_sequence)

    # Connect the latent output to (sampled/full) softmax.
    if use_sampled_softmax:
        weights = load_sampling_weights(token_frequencies_file_path)
        smoothed_weights = np.float32(np.power(weights, alpha))
        sampling_weights = C.reshape(C.Constant(smoothed_weights), shape=(1, vocab_dim))
        z, ce, errs = cross_entropy_with_sampled_softmax(
            latent_vector, label_sequence, vocab_dim, hidden_dim,
            softmax_sample_size, sampling_weights)
    else:
        z, ce, errs = cross_entropy_with_full_softmax(
            latent_vector, label_sequence, vocab_dim, hidden_dim)

    return z, ce, errs
def test_cntk_conv2d():
    try:
        import tensorflow
        has_tensorflow = True
    except:
        has_tensorflow = False

    if has_tensorflow:
        tf_baseline_conv2d()
    else:
        cntk_baseline_conv2d()

    import cntk as C
    import cntk.contrib.crosstalk.crosstalk_cntk as crct
    ci = crct.instance

    input_var = C.sequence.input_variable(shape=sample_shape)
    input_reshaped = C.reshape(input_var, (1,) + sample_shape)
    conv_out = C.layers.Convolution2D(filter_shape, num_filters, activation=None)(input_reshaped)

    ci.watch(conv_out, 'conv2d', var_type=cstk.Conv2DAttr,
             attr=cstk.Conv2DAttr(filter_shape=filter_shape, num_filters=num_filters))
    ci.watch(conv_out, 'conv2d_out')

    data = {input_var: input_data}
    ci.set_data(data)
    ci.set_workdir(workdir)
    conv_out_values = conv_out.eval(data)

    # load parameters from crosstalk and verify results are the same
    ci.assign('conv2d', load=True)
    assert ci.compare('conv2d_out', rtol=1e-4, atol=1e-6)

    # test assign with value
    ci.assign('conv2d', value=cstk.Conv2DArgs(
        W=np.random.random((num_filters,) + filter_shape).astype(np.float32),
        b=np.random.random((num_filters,)).astype(np.float32)))

    ci.reset()
def test_Reshape(tmpdir, dtype):
    with C.default_options(dtype=dtype):
        data = np.asarray([[[[0., 1.], [2., 3.], [4., 5.]]]], dtype=dtype)
        i1 = C.input_variable(shape=(3, 2))
        model = C.reshape(i1, (2, 3))
        verify_one_input(model, data, tmpdir, 'Reshape_1')
def test_Reshape(tmpdir):
    data = np.asarray([[[[0., 1.], [2., 3.], [4., 5.]]]], dtype=np.float32)
    i1 = C.input_variable(shape=(3, 2))
    model = C.reshape(i1, (2, 3))
    verify_one_input(model, data, tmpdir, 'Reshape_1')
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True,
               proposal_layer_param_string=None, conv_bias_init=0.0):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         A CNTK variable or constant containing
                         (pad_width, pad_height, scaled_image_width, scaled_image_height, orig_img_width, orig_img_height)
                         e.g. (1000, 1000, 1000, 600, 500, 300) for an original image of 600x300 that is scaled and padded to 1000x1000
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    # RPN network
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    num_channels = cfg["CNTK"].RPN_NUM_CHANNELS
    rpn_conv_3x3 = Convolution((3, 3), num_channels, activation=relu, pad=True, strides=1,
                               init=normal(scale=0.01), init_bias=conv_bias_init)(conv_out)
    rpn_cls_score = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                init=normal(scale=0.01), init_bias=conv_bias_init)(rpn_conv_3x3)  # 2(bg/fg) * 9(anchors)
    rpn_bbox_pred = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                init=normal(scale=0.01), init_bias=conv_bias_init)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    num_predictions = int(rpn_cls_score.shape[0] / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score,
                                 (2, num_predictions, rpn_cls_score.shape[1], rpn_cls_score.shape[2]),
                                 name="rpn_cls_score_rshp")
    p_rpn_cls_score_rshp = cntk.placeholder()
    rpn_cls_sm = softmax(p_rpn_cls_score_rshp, axis=0)
    rpn_cls_prob = cntk.as_block(rpn_cls_sm, [(p_rpn_cls_score_rshp, rpn_cls_score_rshp)],
                                 'Softmax', 'rpn_cls_prob')
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, rpn_cls_score.shape, name="rpn_cls_prob_reshape")

    # proposal layer
    rpn_rois_raw = user_function(ProposalLayer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info,
                                               param_str=proposal_layer_param_string))
    rpn_rois = alias(rpn_rois_raw, name='rpn_rois')

    rpn_losses = None
    if add_loss_functions:
        # RPN targets
        # Note: rpn_cls_score is only passed to AnchorTargetLayer to get the width
        # and height of the conv feature map
        atl = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info,
                                              param_str=proposal_layer_param_string))
        rpn_labels = atl.outputs[0]
        rpn_bbox_targets = atl.outputs[1]
        rpn_bbox_inside_weights = atl.outputs[2]

        # classification loss
        p_rpn_labels = cntk.placeholder()
        p_rpn_cls_score_rshp = cntk.placeholder()

        keeps = cntk.greater_equal(p_rpn_labels, 0.0)
        fg_labels = element_times(p_rpn_labels, keeps, name="fg_targets")
        bg_labels = minus(1, fg_labels, name="bg_targets")
        rpn_labels_ignore = splice(bg_labels, fg_labels, axis=0)
        rpn_ce = cross_entropy_with_softmax(p_rpn_cls_score_rshp, rpn_labels_ignore, axis=0)
        rpn_loss_cls = element_times(rpn_ce, keeps)

        # The terms that are accounted for in the cls loss are those that have a label >= 0
        cls_num_terms = reduce_sum(keeps)
        cls_normalization_factor = 1.0 / cls_num_terms
        normalized_rpn_cls_loss = reduce_sum(rpn_loss_cls) * cls_normalization_factor

        reduced_rpn_loss_cls = cntk.as_block(normalized_rpn_cls_loss,
                                             [(p_rpn_labels, rpn_labels),
                                              (p_rpn_cls_score_rshp, rpn_cls_score_rshp)],
                                             'CE_with_ignore', 'norm_rpn_cls_loss')

        # regression loss
        p_rpn_bbox_pred = cntk.placeholder()
        p_rpn_bbox_targets = cntk.placeholder()
        p_rpn_bbox_inside_weights = cntk.placeholder()
        rpn_loss_bbox = SmoothL1Loss(cfg["CNTK"].SIGMA_RPN_L1, p_rpn_bbox_pred,
                                     p_rpn_bbox_targets, p_rpn_bbox_inside_weights, 1.0)
        # The bbox loss is normalized by the rpn batch size
        bbox_normalization_factor = 1.0 / cfg["TRAIN"].RPN_BATCHSIZE
        normalized_rpn_bbox_loss = reduce_sum(rpn_loss_bbox) * bbox_normalization_factor

        reduced_rpn_loss_bbox = cntk.as_block(normalized_rpn_bbox_loss,
                                              [(p_rpn_bbox_pred, rpn_bbox_pred),
                                               (p_rpn_bbox_targets, rpn_bbox_targets),
                                               (p_rpn_bbox_inside_weights, rpn_bbox_inside_weights)],
                                              'SmoothL1Loss', 'norm_rpn_bbox_loss')

        rpn_losses = plus(reduced_rpn_loss_cls, reduced_rpn_loss_bbox, name="rpn_losses")

    return rpn_rois, rpn_losses
def hierarchical_softmax_layer_for_sequence(input_var, num_output_classes, target_class,
                                            target_output_in_class, batch_size, w1, b1, w2s, b2s):
    '''
    A two-layer hierarchical softmax function for input with a sequence axis.

    Example:
        >>> input_dim = 2
        >>> num_output_classes = 4
        >>> minibatch_size = 3
        >>> seq_size = 5
        >>> n_classes = int(math.ceil(math.sqrt(num_output_classes)))
        >>> n_outputs_per_class = n_classes

        >>> w1 = C.parameter(shape=(input_dim, n_classes), init=C.glorot_normal(seed=2), name='w1')
        >>> b1 = C.parameter(shape=(n_classes), init=C.glorot_normal(seed=3), name='b1')
        >>> w2s = C.parameter(shape=(n_classes, input_dim, n_outputs_per_class), init=C.glorot_normal(seed=4), name='w2s')
        >>> b2s = C.parameter(shape=(n_classes, n_outputs_per_class), init=C.glorot_normal(seed=5), name='b2s')

        # neural network structure for hierarchical softmax
        >>> h_input = C.sequence.input_variable(input_dim)
        >>> h_target_class = C.sequence.input_variable([1])
        >>> h_target_output_in_class = C.sequence.input_variable([1])
        >>> h_z, class_probs, all_probs = hierarchical_softmax_layer_for_sequence(h_input, num_output_classes, h_target_class, h_target_output_in_class, minibatch_size, w1, b1, w2s, b2s)

        >>> a = np.reshape(np.arange(seq_size * minibatch_size * input_dim, dtype = np.float32), (seq_size, minibatch_size, input_dim))
        >>> labels = np.reshape(np.arange(seq_size * minibatch_size, dtype = np.float32), (seq_size, minibatch_size, 1)) % num_output_classes
        >>> target_labels = labels // n_outputs_per_class
        >>> target_output_in_labels = labels % n_outputs_per_class
        >>> h_z.eval({h_input: a, h_target_class: target_labels, h_target_output_in_class: target_output_in_labels})[1]
        array([[ 0.000859],
               [ 0.      ],
               [ 0.      ]], dtype=float32)

    Args:
        input_var: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        num_output_classes: int
        target_class: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        target_output_in_class: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        batch_size: int
        w1: C.parameter
        b1: C.parameter
        w2s: C.parameter
        b2s: C.parameter

    Returns:
        output_prob: class:`~cntk.ops.functions.Function`
        class_probs: class:`~cntk.ops.functions.Function`
        all_probs: a list of class:`~cntk.ops.functions.Function`
    '''
    input_dim = input_var.shape[0]

    n_classes = int(math.ceil(math.sqrt(num_output_classes)))
    n_outputs_per_class = n_classes

    class_probs = C.softmax(b1 + C.times(input_var, w1))

    w2_temp = C.gather(w2s, target_class)
    w2 = reshape(w2_temp, (input_dim, n_outputs_per_class))
    w2 = C.sequence.broadcast_as(w2, input_var)
    b2 = reshape(C.gather(b2s, target_class), (n_outputs_per_class))
    b2 = C.sequence.broadcast_as(b2, input_var)

    times_result = times(input_var, w2)
    probs_in_class = softmax(b2 + times_result)
    probs_in_class = C.sequence.broadcast_as(probs_in_class, target_output_in_class)
    target_output_in_class = C.one_hot(target_output_in_class, n_outputs_per_class, False)
    probs_in_class = C.sequence.broadcast_as(probs_in_class, target_output_in_class)
    prob_in_class = C.times_transpose(probs_in_class, target_output_in_class)

    target_class = C.one_hot(target_class, n_classes, False)
    class_probs = C.sequence.broadcast_as(class_probs, target_class)
    class_prob = C.times_transpose(class_probs, target_class)

    output_prob = C.element_times(class_prob, prob_in_class)

    # this is for calculating all the outputs' probabilities
    all_probs = []
    for i in range(n_classes):
        ci = C.constant(i)
        w2a = C.reshape(C.gather(w2s, ci), (input_dim, n_outputs_per_class))
        w2a = C.sequence.broadcast_as(w2a, input_var)
        b2a = C.reshape(C.gather(b2s, ci), (n_outputs_per_class))
        b2a = C.sequence.broadcast_as(b2a, input_var)

        probs_in_classa = C.softmax(b2a + times(input_var, w2a))
        cia = C.constant(i, shape=[1])
        cia = C.reconcile_dynamic_axes(cia, class_probs)
        cia = C.one_hot(cia, n_outputs_per_class, False)
        class_proba = C.times_transpose(class_probs, cia)
        class_proba = C.sequence.broadcast_as(class_proba, probs_in_classa)

        output_proba = C.element_times(class_proba, probs_in_classa)
        all_probs.append(output_proba)

    return output_prob, class_probs, all_probs
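# A small self-contained sketch (illustration only, not library code) of the
# two-level factorization the layer above implements: an output id is split
# into a class index and a within-class index, and its probability is the
# product of the class softmax and the within-class softmax. All probability
# values below are made up.
import math
import numpy as np

num_output_classes = 4
n_classes = int(math.ceil(math.sqrt(num_output_classes)))  # 2 classes ...
n_outputs_per_class = n_classes                            # ... of 2 outputs each

y = 3                                                      # output id to score
target_class, target_output_in_class = divmod(y, n_outputs_per_class)  # -> (1, 1)

class_probs = np.array([0.3, 0.7])       # stand-in for softmax(b1 + x @ w1)
probs_in_class = np.array([0.4, 0.6])    # stand-in for softmax(b2 + x @ w2) of class 1
output_prob = class_probs[target_class] * probs_in_class[target_output_in_class]
assert abs(output_prob - 0.42) < 1e-9    # P(y) = P(class) * P(y | class)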
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True,
               proposal_layer_param_string=None):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         (image_width, image_height, image_scale) as CNTK variable or constant
        add_loss_functions: If set to True, rpn_losses will be returned; otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    # RPN network
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    rpn_conv_3x3 = Convolution((3, 3), 256, activation=relu, pad=True, strides=1,
                               init=normal(scale=0.01), init_bias=0.1)(conv_out)
    rpn_cls_score = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                init=normal(scale=0.01), init_bias=0.1)(rpn_conv_3x3)  # 2(bg/fg) * 9(anchors)
    rpn_bbox_pred = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                init=normal(scale=0.01), init_bias=0.1)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    num_predictions = int(np.prod(rpn_cls_score.shape) / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score, (2, num_predictions))
    rpn_cls_prob = softmax(rpn_cls_score_rshp, axis=0, name="objness_softmax")
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, rpn_cls_score.shape)

    # proposal layer
    rpn_rois_raw = user_function(ProposalLayer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info,
                                               param_str=proposal_layer_param_string))
    rpn_rois = alias(rpn_rois_raw, name='rpn_rois')

    rpn_losses = None
    if add_loss_functions:
        # RPN targets
        # Note: rpn_cls_score is only passed to AnchorTargetLayer to get the width and height of the conv feature map
        atl = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info,
                                              param_str=proposal_layer_param_string))
        rpn_labels = atl.outputs[0]
        rpn_bbox_targets = atl.outputs[1]
        rpn_bbox_inside_weights = atl.outputs[2]

        # For the loss functions: ignore predictions for the 'ignore label',
        # i.e. set target and prediction to 0 --> needs to be softmaxed before
        rpn_labels_rshp = reshape(rpn_labels, (1, num_predictions))
        ignore = user_function(IgnoreLabel(rpn_cls_prob, rpn_labels_rshp, ignore_label=-1))
        rpn_cls_prob_ignore = ignore.outputs[0]
        fg_targets = ignore.outputs[1]
        bg_targets = 1 - fg_targets
        rpn_labels_ignore = splice(bg_targets, fg_targets, axis=0)

        # RPN losses
        rpn_loss_cls = cross_entropy_with_softmax(rpn_cls_prob_ignore, rpn_labels_ignore, axis=0)
        rpn_loss_bbox = user_function(SmoothL1Loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights))
        rpn_losses = plus(reduce_sum(rpn_loss_cls), reduce_sum(rpn_loss_bbox), name="rpn_losses")

    return rpn_rois, rpn_losses
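# A hedged usage sketch for this variant of create_rpn. The single conv layer
# below is a throwaway stand-in for the real pretrained backbone, and the
# shapes, the 50-box limit, and the im_info values are illustrative
# assumptions, not taken from the original training script.
import numpy as np
from cntk import input_variable, constant
from cntk.layers import Convolution
from cntk.ops import relu

image_input = input_variable((3, 600, 600))
roi_input = input_variable((50, 5))   # up to 50 gt boxes as (x1, y1, x2, y2, label)
im_info = constant(np.array([600.0, 600.0, 1.0], dtype=np.float32))  # (width, height, scale)

backbone = Convolution((3, 3), 64, activation=relu, pad=True)  # stand-in feature extractor
conv_out = backbone(image_input)
rpn_rois, rpn_losses = create_rpn(conv_out, roi_input, im_info)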