def multiFunc(self, arg1):
    """Build the multi-bit binarization graph for an input shaped like ``arg1``.

    A fresh CNTK input with the shape and dynamic axes of ``arg1`` is
    approximated level by level: at each level the still-unexplained part of
    the input (the carry over) is replaced by ``mean * sign`` for every
    position whose entry in ``self.bit_map`` exceeds the level index.

    Args:
        arg1: object exposing ``shape`` and ``dynamic_axes``; only these two
            attributes are read.

    Returns:
        tuple: ``(approx, multiIn)`` - the approximation graph and the input
        variable it was built on.
    """
    # load or create the inputs we need
    multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
    bit_map = C.constant(self.bit_map)
    # range() needs a plain Python int, whatever dtype the bit map uses.
    max_bits = int(self.bit_map.max())

    # carry over represents the remaining value that needs to be binarized.
    # For a single bit this is just the input. For more bits it is the
    # difference between the previous bits' approximation and the true value.
    carry_over = multiIn
    approx = C.element_times(multiIn, 0)

    # compute each level of binarization, up to the largest bit count requested
    for i in range(max_bits):
        # determine which values of the input should be binarized to i bits or more
        hot_vals = C.greater(bit_map, i)
        # select only the values which we need to binarize
        valid_vals = C.element_select(hot_vals, carry_over, 0)
        # compute the mean per slice along axis 0 (the kernels, per the
        # original author); flattening to 2-D lets the sum run over axis 1 only
        mean = C.element_divide(
            C.reduce_sum(C.reshape(C.abs(valid_vals), (valid_vals.shape[0], -1)), axis=1),
            C.reduce_sum(C.reshape(hot_vals, (hot_vals.shape[0], -1)), axis=1))
        # reshape the mean so it broadcasts against the input
        # NOTE(review): the trailing (1, 1) assumes a 4-D input - confirm with callers
        mean = C.reshape(mean, (mean.shape[0], mean.shape[1], 1, 1))
        # binarize the carry over: +1 where positive, -1 elsewhere, 0 where inactive
        bits = C.greater(carry_over, 0)
        bits = C.element_select(bits, bits, -1)
        bits = C.element_select(hot_vals, bits, 0)
        # add in the equivalent binary representation to the approximation
        approx = C.plus(approx, C.element_times(mean, bits))
        # compute the new carry over
        carry_over = C.plus(C.element_times(C.element_times(-1, bits), mean), carry_over)

    return approx, multiIn
def create_faster_rcnn_eval_model(model, image_input, dims_input, cfg, rpn_model=None):
    """Assemble a frozen evaluation graph from a trained Faster R-CNN model.

    The conv layers, the RPN and the ROI/FC head are cloned with
    ``CloneMethod.freeze`` and wired together around a newly created proposal
    layer.

    Args:
        model: trained model to clone the conv layers and ROI/FC head from.
        image_input: input fed to the cloned conv layers.
        dims_input: passed through to ``create_proposal_layer``.
        cfg: configuration object; ``cfg["MODEL"]`` and the
            ``BBOX_NORMALIZE_*`` attributes are read.
        rpn_model: optional model to clone the RPN from; ``model`` is used
            when this is ``None``.

    Returns:
        The combination of ``cls_pred``, the proposal ROIs and ``bbox_regr``.
    """
    print("creating eval model")
    model_cfg = cfg["MODEL"]
    last_conv_node_name = model_cfg.LAST_CONV_NODE_NAME

    # shared convolutional feature extractor
    conv_layers = clone_model(
        model, [model_cfg.FEATURE_NODE_NAME], [last_conv_node_name], CloneMethod.freeze)
    conv_out = conv_layers(image_input)

    # region proposal network, taken from rpn_model when one is supplied
    rpn_source = rpn_model if rpn_model is not None else model
    rpn = clone_model(
        rpn_source, [last_conv_node_name],
        ["rpn_cls_prob_reshape", "rpn_bbox_pred"], CloneMethod.freeze)
    rpn_out = rpn(conv_out)

    # we need to add the proposal layer anew to account for changing configs
    # when buffering proposals in 4-stage training
    rpn_rois = create_proposal_layer(rpn_out.outputs[0], rpn_out.outputs[1], dims_input, cfg)

    roi_fc_layers = clone_model(
        model, [last_conv_node_name, "rpn_target_rois"],
        ["cls_score", "bbox_regr"], CloneMethod.freeze)
    head = roi_fc_layers(conv_out, rpn_rois)
    cls_score = head.outputs[0]
    bbox_regr = head.outputs[1]

    if cfg.BBOX_NORMALIZE_TARGETS:
        # undo target normalization: the per-box means/stds are tiled num_boxes times
        num_boxes = int(bbox_regr.shape[1] / 4)
        means = np.array(cfg.BBOX_NORMALIZE_MEANS * num_boxes)
        stds = np.array(cfg.BBOX_NORMALIZE_STDS * num_boxes)
        bbox_regr = plus(element_times(bbox_regr, stds), means, name='bbox_regr')

    cls_pred = softmax(cls_score, axis=1, name='cls_pred')
    return combine([cls_pred, rpn_rois, bbox_regr])
def element_times(left, right, name=''):
    '''
    Element-wise product of two input tensors, with broadcasting.

    For scalars, the backward pass to ``left`` propagates ``right`` times the
    received gradient, and vice versa. The ``*`` operator is overloaded and
    can be used in place of ``element_times()``.

    Example:
        >>> C.eval(C.element_times([1., 1., 1., 1.], [0.5, 0.25, 0.125, 0.]))
        [array([[ 0.5  ,  0.25 ,  0.125,  0.   ]])]

        >>> C.eval(C.element_times([5., 10., 15., 30.], [2.]))
        [array([[ 10.,  20.,  30.,  60.]])]

    Args:
        left: left side tensor
        right: right side tensor
        name (str): the name of the node in the network

    Returns:
        :class:`cntk.Function`
    '''
    # Imported locally under an alias so the call below cannot be mistaken for
    # a recursive call to this wrapper.
    from cntk import element_times as _element_times

    # Each operand is sanitized against the other's data type; ``right`` sees
    # the already-sanitized ``left``.
    left = sanitize_input(left, get_data_type(right))
    right = sanitize_input(right, get_data_type(left))
    node = _element_times(left, right, name)
    return node.output()
def test_stop_gradient():
    """Check values and gradients of ``stop_gradient``, alone and inside a sum."""
    input_kwargs = dict(shape=(2,), sequence_axis=C.Axis("B"), needs_gradient=True)
    x = C.sequence.input_variable(**input_kwargs)
    y = C.sequence.input_variable(**input_kwargs)
    z = C.element_times(x, y)
    w = z + C.stop_gradient(z)

    a = np.reshape(np.float32([0.25, 0.5, 0.1, 1]), (1, 2, 2))
    b = np.reshape(np.float32([-1.25, 1.5, 0.1, -1]), (1, 2, 2))
    feed = {x: a, y: b}

    # z + stop_gradient(z): the value doubles, but only the plain z branch
    # contributes to the gradient.
    state, outputs = w.forward(feed, [w.output], {w.output})
    value = list(outputs.values())[0]
    assert np.allclose(value, np.multiply(a, b) * 2)

    grad = w.backward(state, {w.output: np.ones_like(value)}, {x, y})
    assert np.allclose(grad[x], b)
    assert np.allclose(grad[y], a)

    # test stop_gradient with function as input whose arguments should have
    # no gradients (zeros reading)
    w = C.stop_gradient(z)
    state, outputs = w.forward(feed, [w.output], {w.output})
    value = list(outputs.values())[0]
    assert np.allclose(value, np.multiply(a, b))

    grad = w.backward(state, {w.output: np.ones_like(value)}, {x, y})
    # there should be no gradients backward to x and y
    assert np.allclose(grad[x], np.zeros_like(b))
    assert np.allclose(grad[y], np.zeros_like(a))
def create_binary_convolution_model():
    """Build a CNN whose later convolutions are binary, plus its loss and metric.

    Returns:
        ``C.combine([z, ce, pe])``: the network output, the cross-entropy loss
        with the binary regularization term added, and the classification
        error.
    """
    def pool_and_normalize(node):
        # 3x3 max pooling with stride 2, followed by batch normalization
        pooled = C.layers.MaxPooling((3, 3), strides=(2, 2))(node)
        return C.layers.BatchNormalization(map_rank=1)(pooled)

    # Input variables denoting the features and label data
    feature_var = C.input((num_channels, image_height, image_width))
    label_var = C.input((num_classes))

    # scale the input by 1/256
    scaled_input = C.element_times(C.constant(0.00390625), feature_var)

    # first layer is ok to be full precision
    z = C.layers.Convolution((3, 3), 32, pad=True, activation=C.relu)(scaled_input)
    z = pool_and_normalize(z)

    z = BinaryConvolution(z, (3, 3), 128, channels=32, pad=True)
    z = pool_and_normalize(z)

    z = BinaryConvolution(z, (3, 3), 128, channels=128, pad=True)
    z = pool_and_normalize(z)

    z = BinaryConvolution(z, (1, 1), num_classes, channels=128, pad=True)
    z = C.layers.AveragePooling((z.shape[1], z.shape[2]))(z)
    z = C.reshape(z, (num_classes,))

    # Binary regularization (a la Gang Hua): sum of (1 - w^2) over every
    # parameter named "filter"
    weight_sum = C.constant(0)
    for param in z.parameters:
        if param.name == "filter":
            penalty = C.reduce_sum(C.minus(1, C.square(param)))
            weight_sum = C.plus(weight_sum, penalty)
    bin_reg = C.element_times(.000005, weight_sum)

    # After the last layer, we need to apply a learnable scale
    SP = C.parameter(shape=z.shape, init=0.001)
    z = C.element_times(z, SP)

    # loss and metric
    ce = C.plus(C.cross_entropy_with_softmax(z, label_var), bin_reg)
    pe = C.classification_error(z, label_var)

    return C.combine([z, ce, pe])
def multiFunc(self, arg1):
    """Build a multi-bit binarization graph using one global mean per level.

    Args:
        arg1: object exposing ``shape`` and ``dynamic_axes``; used only to
            declare a matching CNTK input.

    Returns:
        tuple: ``(approx, multiIn)`` - the approximation graph and the input
        variable it was built on.
    """
    multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
    bit_map = C.constant(self.bit_map)
    max_bits = self.bit_map.max()

    # residual: what the approximation built so far still fails to explain
    residual = multiIn
    approx = C.element_times(multiIn, 0)

    for level in range(max_bits):
        # positions whose bit budget exceeds this level
        active = C.greater(bit_map, level)
        selected = C.element_select(active, residual, 0)
        # mean magnitude of the active residuals, over the whole tensor
        scale = C.element_divide(C.reduce_sum(C.abs(selected)),
                                 C.reduce_sum(active))

        # +1 where the residual is positive, -1 elsewhere, 0 where inactive
        signs = C.greater(residual, 0)
        signs = C.element_select(signs, signs, -1)
        signs = C.element_select(active, signs, 0)

        approx = C.plus(approx, C.element_times(scale, signs))
        residual = C.plus(
            C.element_times(C.element_times(-1, signs), scale), residual)

    return approx, multiIn
def multiFunc(self, arg1):
    """Build a multi-bit binarization graph using one global mean per level.

    At each level, every position whose entry in ``self.bit_map`` exceeds the
    level index contributes ``mean * sign`` of the current carry over to the
    approximation, and that contribution is subtracted from the carry over.

    Args:
        arg1: object exposing ``shape`` and ``dynamic_axes``; only these two
            attributes are read.

    Returns:
        tuple: ``(approx, multiIn)`` - the approximation graph and the input
        variable it was built on.
    """
    multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
    bit_map = C.constant(self.bit_map)
    # range() needs a plain Python int, whatever dtype the bit map uses.
    max_bits = int(self.bit_map.max())

    carry_over = multiIn
    approx = C.element_times(multiIn, 0)

    for i in range(max_bits):
        # positions that still receive a bit at this level
        hot_vals = C.greater(bit_map, i)
        valid_vals = C.element_select(hot_vals, carry_over, 0)
        # mean magnitude of the active carry-over values, over the whole tensor
        mean = C.element_divide(C.reduce_sum(C.abs(valid_vals)), C.reduce_sum(hot_vals))
        # +1 where the carry over is positive, -1 elsewhere, 0 where inactive
        bits = C.greater(carry_over, 0)
        bits = C.element_select(bits, bits, -1)
        bits = C.element_select(hot_vals, bits, 0)
        approx = C.plus(approx, C.element_times(mean, bits))
        # subtract this level's contribution from what is left to explain
        carry_over = C.plus(C.element_times(C.element_times(-1, bits), mean), carry_over)

    return approx, multiIn
def train_eval_mnist_onelayer_from_file(criterion_name=None, eval_name=None):
    """Train and test a one-hidden-layer MNIST network read from text files.

    Args:
        criterion_name: name given to the cross-entropy criterion node.
        eval_name: name given to the square-error evaluation node.

    Returns:
        Whatever ``ctx.test`` returns for the criterion and evaluation nodes.
    """
    # Network definition
    feat_dim, label_dim, hidden_dim = 784, 10, 200

    data_dir = os.path.join(os.path.dirname(__file__), "Data")
    training_reader = C.CNTKTextFormatReader(os.path.join(data_dir, "Train-28x28_text.txt"))
    test_reader = C.CNTKTextFormatReader(os.path.join(data_dir, "Test-28x28_text.txt"))

    features = C.input(feat_dim)
    features.name = 'features'
    # scale the features by 1/256
    feats_scaled = C.element_times(features, C.constant(0.00390625))

    labels = C.input(label_dim)
    labels.tag = 'label'
    labels.name = 'labels'

    hidden = add_dnn_sigmoid_layer(feat_dim, hidden_dim, feats_scaled, 1)
    out = add_dnn_layer(hidden_dim, label_dim, hidden, 1)
    out.tag = 'output'

    criterion = C.cross_entropy_with_softmax(labels, out)
    criterion.name = criterion_name
    criterion.tag = 'criterion'

    eval_node = C.ops.square_error(labels, out)
    eval_node.name = eval_name
    eval_node.tag = 'eval'

    # Specify the training parameters (settings are scaled down)
    my_sgd = C.SGDParams(epoch_size=600, minibatch_size=32,
                         learning_rates_per_mb=0.1, max_epochs=5, momentum_per_mb=0)

    def stream_map(reader):
        # bind the label and feature inputs to the reader's streams
        with_labels = reader.map(labels, alias='labels', dim=label_dim)
        return with_labels.map(features, alias='features', dim=feat_dim)

    root_nodes = [criterion, eval_node]

    # Create a context or re-use if already there
    with C.LocalExecutionContext('mnist_one_layer', clean_up=True) as ctx:
        # CNTK actions
        ctx.train(root_nodes=root_nodes,
                  training_params=my_sgd,
                  input_map=stream_map(training_reader))

        result = ctx.test(root_nodes=[criterion, eval_node],
                          input_map=stream_map(test_reader))

    return result
def finalize_network(reader, model_details, max_amount_of_epochs, samples_per_epoch,
                     samples_per_minibatch, pixel_dimensions, classes, learning_rate):
    """Build the network, run a checkpointed training session and save it.

    Args:
        reader: minibatch source exposing ``streams.features`` and
            ``streams.labels``.
        model_details: forwarded to ``create_tf_model``.
        max_amount_of_epochs: epoch count given to the progress printer.
        samples_per_epoch: used as both progress and checkpoint frequency.
        samples_per_minibatch: minibatch size of the training session.
        pixel_dimensions: mapping with ``'depth'``, ``'height'`` and
            ``'width'`` entries.
        classes: sized collection; its length is the number of classes.
        learning_rate: per-minibatch learning rate (schedule).

    Returns:
        dict: the ``'features'`` and ``'label'`` inputs and, under
        ``'model'``, the softmax of the trained model.
    """
    num_classes = len(classes)
    input_shape = (pixel_dimensions['depth'],
                   pixel_dimensions['height'],
                   pixel_dimensions['width'])
    features = input_variable(shape=input_shape)
    label = input_variable(shape=num_classes)

    # speeds up training
    normalized_features = element_times(1.0 / 256.0, features)

    model = create_tf_model(model_details,
                            num_classes=len(classes),
                            input_features=normalized_features,
                            freeze=True)

    loss = cross_entropy_with_softmax(model, label)
    metric = classification_error(model, label)

    lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
    learner = momentum_sgd(parameters=model.parameters,
                           lr=lr_schedule,
                           momentum=0.9,
                           l2_regularization_weight=0.0005)

    reporter = ProgressPrinter(tag='training', num_epochs=max_amount_of_epochs)
    trainer = Trainer(model=model,
                      criterion=(loss, metric),
                      parameter_learners=[learner],
                      progress_writers=[reporter])

    log_number_of_parameters(model)

    map_input_to_streams_train = {
        features: reader.streams.features,
        label: reader.streams.labels,
    }

    # checkpoint once per epoch and resume from an existing checkpoint
    checkpoint_config = CheckpointConfig(
        frequency=samples_per_epoch,
        filename=os.path.join("./checkpoints", "ConvNet_Lego_VisiOn"),
        restore=True)

    session = training_session(trainer=trainer,
                               mb_source=reader,
                               model_inputs_to_streams=map_input_to_streams_train,
                               mb_size=samples_per_minibatch,
                               progress_frequency=samples_per_epoch,
                               checkpoint_config=checkpoint_config)
    session.train()

    network = {'features': features, 'label': label, 'model': softmax(model)}

    # the raw model (without the softmax) is what gets saved to disk
    model_name = f"CNN-3200-224-resnet-18.model"
    export_path = os.path.abspath(
        os.path.join("..", "..", "Final models", "CNN", model_name))
    model.save(export_path)

    return network
def gradFunc(self, arg):
    """Build the backward graph: pass the gradient only where ``|input| <= 1``.

    Args:
        arg: object exposing ``shape`` and ``dynamic_axes``; both declared
            inputs copy them.

    Returns:
        tuple: ``(masked_gradient, gradIn, gradRoot)`` - the output graph, the
        input standing for the forward-pass input, and the input standing for
        the gradient coming from the next stage.
    """
    placeholder = dict(shape=arg.shape, dynamic_axes=arg.dynamic_axes)

    # stands for the input of the forward prop function
    gradIn = C.input(**placeholder)
    # stands for the gradient passed from the next stage
    gradRoot = C.input(**placeholder)

    # 1 where the forward input has magnitude at most 1, else 0
    within_unit = C.less_equal(C.abs(gradIn), 1)

    # gate the incoming gradient with that mask
    return C.element_times(gradRoot, within_unit), gradIn, gradRoot
def inner(a):
    """Approximate ``erf(a)`` element-wise with Abramowitz & Stegun 7.1.26.

    The polynomial is only valid for non-negative arguments, so it is
    evaluated on ``|a|`` and the sign is restored afterwards using
    ``erf(-x) = -erf(x)``.

    The coefficients ``p`` and ``a1`` .. ``a5`` come from the enclosing scope
    (not visible in this block).

    Args:
        a: CNTK tensor to evaluate the approximation on.

    Returns:
        CNTK node holding the signed approximation.
    """
    # +1 for a >= 0, -1 otherwise
    not_negative = C.greater_equal(a, 0)
    sign = C.element_select(not_negative, not_negative, -1)
    abs_x = C.abs(a)

    # A&S formula 7.1.26. t must be built from |x|: with the signed input,
    # negative arguments would give a wrong t (and a pole at x = -1/p).
    t = 1.0 / (1.0 + p * abs_x)
    y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * C.exp(
        -abs_x * abs_x)
    return C.element_times(sign, y)
def masking(input, labels):
    """Multiply ``input`` by a label-derived mask and flatten the result.

    When ``is_onehot_encoded`` (from the enclosing scope) is true, the labels
    are reshaped directly into the mask; otherwise the mask is a one-hot
    encoding of the labels' argmax, with gradients stopped.

    Args:
        input: tensor to be masked.
        labels: label tensor the mask is derived from.

    Returns:
        The masked input reshaped to one dimension.
    """
    if is_onehot_encoded:
        mask = ct.reshape(labels, shape=(10, 1, 1))
    else:
        winner = ct.reshape(ct.argmax(labels, axis=0), shape=(-1, ))
        mask = ct.reshape(ct.one_hot(winner, 10), shape=(10, 1, 1))
        # the mask is a selector, not something to train through
        mask = ct.stop_gradient(mask)

    # repeat the mask 16 times along axis 1
    mask = ct.splice(*([mask] * 16), axis=1)
    masked = ct.element_times(input, mask)
    return ct.reshape(masked, shape=(-1, ))
def create_deep_model(features):
    """Build a dense autoencoder and publish its encoder half as a global.

    Side effect: the module-level ``encoded_model`` is set to the output of
    the last encoding layer.

    Args:
        features: input node; it is scaled by 1/255 before the first layer.

    Returns:
        The output of the final sigmoid layer of width ``input_dim``.
    """
    global encoded_model

    with C.layers.default_options(init=C.layers.glorot_uniform()):
        net = C.element_times(C.constant(1.0 / 255.0), features)

        # encoder stack
        for width in encoding_dims:
            net = C.layers.Dense(width, activation=C.relu)(net)
        encoded_model = net

        # decoder stack, ending in a sigmoid layer of the input width
        for width in decoding_dims:
            net = C.layers.Dense(width, activation=C.relu)(net)
        net = C.layers.Dense(input_dim, activation=C.sigmoid)(net)

        return net
def criteria(label, output, block_size, c_classes, weights):
    '''
    Build the loss and the error metric.

    The loss is the mean of the class-weighted cross entropy. The metric is
    the classification error along axis 0 minus the share of the label mass
    that sits in slice 0 of axis 0.

    ``block_size`` and ``c_classes`` are accepted but not used here.

    Returns:
        tuple: ``(mean_ce, pe)``
    '''
    probs = cntk.softmax(output, axis=0)
    neg_log_likelihood = -cntk.element_times(cntk.log(probs), label)
    ce = cntk.times(weights, neg_log_likelihood, output_rank=2)
    mean_ce = cntk.reduce_mean(ce)

    # Values are unused; the unpacking fails unless label.shape has exactly
    # three entries.
    _, _w, _h = label.shape

    error = cntk.classification_error(probs, label, axis=0)
    first_slice_share = (cntk.reduce_sum(cntk.slice(label, 0, 0, 1))
                         / cntk.reduce_sum(label))
    pe = error - first_slice_share

    return (mean_ce, pe)
def grid_lstm_func(m_t_1_k, m_tk_1, c_t_1_k, c_tk_1, x_tk):
    """Compute one grid-LSTM step with separate ``t`` and ``k`` gate sets.

    The weight matrices ``W_*`` and biases ``b_*`` come from the enclosing
    scope (not visible in this block).

    Args:
        m_t_1_k, m_tk_1: previous outputs paired with the ``W_t_*`` and
            ``W_k_*`` weights respectively.
        c_t_1_k, c_tk_1: previous cell states paired the same way.
        x_tk: current input.

    Returns:
        tuple: ``(m_t_tk, m_k_tk, c_t_tk, c_k_tk)``
    """
    def gate(w_x, shared, bias):
        # sigmoid gate: input projection + recurrent terms shared by both gates + bias
        return C.sigmoid(C.times(x_tk, w_x) + shared + bias)

    # input gates
    common_11 = (C.times(m_t_1_k, W_t_im) + C.times(m_tk_1, W_k_im)
                 + C.times(c_t_1_k, W_t_ic) + C.times(c_tk_1, W_k_ic))
    i_t_tk = gate(W_t_ix, common_11, b_t_i)
    i_k_tk = gate(W_k_ix, common_11, b_k_i)

    # forget gates
    common_12 = (C.times(m_t_1_k, W_t_fm) + C.times(m_tk_1, W_k_fm)
                 + C.times(c_t_1_k, W_t_fc) + C.times(c_tk_1, W_k_fc))
    f_t_tk = gate(W_t_fx, common_12, b_t_f)
    f_k_tk = gate(W_k_fx, common_12, b_k_f)

    # cell states; both candidates use W_t_cm and W_k_cm for the recurrent terms
    c_t_tk = C.element_times(f_t_tk, c_t_1_k) + C.element_times(
        i_t_tk,
        C.tanh(C.times(x_tk, W_t_cx) + C.times(m_t_1_k, W_t_cm)
               + C.times(m_tk_1, W_k_cm) + b_t_c))  # (13)
    c_k_tk = C.element_times(f_k_tk, c_tk_1) + C.element_times(
        i_k_tk,
        C.tanh(C.times(x_tk, W_k_cx) + C.times(m_t_1_k, W_t_cm)
               + C.times(m_tk_1, W_k_cm) + b_k_c))  # (14)

    # output gates look at the freshly computed cell states
    common_15 = (C.times(m_t_1_k, W_t_om) + C.times(m_tk_1, W_k_om)
                 + C.times(c_t_tk, W_t_oc) + C.times(c_k_tk, W_k_oc))
    o_t_tk = gate(W_t_ox, common_15, b_t_o)
    o_k_tk = gate(W_k_ox, common_15, b_k_o)

    m_t_tk = C.element_times(o_t_tk, C.tanh(c_t_tk))
    m_k_tk = C.element_times(o_k_tk, C.tanh(c_k_tk))

    return (m_t_tk, m_k_tk, c_t_tk, c_k_tk)
def create_binary_convolution_model():
    """Build a CNN whose later convolutions are binary, plus its loss and metric.

    Returns:
        ``C.combine([z, ce, pe])``: the network output, the cross-entropy loss
        with the binary regularization term added, and the classification
        error.
    """
    def downsample(node):
        # 3x3 max pooling with stride 2, then batch normalization
        node = C.layers.MaxPooling((3, 3), strides=(2, 2))(node)
        return C.layers.BatchNormalization(map_rank=1)(node)

    feature_var = C.input((num_channels, image_height, image_width))
    label_var = C.input((num_classes))

    # scale the input by 1/256
    scaled_input = C.element_times(C.constant(0.00390625), feature_var)

    # full-precision first layer
    z = C.layers.Convolution((3, 3), 32, pad=True,
                             activation=C.relu)(scaled_input)
    z = downsample(z)

    z = BinaryConvolution((3, 3), 128, channels=32, pad=True)(z)
    z = downsample(z)

    z = BinaryConvolution((3, 3), 128, channels=128, pad=True)(z)
    z = downsample(z)

    z = BinaryConvolution((1, 1), num_classes, channels=128, pad=True)(z)
    z = C.layers.AveragePooling((z.shape[1], z.shape[2]))(z)
    z = C.reshape(z, (num_classes, ))

    # regularizer: sum of (1 - w^2) over every parameter named "filter"
    weight_sum = C.constant(0)
    for param in z.parameters:
        if param.name == "filter":
            weight_sum = C.plus(
                weight_sum, C.reduce_sum(C.minus(1, C.square(param))))
    bin_reg = C.element_times(.000005, weight_sum)

    # learnable per-output scale after the last layer
    SP = C.parameter(shape=z.shape, init=0.001)
    z = C.element_times(z, SP)

    ce = C.plus(C.cross_entropy_with_softmax(z, label_var), bin_reg)
    pe = C.classification_error(z, label_var)

    return C.combine([z, ce, pe])
def createNetwork(self, inputEmb, preHidden, preMem):
    """Build one LSTM step from the input, previous hidden state and memory.

    The input and hidden projections each hold four consecutive blocks of
    width ``self.hiddenSize`` along the last axis, used in order for the
    input gate, output gate, forget gate and candidate.

    Returns:
        tuple: ``(CurH, CurMem)`` - the new hidden state and memory.
    """
    WX = C.times(inputEmb, self.W) + self.Wb
    UH = C.times(preHidden, self.U) + self.Ub

    def block(index):
        # sum of the index-th hiddenSize-wide slices of both projections
        return (C.slice(WX, -1, self.hiddenSize * index, self.hiddenSize * (index + 1))
                + C.slice(UH, -1, self.hiddenSize * index, self.hiddenSize * (index + 1)))

    in_gate = C.sigmoid(block(0))
    out_gate = C.sigmoid(block(1))
    forget_gate = C.sigmoid(block(2))
    candidate = C.tanh(block(3))

    # new memory: gated candidate plus gated previous memory
    CurMem = C.element_times(candidate, in_gate) + C.element_times(forget_gate, preMem)
    CurH = C.element_times(C.tanh(CurMem), out_gate)
    return (CurH, CurMem)
def test_batch_norm_model(tmpdir):
    """Round-trip a conv + batch-norm model through ONNX (currently skipped)."""
    pytest.skip('Needs to be fixed after removal of batch axis change.')

    image_height = 32
    image_width = 32
    num_channels = 3
    num_classes = 10

    input_var = C.input_variable((num_channels, image_height, image_width))
    label_var = C.input_variable((num_classes))

    def create_basic_model_with_batch_normalization(input, out_dims):
        # filter counts of the three convolution stages
        filter_counts = [image_width, image_height, 64]

        def conv_stage(i):
            return [
                C.layers.Convolution((5, 5), filter_counts[i], pad=True),
                C.layers.BatchNormalization(map_rank=1),
                C.layers.MaxPooling((3, 3), strides=(2, 2)),
            ]

        with C.layers.default_options(activation=C.relu,
                                      init=C.glorot_uniform()):
            model = C.layers.Sequential([
                C.layers.For(range(3), conv_stage),
                C.layers.Dense(64),
                C.layers.BatchNormalization(map_rank=1),
                C.layers.Dense(out_dims, activation=None),
            ])

        return model(input)

    feature_scale = 1.0 / 256.0
    # TODO: ONNX only do right hand-side broadcast. This test fails
    # if input_var, feature_scale is swapped.
    input_var_norm = C.element_times(input_var, feature_scale)

    # apply model to input
    z = create_basic_model_with_batch_normalization(input_var_norm, out_dims=10)

    # save as ONNX and load it back
    filename = os.path.join(str(tmpdir), R'bn_model.onnx')
    z.save(filename, format=C.ModelFormat.ONNX)
    loaded_node = C.Function.load(filename, format=C.ModelFormat.ONNX)
    assert z.shape == loaded_node.shape

    # both graphs must agree on a random image
    img_shape = (num_channels, image_width, image_height)
    img = np.asarray(np.random.uniform(-1, 1, img_shape), dtype=np.float32)

    original_arg = z.arguments[0]
    loaded_arg = loaded_node.arguments[0]
    assert np.allclose(loaded_node.eval({loaded_arg: img}),
                       z.eval({original_arg: img}))
def multiFunc(self, arg1):
    """Build the multi-bit binarization graph with one mean per axis-0 slice.

    A fresh CNTK input with the shape and dynamic axes of ``arg1`` is
    approximated level by level. At each level, every position whose entry in
    ``self.bit_map`` exceeds the level index contributes ``mean * sign`` of
    the current carry over, and that contribution is then subtracted from the
    carry over.

    Args:
        arg1: object exposing ``shape`` and ``dynamic_axes``; only these two
            attributes are read.

    Returns:
        tuple: ``(approx, multiIn)`` - the approximation graph and the input
        variable it was built on.
    """
    # load or create the inputs we need
    multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
    bit_map = C.constant(self.bit_map)
    # range() needs a plain Python int, whatever dtype the bit map uses.
    max_bits = int(self.bit_map.max())

    # carry over represents the remaining value that needs to be binarized.
    # For a single bit this is just the input. For more bits it is the
    # difference between the previous bits' approximation and the true value.
    carry_over = multiIn
    approx = C.element_times(multiIn, 0)

    # iterate through the maximum number of bits specified by the bit map,
    # i.e. compute each level of binarization
    for i in range(max_bits):
        # determine which values of the input should be binarized to i bits or more
        hot_vals = C.greater(bit_map, i)
        # select only the values which we need to binarize
        valid_vals = C.element_select(hot_vals, carry_over, 0)
        # mean per axis-0 slice (the kernels, per the original author): both
        # operands are flattened to 2-D so the sum runs over axis 1 only
        abs_sum = C.reduce_sum(
            C.reshape(C.abs(valid_vals), (valid_vals.shape[0], -1)), axis=1)
        hot_count = C.reduce_sum(
            C.reshape(hot_vals, (hot_vals.shape[0], -1)), axis=1)
        mean = C.element_divide(abs_sum, hot_count)
        # reshape the mean so it broadcasts against the input
        # NOTE(review): the trailing (1, 1) assumes a 4-D input - confirm with callers
        mean = C.reshape(mean, (mean.shape[0], mean.shape[1], 1, 1))
        # binarize the carry over: +1 where positive, -1 elsewhere, 0 where inactive
        bits = C.greater(carry_over, 0)
        bits = C.element_select(bits, bits, -1)
        bits = C.element_select(hot_vals, bits, 0)
        # add in the equivalent binary representation to the approximation
        approx = C.plus(approx, C.element_times(mean, bits))
        # compute the new carry over
        carry_over = C.plus(
            C.element_times(C.element_times(-1, bits), mean), carry_over)

    return approx, multiIn
def gradFunc(self, arg):
    """Build the backward graph: pass the gradient where ``|input| <= bit_map``.

    Args:
        arg: object exposing ``shape`` and ``dynamic_axes``; both declared
            inputs copy them.

    Returns:
        tuple: ``(outGrad, gradIn, gradRoot)`` - the masked gradient graph,
        the input standing for the forward-pass input, and the input standing
        for the gradient coming from the next stage.
    """
    # stands for the input of the forward prop function
    gradIn = C.input(shape=arg.shape, dynamic_axes=arg.dynamic_axes)
    # stands for the gradient passed from the next stage
    gradRoot = C.input(shape=arg.shape, dynamic_axes=arg.dynamic_axes)

    magnitude = C.abs(gradIn)
    # the clipping bound is a function of the bit map, since higher bit
    # counts can represent larger numbers
    bound = C.constant(self.bit_map)
    pass_mask = C.less_equal(magnitude, bound)

    return C.element_times(gradRoot, pass_mask), gradIn, gradRoot
def test_stop_gradient():
    """Check value and gradients of ``z + stop_gradient(z)`` for ``z = x * y``."""
    def sequence_input():
        return C.sequence.input_variable(shape=(2, ),
                                         sequence_axis=C.Axis("B"),
                                         needs_gradient=True)

    x = sequence_input()
    y = sequence_input()
    z = C.element_times(x, y)
    w = z + C.stop_gradient(z)

    a = np.float32([0.25, 0.5, 0.1, 1]).reshape((1, 2, 2))
    b = np.float32([-1.25, 1.5, 0.1, -1]).reshape((1, 2, 2))

    # forward: both branches contribute, so the product is doubled
    state, outputs = w.forward({x: a, y: b}, [w.output], {w.output})
    value = list(outputs.values())[0]
    expected = np.multiply(a, b) * 2
    assert np.allclose(value, expected)

    # backward: only the un-stopped branch carries gradient
    root_gradient = {w.output: np.ones_like(value)}
    grad = w.backward(state, root_gradient, {x, y})
    assert np.allclose(grad[x], b)
    assert np.allclose(grad[y], a)
def GenSimpleMNIST():
    """Build a small dense MNIST-sized model and save one random test case.

    The model is evaluated on one random feature vector, then the model, the
    input and the output are handed to ``Save`` under the name
    ``'test_simpleMNIST'``.
    """
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 1
    hidden_layers_dim = 200

    feature = C.input_variable(input_dim, np.float32)
    # per-element scale constant of 1/256
    scale = C.constant(0.00390625, shape=(input_dim,))
    scaled_input = C.element_times(scale, feature)

    def hidden_layer(i):
        return C.layers.Dense(hidden_layers_dim, activation=C.relu)

    network = C.layers.Sequential([
        C.layers.For(range(num_hidden_layers), hidden_layer),
        C.layers.Dense(num_output_classes),
    ])
    z = network(scaled_input)
    model = C.softmax(z)

    data_feature = np.random.rand(*feature.shape).astype(np.float32)
    data_output = model.eval(data_feature)
    Save('test_simpleMNIST', model, data_feature, data_output)
def std_normalized_l2_loss(output, target):
    """Mean squared error with each component scaled by a fixed weight.

    The 46 weights are hard-coded; the variable name ``std_inv`` suggests
    they are inverse standard deviations (not verifiable from this block).

    Args:
        output: predicted values.
        target: reference values.

    Returns:
        The mean over all elements of ``((output - target) * weights) ** 2``.
    """
    std_inv = np.array(
        [
            6.6864805402, 5.2904440280, 3.7165409939, 4.1421640454,
            8.1537399389, 7.0312877415, 2.6712380967, 2.6372177876,
            8.4253649884, 6.7482162880, 9.0849960354, 10.2624412692,
            3.1325531319, 3.1091179819, 2.7337937590, 2.7336441031,
            4.3542467871, 5.4896293687, 6.2003761588, 3.1290341469,
            5.7677042738, 11.5460919611, 9.9926451700, 5.4259818848,
            20.5060642486, 4.7692101480, 3.1681517575, 3.8582905289,
            3.4222250436, 4.6828286809, 3.0070785113, 2.8936539301,
            4.0649030157, 25.3068458731, 6.0030623160, 3.1151977458,
            7.7773542649, 6.2057372469, 9.9494258692, 4.6865422850,
            5.3300697628, 2.7722027974, 4.0658663003, 18.1101618617,
            3.5390113731, 2.7794520068,
        ],
        dtype=np.float32,
    )
    weights = C.constant(value=std_inv)

    residual = output - target
    weighted = C.element_times(residual, weights)
    return C.reduce_mean(C.square(weighted))
def test_batch_norm_model(tmpdir):
    """Round-trip a conv + batch-norm model through ONNX and compare outputs."""
    image_height = 32
    image_width = 32
    num_channels = 3
    num_classes = 10

    input_var = C.input_variable((num_channels, image_height, image_width))
    label_var = C.input_variable((num_classes))

    def create_basic_model_with_batch_normalization(input, out_dims):
        # filter counts of the three convolution stages
        filter_counts = [image_width, image_height, 64]

        def conv_stage(i):
            return [
                C.layers.Convolution((5, 5), filter_counts[i], pad=True),
                C.layers.BatchNormalization(map_rank=1),
                C.layers.MaxPooling((3, 3), strides=(2, 2)),
            ]

        with C.layers.default_options(activation=C.relu, init=C.glorot_uniform()):
            model = C.layers.Sequential([
                C.layers.For(range(3), conv_stage),
                C.layers.Dense(64),
                C.layers.BatchNormalization(map_rank=1),
                C.layers.Dense(out_dims, activation=None),
            ])

        return model(input)

    feature_scale = 1.0 / 256.0
    # TODO: ONNX only do right hand-side broadcast. This test fails
    # if input_var, feature_scale is swapped.
    input_var_norm = C.element_times(input_var, feature_scale)

    # apply model to input
    z = create_basic_model_with_batch_normalization(input_var_norm, out_dims=10)

    # save as ONNX and load it back
    filename = os.path.join(str(tmpdir), R'bn_model.onnx')
    z.save(filename, format=C.ModelFormat.ONNX)
    loaded_node = C.Function.load(filename, format=C.ModelFormat.ONNX)
    assert z.shape == loaded_node.shape

    # both graphs must agree on a random image
    img_shape = (num_channels, image_width, image_height)
    img = np.asarray(np.random.uniform(-1, 1, img_shape), dtype=np.float32)

    original_arg = z.arguments[0]
    loaded_arg = loaded_node.arguments[0]
    assert np.allclose(loaded_node.eval({loaded_arg: img}),
                       z.eval({original_arg: img}))
def create_fast_rcnn_eval_model(model, image_input, roi_proposals, cfg):
    """Assemble a frozen evaluation graph from a trained Fast R-CNN model.

    Args:
        model: trained model to clone the predictor from.
        image_input: image input fed to the cloned predictor.
        roi_proposals: ROI proposals input fed to the cloned predictor.
        cfg: configuration object; ``cfg["MODEL"]``, ``cfg["CNTK"]``,
            ``cfg.OUTPUT_PATH`` and the ``BBOX_NORMALIZE_*`` attributes are
            read.

    Returns:
        The combination of ``cls_pred`` and ``bbox_regr``.
    """
    print("creating eval model")

    input_nodes = [cfg["MODEL"].FEATURE_NODE_NAME, "roi_proposals"]
    output_nodes = ["cls_score", "bbox_regr"]
    predictor = clone_model(model, input_nodes, output_nodes, CloneMethod.freeze)

    head = predictor(image_input, roi_proposals)
    cls_score = head.outputs[0]
    bbox_regr = head.outputs[1]

    if cfg.BBOX_NORMALIZE_TARGETS:
        # undo target normalization: the per-box means/stds are tiled num_boxes times
        num_boxes = int(bbox_regr.shape[1] / 4)
        means = np.array(cfg.BBOX_NORMALIZE_MEANS * num_boxes)
        stds = np.array(cfg.BBOX_NORMALIZE_STDS * num_boxes)
        bbox_regr = plus(element_times(bbox_regr, stds), means, name='bbox_regr')

    cls_pred = softmax(cls_score, axis=1, name='cls_pred')
    eval_model = combine([cls_pred, bbox_regr])

    if cfg["CNTK"].DEBUG_OUTPUT:
        # write the evaluation graph to the output directory
        graph_file = os.path.join(cfg.OUTPUT_PATH,
                                  "graph_frcn_eval." + cfg["CNTK"].GRAPH_TYPE)
        plot(eval_model, graph_file)

    return eval_model
def cnn_network(queryfeatures, passagefeatures, num_classes):
    """Build a two-branch CNN that scores a query/passage pair.

    Each branch runs conv -> pool -> conv -> pool -> dense; the two dense
    outputs are multiplied element-wise and fed to a softmax layer. The shape
    notes below are the original author's, for their input sizes.

    Args:
        queryfeatures: input of the query branch.
        passagefeatures: input of the passage branch.
        num_classes: width of the final softmax layer; each branch's dense
            layer has ``num_classes * num_classes`` units.

    Returns:
        The output of the final ``"overall"`` dense layer.
    """
    branch_width = num_classes * num_classes

    with C.layers.default_options(activation=C.ops.relu, pad=False):
        # query branch (input 12*50)
        query = C.layers.Convolution2D(
            (3, 10), 4, pad=False, activation=C.tanh, name='convA1')(queryfeatures)  # 4*10*41
        query = C.layers.MaxPooling((2, 3), (2, 3), name='poolA1')(query)  # 4*5*13
        query = C.layers.Convolution2D(
            (2, 4), 2, pad=False, activation=C.tanh, name='convA2')(query)  # 2*4*10
        query = C.layers.MaxPooling((2, 2), (2, 2), name='poolA2')(query)  # 2*2*5
        denseA = C.layers.Dense(branch_width, activation=C.tanh, name='denseA')(query)  # 4

        # passage branch (input 50*50)
        passage = C.layers.Convolution2D(
            (5, 10), 4, pad=False, activation=C.tanh, name='convB1')(passagefeatures)  # 4*46*41
        passage = C.layers.MaxPooling((5, 5), (5, 5), name='poolB1')(passage)  # 4*9*8
        passage = C.layers.Convolution2D(
            (3, 3), 2, pad=False, activation=C.tanh, name='convB2')(passage)  # 2*7*6
        passage = C.layers.MaxPooling((2, 2), (2, 2), name='poolB2')(passage)  # 2*3*3
        denseB = C.layers.Dense(branch_width, activation=C.tanh, name='denseB')(passage)  # 4

        # merge the two branches element-wise and classify
        mergeQP = C.element_times(denseA, denseB)  # 4
        model = C.layers.Dense(num_classes, activation=C.softmax, name="overall")(mergeQP)  # 2

    return model
def old_attention(h_enc, h_dec):
    """Compute an attention-weighted sum of a window of encoder states.

    Projections, window parameters and the final stabilizer come from the
    enclosing scope (not visible in this block).

    Args:
        h_enc: encoder hidden-state sequence.
        h_dec: decoder hidden state; also supplies the axis everything is
            broadcast to.

    Returns:
        The stabilized attended encoder state.
    """
    # we use history_axis wherever we pass this only for the sake of passing its axis
    history_axis = h_dec
    # TODO: pull this apart so that we can compute the encoder window only
    # once and apply it to multiple decoders

    # --- encoder state window
    window = PastValueWindow(attention_span, axis=attention_axis,
                             go_backwards=go_backwards)
    (h_enc, h_enc_valid) = window(h_enc).outputs
    h_enc_proj = attn_proj_enc(h_enc)
    # window must be broadcast to every decoder time step
    h_enc_proj = C.sequence.broadcast_as(h_enc_proj, history_axis)
    h_enc_valid = C.sequence.broadcast_as(h_enc_valid, history_axis)

    # --- decoder state
    # project decoder hidden state
    h_dec_proj = attn_proj_dec(h_dec)
    tanh_out = C.tanh(h_dec_proj + h_enc_proj)  # (attention_span, attention_dim)
    u = attn_proj_tanh(tanh_out)                # (attention_span, 1)

    # logzero-out the unused elements for the softmax denominator
    # TODO: use a less arbitrary number than 50
    u_masked = u + (h_enc_valid - 1) * 50
    attention_weights = C.softmax(u_masked, axis=attention_axis)
    attention_weights = Label('attention_weights')(attention_weights)

    # now take weighted sum over the encoder state vectors
    weighted_states = C.element_times(
        C.sequence.broadcast_as(h_enc, history_axis), attention_weights)
    h_att = C.reduce_sum(weighted_states, axis=attention_axis)
    return attn_final_stab(h_att)
def create_conv_network():
    """Build a small conv net with its loss and error metric.

    Returns:
        dict: the ``'feature'`` and ``'label'`` inputs, the ``'ce'`` loss, the
        ``'pe'`` classification error and the network ``'output'``.
    """
    # Input variables denoting the features and label data
    feature_var = cntk.input_variable((num_channels, image_height, image_width))
    label_var = cntk.input_variable((num_classes))

    # scale the input by 1/256
    scaled_input = cntk.element_times(cntk.constant(0.00390625), feature_var)

    def conv_block():
        # two 3x3 convolutions followed by 3x3 max pooling with stride 2
        return [
            cntk.layers.Convolution2D((3, 3), 64),
            cntk.layers.Convolution2D((3, 3), 64),
            cntk.layers.MaxPooling((3, 3), (2, 2)),
        ]

    def dense_block(i):
        # dense layer of width 256, then 128, each followed by dropout
        return [cntk.layers.Dense([256, 128][i]), cntk.layers.Dropout(0.5)]

    with cntk.layers.default_options(activation=cntk.relu, pad=True):
        network = cntk.layers.Sequential([
            cntk.layers.For(range(2), conv_block),
            cntk.layers.For(range(2), dense_block),
            cntk.layers.Dense(num_classes, activation=None),
        ])
        z = network(scaled_input)

    # loss and metric
    ce = cntk.cross_entropy_with_softmax(z, label_var)
    pe = cntk.classification_error(z, label_var)

    cntk.logging.log_number_of_parameters(z)
    print()

    return {
        'feature': feature_var,
        'label': label_var,
        'ce': ce,
        'pe': pe,
        'output': z
    }
def create_conv_network():
    """Wrap the CIFAR-10 conv net with inputs, loss and error metric.

    Returns:
        dict: the ``'feature'`` and ``'label'`` inputs, the ``'ce'`` loss, the
        ``'pe'`` classification error and the network ``'output'``.
    """
    # Input variables denoting the features and label data
    feature_var = C.input_variable((num_channels, image_height, image_width))
    label_var = C.input_variable((num_classes))

    # scale the input by 1/256, then apply the model
    scaled_input = C.element_times(C.constant(0.00390625), feature_var)
    network = create_convnet_cifar10_model(num_classes)
    z = network(scaled_input)

    # loss and metric
    ce = C.cross_entropy_with_softmax(z, label_var)
    pe = C.classification_error(z, label_var)

    C.logging.log_number_of_parameters(z)
    print()

    return {
        'feature': feature_var,
        'label': label_var,
        'ce': ce,
        'pe': pe,
        'output': z,
    }
def create_eval_model(model, image_input, dims_input, rpn_model=None):
    """Assemble a frozen evaluation graph from a trained Faster R-CNN model.

    Node names and ``cfg`` come from module scope (not visible in this block).

    Args:
        model: trained model to clone the conv layers and ROI/FC head from.
        image_input: input fed to the cloned conv layers.
        dims_input: input fed to the cloned RPN alongside the conv output.
        rpn_model: optional model to clone the RPN from; ``model`` is used
            when this is ``None``.

    Returns:
        The combination of ``cls_pred``, ``rpn_rois`` and ``bbox_regr``.
    """
    print("creating eval model")

    # shared convolutional feature extractor
    conv_layers = clone_model(model, [feature_node_name], [last_conv_node_name],
                              CloneMethod.freeze)
    conv_out = conv_layers(image_input)

    # region proposal network, taken from rpn_model when one is supplied
    rpn_source = rpn_model if rpn_model is not None else model
    rpn = clone_model(rpn_source, [last_conv_node_name, "dims_input"], ["rpn_rois"],
                      CloneMethod.freeze)
    rpn_rois = rpn(conv_out, dims_input)

    roi_fc_layers = clone_model(model, [last_conv_node_name, "rpn_target_rois"],
                                ["cls_score", "bbox_regr"], CloneMethod.freeze)
    head = roi_fc_layers(conv_out, rpn_rois)
    cls_score = head.outputs[0]
    bbox_regr = head.outputs[1]

    train_cfg = cfg["TRAIN"]
    if train_cfg.BBOX_NORMALIZE_TARGETS and train_cfg.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # undo target normalization: the per-box means/stds are tiled num_boxes times
        num_boxes = int(bbox_regr.shape[1] / 4)
        means = np.array(train_cfg.BBOX_NORMALIZE_MEANS * num_boxes)
        stds = np.array(train_cfg.BBOX_NORMALIZE_STDS * num_boxes)
        bbox_regr = plus(element_times(bbox_regr, stds), means, name='bbox_regr')

    cls_pred = softmax(cls_score, axis=1, name='cls_pred')
    return combine([cls_pred, rpn_rois, bbox_regr])
def SmoothL1Loss(sigma, bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights):
    """
    Weighted smooth-L1 loss for bounding-box regression.

    From https://github.com/smallcorgi/Faster-RCNN_TF/blob/master/lib/fast_rcnn/train.py

    ResultLoss = outside_weights * SmoothL1(inside_weights * (bbox_pred - bbox_targets))
    SmoothL1(x) = 0.5 * (sigma * x)^2,    if |x| < 1 / sigma^2
                  |x| - 0.5 / sigma^2,    otherwise
    """
    sigma2 = sigma * sigma

    # |inside_weights * (pred - target)|
    weighted_diff = C.element_times(bbox_inside_weights, C.minus(bbox_pred, bbox_targets))
    abs_diff = C.abs(weighted_diff)

    # 1 where the quadratic branch applies, 0 where the linear branch applies
    is_quadratic = C.less(abs_diff, 1.0 / sigma2)

    quadratic = C.element_times(C.element_times(abs_diff, abs_diff), 0.5 * sigma2)
    linear = C.minus(abs_diff, 0.5 / sigma2)

    # blend both branches with the 0/1 selector instead of branching
    blended = C.plus(C.element_times(quadratic, is_quadratic),
                     C.element_times(linear, C.minus(1.0, is_quadratic)))

    return C.element_times(bbox_outside_weights, blended)
def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_epochs=80):
    """Train a stack of 3x3 / 1x1 convolutions and report its test error.

    The network input shape and label size come from the outer names
    ``num_channels``, ``image_height``, ``image_width`` and ``num_classes``;
    the model is written below ``model_path``.

    Args:
        reader_train: Minibatch source for training; its ``streams.features``
            and ``streams.labels`` are bound to the network inputs.
        reader_test: Minibatch source read during the evaluation pass.
        epoch_size: Number of samples consumed per training epoch.
        max_epochs: Number of training epochs.

    Returns:
        The average of ``trainer.test_minibatch`` results weighted by the
        requested minibatch sizes.
    """
    _cntk_py.set_computation_network_trace_level(0)

    # Network inputs: image features and labels.
    input_var = C.input_variable((num_channels, image_height, image_width))
    label_var = C.input_variable((num_classes))

    # Scale the input by 1/256 = 0.00390625.
    scaled_input = C.element_times(C.constant(0.00390625), input_var)

    # NOTE(review): this pooling function is created and given a signature
    # but is not referenced again below -- confirm whether it is still needed.
    unused_pooling = GlobalAveragePooling()
    unused_pooling.update_signature((1, 8, 8))

    def first_stage():
        # NOTE(review): unlike the later stages there is no activation after
        # the 1x1 convolution here -- confirm this is intended.
        return [
            C.layers.Convolution2D((3, 3), 32, strides=(1, 1), pad=True),
            C.layers.Activation(activation=C.relu),
            C.layers.Convolution2D((1, 1), 64, strides=(1, 1), pad=False),
            C.layers.MaxPooling((3, 3), strides=(2, 2), pad=True),
            C.layers.Dropout(0.5),
        ]

    def second_stage():
        return [
            C.layers.Convolution2D((3, 3), 128, strides=(1, 1), pad=True),
            C.layers.Activation(activation=C.relu),
            C.layers.Convolution2D((1, 1), 160, strides=(1, 1), pad=False),
            C.layers.Activation(activation=C.relu),
            C.layers.MaxPooling((3, 3), strides=(2, 2), pad=True),
            C.layers.Dropout(0.5),
        ]

    def third_stage():
        return [
            C.layers.Convolution2D((3, 3), 192, strides=(1, 1), pad=True),
            C.layers.Activation(activation=C.relu),
            C.layers.Convolution2D((1, 1), 256, strides=(1, 1), pad=False),
            C.layers.Activation(activation=C.relu),
            C.layers.Convolution2D((1, 1), 10, strides=(1, 1), pad=False),
            C.layers.Activation(activation=C.relu),
            C.layers.AveragePooling((8, 8), strides=(1, 1), pad=False),
        ]

    with C.layers.default_options():
        network = C.layers.Sequential([
            C.layers.For(range(1), first_stage),
            C.layers.For(range(1), second_stage),
            C.layers.For(range(1), third_stage),
        ])
        z = network(scaled_input)

    print('z.shape', z.shape)
    z = C.flatten(z)
    print('z.shape now', z.shape)

    # Loss and evaluation metric.
    ce = C.cross_entropy_with_softmax(z, label_var)
    pe = C.classification_error(z, label_var)

    train_mb_size = 64

    # Per-sample learning rates as (value, number of epochs) stages.
    lr_stages = [(0.0015625, 20), (0.00046875, 20), (0.00015625, 20),
                 (0.000046875, 10), (0.000015625, 1)]
    lr_values = [rate for rate, repeats in lr_stages for _ in range(repeats)]
    lr_schedule = C.learning_parameter_schedule_per_sample(
        lr_values, epoch_size=epoch_size)

    # Per-sample momentum values, staged the same way.
    momentum_stages = [(0, 20), (0.9983347214509387, 20), (0.9991670137924583, 1)]
    momentum_values = [m for m, repeats in momentum_stages for _ in range(repeats)]
    mm_schedule = C.learners.momentum_schedule_per_sample(
        momentum_values, epoch_size=epoch_size)

    learner = C.learners.momentum_sgd(z.parameters,
                                      lr_schedule,
                                      mm_schedule,
                                      unit_gain=True,
                                      l2_regularization_weight=0.002)
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)
    trainer = C.Trainer(z, (ce, pe), learner, progress_printer)

    # Bind reader streams to the network inputs.
    input_map = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.labels
    }

    C.logging.log_number_of_parameters(z)
    print()

    # Training.
    model_file = os.path.join(model_path, "NIN_test1.dnn")
    for _ in range(max_epochs):
        seen = 0
        while seen < epoch_size:
            request = min(train_mb_size, epoch_size - seen)
            batch = reader_train.next_minibatch(request, input_map=input_map)
            trainer.train_minibatch(batch)
            seen += trainer.previous_minibatch_sample_count
        trainer.summarize_training_progress()

        # NOTE(review): the file name is constant, so every save replaces the
        # previous one -- confirm a per-epoch overwrite (rather than a single
        # save after training) is what is wanted.
        z.save(model_file)

    # Evaluation: walk a fixed number of test samples in fixed-size requests.
    test_size = 10000
    test_mb_size = 16

    weighted_errors = 0
    total_weight = 0
    batches_done = 0
    for offset in range(0, test_size, test_mb_size):
        request = min(test_mb_size, test_size - offset)
        batch = reader_test.next_minibatch(request, input_map=input_map)
        weighted_errors += trainer.test_minibatch(batch) * request
        total_weight += request
        batches_done += 1

    # NOTE(review): batches_done already equals the number of evaluated
    # minibatches; the "+ 1" in the message looks like an off-by-one --
    # confirm before changing since log consumers may rely on it.
    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        batches_done + 1, (weighted_errors * 100.0) / total_weight, total_weight))
    print("")

    return weighted_errors / total_weight
def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_epochs=80):
    """Train a conv net with local response normalization and test it.

    The network input shape and label size come from the outer names
    ``num_channels``, ``image_height``, ``image_width`` and ``num_classes``;
    one model file per epoch is written below ``model_path``.

    Args:
        reader_train: Minibatch source for training; its ``streams.features``
            and ``streams.labels`` are bound to the network inputs.
        reader_test: Minibatch source read during the evaluation pass.
        epoch_size: Number of samples consumed per training epoch.
        max_epochs: Number of training epochs.

    Returns:
        The average of ``trainer.test_minibatch`` results weighted by the
        requested minibatch sizes.
    """
    _cntk_py.set_computation_network_trace_level(0)

    # Network inputs: image features and labels.
    input_var = C.input_variable((num_channels, image_height, image_width))
    label_var = C.input_variable((num_classes))

    # Scale the input by 0.00390625 (= 1/256) before it enters the model.
    scaled_input = C.element_times(C.constant(0.00390625), input_var)

    def conv_lrn_pool_block():
        # Two 3x3 convolutions, LRN, then 3x3 max pooling with stride 2.
        return [
            C.layers.Convolution2D((3, 3), 64),
            C.layers.Convolution2D((3, 3), 64),
            LocalResponseNormalization(1.0, 4, 0.001, 0.75),
            C.layers.MaxPooling((3, 3), (2, 2)),
        ]

    def dense_dropout_block(i):
        # Dense width depends on the repetition index: 256, then 128.
        return [
            C.layers.Dense([256, 128][i]),
            C.layers.Dropout(0.5),
        ]

    with C.layers.default_options(activation=C.relu, pad=True):
        network = C.layers.Sequential([
            C.layers.For(range(2), conv_lrn_pool_block),
            C.layers.For(range(2), dense_dropout_block),
            C.layers.Dense(num_classes, activation=None),
        ])
        z = network(scaled_input)

    # Loss and evaluation metric.
    ce = C.cross_entropy_with_softmax(z, label_var)
    pe = C.classification_error(z, label_var)

    train_mb_size = 64

    # Per-sample learning rates, one entry per epoch.
    lr_values = []
    for rate, repeats in ((0.0015625, 20), (0.00046875, 20), (0.00015625, 20),
                          (0.000046875, 10), (0.000015625, 1)):
        lr_values.extend([rate] * repeats)
    lr_schedule = C.learning_parameter_schedule_per_sample(lr_values, epoch_size=epoch_size)

    # Per-sample momentum values, one entry per epoch.
    momentum_values = []
    for momentum, repeats in ((0, 20), (0.9983347214509387, 20), (0.9991670137924583, 1)):
        momentum_values.extend([momentum] * repeats)
    mm_schedule = C.learners.momentum_schedule_per_sample(momentum_values, epoch_size=epoch_size)

    learner = C.learners.momentum_sgd(z.parameters, lr_schedule, mm_schedule,
                                      unit_gain=True,
                                      l2_regularization_weight=0.002)
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)
    trainer = C.Trainer(z, (ce, pe), learner, progress_printer)

    # Bind reader streams to the network inputs.
    input_map = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.labels
    }

    C.logging.log_number_of_parameters(z)
    print()

    # Training: one snapshot is saved after every epoch.
    for epoch in range(max_epochs):
        seen = 0
        while seen < epoch_size:
            request = min(train_mb_size, epoch_size - seen)
            batch = reader_train.next_minibatch(request, input_map=input_map)
            trainer.train_minibatch(batch)
            seen += trainer.previous_minibatch_sample_count

        trainer.summarize_training_progress()
        snapshot_name = "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch)
        z.save(os.path.join(model_path, snapshot_name))

    # Evaluation.
    test_size = 10000
    test_mb_size = 16

    weighted_errors = 0
    total_weight = 0
    seen = 0
    batches_done = 0
    while seen < test_size:
        request = min(test_mb_size, test_size - seen)
        batch = reader_test.next_minibatch(request, input_map=input_map)

        # Errors are weighted by the requested size, while progress is
        # tracked by the number of samples the reader actually delivered.
        weighted_errors += trainer.test_minibatch(batch) * request
        total_weight += request
        seen += batch[label_var].num_samples
        batches_done += 1

    # NOTE(review): batches_done already equals the number of evaluated
    # minibatches; the "+1" in the message looks like an off-by-one --
    # confirm before changing since log consumers may rely on it.
    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(batches_done+1, (weighted_errors*100.0)/total_weight, total_weight))
    print("")

    return weighted_errors/total_weight
def test_Mul(tmpdir):
    """Build an ElementTimes graph from two float32 constants and verify it.

    The resulting model has no free inputs; it is handed to
    ``verify_no_input`` together with ``tmpdir`` and the case name.
    """
    dtype = np.float32
    ones = np.asarray([1.0] * 4, dtype=dtype)
    fractions = np.asarray([0.5, 0.25, 0.125, 0.0], dtype=dtype)

    product = C.element_times(ones, fractions)
    verify_no_input(product, tmpdir, 'ElementTimes_0')
def hierarchical_softmax_layer_for_sequence(input_var, num_output_classes, target_class, target_output_in_class, batch_size, w1, b1, w2s, b2s):
    '''
    Two-level hierarchical softmax for inputs that carry a sequence axis.

    The outputs are split into ``ceil(sqrt(num_output_classes))`` classes with
    the same number of outputs per class. The probability of the target is
    the product of the probability of its class (first level, ``w1``/``b1``)
    and the probability of the output inside that class (second level,
    ``w2s``/``b2s`` selected by ``target_class``).

    Example:
        >>> input_dim = 2
        >>> num_output_classes = 4
        >>> minibatch_size = 3
        >>> seq_size = 5
        >>> n_classes = int(math.ceil(math.sqrt(num_output_classes)))
        >>> n_outputs_per_class = n_classes

        >>> w1 = C.parameter(shape=(input_dim, n_classes), init=C.glorot_normal(seed=2), name='w1')
        >>> b1 = C.parameter(shape=(n_classes), init=C.glorot_normal(seed=3), name='b1')

        >>> w2s = C.parameter(shape=(n_classes, input_dim, n_outputs_per_class), init=C.glorot_normal(seed=4), name='w2s')
        >>> b2s = C.parameter(shape=(n_classes, n_outputs_per_class), init=C.glorot_normal(seed=5), name='b2s')

        >>> # neural network structure for hierarchical softmax
        >>> h_input = C.sequence.input_variable(input_dim)
        >>> h_target_class = C.sequence.input_variable([1])
        >>> h_target_output_in_class = C.sequence.input_variable([1])
        >>> h_z, class_probs, all_probs = hierarchical_softmax_layer_for_sequence(h_input, num_output_classes, h_target_class, h_target_output_in_class, minibatch_size, w1, b1, w2s, b2s)

        >>> a = np.reshape(np.arange(seq_size * minibatch_size * input_dim, dtype = np.float32), (seq_size, minibatch_size, input_dim))
        >>> labels = np.reshape(np.arange(seq_size * minibatch_size, dtype = np.float32), (seq_size, minibatch_size, 1)) % num_output_classes
        >>> target_labels = labels // n_outputs_per_class
        >>> target_output_in_labels = labels % n_outputs_per_class
        >>> h_z.eval({h_input: a, h_target_class: target_labels, h_target_output_in_class: target_output_in_labels})[1]
        array([[ 0.000859],
               [ 0.      ],
               [ 0.      ]], dtype=float32)

    Args:
        input_var: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        num_output_classes: int
        target_class: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        target_output_in_class: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        batch_size: int (not referenced by the body of this function)
        w1: C.parameter
        b1: C.parameter
        w2s: C.parameter
        b2s: C.parameter

    Returns:
        output_prob: class:`~cntk.ops.functions.Function`
        class_probs: class:`~cntk.ops.functions.Function`
        all_probs: a list of class:`~cntk.ops.functions.Function`
    '''
    feature_dim = input_var.shape[0]
    # Square-root split: as many classes as outputs per class.
    num_groups = int(math.ceil(math.sqrt(num_output_classes)))
    group_width = num_groups

    # First level: distribution over the classes.
    class_logits = C.times(input_var, w1)
    class_dist = C.softmax(b1 + class_logits)

    # Second level: pick the weights / bias of the target class and
    # broadcast them along the sequence of the input.
    gathered_w2 = C.gather(w2s, target_class)
    target_w2 = reshape(gathered_w2, (feature_dim, group_width))
    target_w2 = C.sequence.broadcast_as(target_w2, input_var)
    gathered_b2 = C.gather(b2s, target_class)
    target_b2 = reshape(gathered_b2, (group_width))
    target_b2 = C.sequence.broadcast_as(target_b2, input_var)

    within_logits = times(input_var, target_w2)
    within_dist = softmax(target_b2 + within_logits)
    # The distribution is broadcast against the raw target first and against
    # its one-hot encoding afterwards; both steps are kept as they were.
    within_dist = C.sequence.broadcast_as(within_dist, target_output_in_class)
    within_onehot = C.one_hot(target_output_in_class, group_width, False)
    within_dist = C.sequence.broadcast_as(within_dist, within_onehot)
    prob_within_class = C.times_transpose(within_dist, within_onehot)

    # Probability of the target class itself.
    class_onehot = C.one_hot(target_class, num_groups, False)
    class_dist = C.sequence.broadcast_as(class_dist, class_onehot)
    prob_of_class = C.times_transpose(class_dist, class_onehot)

    output_prob = C.element_times(prob_of_class, prob_within_class)

    # Probabilities of all outputs, computed one class at a time.
    all_probs = []
    for group in range(num_groups):
        group_index = C.constant(group)
        group_w2 = C.reshape(C.gather(w2s, group_index), (feature_dim, group_width))
        group_w2 = C.sequence.broadcast_as(group_w2, input_var)
        group_b2 = C.reshape(C.gather(b2s, group_index), (group_width))
        group_b2 = C.sequence.broadcast_as(group_b2, input_var)
        group_dist = C.softmax(group_b2 + times(input_var, group_w2))

        # One-hot selector that extracts this class' probability.
        # NOTE(review): the one-hot width is the per-class output count; it
        # equals the class count here, which is what class_dist is indexed by.
        selector = C.constant(group, shape=[1])
        selector = C.reconcile_dynamic_axes(selector, class_dist)
        selector = C.one_hot(selector, group_width, False)
        group_prob = C.times_transpose(class_dist, selector)
        group_prob = C.sequence.broadcast_as(group_prob, group_dist)

        all_probs.append(C.element_times(group_prob, group_dist))

    return output_prob, class_dist, all_probs
def bigru_with_match(dh, x):
    """Run one gated GRU step on ``x`` augmented with a matching context.

    ``matching_model``, ``att_input``, ``Wg`` and ``att_gru`` are resolved
    outside this function.

    Args:
        dh: Value passed to ``matching_model`` and to ``att_gru``.
        x: Current input; spliced with the matching context before gating.

    Returns:
        The result of ``att_gru(dh, gated_input)``.
    """
    context = matching_model(att_input, dh)
    joined = C.splice(x, context)

    # Sigmoid gate computed from the joined vector and applied element-wise.
    gate = C.sigmoid(C.times(joined, Wg))
    gated = C.element_times(joined, gate)

    return att_gru(dh, gated)
def test_Mul(tmpdir, dtype):
    """Verify an ElementTimes graph of two constants for the given dtype.

    Args:
        tmpdir: Passed through to ``verify_no_input``.
        dtype: Element type for both operands and for the CNTK default
            options active while the graph is built.
    """
    with C.default_options(dtype=dtype):
        lhs = np.asarray([1.0] * 4, dtype=dtype)
        rhs = np.asarray([0.5, 0.25, 0.125, 0.0], dtype=dtype)
        product = C.element_times(lhs, rhs)
        verify_no_input(product, tmpdir, 'ElementTimes_0')
def test_Mul(tmpdir, dtype):
    """Check element-wise multiplication of two constant vectors.

    The graph is built under ``C.default_options(dtype=dtype)`` and has no
    free inputs; it is handed to ``verify_no_input`` with the case name
    'ElementTimes_0'.

    Args:
        tmpdir: Passed through to ``verify_no_input``.
        dtype: Element type of both operand arrays.
    """
    operand_values = (
        [1., 1., 1., 1.],
        [0.5, 0.25, 0.125, 0.],
    )
    with C.default_options(dtype=dtype):
        first, second = (np.asarray(values, dtype=dtype) for values in operand_values)
        verify_no_input(C.element_times(first, second), tmpdir, 'ElementTimes_0')
def train_and_evaluate(reader_train, reader_test, max_epochs, model_func):
    """Train ``model_func`` on ``reader_train``, test it and plot the run.

    Training stops early as soon as a minibatch reaches an evaluation
    average below 0.025. After the test pass, moving averages (window of 32
    minibatches) of training loss and error are plotted with matplotlib.
    Input shape and label size come from the outer names ``num_channels``,
    ``image_height``, ``image_width`` and ``num_classes``.

    Args:
        reader_train: Minibatch source used for training.
        reader_test: Minibatch source used for the evaluation pass.
        max_epochs: Upper bound on the number of training epochs.
        model_func: Callable invoked as
            ``model_func(normalized_input, out_dims=num_classes)``.

    Returns:
        ``softmax(z)`` where ``z`` is the output of ``model_func``.
    """
    # Network inputs: image features and labels.
    input_var = input_variable((num_channels, image_height, image_width))
    label_var = input_variable((num_classes))

    # Scale the input by 1/256 and apply the model.
    input_var_norm = element_times(1.0 / 256.0, input_var)
    z = model_func(input_var_norm, out_dims=num_classes)

    #
    # Training
    #
    ce = cross_entropy_with_softmax(z, label_var)
    pe = classification_error(z, label_var)

    train_epoch_size = 20000
    train_mb_size = 64

    lr_per_minibatch = learning_rate_schedule([0.01]*10 + [0.003]*10 + [0.001], UnitType.minibatch, train_epoch_size)
    momentum_time_constant = momentum_as_time_constant_schedule(-train_mb_size/np.log(0.9))

    progress_printer = ProgressPrinter(0)
    learner = momentum_sgd(z.parameters,
                           lr=lr_per_minibatch,
                           momentum=momentum_time_constant,
                           l2_regularization_weight=0.001)
    trainer = Trainer(z, (ce, pe), [learner], [progress_printer])

    train_map = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.labels
    }

    log_number_of_parameters(z)
    print()

    reached_target = False
    batch_index = 0
    plot_data = {'batchindex': [], 'loss': [], 'error': []}
    for _ in range(max_epochs):
        seen = 0
        while seen < train_epoch_size:
            request = min(train_mb_size, train_epoch_size - seen)
            batch = reader_train.next_minibatch(request, input_map=train_map)
            trainer.train_minibatch(batch)
            seen += batch[label_var].num_samples

            # Record this minibatch for the plots below.
            plot_data['batchindex'].append(batch_index)
            plot_data['loss'].append(trainer.previous_minibatch_loss_average)
            plot_data['error'].append(trainer.previous_minibatch_evaluation_average)

            # NOTE(review): the printer is also registered with the Trainer
            # above; confirm this explicit update does not log twice.
            progress_printer.update_with_trainer(trainer, with_metric=True)
            batch_index += 1

            # Early stop once a minibatch is good enough.
            if trainer.previous_minibatch_evaluation_average < 0.025:
                reached_target = True
                break

        if reached_target:
            break
        progress_printer.epoch_summary(with_metric=True)

    #
    # Evaluation
    #
    test_size = 6600
    test_mb_size = 32

    test_map = {
        input_var: reader_test.streams.features,
        label_var: reader_test.streams.labels
    }

    weighted_errors = 0
    total_weight = 0
    seen = 0
    batches_done = 0
    while seen < test_size:
        request = min(test_mb_size, test_size - seen)
        batch = reader_test.next_minibatch(request, input_map=test_map)

        # Errors are weighted by the requested size, while progress is
        # tracked by the number of samples the reader actually delivered.
        weighted_errors += trainer.test_minibatch(batch) * request
        total_weight += request
        seen += batch[label_var].num_samples
        batches_done += 1

    # NOTE(review): batches_done already equals the number of evaluated
    # minibatches; the "+1" in the message looks like an off-by-one --
    # confirm before changing since log consumers may rely on it.
    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.1f}% * {}".format(batches_done+1, (weighted_errors*100.0)/total_weight, total_weight))
    print("")

    #
    # Visualization: moving averages over a fixed window of minibatches.
    #
    window_width = 32

    def windowed_mean(series):
        # Difference of cumulative sums gives the sum over each window.
        running = np.cumsum(np.insert(series, 0, 0))
        return (running[window_width:] - running[:-window_width]) / window_width

    plot_data['avg_loss'] = windowed_mean(plot_data['loss'])
    plot_data['avg_error'] = windowed_mean(plot_data['error'])
    plot_data['batchindex'] = np.insert(plot_data['batchindex'], 0, 0)[window_width:]

    plt.figure(1)

    plt.subplot(211)
    plt.plot(plot_data["batchindex"], plot_data["avg_loss"], 'b--')
    plt.xlabel('Minibatch number')
    plt.ylabel('Loss')
    plt.title('Minibatch run vs. Training loss ')
    plt.show()

    plt.subplot(212)
    plt.plot(plot_data["batchindex"], plot_data["avg_error"], 'r--')
    plt.xlabel('Minibatch number')
    plt.ylabel('Label Prediction Error')
    plt.title('Minibatch run vs. Label Prediction Error ')
    plt.show()

    return softmax(z)
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True, proposal_layer_param_string=None, conv_bias_init=0.0):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out: The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info: A CNTK variable or constant containing
                 (pad_width, pad_height, scaled_image_width, scaled_image_height, orig_img_width, orig_img_height)
                 e.g. (1000, 1000, 1000, 600, 500, 300) for an original image of 600x300 that is scaled and padded to 1000x1000
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer
                 (and, when losses are requested, to the anchor target layer).
        conv_bias_init: Value passed as ``init_bias`` to the three RPN convolutions.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''
    # RPN network: a shared 3x3 convolution followed by two 1x1 heads.
    rpn_channels = cfg["CNTK"].RPN_NUM_CHANNELS
    shared_conv = Convolution((3, 3), rpn_channels, activation=relu, pad=True, strides=1,
                              init=normal(scale=0.01), init_bias=conv_bias_init)
    rpn_conv_3x3 = shared_conv(conv_out)

    # 18 = 2(bg/fg) * 9(anchors)
    cls_head = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                           init=normal(scale=0.01), init_bias=conv_bias_init)
    rpn_cls_score = cls_head(rpn_conv_3x3)

    # 36 = 4(coords) * 9(anchors)
    bbox_head = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                            init=normal(scale=0.01), init_bias=conv_bias_init)
    rpn_bbox_pred = bbox_head(rpn_conv_3x3)

    # Softmax over (bg, fg): reshape the scores to (2, N, H, W), apply the
    # softmax along axis 0 inside a block, then reshape back to the shape of
    # the raw scores.
    num_predictions = int(rpn_cls_score.shape[0] / 2)
    paired_shape = (2, num_predictions, rpn_cls_score.shape[1], rpn_cls_score.shape[2])
    rpn_cls_score_rshp = reshape(rpn_cls_score, paired_shape, name="rpn_cls_score_rshp")
    softmax_in = cntk.placeholder()
    softmax_out = softmax(softmax_in, axis=0)
    rpn_cls_prob = cntk.as_block(softmax_out, [(softmax_in, rpn_cls_score_rshp)], 'Softmax', 'rpn_cls_prob')
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, rpn_cls_score.shape, name="rpn_cls_prob_reshape")

    # Proposal layer.
    proposal_layer = ProposalLayer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info,
                                   param_str=proposal_layer_param_string)
    rpn_rois = alias(user_function(proposal_layer), name='rpn_rois')

    if not add_loss_functions:
        return rpn_rois, None

    # RPN targets. rpn_cls_score is only passed to obtain the width and
    # height of the conv feature map.
    anchor_targets = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info,
                                                     param_str=proposal_layer_param_string))
    rpn_labels = anchor_targets.outputs[0]
    rpn_bbox_targets = anchor_targets.outputs[1]
    rpn_bbox_inside_weights = anchor_targets.outputs[2]

    # Classification loss, built on placeholders and wrapped in a block.
    labels_in = cntk.placeholder()
    scores_in = cntk.placeholder()
    # Only terms with a label >= 0 take part in the classification loss.
    keeps = cntk.greater_equal(labels_in, 0.0)
    fg_labels = element_times(labels_in, keeps, name="fg_targets")
    bg_labels = minus(1, fg_labels, name="bg_targets")
    two_class_labels = splice(bg_labels, fg_labels, axis=0)
    cross_entropy = cross_entropy_with_softmax(scores_in, two_class_labels, axis=0)
    masked_cross_entropy = element_times(cross_entropy, keeps)
    # Normalize by the number of kept terms.
    cls_num_terms = reduce_sum(keeps)
    cls_normalization_factor = 1.0 / cls_num_terms
    normalized_cls_loss = reduce_sum(masked_cross_entropy) * cls_normalization_factor

    reduced_rpn_loss_cls = cntk.as_block(
        normalized_cls_loss,
        [(labels_in, rpn_labels), (scores_in, rpn_cls_score_rshp)],
        'CE_with_ignore', 'norm_rpn_cls_loss')

    # Regression loss, again built on placeholders and wrapped in a block.
    bbox_pred_in = cntk.placeholder()
    bbox_targets_in = cntk.placeholder()
    inside_weights_in = cntk.placeholder()
    smooth_l1 = SmoothL1Loss(cfg["CNTK"].SIGMA_RPN_L1, bbox_pred_in, bbox_targets_in,
                             inside_weights_in, 1.0)
    # The bbox loss is normalized by the rpn batch size.
    bbox_normalization_factor = 1.0 / cfg["TRAIN"].RPN_BATCHSIZE
    normalized_bbox_loss = reduce_sum(smooth_l1) * bbox_normalization_factor

    reduced_rpn_loss_bbox = cntk.as_block(
        normalized_bbox_loss,
        [(bbox_pred_in, rpn_bbox_pred),
         (bbox_targets_in, rpn_bbox_targets),
         (inside_weights_in, rpn_bbox_inside_weights)],
        'SmoothL1Loss', 'norm_rpn_bbox_loss')

    rpn_losses = plus(reduced_rpn_loss_cls, reduced_rpn_loss_bbox, name="rpn_losses")
    return rpn_rois, rpn_losses
def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_epochs=80):
    """Train a conv net with local response normalization and test it.

    The network input shape and label size come from the outer names
    ``num_channels``, ``image_height``, ``image_width`` and ``num_classes``;
    one model file per epoch is written below ``model_path``.

    Args:
        reader_train: Minibatch source for training; its ``streams.features``
            and ``streams.labels`` are bound to the network inputs.
        reader_test: Minibatch source read during the evaluation pass.
        epoch_size: Number of samples consumed per training epoch.
        max_epochs: Number of training epochs.

    Returns:
        The average of ``trainer.test_minibatch`` results weighted by the
        requested minibatch sizes.
    """
    _cntk_py.set_computation_network_trace_level(1)

    # Network inputs: image features and labels.
    input_var = cntk.input((num_channels, image_height, image_width))
    label_var = cntk.input((num_classes))

    # Scale the input by 0.00390625 (= 1/256) before it enters the model.
    scaled_input = cntk.element_times(cntk.constant(0.00390625), input_var)

    def conv_stage():
        # Two 3x3 convolutions, LRN, then 3x3 max pooling with stride 2.
        return [
            cntk.layers.Convolution2D((3, 3), 64),
            cntk.layers.Convolution2D((3, 3), 64),
            LocalResponseNormalization(1.0, 4, 0.001, 0.75),
            cntk.layers.MaxPooling((3, 3), (2, 2)),
        ]

    def dense_stage(i):
        # Dense width depends on the repetition index: 256, then 128.
        return [cntk.layers.Dense([256, 128][i]), cntk.layers.Dropout(0.5)]

    with cntk.layers.default_options(activation=cntk.relu, pad=True):
        network = cntk.layers.Sequential([
            cntk.layers.For(range(2), conv_stage),
            cntk.layers.For(range(2), dense_stage),
            cntk.layers.Dense(num_classes, activation=None),
        ])
        z = network(scaled_input)

    # Loss and evaluation metric.
    ce = cntk.cross_entropy_with_softmax(z, label_var)
    pe = cntk.classification_error(z, label_var)

    train_mb_size = 64

    # Per-sample learning rates as (value, number of epochs) stages.
    lr_stages = [(0.0015625, 20), (0.00046875, 20), (0.00015625, 20),
                 (0.000046875, 10), (0.000015625, 1)]
    lr_values = [rate for rate, repeats in lr_stages for _ in range(repeats)]
    lr_schedule = cntk.learning_rate_schedule(
        lr_values, unit=cntk.learners.UnitType.sample, epoch_size=epoch_size)

    # Momentum expressed as time constants, staged the same way.
    time_constant_stages = [(0, 20), (600, 20), (1200, 1)]
    time_constants = [tc for tc, repeats in time_constant_stages for _ in range(repeats)]
    mm_schedule = cntk.learners.momentum_as_time_constant_schedule(
        time_constants, epoch_size=epoch_size)

    learner = cntk.learners.momentum_sgd(
        z.parameters,
        lr_schedule,
        mm_schedule,
        unit_gain=True,
        l2_regularization_weight=0.002)
    progress_printer = cntk.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)
    trainer = cntk.Trainer(z, (ce, pe), learner, progress_printer)

    # Bind reader streams to the network inputs.
    input_map = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.labels
    }

    cntk.logging.log_number_of_parameters(z)
    print()

    # Training: one snapshot is saved after every epoch.
    for epoch in range(max_epochs):
        seen = 0
        while seen < epoch_size:
            request = min(train_mb_size, epoch_size - seen)
            batch = reader_train.next_minibatch(request, input_map=input_map)
            trainer.train_minibatch(batch)
            seen += trainer.previous_minibatch_sample_count

        trainer.summarize_training_progress()
        snapshot_path = os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch))
        z.save(snapshot_path)

    # Evaluation.
    test_size = 10000
    test_mb_size = 16

    weighted_errors = 0
    total_weight = 0
    seen = 0
    batches_done = 0
    while seen < test_size:
        request = min(test_mb_size, test_size - seen)
        batch = reader_test.next_minibatch(request, input_map=input_map)

        # Errors are weighted by the requested size, while progress is
        # tracked by the number of samples the reader actually delivered.
        weighted_errors += trainer.test_minibatch(batch) * request
        total_weight += request
        seen += batch[label_var].num_samples
        batches_done += 1

    # NOTE(review): batches_done already equals the number of evaluated
    # minibatches; the "+ 1" in the message looks like an off-by-one --
    # confirm before changing since log consumers may rely on it.
    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        batches_done + 1, (weighted_errors * 100.0) / total_weight, total_weight))
    print("")

    return weighted_errors / total_weight
def gru_with_attentioin(dh, x):
    """Run one gated GRU step on ``x`` augmented with an attention context.

    ``attention_model``, ``att_input``, ``Wg`` and ``att_gru`` are resolved
    outside this function.

    Args:
        dh: Value passed to ``att_gru``.
        x: Current input; used as the second argument of ``attention_model``
            and spliced with the resulting context before gating.

    Returns:
        The result of ``att_gru(dh, gated_input)``.
    """
    context = attention_model(att_input, x)
    joined = C.splice(x, context)

    # Sigmoid gate computed from the joined vector and applied element-wise.
    gate = C.sigmoid(C.times(joined, Wg))
    gated = C.element_times(joined, gate)

    return att_gru(dh, gated)