def proj_LSTM(input_dim, out_dim, init_W, init_H, init_b, init_W_0):
    '''Projected LSTM cell whose weights are constants initialized from numpy arrays.'''
    W = C.Constant(shape=(input_dim, 4096*4), value=init_W)  # (512, 4096*4)
    H = C.Constant(shape=(out_dim, 4*4096), value=init_H)
    b = C.Constant(shape=(4096*4,), value=init_b)
    proj_W = C.Constant(shape=(4096, out_dim), value=init_W_0)
    stacked_dim = 4096

    @C.Function
    def unit(dh, dc, x):
        '''dh: out_dim, dc: 4096, x: input_dim'''
        proj4 = b + C.times(x, W) + C.times(dh, H)
        it_proj = proj4[0:1*stacked_dim]  # split along stack_axis
        bit_proj = proj4[1*stacked_dim: 2*stacked_dim]
        ft_proj = proj4[2*stacked_dim: 3*stacked_dim]
        ot_proj = proj4[3*stacked_dim: 4*stacked_dim]

        it = C.sigmoid(it_proj)       # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * C.tanh(bit_proj)   # applied to tanh of input network
        ft = C.sigmoid(ft_proj)       # forget-me-not gate(t)
        bft = ft * dc                 # applied to cell(t-1)
        ct = bft + bit                # c(t) is sum of both
        ot = C.sigmoid(ot_proj)       # output gate(t)
        ht = ot * C.tanh(ct)          # applied to tanh(cell(t))

        c = ct                        # cell value
        h = ht
        proj_h = C.times(h, proj_W)   # out_dim
        return (proj_h, c)

    return unit

def model(seq_image, decoded):
    params = dense(decoded)
    g_x, g_y, sigma2, delta, gamma = attention_parameters(params)

    i = C.Constant(np.arange(n) + 1,)  # col of patch
    j = C.Constant(np.arange(n) + 1,)  # row of patch
    mu_x = g_x + (i - n / 2 - 0.5) * delta
    mu_y = g_y + (j - n / 2 - 0.5) * delta
    mu_x = C.expand_dims(mu_x, axis=-1)
    mu_y = C.expand_dims(mu_y, axis=-1)
    # mu_x: [#, *] [n, 1]
    # mu_y: [#, *] [n, 1]

    image = C.sequence.unpack(seq_image, padding_value=0, no_mask_output=True)
    # image: [#] [*image_width, filters, image_height]

    width_pos = Cx.sequence.position(seq_image)
    # width_pos: [#, *] [1]

    width_pos_unpacked = C.sequence.unpack(width_pos, padding_value=999_999, no_mask_output=True)
    # width_pos: [#] [*image_width, 1]

    a = C.sequence.broadcast_as(C.swapaxes(width_pos_unpacked), mu_x)
    # a: [#, *] [1, *image_width]
    # x pos index of image (width)

    b = C.Constant(np.arange(image_height).reshape((1, -1)))
    # b: [] [1, image_height]
    # y pos index of image (height)

    # calculate which portion of the image is attended to by the gaussian filter
    f_xi = C.exp(-0.5 * C.square(a - mu_x) / sigma2)
    f_yj = C.exp(-0.5 * C.square(b - mu_y) / sigma2)
    # f_xi: [#, *] [n, *image_width]
    # f_yj: [#, *] [n, image_height]

    z_x = C.reduce_sum(f_xi, axis=1)
    z_y = C.reduce_sum(f_yj, axis=1)
    # z_x: [#, *] [n]
    # z_y: [#, *] [n]

    f_xi = f_xi / z_x
    f_yj = f_yj / z_y
    # f_xi: [#, *] [n, *image_width]
    # f_yj: [#, *] [n, image_height]

    # combine filters from x and y
    image_broadcasted = C.sequence.broadcast_as(image, f_yj)
    attended = gamma * C.times(f_xi, C.times_transpose(image_broadcasted, f_yj), output_rank=2)
    # attended: [#, *] [n, filters, n]
    attended = C.swapaxes(attended)
    # attended: [#, *] [filters, n (x), n (y)]
    return attended

def test_constant_eval():
    c = C.Constant(value=1)
    c_plus_1 = c + 1
    op = C.combine([c_plus_1, c])
    result = op.eval({})
    assert np.array_equal(result[c_plus_1.output], [2.0])
    assert np.array_equal(result[c], 1.0)

def flow_reverse(chunk):
    input_dim = chunk['input_dim']
    log_det_J = 0
    _half_dim = input_dim // 2

    _ph = C.placeholder(input_dim, name='place_holder')
    _log_s_func = chunk['log_s_func']
    _t_func = chunk['t_func']

    _y1, _y2 = _ph[:_half_dim], _ph[_half_dim:]
    _log_s = _log_s_func(_y2)
    _t = _t_func(_y2)
    _s = C.exp(_log_s)
    _x1 = (_y1 - _t) / _s
    _x2 = _y2
    _X = C.splice(_x1, _x2)
    log_det_J += C.reduce_sum(C.log(C.abs(_s)))

    _w = chunk['W_rot_mat']
    chunk['W_rot_mat_inv'] = _inv_w = C.Constant(np.linalg.inv(_w.value), name='inv_W')
    _out = _X @ _inv_w
    log_det_J += input_dim * C.log(C.det(_inv_w))

    # if 'scale' in chunk:
    #     _out -= chunk['bias']
    #     _out /= chunk['scale']
    #     log_det_J += input_dim*C.reduce_sum(C.log(C.abs(chunk['scale'])))
    # _out -= chunk['b']
    # _out @= _inv_w

    return _out, log_det_J

def attention(query, key, value):
    dk = C.reduce_sum(C.ones_like(query))  # cannot use sequence.last, will conflict with recurrence
    # dk: [#, *] [1, ]; its value equals dim_of_query

    unpacked_key = C.sequence.unpack(key, padding_value=0, no_mask_output=True)      # [#] [-3, key_dim]
    unpacked_value = C.sequence.unpack(value, padding_value=0, no_mask_output=True)  # [#] [-3, value_dim]

    broadcasted_key = C.sequence.broadcast_as(unpacked_key, query)  # [#, *] [-3, key_dim]
    scaled = C.times_transpose(query, broadcasted_key) / dk
    # [#, *] [q_dim] @ [#, *] [key_dim, -3], assert q_dim == key_dim
    # scaled: [#, *] [-3, ] => for every key seq element, there is a corresponding score

    # mask out invalid temporal connections to obey_sequence_order
    if obey_sequence_order and max_seq_len:
        unpacked_scaled, scaled_mask = C.sequence.unpack(scaled, padding_value=0).outputs
        # unpacked_scaled: [#] [-3, -3] <== matrix will be top right diagonally zero-ed
        # scaled_mask: [#] [-3,]

        minus_inf = C.constant(-1e+30)
        valid_connections = C.Constant(np.tril(np.ones((max_seq_len, max_seq_len)), k=0))  # [] [max_seq, max_seq]
        valid_connections = C.reconcile_dynamic_axes(valid_connections, unpacked_scaled)   # [#] [max_seq, max_seq]
        valid_connections = C.crop_manual(valid_connections, unpacked_scaled, 0, 0)        # [#] [-3, -3]
        unpacked_scaled = C.element_select(valid_connections, unpacked_scaled, minus_inf)  # [#] [-3, -3]
        scaled = C.to_sequence_like(unpacked_scaled, query)                                # [#, *] [-3]

    elif obey_sequence_order and not max_seq_len:
        raise ValueError("max_seq_len must be defined when obey_sequence_order is True")

    attended = C.times(C.softmax(scaled, axis=-1),
                       C.sequence.broadcast_as(unpacked_value, query))  # [#, *] [value_dim,]
    return attended

def create_model(input_sequence, label_sequence, vocab_dim, hidden_dim):
    # Create the rnn that computes the latent representation for the next token.
    rnn_with_latent_output = Sequential([
        C.layers.Embedding(hidden_dim),
        For(range(num_layers), lambda: Sequential([
            Stabilizer(),
            Recurrence(LSTM(hidden_dim), go_backwards=False)
        ])),
    ])

    # Apply it to the input sequence.
    latent_vector = rnn_with_latent_output(input_sequence)

    # Connect the latent output to (sampled/full) softmax.
    if use_sampled_softmax:
        weights = load_sampling_weights(token_frequencies_file_path)
        smoothed_weights = np.float32(np.power(weights, alpha))
        sampling_weights = C.reshape(C.Constant(smoothed_weights), shape=(1, vocab_dim))
        z, ce, errs = cross_entropy_with_sampled_softmax(
            latent_vector, label_sequence, vocab_dim, hidden_dim,
            softmax_sample_size, sampling_weights)
    else:
        z, ce, errs = cross_entropy_with_full_softmax(
            latent_vector, label_sequence, vocab_dim, hidden_dim)

    return z, ce, errs

def pad(x, pattern, mode=C.CONSTANT_PAD, constant_value=0, name=''):
    """
    Pads a tensor in the sequence axis according to the specified patterns.
    Three padding modes are supported: CONSTANT / REFLECT / SYMMETRIC.

    Arguments:
        x: tensor to be padded.
        pattern (tuple with 2 integers): how many values to add before and after the contents in the sequence axis.
        mode (int): padding mode: C.ops.CONSTANT_PAD, C.ops.REFLECT_PAD or C.ops.SYMMETRIC_PAD
        constant_value: the value used to fill the padding cells, only meaningful under CONSTANT mode.
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`
    """
    if not isinstance(pattern, tuple) or not all(isinstance(i, int) for i in pattern):
        raise ValueError(f"pattern {pattern} must be a tuple with 2 integers")

    ndim = len(x.shape)
    null_pattern = [(0, 0)] * ndim
    final_pattern = [pattern] + null_pattern

    b, valid = C.sequence.unpack(x, padding_value=0).outputs
    c = C.pad(b, final_pattern, mode=mode, constant_value=constant_value)
    seq_length = C.reduce_sum(valid, axis=0) + C.Constant(sum(pattern))
    d = C.to_sequence(c, seq_length, name=name)
    return d

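# A minimal usage sketch for the `pad` helper above (the names s/data are
# illustrative, not from the original): prepend one step and append two steps
# of zeros along the sequence axis of a sequence of 2-dim vectors.
import numpy as np
import cntk as C

s = C.sequence.input_variable(2)
padded = pad(s, (1, 2), constant_value=0)
data = [np.arange(6, dtype=np.float32).reshape(3, 2)]   # one sequence of 3 steps
print(padded.eval({s: data})[0].shape)                   # expected (6, 2): 1 + 3 + 2 steps
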
def create_model(model_details, num_classes, input_features, new_prediction_node_name='prediction', freeze=False):
    # Load the pretrained classification net and find nodes
    base_model = C.load_model(model_details['model_file'])
    feature_node = C.logging.find_by_name(base_model, model_details['feature_node_name'])
    last_node = C.logging.find_by_name(base_model, model_details['last_hidden_node_name'])

    # Clone the desired layers with fixed weights
    cloned_layers = C.combine([last_node.owner]).clone(
        C.CloneMethod.freeze if freeze else C.CloneMethod.clone,
        {feature_node: C.placeholder(name='features')})

    # Add new dense layer for class prediction
    feat_norm = input_features - C.Constant(114)
    cloned_out = cloned_layers(feat_norm)
    z = C.layers.Dense(num_classes, activation=None, name=new_prediction_node_name)(cloned_out)
    return z

def create_model(model_details, num_classes, input_features, new_prediction_node_name="prediction", freeze=False):
    # Load the pre-trained classification net and find nodes
    base_model = cntk.load_model(model_details["model_file"])
    feature_node = cntk.logging.find_by_name(base_model, model_details["feature_node_name"])
    last_node = cntk.logging.find_by_name(base_model, model_details["last_hidden_node_name"])

    if model_details["inception"]:
        node_outputs = cntk.logging.get_node_outputs(base_model)
        last_node = node_outputs[5]
        feature_node = cntk.logging.find_all_with_name(base_model, "")[-5]
    if model_details["vgg"]:
        last_node = cntk.logging.find_by_name(base_model, "prob")
        feature_node = cntk.logging.find_by_name(base_model, "data")

    # Clone the desired layers with fixed weights
    cloned_layers = cntk.combine([last_node.owner]).clone(
        cntk.CloneMethod.freeze if freeze else cntk.CloneMethod.clone,
        {feature_node: cntk.placeholder(name="features")},
    )

    # Add new dense layer for class prediction
    feat_norm = input_features - cntk.Constant(114)
    cloned_out = cloned_layers(feat_norm)
    z = cntk.layers.Dense(num_classes, activation=None, name=new_prediction_node_name)(cloned_out)
    return z

def create_network(cfg):
    """Build the network for Faster R-CNN."""
    # Create input variables
    features = C.input_variable(shape=(cfg.NUM_CHANNELS, cfg.IMAGE_HEIGHT, cfg.IMAGE_WIDTH),
                                dynamic_axes=[C.Axis.default_batch_axis()],
                                name=cfg["MODEL"].FEATURE_NODE_NAME)
    # roi_input
    scaled_gt_boxes = C.input_variable((cfg.INPUT_ROIS_PER_IMAGE, 5),
                                       dynamic_axes=[C.Axis.default_batch_axis()])
    dims_in = C.input_variable((6), dynamic_axes=[C.Axis.default_batch_axis()])
    dims_input = C.alias(dims_in, name='dims_input')

    # Load the pre-trained classification net and clone layers
    base_model = C.load_model(cfg['BASE_MODEL_PATH'])
    conv_layers = clone_conv_layers(base_model, cfg)
    fc_layers = clone_model(base_model, [cfg["MODEL"].POOL_NODE_NAME],
                            [cfg["MODEL"].LAST_HIDDEN_NODE_NAME],
                            clone_method=CloneMethod.clone)

    # Normalization and conv layers
    feat_norm = features - C.Constant([[[v]] for v in cfg["MODEL"].IMG_PAD_COLOR])
    conv_out = conv_layers(feat_norm)

    # RPN and prediction targets
    rpn_rois, rpn_losses = create_rpn(conv_out, scaled_gt_boxes, dims_input, cfg)
    rois, label_targets, bbox_targets, bbox_inside_weights = create_proposal_target_layer(
        rpn_rois, scaled_gt_boxes, cfg)

    # Fast RCNN and losses
    cls_score, bbox_pred = create_fast_rcnn_predictor(conv_out, rois, fc_layers, cfg)
    detection_losses = create_detection_losses(cls_score, label_targets, bbox_pred, rois,
                                               bbox_targets, bbox_inside_weights, cfg)
    loss = rpn_losses + detection_losses
    pred_error = classification_error(cls_score, label_targets, axis=1)

    e2e_lr_factor = cfg["MODEL"].E2E_LR_FACTOR
    e2e_lr_per_sample_scaled = [x * e2e_lr_factor for x in cfg["CNTK"].E2E_LR_PER_SAMPLE]
    mm_schedule = momentum_schedule(cfg["CNTK"].MOMENTUM_PER_MB)

    print("Using base model: {}".format(cfg["MODEL"].BASE_MODEL))
    print("lr_per_sample: {}".format(e2e_lr_per_sample_scaled))

    return {
        'features': features,
        'roi_input': scaled_gt_boxes,
        'loss': loss,
        'pred_error': pred_error,
        'dim_input': dims_in
    }

def create_sparse_to_dense(input_vocab_dim):
    I = C.Constant(np.eye(input_vocab_dim))

    @C.Function
    def no_op(input: InputSequence[C.layers.SparseTensor[input_vocab_dim]]):
        return C.times(input, I)

    return no_op

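# A hedged usage sketch (assumes `InputSequence` is the sequence-type alias
# defined elsewhere in this file, as in the CNTK sequence-to-sequence example;
# the vocabulary size and token ids below are illustrative).
import numpy as np
import cntk as C

vocab = 5
sparse_to_dense = create_sparse_to_dense(vocab)
one_hot_batch = C.Value.one_hot([[0, 3, 1]], vocab)   # one sequence of 3 tokens
dense = sparse_to_dense.eval(one_hot_batch)           # times() with the identity densifies the input
print(np.asarray(dense))                              # 3 one-hot rows of width 5
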
def cross_entropy_with_sampled_softmax(
        hidden_vector,          # Node providing the output of the recurrent layers
        target_vector,          # Node providing the expected labels (as sparse vectors)
        vocab_dim,              # Vocabulary size
        hidden_dim,             # Dimension of the hidden vector
        num_samples,            # Number of samples to use for sampled softmax
        sampling_weights,       # Node providing weights to be used for the weighted sampling
        allow_duplicates=False  # Boolean flag: sample with replacement (True) or without replacement (False)
        ):
    bias = C.layers.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.layers.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(
        sampling_weights, num_samples, allow_duplicates)  # sparse matrix [num_samples * vocab_size]
    if use_sparse:
        sample_selector = sample_selector_sparse
    else:
        # Note: Sampled softmax with dense data is only supported for debugging purposes.
        # It might easily run into memory issues as the matrix 'I' below might be quite large.
        # In case we want a dense representation for all data we have to convert the sample selector.
        I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
        sample_selector = C.times(sample_selector_sparse, I)

    inclusion_probs = C.random_sample_inclusion_frequency(
        sampling_weights, num_samples, allow_duplicates)  # dense row [1 * vocab_size]
    log_prior = C.log(inclusion_probs)                    # dense row [1 * vocab_dim]

    print("hidden_vector: " + str(hidden_vector.shape))
    wS = C.times(sample_selector, weights, name='wS')     # [num_samples * hidden_dim]
    print("wS: " + str(wS.shape))
    zS = C.times_transpose(wS, hidden_vector, name='zS1') \
        + C.times(sample_selector, bias, name='zS2') \
        - C.times_transpose(sample_selector, log_prior, name='zS3')  # [num_samples]

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(target_vector, weights, name='wT')       # [1 * hidden_dim]
    zT = C.times_transpose(wT, hidden_vector, name='zT1') \
        + C.times(target_vector, bias, name='zT2') \
        - C.times_transpose(target_vector, log_prior, name='zT3')    # [1]

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    # We don't check whether any of the classes in the random samples coincides with the true label,
    # so it might happen that the true class is counted twice in the normalizing denominator of sampled softmax.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape=(vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)

    return (z, cross_entropy_on_samples, error_on_samples)

def create_model(input_vector, label_vector, freq_list, vocab_dim, hidden_dim):
    hidden_vector = C.layers.Embedding(hidden_dim)(input_vector)
    # hidden_vector = C.times(input_vector, weights1) + bias1
    smoothed_weights = np.float32(np.power(freq_list, alpha))
    sampling_weights = C.reshape(C.Constant(smoothed_weights), shape=(1, vocab_dim))
    return cross_entropy_with_sampled_softmax(hidden_vector, label_vector, vocab_dim,
                                              hidden_dim, num_of_samples, sampling_weights)

def __cntk_trace__(m):
    if len(m.shape) != 2:
        raise RuntimeError(f'{m.shape} does not have 2 dims')
    if m.shape[0] != m.shape[1]:
        raise RuntimeError(f'{m.shape} is not a square matrix')
    _dim = m.shape[0]
    _identity_matrix = C.Constant(np.eye(_dim))
    return C.reduce_sum(m * _identity_matrix)

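# A quick sanity-check sketch for __cntk_trace__: the trace of twice the 3x3
# identity should evaluate to 6 (the variable names here are illustrative).
import numpy as np
import cntk as C

m = C.input_variable((3, 3))
tr = __cntk_trace__(m)
print(tr.eval({m: [2 * np.eye(3, dtype=np.float32)]}))  # expected value: 6.0
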
def __init__(self, p, eps=1e-7):
    if isinstance(p, (C.Variable, C.Function)):
        self.p = C.squeeze(p)
    else:
        self.p = C.Constant(np.squeeze(p))

    self.eps = C.Constant(eps, name='eps')
    self.c = self.p.shape[0]
    self.prob = self.p / (self.eps + C.reduce_sum(self.p))
    self.logits = C.log(self.prob)
    self.accum_prob = self.prob @ C.Constant((1 - np.tri(self.prob.shape[-1], k=-1)))

    p_log_p = self.logits * self.prob
    self._entropy = -C.reduce_sum(p_log_p)

    dist = C.input_variable(1, name='category index')
    # method 1
    self._log_prob = C.log(C.reduce_sum(self.prob * C.one_hot(dist, self.c)))

def f(self, input_dim):
    x = C.input_variable(input_dim, needs_gradient=True, name='input')
    z, sum_log_det_jacob = x, C.Constant(0, name='log_det_zero')

    for i in reversed(range(len(self.t))):
        z_ = self.mask[i] * z
        s = self.s[i](z_) * (1 - self.mask[i])
        t = self.t[i](z_) * (1 - self.mask[i])
        z = z_ + (1 - self.mask[i]) * (z - t) * C.exp(-s)
        sum_log_det_jacob -= C.reduce_sum(s)

    z = C.squeeze(z)
    return z, sum_log_det_jacob

def multivariate_kl_divergence(input_layer):
    _dim = input_layer.shape[0]
    out_value = C.unpack_batch(input_layer)

    _mu1 = C.transpose(C.reduce_mean(out_value, axis=0), [1, 0])
    _sigma1 = C.cov2(input_layer)

    _mu2 = C.zeros_like(_mu1)
    _sigma2 = C.Constant(np.eye(_dim))
    _sigma2_inv = _sigma2  # identity matrix

    return 0.5 * (C.log(C.det(_sigma2) / C.det(_sigma1)) - _dim
                  + C.trace(_sigma2_inv @ _sigma1)
                  + C.transpose((_mu2 - _mu1), [1, 0]) @ _sigma2_inv @ (_mu2 - _mu1))

def KLF_reverse(chunk):
    input_dim = chunk['input_dim']
    _ph = C.placeholder(input_dim, name='place_holder')

    inv_act_func = chunk['inv_act_func']
    _out = inv_act_func(_ph)

    if 'scale' in chunk:
        _out -= chunk['bias']
        _out /= chunk['scale']

    _w = chunk['W']
    _inv_w = C.Constant(np.linalg.inv(_w.value), name='inv_W')

    _out -= chunk['b']
    _out @= _inv_w
    return _out

def decode(history, q, c, start_logits, end_logits):
    q = encode(q)
    c = encode_c(C.splice(c, start_logits, end_logits, axis=0))

    r = history
    r = stab_in(r)

    q_last_h = C.sequence.last(q.outputs[0])
    q_last_c = C.sequence.last(q.outputs[1])
    c_last_h = C.sequence.last(c.outputs[0])
    c_last_c = C.sequence.last(c.outputs[1])
    initial_hstate = hstate_dense(C.splice(q_last_h, c_last_h))
    initial_cstate = cstate_dense(C.splice(q_last_c, c_last_c))

    rec_block = rec_blocks[0]  # LSTM(hidden_dim) :: (dh, dc, x) -> (h, c)

    @C.Function
    def find_embed(x):
        gx, ngx = C.slice(x, 0, 0, self.wg_dim), C.slice(x, 0, self.wg_dim, self.vocab_size)
        return embed(gx, ngx)

    @C.Function
    def lstm_with_attention(dh, dc, r, x):
        history_embed = find_embed(x)
        h_att = attention_model(c.outputs[0], dh)
        q_att = attention_model(q.outputs[0], dh)
        att = C.splice(h_att, q_att)
        x = C.splice(x, att)
        x, dc = rec_block(dh, dc, x).outputs
        # 0*r is a hack because cntk freaks out when r is not used.
        r = U_dense(att) + W_dense(history_embed) + V_dense(x) + 0*r
        # known issue: fails when W_dense is added first
        # r = W_dense(embed(gx, ngx)) + U_dense(att) + V_dense(x) + 0*r
        return x, dc, r

    _, _, r = C.layers.RecurrenceFrom(lstm_with_attention, return_full_state=True)(
        initial_hstate, initial_cstate, C.Constant(np.zeros(2*self.hidden_dim)), r).outputs

    r = maxout(r)
    r = stab_out(r)
    r = proj_out(r)
    # r = C.softmax(r)
    r = C.layers.Label('out_proj_out')(r)
    return r

def main():
    show_image = False

    if show_image:
        bs = 1
        ci = 3
        co = 3
        cg = co * (ci + 1)
        gd = 8
        gh = 64
        gw = 64
        h = 256
        w = 256
    else:
        bs = 1
        ci = 3
        co = 3
        cg = co * (ci + 1)
        gd = 8
        gh = 64
        gw = 64
        h = 1024
        w = 1024

    im = C.input_variable([bs, ci, h, w], needs_gradient=True, dynamic_axes=[])
    guide = C.input_variable([bs, h, w], needs_gradient=True, dynamic_axes=[])
    guide_no_grad = C.input_variable([bs, h, w], needs_gradient=False, dynamic_axes=[])
    grid = C.input_variable([bs, cg, gd, gh, gw], needs_gradient=True, dynamic_axes=[])

    # Create indices
    xx = np.arange(0, w).reshape(1, -1).repeat(h, 0).astype(np.float32)
    yy = np.arange(0, h).reshape(-1, 1).repeat(w, 1).astype(np.float32)
    xx = C.Constant(xx, xx.shape)
    yy = C.Constant(yy, yy.shape)

    gx = ((xx + 0.5) / w) * gw
    gy = ((yy + 0.5) / h) * gh
    gz = C.clip(guide, 0.0, 1.0) * gd
    gz_no_grad = C.clip(guide_no_grad, 0.0, 1.0) * gd

    fx = C.element_max(C.floor(gx - 0.5), 0.0)
    fy = C.element_max(C.floor(gy - 0.5), 0.0)
    fz = C.element_max(C.floor(gz - 0.5), 0.0)
    fz_no_grad = C.element_max(C.floor(gz_no_grad - 0.5), 0.0)

    wx = gx - 0.5 - fx
    wy = gy - 0.5 - fy
    wx = C.expand_dims(C.expand_dims(wx, -1 - len(wx.shape)), -1 - len(wx.shape))
    wy = C.expand_dims(C.expand_dims(wy, -1 - len(wy.shape)), -1 - len(wy.shape))
    wz = C.abs(gz - 0.5 - fz)
    wz = C.expand_dims(wz, 0)

    fx = C.expand_dims(C.expand_dims(fx, -1 - len(fx.shape)), -1 - len(fx.shape))
    fy = C.expand_dims(C.expand_dims(fy, -1 - len(fy.shape)), -1 - len(fy.shape))
    cx = C.element_min(fx + 1, gw - 1)
    cy = C.element_min(fy + 1, gh - 1)
    cz = C.element_min(fz_no_grad + 1, gd - 1)

    batch_idx = np.arange(bs).reshape(bs, 1, 1, 1).astype(np.float32)
    batch_idx = C.Constant(batch_idx, batch_idx.shape)

    out = []
    flat_grid = C.reshape(grid, [-1])
    for c_ in range(co):
        c_idx = np.arange((ci + 1) * c_, (ci + 1) * (c_ + 1)).reshape(1, ci + 1, 1, 1).astype(np.float32)
        c_idx = C.Constant(c_idx, c_idx.shape)

        def flatten_and_gather(x, y, z):
            linear_idx = x + gw * y + gw * gh * z + c_idx * gw * gh * gd + batch_idx * gw * gh * gd * cg
            flat_linear_idx = C.reshape(linear_idx, [-1])
            return C.reshape(C.gather(flat_grid, flat_linear_idx), linear_idx.shape)

        gather_fff = flatten_and_gather(fx, fy, fz_no_grad)
        gather_ffc = flatten_and_gather(fx, fy, cz)
        gather_fcf = flatten_and_gather(fx, cy, fz_no_grad)
        gather_fcc = flatten_and_gather(fx, cy, cz)
        gather_cff = flatten_and_gather(cx, fy, fz_no_grad)
        gather_cfc = flatten_and_gather(cx, fy, cz)
        gather_ccf = flatten_and_gather(cx, cy, fz_no_grad)
        gather_ccc = flatten_and_gather(cx, cy, cz)

        a = gather_fff*(1-wx)*(1-wy)*(1-wz) + \
            gather_ffc*(1-wx)*(1-wy)*(  wz) + \
            gather_fcf*(1-wx)*(  wy)*(1-wz) + \
            gather_fcc*(1-wx)*(  wy)*(  wz) + \
            gather_cff*(  wx)*(1-wy)*(1-wz) + \
            gather_cfc*(  wx)*(1-wy)*(  wz) + \
            gather_ccf*(  wx)*(  wy)*(1-wz) + \
            gather_ccc*(  wx)*(  wy)*(  wz)

        o = C.reduce_sum(a[:, :-1, ...] * im, 1) + a[:, -1, ...]
        print(o.shape)
        out.append(C.expand_dims(o, 0))

    out = C.splice(*out, axis=1)
    loss = C.reduce_l2(out)

    grid_val = np.random.rand(bs, cg, gd, gh, gw).astype(np.float32)
    if show_image:
        guide_val = skio.imread("/data/rgb.png").mean(2)[:h, :w].astype(np.float32)
        guide_val = np.expand_dims(guide_val / 255.0, 0)
        im_val = np.tile(np.expand_dims(guide_val, 1), [1, 3, 1, 1])
        out_val = out.eval({
            im: im_val,
            guide: guide_val,
            guide_no_grad: guide_val,
            grid: grid_val
        })
        out_val = np.clip(np.transpose(np.squeeze(out_val), [1, 2, 0]), 0, 1)
        skio.imsave("/output/imout.png", out_val)
    else:
        im_val = np.random.randn(bs, ci, h, w)
        guide_val = np.random.rand(bs, h, w).astype(np.float32)

        # burning iteration
        for it in range(5):
            print('burning (', it, ')')
            g = loss.grad({
                im: im_val,
                guide: guide_val,
                guide_no_grad: guide_val,
                grid: grid_val
            })

        # actual iterations
        start = time.time()
        for it in range(50):
            print('profiling (', it, ')')
            g = loss.grad({
                im: im_val,
                guide: guide_val,
                guide_no_grad: guide_val,
                grid: grid_val
            })
        end = time.time()

        runtime = (end - start) * 1000.0 / 50.0
        print('Runtime:', runtime)

# input_variable sample
import cntk
myFeatures = 7
features = cntk.input_variable(myFeatures)
print(features)
alternativeFeatures = cntk.input_variable(7)
print(alternativeFeatures)

# input_variable with different shapes
import cntk
data = cntk.input_variable(shape=[3, 5])

# Using Parameter
import cntk
data = cntk.parameter(shape=(3, 5), init=2)
data.value

# Using Constant
import cntk
data = cntk.Constant(6, shape=(3, 4))
data.value

# Using Record
import cntk.variables as var
record = var.Record(x=23, y=32, z=55)
# printing the record values
record.x
record.y
record.z

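# A Constant can also be built from a numpy array (sketch); the value, shape
# and dtype are then taken from the array itself.
import numpy as np
import cntk
arr = np.array([[1, 2], [3, 4]], dtype=np.float32)
const = cntk.Constant(value=arr)
print(const.value)   # [[1. 2.] [3. 4.]]
print(const.shape)   # (2, 2)
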
def BilateralSlice(sz, i_chans, o_chans, grid_sz=64, sigma_r=8):
    gsize = [(i_chans+1)*o_chans, sigma_r, grid_sz, grid_sz]
    grid = C.Parameter(gsize, name="grid", init=np.random.uniform(size=gsize))
    guide_scale = C.Parameter((1, ), name="guide_scale", init=np.ones((1, )))
    grid_scale = C.Parameter((1, ), name="grid_scale", init=np.ones((1, )))
    im_scale = C.Parameter((1, ), name="im_scale", init=np.ones((1, )))

    yy, xx = np.meshgrid(np.arange(0, sz), np.arange(0, sz))
    xx = np.expand_dims(xx, 0)
    yy = np.expand_dims(yy, 0)
    cc = np.arange(0, i_chans+1)
    cc = np.expand_dims(cc, 1)
    cc = np.expand_dims(cc, 2)
    xx = C.Constant(xx, xx.shape)
    yy = C.Constant(yy, yy.shape)
    cc = C.Constant(cc, cc.shape)

    @C.functions.BlockFunction("BilateralSlice", "bilateral_slice")
    def bilateral_slice(im, guide, guide_no_grad):
        # Flatten data for gather op
        flat_grid = grid_scale*C.reshape(grid, [grid_sz*grid_sz*sigma_r*o_chans*(i_chans+1)])
        # flat_grid_u = C.unpack_batch(flat_grid)

        # Make sure we do sth that requires the gradient w.r.t guide
        scaled_guide = guide_scale*guide
        gx_d, gy_d, gz_d, fx_d, fy_d, fz_d, _, _, _ = grid_coord(
            scaled_guide, xx, yy, sz, grid_sz, sigma_r)
        wx = C.abs(gx_d - 0.5 - fx_d)
        wy = C.abs(gy_d - 0.5 - fy_d)
        wz = C.abs(gz_d - 0.5 - fz_d)

        # Enclosing cell
        gx, gy, gz, fx, fy, fz, cx, cy, cz = grid_coord(
            guide_no_grad, xx, yy, sz, grid_sz, sigma_r)

        out_chans = []
        for chan in range(o_chans):
            output_components = []
            for ix, x in enumerate([fx, cx]):
                wx_ = (1-wx) if ix == 0 else wx
                for iy, y in enumerate([fy, cy]):
                    wy_ = (1-wy) if iy == 0 else wy
                    for iz, z in enumerate([fz, cz]):
                        wz_ = (1-wz) if iz == 0 else wz
                        linear_idx = x + grid_sz*(y + grid_sz*(z + sigma_r*(cc + chan*(i_chans+1))))
                        flat_linear_idx = C.reshape(linear_idx, [(i_chans+1)*sz*sz])

                        # Slice
                        interp = C.gather(flat_grid, flat_linear_idx)
                        interp_fsz = C.reshape(interp, [i_chans+1, sz, sz])*wx_*wy_*wz_
                        output_components.append(interp_fsz)

            out_coeffs = sum(output_components)
            out_chan = C.reduce_sum(out_coeffs[:i_chans]*(im_scale*im) + out_coeffs[-1], 0)
            out_chans.append(out_chan)
        out = C.splice(*out_chans, axis=0)

        return out

    return bilateral_slice

def test_constant_value(value):
    c = C.Constant(value=value)
    assert np.allclose(c.value, value)

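# The test above takes `value` as a parameter; a minimal pytest parametrization
# sketch follows (the concrete values are illustrative, not from the original suite).
import numpy as np
import pytest
import cntk as C

@pytest.mark.parametrize("value", [
    7,
    np.array([1.0, 2.0, 3.0], dtype=np.float32),
    np.arange(6, dtype=np.float32).reshape(2, 3),
])
def test_constant_value_param(value):
    c = C.Constant(value=value)
    assert np.allclose(c.value, value)
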
import cntk as C
import numpy as np

from io_funcs.binary_io import BinaryIOCollection
from model_lf0_weight import SRU_MULTI_SPEAKER

gpu_descriptor = C.gpu(3)
C.try_set_default_device(gpu_descriptor)

proj = SRU_MULTI_SPEAKER(87, 187, 0.001, 0.5)
trainer = proj.trainer
trainer.restore_from_checkpoint('net/16k/trainer_' + str(41))
output = trainer.model

index = C.Constant(value=np.asarray([0, 1, 0]).astype(np.float32))
input = C.sequence.input_variable(shape=87)
out = output(input, index)
out.save('extracted_model/16k/model_emo')

    def parameters(self):
        return self.forward.parameters


if __name__ == '__main__':
    nets = lambda: C.layers.Sequential([
        C.layers.Dense(256, activation=C.leaky_relu),
        C.layers.Dense(256, activation=C.leaky_relu),
        C.layers.Dense(2, activation=C.tanh)
    ])(C.placeholder(2))
    nett = lambda: C.layers.Sequential([
        C.layers.Dense(256, activation=C.leaky_relu),
        C.layers.Dense(256, activation=C.leaky_relu),
        C.layers.Dense(2)
    ])(C.placeholder(2))

    masks = C.Constant(np.array([[0, 1], [1, 0]] * 3).astype(np.float32), name='mask')
    prior = MultivariateNormalDiag(loc=[0., 0.], scale_diag=[1., 1.])
    flow = RealNVP(nets, nett, masks, prior)

    loss = -C.reduce_mean(flow.log_prob)

    learner = C.adam(loss.parameters, C.learning_parameter_schedule(1e-1), C.momentum_schedule(0.9))
    trainer = C.Trainer(flow.forward, (loss, None), learner)

    for t in range(5001):
        noisy_moons = datasets.make_moons(n_samples=1000, noise=.05)[0].astype(np.float32)
        trainer.train_minibatch({loss.arguments[0]: noisy_moons})

        if t % 500 == 0:

def main():
    bs = 4
    c = 64
    h = 512
    w = 512

    im = C.input_variable([bs, c, h, w], needs_gradient=True, dynamic_axes=[])
    warp = C.input_variable([bs, 2, h, w], needs_gradient=True, dynamic_axes=[])
    warp_ng = C.input_variable([bs, 2, h, w], needs_gradient=False, dynamic_axes=[])

    # Create indices
    dx = 0.5 * (warp[:, 0, :, :] + 1.0)
    dy = 0.5 * (warp[:, 1, :, :] + 1.0)
    new_x = C.clip(dx * w, 0, w)
    new_y = C.clip(dy * h, 0, h)
    fx = C.clip(C.floor(new_x), 0, w - 2)
    fy = C.clip(C.floor(new_y), 0, h - 2)
    wx = new_x - fx
    wy = new_y - fy

    dx_ng = 0.5 * (warp_ng[:, 0, :, :] + 1.0)
    dy_ng = 0.5 * (warp_ng[:, 1, :, :] + 1.0)
    new_x_ng = C.clip(dx_ng * w, 0, w)
    new_y_ng = C.clip(dy_ng * h, 0, h)
    fx_ng = C.clip(C.floor(new_x_ng), 0, w - 2)
    fy_ng = C.clip(C.floor(new_y_ng), 0, h - 2)

    chan_idx = np.arange(c).reshape(1, c, 1, 1)
    chan_idx = C.Constant(chan_idx, chan_idx.shape)
    batch_idx = np.arange(bs).reshape(bs, 1, 1, 1)
    batch_idx = C.Constant(batch_idx, batch_idx.shape)

    flat_im = C.reshape(im, [-1])

    def flatten_and_gather(x, y):
        linear_idx = x + w * y + w * h * chan_idx + w * h * c * batch_idx
        flat_linear_idx = C.reshape(linear_idx, [-1])
        return C.reshape(C.gather(flat_im, flat_linear_idx), linear_idx.shape)

    gather_ff = flatten_and_gather(fx_ng, fy_ng)
    gather_fc = flatten_and_gather(fx_ng, fy_ng + 1)
    gather_cf = flatten_and_gather(fx_ng + 1, fy_ng)
    gather_cc = flatten_and_gather(fx_ng + 1, fy_ng + 1)

    out = gather_ff*(1-wx)*(1-wy) + \
          gather_fc*(1-wx)*(  wy) + \
          gather_cf*(  wx)*(1-wy) + \
          gather_cc*(  wx)*(  wy)
    loss = C.reduce_l2(out)

    im_val = np.random.randn(bs, c, h, w).astype(np.float32)
    warp_val = np.random.rand(bs, 2, h, w).astype(np.float32)

    # burning iteration
    for it in range(5):
        print('burning (', it, ')')
        g = loss.grad({im: im_val, warp: warp_val, warp_ng: warp_val})

    # actual iterations
    start = time.time()
    for it in range(50):
        print('profiling (', it, ')')
        g = loss.grad({im: im_val, warp: warp_val, warp_ng: warp_val})
    end = time.time()

    runtime = (end - start) * 1000.0 / 50.0
    print('Runtime:', runtime)

    if not cntk.device.try_set_default_device(dev):
        print("Error: error setting device")
        sys.exit(1)
else:
    dev = None

N = float(saxpy.N)
YVAL = float(saxpy.YVAL)
XVAL = float(saxpy.XVAL)
AVAL = float(saxpy.AVAL)

print("N: {}".format(N))

a = cntk.Constant(value=AVAL, shape=[N], dtype=np.float32, device=dev, name="a")
y = cntk.Parameter(shape=[N], init=YVAL, dtype=np.float32, device=dev, name="y")
x = cntk.Parameter(shape=[N], init=XVAL, dtype=np.float32, device=dev, name="x")

t0 = time.time()
cntk.assign(y, y + a * x).eval()

# return C.atan(x)*5

# c_block = KLF_forward(c_dim, batch_norm=True)
c_block = []
for i in range(6):
    c_block.append(flow_forward(c_dim, batch_norm=False))

# single = np.array([[1, 2]])
# # multi = np.random.uniform(size=(100, 2))
# multi = np.random.normal(size=(100, 2))
# value = multi.astype(np.float32)

q = c_input
log_det_J = C.Constant(0)
bn = []
bn_update = []
for block in c_block:
    log_det_J += block[1](q)
    if 'muB' in block[-1]:  # batch norm
        bn.append(block[-1]['muB'](q))
        bn.append(block[-1]['varB'](q))
        bn_update.append(block[-1]['mu'])
        bn_update.append(block[-1]['var'])
    q = block[0](q)

base_dist = MultivariateNormalDiag(loc=[0., 0.], scale_diag=[1., 1.])

# log_q_k = C.log(base_dist.pdf(z_0)) - sum_log_det_jacob

def main():
    print("version", C.__version__)

    bs = 1
    n_chans = 1

    sigma_s = 16
    sigma_r = 12

    # 4x4x1024x1024
    # 4x12x64x64

    sz = 256
    # sz = 1024
    small_sz = sz // sigma_s

    yy, xx = np.meshgrid(np.arange(0, sz), np.arange(0, sz))
    cc, bb = np.meshgrid(np.arange(0, n_chans), np.arange(0, bs))

    xx = np.expand_dims(xx, 0)
    xx = np.expand_dims(xx, 0)
    yy = np.expand_dims(yy, 0)
    yy = np.expand_dims(yy, 0)
    bb = np.expand_dims(bb, 2)
    bb = np.expand_dims(bb, 3)
    cc = np.expand_dims(cc, 2)
    cc = np.expand_dims(cc, 3)

    # Compute graph
    grid = C.Parameter([bs, n_chans, sigma_r, small_sz, small_sz], )
    # grid = C.input_variable(
    #     [bs, n_chans, sigma_r, small_sz, small_sz],
    #     dynamic_axes=[], needs_gradient=True)
    guide = C.input_variable([bs, sz, sz], dynamic_axes=[], needs_gradient=True)
    guide_non_diff = C.input_variable([bs, sz, sz], dynamic_axes=[])

    # Coordinates
    xx = C.Constant(xx, xx.shape)
    yy = C.Constant(yy, yy.shape)
    cc = C.Constant(cc, cc.shape)
    bb = C.Constant(bb, bb.shape)

    gx_d, gy_d, gz_d, fx_d, fy_d, fz_d, _, _, _ = grid_coord(
        guide, xx, yy, sz, small_sz, sigma_r, bs)

    # Trilerp weights
    wx = (gx_d - 0.5 - fx_d)
    wy = (gy_d - 0.5 - fy_d)
    wz = C.abs(gz_d - 0.5 - fz_d)

    # Enclosing cell
    gx, gy, gz, fx, fy, fz, cx, cy, cz = grid_coord(
        guide_non_diff, xx, yy, sz, small_sz, sigma_r, bs)

    output_components = []
    for ix, x in enumerate([fx, cx]):
        wx_ = (1 - wx) if ix == 0 else wx
        for iy, y in enumerate([fy, cy]):
            wy_ = (1 - wy) if iy == 0 else wy
            for iz, z in enumerate([fz, cz]):
                wz_ = (1 - wz) if iz == 0 else wz

                linear_idx = x + small_sz * (y + small_sz * (z + sigma_r * (cc + n_chans * bb)))

                # Flatten data for gather op
                flat_grid = C.reshape(grid, [bs * small_sz * small_sz * sigma_r * n_chans])
                flat_linear_idx = C.reshape(linear_idx, [bs * n_chans * sz * sz])

                # Slice
                interp = C.gather(flat_grid, flat_linear_idx)
                interp_fsz = C.reshape(interp, [bs, n_chans, sz, sz])
                output_components.append(interp_fsz * wz_ * wx_ * wy_)

    out = sum(output_components)
    loss = C.squared_error(out, guide)

    # svg = C.logging.graph.plot(out, "/output/graph.svg")

    grid_data = np.random.uniform(size=(bs, n_chans, sigma_r, small_sz, small_sz)).astype(np.float32)

    # guide_data = np.random.uniform(
    #     size=(bs, sz, sz)).astype(np.float32)
    guide_data = skio.imread("/data/rgb.png").mean(2)[:sz, :sz].astype(np.float32)
    guide_data = np.expand_dims(guide_data, 0) / 255.0

    inputs = {guide: guide_data, guide_non_diff: guide_data}

def flow_forward(input_dim: int, act_func_pair: tuple = (None, None), batch_norm: bool = False):
    chunk = {}
    log_det_J = 0
    chunk['input_dim'] = input_dim
    _ph = C.placeholder(input_dim, name='place_holder')
    _out = _ph

    if batch_norm:
        # _bn = C.layers.BatchNormalization(name='batch_norm')(_ph)
        # chunk['scale'] = _bn.parameters[0]
        # chunk['bias'] = _bn.parameters[1]
        chunk['mu'] = C.Constant(np.zeros(shape=input_dim))
        chunk['var'] = C.Constant(np.ones(shape=input_dim))

        _eps = C.Constant(1e-7)
        _mu = C.reduce_mean(_ph, axis=C.Axis.default_batch_axis())
        _var = C.reduce_mean(C.square(_ph - _mu), axis=C.Axis.default_batch_axis())

        chunk['muB'] = _mu
        chunk['varB'] = _var

        # _bn = (_ph-chunk['mu'])/C.sqrt(chunk['var']+_eps)
        _bn = C.sqrt(chunk['var'] + _eps) * _ph + chunk['mu']
        _ph = _bn

        log_det_J += -0.5 * C.reduce_sum(C.log((_var + _eps)))
        # log_det_J += C.reduce_sum(C.log())

    chunk['W_rot_mat'] = _W = C.parameter((input_dim, input_dim))
    _W.value = random_rotation_matrix = special_ortho_group.rvs(input_dim)
    # _W.value = np.roll(np.eye(input_dim), input_dim//2, axis=0)
    _out = _ph @ _W
    log_det_J += C.log(C.abs(C.det(_W)))  # or
    # log_det_J += C.slogdet(_W)[1]

    _half_dim = input_dim // 2
    _x1 = _out[:_half_dim]
    _x2 = _out[_half_dim:]

    _log_s_func, _t_func = act_func_pair
    if _log_s_func is None:  # basic network
        _log_s_func = C.layers.Sequential([
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(_half_dim, C.tanh),
        ])  # (C.placeholder(input_dim, name='place_holder'))
    if _t_func is None:  # basic network
        _t_func = C.layers.Sequential([
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(_half_dim),
        ])  # (C.placeholder(input_dim, name='place_holder'))
    chunk['log_s_func'] = _log_s_func
    chunk['t_func'] = _t_func

    _log_s, _t = _log_s_func(_x2), _t_func(_x2)

    _s = C.exp(_log_s)
    _y1 = _s * _x1 + _t
    _y2 = _x2
    _Y = C.splice(_y1, _y2)
    chunk['output'] = _Y

    log_det_J += C.reduce_sum(_log_s)

    return _Y, log_det_J, chunk