def fmeasure(output, target, beta=1):
    """
    Build the f-measure loss node between ``output`` and ``target``.

    When ``beta`` is one the measure is called the f1-score or dice
    similarity coefficient; the f1-score is monotonic in the jaccard distance.

        f-measure = (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall)

    The value returned is ``1 - f-measure``.

    This loss function is frequently used in semantic segmentation of images.
    Works with imbalanced classes; for balanced classes you should prefer
    cross_entropy instead. This operation works with both binary and
    multiclass classification.

    Args:
        output: the output values from the network
        target: it is usually a one-hot vector where the hot bit corresponds
            to the label index
        beta: greater than one weights recall higher than precision, less
            than one for the opposite. Commonly chosen values are 0.5, 1 or 2.

    Returns:
        :class:`~cntk.ops.functions.Function`
    """
    assert len(target.shape) == len(output.shape)

    # Rank-3 inputs are reduced over axes (1, 2), which assumes that the first
    # axis is the class axis; every other rank passes axis=None.
    reduce_axes = (1, 2) if len(output.shape) == 3 else None

    overlap = C.reduce_sum(output * target, axis=reduce_axes)
    precision = overlap / C.reduce_sum(output, axis=reduce_axes)
    recall = overlap / C.reduce_sum(target, axis=reduce_axes)

    f_score = (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall)
    return 1 - f_score
def cross_entropy_with_full_softmax(
        output,         # Node providing the output of the lstm layers
        target_vector,  # Node providing the expected labels
        sv_dim,
        vocab_dim):
    """
    Build the training loss and the error node for the network ``output``.

    ``output.outputs[0]`` is scored against ``target_vector``,
    ``output.outputs[3]`` is the sequence the penalty terms are computed on
    and ``output.outputs[4]`` is compared against its first element.
    ``sv_dim`` and ``vocab_dim`` are accepted but not used here.

    Returns:
        tuple ``(ce, error)``: the reduced loss (cross entropy plus penalties)
        and the node ``less(target score, max score)``.
    """
    net_out = output.outputs[0]
    sv_state = output.outputs[3]

    target_score = C.times_transpose(net_out, target_vector)

    # cross entropy loss with softmax function
    step_ce = - C.log(target_score)

    # the error
    best_score = C.reduce_max(net_out)
    error = C.less(target_score, best_score)

    ce = sequence.reduce_sum(step_ce)

    # discourages the network from turning more than one gate off in a
    # single time step.
    step_change = C.abs(C.sequence.slice(sv_state, 1, 0) - C.sequence.slice(sv_state, 0, -1))
    penalty = sequence.reduce_sum(0.0001 * C.pow(100.0, step_change))

    # penalise generated utterances that failed to render all the required
    # slots
    penalty += C.abs(C.sequence.last(sv_state))
    penalty += C.abs(C.sequence.first(sv_state) - output.outputs[4])
    penalty = C.reduce_sum(penalty)

    ce = C.reduce_sum(ce)
    ce += penalty
    return ce, error
def create_detection_losses(cls_score, label_targets, bbox_pred, rois, bbox_targets, bbox_inside_weights, cfg):
    """
    Build the combined detection loss: classification plus box regression.

    Both terms are summed and scaled by ``1.0 / cfg.NUM_ROI_PROPOSALS``, each
    wrapped in its own block, and then added together. ``rois`` is accepted
    but not used by this version.

    Returns:
        the ``plus`` node named ``"detection_losses"``.
    """
    # --- classification loss, built on placeholders and wrapped as a block
    score_ph = placeholder()
    label_ph = placeholder()
    raw_cls_loss = cross_entropy_with_softmax(score_ph, label_ph, axis=1)
    cls_scale = 1.0 / cfg.NUM_ROI_PROPOSALS
    scaled_cls_loss = reduce_sum(raw_cls_loss) * cls_scale
    cls_bindings = [(score_ph, cls_score), (label_ph, label_targets)]
    cls_block = cntk.as_block(scaled_cls_loss, cls_bindings,
                              'CrossEntropyWithSoftmax', 'norm_cls_loss')

    # --- regression loss, same pattern
    pred_ph = placeholder()
    target_ph = placeholder()
    weight_ph = placeholder()
    raw_bbox_loss = SmoothL1Loss(cfg.SIGMA_DET_L1, pred_ph, target_ph, weight_ph, 1.0)
    bbox_scale = 1.0 / cfg.NUM_ROI_PROPOSALS
    scaled_bbox_loss = reduce_sum(raw_bbox_loss) * bbox_scale
    bbox_bindings = [(pred_ph, bbox_pred),
                     (target_ph, bbox_targets),
                     (weight_ph, bbox_inside_weights)]
    bbox_block = cntk.as_block(scaled_bbox_loss, bbox_bindings,
                               'SmoothL1Loss', 'norm_bbox_loss')

    return plus(cls_block, bbox_block, name="detection_losses")
def create_detection_losses(cls_score, label_targets, rois, bbox_pred, bbox_targets, bbox_inside_weights):
    """
    Build the combined detection loss: classification plus box regression.

    The classification term is the summed cross entropy divided by a count
    derived from ``rois``; the regression term is the summed smooth-L1 loss
    divided by ``cfg["TRAIN"].BATCH_SIZE``. ``cfg`` is read from the enclosing
    module scope.

    Returns:
        the ``plus`` node named ``"detection_losses"``.
    """
    # --- classification loss
    raw_cls_loss = cross_entropy_with_softmax(cls_score, label_targets, axis=1)

    cls_loss_ph = placeholder()
    rois_ph = placeholder()
    # The terms that are accounted for in the cls loss are those that
    # correspond to an actual roi proposal --> do not count no-op (all-zero)
    # rois.
    # NOTE(review): an all-zero roi sums to 0 and still satisfies `>= 0.0`, so
    # it looks like it is counted here as well - confirm whether a strict
    # comparison was intended.
    roi_sums = reduce_sum(rois_ph, axis=1)
    num_cls_terms = reduce_sum(cntk.greater_equal(roi_sums, 0.0))
    cls_scale = 1.0 / num_cls_terms
    scaled_cls_loss = reduce_sum(cls_loss_ph) * cls_scale
    cls_bindings = [(cls_loss_ph, raw_cls_loss), (rois_ph, rois)]
    cls_block = cntk.as_block(scaled_cls_loss, cls_bindings,
                              'Normalize', 'norm_cls_loss')

    # --- regression loss
    pred_ph = placeholder()
    target_ph = placeholder()
    weight_ph = placeholder()
    raw_bbox_loss = SmoothL1Loss(cfg["CNTK"].SIGMA_DET_L1, pred_ph, target_ph, weight_ph, 1.0)
    # The bbox loss is normalized by the batch size
    bbox_scale = 1.0 / cfg["TRAIN"].BATCH_SIZE
    scaled_bbox_loss = reduce_sum(raw_bbox_loss) * bbox_scale
    bbox_bindings = [(pred_ph, bbox_pred),
                     (target_ph, bbox_targets),
                     (weight_ph, bbox_inside_weights)]
    bbox_block = cntk.as_block(scaled_bbox_loss, bbox_bindings,
                               'SmoothL1Loss', 'norm_bbox_loss')

    return plus(cls_block, bbox_block, name="detection_losses")
def sample_gaussian_mdn(prediction_tensor, nmix: int, ndim: int):
    """
    Constructs sampling nodes from mixture density network outputs.

    The coefficients from ``gaussian_mdn_coeff`` are split into alpha, mu and
    sigma. ``random.sample`` is applied to alpha and its result is used to
    weight mu and sigma; the returned node is
    ``normal_like(sigma) * sigma + mu`` on those weighted values.

    Example:
        ndim, nmix = 1, 3
        a = C.input_variable(ndim)
        prediction = Dense((ndim + 2) * nmix)(a)
        sampled = sample_gaussian_mdn(prediction, nmix, ndim)

        results = sampled.eval({a: x})  # different results every time you eval

    Arguments:
        prediction_tensor: input tensor
        nmix (int): number of mixture
        ndim (int): number of dimension of gaussian

    Returns:
        :class:`~cntk.ops.functions.Function`
    """
    alpha, mu, sigma = gaussian_mdn_coeff(prediction_tensor, nmix=nmix, ndim=ndim)

    picked = random.sample(alpha)
    picked_mu = C.reduce_sum(mu * C.expand_dims(picked, axis=-1), axis=0)
    picked_sigma = C.reduce_sum(sigma * picked, axis=0)

    noise = C.random.normal_like(picked_sigma)
    return noise * picked_sigma + picked_mu
def dice_coefficient(x, y):
    """
    Average of the per-channel dice coefficient of ``x`` and ``y``.

    A global dice coefficient doesn't work, as the class with the larger
    region dominates the metric, so sums are taken over axes (1, 2) and the
    resulting ratios are averaged.
    https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
    """
    reduce_axes = (1, 2)
    overlap = C.reduce_sum(x * y, axis=reduce_axes)
    # The constant 1.0 is added to the denominator, presumably to keep it
    # non-zero for empty channels.
    denominator = C.reduce_sum(x, axis=reduce_axes) + C.reduce_sum(y, axis=reduce_axes) + 1.0
    per_channel = 2.0 * overlap / denominator
    return C.reduce_mean(per_channel)
def forward_network(cls, input_dim: int):
    """
    Build two stacked affine coupling steps on a placeholder input.

    The placeholder is split at ``input_dim // 2``. In the first step the
    second half is transformed using networks fed with the first half; in the
    second step the first half is transformed using networks fed with the
    updated second half. The four sub-networks come from
    ``cls.basic_network`` and are stored in the returned ``chunk`` dict under
    'log_s_func', 't_func', 'log_s_func2' and 't_func2', next to 'input_dim'.

    Returns:
        tuple ``(out, log_det_J, chunk)`` where ``log_det_J`` accumulates the
        ``reduce_sum`` of both log-scale outputs.
    """
    chunk = {}
    log_det_J = 0
    chunk['input_dim'] = input_dim

    source = C.placeholder(input_dim, name='place_holder')
    half = input_dim // 2

    # ---- step 1: second half is scaled and shifted, conditioned on the first
    head, tail = source[:half], source[half:]
    log_s_net = cls.basic_network(half, 'log_s_func')
    chunk['log_s_func'] = log_s_net
    t_net = cls.basic_network(half, 't_func')
    chunk['t_func'] = t_net
    log_scale = log_s_net(head)
    shift = t_net(head)
    tail = shift + tail * C.exp(log_scale)
    log_det_J += C.reduce_sum(log_scale)
    joined = C.splice(head, tail)

    # ---- step 2: first half is scaled and shifted, conditioned on the second
    head, tail = joined[:half], joined[half:]
    log_s_net2 = cls.basic_network(half, 'log_s_func2')
    chunk['log_s_func2'] = log_s_net2
    t_net2 = cls.basic_network(half, 't_func2')
    chunk['t_func2'] = t_net2
    log_scale2 = log_s_net2(tail)
    shift2 = t_net2(tail)
    head = head * C.exp(log_scale2) + shift2
    log_det_J += C.reduce_sum(log_scale2)

    result = C.splice(head, tail)
    return result, log_det_J, chunk
def fmeasure(output, target, beta=1):
    """
    Build the f-measure loss node between ``output`` and ``target``.

    When ``beta`` is one the measure is called the f1-score or dice
    similarity coefficient; the f1-score is monotonic in the jaccard distance.

        f-measure = (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall)

    The value returned is ``1 - f-measure``.

    This loss function is frequently used in semantic segmentation of images.
    Works with imbalanced classes; for balanced classes you should prefer
    cross_entropy instead. This operation works with both binary and
    multiclass classification.

    Args:
        output: the output values from the network
        target: it is usually a one-hot vector where the hot bit corresponds
            to the label index
        beta: greater than one weights recall higher than precision, less
            than one for the opposite. Commonly chosen values are 0.5, 1 or 2.

    Returns:
        :class:`~cntk.ops.functions.Function`
    """
    assert len(target.shape) == len(output.shape)

    # Rank-3 inputs are reduced over axes (1, 2), which assumes that the first
    # axis is the class axis; every other rank passes axis=None.
    reduce_axes = (1, 2) if len(output.shape) == 3 else None

    overlap = C.reduce_sum(output * target, axis=reduce_axes)
    precision = overlap / C.reduce_sum(output, axis=reduce_axes)
    recall = overlap / C.reduce_sum(target, axis=reduce_axes)

    f_score = (1 + beta**2) * precision * recall / (beta**2 * precision + recall)
    return 1 - f_score
def dice_coefficient(x, y):
    """
    Average of the per-channel dice coefficient of ``x`` and ``y``.

    Sums are taken over axes (1, 2) and the resulting ratios are averaged
    with ``reduce_mean``.
    """
    reduce_axes = (1, 2)
    overlap = C.reduce_sum(x * y, axis=reduce_axes)
    # The constant 1.0 is added to the denominator, presumably to keep it
    # non-zero for empty channels.
    denominator = C.reduce_sum(x, axis=reduce_axes) + C.reduce_sum(y, axis=reduce_axes) + 1.0
    per_channel = 2.0 * overlap / denominator
    return C.reduce_mean(per_channel)
def model(seq_image, decoded):
    """
    Read an ``n`` x ``n`` patch from ``seq_image`` through gaussian filters.

    The filter parameters (centre, variance, stride, intensity) are obtained
    from ``attention_parameters(dense(decoded))``. ``dense``,
    ``attention_parameters``, ``n`` and ``image_height`` come from the
    enclosing scope. Shape comments use the notation
    ``[dynamic axes] [static axes]``.

    Returns:
        the attended patch after a final ``swapaxes``.
    """
    params = dense(decoded)
    g_x, g_y, sigma2, delta, gamma = attention_parameters(params)

    i = C.Constant(np.arange(n) + 1, )  # col of patch
    j = C.Constant(np.arange(n) + 1, )  # row of patch
    # centre of every filter: grid centre plus an offset scaled by the stride
    mu_x = g_x + (i - n / 2 - 0.5) * delta
    mu_y = g_y + (j - n / 2 - 0.5) * delta
    mu_x = C.expand_dims(mu_x, axis=-1)
    mu_y = C.expand_dims(mu_y, axis=-1)
    # mu_x: [#, *] [n, 1]
    # mu_y: [#, *] [n, 1]

    image = C.sequence.unpack(seq_image, padding_value=0, no_mask_output=True)
    # image: [#] [*image_width, filters, image_height]

    width_pos = Cx.sequence.position(seq_image)
    # width_pos: [#, *] [1]

    # padded positions get a very large index (999_999)
    width_pos_unpacked = C.sequence.unpack(width_pos, padding_value=999_999, no_mask_output=True)
    # width_pos_unpacked: [#] [*image_width, 1]

    a = C.sequence.broadcast_as(C.swapaxes(width_pos_unpacked), mu_x)
    # a: [#, *] [1, *image_width]
    # x pos index of image (width)

    b = C.Constant(np.arange(image_height).reshape((1, -1)))
    # b: [] [1, image_height]
    # y pos index of image (height)

    # calculate which portion of the image is attended by the gaussian filter
    f_xi = C.exp(-0.5 * C.square(a - mu_x) / sigma2)
    f_yj = C.exp(-0.5 * C.square(b - mu_y) / sigma2)
    # f_xi: [#, *] [n, *image_width]
    # f_yj: [#, *] [n, image_height]

    # normalise every filter by its sum over axis 1
    z_x = C.reduce_sum(f_xi, axis=1)
    z_y = C.reduce_sum(f_yj, axis=1)
    # z_x: [#, *] [n]
    # z_y: [#, *] [n]

    f_xi = f_xi / z_x
    f_yj = f_yj / z_y
    # f_xi: [#, *] [n, *image_width]
    # f_yj: [#, *] [n, image_height]

    # combine filters from x and y
    image_broadcasted = C.sequence.broadcast_as(image, f_yj)
    attended = gamma * C.times(f_xi, C.times_transpose(image_broadcasted, f_yj), output_rank=2)
    # attended: [#, *] [n, filters, n]
    attended = C.swapaxes(attended)
    # attended: [#, *] [filters, n (x) , n (y)]
    return attended
def criteria(label, output, block_size, c_classes, weights):
    '''
    Define the loss function and metric.

    The loss is the mean of the ``weights``-weighted negative log-softmax of
    ``output`` (over axis 0) multiplied by ``label``. The metric is the
    classification error minus the share of ``label`` mass found in the first
    slice along axis 0. ``block_size`` and ``c_classes`` are not used here.

    Returns:
        tuple ``(mean_ce, pe)``.
    '''
    probs = cntk.softmax(output, axis=0)

    weighted_nll = cntk.times(
        weights,
        -cntk.element_times(cntk.log(probs), label),
        output_rank=2)
    mean_ce = cntk.reduce_mean(weighted_nll)

    # Unpacking requires label.shape to have exactly three entries.
    _, w, h = label.shape

    raw_error = cntk.classification_error(probs, label, axis=0)
    first_slice_share = cntk.reduce_sum(cntk.slice(label, 0, 0, 1)) / cntk.reduce_sum(label)
    pe = raw_error - first_slice_share
    return (mean_ce, pe)
def test_transpose_backward():
    """
    The gradient through ``C.transpose`` must, once permuted, equal the
    gradient of the same reduction evaluated on an already-permuted input.
    """
    dims = (2, 3, 4)
    perm = (2, 0, 1)
    data = np.arange(np.prod(dims), dtype=np.float32).reshape(*dims)
    permuted_dims = tuple(dims[axis] for axis in perm)

    # graph that transposes inside the network
    x = C.input_variable(dims, needs_gradient=True)
    y = C.reduce_sum(C.cos(C.transpose(x, perm)))

    # reference graph fed with data that is transposed up front
    x_ref = C.input_variable(permuted_dims, needs_gradient=True)
    y_ref = C.reduce_sum(C.cos(x_ref))

    grad = np.squeeze(y.grad({x: data}))
    grad_ref = np.squeeze(y_ref.grad({x_ref: np.transpose(data, perm)}))

    assert np.allclose(np.transpose(grad, perm), grad_ref)
def test_transpose_backward():
    """
    The gradient through ``C.transpose`` must, once permuted, equal the
    gradient of the same reduction evaluated on an already-permuted input.
    """
    dims = (2, 3, 4)
    perm = (2, 0, 1)
    data = np.arange(np.prod(dims), dtype=np.float32).reshape(*dims)
    permuted_dims = tuple(dims[axis] for axis in perm)

    # graph that transposes inside the network
    x = C.input_variable(dims, needs_gradient=True)
    y = C.reduce_sum(C.cos(C.transpose(x, perm)))

    # reference graph fed with data that is transposed up front
    x_ref = C.input_variable(permuted_dims, needs_gradient=True)
    y_ref = C.reduce_sum(C.cos(x_ref))

    grad = np.squeeze(y.grad({x: data}))
    grad_ref = np.squeeze(y_ref.grad({x_ref: np.transpose(data, perm)}))

    assert np.allclose(np.transpose(grad, perm), grad_ref)
def var(array,W=_W,B=None,square=0,sqrt=0,V=False,sizz=0):
    """
    Reduce ``array * W`` around its window mean, optionally squared/rooted.

    Args:
        array: input node; the final assert expects its first three shape
            entries to appear in the output shape.
        W: weight node multiplied element-wise with ``array``. The default
            ``_W`` is bound once, when the function is defined.
        B: optional bias; a zero constant of shape ``W.shape[0:2]`` is
            created when it is None.
        square: when truthy, the deviation from the mean is squared.
        sqrt: when truthy, the reduced result is passed through ``C.sqrt``.
        V: verbosity flag; truthy values print intermediate evaluations and
            ``V == 2`` prints two more.
        sizz: when equal to 1 the mean divides by the sum of ``W`` over the
            last two axes, otherwise by the constant ``WV[0] * WV[1]``.

    Returns:
        node whose shape is asserted to be
        ``(arrs[0], ashp[0], arrs[1], arrs[2])``.
    """
    #W=tf.transpose(W, [0,2,3,1])
    arrs=array.shape
    ashp=W.shape
    sb=(W.shape[1],1,1)
    WV=W.shape[-2:]  # sizes of the last two axes of W
    xi=(-2,-1)       # axes reduced for the window sum / mean
    x2=(-2,-1,-3)    # axes reduced for the final output
    if V:
        print(W.eval())
        print(arrs,ashp)
    mul=(array*W)
    if V:
        print('Wsamp',W[-1,-1].eval())
        print('array*w',(mul.eval())[0,-1])
    size=C.reduce_sum(W,axis=xi)#shape=(outputs, channel)
    if V:
        print("sizesamp",size.shape,size.eval())
    if B is None:
        B=C.constant(0,shape=W.shape[0:2],dtype=np.float32)#channel
    # append singleton axes so B has as many axes as W
    # NOTE(review): assumed to apply to a caller-supplied B as well - confirm
    B=C.reshape(B,(*B.shape,*[1 for _ in range(len(ashp)-len(B.shape))]))
    if sizz==1:
        mean=C.reduce_sum(mul,axis=xi)/size
    else:
        mean=C.reduce_sum(mul,axis=xi)/C.constant(value=WV[0]*WV[1],shape=sb,dtype=np.float32)
    if V:
        print("meansamp",mean.eval()[0,-1])
    if square:
        i=(C.square(mul-mean)+B)
    else:
        i=(((mul)-mean)+B)
    di=i/size  # only used by the V==2 debug print below
    if V==2:
        print("i",i.eval(),"i")
        print("di",di.eval(),"di")
    if V:
        print('isamp',i.shape,i.eval()[-1,-1,])
    # NOTE(review): B is already part of i, so it is added a second time
    # here - confirm this is intended
    out=C.reduce_sum(i+B,axis=x2)
    #out=np.rollaxis(np.sum(i+B,axis=x2),-1,1)
    print(out.shape)  # printed regardless of V
    if sqrt:
        out=C.sqrt(out)
    out=C.swapaxes(C.reshape(out,out.shape[:4]), 3, 1)
    print(out.shape)  # printed regardless of V
    assert out.shape==(arrs[0],ashp[0],arrs[1],arrs[2])
    return(out)
def attention_layer(self, context, query):
    """
    Build the bidirectional attention block between ``context`` and ``query``.

    The graph is built on two placeholders of size ``2 * self.hidden_dim``
    and wrapped with ``C.as_block`` so that they are bound to ``context`` and
    ``query``.

    Returns:
        block named 'attention_layer' whose output is the splice of the
        context, the context-to-query vector, their product and the
        query-to-context product.
    """
    q_processed = C.placeholder(shape=(2 * self.hidden_dim, ))
    c_processed = C.placeholder(shape=(2 * self.hidden_dim, ))

    # convert query's sequence axis to static
    qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

    # This part deserves some explanation
    # It is the attention layer
    # In the paper they use a 6 * dim dimensional vector
    # here we split it in three parts because the different parts
    # participate in very different operations
    # so W * [h; u; h.* u] becomes w1 * h + w2 * u + w3 * (h.*u)
    ws1 = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())
    ws2 = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())
    ws3 = C.parameter(shape=(1, 2 * self.hidden_dim), init=C.glorot_uniform())
    att_bias = C.parameter(shape=(), init=0)

    wh = C.times(c_processed, ws1)
    wu = C.reshape(C.times(qvw, ws2), (-1, ))
    whu = C.reshape(C.reduce_sum(c_processed * C.sequence.broadcast_as(qvw * ws3, c_processed), axis=1), (-1, ))
    # similarity scores: sum of the three partial products plus the bias
    S = wh + whu + C.sequence.broadcast_as(wu, c_processed) + att_bias
    # mask out values outside of Query, and fill in gaps with -1e+30 as
    # neutral value for both reduce_log_sum_exp and reduce_max
    qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, c_processed)
    S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
    q_attn = C.reshape(C.softmax(S), (-1, 1))
    #q_attn = print_node(q_attn)
    # context-to-query: query vectors weighted by q_attn, summed over axis 0
    c2q = C.reshape(C.reduce_sum(C.sequence.broadcast_as(qvw, q_attn) * q_attn, axis=0), (-1))

    # query-to-context: softmax over the context sequence of the max score
    max_col = C.reduce_max(S)
    c_attn = C.sequence.softmax(max_col)

    htilde = C.sequence.reduce_sum(c_processed * c_attn)
    q2c = C.sequence.broadcast_as(htilde, c_processed)
    q2c_out = c_processed * q2c

    att_context = C.splice(c_processed, c2q, c_processed * c2q, q2c_out)

    return C.as_block(att_context, [(c_processed, context), (q_processed, query)], 'attention_layer', 'attention_layer')
def attention(h_enc, h_dec):
    """
    Attend over a fixed window of encoder states for every decoder step.

    ``attention_span``, ``attention_axis``, ``go_backwards`` and the
    projection layers (``attn_proj_enc``, ``attn_proj_dec``,
    ``attn_proj_tanh``, ``attn_final_stab``) come from the enclosing scope.

    Returns:
        the stabilized weighted sum of the projected encoder window.
    """
    # h_dec is passed wherever only its axis is needed
    axis_source = h_dec

    # TODO: pull this apart so that we can compute the encoder window only
    # once and apply it to multiple decoders

    # --- encoder state window
    window_fn = PastValueWindow(attention_span, axis=attention_axis, go_backwards=go_backwards)
    (enc_window, enc_valid) = window_fn(h_enc).outputs
    enc_proj = attn_proj_enc(enc_window)
    # window must be broadcast to every decoder time step
    enc_proj = C.sequence.broadcast_as(enc_proj, axis_source)
    enc_valid = C.sequence.broadcast_as(enc_valid, axis_source)

    # --- decoder state
    # project decoder hidden state
    dec_proj = attn_proj_dec(h_dec)

    activated = C.tanh(dec_proj + enc_proj)  # (attention_span, attention_dim)
    scores = attn_proj_tanh(activated)       # (attention_span, 1)
    # logzero-out the unused elements for the softmax denominator
    # TODO: use a less arbitrary number than 50
    masked_scores = scores + (enc_valid - 1) * 50

    weights = C.softmax(masked_scores, axis=attention_axis)
    weights = Label('attention_weights')(weights)

    # now take weighted sum over the encoder state vectors
    weighted = C.element_times(enc_proj, weights)
    summed = C.reduce_sum(weighted, axis=attention_axis)
    return attn_final_stab(summed)
def new_attention(encoder_hidden_state, decoder_hidden_state):
    """
    Attend over the whole unpacked encoder sequence for every decoder step.

    The projection layers (``attn_proj_enc``, ``attn_proj_dec``,
    ``attn_proj_tanh``, ``attn_final_stab``) come from the enclosing scope.
    Shape comments use the notation ``[dynamic axes] [static axes]``.

    Returns:
        the stabilized attended encoder state, [#, d] [h].
    """
    # encoder_hidden_state: [#, e] [h]
    # decoder_hidden_state: [#, d] [H]
    enc_unpacked, enc_mask = C.sequence.unpack(encoder_hidden_state, padding_value=0).outputs
    # enc_unpacked: [#] [*=e, h]
    # enc_mask: [#] [*=e]

    enc_proj = C.sequence.broadcast_as(attn_proj_enc(enc_unpacked), decoder_hidden_state)
    # enc_proj: [#, d] [*=e, attention_dim]

    mask = C.sequence.broadcast_as(C.reshape(enc_mask, (1,), 1), decoder_hidden_state)
    # mask: [#, d] [*=e]

    dec_proj = attn_proj_dec(decoder_hidden_state)
    # dec_proj: [#, d] [attention_dim]

    activated = C.tanh(dec_proj + enc_proj)
    # activated: [#, d] [*=e, attention_dim]

    logits = attn_proj_tanh(activated)
    # logits: [#, d] [*=e, 1]

    # padded encoder positions receive a very negative logit
    very_negative = C.constant(-1e+30)
    masked_logits = C.element_select(mask, logits, very_negative)
    # masked_logits: [#, d] [*=e]

    weights = C.softmax(masked_logits, axis=0)
    weights = Label('attention_weights')(weights)
    # weights: [#, d] [*=e]

    enc_per_step = C.sequence.broadcast_as(enc_unpacked, weights)
    attended = C.reduce_sum(weights * enc_per_step, axis=0)
    # attended: [#, d] [1, h]

    flattened = C.reshape(attended, (), 0, 1)
    # result: [#, d] [h]
    return attn_final_stab(flattened)
def build_graph(self_attention, self_penalty, embeded_dim=60, h_dim=150, d_a=350, r=30):
    """
    Build the sentence model and return a callable producing the output layer.

    The module-level input ``x`` is embedded, stabilized and passed through a
    bidirectional GRU (``create_birnn``). With ``self_attention`` the
    recurrent output is re-weighted by an attention matrix of ``r`` rows;
    with ``self_penalty`` the squared Frobenius norm of ``A A^T - I`` is
    computed as a penalty term.

    Args:
        self_attention: enable the self-attention re-weighting.
        self_penalty: also build the attention penalty; needs
            ``self_attention``.
        embeded_dim: embedding size.
        h_dim: hidden size of each GRU direction.
        d_a: rows of the first attention weight matrix.
        r: number of attention rows.

    Returns:
        function ``selfAtt``. Its argument is ignored; each call creates a
        new ``Dense(num_labels)`` layer on top of the shared hidden layer.
        When ``self_penalty`` is set the penalty node is attached as
        ``selfAtt.p``.

    Raises:
        ValueError: ``self_penalty`` is requested without ``self_attention``.
    """
    # The penalty is defined on the attention matrix. Without attention there
    # is nothing to penalise, and this used to fail with an unbound name only
    # after part of the graph had been built.
    if self_penalty and not self_attention:
        raise ValueError("self_penalty requires self_attention to be enabled")

    with C.layers.default_options(init=C.xavier()):
        embeded = C.layers.Embedding(embeded_dim)(x)
        embeded = C.layers.Stabilizer()(embeded)

        H = create_birnn(C.layers.GRU(h_dim), C.layers.GRU(h_dim))(embeded)

        if self_attention:
            Ws1 = C.parameter(shape=(d_a, 2 * h_dim), name="Ws1")
            Ws2 = C.parameter(shape=(r, d_a), name="Ws2")
            A = C.softmax(C.times(Ws2, C.tanh(C.times_transpose(Ws1, H))))
            H = C.times(A, H)  # the M in the paper

            if self_penalty:
                I = C.constant(np.eye(r), dtype=np.float32)
                P = C.times_transpose(A, A) - I  # r*r
                p = C.reduce_sum(C.abs(C.element_times(P, P)))  # frobenius norm **2

        y_ = C.layers.Dense(200, activation=C.ops.relu)(H)

        def selfAtt(x):
            y_pre = C.layers.Dense(num_labels, activation=None)(y_)
            return y_pre

        if self_penalty:
            selfAtt.p = p

        return selfAtt
def reduce_sum(x, axis=0, name=''):
    '''
    Computes the sum of the input tensor's elements across one axis. If
    `axis==rank`, then the sum will be computed over all axes, that is, the
    output is a scalar, which is the sum of tensor's elements.

    Examples:
        >>> # create 3x2 matrix in a sequence of length 1 in a batch of one sample
        >>> data = [[10, 20],[30, 40],[50, 60]]

        >>> # reduce over the first axis
        >>> C.eval(C.reduce_sum(data, 0))
        [array([[[  90.,  120.]]])]

        >>> # reduce over the second axis
        >>> C.eval(C.reduce_sum(data, 1))
        [array([[[  30.],
                 [  70.],
                 [ 110.]]])]

        >>> # reduce over the all axes
        >>> C.eval(C.reduce_sum(data, 2))
        [array([[ 210.]])]

    Args:
        x: input tensor
        axis (:class:`cntk.Axis`): axis along which the reduction will be performed
        name (str): the name of the node in the network

    Returns:
        :class:`cntk.Function`
    '''
    # Imported inside the function; within this body the name refers to the
    # imported implementation rather than to this wrapper.
    from cntk import reduce_sum as _impl

    sanitized = sanitize_input(x)
    node = _impl(sanitized, axis, name)
    return node.output()
def create_model(self):
    """
    Build a sparse-input embedding model and store its pieces on ``self``.

    Sets ``self.input_dim`` (1000), ``self.embed_dim`` (30), the parameter
    ``self.p`` of shape (input_dim, embed_dim) initialised to 1, and
    ``self.z``, the ``reduce_sum`` of the input times that parameter.
    """
    self.input_dim = 1000
    self.embed_dim = 30

    sparse_in = C.input_variable((self.input_dim,), is_sparse=True)
    self.p = C.parameter(shape=(self.input_dim, self.embed_dim), init=1)

    embedded = C.times(sparse_in, self.p)
    self.z = C.reduce_sum(embedded)
def simi_attention(self, input, memory):
    '''
    Attend from every element of ``input`` over the ``memory`` sequence.

    The graph is built on two placeholders and wrapped with ``C.as_block`` so
    that they are bound to ``input`` and ``memory``.

    return:
        block combining the memory weighted vectors over input [#,c][d] and
        the attention weight
    '''
    inp_ph = C.placeholder()     # [#,c][d]
    memory_ph = C.placeholder()  # [#,q][d]

    # layers are created in this order: input, memory, bias, score
    inp_layer = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
    memory_layer = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
    shared_bias = C.Parameter(shape=(2 * self.hidden_dim, ), init=0.0)
    score_layer = Dense(1, bias=False, input_rank=1)

    inp_proj = inp_layer(inp_ph)           # [#,c][d]
    memory_proj = memory_layer(memory_ph)  # [#,q][d]

    memory_unpacked, memory_mask = C.sequence.unpack(memory_proj, 0).outputs  # [#][*=q, d] [#][*=q]
    memory_per_inp = C.sequence.broadcast_as(memory_unpacked, inp_proj)  # [#,c][*=q,d]
    mask_per_inp = C.sequence.broadcast_as(memory_mask, inp_proj)        # [#,c][*=q]

    activated = C.tanh(inp_proj + memory_per_inp + shared_bias)
    scores = C.reshape(score_layer(activated), (-1, ))  # [#,c][*=q]
    # padded memory positions receive a very negative score
    scores = C.element_select(mask_per_inp, scores, -1e30)
    weights = C.softmax(scores, axis=0)  # [#,c][*=q]

    attended = C.reduce_sum(C.reshape(weights, (-1, 1)) * memory_per_inp, axis=0)  # [#,c][d]
    attended = C.reshape(attended, (-1, ))

    bindings = [(inp_ph, input), (memory_ph, memory)]
    return C.as_block(C.combine(attended, weights), bindings,
                      'simi_attention', 'simi_attention')
def attention(encoded, network):
    """
    Gaussian-window attention over ``encoded`` driven by ``network``.

    ``dense``, ``gaussian_windows_attention_coefficients``, ``nb_mixtures``
    and ``window_weight`` come from the enclosing scope. Shape comments use
    the notation ``[dynamic axes] [static axes]``.

    Returns:
        node named "GaussianWindowAttention", [#, n] [char_ohe].
    """
    coefficients = dense(network)
    a, b, k = gaussian_windows_attention_coefficients(coefficients, nb_mixtures)
    # a, b, k: [#, n] [nb_mixture, 1]
    # encoded: [#, c] [char_ohe]

    unpacked = C.sequence.unpack(encoded, padding_value=0, no_mask_output=True)
    # unpacked: [#] [*=c, char_ohe]

    positions = Cx.sequence.position(encoded)  # position gives shape=(1, )
    # positions: [#, c], [1]
    pos_values, pos_valid = C.sequence.unpack(positions, padding_value=999_999).outputs
    # pos_values: [#] [*=c, 1]
    # pos_valid: [#] [*=c]

    pos_per_step = C.swapaxes(C.sequence.broadcast_as(pos_values, k))
    # pos_per_step: [#, n] [1, *=c]
    valid_per_step = C.sequence.broadcast_as(C.reshape(pos_valid, (1,), 1), k)
    # valid_per_step: [#, n] [*=c, 1]

    weights = window_weight(a, b, k, pos_per_step)
    # weights: [#, n] [*=c, 1]

    # padded positions contribute nothing
    nothing = C.constant(0)
    weights = C.element_select(valid_per_step, weights, nothing, name="phi")
    # weights: [#, n] [*=c, 1]

    unpacked_per_step = C.sequence.broadcast_as(unpacked, weights)
    summed = C.reduce_sum(weights * unpacked_per_step, axis=0)
    # summed: [#, n] [1, char_ohe]

    return C.squeeze(summed, name="GaussianWindowAttention")
def criterion(self):
    """
    Build the training loss and the classification error.

    The loss is the margin loss on ``self.length`` against ``self.labels``;
    when ``self.use_reconstruction`` is set, the mean squared difference
    between ``self.training_model`` and ``self.features`` is added, scaled by
    ``0.0005*784``.

    Returns:
        tuple ``(total_loss, error)``.
    """
    # hyperparameters
    lambda_val = 0.5

    # Margin loss
    present_term = ct.reshape(ct.square(ct.relu(0.9 - self.length)), (-1))
    absent_term = ct.reshape(ct.square(ct.relu(self.length - 0.1)), (-1))
    per_class = self.labels * present_term + lambda_val * (1 - self.labels) * absent_term
    margin_loss = ct.reduce_mean(
        ct.reduce_sum(per_class, axis=0),
        axis=ct.axis.Axis.default_batch_axis())

    # classification_error
    predict = ct.softmax(self.length, axis=0)
    error = ct.classification_error(ct.reshape(predict, (10)), self.labels)

    if not self.use_reconstruction:
        return margin_loss, error

    original = ct.reshape(self.features, shape=(-1,))
    rebuilt = ct.reshape(self.training_model, shape=(-1,))
    mse = ct.reduce_mean(ct.square(rebuilt - original), axis=0)
    mse = ct.reduce_mean(mse, axis=ct.axis.Axis.default_batch_axis())

    total_loss = margin_loss + (0.0005*784) * mse
    return total_loss, error
def test_sequence_reduce_over_reduced_scalar():
    """
    ``sequence.reduce_sum`` applied on top of ``reduce_sum`` must sum a
    three-step sequence to 7 and give a gradient of 1 at every step.
    """
    x = C.sequence.input_variable(shape=(1), needs_gradient=True)
    op = C.sequence.reduce_sum(C.reduce_sum(x))

    steps = np.asarray([[-1], [3], [5]], dtype=np.float32)
    grad, result = op.grad({x: steps}, outputs=[op])

    assert np.array_equal(result, [7.0])
    assert np.array_equal(grad[0], [[1.0], [1.0], [1.0]])
def pad(x, pattern, mode=C.CONSTANT_PAD, constant_value=0, name=''):
    """
    Pads a tensor in the sequence axis according to the specified patterns.
    Three padding modes are supported: CONSTANT / REFLECT / SYMMETRIC.

    Arguments:
        x: tensor to be padded.
        pattern (tuple with 2 integers): how many values to add before and
          after the contents in the sequence axis.
        mode (int): padding mode: C.ops.CONSTANT_PAD, C.ops.REFLECT_PAD and
          C.ops.SYMMETRIC_PAD
        constant_value: the value used to fill the padding cells, only
          meaningful under CONSTANT mode.
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Raises:
        ValueError: ``pattern`` is not a tuple of exactly two integers.
    """
    # Check the container type first so that a non-iterable pattern is
    # reported as ValueError instead of failing inside all(); the length is
    # checked because exactly one (before, after) pair is supported.
    if (not isinstance(pattern, tuple)
            or len(pattern) != 2
            or not all(isinstance(i, int) for i in pattern)):
        raise ValueError(f"pattern {pattern} must be a tuple with 2 integers")

    # Only the unpacked sequence axis (the new leading static axis) is
    # padded; every original static axis gets a (0, 0) pattern.
    ndim = len(x.shape)
    final_pattern = [pattern] + [(0, 0)] * ndim

    unpacked, valid = C.sequence.unpack(x, padding_value=0).outputs
    padded = C.pad(unpacked, final_pattern, mode=mode, constant_value=constant_value)

    # New sequence length = number of valid steps + total padding added.
    seq_length = C.reduce_sum(valid, axis=0) + C.Constant(sum(pattern))
    return C.to_sequence(padded, seq_length, name=name)
def test_trainer(tmpdir, no_eval_function):
    """
    Train one minibatch, round-trip a checkpoint and check the trainer's
    public attributes. ``no_eval_function`` selects whether an evaluation
    function is passed to the trainer.
    """
    in1 = input(shape=(1, ))
    labels = input(shape=(1, ))
    weights = parameter(shape=(2, ), init=10)
    z = plus(in1, reduce_sum(weights), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = None if no_eval_function else classification_error(z, labels)

    momentum_time_constant = momentum_as_time_constant_schedule(1100)
    lr_per_sample = learning_rate_schedule(0.007, UnitType.sample)
    learner = momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)
    trainer = Trainer(z, (ce, errs), [learner])

    arguments = {in1: [[1], [2]], labels: [[0], [1]]}
    z_output = z.output
    updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output])

    checkpoint_path = str(tmpdir / 'checkpoint.dat')
    trainer.save_checkpoint(checkpoint_path)
    trainer.restore_from_checkpoint(checkpoint_path)

    assert trainer.model.name == 'z'

    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
    assert isinstance(trainer.parameter_learners[0], Learner)
def test_trainer(tmpdir, no_eval_function):
    """
    Train one minibatch, round-trip a checkpoint and check the trainer's
    public attributes. ``no_eval_function`` selects whether an evaluation
    function is passed to the trainer.
    """
    in1 = input_variable(shape=(1,))
    labels = input_variable(shape=(1,))
    weights = parameter(shape=(2,), init=10)
    z = plus(in1, reduce_sum(weights), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = None if no_eval_function else classification_error(z, labels)

    momentum_time_constant = momentum_as_time_constant_schedule(1100)
    lr_per_sample = learning_rate_schedule(0.007, UnitType.sample)
    learner = momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)
    trainer = Trainer(z, (ce, errs), [learner])

    arguments = {in1: [[1], [2]], labels: [[0], [1]]}
    z_output = z.output
    updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output])

    checkpoint_path = str(tmpdir / 'checkpoint.dat')
    trainer.save_checkpoint(checkpoint_path)
    trainer.restore_from_checkpoint(checkpoint_path)

    assert trainer.model.name == 'z'

    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
    assert isinstance(trainer.parameter_learners[0], Learner)
def window_weight(a, b, k, u):
    """
    Calculate Phi, the window weight of the character sequence at position u
    of time t. Function tested to be correct on 2018-25-02 using numpy
    equivalent.

    math:
        phi = summation of mixtures { a * exp ( -b * (k - u) ^ 2 ) }

    Args:
        a: importance of window within the mixture. Not normalised and
            doesn't sum to one.
        b: width of attention window
        k: location of window
        u: integer position of each item in sequence. Value from 1 to
            seq_length. (rank 2 tensor) [-3, 1]

    Returns:
        :class:`~cntk.ops.functions.Function`
    """
    per_mixture = a * C.exp(-1 * b * C.square(k - u))
    # Reduce sum the mixture axis, then swap the two remaining axes.
    summed = C.reduce_sum(per_mixture, axis=0)
    # result: [#, n] [*-c, 1]
    return C.swapaxes(summed)
def flow_reverse(chunk):
    """
    Build the reverse pass for the layer described by ``chunk``.

    Reads 'input_dim', 'log_s_func', 't_func' and 'W_rot_mat' from ``chunk``
    and stores the inverted rotation matrix back into it under
    'W_rot_mat_inv' (a side effect on the caller's dict).

    Returns:
        tuple ``(out, log_det_J)`` built on a fresh placeholder named
        'place_holder'.
    """
    input_dim = chunk['input_dim']
    log_det_J = 0
    half = input_dim // 2

    source = C.placeholder(input_dim, name='place_holder')
    log_s_net = chunk['log_s_func']
    t_net = chunk['t_func']

    # Undo the scale-and-shift on the first half, conditioned on the second.
    y_head, y_tail = source[:half], source[half:]
    log_scale = log_s_net(y_tail)
    shift = t_net(y_tail)
    scale = C.exp(log_scale)
    x_head = (y_head - shift) / scale
    recovered = C.splice(x_head, y_tail)

    # NOTE(review): the sign and scaling of both log-determinant terms below
    # should be confirmed against the forward pass.
    log_det_J += C.reduce_sum(C.log(C.abs(scale)))

    rotation = chunk['W_rot_mat']
    inverse_rotation = C.Constant(np.linalg.inv(rotation.value), name='inv_W')
    chunk['W_rot_mat_inv'] = inverse_rotation

    out = recovered @ inverse_rotation
    log_det_J += input_dim * C.log(C.det(inverse_rotation))

    return out, log_det_J
def test_trainer(tmpdir, no_eval_function):
    """
    Train one minibatch, round-trip a checkpoint together with external
    state and check the trainer's public attributes. ``no_eval_function``
    selects whether an evaluation function is passed to the trainer.
    """
    in1 = C.input_variable(shape=(1,))
    labels = C.input_variable(shape=(1,))
    weights = parameter(shape=(2,), init=10)
    z = plus(in1, reduce_sum(weights), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = None if no_eval_function else classification_error(z, labels)

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_parameter_schedule(0.007, minibatch_size =1)
    learner = C.momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)
    trainer = C.Trainer(z, (ce, errs), [learner])

    arguments = {in1: [[1], [2]], labels: [[0], [1]]}
    z_output = z.output
    updated, var_map = trainer.train_minibatch(arguments, outputs=[z_output])

    checkpoint_path = str(tmpdir / 'checkpoint.dat')
    external_state = {
        "additional external state": math.pi,
        "nested dict": {"a": "b"},
        "list": [1, 2, 3],
    }
    trainer.save_checkpoint(checkpoint_path, external_state)
    restored_state = trainer.restore_from_checkpoint(checkpoint_path)

    # the external state must survive the round trip unchanged
    assert external_state == restored_state

    assert trainer.model.name == 'z'

    # Ensure that Swig is not leaking raw types
    assert isinstance(trainer.model, Function)
    assert trainer.model.__doc__
    assert isinstance(trainer.parameter_learners[0], C.Learner)
def attention(query, key, value): dk = C.reduce_sum(C.ones_like(query)) # cannot use sequence.last, will conflict with recurrence # dk: [#, *] [1, ] and value = int(dim_of_query) unpacked_key = C.sequence.unpack(key, padding_value=0, no_mask_output=True) # [#] [-3, key_dim] unpacked_value = C.sequence.unpack(value, padding_value=0, no_mask_output=True) # [#] [-3, value_dim] broadcasted_key = C.sequence.broadcast_as(unpacked_key, query) # [#, *] [-3, key_dim] scaled = C.times_transpose(query, broadcasted_key) / dk # [#, *] [q_dim] @ [#, *] [key_dim, -3], assert q_dim == key_dim # scaled: [#, *] [-3, ] => for every key seq element, there is a corresponding score # masked out invalid temporal connections to obey_sequence_order if obey_sequence_order and max_seq_len: unpacked_scaled, scaled_mask = C.sequence.unpack(scaled, padding_value=0).outputs # unpacked_scaled: [#] [-3, -3] <== matrix will be top right diagonally zero-ed # scaled_mask: [#] [-3,] minus_inf = C.constant(-1e+30) valid_connections = C.Constant(np.tril(np.ones((max_seq_len, max_seq_len)), k=0)) # [] [max_seq, max_seq] valid_connections = C.reconcile_dynamic_axes(valid_connections, unpacked_scaled) # [#] [max_seq, max_seq] valid_connections = C.crop_manual(valid_connections, unpacked_scaled, 0, 0) # [#] [-3, -3] unpacked_scaled = C.element_select(valid_connections, unpacked_scaled, minus_inf) # [#] [-3, -3] scaled = C.to_sequence_like(unpacked_scaled, query) # [#, *] [-3] elif obey_sequence_order and not max_seq_len: raise ValueError("max_seq_len must be defined when obey_sequence_order is True") attended = C.times(C.softmax(scaled, axis=-1), C.sequence.broadcast_as(unpacked_value, query)) # [#, *] [value_dim,] return attended
def multiFunc(self, arg1):
    """
    Build a multi-bit binarized approximation of an input shaped like ``arg1``.

    A new input variable is created with the shape and dynamic axes of
    ``arg1``; ``self.bit_map`` holds the number of bits for each value.

    Returns:
        tuple ``(approx, multiIn)``: the approximation node and the input
        variable it was built on.
    """
    # load or create the inputs we need
    multiIn = C.input(shape=arg1.shape, dynamic_axes = arg1.dynamic_axes)
    bit_map = C.constant(self.bit_map)
    max_bits = self.bit_map.max()
    shape = multiIn.shape  # not used below
    reformed = C.reshape(multiIn, (-1,))  # not used below
    # lets compute the means we need
    # carry over represents the remaining value that needs to binarized. For a single bit, this is just the input. For more bits,
    # it is the difference between the previous bits approximation and the true value.
    carry_over = multiIn
    approx = C.element_times(multiIn, 0)  # zero node with the shape of the input
    # iterate through the maximum number of bits specified by the bit maps, basically compute each level of binarization
    for i in range(max_bits):
        # determine which values of the input should be binarized to i bits or more
        hot_vals = C.greater(bit_map, i)
        # select only the values which we need to binarize
        valid_vals = C.element_select(hot_vals, carry_over, 0)
        # compute mean on a per kernel basis, reshaping is done to allow for sum reduction along only axis 0 (the kernels)
        mean = C.element_divide(C.reduce_sum(C.reshape(C.abs(valid_vals), (valid_vals.shape[0], -1)), axis=1), C.reduce_sum(C.reshape(hot_vals, (hot_vals.shape[0], -1)), axis=1))
        # reshape the mean to match the dimensionality of the input
        # (assumes the input is rank 4 - TODO confirm)
        mean = C.reshape(mean, (mean.shape[0], mean.shape[1], 1, 1))
        # binarize the carry over: +1 where positive, -1 elsewhere, 0 where this bit level is inactive
        bits = C.greater(carry_over, 0)
        bits = C.element_select(bits, bits, -1)
        bits = C.element_select(hot_vals, bits, 0)
        # add in the equivalent binary representation to the approximation
        approx = C.plus(approx, C.element_times(mean, bits))
        # compute the new carry over
        carry_over = C.plus(C.element_times(C.element_times(-1, bits), mean), carry_over)

    return approx, multiIn
def create_model(self):
    """Create a sparse-input times-parameter model and store its pieces on ``self``.

    Sets ``self.input_dim``, ``self.embed_dim``, the parameter ``self.p`` and
    the reduced-sum output ``self.z``.
    """
    self.input_dim = 1000
    self.embed_dim = 30

    sparse_input = C.input_variable((self.input_dim, ), is_sparse=True)
    self.p = C.parameter(shape=(self.input_dim, self.embed_dim), init=1)

    projected = C.times(sparse_input, self.p)
    self.z = C.reduce_sum(projected)
def test_ReduceSum(tmpdir, dtype):
    """Verify a constant-input ``reduce_sum`` over axis 0 for the given dtype."""
    values = [
        [[5, 1], [20, 2]],
        [[30, 1], [40, 2]],
        [[55, 1], [60, 2]],
    ]
    with C.default_options(dtype=dtype):
        data = np.array(values, dtype=dtype)
        model = C.reduce_sum(data, 0)
    verify_no_input(model, tmpdir, 'ReduceSum_0')
def run_cntk(image_path, model_path):
    """Run gradient ascent on an image to maximise selected pooling activations.

    The loss is a weighted sum of the mean squared activations of the third
    and fourth nodes whose description contains ``'Pooling'``. The model is
    cloned with frozen parameters and a gradient-enabled input, and the image
    is updated by ``gradient_ascent_cntk``.

    Args:
        image_path: path of the image to load with OpenCV.
        model_path: path of the CNTK model to load.

    Returns:
        The processed image as a ``uint8`` array in height/width/channel layout.

    Raises:
        IOError: if the image at ``image_path`` cannot be read.
    """
    import functools
    import cv2

    model = cntk.load_model(model_path)

    # Collect every function node whose description mentions pooling.
    pool_nodes = []
    for node in cntk.logging.depth_first_search(model, lambda x: True, depth=0):
        if type(node) is cntk.ops.functions.Function and 'Pooling' in str(node):
            pool_nodes.append(node)
            print(node)
    print(pool_nodes)

    # node contributions to the loss metric
    layer_contributions = {
        pool_nodes[2]: 1,
        pool_nodes[3]: 3,
    }

    # Define the loss: coefficient-weighted mean of squared activations per layer
    loss = None
    for layer, coeff in layer_contributions.items():
        activation = layer.output
        scaling = functools.reduce(lambda x, y: x * y, activation.shape)
        sum_squares = cntk.reduce_sum(cntk.square(activation))
        scaled_sum_squares = (coeff / scaling) * sum_squares
        if loss is None:
            loss = scaled_sum_squares
        else:
            loss += scaled_sum_squares

    # Swap the model input for one that records gradients, freezing the parameters.
    dream = cntk.input_variable(shape=model.arguments[0].shape,
                                needs_gradient=True,
                                name='features')
    model = cntk.ops.combine(loss).clone(
        cntk.ops.CloneMethod.freeze,
        substitutions={model.arguments[0]: dream})

    step = 0.1  # Gradient ascent step size
    iterations = 5  # Number of ascent steps per scale

    # Load the image into a Numpy array
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise IOError("Could not read image: {}".format(image_path))
    img = cv2.resize(img, (224, 224))

    # Convert to float, channel-first layout, and rescale from [0, 255] to [-1, 1]
    img = img.astype(np.float32)
    img = np.transpose(img, (2, 0, 1))
    img /= 127.5
    img -= 1

    img = gradient_ascent_cntk(model, img, iterations=iterations, step=step)

    # Undo the layout change and the scaling, then clip to valid pixel values
    img = np.transpose(img, (1, 2, 0))
    img /= 2.
    img += 0.5
    img *= 255.
    img = np.clip(img, 0, 255).astype('uint8')

    return img
def test_conv_cudnn_batch_size_change(device_id):
    """Run gradients of one convolution over two inputs whose sequence lengths vary per batch.

    Skipped when ``device_id`` is -1 (the test only runs on GPU).
    """
    if device_id == -1:
        pytest.skip('Test only runs on GPU')

    np.random.seed(0)
    input_shape = (1, 16, 100)

    def _new_input(axis_name):
        # Each input gets its own unique dynamic sequence axis.
        return C.sequence.input_variable(
            input_shape,
            needs_gradient=True,
            sequence_axis=C.Axis.new_unique_dynamic_axis(axis_name))

    input1 = _new_input('c')
    input2 = _new_input('q')

    # The same convolution layer is applied to both inputs.
    conv = C.layers.Convolution2D((5, 8), 100, activation=C.relu,
                                  init=C.glorot_uniform(), bias=True, init_bias=0)
    first_sum = C.reduce_sum(conv(input1), axis=C.Axis.all_axes())
    second_sum = C.reduce_sum(conv(input2), axis=C.Axis.all_axes())
    output = first_sum + second_sum

    num_batches = 100  # change to greater value for a more thorough test
    batch_size = 1
    max_seq_len = [100, 10]

    def _random_sequences(lengths):
        return [np.random.random((length,) + input_shape).astype(np.float32)
                for length in lengths]

    for _ in range(num_batches):
        # Draw the sequence lengths first, then the data for input1, then input2.
        seq_lens = [[int(draw * limit + 1) for draw in np.random.random(batch_size)]
                    for limit in max_seq_len]
        first_batch = _random_sequences(seq_lens[0])
        second_batch = _random_sequences(seq_lens[1])
        output.grad({input1: first_batch, input2: second_batch})
def multiFunc(self, arg1):
    """Build a graph approximating the input with multi-bit binarization, using one global mean.

    The number of bits used for each element is taken from ``self.bit_map``.
    Each pass of the loop binarizes the residual left over by the previous
    passes, scaled by the mean magnitude of the selected values.

    Args:
        arg1: variable whose ``shape`` and ``dynamic_axes`` are used to declare
            the graph input.

    Returns:
        Tuple ``(approx, multiIn)``: the approximation graph and the input
        variable it consumes.
    """
    multiIn = C.input(shape=arg1.shape, dynamic_axes = arg1.dynamic_axes)
    bit_map = C.constant(self.bit_map)
    max_bits = self.bit_map.max()

    # carry_over is the residual that still has to be binarized; it starts as the input
    carry_over = multiIn
    approx = C.element_times(multiIn, 0)

    for i in range(max_bits):
        # elements whose bit count is greater than i take part in this pass
        hot_vals = C.greater(bit_map, i)
        valid_vals = C.element_select(hot_vals, carry_over, 0)
        # mean magnitude of the selected values, over all of them at once
        mean = C.element_divide(C.reduce_sum(C.abs(valid_vals)), C.reduce_sum(hot_vals))
        # +1 where the residual is positive, -1 elsewhere, 0 where not selected
        bits = C.greater(carry_over, 0)
        bits = C.element_select(bits, bits, -1)
        bits = C.element_select(hot_vals, bits, 0)
        # accumulate this level and subtract it from the residual
        approx = C.plus(approx, C.element_times(mean, bits))
        carry_over = C.plus(C.element_times(C.element_times(-1, bits), mean), carry_over)

    return approx, multiIn
def create_sample_model(device, writer=None, lr_per_sample=C.learning_parameter_schedule_per_sample([0.3, 0.2, 0.1, 0.0])):
    """Create a small trainer for a model that adds a summed parameter to its input.

    Args:
        device: device on which the parameter is created.
        writer: optional progress writer(s) handed to the trainer.
        lr_per_sample: per-sample learning-rate schedule for the SGD learner.

    Returns:
        Tuple ``(trainer, in1, labels)``.
    """
    feature_shape = (input_dim,)
    in1 = sequence.input_variable(shape=feature_shape)
    labels = sequence.input_variable(shape=feature_shape)

    bias = parameter(shape=feature_shape, init=10, device=device)
    z = plus(in1, reduce_sum(bias), name='z')

    criterion = (cross_entropy_with_softmax(z, labels),
                 classification_error(z, labels))

    sgd_learner = C.sgd(z.parameters, lr_per_sample)
    trainer = C.Trainer(z, criterion, [sgd_learner], writer)
    return (trainer, in1, labels)
def test_sequence_unpack_with_broadcast_as(device_id, precision):
    """Check that an unpacked sequence mask can be broadcast along another sequence axis."""
    x = C.sequence.input_variable(5)
    a = C.sequence.input_variable(4, sequence_axis=C.Axis('a'))

    unpacked, mask = C.sequence.unpack(x, 0).outputs
    # The zero-weighted sum keeps the unpacked values in the graph without
    # changing the mask values.
    bvm = C.sequence.broadcast_as(0 * C.reduce_sum(unpacked) + mask, a)

    def _ramp(rows, cols):
        return np.arange(rows * cols).reshape(rows, cols).astype('f')

    x1 = [_ramp(7, 5), _ramp(3, 5)]
    a1 = [_ramp(3, 4), _ramp(6, 4)]

    # The second x sequence has only 3 steps, so its mask is zero from column 3 on.
    expected = [np.ones((3, 7), dtype=np.float32),
                np.ones((6, 7), dtype=np.float32)]
    expected[1][:, 3:] = 0

    actual = bvm.eval({x: x1, a: a1})
    for got, want in zip(actual, expected):
        assert np.allclose(got, want)
def test_output_to_retain():
    """Check that ``train_minibatch`` returns the values of the requested output."""
    in1 = input_variable(shape=(1,))
    labels = input_variable(shape=(1,))
    bias = parameter(shape=(2,), init=10)
    z = plus(in1, reduce_sum(bias), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    momentum_time_constant = momentum_as_time_constant_schedule(1100)
    lr_per_sample = learning_rate_schedule(0.007, UnitType.sample)
    learner = momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)
    trainer = Trainer(z, (ce, errs), [learner])

    in1_value = [[[1]], [[2]]]
    label_value = [[0], [1]]
    z_output = z.output

    updated, var_map = trainer.train_minibatch(
        {in1: in1_value, labels: label_value},
        outputs=[z_output])

    # The parameter holds two entries of 10, so every output is input + 20.
    expected = np.asarray(in1_value) + 20
    assert np.allclose(var_map[z_output], expected)
def attention_pooling(inputs, inputs_mask, inputs_weights, decode, decode_weights, keys):
    """
    Pool ``inputs`` with attention weights derived from ``inputs`` and ``decode``.

    inputs: shape=(n, dim)
    inputs_mask: selects which scores are kept; the others are replaced by -1e+30
    inputs_weights: shape=(dim, dim)
    decode: shape=(1, dec_dim)
    decode_weights: shape=(dec_dim, dim)
    keys: shape=(dim, 1)
    """
    projected_inputs = C.times(inputs, inputs_weights)  # shape=(n, dim)
    projected_decode = C.times(decode, decode_weights)  # shape=(dim, 1)

    decode_per_step = C.sequence.broadcast_as(projected_decode, projected_inputs)
    energies = C.tanh(projected_inputs + decode_per_step)  # shape=(n, dim)

    # Masked-out positions get a very negative value before the softmax.
    masked_energies = C.element_select(inputs_mask, energies, C.constant(-1e+30))
    scores = C.times(masked_energies, keys)  # shape=(n)
    weights = C.ops.sequence.softmax(scores, name="softmax")

    return C.reduce_sum(inputs * weights, axis=0)
def test_nce_backward_indices(classes, xdim, batch, expected_value, device_id, precision):
    """
    Simple test that makes sure that the derivatives have the correct sparsity pattern

    A baseline is built by counting, over several trials, which classes
    ``random_sample`` hits. The same count is then taken from the non-zero
    entries of the gradient of ``nce_loss`` with respect to its bias, which
    uses the same seed. The counts must agree for every class, except that the
    label classes may be non-zero in every trial.
    """
    # ignore precision, only sparsity pattern matters for this test
    dt = np.float32

    # Import the subpackage explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.sparse`` is available as an attribute.
    import scipy.sparse

    trials = 10

    # Establish baseline
    expected_count = np.zeros(classes)
    I = C.constant(np.eye(classes, dtype=dt))
    q = np.arange(classes, dtype=dt) + 1
    z = C.reduce_sum(C.times(C.random_sample(q, 32, True, seed=98052), I), axis=0)
    for _ in range(trials):
        expected_count[np.nonzero(z.eval().ravel())] += 1

    # Set things up to measure the same thing with nce_loss
    x = C.input_variable(xdim, needs_gradient=True)
    y = C.input_variable(classes, is_sparse=True)

    x0 = np.arange(batch * xdim, dtype=dt).reshape((batch, xdim)) / (batch * xdim)
    # One label per sample, at classes 10, 20, ..., 10 * batch
    data = np.ones(batch, dtype=dt)
    indices = list(range(10, 10 * batch + 1, 10))
    indptr = list(range(batch + 1))
    y0 = scipy.sparse.csr_matrix((data, indices, indptr), shape=(batch, classes))

    b = C.parameter((classes, 1))
    W = C.parameter((classes, C.InferredDimension))

    gb = np.zeros(classes)
    # Multiplying by a 1x1 identity turns the gradient value into something
    # that can be evaluated and inspected.
    vb = C.input_variable((classes, 1), dtype=dt)
    Ib = C.constant(np.eye(1, dtype=dt))
    zb = C.times(vb, Ib)

    loss = C.nce_loss(W, b, x, y, q, seed=98052)
    for _ in range(trials):
        v = loss.grad({x: x0, y: y0}, wrt=loss.parameters, as_numpy=False)
        gb[np.nonzero(zb.eval({vb: v[b]}).ravel())] += 1

    label_classes = set(indices)
    for i in range(classes):
        assert gb[i] == expected_count[i] or (i in label_classes and gb[i] == trials)
def test_op_reduce_sum(input_data, axis, device_id, precision):
    """Check the forward and backward pass of ``C.reduce_sum`` against NumPy."""
    helper_options = dict(device_id=device_id, precision=precision, clean_up=True)

    # Forward pass test
    # ==================
    # The expected output needs two surrounding brackets:
    # the first for sequences (length=1, since we have dynamic_axis=''),
    # the second for a batch of one sample.
    # keepdims = True as CNTK keeps them as well
    def _expected_forward(x, axis, keepdims=True):
        as_array = AA(x)
        if axis != len(as_array.shape):
            reduced = np.add.reduce(as_array, axis,
                                    dtype=PRECISION_TO_TYPE[precision],
                                    keepdims=keepdims)
            return [[reduced]]
        # An axis equal to the rank means: reduce over all elements.
        return [np.reshape(np.add.reduce(np.ravel(as_array)), (1, 1))]

    expected_result = _expected_forward(input_data, axis)

    a = I([input_data])

    # splice using the operator
    result = C.reduce_sum(a, axis)

    unittest_helper(result, None, expected_result,
                    backward_pass=False, **helper_options)

    # Backward pass test
    # ==================
    # The gradient of the reduce_sum operator is all ones in the shape of the input
    ones_like_input = AA(np.ones_like(input_data, dtype=PRECISION_TO_TYPE[precision]))
    expected_gradient = [[ones_like_input]]

    unittest_helper(result, None, expected_gradient,
                    backward_pass=True, input_node=a, **helper_options)
def create_binary_convolution_model():
    """Build a convolutional classifier that uses binary convolutions after the first layer.

    Returns:
        The combination of the scaled network output, the cross-entropy loss
        with the binary regularization term added, and the classification error.
    """
    # Input variables denoting the features and label data
    feature_var = C.input((num_channels, image_height, image_width))
    label_var = C.input(num_classes)

    def _pool_then_normalize(node):
        pooled = C.layers.MaxPooling((3, 3), strides=(2, 2))(node)
        return C.layers.BatchNormalization(map_rank=1)(pooled)

    # apply model to input
    scaled_input = C.element_times(C.constant(0.00390625), feature_var)

    # first layer is ok to be full precision
    stage1 = C.layers.Convolution((3, 3), 32, pad=True, activation=C.relu)(scaled_input)
    stage1 = _pool_then_normalize(stage1)

    stage2 = BinaryConvolution(stage1, (3, 3), 128, channels=32, pad=True)
    stage2 = _pool_then_normalize(stage2)

    stage3 = BinaryConvolution(stage2, (3, 3), 128, channels=128, pad=True)
    stage3 = _pool_then_normalize(stage3)

    # Final 1x1 binary convolution, averaged over the remaining spatial extent
    scores = BinaryConvolution(stage3, (1, 1), num_classes, channels=128, pad=True)
    scores = C.layers.AveragePooling((scores.shape[1], scores.shape[2]))(scores)
    z = C.reshape(scores, (num_classes,))

    # Add binary regularization (ala Gang Hua)
    weight_sum = C.constant(0)
    filter_params = (param for param in z.parameters if param.name == "filter")
    for param in filter_params:
        penalty = C.reduce_sum(C.minus(1, C.square(param)))
        weight_sum = C.plus(weight_sum, penalty)
    bin_reg = C.element_times(.000005, weight_sum)

    # After the last layer, we need to apply a learnable scale
    SP = C.parameter(shape=z.shape, init=0.001)
    z = C.element_times(z, SP)

    # loss and metric
    ce = C.plus(C.cross_entropy_with_softmax(z, label_var), bin_reg)
    pe = C.classification_error(z, label_var)
    return C.combine([z, ce, pe])
def test_restore_constants(tmpdir):
    """Check that saving/loading a function or a checkpoint restores constants and parameters."""
    C.device.try_set_default_device(C.device.cpu())

    def _setvalue(var, new_value):
        # Non-scalars keep their shape; scalars get a fresh float32 array.
        if len(var.shape) > 0:
            var.value = 0 * var.value + new_value
        else:
            var.value = np.array(new_value, dtype=np.float32)

    def _setall(func, new_value):
        for var in func.constants + func.parameters:
            _setvalue(var, new_value)

    def _checkall(func, expected):
        for var in func.constants + func.parameters:
            assert (var.value == expected).all()

    x = C.input_variable(10)
    f = C.layers.BatchNormalization()(x)
    learner = C.sgd(f.parameters, C.learning_rate_schedule(0.1, 'sample'))
    trainer = C.Trainer(f, C.reduce_sum(f), learner)

    model_filename = str(tmpdir / 'function.out')
    checkpoint_filename = str(tmpdir / 'checkpoint.out')

    # The saved function holds the value 1.
    _setall(f, 1)
    f.save(model_filename)
    _checkall(f, 1)

    # The checkpoint holds the value 2.
    _setall(f, 2)
    trainer.save_checkpoint(checkpoint_filename)
    _checkall(f, 2)

    # Restoring the checkpoint brings back 2.
    _setall(f, 3)
    _checkall(f, 3)
    trainer.restore_from_checkpoint(checkpoint_filename)
    _checkall(f, 2)

    # Loading the saved function yields a function holding 1.
    f2 = C.Function.load(model_filename)
    _checkall(f2, 1)

    # Restoring the saved function into f brings back 1.
    _setall(f, 4)
    _checkall(f, 4)
    f.restore(model_filename)
    _checkall(f, 1)

    _setall(f2, 5)
    _checkall(f2, 5)
def run_distributed_training(tmpdir, create_func):
    """Train one minibatch with a distributed learner and round-trip a checkpoint.

    Args:
        tmpdir: directory (supporting the ``/`` operator) for the checkpoint file.
        create_func: callable that wraps a learner into a distributed learner.
    """
    in1 = sequence.input_variable(shape=1)
    labels = sequence.input_variable(shape=1)
    bias = parameter(shape=2, init=10)
    z = plus(in1, reduce_sum(bias), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = C.learning_rate_schedule(0.007, C.UnitType.sample)
    local_learner = C.momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant, True)
    dist_learner = create_func(local_learner)

    # The current worker must be among the communicator's workers.
    communicator = dist_learner.communicator()
    workers = communicator.workers()
    current_worker = communicator.current_worker()
    known_ranks = [worker.global_rank for worker in workers]
    assert current_worker.global_rank in known_ranks

    trainer = C.Trainer(z, (ce, errs), [dist_learner])

    in1_value = [[1], [2]]
    label_value = [[0], [1]]
    z_output = z.output
    updated, var_map = trainer.train_minibatch(
        {in1: in1_value, labels: label_value},
        outputs=[z_output])

    checkpoint_path = str(tmpdir / 'checkpoint.dat')
    trainer.save_checkpoint(checkpoint_path)
    trainer.restore_from_checkpoint(checkpoint_path)

    communicator.barrier()

    restored_model = trainer.model
    assert restored_model.name == 'z'

    # Ensure that Swig is not leaking raw types
    assert isinstance(restored_model, Function)
    assert restored_model.__doc__
def train(data_path, model_path, log_file, config_file, restore=False, profiling=False, gen_heartbeat=False):
    """Train the PolyMath model, validating with exponentially averaged weights.

    An exponential moving average (EMA) of every parameter is maintained by
    evaluating ``dummy`` after each trained minibatch. Every ``val_interval``
    epochs the EMA weights are swapped in for validation; on improvement a
    checkpoint is saved while they are loaded. The raw weights are then put
    back so that training continues from where the optimizer left off.

    Args:
        data_path: directory containing the training and validation data.
        model_path: directory in which the checkpoint file is stored.
        log_file: file handed to the progress printer.
        config_file: name of the module providing ``training_config``.
        restore: if True and a checkpoint file exists, resume from it.
        profiling: if True, run the CNTK profiler around training.
        gen_heartbeat: forwarded to the progress printer.

    Raises:
        Exception: if the training data is neither a ``.ctf`` nor a ``.tsv`` file.
    """
    polymath = PolyMath(config_file)
    z, loss = polymath.model()
    training_config = importlib.import_module(config_file).training_config

    max_epochs = training_config['max_epochs']
    log_freq = training_config['log_freq']

    # NOTE(review): this list is built but the Trainer below receives only the
    # TensorBoard writer -- confirm whether the printer should be passed too.
    progress_writers = [C.logging.ProgressPrinter(
                            num_epochs = max_epochs,
                            freq = log_freq,
                            tag = 'Training',
                            log_to_file = log_file,
                            rank = C.Communicator.rank(),
                            gen_heartbeat = gen_heartbeat)]

    lr = C.learning_parameter_schedule(training_config['lr'], minibatch_size=None, epoch_size=None)

    # One EMA constant per parameter; evaluating `dummy` performs all the updates.
    ema = {}
    dummies = []
    for p in z.parameters:
        ema_p = C.constant(0, shape=p.shape, dtype=p.dtype, name='ema_%s' % p.uid)
        ema[p.uid] = ema_p
        dummies.append(C.reduce_sum(C.assign(ema_p, 0.999 * ema_p + 0.001 * p)))
    dummy = C.combine(dummies)

    learner = C.adadelta(z.parameters, lr)

    if C.Communicator.num_workers() > 1:
        learner = C.data_parallel_distributed_learner(learner)

    tensorboard_writer = TensorBoardProgressWriter(freq=10, log_dir='log', model=z)
    trainer = C.Trainer(z, (loss, None), learner, tensorboard_writer)

    if profiling:
        C.debugging.start_profiler(sync_gpu=True)

    train_data_file = os.path.join(data_path, training_config['train_data'])
    train_data_ext = os.path.splitext(train_data_file)[-1].lower()

    model_file = os.path.join(model_path, model_name)
    model = C.combine(list(z.outputs) + [loss.output])
    label_ab = argument_by_name(loss, 'ab')

    epoch_stat = {
        'best_val_err' : 100,
        'best_since'   : 0,
        'val_since'    : 0}

    if restore and os.path.isfile(model_file):
        trainer.restore_from_checkpoint(model_file)
        # after restore always re-evaluate
        epoch_stat['best_val_err'] = validate_model(os.path.join(data_path, training_config['val_data']), model, polymath)

    def post_epoch_work(epoch_stat):
        """Summarize the epoch, validate when due; return False to stop training."""
        trainer.summarize_training_progress()
        epoch_stat['val_since'] += 1

        if epoch_stat['val_since'] == training_config['val_interval']:
            epoch_stat['val_since'] = 0

            # Swap the EMA weights in for validation (and for the checkpoint).
            temp = dict((p.uid, p.value) for p in z.parameters)
            for p in trainer.model.parameters:
                p.value = ema[p.uid].value

            val_err = validate_model(os.path.join(data_path, training_config['val_data']), model, polymath)
            improved = epoch_stat['best_val_err'] > val_err
            if improved:
                epoch_stat['best_val_err'] = val_err
                epoch_stat['best_since'] = 0
                trainer.save_checkpoint(model_file)

            # Always put the raw weights back. Restoring them only on improvement
            # would silently continue training from the EMA weights otherwise.
            for p in trainer.model.parameters:
                p.value = temp[p.uid]

            if not improved:
                epoch_stat['best_since'] += 1
                if epoch_stat['best_since'] > training_config['stop_after']:
                    return False

        if profiling:
            C.debugging.enable_profiler()

        return True

    if train_data_ext == '.ctf':
        mb_source, input_map = create_mb_and_map(loss, train_data_file, polymath)

        minibatch_size = training_config['minibatch_size'] # number of samples
        epoch_size = training_config['epoch_size']

        for epoch in range(max_epochs):
            num_seq = 0
            while True:
                # Once enough samples have been seen, read a partition of a
                # larger minibatch per worker.
                if trainer.total_number_of_samples_seen >= training_config['distributed_after']:
                    data = mb_source.next_minibatch(minibatch_size*C.Communicator.num_workers(), input_map=input_map, num_data_partitions=C.Communicator.num_workers(), partition_index=C.Communicator.rank())
                else:
                    data = mb_source.next_minibatch(minibatch_size, input_map=input_map)

                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                dummy.eval()  # update the EMA constants
                if num_seq >= epoch_size:
                    break
            if not post_epoch_work(epoch_stat):
                break
    else:
        if train_data_ext != '.tsv':
            raise Exception("Unsupported format")

        minibatch_seqs = training_config['minibatch_seqs'] # number of sequences

        for epoch in range(max_epochs):       # loop over epochs
            tsv_reader = create_tsv_reader(loss, train_data_file, polymath, minibatch_seqs, C.Communicator.num_workers())
            minibatch_count = 0
            for data in tsv_reader:
                # Minibatches are dealt out to the workers round-robin.
                if (minibatch_count % C.Communicator.num_workers()) == C.Communicator.rank():
                    trainer.train_minibatch(data) # update model with it
                    dummy.eval()  # update the EMA constants
                minibatch_count += 1
            if not post_epoch_work(epoch_stat):
                break

    if profiling:
        C.debugging.stop_profiler()
def test_ReduceSum(tmpdir):
    """Verify a constant-input ``reduce_sum`` over axis 0 on float32 data."""
    values = [
        [[5, 1], [20, 2]],
        [[30, 1], [40, 2]],
        [[55, 1], [60, 2]],
    ]
    data = np.array(values, dtype=np.float32)
    verify_no_input(C.reduce_sum(data, 0), tmpdir, 'ReduceSum_0')
def validate_model(test_data, model, polymath):
    """Evaluate ``model`` on ``test_data``, print span statistics and return the average loss.

    The first three outputs of ``model`` are read as the begin logits, the end
    logits and the loss. The predicted span is obtained from the gradient of
    ``symbolic_best_span`` with respect to its two inputs, and compared with
    the labelled span to accumulate F1, exact match, precision, recall,
    overlap and begin/end match counts.

    Args:
        test_data: validation data handed to ``create_mb_and_map``.
        model: combined function with outputs (begin logits, end logits, loss).
        polymath: model helper handed to ``create_mb_and_map``.

    Returns:
        The loss summed over all minibatches divided by the number of sequences.
    """
    begin_logits = model.outputs[0]
    end_logits = model.outputs[1]
    loss = model.outputs[2]
    root = C.as_composite(loss.owner)
    mb_source, input_map = create_mb_and_map(root, test_data, polymath, randomize=False, repeat=False)
    begin_label = argument_by_name(root, 'ab')
    end_label = argument_by_name(root, 'ae')

    begin_prediction = C.sequence.input_variable(1, sequence_axis=begin_label.dynamic_axes[1], needs_gradient=True)
    end_prediction = C.sequence.input_variable(1, sequence_axis=end_label.dynamic_axes[1], needs_gradient=True)

    best_span_score = symbolic_best_span(begin_prediction, end_prediction)
    # A running sum of (begin - previous end) is accumulated along the sequence
    # for both the predicted and the labelled span.
    predicted_span = C.layers.Recurrence(C.plus)(begin_prediction - C.sequence.past_value(end_prediction))
    true_span = C.layers.Recurrence(C.plus)(begin_label - C.sequence.past_value(end_label))
    common_span = C.element_min(predicted_span, true_span)
    begin_match = C.sequence.reduce_sum(C.element_min(begin_prediction, begin_label))
    end_match = C.sequence.reduce_sum(C.element_min(end_prediction, end_label))

    predicted_len = C.sequence.reduce_sum(predicted_span)
    true_len = C.sequence.reduce_sum(true_span)
    common_len = C.sequence.reduce_sum(common_span)
    f1 = 2*common_len/(predicted_len+true_len)
    exact_match = C.element_min(begin_match, end_match)
    precision = common_len/predicted_len
    recall = common_len/true_len
    overlap = C.greater(common_len, 0)

    # Sum each statistic over all axes so that one value per minibatch remains.
    s = lambda x: C.reduce_sum(x, axis=C.Axis.all_axes())
    stats = C.splice(s(f1), s(exact_match), s(precision), s(recall), s(overlap), s(begin_match), s(end_match))

    # Evaluation parameters
    minibatch_size = 20000
    num_sequences = 0

    stat_sum = 0
    loss_sum = 0

    while True:
        data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
        if not data or not (begin_label in data) or data[begin_label].num_sequences == 0:
            break
        out = model.eval(data, outputs=[begin_logits,end_logits,loss], as_numpy=False)
        testloss = out[loss]
        # The gradient of the best span score is fed back in as the prediction.
        g = best_span_score.grad({begin_prediction:out[begin_logits], end_prediction:out[end_logits]}, wrt=[begin_prediction,end_prediction], as_numpy=False)
        other_input_map = {begin_prediction: g[begin_prediction], end_prediction: g[end_prediction], begin_label: data[begin_label], end_label: data[end_label]}
        stat_sum += stats.eval((other_input_map))
        loss_sum += np.sum(testloss.asarray())
        num_sequences += data[begin_label].num_sequences

    stat_avg = stat_sum / num_sequences
    loss_avg = loss_sum / num_sequences

    # All averages are printed with four decimals; "{:4f}" would only set a
    # minimum width of four and print six decimals.
    print("Validated {} sequences, loss {:.4f}, F1 {:.4f}, EM {:.4f}, precision {:.4f}, recall {:.4f} hasOverlap {:.4f}, start_match {:.4f}, end_match {:.4f}".format(
            num_sequences,
            loss_avg,
            stat_avg[0],
            stat_avg[1],
            stat_avg[2],
            stat_avg[3],
            stat_avg[4],
            stat_avg[5],
            stat_avg[6]))

    return loss_avg
def test_sweep_based_schedule(tmpdir, device_id):
    """Check that a learning-rate schedule advances as minibatches are trained.

    A small CTF file with two sequences is written to ``tmpdir`` and read
    without randomization; the learner's learning rate is asserted after
    each trained minibatch.
    """
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk import cross_entropy_with_softmax, classification_error, plus, reduce_sum, sequence
    from cntk import Trainer

    input_dim = 69

    ctf_data = '''\
0 |S0 3:1 |S1 3:1 |# <s>
0 |S0 4:1 |# A |S1 32:1 |# ~AH
0 |S0 5:1 |# B |S1 36:1 |# ~B
0 |S0 4:1 |# A |S1 31:1 |# ~AE
0 |S0 7:1 |# D |S1 38:1 |# ~D
0 |S0 12:1 |# I |S1 47:1 |# ~IY
0 |S0 1:1 |# </s> |S1 1:1 |# </s>
2 |S0 60:1 |# <s> |S1 3:1 |# <s>
2 |S0 61:1 |# A |S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    stream_defs = StreamDefs(
        features = StreamDef(field='S0', shape=input_dim, is_sparse=True),
        labels = StreamDef(field='S1', shape=input_dim, is_sparse=True)
    )
    mbs = MinibatchSource(CTFDeserializer(ctf_file, stream_defs), randomize=False)

    in1 = sequence.input_variable(shape=(input_dim,))
    labels = sequence.input_variable(shape=(input_dim,))
    bias = parameter(shape=(input_dim,), init=10)
    z = plus(in1, reduce_sum(bias), name='z')
    ce = cross_entropy_with_softmax(z, labels)
    errs = classification_error(z, labels)

    lr_per_sample = learning_rate_schedule([0.3, 0.2, 0.1, 0.0], UnitType.sample)
    learner = sgd(z.parameters, lr_per_sample)
    trainer = Trainer(z, (ce, errs), [learner])

    input_map = {
        in1 : mbs.streams.features,
        labels : mbs.streams.labels
    }

    # (requested minibatch size, learning rate expected afterwards):
    #   1 -> first sequence
    #   1 -> second sequence, sweep ends at this point
    #   9 -> both sequences -- entire sweep in one go
    steps = [(1, 0.3), (1, 0.2), (9, 0.1)]
    for requested_size, expected_rate in steps:
        data = mbs.next_minibatch(requested_size, input_map=input_map)
        trainer.train_minibatch(data)
        assert learner.learning_rate() == expected_rate

    # fetch minibatch (multiple sweeps)
    data = mbs.next_minibatch(30, input_map=input_map)
    trainer.train_minibatch(data, outputs=[z.output])
    assert learner.learning_rate() == 0.0
def test_ReduceSum(tmpdir, dtype):
    """Verify a constant-input ``reduce_sum`` over axis 0 for the given dtype."""
    rows = (
        [[5, 1], [20, 2]],
        [[30, 1], [40, 2]],
        [[55, 1], [60, 2]],
    )
    with C.default_options(dtype=dtype):
        data = np.array([row for row in rows], dtype=dtype)
        model = C.reduce_sum(data, 0)
    verify_no_input(model, tmpdir, 'ReduceSum_0')
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True, proposal_layer_param_string=None):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         (image_width, image_height, image_scale) as CNTK variable or constant
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    def _rpn_conv(filter_shape, num_filters, **layer_options):
        # Every RPN convolution shares the same initialization:
        # init = 'normal', initValueScale = 0.01, initBias = 0.1
        return Convolution(filter_shape, num_filters,
                           init=normal(scale=0.01), init_bias=0.1,
                           **layer_options)

    # RPN network
    rpn_conv_3x3 = _rpn_conv((3, 3), 256, activation=relu, pad=True, strides=1)(conv_out)
    # 2(bg/fg) * 9(anchors)
    rpn_cls_score = _rpn_conv((1, 1), 18, activation=None, name="rpn_cls_score")(rpn_conv_3x3)
    # 4(coords) * 9(anchors)
    rpn_bbox_pred = _rpn_conv((1, 1), 36, activation=None, name="rpn_bbox_pred")(rpn_conv_3x3)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    score_shape = rpn_cls_score.shape
    num_predictions = int(np.prod(score_shape) / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score, (2, num_predictions))
    rpn_cls_prob = softmax(rpn_cls_score_rshp, axis=0, name="objness_softmax")
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, score_shape)

    # proposal layer
    proposal_layer = ProposalLayer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info,
                                   param_str=proposal_layer_param_string)
    rpn_rois = alias(user_function(proposal_layer), name='rpn_rois')

    if not add_loss_functions:
        return rpn_rois, None

    # RPN targets
    # Comment: rpn_cls_score is only passed to get width and height of the conv feature map ...
    anchor_targets = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info,
                                                     param_str=proposal_layer_param_string))
    rpn_labels = anchor_targets.outputs[0]
    rpn_bbox_targets = anchor_targets.outputs[1]
    rpn_bbox_inside_weights = anchor_targets.outputs[2]

    # For loss functions: ignore label predictions for the 'ignore label',
    # i.e. set target and prediction to 0 --> needs to be softmaxed before
    rpn_labels_rshp = reshape(rpn_labels, (1, num_predictions))
    ignore = user_function(IgnoreLabel(rpn_cls_prob, rpn_labels_rshp, ignore_label=-1))
    rpn_cls_prob_ignore = ignore.outputs[0]
    fg_targets = ignore.outputs[1]
    bg_targets = 1 - fg_targets
    rpn_labels_ignore = splice(bg_targets, fg_targets, axis=0)

    # RPN losses
    rpn_loss_cls = cross_entropy_with_softmax(rpn_cls_prob_ignore, rpn_labels_ignore, axis=0)
    rpn_loss_bbox = user_function(SmoothL1Loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights))
    rpn_losses = plus(reduce_sum(rpn_loss_cls), reduce_sum(rpn_loss_bbox), name="rpn_losses")

    return rpn_rois, rpn_losses
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True, proposal_layer_param_string=None, conv_bias_init=0.0):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         A CNTK variable or constant containing
                         (pad_width, pad_height, scaled_image_width, scaled_image_height, orig_img_width, orig_img_height)
                         e.g. (1000, 1000, 1000, 600, 500, 300) for an original image of 600x300 that is scaled and padded to 1000x1000
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.
        conv_bias_init:  Initial bias value used for the three RPN convolutions.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    def _rpn_conv(filter_shape, num_filters, **layer_options):
        # Every RPN convolution shares the weight initializer and the bias init.
        return Convolution(filter_shape, num_filters,
                           init=normal(scale=0.01), init_bias=conv_bias_init,
                           **layer_options)

    # RPN network
    num_channels = cfg["CNTK"].RPN_NUM_CHANNELS
    rpn_conv_3x3 = _rpn_conv((3, 3), num_channels, activation=relu, pad=True, strides=1)(conv_out)
    # 2(bg/fg) * 9(anchors)
    rpn_cls_score = _rpn_conv((1, 1), 18, activation=None, name="rpn_cls_score")(rpn_conv_3x3)
    # 4(coords) * 9(anchors)
    rpn_bbox_pred = _rpn_conv((1, 1), 36, activation=None, name="rpn_bbox_pred")(rpn_conv_3x3)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    score_shape = rpn_cls_score.shape
    num_predictions = int(score_shape[0] / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score,
                                 (2, num_predictions, score_shape[1], score_shape[2]),
                                 name="rpn_cls_score_rshp")
    # The softmax is wrapped into a named block.
    softmax_input = cntk.placeholder()
    rpn_cls_sm = softmax(softmax_input, axis=0)
    rpn_cls_prob = cntk.as_block(rpn_cls_sm, [(softmax_input, rpn_cls_score_rshp)],
                                 'Softmax', 'rpn_cls_prob')
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, score_shape, name="rpn_cls_prob_reshape")

    # proposal layer
    proposal_layer = ProposalLayer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info,
                                   param_str=proposal_layer_param_string)
    rpn_rois = alias(user_function(proposal_layer), name='rpn_rois')

    if not add_loss_functions:
        return rpn_rois, None

    # RPN targets
    # Comment: rpn_cls_score is only passed to get width and height of the conv feature map ...
    anchor_targets = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info,
                                                     param_str=proposal_layer_param_string))
    rpn_labels = anchor_targets.outputs[0]
    rpn_bbox_targets = anchor_targets.outputs[1]
    rpn_bbox_inside_weights = anchor_targets.outputs[2]

    # classification loss
    labels_in = cntk.placeholder()
    scores_in = cntk.placeholder()

    # Labels below zero are ignored: they are zeroed in the targets and in the loss.
    keeps = cntk.greater_equal(labels_in, 0.0)
    fg_labels = element_times(labels_in, keeps, name="fg_targets")
    bg_labels = minus(1, fg_labels, name="bg_targets")
    rpn_labels_ignore = splice(bg_labels, fg_labels, axis=0)
    rpn_ce = cross_entropy_with_softmax(scores_in, rpn_labels_ignore, axis=0)
    rpn_loss_cls = element_times(rpn_ce, keeps)

    # The terms that are accounted for in the cls loss are those that have a label >= 0
    cls_num_terms = reduce_sum(keeps)
    cls_normalization_factor = 1.0 / cls_num_terms
    normalized_rpn_cls_loss = reduce_sum(rpn_loss_cls) * cls_normalization_factor

    reduced_rpn_loss_cls = cntk.as_block(
        normalized_rpn_cls_loss,
        [(labels_in, rpn_labels), (scores_in, rpn_cls_score_rshp)],
        'CE_with_ignore', 'norm_rpn_cls_loss')

    # regression loss
    bbox_pred_in = cntk.placeholder()
    bbox_targets_in = cntk.placeholder()
    bbox_weights_in = cntk.placeholder()
    rpn_loss_bbox = SmoothL1Loss(cfg["CNTK"].SIGMA_RPN_L1, bbox_pred_in, bbox_targets_in,
                                 bbox_weights_in, 1.0)

    # The bbox loss is normalized by the rpn batch size
    bbox_normalization_factor = 1.0 / cfg["TRAIN"].RPN_BATCHSIZE
    normalized_rpn_bbox_loss = reduce_sum(rpn_loss_bbox) * bbox_normalization_factor

    reduced_rpn_loss_bbox = cntk.as_block(
        normalized_rpn_bbox_loss,
        [(bbox_pred_in, rpn_bbox_pred),
         (bbox_targets_in, rpn_bbox_targets),
         (bbox_weights_in, rpn_bbox_inside_weights)],
        'SmoothL1Loss', 'norm_rpn_bbox_loss')

    rpn_losses = plus(reduced_rpn_loss_cls, reduced_rpn_loss_bbox, name="rpn_losses")

    return rpn_rois, rpn_losses