def test_slice_with_inferred_static_axis():
    """Check that splicing along axis 0 keeps inferred static axes in the result.

    A zero constant of shape ``(3, InferredDimension, 3)`` is spliced once in
    front of and once behind an input whose first two axes are inferred. In
    both orders the spliced output is expected to report shape ``(-1, -1, 3)``.
    """
    inferred = C.InferredDimension
    expected_shape = (-1, -1, 3)

    x = C.input_variable(shape=(inferred, inferred, 3))
    padding_shape = (3, inferred, 3)

    # Padding first, input second.
    front_padded = C.splice(
        C.constant(value=0, shape=padding_shape), x, axis=0)
    assert front_padded.shape == expected_shape

    # Input first, padding second.
    back_padded = C.splice(
        x, C.constant(value=0, shape=padding_shape), axis=0)
    assert back_padded.shape == expected_shape
def test_udf_input_values_no_sharing():
    """Evaluate ``grad`` on a graph that splices a user-function output repeatedly.

    The user function ``MyArgumentPreservingPlus(i + 1, i + 2)`` plus a
    parameter is spliced with itself three times along axis 0, giving eight
    elements. With ``i = 2`` every element of the forward result is expected
    to equal 8.
    """
    i = C.input_variable(1, needs_gradient=True, name='i_var')
    udf = C.user_function(MyArgumentPreservingPlus(i + 1, i + 2))
    w = C.parameter(shape=(1,), init=1)

    # Each pass doubles the width: 1 -> 2 -> 4 -> 8 elements.
    node = udf + w
    for _ in range(3):
        node = C.splice(node, node, axis=0)

    feed = {i: np.asarray([2], dtype=np.float32)}
    grad_value, result = node.grad(feed, outputs=[node], wrt=[w, i])

    assert np.array_equal(result, [[8] * 8])
def lstm_with_attention(dh, dc, x):
    """Run one recurrent step on the input augmented with an attention vector.

    Args:
        dh: Decoder hidden state; passed to ``attention_model`` and ``rec_block``.
        dc: Cell state, forwarded unchanged to ``rec_block``.
        x: Current input; the attention output is spliced onto it.

    Returns:
        The result of ``rec_block(dh, dc, x)`` evaluated on the augmented input.
    """
    # attention_model arguments: encoder hidden state, decoder hidden state
    h_att = attention_model(encode_out.outputs[0], dh)
    x = C.splice(x, h_att)
    return rec_block(dh, dc, x)
def test_op_splice(input_data1, input_data2, axis, expected_result, device_id, precision):
    """Forward and backward check of ``C.splice`` on two inputs.

    The forward result is compared against ``expected_result``; the gradient
    with respect to each input is expected to be all ones in that input's
    shape.
    """
    # Two surrounding brackets are needed around the data:
    # the first for sequences (length=1, since we have dynamic_axis=''),
    # the second for a batch of one sample.
    a = I([input_data1])
    b = I([input_data2])

    # splice using the operator
    spliced = C.splice((a, b), axis)

    # Forward pass test
    unittest_helper(spliced, None, [[expected_result]],
                    device_id=device_id, precision=precision,
                    clean_up=True, backward_pass=False)

    # Backward pass test: the gradient of the splice operator is all ones
    # in the shape of the respective input.
    expected_gradients = [np.ones_like(np.asarray(data))
                          for data in (input_data1, input_data2)]
    for node, expected_gradient in zip((a, b), expected_gradients):
        unittest_helper(spliced, None, [[expected_gradient]],
                        device_id=device_id, precision=precision,
                        clean_up=True, backward_pass=True, input_node=node)
def test_splice(shape1, shape2):
    """Forward/backward check of ``C.splice`` for inputs of the given shapes.

    NOTE(review): apart from ``shape1`` and ``shape2`` this reads
    ``precision``, ``input_data1``, ``input_data2``, ``axis``,
    ``expected_result`` and ``device_id`` from the enclosing scope.
    """
    def make_input(shape, name):
        # Gradient-enabled input variable in the precision under test.
        return C.input_variable(
            shape=shape,
            dtype=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
            needs_gradient=True,
            name=name)

    a = make_input(shape1, 'a')
    b = make_input(shape2, 'b')

    # create batch: prepend an axis of size one, in place
    for data in (input_data1, input_data2):
        data.shape = (1,) + data.shape

    # splice using the operator
    root_op = C.splice(a, b, axis=axis, name='splice_ab')

    forward_input = {a: input_data1, b: input_data2}
    expected_forward = [expected_result]

    # The gradient of the splice operator is all ones in the shape of the input.
    expected_backward = {
        a: np.ones_like(np.asarray(input_data1)),
        b: np.ones_like(np.asarray(input_data2)),
    }

    unittest_helper(root_op, forward_input, expected_forward, expected_backward,
                    device_id=device_id, precision=precision)
def create_model(input_dim):
    """Build the two-stream (row/col) model and its input and label variables.

    Both inputs are embedded, spliced along the last axis, passed through
    ``opt.layer`` ``lightlstm`` layers, then split back into two halves that
    each feed a dropout + dense prediction head.

    Returns:
        dict with keys ``'row'``, ``'col'``, ``'row_label'``, ``'col_label'``
        and ``'model'`` (the combined row/col predictions).
    """
    def embed(tokens):
        return Sequential([Embedding(opt.embed), Stabilizer(), Dropout(opt.dropout)])(tokens)

    def predict(features):
        return Sequential([Dropout(opt.dropout), Dense(input_dim)])(features)

    def half(tensor, k):
        # k-th chunk of width opt.nhid along the last axis
        return C.slice(tensor, -1, opt.nhid * k, opt.nhid * (k + 1))

    row = sequence.input_variable(shape=input_dim)
    col = sequence.input_variable(shape=input_dim)

    hidden = C.splice(embed(row), embed(col), axis=-1)
    hidden = lightlstm(opt.embed, opt.nhid)(hidden)
    hidden = For(range(opt.layer-1), lambda: lightlstm(opt.nhid, opt.nhid))(hidden)

    row_hidden, col_hidden = half(hidden, 0), half(hidden, 1)
    row_predict = predict(row_hidden)
    col_predict = predict(col_hidden)

    # variable : row label and col label
    row_label = sequence.input_variable(shape=input_dim)
    col_label = sequence.input_variable(shape=input_dim)

    return {
        'row': row,
        'col': col,
        'row_label': row_label,
        'col_label': col_label,
        'model': C.combine([row_predict, col_predict]),
    }
def input_layer(self,cgw,cnw,cc,qgw,qnw,qc):
    """Build the shared input encoder and apply it to context and query.

    A single network (char CNN + word embedding -> splice -> two RNN stacks)
    is built on inner placeholders, then cloned with shared parameters for
    the query and for the context. The two clones are combined and wrapped
    in a block named ``'input_layer'`` whose outputs are
    ``(c_processed, q_processed)``.
    """
    # Outer placeholders; as_block binds them to the actual arguments.
    cgw_ph, cnw_ph, cc_ph = C.placeholder(), C.placeholder(), C.placeholder()
    qgw_ph, qnw_ph, qc_ph = C.placeholder(), C.placeholder(), C.placeholder()

    # Inner placeholders of the shared network.
    input_chars = C.placeholder(shape=(1,self.word_size,self.c_dim))
    input_glove_words = C.placeholder(shape=(self.wg_dim,))
    input_nonglove_words = C.placeholder(shape=(self.wn_dim,))

    # we need to reshape because GlobalMaxPooling/reduce_max is retaining a trailing singleton dimension
    # todo GlobalPooling/reduce_max should have a keepdims default to False
    char_features = C.reshape(self.charcnn(input_chars), self.convs)
    word_features = self.embed()(input_glove_words, input_nonglove_words)
    embedded = C.splice(char_features, word_features, name='splice_embed')

    rnn_stack = C.layers.Sequential([
        For(range(2), lambda: OptimizedRnnStack(
            self.hidden_dim, bidirectional=True,
            use_cudnn=self.use_cudnn, name='input_rnn'))
    ])
    processed = rnn_stack(embedded)

    # Character ids are one-hot encoded before entering the char CNN.
    qce = C.one_hot(qc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
    cce = C.one_hot(cc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)

    query_substitutions = {
        input_chars: qce,
        input_glove_words: qgw_ph,
        input_nonglove_words: qnw_ph,
    }
    context_substitutions = {
        input_chars: cce,
        input_glove_words: cgw_ph,
        input_nonglove_words: cnw_ph,
    }
    q_processed = processed.clone(C.CloneMethod.share, query_substitutions)
    c_processed = processed.clone(C.CloneMethod.share, context_substitutions)

    bindings = [
        (cgw_ph, cgw), (cnw_ph, cnw), (cc_ph, cc),
        (qgw_ph, qgw), (qnw_ph, qnw), (qc_ph, qc),
    ]
    return C.as_block(
        C.combine([c_processed, q_processed]),
        bindings,
        'input_layer',
        'input_layer')
def test_sequence_step_function_scalar_shape_inferrence():
    """Compare an LSTM run per sequence with one long run that resets its state.

    Three sequences (lengths 2, 3, 5) are evaluated separately with a plain
    recurrence, and once more concatenated into a single sequence with a
    reset flag that multiplies the previous state by 0 at each sequence
    boundary. Both evaluations must produce the same values.
    """
    hidden_dim = 3
    in_dim = 5

    x = C.sequence.input_variable((in_dim,))
    r = C.sequence.input_variable((1,))  # value of 0/1. 0 means reset
    # Recurrence only takes 1 input, so concatenate the two
    merged_x = C.splice(x, r)

    cell = C.layers.LSTM(hidden_dim)  # (dh, dc, x) -> (h, c)
    y = C.layers.Recurrence(cell)(x)

    @C.Function
    def lstm_with_reset(dh, dc, xr):
        # The last element of xr is the reset flag, the rest is the input.
        features = xr[0:-1]
        keep = xr[-1]
        return cell(keep * dh, keep * dc, features)

    yr = C.layers.Recurrence(lstm_with_reset)(merged_x)

    seq_len = [2,3,5]
    total_len = np.sum(seq_len)
    accum_seq_len = np.cumsum(seq_len)
    boundaries = accum_seq_len[0:-1]

    x_total_data = np.random.rand(1, total_len, in_dim).astype(np.float32)
    x_data = [np.squeeze(piece)
              for piece in np.split(x_total_data, boundaries, axis=1)]

    # Flag is 1 everywhere except at the first step of each later sequence.
    r_data = np.ones(accum_seq_len[-1])
    r_data[boundaries] = 0
    r_data = np.reshape(r_data, (-1,1)).astype(np.float32)

    v1 = y.eval(x_data)
    v2 = yr.eval({x:x_total_data, r:r_data})

    assert np.allclose(np.concatenate(v1), v2[0])
def lightlstm(input_dim, cell_dim):
    """Build a recurrent function that runs one shared LSTM cell over two input halves.

    The input placeholder is split along its last axis into two chunks of
    width ``input_dim``. The cell (one set of parameters ``b``, ``W``, ``H``)
    is applied to the first chunk with the recurrent state, then to the
    second chunk with the state produced by the first step. The state after
    the second step is fed back through ``past_value``.

    Returns:
        The splice (last axis) of the hidden outputs of the two steps.
    """
    x = C.placeholder(name='x')
    dh = C.placeholder(name='dh')
    dc = C.placeholder(name='dc')

    first_half = C.slice(x, -1, input_dim * 0, input_dim * 1)
    second_half = C.slice(x, -1, input_dim * 1, input_dim * 2)

    # Parameters shared by both steps of the cell.
    b = C.parameter(shape=(4 * cell_dim), init=0)
    W = C.parameter(shape=(input_dim, 4 * cell_dim), init=glorot_uniform())
    H = C.parameter(shape=(cell_dim, 4 * cell_dim), init=glorot_uniform())

    def gate_slice(proj, k):
        # k-th block of width cell_dim of the stacked projection
        return C.slice(proj, -1, k * cell_dim, (k + 1) * cell_dim)

    def step(inp, prev_h, prev_c):
        # projected contribution from input, hidden, and bias
        proj4 = b + C.times(inp, W) + C.times(prev_h, H)
        it_proj, bit_proj, ft_proj, ot_proj = [gate_slice(proj4, k) for k in range(4)]

        it = C.sigmoid(it_proj)        # input gate
        bit = it * C.tanh(bit_proj)
        ft = C.sigmoid(ft_proj)        # forget gate
        bft = ft * prev_c
        ct = bft + bit
        ot = C.sigmoid(ot_proj)        # output gate
        ht = ot * C.tanh(ct)
        return ht, ct

    ht, ct = step(first_half, dh, dc)
    ht2, ct2 = step(second_half, ht, ct)

    # Close the recurrence: the state after the second step becomes the
    # previous state of the next time step.
    actual_dh = past_value(ht2)
    actual_dc = past_value(ct2)
    ht.replace_placeholders({dh: actual_dh.output, dc: actual_dc.output})

    return C.splice(ht, ht2, axis=-1)
def create_model():
    """Build embedding -> bidirectional LSTM -> dense classifier on a placeholder.

    Reads ``emb_dim``, ``hidden_dim`` and ``num_labels`` from the enclosing
    scope. All layers are created under ``default_options(initial_state=0.1)``.

    Returns:
        The output of the ``'classify'`` dense layer.
    """
    x = C.placeholder()
    with C.layers.default_options(initial_state=0.1):
        embedded = C.layers.Embedding(emb_dim, name='embed')(x)

        # The backward recurrence is created before the forward one.
        backward = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=True)(embedded)
        forward = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(embedded)

        joined = C.splice(forward, backward)
        return C.layers.Dense(num_labels, name='classify')(joined)
def test_Concat(tmpdir):
    """Verify a splice model through ``verify_no_input`` and ``verify_one_input``.

    The two operands have shapes (1, 2, 2) and (1, 3, 2) and are spliced
    along axis 1, first with both operands constant and then with the first
    operand replaced by an input variable.
    """
    # 2x2 matrix with a leading axis of size one
    data1 = np.asarray([[[1, 2],
                         [4, 5]]], dtype=np.float32)
    # 3x2 matrix with a leading axis of size one
    data2 = np.asarray([[[10, 20],
                         [30, 40],
                         [50, 60]]], dtype=np.float32)

    y = C.constant(value=data2)

    # Both operands constant: splicing on axis=1 stacks the 2 and 3 rows.
    const_x = C.constant(value=data1)
    verify_no_input(C.splice(const_x, y, axis=1), tmpdir, 'Concat_0')

    # First operand as a real input.
    input_x = C.input_variable(data1.shape)
    verify_one_input(C.splice(input_x, y, axis=1), data1, tmpdir, 'Concat__1')
def out_func1(att_input, enc_input):
    """Run a gated, attention-matched GRU over ``enc_input`` in both directions.

    At every step the matching model attends over ``att_input`` using the
    previous hidden state; the result is spliced onto the step input, gated
    element-wise by ``sigmoid(times(x, Wg))`` and fed to ``att_gru``.

    Returns:
        The splice of the forward and backward recurrence outputs, named
        ``"bigru_with_match"``.
    """
    @C.Function
    def bigru_with_match(dh, x):
        matched = matching_model(att_input, dh)
        augmented = C.splice(x, matched)
        gated = C.element_times(augmented, C.sigmoid(C.times(augmented, Wg)))
        return att_gru(dh, gated)

    forward = C.layers.Recurrence(bigru_with_match)(enc_input)
    backward = C.layers.Recurrence(bigru_with_match, go_backwards=True)(enc_input)
    return C.splice(forward, backward, name="bigru_with_match")
def test_clone_with_slice():
    """Clone a splice + convolution graph onto placeholders and re-apply it.

    The original graph works on two (2, 2) inputs and yields shape (4, 2);
    after cloning with the inputs replaced by placeholders and applying the
    clone to two (2, 1) inputs the expected shape is (4, 1).
    """
    i1 = C.input_variable((2,2), name='i1')
    i2 = C.input_variable((2,2), name='i2')
    stacked = C.splice(i1, i2, axis=0)
    W = C.constant(1, (4,1), name='W')
    y = C.convolution(W, stacked)
    assert y.shape == (4,2)

    from ..functions import CloneMethod

    x1 = C.input_variable((2,1), name='x1')
    x2 = C.input_variable((2,1), name='x2')
    p1 = C.placeholder()
    p2 = C.placeholder()

    # Swap the original inputs for placeholders, then bind the new inputs.
    y_cloned = y.clone('clone', {i1:p1, i2:p2})
    y2 = y_cloned(x1, x2)
    assert y2.shape == (4,1)
def cntk_baseline_lstm():
    """Build a bidirectional LSTM in CNTK and save its values through crosstalk.

    The forward/backward outputs are spliced, registered with the crosstalk
    instance under ``'birnn'`` (as an RNN attribute) and ``'birnn_out'``, and
    both are fetched with ``save=True`` before the instance is reset.
    Reads ``in_dim``, ``dim``, ``data_cntk``, ``workdir`` and ``cstk`` from
    the enclosing scope.
    """
    import cntk as C
    import cntk.contrib.crosstalk.crosstalk_cntk as crct
    ci = crct.instance

    input_var = C.sequence.input_variable(shape=(in_dim))
    forward = C.layers.Recurrence(
        C.layers.LSTM(dim, init_bias=C.glorot_uniform()))(input_var)
    backward = C.layers.Recurrence(
        C.layers.LSTM(dim), go_backwards=True)(input_var)
    fwbw = C.splice(forward, backward)

    rnn_attr = cstk.RnnAttr(bidirectional=True, op_type='lstm',
                            input_dim=in_dim, hidden_dim=dim, forget_bias=0)
    ci.watch(fwbw, 'birnn', var_type=cstk.RnnAttr, attr=rnn_attr)
    ci.watch(fwbw, 'birnn_out')

    ci.set_data({input_var:data_cntk})
    ci.set_workdir(workdir)
    for watched in ('birnn', 'birnn_out'):
        ci.fetch(watched, save=True)
    ci.reset()
def test_op_splice(input_data1, input_data2, axis, expected_result, device_id, precision):
    """Forward/backward check of ``C.splice`` on two gradient-enabled inputs.

    The gradient with respect to each input is expected to be all ones in
    that input's (batched) shape.
    """
    # FIXME This test currently fails in C++ with
    # RuntimeError: Node 'splice_ab' (RowStack operation): Attempted to
    # type-cast node to struct Microsoft::MSR::CNTK::INumInputs, which is not
    # possible.
    np_dtype = PRECISION_TO_TYPE[precision]
    input_data1 = AA(input_data1, dtype=np_dtype)
    input_data2 = AA(input_data2, dtype=np_dtype)

    def make_input(data, name):
        return I(
            shape=data.shape,
            data_type=sanitize_dtype_cntk(np_dtype),
            needs_gradient=True,
            name=name,
        )

    a = make_input(input_data1, "a")
    b = make_input(input_data2, "b")

    # create batch: prepend two axes of size one, in place
    for data in (input_data1, input_data2):
        data.shape = (1, 1) + data.shape

    # splice using the operator
    root_op = C.splice((a, b), axis, name="splice_ab")

    forward_input = {a: input_data1, b: input_data2}
    expected_forward = [[expected_result]]

    # The gradient of the splice operator is all ones in the shape of the input.
    expected_backward = {
        a: np.ones_like(np.asarray(input_data1)),
        b: np.ones_like(np.asarray(input_data2)),
    }

    unittest_helper(
        root_op, forward_input, expected_forward, expected_backward,
        device_id=device_id, precision=precision
    )
def gate_attention_layer(self, inputs, memory, common_len, att_kind='simi'):
    """Attend over ``memory`` and gate the spliced [inputs; attention] vector.

    Args:
        inputs: Tensor attended from; truncated to its first ``common_len`` entries.
        memory: Tensor attended over.
        common_len: Truncation length; also sets the gate width (``2 * common_len``).
        att_kind: ``'dot'`` selects ``self.dot_attention``; any other value
            selects ``self.simi_attention``.

    Returns:
        ``(gate * spliced, attn_weight)``.
    """
    # original shape notes: [#,c][2*d] [#,c][*=q,1]
    attention = (self.dot_attention(inputs, memory, common_len)
                 if att_kind == 'dot'
                 else self.simi_attention(inputs, memory))
    qc_attn, attn_weight = attention.outputs

    inputs = inputs[:common_len]
    qc_attn = qc_attn[:common_len]
    cont_attn = C.splice(inputs, qc_attn)  # [#,c][4*d]

    gate_fn = (Dropout(self.dropout)
               >> Dense(2 * common_len, activation=C.sigmoid, input_rank=1)
               >> Label('gate'))
    gate = gate_fn(cont_attn)  # [#, c][4*d]

    return gate * cont_attn, attn_weight
def greedy_model(aawk, aawn, qqwk, qqwn):
    """Greedy decoder: unfold ``s2smodel`` from the first answer token.

    The two answer inputs are spliced and their first sequence element is
    used as the start state. Each unfold step splits the history into its
    two word parts, embeds them, runs ``s2smodel`` against the embedded
    query and emits the hardmax of the logits. Unfolding stops on the
    ``sentence_end_index`` entry.

    Returns:
        The unfolded sequence of hardmax outputs.
    """
    answer_onehot = C.splice(aawk, aawn)
    sentence_start = C.sequence.slice(answer_onehot, 0, 1)

    @C.Function
    def process_history(hist, inp):
        split_at = myConfig['wg_dim']
        known = C.slice(hist, 0, 0, split_at)
        unknown = hist[myConfig['wg_dim']:]
        embedded_history = embed_layer(known, unknown)
        logits = s2smodel(embedded_history, inp)
        return C.reshape(C.hardmax(logits), (-1, ))

    q_processed = embed_layer(qqwk, qqwn)

    unfold = UnfoldFrom(
        lambda history: process_history(history, q_processed),
        until_predicate=lambda w: w[..., sentence_end_index],
        length_increase=1.5)

    return unfold(sentence_start, q_processed)
def multi_head_attention(self, contextQ, contextV, contextK, name):
    """Six scaled dot-product attention heads, spliced, plus a residual on Q.

    Args:
        contextQ, contextV, contextK: Tensors bound to the Q, V and K placeholders.
        name: Suffix appended to ``'multi_head_attention_layer'`` for the block name.

    Returns:
        A block computing ``splice(head_0 .. head_5) + Q``.
    """
    def make_placeholder():
        return C.placeholder(shape=(2*self.hidden_dim,),
                             dynamic_axes=[self.b_axis, self.q_axis])

    Q = make_placeholder()
    V = make_placeholder()
    K = make_placeholder()

    # Heads are tagged '0' .. '5'.
    heads = [self.scale_dot_product_attention_block(Q, V, K, str(idx))
             for idx in range(6)]
    att_residual = C.splice(*heads) + Q

    block_name = 'multi_head_attention_layer' + name
    return C.as_block(
        att_residual,
        [(Q, contextQ), (V, contextV), (K, contextK)],
        block_name,
        block_name)
def attention_layer(self, context, query):
    """Bidirectional (context-to-query and query-to-context) attention block.

    Returns:
        A block named ``'attention_layer'`` whose output is the splice of
        ``c``, ``c2q``, ``c * c2q`` and ``c * q2c``, where ``c`` is the
        processed context.
    """
    width = 2 * self.hidden_dim
    q_processed = C.placeholder(shape=(width,))
    c_processed = C.placeholder(shape=(width,))

    # convert query's sequence axis to static
    qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

    # The similarity is W * [h; u; h.*u] (a 6 * dim vector in the paper).
    # It is split in three parts because they take part in very different
    # operations: w1 * h + w2 * u + w3 * (h.*u).
    ws1 = C.parameter(shape=(width, 1), init=C.glorot_uniform())
    ws2 = C.parameter(shape=(width, 1), init=C.glorot_uniform())
    ws3 = C.parameter(shape=(1, width), init=C.glorot_uniform())
    att_bias = C.parameter(shape=(), init=0)

    wh = C.times(c_processed, ws1)
    wu = C.reshape(C.times(qvw, ws2), (-1,))
    scaled_query = C.sequence.broadcast_as(qvw * ws3, c_processed)
    whu = C.reshape(C.reduce_sum(c_processed * scaled_query, axis=1), (-1,))
    S = wh + whu + C.sequence.broadcast_as(wu, c_processed) + att_bias

    # mask out values outside of Query, and fill in gaps with -1e+30 as
    # neutral value for both reduce_log_sum_exp and reduce_max
    mask = C.sequence.broadcast_as(qvw_mask, c_processed)
    S = C.element_select(mask, S, C.constant(-1e+30))

    # context-to-query attention
    q_attn = C.reshape(C.softmax(S), (-1,1))
    weighted_query = C.sequence.broadcast_as(qvw, q_attn) * q_attn
    c2q = C.reshape(C.reduce_sum(weighted_query, axis=0), (-1))

    # query-to-context attention
    max_col = C.reduce_max(S)
    c_attn = C.sequence.softmax(max_col)
    htilde = C.sequence.reduce_sum(c_processed * c_attn)
    q2c = C.sequence.broadcast_as(htilde, c_processed)
    q2c_out = c_processed * q2c

    att_context = C.splice(c_processed, c2q, c_processed * c2q, q2c_out)

    return C.as_block(
        att_context,
        [(c_processed, context), (q_processed, query)],
        'attention_layer',
        'attention_layer')
def bi_sru_layer(self, sru_1, index):
    """Compute the forward and backward recurrences of one bidirectional SRU layer.

    ``sru_1`` is read as eight consecutive chunks of width ``self.param2``:
    chunks 0-3 drive the forward direction and chunks 4-7 the backward one.
    Biases come from ``self.list_bias`` at offsets ``index * 4 + 0..3``.

    Returns:
        The splice of the forward and backward hidden outputs.
    """
    width = self.param2
    bias_base = index * 4

    def chunk(k):
        return sru_1[k * width : (k + 1) * width]

    def direction(first_chunk, first_bias, prefix, **delay_kwargs):
        # Four consecutive chunks: forget gate, reset gate, candidate, highway input.
        forget = C.sigmoid(chunk(first_chunk) + self.list_bias[first_bias + bias_base])
        reset = C.sigmoid(chunk(first_chunk + 1) + self.list_bias[first_bias + 1 + bias_base])
        candidate = (1 - forget) * chunk(first_chunk + 2)

        # The cell state depends on its own delayed value.
        declared = C.layers.ForwardDeclaration(prefix + str(index))
        delayed = C.sequence.delay(declared, **delay_kwargs)
        cell = delayed * forget + candidate
        declared.resolve_to(cell)

        return reset * C.tanh(cell) + (1 - reset) * chunk(first_chunk + 3)

    h_forward = direction(0, 0, 'f_', initial_state=0, time_step=1)
    h_backward = direction(4, 2, 'b_', time_step=-1)

    return C.splice(h_forward, h_backward)
def test_splice(shape1, shape2):
    """Run the splice forward/backward check for two input shapes.

    NOTE(review): ``precision``, ``input_data1``, ``input_data2``, ``axis``,
    ``expected_result`` and ``device_id`` are not parameters; they are taken
    from the enclosing scope.
    """
    variables = {}
    for var_name, var_shape in (('a', shape1), ('b', shape2)):
        variables[var_name] = C.input_variable(
            shape=var_shape,
            dtype=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
            needs_gradient=True,
            name=var_name)
    a, b = variables['a'], variables['b']

    # create batch (shape is modified in place)
    input_data1.shape = (1, ) + input_data1.shape
    input_data2.shape = (1, ) + input_data2.shape

    # splice using the operator
    root_op = C.splice(a, b, axis=axis, name='splice_ab')

    # Backward pass: the gradient of the splice operator is all ones in the
    # shape of each input.
    ones_like_1 = np.ones_like(np.asarray(input_data1))
    ones_like_2 = np.ones_like(np.asarray(input_data2))

    unittest_helper(
        root_op,
        {a: input_data1, b: input_data2},
        [expected_result],
        {a: ones_like_1, b: ones_like_2},
        device_id=device_id,
        precision=precision)
def bilateral_slice(im, guide, guide_no_grad):
    """Slice the coefficient grid with trilinear weights and apply it to ``im``.

    For each output channel the eight floor/ceil corners of the enclosing
    grid cell are gathered from the flattened grid, weighted and summed. The
    first ``i_chans`` coefficient planes multiply the scaled image and the
    last plane is added as an offset; the result is reduced over axis 0.
    Grid, scales and sizes are read from the enclosing scope.

    Returns:
        The per-channel outputs spliced along axis 0.
    """
    # Flatten data for gather op
    flat_grid = grid_scale*C.reshape(grid, [grid_sz*grid_sz*sigma_r*o_chans*(i_chans+1)])

    # Make sure we do sth that requires the gradient w.r.t guide
    scaled_guide = guide_scale*guide
    gx_d, gy_d, gz_d, fx_d, fy_d, fz_d, _, _, _ = grid_coord(
        scaled_guide, xx, yy, sz, grid_sz, sigma_r)
    wx = C.abs(gx_d - 0.5 - fx_d)
    wy = C.abs(gy_d - 0.5 - fy_d)
    wz = C.abs(gz_d - 0.5 - fz_d)

    # Enclosing cell
    gx, gy, gz, fx, fy, fz, cx, cy, cz = grid_coord(
        guide_no_grad, xx, yy, sz, grid_sz, sigma_r)

    out_chans = []
    for chan in range(o_chans):
        corners = []
        # The floor corner is weighted by (1 - w), the ceil corner by w.
        for x, wx_ in ((fx, 1-wx), (cx, wx)):
            for y, wy_ in ((fy, 1-wy), (cy, wy)):
                for z, wz_ in ((fz, 1-wz), (cz, wz)):
                    linear_idx = x + grid_sz*(y + grid_sz*(z + sigma_r*(cc + chan*(i_chans+1))))
                    flat_linear_idx = C.reshape(linear_idx, [(i_chans+1)*sz*sz])

                    # Slice
                    interp = C.gather(flat_grid, flat_linear_idx)
                    corners.append(
                        C.reshape(interp, [i_chans+1, sz, sz])*wx_*wy_*wz_)

        out_coeffs = sum(corners)
        out_chans.append(
            C.reduce_sum(out_coeffs[:i_chans]*(im_scale*im) + out_coeffs[-1], 0))

    return C.splice(*out_chans, axis=0)
def multiHead(self, context, query, outdim, head=4):
    """Multi-head dot attention of ``context`` over ``query``.

    Each head projects query and context through its own bias-free ReLU dense
    layer of width ``outdim`` and runs ``self.dot_attention`` on the
    projections; the first output of every head is spliced.

    Returns:
        A block named ``'multiHead'``.
    """
    cph = C.placeholder()
    qph = C.placeholder()

    def project(name_template, idx, source):
        layer = C.layers.Dense(
            outdim, activation=C.relu, init=xavier(1.377), bias=False,
            input_rank=1, name=name_template.format(idx))
        return layer(source)

    atts = []
    for idx in range(head):
        # The query projection is created before the context projection.
        projected_q = project('headq_{}', idx, qph)
        projected_c = project('headc_{}', idx, cph)
        attended = self.dot_attention(projected_c, projected_q, outdim).outputs[0:2]
        atts.append(attended[0])

    return C.as_block(
        C.splice(*atts),
        [(cph, context), (qph, query)],
        'multiHead',
        'multiHead')
def create_model(input, num_classes):
    """Build a ResNet-style FCN: three residual stages, upsampled and merged.

    The outputs of the three stages (16, 32 and 64 feature maps) are each
    passed through ``OneByOneConvAndUpSample`` (levels 0, 1, 2), spliced along
    axis 0 in the order stage 1, stage 3, stage 2, and reduced to
    ``num_classes`` maps by a sigmoid 1x1 convolution.
    """
    maps_1, maps_2, maps_3 = 16, 32, 64
    num_stack_layers = 3

    stem = conv_bn_relu(input, (3,3), maps_1)
    stage_1 = resnet_basic_stack(stem, num_stack_layers, maps_1)

    stage_2 = resnet_basic_inc(stage_1, maps_2)
    stage_2 = resnet_basic_stack(stage_2, num_stack_layers-1, maps_2)

    stage_3 = resnet_basic_inc(stage_2, maps_3)
    stage_3 = resnet_basic_stack(stage_3, num_stack_layers-1, maps_3)

    upsampled = [OneByOneConvAndUpSample(stage, level, num_classes)
                 for level, stage in enumerate((stage_1, stage_2, stage_3))]

    # Splice order is stage 1, stage 3, stage 2.
    merged = C.splice(upsampled[0], upsampled[2], upsampled[1], axis=0)

    head = Convolution((1, 1), num_classes, init=he_normal(),
                       activation=sigmoid, pad=True)
    return head(merged)
def input_layer(self,cgw,cc,qgw,qc,qnw,cnw):
    """Build the shared input encoder (with highway network) for context and query.

    One network (char CNN + word embedding -> splice -> highway -> dropout ->
    bidirectional RNN) is built on inner placeholders and cloned with shared
    parameters for the query and the context. The clones are combined into a
    block named ``'input_layer'`` with outputs ``(c_processed, q_processed)``.
    """
    # Outer placeholders; as_block binds them to the actual arguments.
    cgw_ph, cnw_ph, cc_ph = C.placeholder(), C.placeholder(), C.placeholder()
    qgw_ph, qnw_ph, qc_ph = C.placeholder(), C.placeholder(), C.placeholder()

    # Inner placeholders of the shared network.
    input_chars = C.placeholder(shape=(1,self.word_size,self.c_dim))
    input_glove_words = C.placeholder(shape=(self.wg_dim,))
    input_nonglove_words = C.placeholder(shape=(self.wn_dim,))

    char_features = C.reshape(self.charcnn(input_chars), self.convs)
    word_features = self.embed()(input_glove_words, input_nonglove_words)
    embedded = C.splice(char_features, word_features, name='splice_embed')

    highway_net = HighwayNetwork(
        dim=self.elmo_dim + self.hidden_dim + self.convs,
        highway_layers=self.highway_layers)
    highway_drop = C.layers.Dropout(self.dropout)(highway_net(embedded))

    rnn = OptimizedRnnStack(self.hidden_dim, num_layers=1, bidirectional=True,
                            use_cudnn=self.use_cudnn, name='input_rnn')
    processed = rnn(highway_drop)

    # Character ids are one-hot encoded before entering the char CNN.
    qce = C.one_hot(qc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)
    cce = C.one_hot(cc_ph, num_classes=self.c_dim, sparse_output=self.use_sparse)

    q_processed = processed.clone(
        C.CloneMethod.share,
        {input_chars:qce, input_glove_words:qgw_ph, input_nonglove_words:qnw_ph})
    c_processed = processed.clone(
        C.CloneMethod.share,
        {input_chars:cce, input_glove_words:cgw_ph, input_nonglove_words:cnw_ph})

    bindings = [
        (cgw_ph, cgw), (cc_ph, cc), (qgw_ph, qgw),
        (qc_ph, qc), (qnw_ph, qnw), (cnw_ph, cnw),
    ]
    return C.as_block(
        C.combine([c_processed, q_processed]),
        bindings,
        'input_layer',
        'input_layer')
def create_model_cnn_body():
    """Two-branch text CNN (``t`` and ``b`` branches) with a dense classifier.

    Each branch is embedded, batch-normalised, converted to a static window
    with ``PastValueWindow`` and convolved with kernels of height 1, 2 and 3.
    The six max-pooled feature maps are spliced along axis 0 and classified
    by dense -> dropout(0.5) -> dense. Inputs and sizes are read from the
    enclosing scope.

    NOTE(review): the ``t`` branch embeds ``xb`` and the ``b`` branch embeds
    ``xt``, and pooling uses ``max_length`` rather than the per-branch window
    sizes. Confirm this is intended.
    """
    with C.layers.default_options(initial_state=0.1):
        embedded_t = C.layers.Embedding(300,name='embed')(xb)  # init=embedding,
        embedded_b = C.layers.Embedding(300,name='embed')(xt)  # init=embedding,

        normed_b = C.layers.BatchNormalization(name='bn')(embedded_b)
        normed_t = C.layers.BatchNormalization(name='bn')(embedded_t)

        static_t = C.layers.PastValueWindow(
            window_size=max_length_title, axis=-2)(normed_t)[0]
        static_b = C.layers.PastValueWindow(
            window_size=max_length_body, axis=-2)(normed_b)[0]

        def conv_bank(static_seq):
            # One convolution per kernel height 1, 2, 3 (in that order).
            return [C.layers.Convolution((height, emb_dim), num_filters=100,
                                         reduction_rank=0,
                                         activation=C.relu)(static_seq)
                    for height in (1, 2, 3)]

        convs_t = conv_bank(static_t)
        convs_b = conv_bank(static_b)

        def pool(bank, height):
            # A kernel of the given height leaves max_length - (height - 1) rows.
            return C.layers.MaxPooling(
                (max_length - (height - 1), 1), name='pooling')(bank[height - 1])

        # Pooling and splice order per branch: kernel heights 2, 1, 3.
        pooled = [pool(bank, height)
                  for bank in (convs_t, convs_b)
                  for height in (2, 1, 3)]
        features = C.splice(*pooled, axis=0)

        hidden = C.layers.Dense(hidden_dim, activation=C.relu,name='hidden')(features)
        dropped = C.layers.Dropout(0.5)(hidden)
        return C.layers.Dense(num_labels,name='classify')(dropped)
def test_large_model_serialization_float(tmpdir):
    """Save and reload a model whose serialized size exceeds 2 GB.

    The first parameter alone (``size[0] x size[1]`` float32 values) is
    asserted to be larger than ``2**31`` bytes. After a save/load round trip
    the parameter count, shapes and values must match.
    """
    import os
    from cntk.layers import Recurrence, LSTM, Dense

    type_size = np.dtype(np.float32).itemsize
    two_gb = 2**31
    size = (2097152 + 4, 256, 512, 4096)
    input_dim, proj_dim, hidden = size[0], size[1], size[2]
    assert input_dim * proj_dim * type_size > two_gb

    device = C.device.cpu()
    i = C.sequence.input(input_dim)
    w = C.Parameter((input_dim, proj_dim), init=C.uniform(3.0, seed=12345), device=device)
    e = C.times(i, w)

    h_fwd = Recurrence(LSTM(hidden))(e)
    h_bwd = Recurrence(LSTM(hidden), go_backwards=True)(e)
    h_last_fwd = C.sequence.last(h_fwd)
    h_first_bwd = C.sequence.first(h_bwd)
    t = C.splice(h_last_fwd, h_first_bwd)

    z1 = Dense(hidden, activation=C.relu)(t)
    z = Dense(2, activation=None)(z1)

    filename = str(tmpdir / 'test_large_model_serialization_float.out')
    z.save(filename)
    assert os.path.getsize(filename) > two_gb

    y = C.Function.load(filename, device=device)

    assert len(z.parameters) == len(y.parameters)
    for saved, loaded in zip(z.parameters, y.parameters):
        assert saved.shape == loaded.shape
        assert np.allclose(saved.value, loaded.value)
def test_sequence_step_function_scalar_shape_inferrence():
    """Check that a state-resetting LSTM on one long sequence matches separate runs.

    The reference evaluates three sequences (lengths 2, 3, 5) independently.
    The candidate sees them concatenated, together with a 0/1 flag that
    zeroes the previous state at every sequence start.
    """
    hidden_dim = 3
    in_dim = 5

    x = C.sequence.input_variable((in_dim, ))
    r = C.sequence.input_variable((1, ))  # value of 0/1. 0 means reset
    # Recurrence only takes 1 input, so concatenate the two
    merged_x = C.splice(x, r)

    cell = C.layers.LSTM(hidden_dim)  # (dh, dc, x) -> (h, c)
    y = C.layers.Recurrence(cell)(x)

    @C.Function
    def lstm_with_reset(dh, dc, xr):
        # xr = [input features ..., reset flag]
        payload = xr[0:-1]
        flag = xr[-1]
        return cell(flag * dh, flag * dc, payload)

    yr = C.layers.Recurrence(lstm_with_reset)(merged_x)

    seq_len = [2, 3, 5]
    total_len = np.sum(seq_len)
    accum_seq_len = np.cumsum(seq_len)
    starts_of_later_sequences = accum_seq_len[0:-1]

    x_total_data = np.random.rand(1, total_len, in_dim).astype(np.float32)
    pieces = np.split(x_total_data, starts_of_later_sequences, axis=1)
    x_data = [np.squeeze(piece) for piece in pieces]

    r_data = np.ones(accum_seq_len[-1])
    r_data[starts_of_later_sequences] = 0
    r_data = np.reshape(r_data, (-1, 1)).astype(np.float32)

    per_sequence = y.eval(x_data)
    with_reset = yr.eval({x: x_total_data, r: r_data})

    assert np.allclose(np.concatenate(per_sequence), with_reset[0])
def indy_lstm(dh, dc, x):
    """One LSTM step in which the recurrent term is an element-wise product.

    The stabilized previous hidden state is spliced four times (once per
    stacked gate) and multiplied element-wise with ``H1``. Parameters,
    stabilizers and option flags are read from the enclosing scope.

    Returns:
        ``(h, c)``: the (optionally projected) hidden state and the cell state.
    """
    def peep(value, cell, weight):
        # helper to inject peephole connection if requested
        if use_peepholes:
            return value + weight * cell
        return value

    dhs = Sdh(dh)  # previous values, stabilized
    dcs = Sdc(dc)
    # note: input does not get a stabilizer here, user is meant to do that outside

    # projected contribution from input(s), hidden, and bias;
    # 4 is the number of stacked dim
    proj4 = b + times(x, W) + C.splice(dhs, dhs, dhs, dhs) * H1

    # split along stack_axis
    it_proj, bit_proj, ft_proj, ot_proj = [
        slice(proj4, stack_axis, k * stacked_dim, (k + 1) * stacked_dim)
        for k in range(4)
    ]

    it = sigmoid(peep(it_proj, dcs, Ci))        # input gate(t)
    # TODO: should both activations be replaced?
    bit = it * activation(bit_proj)             # applied to tanh of input network

    ft = sigmoid(peep(ft_proj, dcs, Cf))        # forget-me-not gate(t)
    bft = ft * dc                               # applied to cell(t-1)

    ct = bft + bit                              # c(t) is sum of both

    ot = sigmoid(peep(ot_proj, Sct(ct), Co))    # output gate(t)
    ht = ot * activation(ct)                    # applied to tanh(cell(t))

    if has_projection:
        h = times(Sht(ht), Wmr)
    else:
        h = ht
    return h, ct
def test_large_model_serialization_float(tmpdir):
    """Round-trip a model of more than 2 GB through ``save`` and ``Function.load``.

    The input projection parameter is sized so that it alone exceeds
    ``2**31`` bytes in float32. The saved file must be larger than 2 GB and
    the reloaded parameters must match the originals in count, shape and value.
    """
    import os
    from cntk.layers import Recurrence, LSTM, Dense

    two_gb = 2**31
    type_size = np.dtype(np.float32).itemsize
    size = (2097152 + 4, 256, 512, 4096)
    assert size[0] * size[1] * type_size > two_gb

    device = C.device.cpu()
    i = C.sequence.input(size[0])
    w = C.Parameter((size[0], size[1]), init=C.uniform(3.0, seed=12345), device=device)
    projected = C.times(i, w)

    # Bidirectional encoder: last forward state spliced with first backward state.
    h_fwd = Recurrence(LSTM(size[2]))(projected)
    h_bwd = Recurrence(LSTM(size[2]), go_backwards=True)(projected)
    summary = C.splice(C.sequence.last(h_fwd), C.sequence.first(h_bwd))

    z1 = Dense(size[2], activation=C.relu)(summary)
    z = Dense(2, activation=None)(z1)

    filename = str(tmpdir / 'test_large_model_serialization_float.out')
    z.save(filename)
    assert os.path.getsize(filename) > two_gb

    y = C.Function.load(filename, device=device)

    assert len(z.parameters) == len(y.parameters)
    for before, after in zip(z.parameters, y.parameters):
        assert before.shape == after.shape
        assert np.allclose(before.value, after.value)
def output_layer(self, embed, attention_context, model_context, aw, q_processed, c_processed,cw):
    """Build the output block: span start/end logits plus a seq2seq answer decoder.

    The function works on placeholders that ``C.as_block`` finally binds to
    the arguments:

    * ``attention_context`` -> ``att_context``
    * ``model_context``     -> ``mod_context``
    * ``aw``                -> ``a_onehot``
    * ``q_processed``       -> ``query_processed``
    * ``c_processed``       -> ``context_processed``
    * ``cw``                -> ``cw_ph``

    ``embed`` is called as ``embed(gx, ngx)`` on the two slices of a history
    token inside the decoder.

    Returns:
        A block (name ``'attention_layer'``) combining, in order:
        the training decoder output, the argmax of the greedy decoder output,
        the start logits, the end logits and the argmax of ``cw`` over axis 0.
    """
    cw_ph=C.placeholder()
    att_context = C.placeholder(shape=(8*self.hidden_dim,))
    query_processed = C.placeholder(shape=(2*self.hidden_dim,))
    context_processed = C.placeholder(shape=(2*self.hidden_dim,))
    mod_context = C.placeholder(shape=(2*self.hidden_dim))
    a_onehot = C.placeholder(shape=(self.vocab_size+1,))

    # Start logits: dense layer over dropout([mod_context; att_context]).
    start_logits = C.layers.Dense(1, name='out_start')(C.dropout(C.splice(mod_context, att_context), self.dropout))
    start_hardmax = seq_hardmax(start_logits)
    # Pick the modelling-context vector selected by the start hardmax and
    # broadcast it along the sequence of att_context.
    att_mod_ctx = C.sequence.last(C.sequence.gather(mod_context, start_hardmax))
    att_mod_ctx_expanded = C.sequence.broadcast_as(att_mod_ctx, att_context)
    end_input = C.splice(att_context, mod_context, att_mod_ctx_expanded, mod_context * att_mod_ctx_expanded)
    m2 = OptimizedRnnStack(self.hidden_dim, bidirectional=True, use_cudnn=self.use_cudnn, name='output_rnn')(end_input)
    # End logits: dense layer over dropout([m2; att_context]).
    end_logits = C.layers.Dense(1, name='out_end')(C.dropout(C.splice(m2, att_context), self.dropout))

    # NOTE(review): start_flag and end_flag are not used anywhere below.
    start_flag = C.hardmax(start_logits)
    end_flag = C.hardmax(end_logits)

    def create_model():
        # Encoder: (input*) --> (h0, c0)
        # Create multiple layers of LSTMs by passing the output of the i-th layer
        # to the (i+1)th layer as its input
        with C.layers.default_options(enable_self_stabilization=True, go_backwards=False):
            # NOTE(review): LastRecurrence is assigned but never used in this function.
            LastRecurrence = C.layers.Recurrence
            # Separate encoders for the query (encode) and the context (encode_c).
            encode = C.layers.Sequential([
                C.layers.Stabilizer(),
                OptimizedRnnStack(self.hidden_dim, return_full_state=True),
            ])
            encode_c = C.layers.Sequential([
                C.layers.Stabilizer(),
                OptimizedRnnStack(self.hidden_dim, return_full_state=True),
            ])

        # Decoder: (history*, input*) --> unnormalized_word_logp*
        # where history is one of these, delayed by 1 step and <s> prepended:
        #  - training: labels
        #  - testing:  its own output hardmax(z) (greedy decoder)
        with C.layers.default_options(enable_self_stabilization=True):
            # sub-layers
            stab_in = C.layers.Stabilizer()
            rec_blocks = [C.layers.LSTM(self.hidden_dim) for i in 
                          range(self.num_layers)]
            stab_out = C.layers.Stabilizer()
            proj_out = C.layers.Dense(self.vocab_size+1, name='out_proj')
            # attention model
            attention_model = C.layers.AttentionModel(self.attention_dim, name='attention_model') # :: (h_enc*, h_dec) -> (h_dec augmented)
            # Dense layers producing the initial decoder hidden and cell state.
            hstate_dense = C.layers.Dense(self.hidden_dim, activation=C.tanh, input_rank=1)
            cstate_dense = C.layers.Dense(self.hidden_dim, activation=C.tanh, input_rank=1)
            # Projections summed into the pre-maxout readout r.
            W_dense = C.layers.Dense(2*self.hidden_dim, input_rank=1)
            U_dense = C.layers.Dense(2*self.hidden_dim, input_rank=1)
            V_dense = C.layers.Dense(2*self.hidden_dim, input_rank=1)
            maxout = C.layers.MaxPooling((2,), strides=2)
            # layer function
            @C.Function
            def decode(history, q, c, start_logits, end_logits):
                q = encode(q)
                # The context is encoded together with the start/end logits (spliced on axis 0).
                c = encode_c(C.splice(c, start_logits, end_logits, axis=0))
                r = history
                r = stab_in(r)

                # Initial decoder state from the last encoder outputs of query and context.
                q_last_h = C.sequence.last(q.outputs[0])
                q_last_c = C.sequence.last(q.outputs[1])
                c_last_h = C.sequence.last(c.outputs[0])
                c_last_c = C.sequence.last(c.outputs[1])
                initial_hstate = hstate_dense(C.splice(q_last_h, c_last_h))
                initial_cstate = cstate_dense(C.splice(q_last_c, c_last_c))

                # Only the first of the rec_blocks is used here.
                rec_block = rec_blocks[0]     # LSTM(hidden_dim)  # :: (dh, dc, x) -> (h, c)

                @C.Function
                def find_embed(x):
                    # Split the token vector at self.wg_dim and embed the two parts.
                    gx, ngx = C.slice(x, 0, 0, self.wg_dim), C.slice(x, 0, self.wg_dim, self.vocab_size)
                    return embed(gx, ngx)

                @C.Function
                def lstm_with_attention(dh, dc, r, x):
                    history_embed = find_embed(x)
                    # Attend over both the encoded context and the encoded query.
                    h_att = attention_model(c.outputs[0], dh)
                    q_att = attention_model(q.outputs[0], dh)
                    att = C.splice(h_att, q_att)
                    x = C.splice(x, att)
                    x, dc = rec_block(dh, dc, x).outputs

                    # 0*r is a hack because cntk freaks out when r is not used.
                    r = U_dense(att) + W_dense(history_embed) + V_dense(x) + 0*r
                    # Original author's note: a bug shows up when W_dense is added
                    # first, hence the disabled variant below.
                    #r = W_dense(embed(gx, ngx)) + U_dense(att) + V_dense(x) + 0*r
                    return x, dc, r
                # Recurrence over three state variables (h, c, r); only the final r is kept.
                _, _, r = C.layers.RecurrenceFrom(lstm_with_attention, return_full_state=True)(initial_hstate, initial_cstate, C.Constant(np.zeros(2*self.hidden_dim)),r).outputs

                r = maxout(r)
                r = stab_out(r)
                r = proj_out(r)
                # softmax is deliberately left disabled; the output stays unnormalized
                #r = C.softmax(r)
                r = C.layers.Label('out_proj_out')(r)
                return r
        return decode

    def create_model_train(s2smodel):
        # model used in training (history is known from labels)
        # note: the labels must NOT contain the initial <s>
        @C.Function
        def model_train(labels, q, c, start_logits, end_logits): # (input*, labels*) --> (word_logp*)

            # The input to the decoder always starts with the special label sequence start token.
            # Then, use the previous value of the label sequence (for training) or the output (for execution).
            past_labels = C.layers.Delay(initial_state=self.sentence_start)(labels)
            return s2smodel(past_labels, q, c, start_logits, end_logits)
        return model_train

    def create_model_greedy(s2smodel):
        # model used in (greedy) decoding (inferencing) (history is decoder's own output)
        @C.Function
        def model_greedy(q, c, start_logits, end_logits): # (input*) --> (word_sequence*)
            # Decoding is an unfold() operation starting from sentence_start.
            # We must transform s2smodel (history*, input* -> word_logp*) into a generator (history* -> output*)
            # which holds 'input' in its closure.
            unfold = C.layers.UnfoldFrom(\
                         lambda history: s2smodel(history, q, c, start_logits, end_logits) >> C.hardmax,
                         # stop once sentence_end_index was max-scoring output
                         until_predicate=lambda w: w[...,self.sentence_end_index],
                         length_increase=self.sentence_max_length)
            return unfold(initial_state=self.sentence_start, dynamic_axes_like=c)
        return model_greedy

    s2smodel = create_model()

    # The same decoder instance backs both the training and the greedy model.
    model_train = create_model_train(s2smodel)(a_onehot, query_processed, context_processed, start_logits, end_logits)
    model_greed = create_model_greedy(s2smodel)(query_processed, context_processed, start_logits, end_logits)
    model_greedy = C.argmax(model_greed,0)
    context = C.argmax(cw_ph,0)

    # NOTE(review): the block is named 'attention_layer' although this is the
    # output layer; it is left as is because the name is a runtime string.
    return C.as_block(
        C.combine((model_train, model_greedy, start_logits, end_logits,context)),
        [(att_context, attention_context), (mod_context, model_context), (a_onehot, aw), (query_processed, q_processed), (context_processed, c_processed),(cw_ph,cw)],
        'attention_layer',
        'attention_layer')
def validate_model(test_data, model, polymath):
    """Evaluate ``model`` on ``test_data`` and print averaged span metrics.

    Args:
        test_data: passed through to ``create_mb_and_map`` to build the
            minibatch source.
        model: function whose outputs are, in order, begin logits, end logits
            and loss.
        polymath: passed through to ``create_mb_and_map``.

    Returns:
        The loss summed over all minibatches divided by the number of
        sequences seen.

    Raises:
        ZeroDivisionError: if the source yields no sequences at all.
    """
    begin_logits = model.outputs[0]
    end_logits = model.outputs[1]
    loss = model.outputs[2]
    root = C.as_composite(loss.owner)
    mb_source, input_map = create_mb_and_map(root, test_data, polymath, randomize=False, repeat=False)
    begin_label = argument_by_name(root, 'ab')
    end_label = argument_by_name(root, 'ae')

    # Separate inputs for the predictions so the metric graph can be
    # evaluated independently of the model graph.
    begin_prediction = C.sequence.input_variable(1, sequence_axis=begin_label.dynamic_axes[1], needs_gradient=True)
    end_prediction = C.sequence.input_variable(1, sequence_axis=end_label.dynamic_axes[1], needs_gradient=True)

    best_span_score = symbolic_best_span(begin_prediction, end_prediction)
    # Running sum of (begin - delayed end) along the sequence, for the
    # predicted and for the true span.
    predicted_span = C.layers.Recurrence(C.plus)(begin_prediction - C.sequence.past_value(end_prediction))
    true_span = C.layers.Recurrence(C.plus)(begin_label - C.sequence.past_value(end_label))
    common_span = C.element_min(predicted_span, true_span)
    begin_match = C.sequence.reduce_sum(C.element_min(begin_prediction, begin_label))
    end_match = C.sequence.reduce_sum(C.element_min(end_prediction, end_label))

    predicted_len = C.sequence.reduce_sum(predicted_span)
    true_len = C.sequence.reduce_sum(true_span)
    common_len = C.sequence.reduce_sum(common_span)
    f1 = 2*common_len/(predicted_len+true_len)
    exact_match = C.element_min(begin_match, end_match)
    precision = common_len/predicted_len
    recall = common_len/true_len
    overlap = C.greater(common_len, 0)

    def s(x):
        # Sum over every axis so each statistic becomes one number per minibatch.
        return C.reduce_sum(x, axis=C.Axis.all_axes())

    stats = C.splice(s(f1), s(exact_match), s(precision), s(recall), s(overlap), s(begin_match), s(end_match))

    # Evaluation parameters
    minibatch_size = 20000
    num_sequences = 0

    stat_sum = 0
    loss_sum = 0

    while True:
        data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
        if not data or not (begin_label in data) or data[begin_label].num_sequences == 0:
            break
        out = model.eval(data, outputs=[begin_logits, end_logits, loss], as_numpy=False)
        testloss = out[loss]
        # The gradient of the best-span score w.r.t. the prediction inputs is
        # what gets fed to the metric graph as the predictions.
        g = best_span_score.grad({begin_prediction: out[begin_logits], end_prediction: out[end_logits]},
                                 wrt=[begin_prediction, end_prediction], as_numpy=False)
        other_input_map = {begin_prediction: g[begin_prediction], end_prediction: g[end_prediction],
                           begin_label: data[begin_label], end_label: data[end_label]}
        stat_sum += stats.eval((other_input_map))
        loss_sum += np.sum(testloss.asarray())
        num_sequences += data[begin_label].num_sequences

    stat_avg = stat_sum / num_sequences
    loss_avg = loss_sum / num_sequences

    # Fix: every statistic now uses '.4f' (four decimals). The original '{:4f}'
    # meant "minimum width 4" and printed six decimals for the last five fields.
    print("Validated {} sequences, loss {:.4f}, F1 {:.4f}, EM {:.4f}, precision {:.4f}, recall {:.4f} hasOverlap {:.4f}, start_match {:.4f}, end_match {:.4f}".format(
        num_sequences,
        loss_avg,
        stat_avg[0],
        stat_avg[1],
        stat_avg[2],
        stat_avg[3],
        stat_avg[4],
        stat_avg[5],
        stat_avg[6]))

    return loss_avg
def LookAhead(x):
    """Return ``x`` spliced with ``C.sequence.future_value(x)``."""
    upcoming = C.sequence.future_value(x)
    return C.splice(x, upcoming)
def func(x):
    """Apply a forward and a backward GRU recurrence to ``x`` and splice them.

    ``hidden_dim`` and ``name`` come from the enclosing scope, which is not
    visible here.
    """
    forward_pass = C.layers.Recurrence(C.layers.GRU(hidden_dim))(x)
    backward_pass = C.layers.Recurrence(C.layers.GRU(hidden_dim), go_backwards=True)(x)
    return C.splice(forward_pass, backward_pass, name=name)
def BiRecurrence(fwd, bwd):
    """Build a bidirectional recurrence over a fresh placeholder.

    Args:
        fwd: step function wrapped in a forward ``Recurrence``.
        bwd: step function wrapped in a ``Recurrence`` with ``go_backwards=True``.

    Returns:
        The splice of both recurrences applied to the same placeholder.
    """
    forward_rec = C.layers.Recurrence(fwd)
    backward_rec = C.layers.Recurrence(bwd, go_backwards=True)
    source = C.placeholder()
    # Concatenate the two directions into a single tensor.
    return C.splice(forward_rec(source), backward_rec(source))
def flow_forward(input_dim: int, act_func_pair: tuple = (None, None), batch_norm: bool = False):
    """Build one forward flow step on a placeholder of size ``input_dim``.

    The step is: optional affine rescaling (``batch_norm``), multiplication by
    a square parameter matrix initialised from ``special_ortho_group.rvs``,
    then a split at ``input_dim // 2`` where the first half is scaled and
    shifted by functions of the second half.

    Args:
        input_dim: size of the placeholder and of the square parameter matrix.
        act_func_pair: ``(log_s_func, t_func)``; a ``None`` entry is replaced
            by a default three-layer dense network.
        batch_norm: when true, add the rescaling stage and its log-det term.

    Returns:
        ``(output, log_det, chunk)`` where ``chunk`` is a dict holding the
        pieces created along the way ('input_dim', optionally
        'mu'/'var'/'muB'/'varB', 'W_rot_mat', 'log_s_func', 't_func', 'output').
    """
    chunk = {'input_dim': input_dim}
    log_det = 0

    current = C.placeholder(input_dim, name='place_holder')

    if batch_norm:
        chunk['mu'] = C.Constant(np.zeros(shape=input_dim))
        chunk['var'] = C.Constant(np.ones(shape=input_dim))

        eps = C.Constant(1e-7)
        batch_axis_mean = C.reduce_mean(current, axis=C.Axis.default_batch_axis())
        batch_axis_var = C.reduce_mean(C.square(current-batch_axis_mean), axis=C.Axis.default_batch_axis())

        chunk['muB'] = batch_axis_mean
        chunk['varB'] = batch_axis_var

        # NOTE(review): the input is rescaled with the constant 'var'/'mu'
        # entries, while the log-det term below uses the batch-axis variance —
        # confirm this asymmetry is intended.
        rescaled = C.sqrt(chunk['var']+eps)*current + chunk['mu']

        log_det += -0.5*C.reduce_sum(C.log((batch_axis_var+eps)))
        current = rescaled

    rotation = C.parameter((input_dim, input_dim))
    chunk['W_rot_mat'] = rotation
    rotation.value = special_ortho_group.rvs(input_dim)
    rotated = current@rotation
    log_det += C.log(C.abs(C.det(rotation)))

    half = input_dim//2
    head = rotated[:half]
    tail = rotated[half:]

    log_scale_net, shift_net = act_func_pair
    if log_scale_net is None:
        # Default network for the log-scale; tanh on the last layer.
        log_scale_net = C.layers.Sequential([
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(half, C.tanh),
        ])
    if shift_net is None:
        # Default network for the shift; last layer uses Dense's default activation.
        shift_net = C.layers.Sequential([
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(half),
        ])

    chunk['log_s_func'] = log_scale_net
    chunk['t_func'] = shift_net

    log_scale = log_scale_net(tail)
    shift = shift_net(tail)
    scale = C.exp(log_scale)

    # First half is transformed, second half passes through unchanged.
    transformed_head = scale*head + shift
    result = C.splice(transformed_head, tail)
    chunk['output'] = result

    log_det += C.reduce_sum(log_scale)

    return result, log_det, chunk
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True, proposal_layer_param_string=None):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         (image_width, image_height, image_scale) as CNTK variable or constant
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    # RPN network
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    rpn_conv_3x3 = Convolution((3, 3), 256, activation=relu, pad=True, strides=1,
                                init = normal(scale=0.01), init_bias=0.1)(conv_out)
    rpn_cls_score = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                init = normal(scale=0.01), init_bias=0.1)(rpn_conv_3x3)  # 2(bg/fg)  * 9(anchors)
    rpn_bbox_pred = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                init = normal(scale=0.01), init_bias=0.1)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    # Half of all score elements: the scores are flattened to (2, num_predictions)
    # so the softmax over axis 0 runs across the bg/fg pair.
    num_predictions = int(np.prod(rpn_cls_score.shape) / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score, (2, num_predictions))
    rpn_cls_prob = softmax(rpn_cls_score_rshp, axis=0, name="objness_softmax")
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, rpn_cls_score.shape)

    # proposal layer
    rpn_rois_raw = user_function(ProposalLayer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, param_str=proposal_layer_param_string))
    rpn_rois = alias(rpn_rois_raw, name='rpn_rois')

    rpn_losses = None
    if(add_loss_functions):
        # RPN targets
        # Comment: rpn_cls_score is only passed   vvv   to get width and height of the conv feature map ...
        atl = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info, param_str=proposal_layer_param_string))
        rpn_labels = atl.outputs[0]
        rpn_bbox_targets = atl.outputs[1]
        rpn_bbox_inside_weights = atl.outputs[2]

        # For loss functions: ignore label predictions for the 'ignore label',
        # i.e. set target and prediction to 0 --> needs to be softmaxed before
        rpn_labels_rshp = reshape(rpn_labels, (1, num_predictions))
        ignore = user_function(IgnoreLabel(rpn_cls_prob, rpn_labels_rshp, ignore_label=-1))
        rpn_cls_prob_ignore = ignore.outputs[0]
        fg_targets = ignore.outputs[1]
        bg_targets = 1 - fg_targets
        # Two-row target (background row first, foreground row second) along axis 0.
        rpn_labels_ignore = splice(bg_targets, fg_targets, axis=0)

        # RPN losses
        # NOTE(review): the first argument derives from already-softmaxed
        # probabilities, yet the loss applies softmax again — confirm this is intended.
        rpn_loss_cls = cross_entropy_with_softmax(rpn_cls_prob_ignore, rpn_labels_ignore, axis=0)
        rpn_loss_bbox = user_function(SmoothL1Loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights))
        rpn_losses = plus(reduce_sum(rpn_loss_cls), reduce_sum(rpn_loss_bbox), name="rpn_losses")

    return rpn_rois, rpn_losses
def UpSampling2D(x):
    """Double the last two of the three leading dimensions of ``x``.

    ``x`` is reshaped to ``(d0, d1, 1, d2, 1)``, spliced with itself on axes
    -1 and -3, and reshaped to ``(d0, 2*d1, 2*d2)``. Presumably this is 2x
    nearest-neighbour upsampling of a (channels, height, width) tensor —
    confirm against the caller.
    """
    d0, d1, d2 = x.shape[0], x.shape[1], x.shape[2]
    expanded = c.reshape(x, (d0, d1, 1, d2, 1))
    # Duplicate along the trailing singleton axis, then along the middle one.
    doubled_last = c.splice(expanded, expanded, axis=-1)
    doubled_both = c.splice(doubled_last, doubled_last, axis=-3)
    return c.reshape(doubled_both, (d0, d1 * 2, d2 * 2))
def validate_model(test_data, model, polymath):
    """Evaluate ``model`` on ``test_data`` with a progress bar and print averaged metrics.

    Args:
        test_data: passed through to ``create_mb_and_map`` to build the
            minibatch source.
        model: function whose outputs are, in order, begin logits, end logits
            and loss.
        polymath: passed through to ``create_mb_and_map``.

    Returns:
        The loss summed over all minibatches divided by the number of
        sequences seen.

    Raises:
        ZeroDivisionError: if the source yields no sequences at all.
    """
    begin_logits = model.outputs[0]
    end_logits = model.outputs[1]
    loss = model.outputs[2]
    root = C.as_composite(loss.owner)
    mb_source, input_map = create_mb_and_map(root,
                                             test_data,
                                             polymath,
                                             randomize=False,
                                             repeat=False)
    begin_label = argument_by_name(root, 'ab')
    end_label = argument_by_name(root, 'ae')

    # Separate inputs for the predictions so the metric graph can be
    # evaluated independently of the model graph.
    begin_prediction = C.sequence.input_variable(
        1, sequence_axis=begin_label.dynamic_axes[1], needs_gradient=True)
    end_prediction = C.sequence.input_variable(
        1, sequence_axis=end_label.dynamic_axes[1], needs_gradient=True)

    best_span_score = symbolic_best_span(begin_prediction, end_prediction)
    # Running sum of (begin - delayed end) along the sequence, for the
    # predicted and for the true span.
    predicted_span = C.layers.Recurrence(
        C.plus)(begin_prediction - C.sequence.past_value(end_prediction))
    true_span = C.layers.Recurrence(C.plus)(begin_label -
                                            C.sequence.past_value(end_label))
    common_span = C.element_min(predicted_span, true_span)
    begin_match = C.sequence.reduce_sum(
        C.element_min(begin_prediction, begin_label))
    end_match = C.sequence.reduce_sum(C.element_min(end_prediction, end_label))

    predicted_len = C.sequence.reduce_sum(predicted_span)
    true_len = C.sequence.reduce_sum(true_span)
    common_len = C.sequence.reduce_sum(common_span)
    f1 = 2 * common_len / (predicted_len + true_len)
    exact_match = C.element_min(begin_match, end_match)
    precision = common_len / predicted_len
    recall = common_len / true_len
    overlap = C.greater(common_len, 0)

    def s(x):
        # Sum over every axis so each statistic becomes one number per minibatch.
        return C.reduce_sum(x, axis=C.Axis.all_axes())

    stats = C.splice(s(f1), s(exact_match), s(precision), s(recall),
                     s(overlap), s(begin_match), s(end_match))

    # Evaluation parameters
    minibatch_size = 2048
    num_sequences = 0

    stat_sum = 0
    loss_sum = 0

    with tqdm(ncols=32) as progress_bar:
        while True:
            data = mb_source.next_minibatch(minibatch_size,
                                            input_map=input_map)
            if not data or not (begin_label in data
                                ) or data[begin_label].num_sequences == 0:
                break
            out = model.eval(data,
                             outputs=[begin_logits, end_logits, loss],
                             as_numpy=False)
            testloss = out[loss]
            # The gradient of the best-span score w.r.t. the prediction inputs
            # is what gets fed to the metric graph as the predictions.
            g = best_span_score.grad(
                {
                    begin_prediction: out[begin_logits],
                    end_prediction: out[end_logits]
                },
                wrt=[begin_prediction, end_prediction],
                as_numpy=False)
            other_input_map = {
                begin_prediction: g[begin_prediction],
                end_prediction: g[end_prediction],
                begin_label: data[begin_label],
                end_label: data[end_label]
            }
            stat_sum += stats.eval((other_input_map))
            loss_sum += np.sum(testloss.asarray())
            num_sequences += data[begin_label].num_sequences
            progress_bar.update(data[begin_label].num_sequences)

    stat_avg = stat_sum / num_sequences
    loss_avg = loss_sum / num_sequences

    # Fix: every statistic now uses '.4f' (four decimals). The original '{:4f}'
    # meant "minimum width 4" and printed six decimals for the last five fields.
    print(
        "\nValidated {} sequences, loss {:.4f}, F1 {:.4f}, EM {:.4f}, precision {:.4f}, recall {:.4f} hasOverlap {:.4f}, start_match {:.4f}, end_match {:.4f}"
        .format(num_sequences, loss_avg, stat_avg[0], stat_avg[1], stat_avg[2],
                stat_avg[3], stat_avg[4], stat_avg[5], stat_avg[6]))

    return loss_avg
def create_rpn(conv_out, scaled_gt_boxes, im_info, add_loss_functions=True,
               proposal_layer_param_string=None, conv_bias_init=0.0):
    '''
    Creates a region proposal network for object detection as proposed in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers from the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label). Coordinates are absolute pixels wrt. the input image.
        im_info:         A CNTK variable or constant containing
                         (pad_width, pad_height, scaled_image_width, scaled_image_height, orig_img_width, orig_img_height)
                         e.g. (1000, 1000, 1000, 600, 500, 300) for an original image of 600x300 that is scaled and padded to 1000x1000
        add_loss_functions: If set to True rpn_losses will be returned, otherwise None is returned for the losses
        proposal_layer_param_string: A yaml parameter string that is passed to the proposal layer.
        conv_bias_init:  Value passed as init_bias to all three RPN convolutions.

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness)
    '''

    # RPN network
    # init = 'normal', initValueScale = 0.01; the bias init comes from conv_bias_init
    # NOTE: cfg is not a parameter here; it is read from the enclosing module scope.
    num_channels = cfg["CNTK"].RPN_NUM_CHANNELS
    rpn_conv_3x3 = Convolution((3, 3), num_channels, activation=relu, pad=True, strides=1,
                                init = normal(scale=0.01), init_bias=conv_bias_init)(conv_out)
    rpn_cls_score = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                init = normal(scale=0.01), init_bias=conv_bias_init)(rpn_conv_3x3)  # 2(bg/fg)  * 9(anchors)
    rpn_bbox_pred = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                init = normal(scale=0.01), init_bias=conv_bias_init)(rpn_conv_3x3)  # 4(coords) * 9(anchors)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    # Here num_predictions is half of the first dimension only; the remaining
    # two dimensions of the score map are kept as they are.
    num_predictions = int(rpn_cls_score.shape[0] / 2)
    rpn_cls_score_rshp = reshape(rpn_cls_score, (2, num_predictions, rpn_cls_score.shape[1], rpn_cls_score.shape[2]), name="rpn_cls_score_rshp")
    # The softmax is wrapped in a named block via a placeholder.
    p_rpn_cls_score_rshp = cntk.placeholder()
    rpn_cls_sm = softmax(p_rpn_cls_score_rshp, axis=0)
    rpn_cls_prob = cntk.as_block(rpn_cls_sm, [(p_rpn_cls_score_rshp, rpn_cls_score_rshp)], 'Softmax', 'rpn_cls_prob')
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, rpn_cls_score.shape, name="rpn_cls_prob_reshape")

    # proposal layer
    rpn_rois_raw = user_function(ProposalLayer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, param_str=proposal_layer_param_string))
    rpn_rois = alias(rpn_rois_raw, name='rpn_rois')

    rpn_losses = None
    if(add_loss_functions):
        # RPN targets
        # Comment: rpn_cls_score is only passed   vvv   to get width and height of the conv feature map ...
        atl = user_function(AnchorTargetLayer(rpn_cls_score, scaled_gt_boxes, im_info, param_str=proposal_layer_param_string))
        rpn_labels = atl.outputs[0]
        rpn_bbox_targets = atl.outputs[1]
        rpn_bbox_inside_weights = atl.outputs[2]

        # classification loss
        # A fresh placeholder replaces the earlier p_rpn_cls_score_rshp, which
        # has already been bound by the softmax block above.
        p_rpn_labels = cntk.placeholder()
        p_rpn_cls_score_rshp = cntk.placeholder()

        # keeps is the (label >= 0) mask; it zeroes both the labels and the loss terms.
        keeps = cntk.greater_equal(p_rpn_labels, 0.0)
        fg_labels = element_times(p_rpn_labels, keeps, name="fg_targets")
        bg_labels = minus(1, fg_labels, name="bg_targets")
        rpn_labels_ignore = splice(bg_labels, fg_labels, axis=0)
        rpn_ce = cross_entropy_with_softmax(p_rpn_cls_score_rshp, rpn_labels_ignore, axis=0)
        rpn_loss_cls = element_times(rpn_ce, keeps)

        # The terms that are accounted for in the cls loss are those that have a label >= 0
        cls_num_terms = reduce_sum(keeps)
        cls_normalization_factor = 1.0 / cls_num_terms
        normalized_rpn_cls_loss = reduce_sum(rpn_loss_cls) * cls_normalization_factor

        reduced_rpn_loss_cls = cntk.as_block(normalized_rpn_cls_loss,
                                         [(p_rpn_labels, rpn_labels), (p_rpn_cls_score_rshp, rpn_cls_score_rshp)],
                                         'CE_with_ignore', 'norm_rpn_cls_loss')

        # regression loss
        p_rpn_bbox_pred = cntk.placeholder()
        p_rpn_bbox_targets = cntk.placeholder()
        p_rpn_bbox_inside_weights = cntk.placeholder()
        rpn_loss_bbox = SmoothL1Loss(cfg["CNTK"].SIGMA_RPN_L1, p_rpn_bbox_pred, p_rpn_bbox_targets, p_rpn_bbox_inside_weights, 1.0)
        # The bbox loss is normalized by the rpn batch size
        bbox_normalization_factor = 1.0 / cfg["TRAIN"].RPN_BATCHSIZE
        normalized_rpn_bbox_loss = reduce_sum(rpn_loss_bbox) * bbox_normalization_factor

        reduced_rpn_loss_bbox = cntk.as_block(normalized_rpn_bbox_loss,
                                          [(p_rpn_bbox_pred, rpn_bbox_pred), (p_rpn_bbox_targets, rpn_bbox_targets),
                                           (p_rpn_bbox_inside_weights, rpn_bbox_inside_weights)],
                                          'SmoothL1Loss', 'norm_rpn_bbox_loss')

        rpn_losses = plus(reduced_rpn_loss_cls, reduced_rpn_loss_bbox, name="rpn_losses")

    return rpn_rois, rpn_losses
def func(x):
    """Apply a forward and a backward LSTM recurrence to ``x`` and splice them.

    ``hidden_dim`` and ``name`` come from the enclosing scope, which is not
    visible here.
    """
    forward_pass = C.layers.Recurrence(C.layers.LSTM(hidden_dim))(x)
    backward_pass = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=True)(x)
    return C.splice(forward_pass, backward_pass, name=name)
def lstm_w_attention(h, c, x):
    """Run one LSTM step on ``x`` prefixed with attention over ``encoded``.

    ``mha``, ``encoded`` and ``lstm`` come from the enclosing scope, which is
    not visible here.
    """
    # The third argument is an alias of ``encoded``: per the original author,
    # passing the same argument twice to a block function triggers a bug.
    context_vector = mha(h, encoded, C.alias(encoded))
    return lstm(h, c, C.splice(context_vector, x))
print('Writing train text file...') savetxt(os.path.join(data_dir, "Train-28x28_cntk_text.txt"), train) print('Writing test text file...') savetxt(os.path.join(data_dir, "Test-28x28_cntk_text.txt"), test) print('Done') input = C.input_variable(input_dim) label = C.input_variable(num_output_classes) normalize_input = input / 255.0 squared_input = C.square(input / 255.0) sqrt_input = C.sqrt(input / 255.0) z = create_model(C.splice(normalize_input, squared_input, sqrt_input)) loss = C.cross_entropy_with_softmax(z, label) label_error = C.classification_error(z, label) lr_schedule = C.learning_parameter_schedule(learning_rate) learner = C.sgd(z.parameters, lr_schedule) trainer = C.Trainer(z, (loss, label_error), [learner]) data_found = False for data_dir in [ os.path.join("..", "Examples", "Image", "DataSets", "MNIST"),
def attention_layer(self, context, query, layer):
    """Build a gated attention block of the context over the query.

    Args:
        context: bound to a placeholder of shape ``(2*hidden_dim,)``.
        query: bound to a placeholder of shape ``(2*hidden_dim,)``; unpacked
            with padding value 0 so it can be attended over as one tensor.
        layer: string prefix for the name of the attention RNN.

    Returns:
        A block named 'attention_layer' whose output is the bidirectional
        RNN applied, after dropout, to the gated concatenation.
    """
    two_h = 2 * self.hidden_dim
    eight_h = 8 * self.hidden_dim

    query_ph = C.placeholder(shape=(two_h, ))
    context_ph = C.placeholder(shape=(two_h, ))
    query_unpacked, query_mask = C.sequence.unpack(query_ph, padding_value=0).outputs

    # Parameters are created in the same order as before so that seeded
    # initialisation is unaffected.
    w_query = C.parameter(shape=(two_h, two_h), init=C.glorot_uniform())
    w_context = C.parameter(shape=(two_h, two_h), init=C.glorot_uniform())
    w_gate = C.parameter(shape=(eight_h, eight_h), init=C.glorot_uniform())
    score_vec = C.parameter(shape=(two_h, 1), init=C.glorot_uniform())

    # Projected context and projected (unpacked) query.
    context_proj = C.reshape(C.times(context_ph, w_context), (-1, two_h))
    query_proj = C.reshape(C.times(query_unpacked, w_query), (-1, two_h))

    # Additive attention scores: tanh(query + context) projected onto score_vec.
    summed = C.sequence.broadcast_as(query_proj, context_ph) + context_proj
    scores = C.reshape(C.times(C.tanh(summed), score_vec), (-1))

    # Positions where the unpack mask is zero get a very negative score
    # before the softmax.
    mask_per_step = C.sequence.broadcast_as(query_mask, context_ph)
    scores = C.element_select(mask_per_step, scores, C.constant(-1e+30))
    weights = C.softmax(scores, axis=0)

    # Attention-weighted sum of the query.
    query_swapped = C.swapaxes(query_unpacked)
    weighted = weights * C.sequence.broadcast_as(query_swapped, weights)
    attended = C.reshape(C.reduce_sum(weighted, axis=1), (-1))

    # Concatenate context, attended query and their products, then gate.
    combined = C.splice(context_ph, attended, context_ph * attended, attended * attended)
    gate = C.tanh(C.times(combined, w_gate))
    gated = gate * combined

    encoder = C.layers.Sequential([
        C.layers.Dropout(self.dropout),
        OptimizedRnnStack(self.hidden_dim,
                          bidirectional=True,
                          use_cudnn=self.use_cudnn,
                          name=layer + '_attention_rnn')
    ])
    encoded = encoder(gated)

    return C.as_block(encoded, [(context_ph, context), (query_ph, query)],
                      'attention_layer', 'attention_layer')
def BiRnn(e):
    """Splice ``posRnn(e)`` and ``negRnn(e)`` under ``name``.

    ``posRnn``, ``negRnn`` and ``name`` come from the enclosing scope, which
    is not visible here.
    """
    return C.splice(posRnn(e), negRnn(e), name=name)
def createReadOutNetwork(self, decoderHidden, preTrgEmb):
    """Return the pre-softmax read-out for one decoder step.

    ``decoderHidden`` and ``preTrgEmb`` are spliced on the last axis,
    multiplied by ``self.Wt`` and offset by ``self.Wtb``.
    """
    joined = C.splice(decoderHidden, preTrgEmb, axis=-1)
    return C.times(joined, self.Wt) + self.Wtb
def build_model(self):
    """Build the reading-comprehension graph and its loss.

    Creates the input variables, runs the input, gated-attention, reasoning
    and output layers, and stores the results on the instance.

    Returns:
        ``(self._model, self._loss, self._input_phs)``: the combined
        start/end logits, the all-spans loss, and the dict of input variables
        keyed by name.
    """
    c = C.Axis.new_unique_dynamic_axis('c')
    q = C.Axis.new_unique_dynamic_axis('q')
    b = C.Axis.default_batch_axis()
    # Context inputs use dynamic axes [b, c], query inputs use [b, q].
    cgw = C.input_variable(self.wg_dim,
                           dynamic_axes=[b, c],
                           is_sparse=self.use_sparse,
                           name='cgw')
    cnw = C.input_variable(self.wn_dim,
                           dynamic_axes=[b, c],
                           is_sparse=self.use_sparse,
                           name='cnw')
    qgw = C.input_variable(self.wg_dim,
                           dynamic_axes=[b, q],
                           is_sparse=self.use_sparse,
                           name='qgw')
    qnw = C.input_variable(self.wn_dim,
                           dynamic_axes=[b, q],
                           is_sparse=self.use_sparse,
                           name='qnw')
    cc = C.input_variable((1, self.word_size),
                          dynamic_axes=[b, c],
                          name='cc')
    qc = C.input_variable((1, self.word_size),
                          dynamic_axes=[b, q],
                          name='qc')
    ab = C.input_variable(self.a_dim, dynamic_axes=[b, c], name='ab')
    ae = C.input_variable(self.a_dim, dynamic_axes=[b, c], name='ae')
    input_phs = {
        'cgw': cgw,
        'cnw': cnw,
        'qgw': qgw,
        'qnw': qnw,
        'cc': cc,
        'qc': qc,
        'ab': ab,
        'ae': ae
    }
    self._input_phs = input_phs
    # Fix: the original wrote `seif.info` and `C.splice(cgw, gnw)`; both names
    # are undefined and raised NameError as soon as the method ran.
    self.info['query'] = C.splice(qgw, qnw)
    self.info['doc'] = C.splice(cgw, cnw)

    # graph
    pu, qu = self.input_layer(cgw, cnw, cc, qgw, qnw, qc).outputs
    # Fix: keyword spelled `att_kind`, matching the self-attention call below
    # (the original passed `attn_kind` here only).
    gate_pu, wei1 = self.gate_attention_layer(
        pu, qu, common_len=2 * self.hidden_dim,
        att_kind=self.attn_configs[0])  # [#,c][4*hidden]
    self.info['attn1'] = wei1 * 1.0
    print('[RNet build]gate_pu:{}'.format(gate_pu))
    pv = self.reasoning_layer(gate_pu)  # [#,c][2*hidden]
    # self attention: the reasoning output attends over itself
    gate_self, wei2 = self.gate_attention_layer(
        pv, pv, common_len=2 * self.hidden_dim,
        att_kind=self.attn_configs[1])  # [#,c][4*hidden]
    self.info['attn2'] = wei2 * 1.0
    ph = self.reasoning_layer(gate_self)  # [#,c][2*hidden]
    init_pu = self.weighted_sum(pu)

    start_logits, end_logits = self.output_layer(
        init_pu.outputs[0], ph, 2 * self.hidden_dim)  # [#, c][1]

    # loss
    # start_loss / end_loss are built but only the all-spans loss is stored.
    start_loss = seq_loss(start_logits, ab)
    end_loss = seq_loss(end_logits, ae)
    # paper_loss = start_loss + end_loss
    new_loss = all_spans_loss(start_logits, ab, end_logits, ae)
    self._model = C.combine([start_logits, end_logits])
    self._loss = new_loss
    return self._model, self._loss, self._input_phs
def bigru_with_match(dh, x):
    """Run one gated, attention-augmented GRU step.

    ``matching_model``, ``att_input``, ``Wg`` and ``att_gru`` come from the
    enclosing scope, which is not visible here.
    """
    matched = matching_model(att_input, dh)
    augmented = C.splice(x, matched)
    # Element-wise sigmoid gate computed from the augmented input itself.
    gate = C.sigmoid(C.times(augmented, Wg))
    return att_gru(dh, C.element_times(augmented, gate))
def build_model(self):
    """Build the ELMo-enhanced reading-comprehension graph and its loss.

    Creates the input variables (including the extra 'query_feature' and
    'doc_feature' inputs), splices the ELMo encodings and features onto the
    input-layer outputs, then runs the gated-attention, reasoning and output
    layers.

    Returns:
        ``(self._model, self._loss, self._input_phs)``: the combined
        start/end logits, the all-spans loss, and the dict of input variables
        keyed by name.
    """
    c = C.Axis.new_unique_dynamic_axis('c')
    q = C.Axis.new_unique_dynamic_axis('q')
    b = C.Axis.default_batch_axis()
    # Context inputs use dynamic axes [b, c], query inputs use [b, q].
    cgw = C.input_variable(self.wg_dim,
                           dynamic_axes=[b, c],
                           is_sparse=self.use_sparse,
                           name='cgw')
    cnw = C.input_variable(self.wn_dim,
                           dynamic_axes=[b, c],
                           is_sparse=self.use_sparse,
                           name='cnw')
    qgw = C.input_variable(self.wg_dim,
                           dynamic_axes=[b, q],
                           is_sparse=self.use_sparse,
                           name='qgw')
    qnw = C.input_variable(self.wn_dim,
                           dynamic_axes=[b, q],
                           is_sparse=self.use_sparse,
                           name='qnw')
    cc = C.input_variable((1, self.word_size),
                          dynamic_axes=[b, c],
                          name='cc')
    qc = C.input_variable((1, self.word_size),
                          dynamic_axes=[b, q],
                          name='qc')
    ab = C.input_variable(self.a_dim, dynamic_axes=[b, c], name='ab')
    ae = C.input_variable(self.a_dim, dynamic_axes=[b, c], name='ae')
    # Extra dense features: 1 value per query step, 3 values per context step.
    qf = C.input_variable(1,
                          dynamic_axes=[b, q],
                          is_sparse=False,
                          name='query_feature')
    df = C.input_variable(3,
                          dynamic_axes=[b, c],
                          is_sparse=False,
                          name='doc_feature')
    input_phs = {
        'cgw': cgw,
        'cnw': cnw,
        'qgw': qgw,
        'qnw': qnw,
        'cc': cc,
        'qc': qc,
        'ab': ab,
        'ae': ae,
        'qf': qf,
        'df': df
    }
    self._input_phs = input_phs
    self.info['query'] = C.splice(qgw, qnw)
    self.info['doc'] = C.splice(cgw, cnw)
    # graph
    elmo_encoder = self.__elmo_fac.build()
    #input layer
    # Flatten the (1, word_size) inputs before feeding the ELMo encoder.
    reduction_cc = C.reshape(cc, (-1, ))
    reduction_qc = C.reshape(qc, (-1, ))
    c_elmo = elmo_encoder(reduction_cc)
    q_elmo = elmo_encoder(reduction_qc)
    pu, qu = self.input_layer(cgw, cnw, qgw, qnw).outputs
    enhance_pu = C.splice(pu, c_elmo, df)
    enhance_qu = C.splice(qu, q_elmo, qf)
    # NOTE(review): common_len is 2*hidden_dim+1024 while the spliced inputs
    # also carry df (3) / qf (1) — presumably the ELMo output is 1024 wide;
    # confirm how gate_attention_layer uses common_len.
    gate_pu, wei1 = self.gate_attention_layer(enhance_pu, enhance_qu, common_len=2*self.hidden_dim+1024,\
        att_kind=self.attn_configs[0]) # [#,c][4*hidden]
    self.info['attn1'] = 1.0 * wei1
    pv = self.reasoning_layer(gate_pu)  # [#,c][2*hidden]
    # self attention
    gate_self, wei2 = self.gate_attention_layer(
        pv, pv, common_len=2 * self.hidden_dim,
        att_kind=self.attn_configs[1])  # [#,c][4*hidden]
    self.info['attn2'] = 1.0 * wei2
    ph = self.reasoning_layer(gate_self)  # [#,c][2*hidden]
    enhance_ph = C.splice(ph, c_elmo, df)
    init_pu = self.weighted_sum(enhance_pu)

    start_logits, end_logits = self.output_layer(
        init_pu.outputs[0], enhance_ph,
        2 * self.hidden_dim + 1027)  # [#, c][1]
    self.info['start_logits'] = start_logits * 1.0
    self.info['end_logits'] = end_logits * 1.0

    # loss
    # start_loss / end_loss are built but only the all-spans loss is stored.
    start_loss = seq_loss(start_logits, ab)
    end_loss = seq_loss(end_logits, ae)
    # paper_loss = start_loss + end_loss
    new_loss = all_spans_loss(start_logits, ab, end_logits, ae)
    self._model = C.combine([start_logits, end_logits])
    self._loss = new_loss
    return self._model, self._loss, self._input_phs
def _from_optimized_rnnstack(cudnn_rnn):
    '''
    converts cudnn optimized_rnnstack to non-cudnn functions to run in non-CUDA environment

    Args:
        cudnn_rnn: the optimized_rnnstack function that contains the parameters to be converted

    Returns:
        converted rnn function on GEMM based implementation that can be used on CPU

    Raises:
        ValueError: if the root op is not 'OptimizedRNNStack', if the recurrent
            op is not one of 'lstm' / 'rnnReLU' / 'rnnTanh', or if the
            parameter or input shape still has an inferred (negative) dimension.
    '''
    if cudnn_rnn.root_function.op_name != 'OptimizedRNNStack':
        raise ValueError('unexpected cudnn_rnn.root_function.op_name value "%s"'%cudnn_rnn.root_function.op_name)

    # The stack keeps all of its weights in a single parameter (parameters[0]).
    cudnn_param = cudnn_rnn.parameters[0]
    rnn_name = cudnn_rnn.name
    input_var = cudnn_rnn.inputs[0]
    hidden_size = cudnn_rnn.root_function.attributes['hiddenSize']
    num_layers = cudnn_rnn.root_function.attributes['numLayers']
    bidirectional = cudnn_rnn.root_function.attributes['bidirectional']
    recurrent_op = cudnn_rnn.root_function.attributes['recurrentOp']

    if recurrent_op not in ['lstm', 'rnnReLU', 'rnnTanh']:
        raise ValueError('unsupported recurrent_op value "%s"'%recurrent_op)

    #note that cudnn GRU is different from standard GRU so no conversion unless creating a new type of GRU cell for CPU

    def _any_inferred(shape):
        # True when any dimension is negative, i.e. not yet resolved.
        return np.any([dim < 0 for dim in shape])

    if _any_inferred(cudnn_param.shape) or _any_inferred(input_var.shape):
        raise ValueError('parameter not initialized yet')

    input_size = input_var.shape[0] if len(input_var.shape) else 1

    num_gates = 1
    rnn_lambda = None
    # rnn_lambda(x, i) builds layer i on input x; cells are named
    # '<rnn_name>_fw<i>' / '<rnn_name>_bw<i>' (bidirectional) or '<rnn_name>_<i>'
    # so they can be looked up by name when the weights are copied below.
    if recurrent_op == 'lstm':
        num_gates = 4
        if bidirectional:
            rnn_lambda = lambda x, i : C.splice(C.layers.Recurrence(C.layers.LSTM(hidden_size, name=rnn_name+'_fw'+i))(x), C.layers.Recurrence(C.layers.LSTM(hidden_size, name=rnn_name+'_bw'+i), go_backwards=True)(x))
        else:
            rnn_lambda = lambda x, i : C.layers.Recurrence(C.layers.LSTM(hidden_size, name=rnn_name+"_"+i))(x)
    elif recurrent_op == 'rnnReLU' or recurrent_op == 'rnnTanh':
        num_gates = 1
        activation = C.relu if recurrent_op == 'rnnReLU' else C.tanh
        if bidirectional:
            rnn_lambda = lambda x, i : C.splice(C.layers.Recurrence(C.layers.RNNUnit(hidden_size, activation=activation, name=rnn_name+'_fw'+i))(x), C.layers.Recurrence(C.layers.RNNUnit(hidden_size, activation=activation, name=rnn_name+'_bw'+i), go_backwards=True)(x))
        else:
            rnn_lambda = lambda x, i : C.layers.Recurrence(C.layers.RNNUnit(hidden_size, activation=activation, name=rnn_name+"_"+i))(x)

    noncudnn_func = rnn_lambda(input_var, '0')

    # Flatten the parameter; sections are consumed sequentially via `offset`.
    param = cudnn_param.value.reshape(-1)
    offset = 0
    multiplier = 2 if bidirectional else 1

    def _adjust_gate_order(W):
        # For LSTM, split into four equal parts (i, f, m, o) and re-join them
        # as (i, m, f, o); other recurrent ops are returned unchanged.
        if recurrent_op == 'lstm':
            if len(W.shape) == 2:
                i,f,m,o = np.hsplit(W, 4)
                return np.concatenate((i,m,f,o), axis=1)
            elif len(W.shape) == 1:
                i,f,m,o = np.split(W, 4)
                return np.concatenate((i,m,f,o))
            else:
                raise ValueError('LSTM parameter must have 1 or 2 dimensions')
        else:
            return W

    def _get_cudnn_rnn_weight_splitter(in_dim, h_dim):
        # for unidirectional, W, H
        # for bidirectional, fw_W, fw_H, bw_W, bw_H
        # The last size is dropped because np.split takes cut points, not sizes.
        splitter = [in_dim*h_dim*num_gates, h_dim*h_dim*num_gates] * multiplier
        splitter = splitter[0:-1]
        return np.cumsum(splitter)

    def _get_cudnn_rnn_bias_splitter(h_dim):
        # for unidirectional, b1, b2
        # for bidirectional, fw_b1, fw_b2, bw_b1, bw_b2
        splitter = [h_dim*num_gates, h_dim*num_gates] * multiplier
        splitter = splitter[0:-1]
        return np.cumsum(splitter)

    # First pass: copy the input (W) and recurrent (H) weights layer by layer,
    # stacking a new layer on the converted function after each one.
    offset = 0
    layer_input_size = input_size
    for layer in range(num_layers):
        layer_size = (layer_input_size + hidden_size) * hidden_size * num_gates * multiplier
        layer_param = param[offset:offset+layer_size]
        layer_name = str(layer)
        if bidirectional:
            fw_Wt, fw_Ht, bw_Wt, bw_Ht = np.split(layer_param, _get_cudnn_rnn_weight_splitter(layer_input_size, hidden_size))
            fw_cell = noncudnn_func.find_by_name(rnn_name+'_fw'+layer_name, -1)
            bw_cell = noncudnn_func.find_by_name(rnn_name+'_bw'+layer_name, -1)

            # Each flat section is reshaped to (num_gates*hidden_size, -1),
            # transposed, and gate-reordered before being assigned.
            fw_cell.W.value = np.ascontiguousarray(_adjust_gate_order(fw_Wt.reshape(num_gates*hidden_size, -1).transpose()))
            fw_cell.H.value = np.ascontiguousarray(_adjust_gate_order(fw_Ht.reshape(num_gates*hidden_size, -1).transpose()))
            bw_cell.W.value = np.ascontiguousarray(_adjust_gate_order(bw_Wt.reshape(num_gates*hidden_size, -1).transpose()))
            bw_cell.H.value = np.ascontiguousarray(_adjust_gate_order(bw_Ht.reshape(num_gates*hidden_size, -1).transpose()))
        else:
            Wt, Ht = np.split(layer_param, _get_cudnn_rnn_weight_splitter(layer_input_size, hidden_size))
            cell = noncudnn_func.find_by_name(rnn_name+'_'+layer_name, -1)
            cell.W.value = np.ascontiguousarray(_adjust_gate_order(Wt.reshape(num_gates*hidden_size, -1).transpose()))
            cell.H.value = np.ascontiguousarray(_adjust_gate_order(Ht.reshape(num_gates*hidden_size, -1).transpose()))

        offset += layer_size
        # Layers after the first take the previous layer's output as input.
        layer_input_size = hidden_size * multiplier

        if layer != num_layers - 1:
            noncudnn_func = rnn_lambda(noncudnn_func.output, str(layer+1))

    # Second pass: the biases follow all weights in the flat parameter. Each
    # cell has two bias vectors there, which are summed into the single bias
    # of the converted cell.
    for layer in range(num_layers):
        layer_size = 2 * hidden_size * num_gates * multiplier
        layer_param = param[offset:offset+layer_size]
        layer_name = str(layer)
        if bidirectional:
            fw_b1, fw_b2, bw_b1, bw_b2 = np.split(layer_param, _get_cudnn_rnn_bias_splitter(hidden_size))
            fw_cell = noncudnn_func.find_by_name(rnn_name+'_fw'+layer_name, -1)
            bw_cell = noncudnn_func.find_by_name(rnn_name+'_bw'+layer_name, -1)
            fw_cell.b.value = _adjust_gate_order(fw_b1 + fw_b2).reshape(-1)
            bw_cell.b.value = _adjust_gate_order(bw_b1 + bw_b2).reshape(-1)
        else:
            b1, b2 = np.split(layer_param, _get_cudnn_rnn_bias_splitter(hidden_size))
            cell = noncudnn_func.find_by_name(rnn_name+'_'+layer_name, -1)
            cell.b.value = _adjust_gate_order(b1 + b2).reshape(-1)
        offset += layer_size

    return noncudnn_func
def create_rpn(conv_out, scaled_gt_boxes, im_info, cfg, add_loss_functions=True):
    '''
    Builds the region proposal network (RPN) for object detection described in the "Faster R-CNN" paper:
        Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun:
        "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks"

    Object detection proposals are produced by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").

    Args:
        conv_out:        The convolutional feature map, i.e. the output of the conv layers
                         of the pretrained classification network
        scaled_gt_boxes: The ground truth boxes as (x1, y1, x2, y2, label).
                         Coordinates are absolute pixels wrt. the input image.
        im_info:         A CNTK variable or constant containing
                         (pad_width, pad_height, scaled_image_width, scaled_image_height, orig_img_width, orig_img_height)
                         e.g. (1000, 1000, 1000, 600, 500, 300) for an original image of 500x300
                         that is scaled and padded to 1000x1000
        cfg:             The configuration dictionary. This function reads cfg["MODEL"], cfg["DATA"],
                         cfg["TRAIN"] and the attribute cfg.SIGMA_RPN_L1.
        add_loss_functions: If True the RPN losses are built and returned, otherwise None is
                         returned in place of the losses

    Returns:
        rpn_rois - the proposed ROIs
        rpn_losses - the losses (SmoothL1 loss for bbox regression plus cross entropy for objectness),
                     or None when add_loss_functions is falsy
    '''

    # RPN network
    # init = 'normal', initValueScale = 0.01, initBias = 0.1
    hidden_channels = cfg["MODEL"].RPN_NUM_CHANNELS
    shared_conv_layer = Convolution((3, 3), hidden_channels, activation=relu, pad=True, strides=1,
                                    init=normal(scale=0.01), init_bias=0.0)
    rpn_conv_3x3 = shared_conv_layer(conv_out)

    # objectness scores: 2(bg/fg) * 9(anchors) channels
    cls_score_layer = Convolution((1, 1), 18, activation=None, name="rpn_cls_score",
                                  init=normal(scale=0.01), init_bias=0.0)
    rpn_cls_score = cls_score_layer(rpn_conv_3x3)

    # box regression: 4(coords) * 9(anchors) channels
    bbox_pred_layer = Convolution((1, 1), 36, activation=None, name="rpn_bbox_pred",
                                  init=normal(scale=0.01), init_bias=0.0)
    rpn_bbox_pred = bbox_pred_layer(rpn_conv_3x3)

    # apply softmax to get (bg, fg) probabilities and reshape predictions back to grid of (18, H, W)
    score_shape = rpn_cls_score.shape
    num_predictions = int(score_shape[0] / 2)
    grouped_shape = (2, num_predictions, score_shape[1], score_shape[2])
    rpn_cls_score_rshp = reshape(rpn_cls_score, grouped_shape, name="rpn_cls_score_rshp")

    # the softmax over axis 0 is wrapped into its own block named 'rpn_cls_prob'
    softmax_input = cntk.placeholder()
    rpn_cls_sm = softmax(softmax_input, axis=0)
    rpn_cls_prob = cntk.as_block(rpn_cls_sm, [(softmax_input, rpn_cls_score_rshp)],
                                 'Softmax', 'rpn_cls_prob')
    rpn_cls_prob_reshape = reshape(rpn_cls_prob, score_shape, name="rpn_cls_prob_reshape")

    # proposal layer
    rpn_rois = create_proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, cfg)

    if not add_loss_functions:
        return rpn_rois, None

    # RPN targets
    # Note: rpn_cls_score is only handed to the anchor target layer so that it can
    # read the width and height of the conv feature map
    feat_stride = cfg["MODEL"].FEATURE_STRIDE
    scale_lines = "\n - ".join(str(v) for v in cfg["DATA"].PROPOSAL_LAYER_SCALES)
    proposal_layer_params = "'feat_stride': {}\n'scales':\n - {}".format(feat_stride, scale_lines)

    train_cfg = cfg["TRAIN"]
    anchor_target_layer = AnchorTargetLayer(
        rpn_cls_score, scaled_gt_boxes, im_info,
        rpn_batch_size=train_cfg.RPN_BATCHSIZE,
        rpn_fg_fraction=train_cfg.RPN_FG_FRACTION,
        clobber_positives=train_cfg.RPN_CLOBBER_POSITIVES,
        positive_overlap=train_cfg.RPN_POSITIVE_OVERLAP,
        negative_overlap=train_cfg.RPN_NEGATIVE_OVERLAP,
        param_str=proposal_layer_params)
    atl = user_function(anchor_target_layer)
    atl_outputs = atl.outputs
    rpn_labels = atl_outputs[0]
    rpn_bbox_targets = atl_outputs[1]
    rpn_bbox_inside_weights = atl_outputs[2]

    # classification loss
    labels_ph = cntk.placeholder()
    scores_ph = cntk.placeholder()

    # 'keeps' masks the entries whose label is >= 0; it is applied to the labels
    # and again to the per-entry cross entropy below
    keeps = cntk.greater_equal(labels_ph, 0.0)
    fg_labels = element_times(labels_ph, keeps, name="fg_targets")
    bg_labels = minus(1, fg_labels, name="bg_targets")
    rpn_labels_ignore = splice(bg_labels, fg_labels, axis=0)
    rpn_ce = cross_entropy_with_softmax(scores_ph, rpn_labels_ignore, axis=0)
    rpn_loss_cls = element_times(rpn_ce, keeps)

    # The terms that are accounted for in the cls loss are those that have a label >= 0
    cls_num_terms = reduce_sum(keeps)
    cls_normalization_factor = 1.0 / cls_num_terms
    normalized_rpn_cls_loss = reduce_sum(rpn_loss_cls) * cls_normalization_factor

    cls_bindings = [(labels_ph, rpn_labels), (scores_ph, rpn_cls_score_rshp)]
    reduced_rpn_loss_cls = cntk.as_block(normalized_rpn_cls_loss, cls_bindings,
                                         'CE_with_ignore', 'norm_rpn_cls_loss')

    # regression loss
    bbox_pred_ph = cntk.placeholder()
    bbox_targets_ph = cntk.placeholder()
    bbox_weights_ph = cntk.placeholder()
    rpn_loss_bbox = SmoothL1Loss(cfg.SIGMA_RPN_L1, bbox_pred_ph, bbox_targets_ph, bbox_weights_ph, 1.0)

    # The bbox loss is normalized by the rpn batch size
    bbox_normalization_factor = 1.0 / train_cfg.RPN_BATCHSIZE
    normalized_rpn_bbox_loss = reduce_sum(rpn_loss_bbox) * bbox_normalization_factor

    bbox_bindings = [(bbox_pred_ph, rpn_bbox_pred),
                     (bbox_targets_ph, rpn_bbox_targets),
                     (bbox_weights_ph, rpn_bbox_inside_weights)]
    reduced_rpn_loss_bbox = cntk.as_block(normalized_rpn_bbox_loss, bbox_bindings,
                                          'SmoothL1Loss', 'norm_rpn_bbox_loss')

    # total RPN loss = classification loss + regression loss
    rpn_losses = plus(reduced_rpn_loss_cls, reduced_rpn_loss_bbox, name="rpn_losses")

    return rpn_rois, rpn_losses
def create_model_ext(input, ext_values, out_dims):
    '''
    Builds a VGG-style convolutional network whose dense head also consumes extra values.
    (VGG reference: https://www.cs.toronto.edu/~frossard/post/vgg16/)

    The network is four convolution stages, each followed by a 2x2 max pooling, then
    Dense(512) -> Dropout(0.5) -> splice with ext_values along axis 0 ->
    Dense(256) -> Dense(128) -> Dropout(0.5) -> Dense(out_dims) without activation.

    Args:
        input:      The node the first convolution is applied to
        ext_values: The node that is spliced (axis=0) onto the output of the first dropout layer
        out_dims:   The size handed to the final Dense layer

    Returns:
        The output of the final Dense layer
    '''
    # Each entry: ([(filter_shape, num_filters), ...], max pooling strides).
    # Only the first stage pools with strides (2, 2); the others use strides (1, 1).
    conv_stages = [
        ([((3, 3), 16), ((5, 5), 32)], (2, 2)),
        ([((3, 3), 32), ((7, 7), 64)], (1, 1)),
        ([((3, 3), 64), ((7, 7), 96)], (1, 1)),
        ([((3, 3), 96)], (1, 1)),
    ]

    features = input
    for conv_specs, pool_strides in conv_stages:
        for filter_shape, num_filters in conv_specs:
            conv_layer = Convolution(filter_shape, num_filters, init=glorot_uniform(),
                                     activation=relu, pad=True, strides=(1, 1))
            features = conv_layer(features)
        features = MaxPooling((2, 2), strides=pool_strides)(features)

    # dense head
    hidden = Dense(512, init=glorot_uniform())(features)
    hidden = Dropout(0.5)(hidden)

    # the extra values join the network here, after the first dropout
    hidden = splice(hidden, ext_values, axis=0)

    for width in (256, 128):
        hidden = Dense(width, init=glorot_uniform())(hidden)
    hidden = Dropout(0.5)(hidden)

    return Dense(out_dims, init=glorot_uniform(), activation=None)(hidden)
def gru_with_attentioin(dh, x):
    '''
    Runs att_gru on an attention-augmented, gated version of x.

    The result of attention_model(att_input, x) is spliced onto x, the spliced
    tensor is multiplied element-wise by sigmoid(times(spliced, Wg)), and the
    gated tensor is passed to att_gru together with dh.

    attention_model, att_input, Wg and att_gru are taken from the enclosing scope.
    The (misspelled) function name is kept unchanged because callers refer to it.

    Args:
        dh: Passed through unchanged as the first argument of att_gru
        x:  The input that is attended over, spliced, and gated

    Returns:
        Whatever att_gru(dh, gated_input) returns
    '''
    attended = attention_model(att_input, x)
    augmented = C.splice(x, attended)

    # gate computed from the augmented input itself
    gate = C.sigmoid(C.times(augmented, Wg))
    gated_input = C.element_times(augmented, gate)

    return att_gru(dh, gated_input)