def output_layer(self, attention_context, modeling_context):
    att_context = C.placeholder(shape=(8 * self.hidden_dim,))
    mod_context = C.placeholder(shape=(2 * self.hidden_dim,))
    # output layer
    start_logits = C.layers.Dense(1, name='out_start')(C.dropout(
        C.splice(mod_context, att_context), self.dropout))
    if self.two_step:
        # hard pointer: summarize the modeling context at the hardmax start position
        start_hardmax = seq_hardmax(start_logits)
        att_mod_ctx = C.sequence.last(
            C.sequence.gather(mod_context, start_hardmax))
    else:
        # soft pointer: probability-weighted sum of the modeling context
        start_prob = C.softmax(start_logits)
        att_mod_ctx = C.sequence.reduce_sum(mod_context * start_prob)
    att_mod_ctx_expanded = C.sequence.broadcast_as(att_mod_ctx, att_context)
    end_input = C.splice(att_context, mod_context, att_mod_ctx_expanded,
                         mod_context * att_mod_ctx_expanded)
    m2 = OptimizedRnnStack(self.hidden_dim,
                           bidirectional=True,
                           use_cudnn=self.use_cudnn,
                           name='output_rnn')(end_input)
    end_logits = C.layers.Dense(1, name='out_end')(C.dropout(
        C.splice(m2, att_context), self.dropout))
    return C.as_block(C.combine([start_logits, end_logits]),
                      [(att_context, attention_context),
                       (mod_context, modeling_context)],
                      'output_layer', 'output_layer')
def test_op_dropout_with_explicit_seed(device_id, precision):
    from cntk import combine, dropout, input
    value = np.ones(shape=(10, 10), dtype=PRECISION_TO_TYPE[precision])
    a = input(shape=value.shape,
              dtype=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
              needs_gradient=True,
              name='a')
    seed = 123
    dropout_nodes = [
        dropout(a, dropout_rate=0.5, seed=seed),
        dropout(a, dropout_rate=0.5, seed=seed),
        dropout(a, dropout_rate=0.5, seed=seed + 1),
        dropout(a, dropout_rate=0.5)
    ]
    value.shape = (1, 1) + value.shape
    forward_input = {a: value}
    results = []
    for node in dropout_nodes:
        forward, backward = cntk_eval(node, forward_input, precision,
                                      cntk_device(device_id),
                                      backward_pass=True)
        results.append(forward[node.output])
    assert np.allclose(results[0], results[1])
    assert not np.allclose(results[0], results[2])
    assert not np.allclose(results[0], results[3])
def output_layer(self, attention_context, modeling_context):
    att_context = C.placeholder()
    mod_context = C.placeholder()
    # output layer [#,c][1]
    start_logits = C.layers.Dense(1, name='out_start')(C.dropout(
        C.splice(mod_context, att_context), self.dropout))
    start_logits = C.sequence.softmax(start_logits)
    start_hardmax = seq_hardmax(start_logits)  # [000010000]
    att_mod_ctx = C.sequence.last(
        C.sequence.gather(mod_context, start_hardmax))  # [#][2*hidden_dim]
    att_mod_ctx_expanded = C.sequence.broadcast_as(att_mod_ctx, att_context)
    end_input = C.splice(att_context, mod_context, att_mod_ctx_expanded,
                         mod_context * att_mod_ctx_expanded)  # [#,c][14*hidden_dim]
    m2 = OptimizedRnnStack(self.hidden_dim,
                           bidirectional=True,
                           use_cudnn=self.use_cudnn,
                           name='output_rnn')(end_input)
    end_logits = C.layers.Dense(1, name='out_end')(C.dropout(
        C.splice(m2, att_context), self.dropout))
    end_logits = C.sequence.softmax(end_logits)
    return C.as_block(C.combine([start_logits, end_logits]),
                      [(att_context, attention_context),
                       (mod_context, modeling_context)],
                      'output_layer', 'output_layer')
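The two output_layer variants above differ in how the modeling context is summarized at the predicted answer start: a hard pointer (seq_hardmax followed by C.sequence.gather and C.sequence.last) versus a soft, probability-weighted sum. Below is a minimal sketch of the soft pointer using only stock CNTK sequence ops; the variable names and the dimension 4 are illustrative stand-ins, not from the source:

import cntk as C

# Stand-ins for the modeling context (one vector per position) and the
# per-position start scores produced by the 'out_start' Dense layer.
mod = C.sequence.input_variable(4)
logits = C.sequence.input_variable(1)

# Soft pointer: normalize scores over the sequence axis, then take the
# probability-weighted sum of the modeling context.
start_prob = C.sequence.softmax(logits)
soft_summary = C.sequence.reduce_sum(mod * start_prob)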
def test_Dropout(tmpdir):
    data = np.asarray([[10, 20], [30, 40], [50, 60]], dtype=np.float32)
    model = C.dropout(data, 0.5)
    verify_no_input(model, tmpdir, 'Dropout_0')

    x = C.input_variable(data.shape)
    model = C.dropout(x, 0.5)
    verify_one_input(model, data, tmpdir, 'Dropout_1')
def test_Dropout(tmpdir):
    pytest.skip('Need to support new ONNX spec.')
    data = np.asarray([[10, 20], [30, 40], [50, 60]], dtype=np.float32)
    model = C.dropout(data, 0.5)
    verify_no_input(model, tmpdir, 'Dropout_0')

    x = C.input_variable(data.shape)
    model = C.dropout(x, 0.5)
    verify_one_input(model, data, tmpdir, 'Dropout_1')
def test_Dropout(tmpdir, dtype):
    with C.default_options(dtype=dtype):
        data = np.asarray([[10, 20], [30, 40], [50, 60]], dtype=dtype)
        model = C.dropout(data, 0.5)
        verify_no_input(model, tmpdir, 'Dropout_0')

        x = C.input_variable(data.shape)
        model = C.dropout(x, 0.5)
        verify_one_input(model, data, tmpdir, 'Dropout_1')
def test_op_dropout(shape, dropout_rate, device_id, precision):
    from cntk import dropout
    from cntk.utils import eval, sanitize_dtype_cntk, cntk_device

    count = 10
    resulted_non_zeros = 0

    # As the dropout node is stochastic, we run it a couple times and aggregate
    # over the results to get more stable tests.
    for i in range(count):
        value = np.ones(shape=shape, dtype=PRECISION_TO_TYPE[precision])

        a = I(shape=value.shape,
              data_type=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
              needs_gradient=True,
              name='a')

        dropout_node = dropout(a, dropout_rate=dropout_rate)

        value.shape = (1, 1) + value.shape
        forward_input = {a: value}

        forward, backward = eval(dropout_node, forward_input, precision,
                                 cntk_device(device_id), backward_pass=True)

        resulted_non_zeros += np.count_nonzero(forward[dropout_node.output])

    resulted_non_zeros /= count
    num_elements = np.multiply.reduce(shape)
    expected_non_zeros = num_elements * (1 - dropout_rate)
    max_off = 0.2 * num_elements

    assert abs(resulted_non_zeros - expected_non_zeros) < max_off
def test_op_dropout_bad_input(dropout_rate):
    from cntk import dropout

    a = I(shape=(1, 2), dtype='float', needs_gradient=True, name='a')

    with pytest.raises(ValueError):
        dropout_node = dropout(a, dropout_rate=dropout_rate)
def test_op_dropout(shape, dropout_rate, device_id, precision):
    from cntk import dropout

    count = 10
    resulted_non_zeros = 0

    # As the dropout node is stochastic, we run it a couple times and aggregate
    # over the results to get more stable tests.
    for i in range(count):
        value = np.ones(shape=shape, dtype=PRECISION_TO_TYPE[precision])

        a = I(shape=value.shape,
              dtype=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
              needs_gradient=True,
              name='a')

        dropout_node = dropout(a, dropout_rate=dropout_rate)

        value.shape = (1, 1) + value.shape
        forward_input = {a: value}

        forward, backward = cntk_eval(dropout_node, forward_input, precision,
                                      cntk_device(device_id),
                                      backward_pass=True)

        resulted_non_zeros += np.count_nonzero(forward[dropout_node.output])

    resulted_non_zeros /= count
    num_elements = np.multiply.reduce(shape)
    expected_non_zeros = num_elements * (1 - dropout_rate)
    max_off = 0.2 * num_elements

    assert abs(resulted_non_zeros - expected_non_zeros) < max_off
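The 20% slack in max_off above reflects the Bernoulli keep/drop decision made per element: with rate p over N elements the expected non-zero count is N(1-p), and averaging over count runs tightens the estimate. A quick NumPy-only illustration of the same bound (the seed and sizes are arbitrary):

import numpy as np

p, n = 0.5, 100 * 100
rng = np.random.RandomState(0)
mask = rng.uniform(size=n) >= p  # each element survives with probability 1 - p
non_zeros = np.count_nonzero(mask)
assert abs(non_zeros - n * (1 - p)) < 0.2 * n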
def test_changing_dropout_rate():
    from cntk import dropout, input

    resulted_non_zeros = 0
    shape = (100, 100)
    dtype = np.float32

    value = np.ones(shape=shape, dtype=dtype)
    a = input(shape=shape, needs_gradient=True, dtype=dtype)
    dropout_node = dropout(a, dropout_rate=0.1)

    value.shape = (1,) + value.shape
    for dropout_rate in [0.0, 0.25, 0.5, 0.78, 0.99999]:
        dropout_node.set_attribute('dropoutRate', dropout_rate)
        forward, _ = cntk_eval(dropout_node, {a: value}, dtype,
                               backward_pass=True)
        resulted_non_zeros = np.count_nonzero(forward[dropout_node.output])

        if dropout_rate == 0:
            assert resulted_non_zeros == value.size

        assert np.isclose(1 - dropout_rate,
                          resulted_non_zeros * 1.0 / value.size,
                          atol=0.01)
def test_op_dropout_bad_input(dropout_rate):
    from cntk import dropout
    from cntk.utils import eval, sanitize_dtype_cntk, cntk_device

    a = I(shape=(1, 2), data_type='float', needs_gradient=True, name='a')

    with pytest.raises(ValueError):
        dropout_node = dropout(a, dropout_rate=dropout_rate)
def test_op_dropout_with_explicit_seed(device_id, precision):
    from cntk import combine, dropout
    value = np.ones(shape=(100, 100), dtype=PRECISION_TO_TYPE[precision])

    a = C.input_variable(shape=value.shape,
                         dtype=sanitize_dtype_cntk(PRECISION_TO_TYPE[precision]),
                         needs_gradient=True,
                         name='a')

    seed = 123

    dropout_nodes = [
        dropout(a, dropout_rate=0.5, seed=seed),
        dropout(a, dropout_rate=0.5, seed=seed),
        dropout(a, dropout_rate=0.5, seed=seed + 1),
        dropout(a, dropout_rate=0.5)
    ]

    cloned_nodes = [x.clone('clone') for x in dropout_nodes]

    value.shape = (1, 1) + value.shape

    results = []
    for node in dropout_nodes + cloned_nodes:
        forward_input = {node.inputs[0]: value}
        forward, backward = cntk_eval(node, forward_input, precision,
                                      cntk_device(device_id),
                                      backward_pass=True)
        results.append(forward[node.output])

    # Identical seeds produce identical masks; a different or implicit seed
    # produces a different mask.
    assert np.allclose(results[0], results[1])
    assert not np.allclose(results[0], results[2])
    assert not np.allclose(results[0], results[3])

    # Clones inherit the RNG seed and reproduce the originals' masks.
    clones = results[len(dropout_nodes):]
    for i in range(len(clones)):
        assert np.allclose(results[i], clones[i])
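The seeded test above goes through the shared cntk_eval helper. Below is a self-contained sketch of the same reproducibility check against the public CNTK 2.x API; note that the stochastic mask is only applied when the forward pass retains outputs for a backward pass, which is the same reason the tests pass backward_pass=True:

import cntk as C
import numpy as np

x = C.input_variable((10,), needs_gradient=True)
d1 = C.dropout(x, dropout_rate=0.5, seed=123)
d2 = C.dropout(x, dropout_rate=0.5, seed=123)

data = np.ones((1, 10), dtype=np.float32)

# Keep outputs for backprop so the stochastic mask is actually applied.
_, out1 = d1.forward({x: data}, d1.outputs, d1.outputs)
_, out2 = d2.forward({x: data}, d2.outputs, d2.outputs)

# Same explicit seed -> identical masks.
assert np.allclose(out1[d1.output], out2[d2.output])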
def test_set_dropout_rate_attribute():
    from cntk import dropout, input
    from math import pi

    dropout_node = dropout(input(1), dropout_rate=0.3)
    key = 'dropoutRate'

    root = dropout_node.root_function
    assert np.isclose(root.attributes[key], 0.3)

    root.set_attribute(key, 0.4)
    assert np.isclose(root.attributes[key], 0.4)

    dropout_node.set_attribute(key, 0.777)
    assert np.isclose(root.attributes[key], 0.777)

    dropout_node.set_attribute(key, pi)
    assert np.isclose(root.attributes[key], pi)
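Because set_attribute('dropoutRate', ...) updates the node in place, the rate can be annealed during training without rebuilding the graph. A minimal sketch under that assumption; the schedule values and the seed are illustrative:

import cntk as C
import numpy as np

x = C.input_variable((100,), needs_gradient=True)
drop = C.dropout(x, dropout_rate=0.5, seed=98052)
data = np.ones((1, 100), dtype=np.float32)

# Anneal the dropout rate over "epochs"; the existing node is updated in place.
for epoch, rate in enumerate([0.5, 0.25, 0.1]):
    drop.set_attribute('dropoutRate', rate)
    _, out = drop.forward({x: data}, drop.outputs, drop.outputs)
    kept = np.count_nonzero(out[drop.output]) / data.size
    print('epoch %d: rate=%.2f, kept fraction ~%.2f' % (epoch, rate, kept))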
def test_dropout_random_mask_is_recomputed_on_forward_pass():
    from cntk import dropout, input

    shape = (100, 100)
    dtype = np.float32

    value = np.ones(shape=shape, dtype=dtype)
    a = input(shape=shape, needs_gradient=True, dtype=dtype)
    dropout_node = dropout(a, dropout_rate=0.1)
    network = dropout_node + constant(0)

    value.shape = (1,) + value.shape

    _, forward = network.forward({a: value}, network.outputs, network.outputs)
    non_zeros_1 = forward[network.output] > 0.0

    _, forward = network.forward({a: value}, network.outputs, network.outputs)
    non_zeros_2 = forward[network.output] > 0.0

    # A fresh random mask is drawn on every forward pass.
    assert not (non_zeros_1 == non_zeros_2).all()
def output_layer(self, embed, attention_context, model_context, aw,
                 q_processed, c_processed, cw):
    cw_ph = C.placeholder()
    att_context = C.placeholder(shape=(8 * self.hidden_dim,))
    query_processed = C.placeholder(shape=(2 * self.hidden_dim,))
    context_processed = C.placeholder(shape=(2 * self.hidden_dim,))
    mod_context = C.placeholder(shape=(2 * self.hidden_dim,))
    a_onehot = C.placeholder(shape=(self.vocab_size + 1,))

    start_logits = C.layers.Dense(1, name='out_start')(C.dropout(
        C.splice(mod_context, att_context), self.dropout))
    start_hardmax = seq_hardmax(start_logits)
    att_mod_ctx = C.sequence.last(C.sequence.gather(mod_context, start_hardmax))
    att_mod_ctx_expanded = C.sequence.broadcast_as(att_mod_ctx, att_context)
    end_input = C.splice(att_context, mod_context, att_mod_ctx_expanded,
                         mod_context * att_mod_ctx_expanded)
    m2 = OptimizedRnnStack(self.hidden_dim, bidirectional=True,
                           use_cudnn=self.use_cudnn,
                           name='output_rnn')(end_input)
    end_logits = C.layers.Dense(1, name='out_end')(C.dropout(
        C.splice(m2, att_context), self.dropout))

    start_flag = C.hardmax(start_logits)
    end_flag = C.hardmax(end_logits)

    def create_model():
        # Encoder: (input*) --> (h0, c0)
        # Create multiple layers of LSTMs by passing the output of the i-th
        # layer to the (i+1)-th layer as its input
        with C.layers.default_options(enable_self_stabilization=True,
                                      go_backwards=False):
            LastRecurrence = C.layers.Recurrence
            encode = C.layers.Sequential([
                C.layers.Stabilizer(),
                OptimizedRnnStack(self.hidden_dim, return_full_state=True),
            ])
            encode_c = C.layers.Sequential([
                C.layers.Stabilizer(),
                OptimizedRnnStack(self.hidden_dim, return_full_state=True),
            ])

        # Decoder: (history*, input*) --> unnormalized_word_logp*
        # where history is one of these, delayed by 1 step and <s> prepended:
        #  - training: labels
        #  - testing: its own output hardmax(z) (greedy decoder)
        with C.layers.default_options(enable_self_stabilization=True):
            # sub-layers
            stab_in = C.layers.Stabilizer()
            rec_blocks = [C.layers.LSTM(self.hidden_dim)
                          for i in range(self.num_layers)]
            stab_out = C.layers.Stabilizer()
            proj_out = C.layers.Dense(self.vocab_size + 1, name='out_proj')
            # attention model :: (h_enc*, h_dec) -> (h_dec augmented)
            attention_model = C.layers.AttentionModel(self.attention_dim,
                                                      name='attention_model')
            hstate_dense = C.layers.Dense(self.hidden_dim, activation=C.tanh,
                                          input_rank=1)
            cstate_dense = C.layers.Dense(self.hidden_dim, activation=C.tanh,
                                          input_rank=1)
            W_dense = C.layers.Dense(2 * self.hidden_dim, input_rank=1)
            U_dense = C.layers.Dense(2 * self.hidden_dim, input_rank=1)
            V_dense = C.layers.Dense(2 * self.hidden_dim, input_rank=1)
            maxout = C.layers.MaxPooling((2,), strides=2)

            # layer function
            @C.Function
            def decode(history, q, c, start_logits, end_logits):
                q = encode(q)
                c = encode_c(C.splice(c, start_logits, end_logits, axis=0))
                r = history
                r = stab_in(r)
                q_last_h = C.sequence.last(q.outputs[0])
                q_last_c = C.sequence.last(q.outputs[1])
                c_last_h = C.sequence.last(c.outputs[0])
                c_last_c = C.sequence.last(c.outputs[1])
                initial_hstate = hstate_dense(C.splice(q_last_h, c_last_h))
                initial_cstate = cstate_dense(C.splice(q_last_c, c_last_c))

                rec_block = rec_blocks[0]  # LSTM(hidden_dim)

                @C.Function
                def find_embed(x):
                    gx, ngx = C.slice(x, 0, 0, self.wg_dim), \
                              C.slice(x, 0, self.wg_dim, self.vocab_size)
                    return embed(gx, ngx)

                # :: (dh, dc, x) -> (h, c)
                @C.Function
                def lstm_with_attention(dh, dc, r, x):
                    history_embed = find_embed(x)
                    h_att = attention_model(c.outputs[0], dh)
                    q_att = attention_model(q.outputs[0], dh)
                    att = C.splice(h_att, q_att)
                    x = C.splice(x, att)
                    x, dc = rec_block(dh, dc, x).outputs
                    # 0*r is a hack because cntk freaks out when r is not used.
                    r = U_dense(att) + W_dense(history_embed) + V_dense(x) + 0 * r
                    # NOTE: a bug appears when W_dense is added first:
                    # r = W_dense(embed(gx, ngx)) + U_dense(att) + V_dense(x) + 0*r
                    return x, dc, r

                _, _, r = C.layers.RecurrenceFrom(
                    lstm_with_attention, return_full_state=True)(
                        initial_hstate, initial_cstate,
                        C.Constant(np.zeros(2 * self.hidden_dim)), r).outputs
                r = maxout(r)
                r = stab_out(r)
                r = proj_out(r)
                # r = C.softmax(r)
                r = C.layers.Label('out_proj_out')(r)
                return r

        return decode

    def create_model_train(s2smodel):
        # model used in training (history is known from labels)
        # note: the labels must NOT contain the initial <s>
        @C.Function
        def model_train(labels, q, c, start_logits, end_logits):
            # (input*, labels*) --> (word_logp*)
            # The input to the decoder always starts with the special label
            # sequence start token. Then, use the previous value of the label
            # sequence (for training) or the output (for execution).
            past_labels = C.layers.Delay(initial_state=self.sentence_start)(labels)
            return s2smodel(past_labels, q, c, start_logits, end_logits)
        return model_train

    def create_model_greedy(s2smodel):
        # model used in (greedy) decoding (inference); history is the
        # decoder's own output
        @C.Function
        def model_greedy(q, c, start_logits, end_logits):
            # (input*) --> (word_sequence*)
            # Decoding is an unfold() operation starting from sentence_start.
            # We must transform s2smodel (history*, input* -> word_logp*) into
            # a generator (history* -> output*) which holds 'input' in its
            # closure.
            unfold = C.layers.UnfoldFrom(
                lambda history: s2smodel(history, q, c, start_logits,
                                         end_logits) >> C.hardmax,
                # stop once sentence_end_index was the max-scoring output
                until_predicate=lambda w: w[..., self.sentence_end_index],
                length_increase=self.sentence_max_length)
            return unfold(initial_state=self.sentence_start,
                          dynamic_axes_like=c)
        return model_greedy

    s2smodel = create_model()
    model_train = create_model_train(s2smodel)(
        a_onehot, query_processed, context_processed, start_logits, end_logits)
    model_greedy_raw = create_model_greedy(s2smodel)(
        query_processed, context_processed, start_logits, end_logits)
    model_greedy = C.argmax(model_greedy_raw, 0)
    context = C.argmax(cw_ph, 0)
    return C.as_block(
        C.combine((model_train, model_greedy, start_logits, end_logits,
                   context)),
        [(att_context, attention_context), (mod_context, model_context),
         (a_onehot, aw), (query_processed, q_processed),
         (context_processed, c_processed), (cw_ph, cw)],
        'attention_layer', 'attention_layer')
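The training wrapper in the snippet above implements teacher forcing: the label sequence is delayed one step so each decoder step conditions on the previous gold label, with the sentence-start token filling the first step. A minimal sketch of just that mechanism with stock CNTK layers; the vocabulary size of 3 and the one-hot start token are illustrative, not from the source:

import cntk as C
import numpy as np

labels = C.sequence.input_variable(3)  # one-hot labels over a toy 3-word vocab
sentence_start = C.Constant(np.array([1, 0, 0], dtype=np.float32))  # <s>

# Shift labels right by one step; the first step sees the start token instead.
past_labels = C.layers.Delay(initial_state=sentence_start)(labels)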
def test_dropout_attributes():
    x = C.input_variable((1, 5, 5))
    f = C.dropout(x, 0.5, 42)
    d = f.root_function.attributes
    expected = {'dropoutRate': 0.5, 'rngSeed': 42, 'rngOffset': 0}
    _check(expected, d)
def test_grad_with_no_arguments_needing_gradients():
    x = input(10)
    z = dropout(x, .4)
    with pytest.raises(ValueError):
        _, result = z.grad({x: [np.array([5] * 150, "float32").reshape(15, 10)]},
                           outputs=[z])
def test_grad_with_no_arguments_needing_gradients():
    x = C.input_variable(10)
    z = dropout(x, .4)
    with pytest.raises(ValueError):
        _, result = z.grad({x: [np.array([5] * 150, "float32").reshape(15, 10)]},
                           outputs=[z])
def test_dropout_attributes():
    x = C.input((1, 5, 5))
    f = C.dropout(x, 0.5)
    d = f.root_function.attributes
    expected = {'dropoutRate': 0.5}
    _check(expected, d)
def test_dropout_attributes():
    x = C.input_variable((1, 5, 5))
    f = C.dropout(x, 0.5)
    d = f.root_function.attributes
    expected = {'dropoutRate': 0.5}
    _check(expected, d)