def conv_model(para):
    for i in range(0, HDIM):
        f = Convolution((3, VEC_DIM), activation=C.relu)
        if i == 0:
            pp = C.reduce_max(f(para))
        else:
            pp = splice(pp, C.reduce_max(f(para)))
    h3 = dense_model(pp)
    return h3
def _func(x):
    input_ph = C.placeholder()
    ph = C.placeholder()
    onehot_value = C.one_hot(ph, 262)
    x1 = C.times(onehot_value, self.char_embed)  # [#,*][50,16]
    # x2 = self.convs[0](x1) # [#,*][32,50,1]
    convs_res = []
    for i in range(self.filter_num):
        conv_res = self.convs[i](x1)
        convs_res.append(C.reshape(C.reduce_max(conv_res, axis=1), (-1,)))
    token_embed = C.splice(*convs_res)  # [#,*][2048]
    tmp_res = token_embed
    for i in range(self.highway_num):
        tmp_res = self.highways[i](tmp_res)
    highway_out = tmp_res  # [#,*][2048]
    proj_out = self.proj(highway_out)  # [#,*][512]
    if not require_train:
        res = proj_out.clone(C.CloneMethod.freeze, {ph: input_ph})
    else:
        res = proj_out.clone(C.CloneMethod.clone, {ph: input_ph})
    return C.as_block(res, [(input_ph, x)], 'elmo_char_encoder', 'elmo_char_encoder')
def test_ReduceMax(tmpdir, dtype):
    with C.default_options(dtype=dtype):
        data = np.array(
            [[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]],
            dtype=dtype)
        model = C.reduce_max(data, 0)
        verify_no_input(model, tmpdir, 'ReduceMax_0')
def cross_entropy_with_full_softmax(
        output,         # Node providing the output of the lstm layers
        target_vector,  # Node providing the expected labels
        sv_dim,
        vocab_dim
):
    sv_vector = output.outputs[3]
    z = output.outputs[0]
    zT = C.times_transpose(z, target_vector)

    # cross entropy loss with softmax function
    ce = -C.log(zT)
    # the error
    zMax = C.reduce_max(z)
    error = C.less(zT, zMax)
    ce = sequence.reduce_sum(ce)

    # discourages the network from turning more than one gate off in a single time step.
    sumc = C.abs(C.sequence.slice(sv_vector, 1, 0) - C.sequence.slice(sv_vector, 0, -1))
    sumc = sequence.reduce_sum(0.0001 * C.pow(100.0, sumc))
    # ce += sumc

    # penalise generated utterances that failed to render all the required slots
    sumc += C.abs(C.sequence.last(sv_vector))
    sumc += C.abs(C.sequence.first(sv_vector) - output.outputs[4])
    sumc = C.reduce_sum(sumc)
    ce = C.reduce_sum(ce)
    ce += sumc
    return ce, error
def cross_entropy_with_sampled_softmax(
        hidden_vector,           # Node providing the output of the recurrent layers
        target_vector,           # Node providing the expected labels (as sparse vectors)
        vocab_dim,               # Vocabulary size
        hidden_dim,              # Dimension of the hidden vector
        num_samples,             # Number of samples to use for sampled softmax
        sampling_weights,        # Node providing weights to be used for the weighted sampling
        allow_duplicates=False   # Boolean flag to control whether to use sampling with replacement (allow_duplicates == True) or without replacement.
):
    bias = C.layers.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.layers.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(
        sampling_weights, num_samples, allow_duplicates)  # sparse matrix [num_samples * vocab_size]
    if use_sparse:
        sample_selector = sample_selector_sparse
    else:
        # Note: Sampled softmax with dense data is only supported for debugging purposes.
        # It might easily run into memory issues as the matrix 'I' below might be quite large.
        # In case we want a dense representation for all data we have to convert the sample selector.
        I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
        sample_selector = C.times(sample_selector_sparse, I)

    inclusion_probs = C.random_sample_inclusion_frequency(
        sampling_weights, num_samples, allow_duplicates)  # dense row [1 * vocab_size]
    log_prior = C.log(inclusion_probs)  # dense row [1 * vocab_dim]

    print("hidden_vector: " + str(hidden_vector.shape))
    wS = C.times(sample_selector, weights, name='wS')  # [num_samples * hidden_dim]
    print("ws:" + str(wS.shape))
    zS = C.times_transpose(wS, hidden_vector, name='zS1') + C.times(
        sample_selector, bias, name='zS2') - C.times_transpose(
        sample_selector, log_prior, name='zS3')  # [num_samples]

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(target_vector, weights, name='wT')  # [1 * hidden_dim]
    zT = C.times_transpose(wT, hidden_vector, name='zT1') + C.times(
        target_vector, bias, name='zT2') - C.times_transpose(
        target_vector, log_prior, name='zT3')  # [1]

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    # We don't check whether any of the classes in the random samples coincides with the true label,
    # so it might happen that the true class is counted twice in the normalizing denominator of sampled softmax.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape=(vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)

    return (z, cross_entropy_on_samples, error_on_samples)
def test_op_reduce_max(input_data, axis_data, expected_result, expected_gradient, device_id, precision):
    a = I([input_data])

    # reduce along the given axis using the operator
    result = C.reduce_max(a, axis=axis_data)

    unittest_helper(result, None, [[expected_result]], device_id=device_id,
                    precision=precision, clean_up=True, backward_pass=False)

    unittest_helper(result, None, [[expected_gradient]], device_id=device_id,
                    precision=precision, clean_up=True, backward_pass=True, input_node=a)
def charcnn(self, x):
    embedding = C.layers.Embedding(self.char_emb_dim)
    dropout = C.layers.Dropout(self.dropout)
    conv2d = C.layers.Convolution2D((5, self.char_emb_dim),
                                    self.convs,
                                    activation=C.relu,
                                    init=C.glorot_uniform(),
                                    bias=True,
                                    init_bias=0,
                                    name='charcnn_conv')
    conv_out = C.layers.Sequential([embedding, dropout, conv2d])(x)
    return C.reduce_max(conv_out, axis=1)
def cost_func(training_mode, prediction, target):
    '''
    We use cross entropy in most modes, except for the multi-label mode,
    which requires treating multiple labels exactly the same.
    '''
    train_loss = None
    if training_mode == 'majority' or training_mode == 'probability' or training_mode == 'crossentropy':
        # Cross entropy.
        train_loss = ct.negate(ct.reduce_sum(ct.element_times(target, ct.log(prediction)), axis=-1))
    elif training_mode == 'multi_target':
        train_loss = ct.negate(ct.log(ct.reduce_max(ct.element_times(target, prediction), axis=-1)))

    return train_loss
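# A small standalone sanity check (values made up for illustration; it assumes the
# snippet's `import cntk as ct` alias) of the 'multi_target' branch above: with a
# multi-hot target, the loss reduces to -log of the highest predicted probability
# among the labelled classes.
import numpy as np
import cntk as ct

prediction = ct.input_variable(4)
target = ct.input_variable(4)
multi_target_loss = ct.negate(ct.log(ct.reduce_max(ct.element_times(target, prediction), axis=-1)))

p = np.array([[0.1, 0.6, 0.2, 0.1]], dtype=np.float32)
t = np.array([[0.0, 1.0, 1.0, 0.0]], dtype=np.float32)
print(multi_target_loss.eval({prediction: p, target: t}))  # -log(0.6) ~= 0.51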
def charcnn(self, x):
    conv_out = C.layers.Sequential([
        C.layers.Dropout(self.dropout),
        C.layers.Convolution2D((5, self.char_emb_dim),
                               self.convs,
                               activation=C.relu,
                               init=C.glorot_uniform(),
                               bias=True,
                               init_bias=0,
                               name='charcnn_conv')
    ])(x)
    return C.reduce_max(conv_out, axis=1)  # workaround cudnn failure in GlobalMaxPooling
def cross_entropy_with_sampled_softmax(
        hidden_vector,          # Node providing the output of the recurrent layers
        target_vector,          # Node providing the expected labels (as sparse vectors)
        vocab_dim,              # Vocabulary size
        hidden_dim,             # Dimension of the hidden vector
        num_samples,            # Number of samples to use for sampled softmax
        sampling_weights,       # Node providing weights to be used for the weighted sampling
        allow_duplicates=False  # Boolean flag to control whether to use sampling with replacement (allow_duplicates == True) or without replacement.
):
    bias = C.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(sampling_weights, num_samples, allow_duplicates)  # sparse matrix [num_samples * vocab_size]
    if use_sparse:
        sample_selector = sample_selector_sparse
    else:
        # Note: Sampled softmax with dense data is only supported for debugging purposes.
        # It might easily run into memory issues as the matrix 'I' below might be quite large.
        # In case we want a dense representation for all data we have to convert the sample selector.
        I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
        sample_selector = C.times(sample_selector_sparse, I)

    inclusion_probs = C.random_sample_inclusion_frequency(sampling_weights, num_samples, allow_duplicates)  # dense row [1 * vocab_size]
    log_prior = C.log(inclusion_probs)  # dense row [1 * vocab_dim]

    print("hidden_vector: " + str(hidden_vector.shape))
    wS = C.times(sample_selector, weights, name='wS')  # [num_samples * hidden_dim]
    print("ws:" + str(wS.shape))
    zS = C.times_transpose(wS, hidden_vector, name='zS1') + C.times(sample_selector, bias, name='zS2') - C.times_transpose(sample_selector, log_prior, name='zS3')  # [num_samples]

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(target_vector, weights, name='wT')  # [1 * hidden_dim]
    zT = C.times_transpose(wT, hidden_vector, name='zT1') + C.times(target_vector, bias, name='zT2') - C.times_transpose(target_vector, log_prior, name='zT3')  # [1]

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    # We don't check whether any of the classes in the random samples coincides with the true label,
    # so it might happen that the true class is counted twice in the normalizing denominator of sampled softmax.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape=(vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)

    return (z, cross_entropy_on_samples, error_on_samples)
def attention_layer(self, context, query):
    q_processed = C.placeholder(shape=(2 * self.hidden_dim,))
    c_processed = C.placeholder(shape=(2 * self.hidden_dim,))

    # convert query's sequence axis to static
    qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

    # This part deserves some explanation.
    # It is the attention layer.
    # In the paper they use a 6 * dim dimensional vector;
    # here we split it in three parts because the different parts
    # participate in very different operations,
    # so W * [h; u; h.* u] becomes w1 * h + w2 * u + w3 * (h.*u)
    ws1 = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())
    ws2 = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())
    ws3 = C.parameter(shape=(1, 2 * self.hidden_dim), init=C.glorot_uniform())
    att_bias = C.parameter(shape=(), init=0)

    wh = C.times(c_processed, ws1)
    wu = C.reshape(C.times(qvw, ws2), (-1,))
    whu = C.reshape(
        C.reduce_sum(c_processed * C.sequence.broadcast_as(qvw * ws3, c_processed), axis=1),
        (-1,))
    S = wh + whu + C.sequence.broadcast_as(wu, c_processed) + att_bias

    # mask out values outside of Query, and fill in gaps with -1e+30 as a neutral value
    # for both reduce_log_sum_exp and reduce_max
    qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, c_processed)
    S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
    q_attn = C.reshape(C.softmax(S), (-1, 1))
    # q_attn = print_node(q_attn)
    c2q = C.reshape(
        C.reduce_sum(C.sequence.broadcast_as(qvw, q_attn) * q_attn, axis=0),
        (-1))

    max_col = C.reduce_max(S)
    c_attn = C.sequence.softmax(max_col)

    htilde = C.sequence.reduce_sum(c_processed * c_attn)
    q2c = C.sequence.broadcast_as(htilde, c_processed)
    q2c_out = c_processed * q2c

    att_context = C.splice(c_processed, c2q, c_processed * c2q, q2c_out)

    return C.as_block(att_context,
                      [(c_processed, context), (q_processed, query)],
                      'attention_layer',
                      'attention_layer')
def cross_entropy_with_full_softmax(
        hidden_vector,  # Node providing the output of the recurrent layers
        target_vector,  # Node providing the expected labels (as sparse vectors)
        vocab_dim,      # Vocabulary size
        hidden_dim      # Dimension of the hidden vector
):
    bias = C.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    z = C.reshape(C.times_transpose(weights, hidden_vector) + bias, (1, vocab_dim))
    zT = C.times_transpose(z, target_vector)
    ce = C.reduce_log_sum_exp(z) - zT
    zMax = C.reduce_max(z)
    error_on_samples = C.less(zT, zMax)

    return (z, ce, error_on_samples)
def cross_entropy_with_full_softmax(
        hidden_vector,  # Node providing the output of the recurrent layers
        target_vector,  # Node providing the expected labels (as sparse vectors)
        vocab_dim,      # Vocabulary size
        hidden_dim      # Dimension of the hidden vector
):
    bias = C.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    z = C.reshape(
        C.times_transpose(weights, hidden_vector) + bias, (1, vocab_dim))
    zT = C.times_transpose(z, target_vector)
    ce = C.reduce_log_sum_exp(z) - zT
    zMax = C.reduce_max(z)
    error_on_samples = C.less(zT, zMax)

    return (z, ce, error_on_samples)
def charcnn(self, x):
    '''
    @x: [I,w1,w2,w3,...]
    @kernel: [O,I,w1,w2,w3,...]
    @out: [O, ? depends on stride]
    '''
    conv_out = C.layers.Sequential([
        C.layers.Dropout(self.dropout),
        C.layers.Convolution2D((5, self.char_emb_dim),
                               self.convs,
                               activation=C.relu,
                               init=C.glorot_uniform(),
                               bias=True,
                               init_bias=0,
                               name='charcnn_conv')
    ])(x)
    return C.reduce_max(conv_out, axis=1)  # workaround cudnn failure in GlobalMaxPooling
def attention_layer(self, context, query, dimc, dimq, common_dim):
    q_processed = C.placeholder(shape=(dimq,))
    c_processed = C.placeholder(shape=(dimc,))

    # convert query's sequence axis to static
    qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

    # so W * [h; u; h.* u] becomes w1 * h + w2 * u + w4 * (h.*u)
    ws1 = C.parameter(shape=(dimc, 1), init=C.glorot_uniform())
    ws2 = C.parameter(shape=(dimq, 1), init=C.glorot_uniform())
    ws4 = C.parameter(shape=(1, common_dim), init=C.glorot_uniform())
    att_bias = C.parameter(shape=(), init=0)

    wh = C.times(c_processed, ws1)  # [#,c][1]
    wu = C.reshape(C.times(qvw, ws2), (-1,))  # [#][*]
    # qvw*ws4: [#][*,200], whu: [#,c][*]
    whu = C.reshape(C.reduce_sum(
        c_processed[:common_dim] *
        C.sequence.broadcast_as(qvw[:, :common_dim] * ws4, c_processed), axis=1), (-1,))
    S1 = wh + C.sequence.broadcast_as(wu, c_processed) + att_bias  # [#,c][*]
    qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, c_processed)
    S1 = C.element_select(qvw_mask_expanded, S1, C.constant(-1e+30))
    q_attn = C.reshape(C.softmax(S1), (-1, 1))  # [#,c][*,1]
    c2q = C.reshape(
        C.reduce_sum(C.sequence.broadcast_as(qvw, q_attn) * q_attn, axis=0), (-1))  # [#,c][200]

    max_col = C.reduce_max(S1)  # [#,c][1] max over the words in q
    c_attn = C.sequence.softmax(max_col)  # [#,c][1] softmax over every word in c

    htilde = C.sequence.reduce_sum(c_processed * c_attn)  # [#][200]
    q2c = C.sequence.broadcast_as(htilde, c_processed)  # [#,c][200]
    q2c_out = c_processed[:common_dim] * q2c[:common_dim]

    # original document, query representation, document-focus representation,
    # match representation, document-context representation
    att_context_reg = C.splice(c_processed, c2q, q2c_out,
                               c_processed[:common_dim] * c2q[:common_dim])
    res = C.combine(att_context_reg, C.reshape(q_attn, (-1,)))

    return C.as_block(res,
                      [(c_processed, context), (q_processed, query)],
                      'attention_layer',
                      'attention_layer')
def cross_entropy_with_sampled_softmax(
        hidden_vector,
        label_vector,
        vocab_dim,
        hidden_dim,
        num_samples,
        sampling_weights,
        allow_duplicates=False
):
    bias = C.layers.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.layers.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(sampling_weights, num_samples, allow_duplicates)
    sample_selector = sample_selector_sparse

    inclusion_probs = C.random_sample_inclusion_frequency(sampling_weights, num_samples, allow_duplicates)
    log_prior = C.log(inclusion_probs)

    wS = C.times(sample_selector, weights, name='wS')
    zS = C.times_transpose(wS, hidden_vector, name='zS1') + C.times(sample_selector, bias, name='zS2') - C.times_transpose(sample_selector, log_prior, name='zS3')

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(label_vector, weights, name='wT')
    zT = C.times_transpose(wT, hidden_vector, name='zT1') + C.times(label_vector, bias, name='zT2') - C.times_transpose(label_vector, log_prior, name='zT3')

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape=(vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)

    return (z, cross_entropy_on_samples, error_on_samples)
def test_ReduceMax(tmpdir):
    data = np.array(
        [[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]],
        dtype=np.float32)
    model = C.reduce_max(data, 0)
    verify_no_input(model, tmpdir, 'ReduceMax_0')
def test_ReduceMax(tmpdir):
    data = np.array([[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]], dtype=np.float32)
    model = C.reduce_max(data, 0)
    verify_no_input(model, tmpdir, 'ReduceMax_0')
def inner(a):
    perturbed = a + C.random.gumbel_like(a)
    sampled = C.equal(C.reduce_max(perturbed, axis=axis), perturbed, name=name)  # equivalent to hardmax(perturbed_x)
    return sampled
def inner(a):
    return C.equal(C.reduce_max(a, axis=axis), a)
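# A minimal usage sketch (made-up values) of the equal/reduce_max pairing used in the
# two `inner` closures above: only the maximal entry compares equal to the reduced
# maximum, so the result is a one-hot, hardmax-style indicator of the argmax (ties
# would yield more than one 1). The Gumbel-perturbed variant above turns the same
# trick into sampling from the softmax distribution.
import numpy as np
import cntk as C

x = C.input_variable(4)
hardmax_like = C.equal(C.reduce_max(x, axis=0), x)

print(hardmax_like.eval({x: np.array([[0.2, 1.7, 0.5, -3.0]], dtype=np.float32)}))
# -> [[0. 1. 0. 0.]]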
def test_ReduceMax(tmpdir, dtype):
    with C.default_options(dtype=dtype):
        data = np.array([[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]], dtype=dtype)
        model = C.reduce_max(data, 0)
        verify_no_input(model, tmpdir, 'ReduceMax_0')
def test_reduce_max():
    data = np.array(
        [[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]],
        dtype=np.float32)

    # This test is failing, bug must be fixed:
    # assert_cntk_ngraph_flat_equal(C.reduce_max([1, 0]))

    assert_cntk_ngraph_flat_equal(C.reduce_max([1, 0], 0))
    assert_cntk_ngraph_flat_equal(C.reduce_max([[1., 1.], [3., 5.]], 0))
    assert_cntk_ngraph_flat_equal(C.reduce_max([[1., 1.], [3., 5.]], 1))
    assert_cntk_ngraph_flat_equal(C.reduce_max([[1., 1.], [3., 5.]], -1))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, 0))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, 1))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, 2))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, -1))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, (0, 1)))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, (0, 2)))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, (1, 2)))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, (-1, -2)))
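# A quick worked example (outside any test harness) of what the reductions in the
# tests above compute; note that C.reduce_max keeps each reduced axis with size 1
# by default.
import numpy as np
import cntk as C

data = np.array(
    [[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]],
    dtype=np.float32)

print(C.reduce_max(data, 0).eval())       # [[[55. 1.] [60. 2.]]],     shape (1, 2, 2)
print(C.reduce_max(data, (1, 2)).eval())  # [[[20.]] [[40.]] [[60.]]], shape (3, 1, 1)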
def charcnn(self, x):
    conv_out = C.layers.Sequential([
        C.layers.Embedding(self.char_emb_dim),
        C.layers.Dropout(self.dropout),
        C.layers.Convolution2D((5, self.char_emb_dim),
                               self.convs,
                               activation=C.relu,
                               init=C.glorot_uniform(),
                               bias=True,
                               init_bias=0,
                               name='charcnn_conv')
    ])(x)
    return C.reduce_max(conv_out, axis=1)  # workaround cudnn failure in GlobalMaxPooling