Example #1
def conv_model(para):
    # HDIM, VEC_DIM and dense_model are defined elsewhere in the surrounding module.
    for i in range(0, HDIM):
        f = C.layers.Convolution((3, VEC_DIM), activation=C.relu)
        if i == 0:
            pp = C.reduce_max(f(para))
        else:
            pp = C.splice(pp, C.reduce_max(f(para)))

    h3 = dense_model(pp)
    return h3
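For orientation, the branch-and-splice pattern above (one max-over-time reduction per convolution branch, concatenated into a single feature vector) can be sketched in plain NumPy; the sizes below are made up for illustration, and HDIM, VEC_DIM and dense_model are assumed to come from the surrounding module:

import numpy as np

# Hypothetical sizes: 4 convolution branches over a sequence of length 10.
HDIM, SEQ_LEN = 4, 10

# One feature map per branch (what f(para) would produce along the sequence).
branch_outputs = [np.random.randn(SEQ_LEN) for _ in range(HDIM)]

# reduce_max collapses each branch to a single value; splice concatenates them.
pp = np.array([branch.max() for branch in branch_outputs])
print(pp.shape)  # (4,) -- one pooled feature per branch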
Example #2
        def _func(x):
            input_ph = C.placeholder()

            ph = C.placeholder()
            onehot_value = C.one_hot(ph,262)
            x1 = C.times(onehot_value, self.char_embed) # [#,*][50,16]
            # x2 = self.convs[0](x1) # [#,*][32,50,1]
            convs_res = []
            for i in range(self.filter_num):
                conv_res = self.convs[i](x1)
                convs_res.append(C.reshape(C.reduce_max(conv_res, axis=1),(-1,)))
            token_embed = C.splice(*convs_res) # [#,*][2048]
            
            tmp_res = token_embed
            for i in range(self.highway_num):
                tmp_res = self.highways[i](tmp_res)
            highway_out=tmp_res # [#,*][2048]
            proj_out = self.proj(highway_out) # [#,*][512]

            if not require_train:
                res = proj_out.clone(C.CloneMethod.freeze, {ph:input_ph})
            else:
                res = proj_out.clone(C.CloneMethod.clone, {ph:input_ph})
            return C.as_block(
                res,[(input_ph, x)], 'elmo_char_encoder', 'elmo_char_encoder'
            )
Example #3
def test_ReduceMax(tmpdir, dtype):
    with C.default_options(dtype=dtype):
        data = np.array(
            [[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]],
            dtype=dtype)
        model = C.reduce_max(data, 0)
        verify_no_input(model, tmpdir, 'ReduceMax_0')
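As a sanity check on the values being exported (plain NumPy, no CNTK needed), reducing the test array over axis 0 takes the element-wise maximum across the three 2x2 blocks; CNTK's reduce_max keeps the reduced axis as a singleton by default, so the exported model carries the same values with an extra leading axis of size 1:

import numpy as np

data = np.array(
    [[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]],
    dtype=np.float32)
print(np.max(data, axis=0))
# [[55.  1.]
#  [60.  2.]]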
Example #4
def cross_entropy_with_full_softmax(
    output,  # Node providing the output of the LSTM layers
    target_vector,  # Node providing the expected labels
    sv_dim,
    vocab_dim
    ):
    sv_vector = output.outputs[3]
    z = output.outputs[0]
    zT = C.times_transpose(z, target_vector)
    # cross entropy loss with softmax function
    ce = - C.log(zT)
    # the error 
    zMax = C.reduce_max(z)
    error = C.less(zT, zMax)
    ce = C.sequence.reduce_sum(ce)
    # discourages the network from turning more than one gate off in a single time step.
    sumc = C.abs(C.sequence.slice(sv_vector, 1, 0) - C.sequence.slice(sv_vector, 0, -1))
    sumc = C.sequence.reduce_sum(0.0001 * C.pow(100.0, sumc))
    #ce += sumc
    # penalise generated utterances that failed to render all the required slots
    sumc += C.abs(C.sequence.last(sv_vector))
    sumc += C.abs(C.sequence.first(sv_vector) - output.outputs[4])
    sumc = C.reduce_sum(sumc)
    ce = C.reduce_sum(ce)
    ce += sumc
    return ce, error
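Written out, the criterion above is the sequence cross entropy plus the SC-LSTM-style control-vector penalty. Reading off the code (not an authoritative derivation), with $\mathbf{z}_t$ the softmax output, $\mathbf{y}_t$ the one-hot target, $\mathbf{d}_t$ the control vector sv_vector at step $t$ of a sequence of length $T$, and $\mathbf{d}^{\ast}$ the required-slot vector output.outputs[4]:

\[
\mathcal{L} \;=\; -\sum_{t=1}^{T} \log\!\big(\mathbf{z}_t^{\top}\mathbf{y}_t\big)
\;+\; \sum_{j}\Big[\,\sum_{t=1}^{T-1} 10^{-4}\cdot 100^{\,|d_{t+1,j}-d_{t,j}|}
\;+\; |d_{T,j}| \;+\; |d_{1,j}-d^{\ast}_{j}|\Big]
\]

The returned error simply flags, per step, whether the target word's score $\mathbf{z}_t^{\top}\mathbf{y}_t$ falls strictly below the maximum entry of $\mathbf{z}_t$, i.e. whether the target is not the argmax.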
Example #5
def cross_entropy_with_sampled_softmax(
    hidden_vector,  # Node providing the output of the recurrent layers
    target_vector,  # Node providing the expected labels (as sparse vectors)
    vocab_dim,  # Vocabulary size
    hidden_dim,  # Dimension of the hidden vector
    num_samples,  # Number of samples to use for sampled softmax
    sampling_weights,  # Node providing weights to be used for the weighted sampling
    allow_duplicates=False  # Boolean flag to control whether to use sampling with replacement (allow_duplicates == True) or without replacement.
):
    bias = C.layers.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.layers.Parameter(shape=(vocab_dim, hidden_dim),
                                 init=C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(
        sampling_weights, num_samples,
        allow_duplicates)  # sparse matrix [num_samples * vocab_size]
    if use_sparse:  # use_sparse is a flag defined in the surrounding module
        sample_selector = sample_selector_sparse
    else:
        # Note: Sampled softmax with dense data is only supported for debugging purposes.
        # It might easily run into memory issues as the matrix 'I' below might be quite large.
        # In case we want to use a dense representation for all data, we have to convert the sample selector.
        I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
        sample_selector = C.times(sample_selector_sparse, I)

    inclusion_probs = C.random_sample_inclusion_frequency(
        sampling_weights, num_samples,
        allow_duplicates)  # dense row [1 * vocab_size]
    log_prior = C.log(inclusion_probs)  # dense row [1 * vocab_dim]

    print("hidden_vector: " + str(hidden_vector.shape))
    wS = C.times(sample_selector, weights,
                 name='wS')  # [num_samples * hidden_dim]
    print("ws:" + str(wS.shape))
    zS = C.times_transpose(wS, hidden_vector, name='zS1') + C.times(
        sample_selector, bias, name='zS2') - C.times_transpose(
            sample_selector, log_prior, name='zS3')  # [num_samples]

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(target_vector, weights, name='wT')  # [1 * hidden_dim]
    zT = C.times_transpose(wT, hidden_vector, name='zT1') + C.times(
        target_vector, bias, name='zT2') - C.times_transpose(
            target_vector, log_prior, name='zT3')  # [1]

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    # We don't check whether any of the classes in the random samples coincides with the true label, so it might happen that the true class is counted
    # twice in the normalizing denominator of sampled softmax.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape=(vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)
    return (z, cross_entropy_on_samples, error_on_samples)
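In formulas, with $h$ the hidden vector, $q_c$ the expected inclusion frequency of class $c$ returned by random_sample_inclusion_frequency, $T$ the true class and $S$ the drawn sample set, the scores and the training criterion built above are:

\[
z_c = w_c^{\top} h + b_c - \log q_c,
\qquad
\mathcal{L}_{\text{sampled}} = \log\Big(e^{z_T} + \sum_{s\in S} e^{z_s}\Big) - z_T .
\]

As the comment notes, the true class is not removed from $S$, so it may be counted twice in the normalizer; error_on_samples likewise only compares $z_T$ against the sampled scores, not the full vocabulary.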
Example #6
def test_op_reduce_max(input_data, axis_data, expected_result, expected_gradient, device_id, precision):

    a = I([input_data])


    # reduce along the given axis using the reduce_max operator
    result = C.reduce_max(a, axis=axis_data)

    unittest_helper(result, None, [[expected_result]], device_id=device_id,
                    precision=precision, clean_up=True, backward_pass=False)
    unittest_helper(result, None, [[expected_gradient]], device_id=device_id,
                    precision=precision, clean_up=True, backward_pass=True, input_node=a)
Example #8
 def charcnn(self, x):
     embedding = C.layers.Embedding(self.char_emb_dim)
     dropout = C.layers.Dropout(self.dropout)
     conv2d = C.layers.Convolution2D((5, self.char_emb_dim),
                                     self.convs,
                                     activation=C.relu,
                                     init=C.glorot_uniform(),
                                     bias=True,
                                     init_bias=0,
                                     name='charcnn_conv')
     conv_out = C.layers.Sequential([embedding, dropout, conv2d])(x)
     return C.reduce_max(conv_out, axis=1)
Example #9
def cost_func(training_mode, prediction, target):
    '''
    We use cross entropy in most modes, except for the multi-label mode, which requires treating
    multiple labels exactly the same.
    '''
    train_loss = None
    if training_mode == 'majority' or training_mode == 'probability' or training_mode == 'crossentropy': 
        # Cross Entropy.
        train_loss = ct.negate(ct.reduce_sum(ct.element_times(target, ct.log(prediction)), axis=-1))
    elif training_mode == 'multi_target':
        train_loss = ct.negate(ct.log(ct.reduce_max(ct.element_times(target, prediction), axis=-1)))

    return train_loss
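A small NumPy illustration of the multi_target branch (the vectors are made up): the loss only rewards the best-scored label among the active targets, so it does not force probability mass onto every correct label.

import numpy as np

target = np.array([0., 1., 1., 0.])          # two acceptable labels
prediction = np.array([0.1, 0.2, 0.6, 0.1])  # softmax output of the network

loss = -np.log(np.max(target * prediction))  # = -log(0.6)
print(loss)  # ~0.51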
Example #10
 def charcnn(self, x):
     conv_out = C.layers.Sequential([
         C.layers.Dropout(self.dropout),
         C.layers.Convolution2D((5, self.char_emb_dim),
                                self.convs,
                                activation=C.relu,
                                init=C.glorot_uniform(),
                                bias=True,
                                init_bias=0,
                                name='charcnn_conv')
     ])(x)
     return C.reduce_max(
         conv_out, axis=1)  # workaround cudnn failure in GlobalMaxPooling
Example #11
def cross_entropy_with_sampled_softmax(
    hidden_vector,           # Node providing the output of the recurrent layers
    target_vector,           # Node providing the expected labels (as sparse vectors)
    vocab_dim,               # Vocabulary size
    hidden_dim,              # Dimension of the hidden vector
    num_samples,             # Number of samples to use for sampled softmax
    sampling_weights,        # Node providing weights to be used for the weighted sampling
    allow_duplicates = False # Boolean flag to control whether to use sampling with replacement (allow_duplicates == True) or without replacement.
    ):
    bias = C.Parameter(shape = (vocab_dim, 1), init = 0)
    weights = C.Parameter(shape = (vocab_dim, hidden_dim), init = C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(sampling_weights, num_samples, allow_duplicates) # sparse matrix [num_samples * vocab_size]
    if use_sparse:
        sample_selector = sample_selector_sparse
    else:
        # Note: Sampled softmax with dense data is only supported for debugging purposes.
        # It might easily run into memory issues as the matrix 'I' below might be quite large.
        # In case we want to use a dense representation for all data, we have to convert the sample selector.
        I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
        sample_selector = C.times(sample_selector_sparse, I)

    inclusion_probs = C.random_sample_inclusion_frequency(sampling_weights, num_samples, allow_duplicates) # dense row [1 * vocab_size]
    log_prior = C.log(inclusion_probs) # dense row [1 * vocab_dim]


    print("hidden_vector: "+str(hidden_vector.shape))
    wS = C.times(sample_selector, weights, name='wS') # [num_samples * hidden_dim]
    print("ws:"+str(wS.shape))
    zS = C.times_transpose(wS, hidden_vector, name='zS1') + C.times(sample_selector, bias, name='zS2') - C.times_transpose(sample_selector, log_prior, name='zS3')  # [num_samples]

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(target_vector, weights, name='wT') # [1 * hidden_dim]
    zT = C.times_transpose(wT, hidden_vector, name='zT1') + C.times(target_vector, bias, name='zT2') - C.times_transpose(target_vector, log_prior, name='zT3') # [1]


    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    # We don't check whether any of the classes in the random samples coincides with the true label, so it might happen that the true class is counted
    # twice in the normalizing denominator of sampled softmax.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape = (vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)
    return (z, cross_entropy_on_samples, error_on_samples)
Example #12
    def attention_layer(self, context, query):
        q_processed = C.placeholder(shape=(2 * self.hidden_dim, ))
        c_processed = C.placeholder(shape=(2 * self.hidden_dim, ))

        #convert query's sequence axis to static
        qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

        # This part deserves some explanation
        # It is the attention layer
        # In the paper they use a 6 * dim dimensional vector
        # here we split it in three parts because the different parts
        # participate in very different operations
        # so W * [h; u; h.* u] becomes w1 * h + w2 * u + w3 * (h.*u)
        ws1 = C.parameter(shape=(2 * self.hidden_dim, 1),
                          init=C.glorot_uniform())
        ws2 = C.parameter(shape=(2 * self.hidden_dim, 1),
                          init=C.glorot_uniform())
        ws3 = C.parameter(shape=(1, 2 * self.hidden_dim),
                          init=C.glorot_uniform())
        att_bias = C.parameter(shape=(), init=0)

        wh = C.times(c_processed, ws1)
        wu = C.reshape(C.times(qvw, ws2), (-1, ))
        whu = C.reshape(
            C.reduce_sum(c_processed *
                         C.sequence.broadcast_as(qvw * ws3, c_processed),
                         axis=1), (-1, ))
        S = wh + whu + C.sequence.broadcast_as(wu, c_processed) + att_bias
        # mask out values outside of Query, and fill in gaps with -1e+30 as neutral value for both reduce_log_sum_exp and reduce_max
        qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, c_processed)
        S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
        q_attn = C.reshape(C.softmax(S), (-1, 1))
        #q_attn = print_node(q_attn)
        c2q = C.reshape(
            C.reduce_sum(C.sequence.broadcast_as(qvw, q_attn) * q_attn,
                         axis=0), (-1))

        max_col = C.reduce_max(S)
        c_attn = C.sequence.softmax(max_col)

        htilde = C.sequence.reduce_sum(c_processed * c_attn)
        q2c = C.sequence.broadcast_as(htilde, c_processed)
        q2c_out = c_processed * q2c

        att_context = C.splice(c_processed, c2q, c_processed * c2q, q2c_out)

        return C.as_block(att_context, [(c_processed, context),
                                        (q_processed, query)],
                          'attention_layer', 'attention_layer')
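The comment's decomposition of the similarity score can be written per context position $t$ and query position $j$, with $\mathbf{h}_t$ the context encoding, $\mathbf{u}_j$ the query encoding and $\circ$ the element-wise product:

\[
S_{tj} = \mathbf{w}_1^{\top}\mathbf{h}_t + \mathbf{w}_2^{\top}\mathbf{u}_j + \mathbf{w}_3^{\top}(\mathbf{h}_t \circ \mathbf{u}_j) + b
\;=\; \mathbf{w}^{\top}\,[\mathbf{h}_t;\, \mathbf{u}_j;\, \mathbf{h}_t \circ \mathbf{u}_j].
\]

Context-to-query attention softmaxes $S_{tj}$ over $j$ (the q_attn above), while query-to-context attention takes $\max_j S_{tj}$ per context word and then a softmax over the sequence, which is exactly the reduce_max followed by C.sequence.softmax in the code.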
Example #13
def cross_entropy_with_full_softmax(
    hidden_vector,  # Node providing the output of the recurrent layers
    target_vector,  # Node providing the expected labels (as sparse vectors)
    vocab_dim,      # Vocabulary size
    hidden_dim      # Dimension of the hidden vector
    ):
    bias = C.Parameter(shape = (vocab_dim, 1), init = 0)
    weights = C.Parameter(shape = (vocab_dim, hidden_dim), init = C.initializer.glorot_uniform())

    z = C.reshape(C.times_transpose(weights, hidden_vector) + bias, (1,vocab_dim))
    zT = C.times_transpose(z, target_vector)
    ce = C.reduce_log_sum_exp(z) - zT
    zMax = C.reduce_max(z)
    error_on_samples = C.less(zT, zMax)
    return (z, ce, error_on_samples)
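The two lines computing ce are the standard softmax cross entropy written with the log-sum-exp trick; with $z_T$ the score of the target class and $V$ the vocabulary size:

\[
\text{ce} \;=\; \log\sum_{j=1}^{V} e^{z_j} \;-\; z_T \;=\; -\log \operatorname{softmax}(\mathbf{z})_T,
\]

and error_on_samples is 1 exactly when the target score falls strictly below the maximum of $\mathbf{z}$.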
Example #14
def cross_entropy_with_full_softmax(
        hidden_vector,  # Node providing the output of the recurrent layers
        target_vector,  # Node providing the expected labels (as sparse vectors)
        vocab_dim,  # Vocabulary size
        hidden_dim  # Dimension of the hidden vector
):
    bias = C.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.Parameter(shape=(vocab_dim, hidden_dim),
                          init=C.initializer.glorot_uniform())

    z = C.reshape(
        C.times_transpose(weights, hidden_vector) + bias, (1, vocab_dim))
    zT = C.times_transpose(z, target_vector)
    ce = C.reduce_log_sum_exp(z) - zT
    zMax = C.reduce_max(z)
    error_on_samples = C.less(zT, zMax)
    return (z, ce, error_on_samples)
Example #15
 def charcnn(self, x):
     '''
     @x: [I, w1, w2, w3, ...]
     @kernel: [O, I, w1, w2, w3, ...]
     @out: [O, ? depends on stride]
     '''
     conv_out = C.layers.Sequential([
         C.layers.Dropout(self.dropout),
         C.layers.Convolution2D((5, self.char_emb_dim),
                                self.convs,
                                activation=C.relu,
                                init=C.glorot_uniform(),
                                bias=True,
                                init_bias=0,
                                name='charcnn_conv')
     ])(x)
     return C.reduce_max(
         conv_out, axis=1)  # workaround cudnn failure in GlobalMaxPooling
Example #16
    def attention_layer(self, context, query, dimc, dimq, common_dim):
        q_processed = C.placeholder(shape=(dimq, ))
        c_processed = C.placeholder(shape=(dimc, ))

        #convert query's sequence axis to static
        qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

        # so W * [h; u; h.* u] becomes w1 * h + w2 * u + w4 * (h.*u)
        ws1 = C.parameter(shape=(dimc, 1), init=C.glorot_uniform())
        ws2 = C.parameter(shape=(dimq, 1), init=C.glorot_uniform())
        ws4 = C.parameter(shape=(1, common_dim), init=C.glorot_uniform())
        att_bias = C.parameter(shape=(), init=0)

        wh = C.times(c_processed, ws1)  # [#,c][1]
        wu = C.reshape(C.times(qvw, ws2), (-1, ))  # [#][*]
        # qvw*ws4: [#][*,200], whu:[#,c][*]
        whu = C.reshape(C.reduce_sum(
            c_processed[:common_dim] *\
            C.sequence.broadcast_as(qvw[:,:common_dim] * ws4, c_processed), axis=1), (-1,))
        S1 = wh + C.sequence.broadcast_as(wu,
                                          c_processed) + att_bias  # [#,c][*]
        qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, c_processed)
        S1 = C.element_select(qvw_mask_expanded, S1, C.constant(-1e+30))
        q_attn = C.reshape(C.softmax(S1), (-1, 1))  # [#,c][*,1]
        c2q = C.reshape(
            C.reduce_sum(C.sequence.broadcast_as(qvw, q_attn) * q_attn,
                         axis=0), (-1))  # [#,c][200]

        max_col = C.reduce_max(S1)  # [#,c][1] score of the best-matching query word
        c_attn = C.sequence.softmax(max_col)  # [#,c][1] softmax over every word in c

        htilde = C.sequence.reduce_sum(c_processed * c_attn)  # [#][200]
        q2c = C.sequence.broadcast_as(htilde, c_processed)  # [#,c][200]
        q2c_out = c_processed[:common_dim] * q2c[:common_dim]

        # original document, question representation, passage-focus representation, match representation, passage context representation
        att_context_reg = C.splice(c_processed, c2q, q2c_out,
                                   c_processed[:common_dim] * c2q[:common_dim])
        res = C.combine(att_context_reg, C.reshape(q_attn, (-1, )))
        return \
        C.as_block(res,
            [(c_processed, context), (q_processed, query)],
            'attention_layer',
            'attention_layer')
Example #17
def cross_entropy_with_sampled_softmax(
    hidden_vector,          
    label_vector,           
    vocab_dim,              
    hidden_dim,             
    num_samples,            
    sampling_weights,       
    allow_duplicates = False 
    ):

    bias = C.layers.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.layers.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(sampling_weights, num_samples, allow_duplicates)
    sample_selector = sample_selector_sparse

    inclusion_probs = C.random_sample_inclusion_frequency(sampling_weights, num_samples, allow_duplicates)
    log_prior = C.log(inclusion_probs)

    wS = C.times(sample_selector, weights, name='wS')
    zS = C.times_transpose(wS, hidden_vector, name='zS1') + C.times(sample_selector, bias, name='zS2') - C.times_transpose(sample_selector, log_prior, name='zS3')

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(label_vector, weights, name='wT')
    zT = C.times_transpose(wT, hidden_vector, name='zT1') + C.times(label_vector, bias, name='zT2') - C.times_transpose(label_vector, log_prior, name='zT3')

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape=(vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)

    return (z, cross_entropy_on_samples, error_on_samples)
Example #18
def test_ReduceMax(tmpdir):
    data = np.array(
        [[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]],
        dtype=np.float32)
    model = C.reduce_max(data, 0)
    verify_no_input(model, tmpdir, 'ReduceMax_0')
Example #19
def test_ReduceMax(tmpdir):
    data = np.array([[[5,1], [20,2]],[[30,1], [40,2]],[[55,1], [60,2]]], dtype=np.float32)
    model = C.reduce_max(data, 0)
    verify_no_input(model, tmpdir, 'ReduceMax_0')
Example #20
 def inner(a):
     perturbed = a + C.random.gumbel_like(a)
     sampled = C.equal(C.reduce_max(perturbed, axis=axis),
                       perturbed,
                       name=name)  # equivalent to hardmax(perturbed_x)
     return sampled
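This is the Gumbel-max trick: adding independent Gumbel noise to the scores and taking the argmax draws a one-hot sample from Categorical(softmax(a)). A minimal NumPy check with hypothetical logits (no CNTK required):

import numpy as np

rng = np.random.default_rng(0)
logits = np.array([1.0, 2.0, 0.5])
probs = np.exp(logits) / np.exp(logits).sum()

# argmax(logits + Gumbel noise) is distributed as Categorical(softmax(logits)).
counts = np.zeros_like(logits)
for _ in range(20000):
    counts[np.argmax(logits + rng.gumbel(size=logits.shape))] += 1

print(probs)                  # theoretical probabilities
print(counts / counts.sum())  # empirical frequencies, close to probs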
Example #21
 def inner(a):
     return C.equal(C.reduce_max(a, axis=axis), a)
Example #22
def test_ReduceMax(tmpdir, dtype):
    with C.default_options(dtype = dtype):
        data = np.array([[[5,1], [20,2]],[[30,1], [40,2]],[[55,1], [60,2]]], dtype=dtype)
        model = C.reduce_max(data, 0)
        verify_no_input(model, tmpdir, 'ReduceMax_0')
Example #23
def test_reduce_max():
    data = np.array(
        [[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]],
        dtype=np.float32)

    # This test is failing, bug must be fixed:
    # assert_cntk_ngraph_flat_equal(C.reduce_max([1, 0]))
    assert_cntk_ngraph_flat_equal(C.reduce_max([1, 0], 0))
    assert_cntk_ngraph_flat_equal(C.reduce_max([[1., 1.], [3., 5.]], 0))
    assert_cntk_ngraph_flat_equal(C.reduce_max([[1., 1.], [3., 5.]], 1))
    assert_cntk_ngraph_flat_equal(C.reduce_max([[1., 1.], [3., 5.]], -1))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, 0))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, 1))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, 2))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, -1))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, (0, 1)))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, (0, 2)))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, (1, 2)))
    assert_cntk_ngraph_flat_equal(C.reduce_max(data, (-1, -2)))
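For reference, the multi-axis cases at the end reduce over several dimensions at once; in plain NumPy terms (the ngraph comparison aside):

import numpy as np

data = np.array(
    [[[5, 1], [20, 2]], [[30, 1], [40, 2]], [[55, 1], [60, 2]]],
    dtype=np.float32)
print(np.max(data, axis=(0, 1)))  # [60.  2.]
print(np.max(data, axis=(1, 2)))  # [20. 40. 60.]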
Example #24
 def charcnn(self, x):
     conv_out = C.layers.Sequential([
         C.layers.Embedding(self.char_emb_dim),
         C.layers.Dropout(self.dropout),
         C.layers.Convolution2D((5,self.char_emb_dim), self.convs, activation=C.relu, init=C.glorot_uniform(), bias=True, init_bias=0, name='charcnn_conv')])(x)
     return C.reduce_max(conv_out, axis=1) # workaround cudnn failure in GlobalMaxPooling