Example #1
def build_graph(self_attention,
                self_penalty,
                embeded_dim=60,
                h_dim=150,
                d_a=350,
                r=30):
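    # NOTE (assumption): `x` (the input sequence variable), `num_labels` and
    # `create_birnn` are defined in the enclosing scope of the original script.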

    with C.layers.default_options(init=C.xavier()):
        embeded = C.layers.Embedding(embeded_dim)(x)
        embeded = C.layers.Stabilizer()(embeded)

        H = create_birnn(C.layers.GRU(h_dim), C.layers.GRU(h_dim))(embeded)

        if self_attention:
            Ws1 = C.parameter(shape=(d_a, 2 * h_dim), name="Ws1")
            Ws2 = C.parameter(shape=(r, d_a), name="Ws2")
            A = C.softmax(C.times(Ws2, C.tanh(C.times_transpose(Ws1, H))))
            H = C.times(A, H)  # the M in the paper

            if self_penalty:
                I = C.constant(np.eye(r), dtype=np.float32)
                P = C.times_transpose(A, A) - I  # r*r
                p = C.reduce_sum(C.abs(C.element_times(
                    P, P)))  # frobenius norm **2

        y_ = C.layers.Dense(200, activation=C.ops.relu)(H)

        # y_pre = C.layers.Dense(num_labels, activation = None)(y_)
        def selfAtt(x):
            y_pre = C.layers.Dense(num_labels, activation=None)(y_)
            return y_pre

        if self_penalty:
            selfAtt.p = p
        return selfAtt
Example #2
def create_word2vec_cbow_model(word_one_hot, context_one_hots, negative_one_hots):
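	# NOTE (assumption): `Embedding` is presumably cntk.layers.Embedding, `G` is a config
	# module holding hyper-parameters (G.embedding_dimension, G.negative), and
	# `context_size` comes from the enclosing script.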
	# shared_embedding_layer = Embedding(G.embedding_dimension, uniform(scale=1.0/2.0/G.embedding_dimension))
	shared_embedding_layer = Embedding(G.embedding_dimension)

	word_embedding = shared_embedding_layer(word_one_hot)
	context_embeddings = [shared_embedding_layer(x) for x in context_one_hots]
	negative_embeddings = [shared_embedding_layer(x) for x in negative_one_hots]

	print(word_embedding.shape)
	word_embedding_reshaped = C.reshape(word_embedding, shape=(1, G.embedding_dimension))
	print(word_embedding_reshaped.shape)

	context_embeddings_all = C.reshape(C.splice(*context_embeddings), shape=(context_size, G.embedding_dimension))
	negative_embeddings_all = C.reshape(C.splice(*negative_embeddings), shape=(G.negative, G.embedding_dimension))
	print(context_embeddings_all.shape)
	print(negative_embeddings_all.shape)
	cbow = C.reshape(C.reduce_mean(context_embeddings_all, 0), shape=(G.embedding_dimension))
	print(cbow.shape)

	# word_context_product = C.times_transpose(word_embedding_reshaped, cbow)
	word_context_product = C.times_transpose(word_embedding, cbow)
	print(word_context_product.shape)
	negative_context_product = C.reshape(C.times_transpose(negative_embeddings_all, cbow), shape=(G.negative))
	print(negative_context_product.shape)

	word_negative_context_product = C.splice(word_context_product, negative_context_product)
	print(word_negative_context_product.shape)
	# return model and shared embedding layer
	return word_negative_context_product, shared_embedding_layer
Example #3
def cross_entropy_with_sampled_softmax(
    hidden_vector,  # Node providing the output of the recurrent layers
    target_vector,  # Node providing the expected labels (as sparse vectors)
    vocab_dim,  # Vocabulary size
    hidden_dim,  # Dimension of the hidden vector
    num_samples,  # Number of samples to use for sampled softmax
    sampling_weights,  # Node providing weights to be used for the weighted sampling
    allow_duplicates=False  # Boolean flag to control whether to use sampling with replacement (allow_duplicates == True) or without replacement.
):
    bias = C.layers.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.layers.Parameter(shape=(vocab_dim, hidden_dim),
                                 init=C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(
        sampling_weights, num_samples,
        allow_duplicates)  # sparse matrix [num_samples * vocab_size]
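    # NOTE: `use_sparse` is a module-level flag assumed to be defined in the original
    # script (True keeps the sparse sample selector, False densifies it for debugging).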
    if use_sparse:
        sample_selector = sample_selector_sparse
    else:
        # Note: Sampled softmax with dense data is only supported for debugging purposes.
        # It might easily run into memory issues as the matrix 'I' below might be quite large.
        # In case we want a dense representation of all data we have to convert the sample selector.
        I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
        sample_selector = C.times(sample_selector_sparse, I)

    inclusion_probs = C.random_sample_inclusion_frequency(
        sampling_weights, num_samples,
        allow_duplicates)  # dense row [1 * vocab_size]
    log_prior = C.log(inclusion_probs)  # dense row [1 * vocab_dim]

    print("hidden_vector: " + str(hidden_vector.shape))
    wS = C.times(sample_selector, weights,
                 name='wS')  # [num_samples * hidden_dim]
    print("ws:" + str(wS.shape))
    zS = C.times_transpose(wS, hidden_vector, name='zS1') + C.times(
        sample_selector, bias, name='zS2') - C.times_transpose(
            sample_selector, log_prior, name='zS3')  # [num_samples]

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(target_vector, weights, name='wT')  # [1 * hidden_dim]
    zT = C.times_transpose(wT, hidden_vector, name='zT1') + C.times(
        target_vector, bias, name='zT2') - C.times_transpose(
            target_vector, log_prior, name='zT3')  # [1]

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    # We don't check whether any of the classes in the random samples coincides with the true label, so it might happen that the true class is counted
    # twice in the normalizing denominator of sampled softmax.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape=(vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)
    return (z, cross_entropy_on_samples, error_on_samples)
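A minimal wiring sketch for the criterion above (all names and dimensions below are placeholders, and the module-level flag use_sparse that the function reads must exist where the function is defined):

import numpy as np
import cntk as C

use_sparse = True                      # flag referenced inside the function body
vocab_dim, hidden_dim, num_samples = 10000, 256, 32
# uniform sampling weights, shaped (1, vocab_dim) as C.random_sample expects
sampling_weights = C.reshape(C.Constant(np.ones(vocab_dim, dtype=np.float32)), (1, vocab_dim))
hidden = C.sequence.input_variable(hidden_dim)                  # recurrent-layer output
labels = C.sequence.input_variable(vocab_dim, is_sparse=True)   # one-hot targets
z, ce, err = cross_entropy_with_sampled_softmax(
    hidden, labels, vocab_dim, hidden_dim, num_samples, sampling_weights)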
Example #4
def hierarchical_softmax_layer(input_var, label_index, label_dim, label_classes=None):
    '''
    A two-layer hierarchical softmax function:

    Args:
        input_var: Variable with shape: [#,*](dim_x)
        label_index: index of label's category:  [#,*](1)
        label_dim: number of the label categories
        label_classes: number of classes of the label categories
    Returns:
        output_prob: the probability of the given label [#,*](1)
        class_probs: the probability of all the label classes [#,*](label_classes)
        all_probs: a list with the probabilities of all the labels, grouped by class
    '''
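    # NOTE (assumption): the bare `parameter`, `times` and `softmax` calls below rely on
    # `from cntk.ops import parameter, softmax, times` in the original module.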
    input_dim = input_var.shape[0]

    if not label_classes:
        label_classes = int(np.ceil(np.sqrt(float(label_dim))))

    n_outputs_per_class = int(np.ceil(label_dim / label_classes))

    target_class = C.floor((label_index + 0.5) / n_outputs_per_class)
    target_output_in_class = C.round(label_index - target_class * n_outputs_per_class)

    w1 = parameter(shape=(input_dim, label_classes), init=C.glorot_normal(), name='hsoftmax_w1')
    b1 = parameter(shape=(label_classes), init=C.glorot_normal(), name='hsoftmax_b1')
    w2s = parameter(shape=(label_classes, input_dim, n_outputs_per_class,), init=C.glorot_normal(), name='hsoftmax_w2s')
    b2s = parameter(shape=(label_classes, n_outputs_per_class,), init=C.glorot_normal(), name='hsoftmax_b2s')

    class_probs = softmax(b1 + times(input_var, w1))

    # TODO: fix the bug in backprop for sparse, and use sparse embedding to accelerate
    target_class_one_hot = C.one_hot(target_class, num_classes=label_classes, sparse_output=False)
    w2 = C.reshape(C.times(target_class_one_hot, w2s, output_rank=2), [input_dim, -1])
    b2 = C.reshape(times(target_class_one_hot, b2s, output_rank=1), [-1])
    probs_in_class = softmax(b2 + times(input_var, w2))

    prob_in_class = C.times_transpose(C.one_hot(target_output_in_class, num_classes=n_outputs_per_class, sparse_output=False), probs_in_class)
    class_prob = C.times_transpose(C.one_hot(target_class, num_classes=label_classes, sparse_output=False), class_probs)
    output_prob = prob_in_class * class_prob

    # this is for calculating all the outputs' probabilities
    all_probs = []
    for i in range(label_classes):
        ci = C.constant(i)
        ci_one_hot = C.one_hot(ci, num_classes=label_classes, sparse_output=False)
        w2a = C.times(ci_one_hot, w2s, output_rank=2)
        b2a = C.times(ci_one_hot, b2s, output_rank=1)
        probs_in_classa = C.softmax(b2a + times(input_var, w2a))
        class_proba = C.times_transpose(ci_one_hot, class_probs)
        output_proba = probs_in_classa * class_proba
        all_probs.append(output_proba)

    return output_prob, class_probs, all_probs
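A minimal usage sketch for the layer above (dimensions are hypothetical; the bare parameter/times/softmax names are the assumed cntk.ops imports noted in the code):

import cntk as C
from cntk.ops import parameter, softmax, times  # names the layer body relies on

input_dim, label_dim = 100, 1000
features = C.input_variable(input_dim)      # [#](input_dim) feature vector
label_index = C.input_variable(1)           # [#](1) index of the true label
output_prob, class_probs, all_probs = hierarchical_softmax_layer(
    features, label_index, label_dim)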
Example #5
def cross_entropy_with_sampled_softmax(
    hidden_vector,           # Node providing the output of the recurrent layers
    target_vector,           # Node providing the expected labels (as sparse vectors)
    vocab_dim,               # Vocabulary size
    hidden_dim,              # Dimension of the hidden vector
    num_samples,             # Number of samples to use for sampled softmax
    sampling_weights,        # Node providing weights to be used for the weighted sampling
    allow_duplicates = False # Boolean flag to control whether to use sampling with replacement (allow_duplicates == True) or without replacement.
    ):
    bias = C.Parameter(shape = (vocab_dim, 1), init = 0)
    weights = C.Parameter(shape = (vocab_dim, hidden_dim), init = C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(sampling_weights, num_samples, allow_duplicates) # sparse matrix [num_samples * vocab_size]
    if use_sparse:
        sample_selector = sample_selector_sparse
    else:
        # Note: Sampled softmax with dense data is only supported for debugging purposes.
        # It might easily run into memory issues as the matrix 'I' below might be quite large.
        # In case we want a dense representation of all data we have to convert the sample selector.
        I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
        sample_selector = C.times(sample_selector_sparse, I)

    inclusion_probs = C.random_sample_inclusion_frequency(sampling_weights, num_samples, allow_duplicates) # dense row [1 * vocab_size]
    log_prior = C.log(inclusion_probs) # dense row [1 * vocab_dim]


    print("hidden_vector: "+str(hidden_vector.shape))
    wS = C.times(sample_selector, weights, name='wS') # [num_samples * hidden_dim]
    print("ws:"+str(wS.shape))
    zS = C.times_transpose(wS, hidden_vector, name='zS1') + C.times(sample_selector, bias, name='zS2') - C.times_transpose (sample_selector, log_prior, name='zS3')# [num_samples]

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(target_vector, weights, name='wT') # [1 * hidden_dim]
    zT = C.times_transpose(wT, hidden_vector, name='zT1') + C.times(target_vector, bias, name='zT2') - C.times_transpose(target_vector, log_prior, name='zT3') # [1]


    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    # We don't check whether any of the classes in the random samples coincides with the true label, so it might happen that the true class is counted
    # twice in the normalizing denominator of sampled softmax.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape = (vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)
    return (z, cross_entropy_on_samples, error_on_samples)
Example #6
def cross_entropy_with_full_softmax(
    hidden_vector,  # Node providing the output of the recurrent layers
    target_vector,  # Node providing the expected labels (as sparse vectors)
    vocab_dim,      # Vocabulary size
    hidden_dim      # Dimension of the hidden vector
    ):
    bias = C.Parameter(shape = (vocab_dim, 1), init = 0)
    weights = C.Parameter(shape = (vocab_dim, hidden_dim), init = C.initializer.glorot_uniform())

    z = C.reshape(C.times_transpose(weights, hidden_vector) + bias, (1,vocab_dim))
    zT = C.times_transpose(z, target_vector)
    ce = C.reduce_log_sum_exp(z) - zT
    zMax = C.reduce_max(z)
    error_on_samples = C.less(zT, zMax)
    return (z, ce, error_on_samples)
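A minimal wiring sketch for the full-softmax criterion (placeholder dimensions; hidden_vector would normally be the output of the recurrent layers):

import cntk as C

vocab_dim, hidden_dim = 5000, 256
hidden = C.sequence.input_variable(hidden_dim)
target = C.sequence.input_variable(vocab_dim, is_sparse=True)
z, ce, errs = cross_entropy_with_full_softmax(hidden, target, vocab_dim, hidden_dim)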
Example #7
def triangular_matrix_seq(mode: int = 1):
    X = C.placeholder(1)
    ones = C.ones_like(X[0])
    perm_1 = C.layers.Recurrence(C.plus, return_full_state=True)(ones)
    perm_2 = C.layers.Recurrence(C.plus,
                                 go_backwards=True,
                                 return_full_state=True)(ones)

    arr_1 = C.sequence.unpack(perm_1, 0, True)
    arr_2 = C.sequence.unpack(perm_2, 0, True)

    mat = C.times_transpose(arr_1, arr_2)
    mat_c = arr_1 * arr_2

    diagonal_mat = mat - mat_c

    final_mat = diagonal_mat
    if mode == 0:
        final_mat = C.equal(final_mat, 0)
    elif mode == 1:
        final_mat = C.less_equal(final_mat, 0)
    elif mode == 2:
        final_mat = C.less(final_mat, 0)
    elif mode == -1:
        final_mat = C.greater_equal(final_mat, 0)
    elif mode == -2:
        final_mat = C.greater(final_mat, 0)

    result = C.as_block(final_mat, [(X, X)], 'triangular_matrix')

    return C.stop_gradient(result)
Example #8
def cross_entropy_with_full_softmax(
    output,  # Node providing the output of the lstm layers
    target_vector,  # Node providing the expected labels
    sv_dim, 
    vocab_dim
    ):
    sv_vector = output.outputs[3]
    z = output.outputs[0]
    zT = C.times_transpose(z, target_vector)
    # cross entropy loss with softmax function
    ce = - C.log(zT)
    # the error 
    zMax = C.reduce_max(z)
    error = C.less(zT, zMax)
    ce = sequence.reduce_sum(ce)
    # discourages the network from turning more than one gate off in a single time step.
    sumc = C.abs(C.sequence.slice(sv_vector, 1, 0) - C.sequence.slice(sv_vector, 0, -1))
    sumc = sequence.reduce_sum(0.0001 * C.pow(100.0, sumc))
    #ce += sumc
    # penalise generated utterances that failed to render all the required slots
    sumc += C.abs(C.sequence.last(sv_vector))
    sumc += C.abs(C.sequence.first(sv_vector) - output.outputs[4])
    sumc = C.reduce_sum(sumc)
    ce = C.reduce_sum(ce)
    ce += sumc
    return ce, error
Example #9
    def createDecoderNetwork(self, networkHiddenSrc, srcLength, trgLength):
        timeZeroHidden = C.slice(networkHiddenSrc, 0, 0, 1)
        srcSentEmb = C.slice(timeZeroHidden, -1, Config.SrcHiddenSize,
                             Config.SrcHiddenSize * 2)
        networkHiddenTrg = {}
        inputTrg = C.reshape(self.inputMatrixTrg,
                             shape=(Config.TrgMaxLength, Config.BatchSize,
                                    Config.TrgVocabSize))
        attProbAll = []
        tce = 0
        for i in range(0, trgLength, 1):

            preTrgEmb = self.initTrgEmb if i == 0 else self.EmbTrg(inputTrg[i - 1])

            if (i == 0):
                networkHiddenTrg[i] = self.createDecoderInitNetwork(srcSentEmb)
            else:
                (networkHiddenTrg[i], attProb) = self.createDecoderRNNNetwork(
                    networkHiddenSrc, preTrgEmb, networkHiddenTrg[i - 1],
                    srcLength)
                attProbAll = attProb if i == 1 else C.splice(
                    attProbAll, attProb, axis=0)

            preSoftmax = self.createReadOutNetwork(networkHiddenTrg[i],
                                                   preTrgEmb)
            ce = C.cross_entropy_with_softmax(preSoftmax, inputTrg[i], 2)
            ce = C.reshape(ce, shape=(1, Config.BatchSize))
            tce += C.times_transpose(ce, self.maskMatrixTrg[i])

        return tce
Example #10
    def attention(query, key, value):
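        # NOTE: `obey_sequence_order` and `max_seq_len` are assumed to be captured from
        # an enclosing factory function (this is an inner closure).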
        dk = C.reduce_sum(C.ones_like(query))  # cannot use sequence.last, will conflict with recurrence
        # dk: [#, *] [1, ] and value = int(dim_of_query)

        unpacked_key = C.sequence.unpack(key, padding_value=0, no_mask_output=True)  # [#] [-3, key_dim]
        unpacked_value = C.sequence.unpack(value, padding_value=0, no_mask_output=True)  # [#] [-3, value_dim]

        broadcasted_key = C.sequence.broadcast_as(unpacked_key, query)  # [#, *] [-3, key_dim]
        scaled = C.times_transpose(query, broadcasted_key) / dk
        # [#, *] [q_dim] @ [#, *] [key_dim, -3], assert q_dim == key_dim
        # scaled: [#, *] [-3, ] => for every key seq element, there is a corresponding score

        # mask out invalid temporal connections when obey_sequence_order is set
        if obey_sequence_order and max_seq_len:
            unpacked_scaled, scaled_mask = C.sequence.unpack(scaled, padding_value=0).outputs
            # unpacked_scaled: [#] [-3, -3]  <== matrix will be top right diagonally zero-ed
            # scaled_mask: [#] [-3,]

            minus_inf = C.constant(-1e+30)
            valid_connections = C.Constant(np.tril(np.ones((max_seq_len, max_seq_len)), k=0))  # [] [max_seq, max_seq]
            valid_connections = C.reconcile_dynamic_axes(valid_connections, unpacked_scaled)  # [#] [max_seq, max_seq]
            valid_connections = C.crop_manual(valid_connections, unpacked_scaled, 0, 0)  # [#] [-3, -3]
            unpacked_scaled = C.element_select(valid_connections, unpacked_scaled, minus_inf)  # [#] [-3, -3]
            scaled = C.to_sequence_like(unpacked_scaled, query)  # [#, *] [-3]

        elif obey_sequence_order and not max_seq_len:
            raise ValueError("max_seq_len must be defined when obey_sequence_order is True")

        attended = C.times(C.softmax(scaled, axis=-1), C.sequence.broadcast_as(unpacked_value, query))  # [#, *] [value_dim,]
        return attended
Example #11
def cross_entropy_with_full_softmax(
        hidden_vector,  # Node providing the output of the recurrent layers
        target_vector,  # Node providing the expected labels (as sparse vectors)
        vocab_dim,  # Vocabulary size
        hidden_dim  # Dimension of the hidden vector
):
    bias = C.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.Parameter(shape=(vocab_dim, hidden_dim),
                          init=C.initializer.glorot_uniform())

    z = C.reshape(
        C.times_transpose(weights, hidden_vector) + bias, (1, vocab_dim))
    zT = C.times_transpose(z, target_vector)
    ce = C.reduce_log_sum_exp(z) - zT
    zMax = C.reduce_max(z)
    error_on_samples = C.less(zT, zMax)
    return (z, ce, error_on_samples)
Example #12
    def model(seq_image, decoded):
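        # NOTE (assumption): `n` (attention patch size), `image_height`, `dense`,
        # `attention_parameters` and `Cx` (the cntkx extension package) come from the
        # enclosing scope of the original project.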
        params = dense(decoded)
        g_x, g_y, sigma2, delta, gamma = attention_parameters(params)

        i = C.Constant(np.arange(n) + 1)  # col of patch
        j = C.Constant(np.arange(n) + 1)  # row of patch
        mu_x = g_x + (i - n / 2 - 0.5) * delta
        mu_y = g_y + (j - n / 2 - 0.5) * delta
        mu_x = C.expand_dims(mu_x, axis=-1)
        mu_y = C.expand_dims(mu_y, axis=-1)
        # mu_x: [#, *] [n, 1]
        # mu_y: [#, *] [n, 1]

        image = C.sequence.unpack(seq_image,
                                  padding_value=0,
                                  no_mask_output=True)
        # image: [#] [*image_width, filters, image_height]

        width_pos = Cx.sequence.position(seq_image)
        # width_pos: [#, *] [1]

        width_pos_unpacked = C.sequence.unpack(width_pos,
                                               padding_value=999_999,
                                               no_mask_output=True)
        # width_pos: [#] [*image_width, 1]

        a = C.sequence.broadcast_as(C.swapaxes(width_pos_unpacked), mu_x)
        # a: [#, *] [1, *image_width]
        # x pos index of image (width)

        b = C.Constant(np.arange(image_height).reshape((1, -1)))
        # b: [] [1, image_height]
        # y pos index of image (height)

        # calculate the which portion of the image that is attended by the gaussian filter
        f_xi = C.exp(-0.5 * C.square(a - mu_x) / sigma2)
        f_yj = C.exp(-0.5 * C.square(b - mu_y) / sigma2)
        # f_xi: [#, *] [n, *image_width]
        # f_yj: [#, *] [n, image_height]

        z_x = C.reduce_sum(f_xi, axis=1)
        z_y = C.reduce_sum(f_yj, axis=1)
        # z_x: [#, *] [n]
        # z_y: [#, *] [n]

        f_xi = f_xi / z_x
        f_yj = f_yj / z_y
        # f_xi: [#, *] [n, *image_width]
        # f_yj: [#, *] [n, image_height]

        # combine filters from x and y
        image_broadcasted = C.sequence.broadcast_as(image, f_yj)
        attended = gamma * C.times(
            f_xi, C.times_transpose(image_broadcasted, f_yj), output_rank=2)
        # attended: [#, *] [n, filters, n]
        attended = C.swapaxes(attended)
        # attended: [#, *] [filters, n (x) , n (y)]
        return attended
Example #13
    def build_trainer(self):
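        # NOTE (assumption): `learning_rate_schedule`, `UnitType`, `momentum_schedule` and
        # `adam` come from cntk.learners, and `TensorBoardProgressWriter` from cntk.logging.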

        # Set the learning rate, and the momentum parameters for the Adam optimizer.
        lr = learning_rate_schedule(self.lr, UnitType.minibatch)
        beta1 = momentum_schedule(0.9)
        beta2 = momentum_schedule(0.99)

        # Calculate the losses.
        loss_on_v = cntk.squared_error(self.R, self.v)
        pi_a_s = cntk.log(cntk.times_transpose(self.pi, self.action))

        loss_on_pi = cntk.variables.Constant(-1) * (cntk.plus(
            cntk.times(pi_a_s, cntk.minus(self.R, self.v_calc)),
            0.01 * cntk.times_transpose(self.pi, cntk.log(self.pi))))
        #loss_on_pi = cntk.times(pi_a_s, cntk.minus(self.R, self.v_calc))

        self.tensorboard_v_writer = TensorBoardProgressWriter(
            freq=10, log_dir="tensorboard_v_logs", model=self.v)
        self.tensorboard_pi_writer = TensorBoardProgressWriter(
            freq=10, log_dir="tensorboard_pi_logs", model=self.pi)

        # tensorboard --logdir=tensorboard_pi_logs  http://localhost:6006/
        # tensorboard --logdir=tensorboard_v_logs  http://localhost:6006/

        # Create the trainers.
        self.trainer_v = cntk.Trainer(self.v, (loss_on_v), [
            adam(self.pms_v,
                 lr,
                 beta1,
                 variance_momentum=beta2,
                 gradient_clipping_threshold_per_sample=2,
                 l2_regularization_weight=0.01)
        ], self.tensorboard_v_writer)
        self.trainer_pi = cntk.Trainer(self.pi, (loss_on_pi), [
            adam(self.pms_pi,
                 lr,
                 beta1,
                 variance_momentum=beta2,
                 gradient_clipping_threshold_per_sample=2,
                 l2_regularization_weight=0.01)
        ], self.tensorboard_pi_writer)
Example #14
def cross_entropy_with_sampled_softmax(
    hidden_vector,          
    label_vector,           
    vocab_dim,              
    hidden_dim,             
    num_samples,            
    sampling_weights,       
    allow_duplicates = False 
    ):

	bias = C.layers.Parameter(shape = (vocab_dim, 1), init = 0)
	weights = C.layers.Parameter(shape = (vocab_dim, hidden_dim), init = C.initializer.glorot_uniform())

	sample_selector_sparse = C.random_sample(sampling_weights, num_samples, allow_duplicates)
	sample_selector = sample_selector_sparse

	inclusion_probs = C.random_sample_inclusion_frequency(sampling_weights, num_samples, allow_duplicates)
	log_prior = C.log(inclusion_probs)

	wS = C.times(sample_selector, weights, name='wS')
	zS = C.times_transpose(wS, hidden_vector, name='zS1') + C.times(sample_selector, bias, name='zS2') - C.times_transpose (sample_selector, log_prior, name='zS3')

	# Getting the weight vector for the true label. Dimension hidden_dim
	wT = C.times(label_vector, weights, name='wT')
	zT = C.times_transpose(wT, hidden_vector, name='zT1') + C.times(label_vector, bias, name='zT2') - C.times_transpose(label_vector, log_prior, name='zT3')

	zSReduced = C.reduce_log_sum_exp(zS)

	# Compute the cross entropy that is used for training.
	cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

	# For applying the model we also output a node providing the input for the full softmax
	z = C.times_transpose(weights, hidden_vector) + bias
	z = C.reshape(z, shape = (vocab_dim))

	zSMax = C.reduce_max(zS)
	error_on_samples = C.less(zT, zSMax)

	return (z, cross_entropy_on_samples, error_on_samples)
Example #15
def test_op_times_sparse_grad(device_id, precision):
    dt_precision = PRECISION_TO_TYPE[precision]

    from cntk import times, times_transpose, parameter, reshape, Value, sequence
    dim = 5
    num_sequences = 2
    seq = [i for i in range(dim)]
    identity = np.identity(dim, dtype=dt_precision)
    input_data = Value.one_hot([seq]*num_sequences, dim, dtype=dt_precision)
    input_var  = sequence.input_variable(shape=(dim), is_sparse=True, needs_gradient=False, dtype=dt_precision)
    e = parameter(shape = (dim, dim), init = identity, dtype=dt_precision)
    z = reshape(times_transpose(e, times(input_var, e)), dim)
    e_grad = z.grad({input_var : input_data}, [e])
    
    assert np.allclose(e_grad, np.ones((dim,dim))*4)
Example #16
def test_op_times_sparse_grad(device_id, precision):
    dt_precision = PRECISION_TO_TYPE[precision]

    from cntk import times, times_transpose, parameter, reshape, one_hot
    dim = 5
    num_sequences = 2
    seq = [i for i in range(dim)]
    identity = np.identity(dim, dtype=np.float32)
    input_data = one_hot([seq] * num_sequences, dim)
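    # NOTE: `I` is presumably an alias for the input-variable factory in the original
    # test module (e.g. `from cntk import input_variable as I`).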
    input_var = I(shape=(dim), is_sparse=True, needs_gradient=False)
    e = parameter(shape=(dim, dim), init=identity)
    z = reshape(times_transpose(e, times(input_var, e)), dim)
    e_grad = z.grad({input_var: input_data}, [e])

    assert np.allclose(e_grad, np.ones((dim, dim)) * 4)
Example #17
def test_times_transpose_sequence_param(device_id, precision):
    dt_precision = PRECISION_TO_TYPE[precision]

    from cntk import times_transpose, parameter, sequence, Value
    dim = 5
    num_sequences = 2
    seq = [i for i in range(dim)]
    identity = np.identity(dim, dtype=dt_precision)
    input_data = Value.one_hot([seq] * num_sequences, dim, dtype=dt_precision)
    input_var = sequence.input_variable(shape=(dim),
                                        needs_gradient=True,
                                        dtype=dt_precision)
    e = parameter(shape=(dim, ), init=1, dtype=dt_precision)
    z = times_transpose(e, input_var)
    e_grad = z.grad({input_var: input_data}, [e, input_var])
Example #18
    def dot_attention(self, inputs, memory, dim):
        '''
        @inputs: [#,c][d] the sequence that needs attention
        @memory (key): [#,q][d] the sequence used to compute the similarity (weights)
        @value: [#,q][d] the sequence used for the weighted sum
        @output: [#,c][d] the attention vector
        '''
        input_ph = C.placeholder()
        input_mem = C.placeholder()
        with C.layers.default_options(
                bias=False,
                activation=C.relu):  # all the projections have no bias
            attn_proj_enc = C.layers.Dense(dim,
                                           init=glorot_uniform(),
                                           input_rank=1,
                                           name="Wqu")
            attn_proj_dec = C.layers.Dense(dim,
                                           init=glorot_uniform(),
                                           input_rank=1)

        inputs_ = attn_proj_enc(input_ph)  # [#,c][d]
        memory_ = attn_proj_dec(input_mem)  # [#,q][d]
        unpack_memory, mem_mask = C.sequence.unpack(
            memory_, 0).outputs  # [#][*=q, d], [#][*=q]
        unpack_memory_expand = C.sequence.broadcast_as(unpack_memory,
                                                       inputs_)  # [#,c][*=q,d]

        matrix = C.times_transpose(inputs_, unpack_memory_expand) / (
            dim**0.5)  # [#,c][*=q]
        mem_mask_expand = C.sequence.broadcast_as(mem_mask,
                                                  inputs_)  # [#,c][*=q]
        matrix = C.element_select(mem_mask_expand, matrix,
                                  C.constant(-1e+30))  # [#,c][*=q]
        logits = C.reshape(C.softmax(matrix), (-1, 1))  # [#,c][*=q,1]
        # [#,c][*=q, d]
        memory_expand = C.sequence.broadcast_as(
            C.sequence.unpack(input_mem, 0, no_mask_output=True), input_ph)
        weighted_att = C.reshape(C.reduce_sum(logits * memory_expand, axis=0),
                                 (-1, ))  # [#,c][d]

        return C.as_block(C.combine(weighted_att,
                                    logits), [(input_ph, inputs),
                                              (input_mem, memory)],
                          'dot attention', 'dot attention')
Example #19
def hierarchical_softmax_layer(input_var,
                               label_index,
                               label_dim,
                               label_classes=None):
    '''
    A two-layer hierarchical softmax function:

    Args:
        input_var: Variable with shape: [#,*](dim_x)
        label_index: index of label's category:  [#,*](1)
        label_dim: number of the label categories
        label_classes: number of classes of the label categories
    Returns:
        output_prob: the probability of the given label [#,*](1)
        class_probs: the probability of all the label classes [#,*](label_classes)
        all_probs: a list with the probabilities of all the labels, grouped by class
    '''
    input_dim = input_var.shape[0]

    if not label_classes:
        label_classes = int(np.ceil(np.sqrt(float(label_dim))))

    n_outputs_per_class = int(np.ceil(label_dim / label_classes))

    target_class = C.floor((label_index + 0.5) / n_outputs_per_class)
    target_output_in_class = C.round(label_index -
                                     target_class * n_outputs_per_class)

    w1 = parameter(shape=(input_dim, label_classes),
                   init=C.glorot_normal(),
                   name='hsoftmax_w1')
    b1 = parameter(shape=(label_classes),
                   init=C.glorot_normal(),
                   name='hsoftmax_b1')
    w2s = parameter(shape=(
        label_classes,
        input_dim,
        n_outputs_per_class,
    ),
                    init=C.glorot_normal(),
                    name='hsoftmax_w2s')
    b2s = parameter(shape=(
        label_classes,
        n_outputs_per_class,
    ),
                    init=C.glorot_normal(),
                    name='hsoftmax_b2s')

    class_probs = softmax(b1 + times(input_var, w1))

    # TODO: fix the bug in backprop for sparse, and use sparse embedding to accelerate
    target_class_one_hot = C.one_hot(target_class,
                                     num_classes=label_classes,
                                     sparse_output=False)
    w2 = C.reshape(C.times(target_class_one_hot, w2s, output_rank=2),
                   [input_dim, -1])
    b2 = C.reshape(times(target_class_one_hot, b2s, output_rank=1), [-1])
    probs_in_class = softmax(b2 + times(input_var, w2))

    prob_in_class = C.times_transpose(
        C.one_hot(target_output_in_class,
                  num_classes=n_outputs_per_class,
                  sparse_output=False), probs_in_class)
    class_prob = C.times_transpose(
        C.one_hot(target_class, num_classes=label_classes,
                  sparse_output=False), class_probs)
    output_prob = prob_in_class * class_prob

    # this is for calculating all the outputs' probabilities
    all_probs = []
    for i in range(label_classes):
        ci = C.constant(i)
        ci_one_hot = C.one_hot(ci,
                               num_classes=label_classes,
                               sparse_output=False)
        w2a = C.times(ci_one_hot, w2s, output_rank=2)
        b2a = C.times(ci_one_hot, b2s, output_rank=1)
        probs_in_classa = C.softmax(b2a + times(input_var, w2a))
        class_proba = C.times_transpose(ci_one_hot, class_probs)
        output_proba = probs_in_classa * class_proba
        all_probs.append(output_proba)

    return output_prob, class_probs, all_probs
Example #20
def self_attention_layer(in_dims: int,
                         out_dims: int,
                         name='self_attention',
                         as_block: bool = False,
                         k_ph: bool = False,
                         v_ph: bool = False,
                         mask_opt: bool = False) -> C.Function:
    sq_sa_dims = C.Constant(C.sqrt(out_dims).eval(), name='sq_dims')

    X = C.placeholder(
        in_dims, (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
        name=name + '_ph')

    if k_ph is False and v_ph is False:
        q = C.layers.Dense(out_dims, name=name + '_q')(
            X
        )  # W_Q = C.parameter((in_dims, out_dims), init=init, name=name+'_q')
        k = C.layers.Dense(out_dims, name=name + '_k')(
            X
        )  # W_K = C.parameter((in_dims, out_dims), init=init, name=name+'_k')
        v = C.layers.Dense(out_dims, name=name + '_v')(
            X
        )  # W_V = C.parameter((in_dims, out_dims), init=init, name=name+'_v')
    elif k_ph is True and v_ph is True:
        q = C.layers.Dense(out_dims, name=name + '_q')(X)
        k = C.placeholder(out_dims,
                          (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                          name=name + '_k_ph')
        v = C.placeholder(out_dims,
                          (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                          name=name + '_v_ph')
    else:
        raise Exception(f'k_ph:{k_ph}, v_ph:{v_ph}')

    q_ = C.sequence.unpack(q, 0, True, name=name + '_unpack_q')
    k_ = C.sequence.unpack(k, 0, True, name=name + '_unpack_k')
    v_ = C.sequence.unpack(v, 0, True, name=name + '_unpack_v')

    scores = C.times_transpose(q_, k_, name=name + '_score_matrix')
    scaled = scores / sq_sa_dims  # div_k

    if mask_opt:
        mask = triangular_matrix_seq(2)(X)
        inf_mask = -np.inf * (mask - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')
        scaled = C.element_min(scaled, inf_mask)

    softmax = C.softmax(scaled, name=name + '_softmax')
    attention = C.times(softmax, v_, name=name + '_attention')

    result = C.to_sequence_like(attention, X)

    if as_block:
        if k_ph is False and v_ph is False:
            return C.as_block(result, [(X, X)], 'self_attention',
                              'self_attention_')
        elif k_ph is True and v_ph is True:
            return C.as_block(result, [(X, X), (k, k), (v, v)],
                              'self_attention', 'self_attention_')
        else:
            raise Exception(f'k_ph:{k_ph} v_ph:{v_ph}')
    else:
        return result
Example #21
    def attention_layer(self, context, query, dim):
        input_ph = C.placeholder(shape=(dim, ))
        input_mem = C.placeholder(shape=(dim, ))
        with C.layers.default_options(bias=False, activation=C.relu):
            attn_proj_enc = C.layers.Dense(self.hidden_dim,
                                           init=glorot_uniform(),
                                           input_rank=1,
                                           name="Wqu")
            attn_proj_dec = C.layers.Dense(self.hidden_dim,
                                           init=glorot_uniform(),
                                           input_rank=1)

        inputs_ = attn_proj_enc(input_ph)  # [#,c][d]
        memory_ = attn_proj_dec(input_mem)  # [#,q][d]

        cln_mem_ph = C.placeholder()  # [#,q][?=d]
        cln_inp_ph = C.placeholder()  # [#,c][?=d]
        unpack_inputs, inputs_mask = C.sequence.unpack(
            cln_inp_ph, 0).outputs  # [#][*=c,d] [#][*=c]
        expand_inputs = C.sequence.broadcast_as(unpack_inputs,
                                                cln_mem_ph)  # [#,q][*=c,d]
        matrix = C.reshape(
            C.times_transpose(cln_mem_ph, expand_inputs) /
            (self.hidden_dim**0.5), (-1, ))  # [#,q][*=c]
        matrix = C.element_select(
            C.sequence.broadcast_as(inputs_mask, cln_mem_ph), matrix,
            C.constant(-1e30))
        logits = C.softmax(matrix, axis=0, name='level 1 weight')  # [#,q][*=c]
        trans_expand_inputs = C.transpose(expand_inputs,
                                          [1, 0])  # [#,q][d,*=c]
        q_over_c = C.reshape(
            C.reduce_sum(logits * trans_expand_inputs, axis=1),
            (-1, )) / (self.hidden_dim**0.5)  # [#,q][d]
        new_q = C.splice(cln_mem_ph, q_over_c)  # [#,q][2*d]
        # over
        unpack_matrix, matrix_mask = C.sequence.unpack(
            matrix, 0).outputs  # [#][*=q,*=c] [#][*=q]
        inputs_mask_s = C.to_sequence(C.reshape(inputs_mask,
                                                (-1, 1)))  # [#,c'][1]
        trans_matrix = C.to_sequence_like(C.transpose(unpack_matrix, [1, 0]),
                                          inputs_mask_s)  # [#,c'][*=q]
        trans_matrix = C.sequence.gather(trans_matrix,
                                         inputs_mask_s)  # [#,c2][*=q]
        trans_matrix = C.element_select(
            C.sequence.broadcast_as(matrix_mask, trans_matrix), trans_matrix,
            C.constant(-1e30))
        logits2 = C.softmax(trans_matrix, axis=0,
                            name='level 2 weight')  # [#,c2][*=c]
        unpack_new_q, new_q_mask = C.sequence.unpack(
            new_q, 0).outputs  # [#][*=q,2*d] [#][*=q]
        expand_new_q = C.transpose(
            C.sequence.broadcast_as(unpack_new_q, trans_matrix),
            [1, 0])  # [#,c2][2d,*=q]
        c_over_q = C.reshape(C.reduce_sum(logits2 * expand_new_q, axis=1),
                             (-1, )) / (2 * self.hidden_dim)**0.5  # [#,c2][2d]
        c_over_q = C.reconcile_dynamic_axes(c_over_q, cln_inp_ph)

        weighted_q = c_over_q.clone(C.CloneMethod.share, {
            cln_mem_ph: memory_,
            cln_inp_ph: inputs_
        })  # [#,c][2d]
        c2c = q_over_c.clone(C.CloneMethod.share, {
            cln_mem_ph: inputs_,
            cln_inp_ph: inputs_
        })  # [#,c][2d]

        att_context = C.splice(input_ph, weighted_q, c2c)  # 2d+2d+2d

        return C.as_block(att_context, [(input_ph, context),
                                        (input_mem, query)], 'attention_layer',
                          'attention_layer')
Example #22
def test_times_const_broadcast():
    x = C.input_variable((3, ))
    a = C.constant(np.ones((3, ), dtype=np.float32))
    y = C.times_transpose(a, x)
    result = y.eval({x: np.asarray([[1, 2, 3], [1, 2, 3]], dtype=np.float32)})
    assert np.array_equal(result, [[6], [6]])
Example #23
def gpt2_self_attention(token_dims: int,
                        head_dims: int,
                        mask_opt: bool = False,
                        as_block: bool = False,
                        name: str = 'self_attention'):
    X = C.placeholder(token_dims,
                      dynamic_axes=(C.Axis.default_batch_axis(),
                                    C.Axis.default_dynamic_axis()),
                      name=name)

    # q = C.layers.Dense(token_dims, name=name+'_q')(X)
    # k = C.layers.Dense(token_dims, name=name+'_k')(X)
    # v = C.layers.Dense(token_dims, name=name+'_v')(X)

    # attn_c_attn_w = C.parameter((token_dims,3*token_dims), name='attn_c_attn_w')
    # qkv = C.reshape(X@attn_c_attn_w, (3,-1), name='qkv')

    qkv = C.layers.Dense((3, token_dims), name='qkv')(X)
    q_seq, k_seq, v_seq = qkv[0], qkv[1], qkv[2]

    q_mh = C.reshape(q_seq, (head_dims, -1), name='multi_head_q')
    k_mh = C.reshape(k_seq, (head_dims, -1), name='multi_head_k')
    v_mh = C.reshape(v_seq, (head_dims, -1), name='multi_head_v')

    #region split multi head attention
    q_heads = [
        C.squeeze(q_mh[i], name='single_head_q' + str(i))
        for i in range(head_dims)
    ]
    k_heads = [
        C.squeeze(k_mh[i], name='single_head_k' + str(i))
        for i in range(head_dims)
    ]
    v_heads = [
        C.squeeze(v_mh[i], name='single_head_v' + str(i))
        for i in range(head_dims)
    ]
    #endregion

    attention_head = []
    for i in range(head_dims):
        q = q_heads[i]
        k = k_heads[i]
        v = v_heads[i]

        #region score
        # q_ = C.sequence.last(q, name='last_q'+str(i)) # q present
        q_ = C.sequence.unpack(q, 0, True, name='seq_q' + str(i))  # q seq
        k_ = C.sequence.unpack(k, 0, True, name='seq_k' + str(i))  # k seq
        v_ = C.sequence.unpack(v, 0, True, name='seq_v' + str(i))  # v seq

        scores = C.times_transpose(q_, k_)
        scaled = scores * (1 / C.sqrt(v_.shape[-1]))

        #region mask opt
        mask = triangular_matrix_seq(2)(X)
        inf_mask = -np.inf * (mask - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')

        scaled = C.element_min(scaled, inf_mask)
        #endregion

        softmax = C.softmax(scaled)
        #endregion
        #region sum
        attention = C.times(softmax, v_)
        attention_seq = C.to_sequence_like(attention, X)
        #endregion
        attention_head.append(attention_seq)


    #region merge attention heads
    attention = C.splice(*attention_head, name='merged_attention')
    #endregion

    #region project
    project = C.layers.Dense(token_dims, name='project')(attention)
    #endregion

    if as_block:
        return C.as_block(project, [(X, X)], 'gpt2_self_attention',
                          'gpt2_self_attention')

    return project
Example #24
def gram(x):
    features = C.minus(flatten(x), C.reduce_mean(x))
    return C.times_transpose(features, features)
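The `flatten` helper is not shown in this snippet; a sketch of what it presumably looks like (based on the CNTK neural-style example this appears to come from), collapsing a (channels, height, width) feature map to (channels, height*width):

import cntk as C

def flatten(x):
    # assumed helper: collapse the spatial dimensions of a convolutional feature map
    assert len(x.shape) >= 3
    return C.reshape(x, (x.shape[0], x.shape[1] * x.shape[2]))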
Example #25
def hierarchical_softmax_layer_for_sequence(input_var, num_output_classes, target_class, target_output_in_class, batch_size, w1, b1, w2s, b2s):
    '''
    A two-layer hierarchical softmax function with a sequence-axis input:

    Example:
        >>> input_dim = 2
        >>> num_output_classes = 4
        >>> minibatch_size = 3
        >>> seq_size = 5
        >>> n_classes = int(math.ceil(math.sqrt(num_output_classes)))
        >>> n_outputs_per_class = n_classes

        >>> w1 = C.parameter(shape=(input_dim, n_classes), init=C.glorot_normal(seed=2), name='w1')
        >>> b1 = C.parameter(shape=(n_classes), init=C.glorot_normal(seed=3), name='b1')
        >>> w2s = C.parameter(shape=(n_classes, input_dim, n_outputs_per_class), init=C.glorot_normal(seed=4), name='w2s')
        >>> b2s = C.parameter(shape=(n_classes, n_outputs_per_class), init=C.glorot_normal(seed=5), name='b2s')

        # neural network structure for hierarchical softmax
        >>> h_input = C.sequence.input_variable(input_dim)
        >>> h_target_class = C.sequence.input_variable([1])
        >>> h_target_output_in_class = C.sequence.input_variable([1])
        >>> h_z, class_probs, all_probs = hierarchical_softmax_layer_for_sequence(h_input, num_output_classes, h_target_class, h_target_output_in_class, minibatch_size, w1, b1, w2s, b2s)

        >>> a = np.reshape(np.arange(seq_size * minibatch_size * input_dim, dtype = np.float32), (seq_size, minibatch_size, input_dim))
        >>> labels = np.reshape(np.arange(seq_size * minibatch_size, dtype = np.float32), (seq_size, minibatch_size, 1)) % num_output_classes
        >>> target_labels = labels // n_outputs_per_class
        >>> target_output_in_labels = labels % n_outputs_per_class
        >>> h_z.eval({h_input: a, h_target_class: target_labels, h_target_output_in_class: target_output_in_labels})[1]
        array([[ 0.000859],
               [ 0.      ],
               [ 0.      ]], dtype=float32)

    Args:
        input_var: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        num_output_classes: int
        target_class: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        target_output_in_class: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        batch_size: int
        w1: C.parameter
        b1: C.parameter
        w2s: C.parameter
        b2s: C.parameter
    Returns:
        output_prob: class:`~cntk.ops.functions.Function`
        class_probs: class:`~cntk.ops.functions.Function`
        all_probs: a list of class:`~cntk.ops.functions.Function`
    '''
    input_dim = input_var.shape[0]

    n_classes = int(math.ceil(math.sqrt(num_output_classes)))
    n_outputs_per_class = n_classes

    class_probs = C.softmax(b1 + C.times(input_var, w1))

    w2_temp = C.gather(w2s, target_class)
    w2 = reshape(w2_temp, (input_dim, n_outputs_per_class))
    w2 = C.sequence.broadcast_as(w2, input_var)
    b2 = reshape(C.gather(b2s, target_class), (n_outputs_per_class))
    b2 = C.sequence.broadcast_as(b2, input_var)

    times_result = times(input_var, w2)
    probs_in_class = softmax(b2 + times_result)
    probs_in_class = C.sequence.broadcast_as(probs_in_class, target_output_in_class)
    target_output_in_class = C.one_hot(target_output_in_class, n_outputs_per_class, False)
    probs_in_class = C.sequence.broadcast_as(probs_in_class, target_output_in_class)
    prob_in_class = C.times_transpose(probs_in_class, target_output_in_class)
    target_class = C.one_hot(target_class, n_classes, False)
    class_probs = C.sequence.broadcast_as(class_probs, target_class)
    class_prob = C.times_transpose(class_probs, target_class)

    output_prob = C.element_times(class_prob, prob_in_class)

    # this is for calculating all the outputs' probabilities
    all_probs = []
    for i in range(n_classes):
        ci = C.constant(i)
        w2a = C.reshape(C.gather(w2s, ci), (input_dim, n_outputs_per_class))
        w2a = C.sequence.broadcast_as(w2a, input_var)
        b2a = C.reshape(C.gather(b2s, ci), (n_outputs_per_class))
        b2a = C.sequence.broadcast_as(b2a, input_var)

        probs_in_classa = C.softmax(b2a + times(input_var, w2a))
        cia = C.constant(i, shape=[1])
        cia = C.reconcile_dynamic_axes(cia, class_probs)
        cia = C.one_hot(cia, n_outputs_per_class, False)
        class_proba = C.times_transpose(class_probs, cia)
        class_proba = C.sequence.broadcast_as(class_proba, probs_in_classa)

        output_proba = C.element_times(class_proba, probs_in_classa)
        all_probs.append(output_proba)

    return output_prob, class_probs, all_probs
Example #27
def test_times_const_broadcast():
    x = C.input_variable((3,))
    a = C.constant(np.ones((3,), dtype=np.float32))
    y = C.times_transpose(a, x)
    result = y.eval({x:np.asarray([[1,2,3],[1,2,3]], dtype=np.float32)})
    assert np.array_equal(result, [[6], [6]])