def build_graph(self_attention, self_penalty, embeded_dim=60, h_dim=150, d_a=350, r=30):
    with C.layers.default_options(init=C.xavier()):
        embeded = C.layers.Embedding(embeded_dim)(x)
        embeded = C.layers.Stabilizer()(embeded)

        H = create_birnn(C.layers.GRU(h_dim), C.layers.GRU(h_dim))(embeded)

        if self_attention:
            Ws1 = C.parameter(shape=(d_a, 2 * h_dim), name="Ws1")
            Ws2 = C.parameter(shape=(r, d_a), name="Ws2")
            A = C.softmax(C.times(Ws2, C.tanh(C.times_transpose(Ws1, H))))
            H = C.times(A, H)  # the M in the paper

        if self_penalty:
            I = C.constant(np.eye(r), dtype=np.float32)
            P = C.times_transpose(A, A) - I  # r*r
            p = C.reduce_sum(C.abs(C.element_times(P, P)))  # frobenius norm **2

        y_ = C.layers.Dense(200, activation=C.ops.relu)(H)
        # y_pre = C.layers.Dense(num_labels, activation = None)(y_)

        def selfAtt(x):
            y_pre = C.layers.Dense(num_labels, activation=None)(y_)
            return y_pre

        if self_penalty:
            selfAtt.p = p

        return selfAtt
def create_word2vec_cbow_model(word_one_hot, context_one_hots, negative_one_hots):
    # shared_embedding_layer = Embedding(G.embedding_dimension, uniform(scale=1.0/2.0/G.embedding_dimension))
    shared_embedding_layer = Embedding(G.embedding_dimension)

    word_embedding = shared_embedding_layer(word_one_hot)
    context_embeddings = [shared_embedding_layer(x) for x in context_one_hots]
    negative_embeddings = [shared_embedding_layer(x) for x in negative_one_hots]

    print(word_embedding.shape)
    word_embedding_reshaped = C.reshape(word_embedding, shape=(1, G.embedding_dimension))
    print(word_embedding_reshaped.shape)

    context_embeddings_all = C.reshape(C.splice(*context_embeddings), shape=(context_size, G.embedding_dimension))
    negative_embeddings_all = C.reshape(C.splice(*negative_embeddings), shape=(G.negative, G.embedding_dimension))
    print(context_embeddings_all.shape)
    print(negative_embeddings_all.shape)

    cbow = C.reshape(C.reduce_mean(context_embeddings_all, 0), shape=(G.embedding_dimension))
    print(cbow.shape)

    # word_context_product = C.times_transpose(word_embedding_reshaped, cbow)
    word_context_product = C.times_transpose(word_embedding, cbow)
    print(word_context_product.shape)

    negative_context_product = C.reshape(C.times_transpose(negative_embeddings_all, cbow), shape=(G.negative))
    print(negative_context_product.shape)

    word_negative_context_product = C.splice(word_context_product, negative_context_product)
    print(word_negative_context_product.shape)

    # return model and shared embedding layer
    return word_negative_context_product, shared_embedding_layer
def cross_entropy_with_sampled_softmax(
        hidden_vector,          # Node providing the output of the recurrent layers
        target_vector,          # Node providing the expected labels (as sparse vectors)
        vocab_dim,              # Vocabulary size
        hidden_dim,             # Dimension of the hidden vector
        num_samples,            # Number of samples to use for sampled softmax
        sampling_weights,       # Node providing weights to be used for the weighted sampling
        allow_duplicates=False  # Boolean flag to control whether to use sampling with replacement
                                # (allow_duplicates == True) or without replacement.
):
    bias = C.layers.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.layers.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(
        sampling_weights, num_samples, allow_duplicates)  # sparse matrix [num_samples * vocab_size]
    if use_sparse:
        sample_selector = sample_selector_sparse
    else:
        # Note: Sampled softmax with dense data is only supported for debugging purposes.
        # It might easily run into memory issues as the matrix 'I' below might be quite large.
        # In case we want a dense representation for all data we have to convert the sample selector.
        I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
        sample_selector = C.times(sample_selector_sparse, I)

    inclusion_probs = C.random_sample_inclusion_frequency(
        sampling_weights, num_samples, allow_duplicates)  # dense row [1 * vocab_size]
    log_prior = C.log(inclusion_probs)  # dense row [1 * vocab_dim]

    print("hidden_vector: " + str(hidden_vector.shape))
    wS = C.times(sample_selector, weights, name='wS')  # [num_samples * hidden_dim]
    print("ws:" + str(wS.shape))
    zS = C.times_transpose(wS, hidden_vector, name='zS1') \
        + C.times(sample_selector, bias, name='zS2') \
        - C.times_transpose(sample_selector, log_prior, name='zS3')  # [num_samples]

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(target_vector, weights, name='wT')  # [1 * hidden_dim]
    zT = C.times_transpose(wT, hidden_vector, name='zT1') \
        + C.times(target_vector, bias, name='zT2') \
        - C.times_transpose(target_vector, log_prior, name='zT3')  # [1]

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    # We don't check whether any of the classes in the random samples coincides with the true label,
    # so it might happen that the true class is counted twice in the normalizing denominator of sampled softmax.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape=(vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)

    return (z, cross_entropy_on_samples, error_on_samples)
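A minimal wiring sketch for the criterion above (my assumptions, not from the original source: `use_sparse` is a module-level flag read inside the function, and the sampling weights are uniform over the vocabulary). It only builds the graph; no reader or training loop is shown.

import numpy as np
import cntk as C

use_sparse = True  # assumed module-level flag read by cross_entropy_with_sampled_softmax

vocab_dim, hidden_dim, num_samples = 100, 8, 5
hidden = C.input_variable(hidden_dim)
target = C.input_variable(vocab_dim, is_sparse=True)

# uniform sampling weights, shaped as a row vector as expected by C.random_sample
sampling_weights = C.reshape(
    C.Constant(np.full((vocab_dim,), 1.0 / vocab_dim, dtype=np.float32)), (1, vocab_dim))

z, cross_entropy, error = cross_entropy_with_sampled_softmax(
    hidden, target, vocab_dim, hidden_dim, num_samples, sampling_weights)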
def hierarchical_softmax_layer(input_var, label_index, label_dim, label_classes=None):
    '''
    A two layers hierarchical softmax function:

    Args:
        input_var: Variable with shape: [#,*](dim_x)
        label_index: index of label's category: [#,*](1)
        label_dim: number of the label categories
        label_classes: number of classes of the label categories
    Returns:
        output_prob: the probability of the given label [#,*](1)
        class_probs: the probability of all the label classes [#,*](label_classes)
        all_probs: the probability of all label classes
    '''
    input_dim = input_var.shape[0]

    if not label_classes:
        label_classes = int(np.ceil(np.sqrt(float(label_dim))))

    n_outputs_per_class = int(np.ceil(label_dim / label_classes))

    target_class = C.floor((label_index + 0.5) / n_outputs_per_class)
    target_output_in_class = C.round(label_index - target_class * n_outputs_per_class)

    w1 = parameter(shape=(input_dim, label_classes), init=C.glorot_normal(), name='hsoftmax_w1')
    b1 = parameter(shape=(label_classes), init=C.glorot_normal(), name='hsoftmax_b1')
    w2s = parameter(shape=(label_classes, input_dim, n_outputs_per_class,), init=C.glorot_normal(), name='hsoftmax_w2s')
    b2s = parameter(shape=(label_classes, n_outputs_per_class,), init=C.glorot_normal(), name='hsoftmax_b2s')

    class_probs = softmax(b1 + times(input_var, w1))

    # TODO: fix the bug in backprop for sparse, and use sparse embedding to accelerate
    target_class_one_hot = C.one_hot(target_class, num_classes=label_classes, sparse_output=False)
    w2 = C.reshape(C.times(target_class_one_hot, w2s, output_rank=2), [input_dim, -1])
    b2 = C.reshape(times(target_class_one_hot, b2s, output_rank=1), [-1])
    probs_in_class = softmax(b2 + times(input_var, w2))

    prob_in_class = C.times_transpose(
        C.one_hot(target_output_in_class, num_classes=n_outputs_per_class, sparse_output=False), probs_in_class)
    class_prob = C.times_transpose(
        C.one_hot(target_class, num_classes=label_classes, sparse_output=False), class_probs)
    output_prob = prob_in_class * class_prob

    # this is for calculating all the outputs' probabilities
    all_probs = []
    for i in range(label_classes):
        ci = C.constant(i)
        ci_one_hot = C.one_hot(ci, num_classes=label_classes, sparse_output=False)
        w2a = C.times(ci_one_hot, w2s, output_rank=2)
        b2a = C.times(ci_one_hot, b2s, output_rank=1)
        probs_in_classa = C.softmax(b2a + times(input_var, w2a))
        class_proba = C.times_transpose(ci_one_hot, class_probs)
        output_proba = probs_in_classa * class_proba
        all_probs.append(output_proba)

    return output_prob, class_probs, all_probs
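A small usage sketch for the hierarchical softmax above (my assumptions: the bare `parameter`, `times` and `softmax` names are the cntk ops imported at module level, and the label index is fed as a float tensor of shape (1,)). With `label_dim=4` the hierarchy has ceil(sqrt(4)) = 2 classes with 2 outputs each.

import numpy as np
import cntk as C
from cntk import parameter, times, softmax  # assumed module-level imports used by the function

x = C.input_variable(2)   # feature vector, dim_x = 2
y = C.input_variable(1)   # label index in [0, 3]
output_prob, class_probs, all_probs = hierarchical_softmax_layer(x, y, label_dim=4)

x_val = np.array([[0.3, 0.7]], dtype=np.float32)
y_val = np.array([[2.0]], dtype=np.float32)
print(output_prob.eval({x: x_val, y: y_val}))  # approximate P(label == 2 | x)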
def cross_entropy_with_sampled_softmax(
        hidden_vector,          # Node providing the output of the recurrent layers
        target_vector,          # Node providing the expected labels (as sparse vectors)
        vocab_dim,              # Vocabulary size
        hidden_dim,             # Dimension of the hidden vector
        num_samples,            # Number of samples to use for sampled softmax
        sampling_weights,       # Node providing weights to be used for the weighted sampling
        allow_duplicates=False  # Boolean flag to control whether to use sampling with replacement
                                # (allow_duplicates == True) or without replacement.
):
    bias = C.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(
        sampling_weights, num_samples, allow_duplicates)  # sparse matrix [num_samples * vocab_size]
    if use_sparse:
        sample_selector = sample_selector_sparse
    else:
        # Note: Sampled softmax with dense data is only supported for debugging purposes.
        # It might easily run into memory issues as the matrix 'I' below might be quite large.
        # In case we want a dense representation for all data we have to convert the sample selector.
        I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
        sample_selector = C.times(sample_selector_sparse, I)

    inclusion_probs = C.random_sample_inclusion_frequency(
        sampling_weights, num_samples, allow_duplicates)  # dense row [1 * vocab_size]
    log_prior = C.log(inclusion_probs)  # dense row [1 * vocab_dim]

    print("hidden_vector: " + str(hidden_vector.shape))
    wS = C.times(sample_selector, weights, name='wS')  # [num_samples * hidden_dim]
    print("ws:" + str(wS.shape))
    zS = C.times_transpose(wS, hidden_vector, name='zS1') \
        + C.times(sample_selector, bias, name='zS2') \
        - C.times_transpose(sample_selector, log_prior, name='zS3')  # [num_samples]

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(target_vector, weights, name='wT')  # [1 * hidden_dim]
    zT = C.times_transpose(wT, hidden_vector, name='zT1') \
        + C.times(target_vector, bias, name='zT2') \
        - C.times_transpose(target_vector, log_prior, name='zT3')  # [1]

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    # We don't check whether any of the classes in the random samples coincides with the true label,
    # so it might happen that the true class is counted twice in the normalizing denominator of sampled softmax.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape=(vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)

    return (z, cross_entropy_on_samples, error_on_samples)
def cross_entropy_with_full_softmax(
        hidden_vector,  # Node providing the output of the recurrent layers
        target_vector,  # Node providing the expected labels (as sparse vectors)
        vocab_dim,      # Vocabulary size
        hidden_dim      # Dimension of the hidden vector
):
    bias = C.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    z = C.reshape(C.times_transpose(weights, hidden_vector) + bias, (1, vocab_dim))
    zT = C.times_transpose(z, target_vector)

    ce = C.reduce_log_sum_exp(z) - zT
    zMax = C.reduce_max(z)
    error_on_samples = C.less(zT, zMax)

    return (z, ce, error_on_samples)
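For contrast with the sampled version, a quick sketch of wiring the full-softmax criterion above (my assumption: a plain dense hidden vector and a one-hot target; in the original the hidden vector comes from recurrent layers).

import numpy as np
import cntk as C

vocab_dim, hidden_dim = 10, 4
hidden = C.input_variable(hidden_dim)
target = C.input_variable(vocab_dim)

z, ce, error_on_samples = cross_entropy_with_full_softmax(hidden, target, vocab_dim, hidden_dim)

h = np.random.rand(1, hidden_dim).astype(np.float32)
t = np.eye(vocab_dim, dtype=np.float32)[[3]]   # one-hot target for word id 3
print(ce.eval({hidden: h, target: t}))         # cross-entropy for this sample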
def triangular_matrix_seq(mode: int = 1):
    X = C.placeholder(1)
    ones = C.ones_like(X[0])
    perm_1 = C.layers.Recurrence(C.plus, return_full_state=True)(ones)
    perm_2 = C.layers.Recurrence(C.plus, go_backwards=True, return_full_state=True)(ones)

    arr_1 = C.sequence.unpack(perm_1, 0, True)
    arr_2 = C.sequence.unpack(perm_2, 0, True)

    mat = C.times_transpose(arr_1, arr_2)
    mat_c = arr_1 * arr_2

    diagonal_mat = mat - mat_c

    final_mat = diagonal_mat
    if mode == 0:
        final_mat = C.equal(final_mat, 0)
    elif mode == 1:
        final_mat = C.less_equal(final_mat, 0)
    elif mode == 2:
        final_mat = C.less(final_mat, 0)
    elif mode == -1:
        final_mat = C.greater_equal(final_mat, 0)
    elif mode == -2:
        final_mat = C.greater(final_mat, 0)

    result = C.as_block(final_mat, [(X, X)], 'triangular_matrix')

    return C.stop_gradient(result)
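A quick check of what the helper above produces (my assumption about the exact output: with mode=2 the ones mark positions j > i, i.e. the strictly upper-triangular part, which the self-attention snippets further down turn into a causal mask).

import numpy as np
import cntk as C

x = C.sequence.input_variable(1)
mask = triangular_matrix_seq(mode=2)(x)

seq = np.ones((4, 1), dtype=np.float32)   # a single sequence of length 4
print(np.squeeze(mask.eval({x: seq})))
# expected: 1s strictly above the diagonal of a 4x4 matrix, 0s elsewhere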
def cross_entropy_with_full_softmax(
        output,         # Node providing the output of the lstm layers
        target_vector,  # Node providing the expected labels
        sv_dim,
        vocab_dim
):
    sv_vector = output.outputs[3]
    z = output.outputs[0]
    zT = C.times_transpose(z, target_vector)

    # cross entropy loss with softmax function
    ce = -C.log(zT)
    # the error
    zMax = C.reduce_max(z)
    error = C.less(zT, zMax)
    ce = sequence.reduce_sum(ce)

    # discourages the network from turning more than one gate off in a single time step.
    sumc = C.abs(C.sequence.slice(sv_vector, 1, 0) - C.sequence.slice(sv_vector, 0, -1))
    sumc = sequence.reduce_sum(0.0001 * C.pow(100.0, sumc))
    # ce += sumc

    # penalise generated utterances that failed to render all the required slots
    sumc += C.abs(C.sequence.last(sv_vector))
    sumc += C.abs(C.sequence.first(sv_vector) - output.outputs[4])
    sumc = C.reduce_sum(sumc)

    ce = C.reduce_sum(ce)
    ce += sumc

    return ce, error
def createDecoderNetwork(self, networkHiddenSrc, srcLength, trgLength):
    timeZeroHidden = C.slice(networkHiddenSrc, 0, 0, 1)
    srcSentEmb = C.slice(timeZeroHidden, -1, Config.SrcHiddenSize, Config.SrcHiddenSize * 2)
    networkHiddenTrg = {}
    inputTrg = C.reshape(self.inputMatrixTrg,
                         shape=(Config.TrgMaxLength, Config.BatchSize, Config.TrgVocabSize))
    attProbAll = []
    tce = 0
    for i in range(0, trgLength, 1):
        preTrgEmb = self.initTrgEmb if i == 0 else self.EmbTrg(inputTrg[i - 1])

        if (i == 0):
            networkHiddenTrg[i] = self.createDecoderInitNetwork(srcSentEmb)
        else:
            (networkHiddenTrg[i], attProb) = self.createDecoderRNNNetwork(
                networkHiddenSrc, preTrgEmb, networkHiddenTrg[i - 1], srcLength)
            attProbAll = attProb if i == 1 else C.splice(attProbAll, attProb, axis=0)

        preSoftmax = self.createReadOutNetwork(networkHiddenTrg[i], preTrgEmb)
        ce = C.cross_entropy_with_softmax(preSoftmax, inputTrg[i], 2)
        ce = C.reshape(ce, shape=(1, Config.BatchSize))
        tce += C.times_transpose(ce, self.maskMatrixTrg[i])

    return tce
def attention(query, key, value):
    dk = C.reduce_sum(C.ones_like(query))  # cannot use sequence.last, will conflict with recurrence
    # dk: [#, *] [1, ] and value = int(dim_of_query)

    unpacked_key = C.sequence.unpack(key, padding_value=0, no_mask_output=True)      # [#] [-3, key_dim]
    unpacked_value = C.sequence.unpack(value, padding_value=0, no_mask_output=True)  # [#] [-3, value_dim]

    broadcasted_key = C.sequence.broadcast_as(unpacked_key, query)  # [#, *] [-3, key_dim]
    scaled = C.times_transpose(query, broadcasted_key) / dk
    # [#, *] [q_dim] @ [#, *] [key_dim, -3], assert q_dim == key_dim
    # scaled: [#, *] [-3, ] => for every key seq element, there is a corresponding score

    # masked out invalid temporal connections to obey_sequence_order
    if obey_sequence_order and max_seq_len:
        unpacked_scaled, scaled_mask = C.sequence.unpack(scaled, padding_value=0).outputs
        # unpacked_scaled: [#] [-3, -3] <== matrix will be top right diagonally zero-ed
        # scaled_mask: [#] [-3,]

        minus_inf = C.constant(-1e+30)
        valid_connections = C.Constant(np.tril(np.ones((max_seq_len, max_seq_len)), k=0))  # [] [max_seq, max_seq]
        valid_connections = C.reconcile_dynamic_axes(valid_connections, unpacked_scaled)   # [#] [max_seq, max_seq]
        valid_connections = C.crop_manual(valid_connections, unpacked_scaled, 0, 0)        # [#] [-3, -3]
        unpacked_scaled = C.element_select(valid_connections, unpacked_scaled, minus_inf)  # [#] [-3, -3]
        scaled = C.to_sequence_like(unpacked_scaled, query)  # [#, *] [-3]

    elif obey_sequence_order and not max_seq_len:
        raise ValueError("max_seq_len must be defined when obey_sequence_order is True")

    attended = C.times(C.softmax(scaled, axis=-1), C.sequence.broadcast_as(unpacked_value, query))  # [#, *] [value_dim,]
    return attended
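A minimal call sketch for the function above (my assumptions: `obey_sequence_order` and `max_seq_len` are free variables from an enclosing scope, set here so that no causal mask is applied, and query/key/value share the same dimension).

import numpy as np
import cntk as C

obey_sequence_order, max_seq_len = False, None   # assumed enclosing-scope settings

q = C.sequence.input_variable(4)
k = C.sequence.input_variable(4)
v = C.sequence.input_variable(4)
attended = attention(q, k, v)                    # [#, *] [4]

data = np.random.rand(1, 5, 4).astype(np.float32)  # one sequence of length 5
out = attended.eval({q: data, k: data, v: data})
print(np.asarray(out).shape)                     # expected: (1, 5, 4)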
def model(seq_image, decoded):
    params = dense(decoded)
    g_x, g_y, sigma2, delta, gamma = attention_parameters(params)

    i = C.Constant(np.arange(n) + 1)  # col of patch
    j = C.Constant(np.arange(n) + 1)  # row of patch
    mu_x = g_x + (i - n / 2 - 0.5) * delta
    mu_y = g_y + (j - n / 2 - 0.5) * delta
    mu_x = C.expand_dims(mu_x, axis=-1)
    mu_y = C.expand_dims(mu_y, axis=-1)
    # mu_x: [#, *] [n, 1]
    # mu_y: [#, *] [n, 1]

    image = C.sequence.unpack(seq_image, padding_value=0, no_mask_output=True)
    # image: [#] [*image_width, filters, image_height]

    width_pos = Cx.sequence.position(seq_image)
    # width_pos: [#, *] [1]

    width_pos_unpacked = C.sequence.unpack(width_pos, padding_value=999_999, no_mask_output=True)
    # width_pos: [#] [*image_width, 1]

    a = C.sequence.broadcast_as(C.swapaxes(width_pos_unpacked), mu_x)
    # a: [#, *] [1, *image_width]
    # x pos index of image (width)

    b = C.Constant(np.arange(image_height).reshape((1, -1)))
    # b: [] [1, image_height]
    # y pos index of image (height)

    # calculate which portion of the image is attended to by the gaussian filter
    f_xi = C.exp(-0.5 * C.square(a - mu_x) / sigma2)
    f_yj = C.exp(-0.5 * C.square(b - mu_y) / sigma2)
    # f_xi: [#, *] [n, *image_width]
    # f_yj: [#, *] [n, image_height]

    z_x = C.reduce_sum(f_xi, axis=1)
    z_y = C.reduce_sum(f_yj, axis=1)
    # z_x: [#, *] [n]
    # z_y: [#, *] [n]

    f_xi = f_xi / z_x
    f_yj = f_yj / z_y
    # f_xi: [#, *] [n, *image_width]
    # f_yj: [#, *] [n, image_height]

    # combine filters from x and y
    image_broadcasted = C.sequence.broadcast_as(image, f_yj)
    attended = gamma * C.times(f_xi, C.times_transpose(image_broadcasted, f_yj), output_rank=2)
    # attended: [#, *] [n, filters, n]
    attended = C.swapaxes(attended)
    # attended: [#, *] [filters, n (x), n (y)]
    return attended
def build_trainer(self):
    # Set the learning rate and the momentum parameters for the Adam optimizer.
    lr = learning_rate_schedule(self.lr, UnitType.minibatch)
    beta1 = momentum_schedule(0.9)
    beta2 = momentum_schedule(0.99)

    # Calculate the losses.
    loss_on_v = cntk.squared_error(self.R, self.v)

    pi_a_s = cntk.log(cntk.times_transpose(self.pi, self.action))
    loss_on_pi = cntk.variables.Constant(-1) * (cntk.plus(
        cntk.times(pi_a_s, cntk.minus(self.R, self.v_calc)),
        0.01 * cntk.times_transpose(self.pi, cntk.log(self.pi))))
    # loss_on_pi = cntk.times(pi_a_s, cntk.minus(self.R, self.v_calc))

    self.tensorboard_v_writer = TensorBoardProgressWriter(
        freq=10, log_dir="tensorboard_v_logs", model=self.v)
    self.tensorboard_pi_writer = TensorBoardProgressWriter(
        freq=10, log_dir="tensorboard_pi_logs", model=self.pi)
    # tensorboard --logdir=tensorboard_pi_logs  http://localhost:6006/
    # tensorboard --logdir=tensorboard_v_logs   http://localhost:6006/

    # Create the trainers.
    self.trainer_v = cntk.Trainer(self.v, (loss_on_v), [adam(
        self.pms_v, lr, beta1, variance_momentum=beta2,
        gradient_clipping_threshold_per_sample=2, l2_regularization_weight=0.01)],
        self.tensorboard_v_writer)
    self.trainer_pi = cntk.Trainer(self.pi, (loss_on_pi), [adam(
        self.pms_pi, lr, beta1, variance_momentum=beta2,
        gradient_clipping_threshold_per_sample=2, l2_regularization_weight=0.01)],
        self.tensorboard_pi_writer)
def cross_entropy_with_sampled_softmax(
        hidden_vector,
        label_vector,
        vocab_dim,
        hidden_dim,
        num_samples,
        sampling_weights,
        allow_duplicates=False
):
    bias = C.layers.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.layers.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(sampling_weights, num_samples, allow_duplicates)
    sample_selector = sample_selector_sparse

    inclusion_probs = C.random_sample_inclusion_frequency(sampling_weights, num_samples, allow_duplicates)
    log_prior = C.log(inclusion_probs)

    wS = C.times(sample_selector, weights, name='wS')
    zS = C.times_transpose(wS, hidden_vector, name='zS1') \
        + C.times(sample_selector, bias, name='zS2') \
        - C.times_transpose(sample_selector, log_prior, name='zS3')

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(label_vector, weights, name='wT')
    zT = C.times_transpose(wT, hidden_vector, name='zT1') \
        + C.times(label_vector, bias, name='zT2') \
        - C.times_transpose(label_vector, log_prior, name='zT3')

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape=(vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)

    return (z, cross_entropy_on_samples, error_on_samples)
def test_op_times_sparse_grad(device_id, precision):
    dt_precision = PRECISION_TO_TYPE[precision]

    from cntk import times, times_transpose, parameter, reshape, Value, sequence
    dim = 5
    num_sequences = 2
    seq = [i for i in range(dim)]
    identity = np.identity(dim, dtype=dt_precision)
    input_data = Value.one_hot([seq] * num_sequences, dim, dtype=dt_precision)
    input_var = sequence.input_variable(shape=(dim), is_sparse=True, needs_gradient=False, dtype=dt_precision)
    e = parameter(shape=(dim, dim), init=identity, dtype=dt_precision)
    z = reshape(times_transpose(e, times(input_var, e)), dim)
    e_grad = z.grad({input_var: input_data}, [e])

    assert np.allclose(e_grad, np.ones((dim, dim)) * 4)
def test_op_times_sparse_grad(device_id, precision):
    dt_precision = PRECISION_TO_TYPE[precision]

    from cntk import times, times_transpose, parameter, reshape, one_hot
    dim = 5
    num_sequences = 2
    seq = [i for i in range(dim)]
    identity = np.identity(dim, dtype=np.float32)
    input_data = one_hot([seq] * num_sequences, dim)
    input_var = I(shape=(dim), is_sparse=True, needs_gradient=False)
    e = parameter(shape=(dim, dim), init=identity)
    z = reshape(times_transpose(e, times(input_var, e)), dim)
    e_grad = z.grad({input_var: input_data}, [e])

    assert np.allclose(e_grad, np.ones((dim, dim)) * 4)
def test_times_transpose_sequence_param(device_id, precision):
    dt_precision = PRECISION_TO_TYPE[precision]

    from cntk import times_transpose, parameter, sequence, Value
    dim = 5
    num_sequences = 2
    seq = [i for i in range(dim)]
    identity = np.identity(dim, dtype=dt_precision)
    input_data = Value.one_hot([seq] * num_sequences, dim, dtype=dt_precision)
    input_var = sequence.input_variable(shape=(dim), needs_gradient=True, dtype=dt_precision)
    e = parameter(shape=(dim,), init=1, dtype=dt_precision)
    z = times_transpose(e, input_var)
    e_grad = z.grad({input_var: input_data}, [e, input_var])
def dot_attention(self, inputs, memory, dim):
    '''
    @inputs: [#,c][d] a sequence need attention
    @memory(key): [#,q][d] a sequence input refers to compute similarity(weight)
    @value: [#,q][d] a sequence input refers to weighted sum
    @output: [#,c][d] attention vector
    '''
    input_ph = C.placeholder()
    input_mem = C.placeholder()
    with C.layers.default_options(bias=False, activation=C.relu):  # all the projections have no bias
        attn_proj_enc = C.layers.Dense(dim, init=glorot_uniform(), input_rank=1, name="Wqu")
        attn_proj_dec = C.layers.Dense(dim, init=glorot_uniform(), input_rank=1)

    inputs_ = attn_proj_enc(input_ph)   # [#,c][d]
    memory_ = attn_proj_dec(input_mem)  # [#,q][d]
    unpack_memory, mem_mask = C.sequence.unpack(memory_, 0).outputs  # [#][*=q, d], [#][*=q]
    unpack_memory_expand = C.sequence.broadcast_as(unpack_memory, inputs_)  # [#,c][*=q,d]

    matrix = C.times_transpose(inputs_, unpack_memory_expand) / (dim ** 0.5)  # [#,c][*=q]
    mem_mask_expand = C.sequence.broadcast_as(mem_mask, inputs_)  # [#,c][*=q]
    matrix = C.element_select(mem_mask_expand, matrix, C.constant(-1e+30))  # [#,c][*=q]
    logits = C.reshape(C.softmax(matrix), (-1, 1))  # [#,c][*=q,1]
    # [#,c][*=q, d]
    memory_expand = C.sequence.broadcast_as(C.sequence.unpack(input_mem, 0, no_mask_output=True), input_ph)
    weighted_att = C.reshape(C.reduce_sum(logits * memory_expand, axis=0), (-1,))  # [#,c][d]

    return C.as_block(C.combine(weighted_att, logits),
                      [(input_ph, inputs), (input_mem, memory)],
                      'dot attention', 'dot attention')
def self_attention_layer(in_dims: int,
                         out_dims: int,
                         name='self_attention',
                         as_block: bool = False,
                         k_ph: bool = False,
                         v_ph: bool = False,
                         mask_opt: bool = False) -> C.Function:
    sq_sa_dims = C.Constant(C.sqrt(out_dims).eval(), name='sq_dims')

    X = C.placeholder(in_dims, (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()), name=name + '_ph')

    if k_ph is False and v_ph is False:
        q = C.layers.Dense(out_dims, name=name + '_q')(X)  # W_Q = C.parameter((in_dims, out_dims), init=init, name=name+'_q')
        k = C.layers.Dense(out_dims, name=name + '_k')(X)  # W_K = C.parameter((in_dims, out_dims), init=init, name=name+'_k')
        v = C.layers.Dense(out_dims, name=name + '_v')(X)  # W_V = C.parameter((in_dims, out_dims), init=init, name=name+'_v')
    elif k_ph is True and v_ph is True:
        q = C.layers.Dense(out_dims, name=name + '_q')(X)
        k = C.placeholder(out_dims, (C.Axis.default_batch_axis(), C.Axis('kv_seq')), name=name + '_k_ph')
        v = C.placeholder(out_dims, (C.Axis.default_batch_axis(), C.Axis('kv_seq')), name=name + '_v_ph')
    else:
        raise Exception(f'k_ph:{k_ph}, v_ph:{v_ph}')

    q_ = C.sequence.unpack(q, 0, True, name=name + '_unpack_q')
    k_ = C.sequence.unpack(k, 0, True, name=name + '_unpack_k')
    v_ = C.sequence.unpack(v, 0, True, name=name + '_unpack_v')

    scores = C.times_transpose(q_, k_, name=name + '_score_matrix')
    scaled = scores / sq_sa_dims  # div_k
    if mask_opt:
        mask = triangular_matrix_seq(2)(X)
        inf_mask = -np.inf * (mask - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')
        scaled = C.element_min(scaled, inf_mask)

    softmax = C.softmax(scaled, name=name + '_softmax')
    attention = C.times(softmax, v_, name=name + '_attention')

    result = C.to_sequence_like(attention, X)

    if as_block:
        if k_ph is False and v_ph is False:
            return C.as_block(result, [(X, X)], 'self_attention', 'self_attention_')
        elif k_ph is True and v_ph is True:
            return C.as_block(result, [(X, X), (k, k), (v, v)], 'self_attention', 'self_attention_')
        else:
            raise Exception(f'k_ph:{k_ph} v_ph:{v_ph}')
    else:
        return result
def attention_layer(self, context, query, dim):
    input_ph = C.placeholder(shape=(dim,))
    input_mem = C.placeholder(shape=(dim,))
    with C.layers.default_options(bias=False, activation=C.relu):
        attn_proj_enc = C.layers.Dense(self.hidden_dim, init=glorot_uniform(), input_rank=1, name="Wqu")
        attn_proj_dec = C.layers.Dense(self.hidden_dim, init=glorot_uniform(), input_rank=1)

    inputs_ = attn_proj_enc(input_ph)   # [#,c][d]
    memory_ = attn_proj_dec(input_mem)  # [#,q][d]

    cln_mem_ph = C.placeholder()  # [#,q][?=d]
    cln_inp_ph = C.placeholder()  # [#,c][?=d]
    unpack_inputs, inputs_mask = C.sequence.unpack(cln_inp_ph, 0).outputs  # [#][*=c,d] [#][*=c]
    expand_inputs = C.sequence.broadcast_as(unpack_inputs, cln_mem_ph)  # [#,q][*=c,d]
    matrix = C.reshape(C.times_transpose(cln_mem_ph, expand_inputs) / (self.hidden_dim ** 0.5), (-1,))  # [#,q][*=c]
    matrix = C.element_select(C.sequence.broadcast_as(inputs_mask, cln_mem_ph), matrix, C.constant(-1e30))
    logits = C.softmax(matrix, axis=0, name='level 1 weight')  # [#,q][*=c]
    trans_expand_inputs = C.transpose(expand_inputs, [1, 0])  # [#,q][d,*=c]
    q_over_c = C.reshape(C.reduce_sum(logits * trans_expand_inputs, axis=1), (-1,)) / (self.hidden_dim ** 0.5)  # [#,q][d]
    new_q = C.splice(cln_mem_ph, q_over_c)  # [#,q][2*d]
    # over
    unpack_matrix, matrix_mask = C.sequence.unpack(matrix, 0).outputs  # [#][*=q,*=c] [#][*=q]
    inputs_mask_s = C.to_sequence(C.reshape(inputs_mask, (-1, 1)))  # [#,c'][1]
    trans_matrix = C.to_sequence_like(C.transpose(unpack_matrix, [1, 0]), inputs_mask_s)  # [#,c'][*=q]
    trans_matrix = C.sequence.gather(trans_matrix, inputs_mask_s)  # [#,c2][*=q]
    trans_matrix = C.element_select(C.sequence.broadcast_as(matrix_mask, trans_matrix), trans_matrix, C.constant(-1e30))
    logits2 = C.softmax(trans_matrix, axis=0, name='level 2 weight')  # [#,c2][*=c]
    unpack_new_q, new_q_mask = C.sequence.unpack(new_q, 0).outputs  # [#][*=q,2*d] [#][*=q]
    expand_new_q = C.transpose(C.sequence.broadcast_as(unpack_new_q, trans_matrix), [1, 0])  # [#,c2][2d,*=q]
    c_over_q = C.reshape(C.reduce_sum(logits2 * expand_new_q, axis=1), (-1,)) / (2 * self.hidden_dim) ** 0.5  # [#,c2][2d]
    c_over_q = C.reconcile_dynamic_axes(c_over_q, cln_inp_ph)

    weighted_q = c_over_q.clone(C.CloneMethod.share, {cln_mem_ph: memory_, cln_inp_ph: inputs_})  # [#,c][2d]
    c2c = q_over_c.clone(C.CloneMethod.share, {cln_mem_ph: inputs_, cln_inp_ph: inputs_})  # [#,c][2d]

    att_context = C.splice(input_ph, weighted_q, c2c)  # 2d+2d+2d

    return C.as_block(att_context,
                      [(input_ph, context), (input_mem, query)],
                      'attention_layer', 'attention_layer')
def test_times_const_broadcast():
    x = C.input_variable((3,))
    a = C.constant(np.ones((3,), dtype=np.float32))
    y = C.times_transpose(a, x)
    result = y.eval({x: np.asarray([[1, 2, 3], [1, 2, 3]], dtype=np.float32)})
    assert np.array_equal(result, [[6], [6]])
def gpt2_self_attention(token_dims: int,
                        head_dims: int,
                        mask_opt: bool = False,
                        as_block: bool = False,
                        name: str = 'self_attention'):
    X = C.placeholder(token_dims,
                      dynamic_axes=(C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
                      name=name)

    # q = C.layers.Dense(token_dims, name=name+'_q')(X)
    # k = C.layers.Dense(token_dims, name=name+'_k')(X)
    # v = C.layers.Dense(token_dims, name=name+'_v')(X)

    # attn_c_attn_w = C.parameter((token_dims,3*token_dims), name='attn_c_attn_w')
    # qkv = C.reshape(X@attn_c_attn_w, (3,-1), name='qkv')

    qkv = C.layers.Dense((3, token_dims), name='qkv')(X)
    q_seq, k_seq, v_seq = qkv[0], qkv[1], qkv[2]

    q_mh = C.reshape(q_seq, (head_dims, -1), name='multi_head_q')
    k_mh = C.reshape(k_seq, (head_dims, -1), name='multi_head_k')
    v_mh = C.reshape(v_seq, (head_dims, -1), name='multi_head_v')

    #region split multi head attention
    q_heads = [C.squeeze(q_mh[i], name='single_head_q' + str(i)) for i in range(head_dims)]
    k_heads = [C.squeeze(k_mh[i], name='single_head_k' + str(i)) for i in range(head_dims)]
    v_heads = [C.squeeze(v_mh[i], name='single_head_v' + str(i)) for i in range(head_dims)]
    #endregion

    attention_head = []
    for i in range(head_dims):
        q = q_heads[i]
        k = k_heads[i]
        v = v_heads[i]

        #region score
        # q_ = C.sequence.last(q, name='last_q'+str(i))  # q present
        q_ = C.sequence.unpack(q, 0, True, name='seq_q' + str(i))  # q seq
        k_ = C.sequence.unpack(k, 0, True, name='seq_k' + str(i))  # k seq
        v_ = C.sequence.unpack(v, 0, True, name='seq_v' + str(i))  # v seq

        scores = C.times_transpose(q_, k_)
        scaled = scores * (1 / C.sqrt(v_.shape[-1]))

        #region mask opt
        mask = triangular_matrix_seq(2)(X)
        inf_mask = -np.inf * (mask - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')
        scaled = C.element_min(scaled, inf_mask)
        #endregion

        softmax = C.softmax(scaled)
        #endregion

        #region sum
        attention = C.times(softmax, v_)
        attention_seq = C.to_sequence_like(attention, X)
        #endregion

        attention_head.append(attention_seq)

    #region merge attention heads
    attention = C.splice(*attention_head, name='merged_attention')
    #endregion

    #region project
    project = C.layers.Dense(token_dims, name='project')(attention)
    #endregion

    if as_block:
        return C.as_block(project, [(X, X)], 'gpt2_self_attention', 'gpt2_self_attention')

    return project
def gram(x):
    features = C.minus(flatten(x), C.reduce_mean(x))
    return C.times_transpose(features, features)
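A small sketch of how the Gram helper above might be used (my assumption: `flatten` collapses the spatial axes of a [channels, height, width] feature map into [channels, height*width]; the `flatten` below is a hypothetical stand-in, not from the original code).

import numpy as np
import cntk as C

def flatten(x):
    # hypothetical helper: keep channels as rows, collapse height*width into columns
    return C.reshape(x, (x.shape[0], -1))

fmap = C.input_variable((2, 3, 3))   # [channels, height, width]
g = gram(fmap)                       # channel-by-channel correlation matrix

val = np.random.rand(1, 2, 3, 3).astype(np.float32)
print(g.eval({fmap: val}).shape)     # expected: (1, 2, 2)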
def hierarchical_softmax_layer_for_sequence(input_var, num_output_classes, target_class,
                                            target_output_in_class, batch_size, w1, b1, w2s, b2s):
    '''
    A two layers hierarchical softmax function with sequence axis input:

    Example:
        >>> input_dim = 2
        >>> num_output_classes = 4
        >>> minibatch_size = 3
        >>> seq_size = 5
        >>> n_classes = int(math.ceil(math.sqrt(num_output_classes)))
        >>> n_outputs_per_class = n_classes

        >>> w1 = C.parameter(shape=(input_dim, n_classes), init=C.glorot_normal(seed=2), name='w1')
        >>> b1 = C.parameter(shape=(n_classes), init=C.glorot_normal(seed=3), name='b1')
        >>> w2s = C.parameter(shape=(n_classes, input_dim, n_outputs_per_class), init=C.glorot_normal(seed=4), name='w2s')
        >>> b2s = C.parameter(shape=(n_classes, n_outputs_per_class), init=C.glorot_normal(seed=5), name='b2s')

        # neural network structure for hierarchical softmax
        >>> h_input = C.sequence.input_variable(input_dim)
        >>> h_target_class = C.sequence.input_variable([1])
        >>> h_target_output_in_class = C.sequence.input_variable([1])
        >>> h_z, class_probs, all_probs = hierarchical_softmax_layer_for_sequence(h_input, num_output_classes, h_target_class, h_target_output_in_class, minibatch_size, w1, b1, w2s, b2s)

        >>> a = np.reshape(np.arange(seq_size * minibatch_size * input_dim, dtype = np.float32), (seq_size, minibatch_size, input_dim))
        >>> labels = np.reshape(np.arange(seq_size * minibatch_size, dtype = np.float32), (seq_size, minibatch_size, 1)) % num_output_classes
        >>> target_labels = labels // n_outputs_per_class
        >>> target_output_in_labels = labels % n_outputs_per_class
        >>> h_z.eval({h_input: a, h_target_class: target_labels, h_target_output_in_class: target_output_in_labels})[1]
        array([[ 0.000859],
               [ 0.      ],
               [ 0.      ]], dtype=float32)

    Args:
        input_var: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        num_output_classes: int
        target_class: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        target_output_in_class: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        batch_size: int
        w1: C.parameter
        b1: C.parameter
        w2s: C.parameter
        b2s: C.parameter
    Returns:
        output_prob: class:`~cntk.ops.functions.Function`
        class_probs: class:`~cntk.ops.functions.Function`
        all_probs: a list of class:`~cntk.ops.functions.Function`
    '''
    input_dim = input_var.shape[0]

    n_classes = int(math.ceil(math.sqrt(num_output_classes)))
    n_outputs_per_class = n_classes

    class_probs = C.softmax(b1 + C.times(input_var, w1))

    w2_temp = C.gather(w2s, target_class)
    w2 = reshape(w2_temp, (input_dim, n_outputs_per_class))
    w2 = C.sequence.broadcast_as(w2, input_var)
    b2 = reshape(C.gather(b2s, target_class), (n_outputs_per_class))
    b2 = C.sequence.broadcast_as(b2, input_var)

    times_result = times(input_var, w2)
    probs_in_class = softmax(b2 + times_result)
    probs_in_class = C.sequence.broadcast_as(probs_in_class, target_output_in_class)
    target_output_in_class = C.one_hot(target_output_in_class, n_outputs_per_class, False)
    probs_in_class = C.sequence.broadcast_as(probs_in_class, target_output_in_class)
    prob_in_class = C.times_transpose(probs_in_class, target_output_in_class)

    target_class = C.one_hot(target_class, n_classes, False)
    class_probs = C.sequence.broadcast_as(class_probs, target_class)
    class_prob = C.times_transpose(class_probs, target_class)
    output_prob = C.element_times(class_prob, prob_in_class)

    # this is for calculating all the outputs' probabilities
    all_probs = []
    for i in range(n_classes):
        ci = C.constant(i)
        w2a = C.reshape(C.gather(w2s, ci), (input_dim, n_outputs_per_class))
        w2a = C.sequence.broadcast_as(w2a, input_var)
        b2a = C.reshape(C.gather(b2s, ci), (n_outputs_per_class))
        b2a = C.sequence.broadcast_as(b2a, input_var)

        probs_in_classa = C.softmax(b2a + times(input_var, w2a))

        cia = C.constant(i, shape=[1])
        cia = C.reconcile_dynamic_axes(cia, class_probs)
        cia = C.one_hot(cia, n_outputs_per_class, False)
        class_proba = C.times_transpose(class_probs, cia)
        class_proba = C.sequence.broadcast_as(class_proba, probs_in_classa)

        output_proba = C.element_times(class_proba, probs_in_classa)
        all_probs.append(output_proba)

    return output_prob, class_probs, all_probs