import numpy as np
import cntk as C
from cntk.layers import Dense, LSTM, Recurrence, Convolution2D, BatchNormalization
import cntkx as Cx  # assumed: Cx.scalar (used below) comes from the cntkx package
# CTCEncoder (used in the CTC test) is provided by the surrounding project
# (e.g. cntkx); it is not part of cntk itself.


def attention(query, key, value, obey_sequence_order=False, max_seq_len=None):
    # obey_sequence_order and max_seq_len were free variables in the original;
    # they are taken as arguments here so the function is self-contained.
    dk = C.reduce_sum(C.ones_like(query))  # cannot use sequence.last, will conflict with recurrence
    # dk: [#, *] [1, ] and its value equals int(dim_of_query)

    unpacked_key = C.sequence.unpack(key, padding_value=0, no_mask_output=True)  # [#] [-3, key_dim]
    unpacked_value = C.sequence.unpack(value, padding_value=0, no_mask_output=True)  # [#] [-3, value_dim]

    broadcasted_key = C.sequence.broadcast_as(unpacked_key, query)  # [#, *] [-3, key_dim]
    scaled = C.times_transpose(query, broadcasted_key) / dk
    # [#, *] [q_dim] @ [#, *] [key_dim, -3], assert q_dim == key_dim
    # scaled: [#, *] [-3, ] => for every key seq element, there is a corresponding score

    # mask out invalid temporal connections when obey_sequence_order is set
    if obey_sequence_order and max_seq_len:
        unpacked_scaled, scaled_mask = C.sequence.unpack(scaled, padding_value=0).outputs
        # unpacked_scaled: [#] [-3, -3] <== matrix will be top-right diagonally zeroed
        # scaled_mask: [#] [-3,]

        minus_inf = C.constant(-1e+30)
        valid_connections = C.Constant(np.tril(np.ones((max_seq_len, max_seq_len)), k=0))  # [] [max_seq, max_seq]
        valid_connections = C.reconcile_dynamic_axes(valid_connections, unpacked_scaled)  # [#] [max_seq, max_seq]
        valid_connections = C.crop_manual(valid_connections, unpacked_scaled, 0, 0)  # [#] [-3, -3]
        unpacked_scaled = C.element_select(valid_connections, unpacked_scaled, minus_inf)  # [#] [-3, -3]
        scaled = C.to_sequence_like(unpacked_scaled, query)  # [#, *] [-3]

    elif obey_sequence_order and not max_seq_len:
        raise ValueError("max_seq_len must be defined when obey_sequence_order is True")

    attended = C.times(C.softmax(scaled, axis=-1),
                       C.sequence.broadcast_as(unpacked_value, query))  # [#, *] [value_dim,]
    return attended
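
# A minimal smoke test for attention: a sketch using the default (unmasked)
# path; the dimensions and random inputs here are illustrative only.
q = C.sequence.input_variable(4)
k = C.sequence.input_variable(4)
v = C.sequence.input_variable(8)
model = attention(q, k, v)

qn = np.random.random((5, 4)).astype(np.float32)  # one sequence of 5 steps
kn = np.random.random((5, 4)).astype(np.float32)
vn = np.random.random((5, 8)).astype(np.float32)
out = model.eval({q: [qn], k: [kn], v: [vn]})
print(out[0].shape)  # expected: (5, 8) -- one attended value vector per query step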
def triangular_matrix_seq(mode: int = 1):
    # Builds a (seq_len x seq_len) 0/1 comparison matrix from a scalar sequence
    # by outer-combining a forward and a backward cumulative sum of ones.
    X = C.placeholder(1)
    ones = C.ones_like(X[0])
    perm_1 = C.layers.Recurrence(C.plus, return_full_state=True)(ones)
    perm_2 = C.layers.Recurrence(C.plus, go_backwards=True, return_full_state=True)(ones)

    arr_1 = C.sequence.unpack(perm_1, 0, True)
    arr_2 = C.sequence.unpack(perm_2, 0, True)

    mat = C.times_transpose(arr_1, arr_2)
    mat_c = arr_1 * arr_2

    diagonal_mat = mat - mat_c

    final_mat = diagonal_mat
    if mode == 0:
        final_mat = C.equal(final_mat, 0)
    elif mode == 1:
        final_mat = C.less_equal(final_mat, 0)
    elif mode == 2:
        final_mat = C.less(final_mat, 0)
    elif mode == -1:
        final_mat = C.greater_equal(final_mat, 0)
    elif mode == -2:
        final_mat = C.greater(final_mat, 0)

    result = C.as_block(final_mat, [(X, X)], 'triangular_matrix')
    return C.stop_gradient(result)
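
# Hypothetical usage sketch: apply the block to a length-4 scalar sequence and
# inspect the resulting 4x4 mask (by the cumulative-sum construction, mode=1
# should keep entries where the row index <= the column index).
s = C.sequence.input_variable(1)
tri = triangular_matrix_seq(mode=1)(s)
data = np.ones((4, 1), dtype=np.float32)
print(tri.eval({s: [data]})[0])  # should print a 4x4 triangular pattern of ones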
def test_ctc_encoder_train_and_network_output_to_labels():
    # test CTC encoder in training loop and CTCEncoder.network_output_to_labels
    a = C.sequence.input_variable(10)
    labels = ['a', 'b', 'c']
    encoder = CTCEncoder(labels)

    labels_tensor = C.sequence.input_variable(len(encoder.classes_))  # number of classes = 4
    input_tensor = C.sequence.input_variable(100)

    prediction_tensor = Dense(4)(Recurrence(LSTM(100))(C.ones_like(input_tensor)))

    labels_graph = C.labels_to_graph(labels_tensor)
    fb = C.forward_backward(labels_graph, prediction_tensor, blankTokenId=encoder.blankTokenId)

    ground_truth = ['a', 'b', 'b', 'b', 'c']
    seq_length = 10  # must be the same length as the sequence length in network_out

    pred = np.array([[0., 2., 0., 0.],
                     [0., 2., 0., 0.],
                     [0., 0., 2., 0.],
                     [2., 0., 0., 0.],
                     [0., 0., 2., 0.],
                     [2., 0., 0., 0.],
                     [0., 0., 2., 0.],
                     [2., 0., 0., 0.],
                     [0., 0., 0., 2.],
                     [0., 0., 0., 2.]]).astype(np.float32)

    n = np.random.random((10, 100)).astype(np.float32)

    # result = fb.eval({labels_tensor: [encoder.transform(ground_truth, seq_length=seq_length)],
    #                   input_tensor: [n]})
    # print(result)

    adam = C.adam(prediction_tensor.parameters, 0.01, 0.912)
    trainer = C.Trainer(prediction_tensor, (fb,), [adam])

    for i in range(300):
        trainer.train_minibatch({labels_tensor: [encoder.transform(ground_truth, seq_length=seq_length)],
                                 input_tensor: [n]})
        # print(trainer.previous_minibatch_loss_average)

    result = prediction_tensor.eval({input_tensor: [n]})
    assert encoder.network_output_to_labels(result[0], squash_repeat=True) == ground_truth
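
# Quick check of the encoder used above, a sketch exercising only the
# attributes the test itself touches (CTCEncoder comes from the surrounding
# project, e.g. cntkx, not from cntk).
enc = CTCEncoder(['a', 'b', 'c'])
t = enc.transform(['a', 'b', 'b', 'b', 'c'], seq_length=10)
print(t.shape)  # expected: (10, len(enc.classes_)) == (10, 4)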
def positional_encoding(token_dims: int, discount_factor: float = 0.99):
    X = C.placeholder(token_dims, name='positional_encoding')
    encoder = C.layers.Recurrence(C.element_times, initial_state=1,
                                  return_full_state=True)(C.ones_like(X) * discount_factor)
    return C.stop_gradient(C.as_block(encoder, [(X, X)], 'positional_encoding', 'positional_encoding_'))
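
# Sketch: the recurrence multiplies the running state by discount_factor at
# every step, so step t should read discount_factor ** (t + 1) in every dim.
x = C.sequence.input_variable(3)
pe = positional_encoding(3)(x)
seq = np.ones((4, 3), dtype=np.float32)
print(pe.eval({x: [seq]})[0])  # rows ~ 0.99, 0.9801, 0.970299, 0.96059601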
def true_density(z):
    z1, z2 = z[0], z[1]
    w1 = lambda x: C.sin(2 * np.pi * x / 4)
    u = 0.5 * C.square((z2 - w1(z1)) / 0.4)
    dummy = C.ones_like(u) * 1e7

    # u = C.element_select(C.less_equal(z1, 4), u, dummy)
    cond = C.less_equal(z1, 4)
    u = C.element_select(cond, u, dummy)  # equivalent to: u = cond * u + (1 - cond) * dummy

    return C.exp(-u)
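
# Sketch: evaluate the unnormalised density at a few 2-d points; z1 > 4 falls
# into the cut-off branch, so its density collapses to exp(-1e7) ~ 0.
z = C.input_variable(2)
density = true_density(z)
pts = np.array([[0., 0.], [2., 1.], [5., 0.]], dtype=np.float32)
print(density.eval({z: pts}))  # roughly [1.0, exp(-3.125), 0.0]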
def __cntk_cov2__(m):
    # Covariance over the batch: rows become variables, columns become samples.
    m = C.reshape(m, -1)
    m = C.unpack_batch(m)
    m = C.transpose(m, [1, 0])

    # count the number of samples dynamically via the ones_like trick
    count = C.reduce_sum(C.reduce_mean(C.ones_like(m), axis=0))
    fact = 1.0 / (count - 1)

    m -= C.reduce_mean(m, axis=1)
    mt = C.transpose(m, [1, 0])
    return fact * C.squeeze(m @ mt)
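
# Sanity-check sketch against numpy.cov on a random batch of 10 samples of 3
# variables (float32 tolerance applies).
x = C.input_variable(3)
cov = __cntk_cov2__(x)
data = np.random.randn(10, 3).astype(np.float32)
print(cov.eval({x: data}))
print(np.cov(data, rowvar=False))  # should agree up to float32 precision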
def cgan_discriminator(x, y):
    with C.layers.default_options(init=C.normal(scale=0.02), map_rank=1, use_cntk_engine=True):
        hx = C.reshape(x, (1, 28, 28))
        hy = C.ones_like(hx) * C.reshape(y, (label_dim, 1, 1))  # tile the label over the 28x28 plane

        h = C.splice(hx, hy, axis=0)

        h = C.leaky_relu(Convolution2D((5, 5), 1, strides=(2, 2))(h), alpha=0.2)
        h = C.leaky_relu(BatchNormalization()(Convolution2D((5, 5), 64, strides=(2, 2))(h)), alpha=0.2)
        h = C.leaky_relu(BatchNormalization()(Dense(1024)(h)), alpha=0.2)

        h = Dense(1, activation=C.sigmoid)(h)
        return h
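
# Hypothetical wiring: label_dim is a free variable in cgan_discriminator, so
# it is assumed here to be a module-level constant (10 for MNIST-style labels).
label_dim = 10
x = C.input_variable(784)
y = C.input_variable(label_dim)
D = cgan_discriminator(x, y)
print(D.output.shape)  # expected: (1,) -- a single real/fake probability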
def inner(x):
    # `dropout` is a free variable expected from the enclosing scope (e.g. a
    # C.layers.Dropout instance): one mask is sampled from the first step of
    # the sequence and reused across all time steps.
    mask = dropout(C.ones_like(C.sequence.first(x)))
    mask = C.sequence.broadcast_as(mask, x)
    return mask * x
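
# Minimal sketch of the enclosing scope inner needs; note that a CNTK dropout
# node only fires during training, so a plain eval() leaves x unchanged.
dropout = C.layers.Dropout(0.2)
x = C.sequence.input_variable(6)
y = inner(x)  # the same mask (fixed over time) scales every step of x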
def inner(a):
    # reconcile_dynamic_axes is necessary to avoid subtle bugs, e.g. with
    # sequence.where and one_hot
    return C.reconcile_dynamic_axes(C.sequence.where(C.ones_like(Cx.scalar(a))), a)
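
# Sketch: sequence.where over an all-ones scalar sequence should yield the
# step indices 0..L-1, re-attached to a's dynamic axis (Cx.scalar is assumed
# to reduce each step of a to a scalar).
a = C.sequence.input_variable(3)
positions = inner(a)
data = np.random.random((4, 3)).astype(np.float32)
print(positions.eval({a: [data]})[0])  # expected: [0., 1., 2., 3.]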