def test_h_softmax_for_sequence():
    input_dim = 2
    num_output_classes = 4
    minibatch_size = 3
    seq_size = 2
    n_classes = int(ceil(sqrt(num_output_classes)))
    n_outputs_per_class = n_classes

    w1 = C.parameter(shape=(input_dim, n_classes), init=C.glorot_normal(seed=2), name='w1')
    b1 = C.parameter(shape=(n_classes), init=C.glorot_normal(seed=3), name='b1')
    w2s = C.parameter(shape=(n_classes, input_dim, n_outputs_per_class), init=C.glorot_normal(seed=4), name='w2s')
    b2s = C.parameter(shape=(n_classes, n_outputs_per_class), init=C.glorot_normal(seed=5), name='b2s')

    # neural network structure for hierarchical softmax
    h_input = C.sequence.input_variable(input_dim)
    h_target_class = C.sequence.input_variable([1])
    h_target_output_in_class = C.sequence.input_variable([1])
    h_z, class_probs, all_probs = C.hierarchical_softmax_layer_for_sequence(
        h_input, num_output_classes, h_target_class, h_target_output_in_class,
        minibatch_size, w1, b1, w2s, b2s)

    a = np.reshape(np.arange(seq_size * minibatch_size * input_dim, dtype=np.float32),
                   (seq_size, minibatch_size, input_dim))
    labels = np.reshape(np.arange(seq_size * minibatch_size, dtype=np.float32),
                        (seq_size, minibatch_size, 1)) % num_output_classes
    target_labels = labels // n_outputs_per_class
    target_output_in_labels = labels % n_outputs_per_class

    val_z = h_z.eval({h_input: a, h_target_class: target_labels,
                      h_target_output_in_class: target_output_in_labels})
    val_class_probs = class_probs.eval({h_input: a, h_target_class: target_labels,
                                        h_target_output_in_class: target_output_in_labels})
    val_all_probs = [x.eval({h_input: a, h_target_class: target_labels,
                             h_target_output_in_class: target_output_in_labels})
                     for x in all_probs]

    expected_z = [[[0.16448107], [0.00597861], [0.99322051]],
                  [[8.59128195e-04], [3.77086673e-09], [3.42400197e-12]]]
    expected_class_probs = [[[5.81252098e-01, 4.18747932e-01],
                             [1.03938626e-02, 9.89606142e-01],
                             [7.94661901e-05, 9.99920487e-01]],
                            [[6.01340048e-07, 9.99999404e-01],
                             [4.55011762e-09, 1.00000000e+00],
                             [3.44291574e-11, 1.00000000e+00]]]
    expected_all_probs = [[[[1.64481074e-01, 4.16771024e-01],
                            [4.41524992e-03, 5.97861316e-03],
                            [4.61043091e-05, 3.33618809e-05]],
                           [[4.33648694e-07, 1.67691354e-07],
                            [3.77086673e-09, 7.79251219e-10],
                            [3.10051568e-11, 3.42400197e-12]]],
                          [[[0.29590073, 0.12284722],
                            [0.93986785, 0.04973821],
                            [0.99322051, 0.00669997]],
                           [[9.99140263e-01, 8.59128195e-04],
                            [9.99890447e-01, 1.09594235e-04],
                            [9.99986053e-01, 1.39711719e-05]]]]

    assert np.allclose(expected_z, val_z)
    assert np.allclose(expected_class_probs, val_class_probs)
    assert np.allclose(expected_all_probs, val_all_probs)

def hierarchical_softmax_layer(input_var, label_index, label_dim, label_classes=None):
    '''
    A two-layer hierarchical softmax function.

    Args:
        input_var: Variable with shape: [#,*](dim_x)
        label_index: index of the label's category: [#,*](1)
        label_dim: number of label categories
        label_classes: number of classes of the label categories

    Returns:
        output_prob: the probability of the given label [#,*](1)
        class_probs: the probability of all the label classes [#,*](label_classes)
        all_probs: the probabilities of all labels
    '''
    input_dim = input_var.shape[0]

    if not label_classes:
        label_classes = int(np.ceil(np.sqrt(float(label_dim))))

    n_outputs_per_class = int(np.ceil(label_dim / label_classes))

    target_class = C.floor((label_index + 0.5) / n_outputs_per_class)
    target_output_in_class = C.round(label_index - target_class * n_outputs_per_class)

    w1 = C.parameter(shape=(input_dim, label_classes), init=C.glorot_normal(), name='hsoftmax_w1')
    b1 = C.parameter(shape=(label_classes), init=C.glorot_normal(), name='hsoftmax_b1')
    w2s = C.parameter(shape=(label_classes, input_dim, n_outputs_per_class), init=C.glorot_normal(), name='hsoftmax_w2s')
    b2s = C.parameter(shape=(label_classes, n_outputs_per_class), init=C.glorot_normal(), name='hsoftmax_b2s')

    # first layer: probability of each class
    class_probs = C.softmax(b1 + C.times(input_var, w1))

    # TODO: fix the bug in backprop for sparse, and use sparse embedding to accelerate
    target_class_one_hot = C.one_hot(target_class, num_classes=label_classes, sparse_output=False)
    w2 = C.reshape(C.times(target_class_one_hot, w2s, output_rank=2), [input_dim, -1])
    b2 = C.reshape(C.times(target_class_one_hot, b2s, output_rank=1), [-1])

    # second layer: probability of each output within the target class
    probs_in_class = C.softmax(b2 + C.times(input_var, w2))
    prob_in_class = C.times_transpose(
        C.one_hot(target_output_in_class, num_classes=n_outputs_per_class, sparse_output=False),
        probs_in_class)
    class_prob = C.times_transpose(
        C.one_hot(target_class, num_classes=label_classes, sparse_output=False),
        class_probs)
    output_prob = prob_in_class * class_prob

    # this is for calculating all the outputs' probabilities
    all_probs = []
    for i in range(label_classes):
        ci = C.constant(i)
        ci_one_hot = C.one_hot(ci, num_classes=label_classes, sparse_output=False)
        w2a = C.times(ci_one_hot, w2s, output_rank=2)
        b2a = C.times(ci_one_hot, b2s, output_rank=1)
        probs_in_classa = C.softmax(b2a + C.times(input_var, w2a))
        class_proba = C.times_transpose(ci_one_hot, class_probs)
        output_proba = probs_in_classa * class_proba
        all_probs.append(output_proba)

    return output_prob, class_probs, all_probs

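# Usage sketch for hierarchical_softmax_layer: a minimal, assumed setup showing how the
# returned probability of the target label can be turned into a negative log-likelihood
# training criterion. The feature dimension, label dimension, and variable names below are
# hypothetical and not part of the library code above.
def _hsoftmax_criterion_example(feature_dim=100, label_dim=10000):
    features = C.input_variable(feature_dim)
    label_index = C.input_variable(1)
    output_prob, class_probs, all_probs = hierarchical_softmax_layer(features, label_index, label_dim)
    # minimise the negative log-probability of the observed label
    return -C.log(output_prob)
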
def BiRecurrence(step_function: C.Function, initial_state=0, dropout_rate_input=None,
                 dropout_rate_output=None, weight_tie: bool = False,
                 seed=SentinelValueForAutoSelectRandomSeed, name=''):
    """
    Wrapper to create a bidirectional RNN.

    Also comes with the option to halve the number of parameters required by a bidirectional
    recurrent layer. This is done by using only one recurrent unit for both the forward and the
    backward computation instead of the usual two. A forward and a backward token are used to
    initialise the hidden state so that the recurrent unit can tell the directionality.

    More details can be found in the paper 'Efficient Bidirectional Neural Machine Translation'
    (https://arxiv.org/abs/1908.09329)

    Example:
        a = C.sequence.input_variable(10)
        b = BiRecurrence(LSTM(100), weight_tie=True)(a)

        assert b.shape == (200, )

    Arguments:
        step_function (:class:`~cntk.ops.functions.Function` or equivalent Python function):
         This function must have N+1 inputs and N outputs, where N is the number of state variables
         (typically 1 for GRU and plain RNNs, and 2 for LSTMs).
        initial_state: initial state of the forward and backward recurrences
        dropout_rate_input: variational dropout on input
        dropout_rate_output: variational dropout on output
        weight_tie (bool): whether to use only one recurrent function for computation in both directions.
        seed (int): seed for randomisation
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function that accepts one argument (which must be a sequence) and performs the recurrent operation on it
    """
    fxn1 = step_function
    fxn2 = step_function.clone(C.CloneMethod.clone, {}) if not weight_tie else fxn1

    forward_token = initial_state
    backward_token = initial_state
    if weight_tie:
        forward_token = C.Parameter(shape=(-1,), init=C.glorot_normal(), name='f_token')
        backward_token = C.Parameter(shape=(-1,), init=C.glorot_normal(), name='b_token')

    forward = Recurrence(fxn1, dropout_rate_input=dropout_rate_input, dropout_rate_output=dropout_rate_output,
                         initial_state=forward_token, seed=seed)
    backward = Recurrence(fxn2, dropout_rate_input=dropout_rate_input, dropout_rate_output=dropout_rate_output,
                          initial_state=backward_token, seed=seed, go_backwards=True)

    @C.Function
    def inner(x):
        output = C.splice(forward(x), backward(x), axis=-1)
        return C.layers.Label(name)(output) if name else output

    return inner

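# Weight-tying sketch for BiRecurrence: an assumed comparison of the tied and untied variants.
# Both produce the same output shape; with weight_tie=True the forward and backward passes
# share a single LSTM step function (only the two directional initial-state tokens are added),
# which is how the parameter count is roughly halved. Dimensions below are illustrative.
def _birecurrence_weight_tie_example(in_dim=10, hidden_dim=100):
    x = C.sequence.input_variable(in_dim)
    tied = BiRecurrence(C.layers.LSTM(hidden_dim), weight_tie=True)(x)
    untied = BiRecurrence(C.layers.LSTM(hidden_dim), weight_tie=False)(x)
    # forward and backward outputs are spliced, so both variants emit 2 * hidden_dim features
    assert tied.shape == untied.shape == (2 * hidden_dim,)
    return tied, untied
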
def test_h_softmax():
    input_dim = 2
    num_output_classes = 4
    minibatch_size = 3
    n_classes = int(ceil(sqrt(num_output_classes)))
    n_outputs_per_class = n_classes

    w1 = C.parameter(shape=(input_dim, n_classes), init=C.glorot_normal(seed=1), name='w1')
    b1 = C.parameter(shape=(n_classes), init=C.glorot_normal(seed=2), name='b1')
    w2s = C.parameter(shape=(n_classes, input_dim, n_outputs_per_class), init=C.glorot_normal(seed=3), name='w2s')
    b2s = C.parameter(shape=(n_classes, n_outputs_per_class), init=C.glorot_normal(seed=4), name='b2s')

    # neural network structure for hierarchical softmax
    h_input = C.input_variable(input_dim)
    h_target_class = C.input_variable([1])
    h_target_output_in_class = C.input_variable([1])
    h_z, class_probs, all_probs = C.hierarchical_softmax_layer(
        h_input, num_output_classes, h_target_class, h_target_output_in_class,
        minibatch_size, w1, b1, w2s, b2s)

    a = np.reshape(np.arange(minibatch_size * input_dim, dtype=np.float32),
                   (minibatch_size, input_dim))
    labels = np.reshape(np.arange(minibatch_size, dtype=np.float32),
                        (minibatch_size, 1)) % num_output_classes
    target_labels = labels // n_outputs_per_class
    target_output_in_labels = labels % n_outputs_per_class

    val_z = h_z.eval({h_input: a, h_target_class: target_labels,
                      h_target_output_in_class: target_output_in_labels})
    val_class_probs = class_probs.eval({h_input: a, h_target_class: target_labels,
                                        h_target_output_in_class: target_output_in_labels})
    val_all_probs = [x.eval({h_input: a, h_target_class: target_labels,
                             h_target_output_in_class: target_output_in_labels})
                     for x in all_probs]

    expected_z = [[0.0313047], [0.00323934], [0.99006385]]
    expected_class_probs = [[0.04346574, 0.95653421],
                            [0.0204236, 0.97957635],
                            [0.0094756, 0.99052447]]
    expected_all_probs = [[[0.0313047, 0.01216104],
                           [0.01718426, 0.00323934],
                           [0.00868148, 0.00079412]],
                          [[5.82283854e-01, 3.74250382e-01],
                           [9.62925494e-01, 1.66507624e-02],
                           [9.90063846e-01, 4.60594223e-04]]]

    assert np.allclose(expected_z, val_z)
    assert np.allclose(expected_class_probs, val_class_probs)
    assert np.allclose(expected_all_probs, val_all_probs)

def test_sequential_convolution():
    # --------------------------------------------------------------------------
    # Normal use case - image, check kernel initialisation shape
    # --------------------------------------------------------------------------
    image_shape = (3, 32)  # rgb image of variable width
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, reduction_rank=1, pad=True)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, reduction_rank=1, pad=True)(a)

    assert b.shape == c.shape
    assert b.W.shape == c.W.shape
    assert b.b.shape == c.b.shape

    image_shape = (32, )  # black and white image of variable width
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(2, 2), num_filters=16, reduction_rank=0, pad=True)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(2, 2), num_filters=16, reduction_rank=0, pad=True)(a)

    assert b.shape == c.shape
    assert b.W.shape == c.W.shape
    assert b.b.shape == c.b.shape

    image_shape = (32, )  # text vector of variable sequence length
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(2, 2), num_filters=16, reduction_rank=1, pad=True)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(2, 2), num_filters=16, reduction_rank=1, pad=True)(a)

    assert b.shape == c.shape
    assert b.W.shape == c.W.shape
    assert b.b.shape == c.b.shape

    # --------------------------------------------------------------------------
    # Normal use case - image
    # --------------------------------------------------------------------------
    kernel_init = C.glorot_normal(seed=5)
    image_shape = (3, 32)
    width = 40
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, reduction_rank=1, pad=True, init=kernel_init)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, reduction_rank=1, pad=True, init=kernel_init)(a)

    assert b.shape == c.shape

    n = np.random.random((1, width) + image_shape).astype(np.float32)

    desired = b.eval({a: n})
    actual = c.eval({a: n})
    if isinstance(desired, list) and len(desired) == 1:
        desired = desired[0]
    if isinstance(actual, list) and len(actual) == 1:
        actual = actual[0]

    np.testing.assert_equal(actual, desired)

    # --------------------------------------------------------------------------
    # Normal use case - image, mixed padding across axes
    # --------------------------------------------------------------------------
    kernel_init = C.glorot_normal(seed=5)
    image_shape = (3, 32)
    width = 40
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, reduction_rank=1, pad=(False, True), init=kernel_init)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, reduction_rank=1, pad=(False, True), init=kernel_init)(a)

    assert b.shape == c.shape

    n = np.random.random((1, width) + image_shape).astype(np.float32)

    desired = b.eval({a: n})
    actual = c.eval({a: n})
    if isinstance(desired, list) and len(desired) == 1:
        desired = desired[0]
    if isinstance(actual, list) and len(actual) == 1:
        actual = actual[0]

    np.testing.assert_equal(actual, desired)

    # --------------------------------------------------------------------------
    # Normal use case - vector data for qrnn implementation
    # --------------------------------------------------------------------------
    kernel_init = C.glorot_normal(seed=5)
    image_shape = (25, )
    width = 40
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(2, ), num_filters=16, reduction_rank=1, pad=True, init=kernel_init)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(2, ), num_filters=16, reduction_rank=1, pad=True, init=kernel_init)(a)

    assert b.shape == c.shape

    n = np.random.random((1, width) + image_shape).astype(np.float32)

    desired = b.eval({a: n})
    actual = c.eval({a: n})
    if isinstance(desired, list) and len(desired) == 1:
        desired = desired[0]
    if isinstance(actual, list) and len(actual) == 1:
        actual = actual[0]

    np.testing.assert_equal(actual, desired)

    # --------------------------------------------------------------------------
    # Normal use case - B&W image
    # --------------------------------------------------------------------------
    kernel_init = C.glorot_normal(seed=5)
    image_shape = (32, )
    width = 40
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, reduction_rank=0, pad=True, init=kernel_init)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, reduction_rank=0, pad=True, init=kernel_init)(a)

    assert b.shape == c.shape

    n = np.random.random((1, width) + image_shape).astype(np.float32)

    desired = b.eval({a: n})
    actual = c.eval({a: n})
    if isinstance(desired, list) and len(desired) == 1:
        desired = desired[0]
    if isinstance(actual, list) and len(actual) == 1:
        actual = actual[0]

    np.testing.assert_equal(actual, desired)

    image_shape = (32, )
    width = 40
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, reduction_rank=0, pad=True)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(3, 3), num_filters=16, reduction_rank=0, pad=True, init=b.W.value)(a)

    assert b.shape == c.shape

    n = np.random.random((1, width) + image_shape).astype(np.float32)

    desired = b.eval({a: n})
    actual = c.eval({a: n})
    if isinstance(desired, list) and len(desired) == 1:
        desired = desired[0]
    if isinstance(actual, list) and len(actual) == 1:
        actual = actual[0]

    np.testing.assert_equal(actual, desired)

def linear_layer(input_var, num_classes):
    num_features = input_var.shape[0]

    weight_param = C.parameter(shape=(num_features, num_classes), init=C.glorot_normal())
    bias_param = C.parameter(shape=(num_classes), init=C.glorot_normal())

    return C.times(input_var, weight_param) + bias_param

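# Usage sketch for linear_layer: an assumed softmax classifier built on top of the helper.
# The feature and class dimensions and the variable names below are illustrative only.
def _linear_classifier_example(num_features=784, num_classes=10):
    features = C.input_variable(num_features)
    labels = C.input_variable(num_classes)
    z = linear_layer(features, num_classes)
    # standard softmax + cross-entropy training criterion over the linear scores
    loss = C.cross_entropy_with_softmax(z, labels)
    return z, loss
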