Example #1
def test_h_softmax_for_sequence():
  input_dim = 2
  num_output_classes = 4
  minibatch_size = 3
  seq_size = 2
  n_classes = int(ceil(sqrt(num_output_classes)))
  n_outputs_per_class = n_classes

  w1 = C.parameter(shape=(input_dim, n_classes), init=C.glorot_normal(seed=2), name='w1')
  b1 = C.parameter(shape=(n_classes), init=C.glorot_normal(seed=3), name='b1')
  w2s = C.parameter(shape=(n_classes, input_dim, n_outputs_per_class), init=C.glorot_normal(seed=4), name='w2s')
  b2s = C.parameter(shape=(n_classes, n_outputs_per_class), init=C.glorot_normal(seed=5), name='b2s')

  # neural network structure for hierarchical softmax
  h_input = C.sequence.input_variable(input_dim)
  h_target_class = C.sequence.input_variable([1])
  h_target_output_in_class = C.sequence.input_variable([1])
  h_z, class_probs, all_probs = C.hierarchical_softmax_layer_for_sequence(h_input, num_output_classes, h_target_class, h_target_output_in_class, minibatch_size, w1, b1, w2s, b2s)

  a = np.reshape(np.arange(seq_size * minibatch_size * input_dim, dtype = np.float32), (seq_size, minibatch_size, input_dim))
  labels = np.reshape(np.arange(seq_size * minibatch_size, dtype = np.float32), (seq_size, minibatch_size, 1)) % num_output_classes
  target_labels = labels // n_outputs_per_class
  target_output_in_labels = labels % n_outputs_per_class
  val_z = h_z.eval({h_input: a, h_target_class: target_labels, h_target_output_in_class: target_output_in_labels})
  val_class_probs = class_probs.eval({h_input: a, h_target_class: target_labels, h_target_output_in_class: target_output_in_labels})
  val_all_probs = [x.eval({h_input: a, h_target_class: target_labels, h_target_output_in_class: target_output_in_labels}) for x in all_probs]

  expected_z = [[[0.16448107], [0.00597861], [0.99322051]],
                [[8.59128195e-04], [3.77086673e-09], [3.42400197e-12]]]
  expected_class_probs = [[[5.81252098e-01, 4.18747932e-01],
                           [1.03938626e-02, 9.89606142e-01],
                           [7.94661901e-05, 9.99920487e-01]],
                          [[6.01340048e-07, 9.99999404e-01],
                           [4.55011762e-09, 1.00000000e+00],
                           [3.44291574e-11, 1.00000000e+00]]]
  expected_all_probs = [[[[1.64481074e-01, 4.16771024e-01],
                          [4.41524992e-03, 5.97861316e-03],
                          [4.61043091e-05, 3.33618809e-05]],
                         [[4.33648694e-07, 1.67691354e-07],
                          [3.77086673e-09, 7.79251219e-10],
                          [3.10051568e-11, 3.42400197e-12]]],
                        [[[0.29590073, 0.12284722],
                          [0.93986785, 0.04973821],
                          [0.99322051, 0.00669997]],
                         [[9.99140263e-01, 8.59128195e-04],
                          [9.99890447e-01, 1.09594235e-04],
                          [9.99986053e-01, 1.39711719e-05]]]]

  assert np.allclose(expected_z, val_z)
  assert np.allclose(expected_class_probs, val_class_probs)
  assert np.allclose(expected_all_probs, val_all_probs)
Example #2
def hierarchical_softmax_layer(input_var, label_index, label_dim, label_classes=None):
    '''
    A two-layer hierarchical softmax function:

    Args:
        input_var: Variable with shape: [#,*](dim_x)
        label_index: index of the label's category: [#,*](1)
        label_dim: number of label categories
        label_classes: number of classes the labels are grouped into
    Returns:
        output_prob: the probability of the given label [#,*](1)
        class_probs: the probabilities of all label classes [#,*](label_classes)
        all_probs: a list (one entry per class) with the probabilities of the outputs in that class
    '''
    input_dim = input_var.shape[0]

    if not label_classes:
        label_classes = int(np.ceil(np.sqrt(float(label_dim))))

    n_outputs_per_class = int(np.ceil(label_dim / label_classes))

    target_class = C.floor((label_index + 0.5) / n_outputs_per_class)
    target_output_in_class = C.round(label_index - target_class * n_outputs_per_class)

    w1 = parameter(shape=(input_dim, label_classes), init=C.glorot_normal(), name='hsoftmax_w1')
    b1 = parameter(shape=(label_classes), init=C.glorot_normal(), name='hsoftmax_b1')
    w2s = parameter(shape=(label_classes, input_dim, n_outputs_per_class,), init=C.glorot_normal(), name='hsoftmax_w2s')
    b2s = parameter(shape=(label_classes, n_outputs_per_class,), init=C.glorot_normal(), name='hsoftmax_b2s')

    class_probs = softmax(b1 + times(input_var, w1))

    # TODO: fix the bug in backprop for sparse, and use sparse embedding to accelerate
    target_class_one_hot = C.one_hot(target_class, num_classes=label_classes, sparse_output=False)
    w2 = C.reshape(C.times(target_class_one_hot, w2s, output_rank=2), [input_dim, -1])
    b2 = C.reshape(times(target_class_one_hot, b2s, output_rank=1), [-1])
    probs_in_class = softmax(b2 + times(input_var, w2))

    prob_in_class = C.times_transpose(C.one_hot(target_output_in_class, num_classes=n_outputs_per_class, sparse_output=False), probs_in_class)
    class_prob = C.times_transpose(C.one_hot(target_class, num_classes=label_classes, sparse_output=False), class_probs)
    output_prob = prob_in_class * class_prob

    # this is for calculating all the outputs' probabilities
    all_probs = []
    for i in range(label_classes):
        ci = C.constant(i)
        ci_one_hot = C.one_hot(ci, num_classes=label_classes, sparse_output=False)
        w2a = C.times(ci_one_hot, w2s, output_rank=2)
        b2a = C.times(ci_one_hot, b2s, output_rank=1)
        probs_in_classa = C.softmax(b2a + times(input_var, w2a))
        class_proba = C.times_transpose(ci_one_hot, class_probs)
        output_proba = probs_in_classa * class_proba
        all_probs.append(output_proba)

    return output_prob, class_probs, all_probs
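
The function above factorises p(label | x) as p(class | x) * p(label | class, x): the first softmax over w1/b1 picks one of label_classes groups, and the second softmax over the selected slice of w2s/b2s picks the output within that group. The following minimal usage sketch is illustrative only; the feature dimension, vocabulary size, and data are made up, and it assumes the same imports the snippet relies on (numpy as np, cntk as C, plus the bare parameter/times/softmax from cntk).

import numpy as np
import cntk as C
from cntk import parameter, times, softmax  # bare names used inside the layer

features = C.input_variable(50)   # [#,*](dim_x), hypothetical feature size
label_idx = C.input_variable(1)   # index of the observed label, [#,*](1)
vocab_size = 10000                # hypothetical label_dim

out_prob, class_probs, all_probs = hierarchical_softmax_layer(
    features, label_idx, vocab_size)

# Criterion: minimise the negative log-probability of the observed label.
loss = -C.log(out_prob)

x = np.random.rand(4, 50).astype(np.float32)                          # 4 samples
y = np.random.randint(0, vocab_size, size=(4, 1)).astype(np.float32)  # label indices
print(out_prob.eval({features: x, label_idx: y}))  # one probability per sample
print(loss.eval({features: x, label_idx: y}))      # per-sample negative log-prob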
Example #3
def BiRecurrence(step_function: C.Function, initial_state=0, dropout_rate_input=None, dropout_rate_output=None,
                 weight_tie: bool = False, seed=SentinelValueForAutoSelectRandomSeed, name=''):
    """ Wrapper to create a bidirectional rnn

    Also comes with the option to to half the number of parameters required by  bidirectional recurrent layer.
    This is done by only using one recurrent unit to do both forward and backward computation instead of
    the usual two. A forward and backward token is used to initialise the hidden state so that the recurrent
    unit can tell the directionality.

    More details can be found in the paper 'Efficient Bidirectional Neural Machine Translation' (https://arxiv.org/abs/1908.09329)

    Example:
        a = C.sequence.input_variable(10)
        b = BiRecurrence(LSTM(100), weight_tie=True)(a)

        assert b.shape == (200, )

    Arguments:
        step_function (:class:`~cntk.ops.functions.Function` or equivalent Python function):
            This function must have N+1 inputs and N outputs, where N is the number of state variables
            (typically 1 for GRU and plain RNNs, and 2 for LSTMs).
        initial_state: the initial state of the recurrence (defaults to 0)
        dropout_rate_input: variational dropout on the input
        dropout_rate_output: variational dropout on the output
        weight_tie (bool): whether to use a single recurrent function for the computation in both directions.
        seed (int): seed for randomisation
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function that accepts one argument (which must be a sequence) and performs the recurrent operation on it
    """
    fxn1 = step_function
    fxn2 = step_function.clone(C.CloneMethod.clone, {}) if not weight_tie else fxn1

    forward_token = initial_state
    backward_token = initial_state
    if weight_tie:
        forward_token = C.Parameter(shape=(-1,), init=C.glorot_normal(), name='f_token')
        backward_token = C.Parameter(shape=(-1,), init=C.glorot_normal(), name='b_token')

    forward = Recurrence(fxn1, dropout_rate_input=dropout_rate_input, dropout_rate_output=dropout_rate_output, initial_state=forward_token, seed=seed)
    backward = Recurrence(fxn2, dropout_rate_input=dropout_rate_input, dropout_rate_output=dropout_rate_output, initial_state=backward_token, seed=seed, go_backwards=True)

    @C.Function
    def inner(x):
        output = C.splice(forward(x), backward(x), axis=-1)
        return C.layers.Label(name)(output) if name else output

    return inner
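
Below is a short sketch of how the wrapper might be used in a per-step sequence tagger. The layer sizes and data are illustrative only; it assumes numpy as np, cntk as C, and that BiRecurrence is the function defined above (with Recurrence available from the surrounding module).

import numpy as np
import cntk as C

x = C.sequence.input_variable(10)
h = BiRecurrence(C.layers.LSTM(64), weight_tie=True)(x)   # forward + backward -> 128 dims
tags = C.layers.Dense(5, activation=C.softmax)(h)         # per-step tag distribution

seq = np.random.rand(7, 10).astype(np.float32)            # one sequence of length 7
out = tags.eval({x: [seq]})                                # per-step tag probabilities
print(out[0].shape)                                        # (7, 5)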
Example #4
def test_h_softmax():
  input_dim = 2
  num_output_classes = 4
  minibatch_size = 3
  n_classes = int(ceil(sqrt(num_output_classes)))
  n_outputs_per_class = n_classes

  w1 = C.parameter(shape=(input_dim, n_classes), init=C.glorot_normal(seed=1), name='w1')
  b1 = C.parameter(shape=(n_classes), init=C.glorot_normal(seed=2), name='b1')
  w2s = C.parameter(shape=(n_classes, input_dim, n_outputs_per_class), init=C.glorot_normal(seed=3), name='w2s')
  b2s = C.parameter(shape=(n_classes, n_outputs_per_class), init=C.glorot_normal(seed=4), name='b2s')

  # neural network structure for hierarchical softmax
  h_input = C.input_variable(input_dim)
  h_target_class = C.input_variable([1])
  h_target_output_in_class = C.input_variable([1])
  h_z, class_probs, all_probs = C.hierarchical_softmax_layer(h_input, num_output_classes, h_target_class, h_target_output_in_class, minibatch_size, w1, b1, w2s, b2s)

  a = np.reshape(np.arange(minibatch_size * input_dim, dtype = np.float32), (minibatch_size, input_dim))
  labels = np.reshape(np.arange(minibatch_size, dtype = np.float32), (minibatch_size, 1)) % num_output_classes
  target_labels = labels // n_outputs_per_class
  target_output_in_labels = labels % n_outputs_per_class
  val_z = h_z.eval({h_input: a, h_target_class: target_labels, h_target_output_in_class: target_output_in_labels})
  val_class_probs = class_probs.eval({h_input: a, h_target_class: target_labels, h_target_output_in_class: target_output_in_labels})
  val_all_probs = [x.eval({h_input: a, h_target_class: target_labels, h_target_output_in_class: target_output_in_labels}) for x in all_probs]

  expected_z = [[0.0313047], [0.00323934], [0.99006385]]
  expected_class_probs = [[0.04346574, 0.95653421],
                          [0.0204236, 0.97957635],
                          [0.0094756, 0.99052447]]
  expected_all_probs = [[[0.0313047, 0.01216104],
                         [0.01718426, 0.00323934],
                         [0.00868148, 0.00079412]],
                        [[5.82283854e-01, 3.74250382e-01],
                         [9.62925494e-01, 1.66507624e-02],
                         [9.90063846e-01, 4.60594223e-04]]]

  assert np.allclose(expected_z, val_z)
  assert np.allclose(expected_class_probs, val_class_probs)
  assert np.allclose(expected_all_probs, val_all_probs)
Example #5
def test_sequential_convolution():
    # --------------------------------------------------------------------------
    # Normal use case - image check kernel initialisation shape
    # --------------------------------------------------------------------------
    image_shape = (3, 32)  # rgb image of variable width
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(3, 3),
                                       num_filters=16,
                                       reduction_rank=1,
                                       pad=True)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(3, 3),
                                        num_filters=16,
                                        reduction_rank=1,
                                        pad=True)(a)

    assert b.shape == c.shape
    assert b.W.shape == c.W.shape
    assert b.b.shape == c.b.shape

    image_shape = (32, )  # black and white image of variable width
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(2, 2),
                                       num_filters=16,
                                       reduction_rank=0,
                                       pad=True)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(2, 2),
                                        num_filters=16,
                                        reduction_rank=0,
                                        pad=True)(a)

    assert b.shape == c.shape
    assert b.W.shape == c.W.shape
    assert b.b.shape == c.b.shape

    image_shape = (32, )  # text vector of variable sequence length
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(2, 2),
                                       num_filters=16,
                                       reduction_rank=1,
                                       pad=True)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(2, 2),
                                        num_filters=16,
                                        reduction_rank=1,
                                        pad=True)(a)

    assert b.shape == c.shape
    assert b.W.shape == c.W.shape
    assert b.b.shape == c.b.shape

    # --------------------------------------------------------------------------
    # Normal use case - image
    # --------------------------------------------------------------------------
    kernel_init = C.glorot_normal(seed=5)
    image_shape = (3, 32)
    width = 40
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(3, 3),
                                       num_filters=16,
                                       reduction_rank=1,
                                       pad=True,
                                       init=kernel_init)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(3, 3),
                                        num_filters=16,
                                        reduction_rank=1,
                                        pad=True,
                                        init=kernel_init)(a)

    assert b.shape == c.shape

    n = np.random.random((1, width) + image_shape).astype(np.float32)

    desired = b.eval({a: n})
    actual = c.eval({a: n})

    if isinstance(desired, list) and len(desired) == 1:
        desired = desired[0]

    if isinstance(actual, list) and len(actual) == 1:
        actual = actual[0]

    np.testing.assert_equal(actual, desired)

    # --------------------------------------------------------------------------
    # Normal use case - image, mix padding across axis
    # --------------------------------------------------------------------------
    kernel_init = C.glorot_normal(seed=5)
    image_shape = (3, 32)
    width = 40
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(3, 3),
                                       num_filters=16,
                                       reduction_rank=1,
                                       pad=(False, True),
                                       init=kernel_init)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(3, 3),
                                        num_filters=16,
                                        reduction_rank=1,
                                        pad=(False, True),
                                        init=kernel_init)(a)

    assert b.shape == c.shape

    n = np.random.random((1, width) + image_shape).astype(np.float32)

    desired = b.eval({a: n})
    actual = c.eval({a: n})

    if isinstance(desired, list) and len(desired) == 1:
        desired = desired[0]

    if isinstance(actual, list) and len(actual) == 1:
        actual = actual[0]

    np.testing.assert_equal(actual, desired)

    # --------------------------------------------------------------------------
    # Normal use case - vector data for qrnn implementation
    # --------------------------------------------------------------------------
    kernel_init = C.glorot_normal(seed=5)
    image_shape = (25, )
    width = 40
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(2, ),
                                       num_filters=16,
                                       reduction_rank=1,
                                       pad=True,
                                       init=kernel_init)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(2, ),
                                        num_filters=16,
                                        reduction_rank=1,
                                        pad=True,
                                        init=kernel_init)(a)

    assert b.shape == c.shape

    n = np.random.random((1, width) + image_shape).astype(np.float32)

    desired = b.eval({a: n})
    actual = c.eval({a: n})

    if isinstance(desired, list) and len(desired) == 1:
        desired = desired[0]

    if isinstance(actual, list) and len(actual) == 1:
        actual = actual[0]

    np.testing.assert_equal(actual, desired)

    # --------------------------------------------------------------------------
    # Normal use case - B&W image
    # --------------------------------------------------------------------------
    kernel_init = C.glorot_normal(seed=5)
    image_shape = (32, )
    width = 40
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(3, 3),
                                       num_filters=16,
                                       reduction_rank=0,
                                       pad=True,
                                       init=kernel_init)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(3, 3),
                                        num_filters=16,
                                        reduction_rank=0,
                                        pad=True,
                                        init=kernel_init)(a)

    assert b.shape == c.shape

    n = np.random.random((1, width) + image_shape).astype(np.float32)

    desired = b.eval({a: n})
    actual = c.eval({a: n})

    if isinstance(desired, list) and len(desired) == 1:
        desired = desired[0]

    if isinstance(actual, list) and len(actual) == 1:
        actual = actual[0]

    np.testing.assert_equal(actual, desired)

    image_shape = (32, )
    width = 40
    a = C.sequence.input_variable(image_shape)
    b = C.layers.SequentialConvolution(filter_shape=(3, 3),
                                       num_filters=16,
                                       reduction_rank=0,
                                       pad=True)(a)
    c = Cx.layers.SequentialConvolution(filter_shape=(3, 3),
                                        num_filters=16,
                                        reduction_rank=0,
                                        pad=True,
                                        init=b.W.value)(a)

    assert b.shape == c.shape

    n = np.random.random((1, width) + image_shape).astype(np.float32)

    desired = b.eval({a: n})
    actual = c.eval({a: n})

    if isinstance(desired, list) and len(desired) == 1:
        desired = desired[0]

    if isinstance(actual, list) and len(actual) == 1:
        actual = actual[0]

    np.testing.assert_equal(actual, desired)
Example #6
def test_h_softmax_for_sequence():
    input_dim = 2
    num_output_classes = 4
    minibatch_size = 3
    seq_size = 2
    n_classes = int(ceil(sqrt(num_output_classes)))
    n_outputs_per_class = n_classes

    w1 = C.parameter(shape=(input_dim, n_classes),
                     init=C.glorot_normal(seed=2),
                     name='w1')
    b1 = C.parameter(shape=(n_classes),
                     init=C.glorot_normal(seed=3),
                     name='b1')
    w2s = C.parameter(shape=(n_classes, input_dim, n_outputs_per_class),
                      init=C.glorot_normal(seed=4),
                      name='w2s')
    b2s = C.parameter(shape=(n_classes, n_outputs_per_class),
                      init=C.glorot_normal(seed=5),
                      name='b2s')

    # neural network structure for hierarchical softmax
    h_input = C.sequence.input_variable(input_dim)
    h_target_class = C.sequence.input_variable([1])
    h_target_output_in_class = C.sequence.input_variable([1])
    h_z, class_probs, all_probs = C.hierarchical_softmax_layer_for_sequence(
        h_input, num_output_classes, h_target_class, h_target_output_in_class,
        minibatch_size, w1, b1, w2s, b2s)

    a = np.reshape(
        np.arange(seq_size * minibatch_size * input_dim, dtype=np.float32),
        (seq_size, minibatch_size, input_dim))
    labels = np.reshape(np.arange(seq_size * minibatch_size, dtype=np.float32),
                        (seq_size, minibatch_size, 1)) % num_output_classes
    target_labels = labels // n_outputs_per_class
    target_output_in_labels = labels % n_outputs_per_class
    val_z = h_z.eval({
        h_input: a,
        h_target_class: target_labels,
        h_target_output_in_class: target_output_in_labels
    })
    val_class_probs = class_probs.eval({
        h_input: a,
        h_target_class: target_labels,
        h_target_output_in_class: target_output_in_labels
    })
    val_all_probs = [
        x.eval({
            h_input: a,
            h_target_class: target_labels,
            h_target_output_in_class: target_output_in_labels
        }) for x in all_probs
    ]

    expected_z = [[[0.16448107], [0.00597861], [0.99322051]],
                  [[8.59128195e-04], [3.77086673e-09], [3.42400197e-12]]]
    expected_class_probs = [[[5.81252098e-01, 4.18747932e-01],
                             [1.03938626e-02, 9.89606142e-01],
                             [7.94661901e-05, 9.99920487e-01]],
                            [[6.01340048e-07, 9.99999404e-01],
                             [4.55011762e-09, 1.00000000e+00],
                             [3.44291574e-11, 1.00000000e+00]]]
    expected_all_probs = [[[[1.64481074e-01, 4.16771024e-01],
                            [4.41524992e-03, 5.97861316e-03],
                            [4.61043091e-05, 3.33618809e-05]],
                           [[4.33648694e-07, 1.67691354e-07],
                            [3.77086673e-09, 7.79251219e-10],
                            [3.10051568e-11, 3.42400197e-12]]],
                          [[[0.29590073, 0.12284722], [0.93986785, 0.04973821],
                            [0.99322051, 0.00669997]],
                           [[9.99140263e-01, 8.59128195e-04],
                            [9.99890447e-01, 1.09594235e-04],
                            [9.99986053e-01, 1.39711719e-05]]]]

    assert np.allclose(expected_z, val_z)
    assert np.allclose(expected_class_probs, val_class_probs)
    assert np.allclose(expected_all_probs, val_all_probs)
Example #7
def test_h_softmax():
    input_dim = 2
    num_output_classes = 4
    minibatch_size = 3
    n_classes = int(ceil(sqrt(num_output_classes)))
    n_outputs_per_class = n_classes

    w1 = C.parameter(shape=(input_dim, n_classes),
                     init=C.glorot_normal(seed=1),
                     name='w1')
    b1 = C.parameter(shape=(n_classes),
                     init=C.glorot_normal(seed=2),
                     name='b1')
    w2s = C.parameter(shape=(n_classes, input_dim, n_outputs_per_class),
                      init=C.glorot_normal(seed=3),
                      name='w2s')
    b2s = C.parameter(shape=(n_classes, n_outputs_per_class),
                      init=C.glorot_normal(seed=4),
                      name='b2s')

    # neural network structure for hierarchical softmax
    h_input = C.input_variable(input_dim)
    h_target_class = C.input_variable([1])
    h_target_output_in_class = C.input_variable([1])
    h_z, class_probs, all_probs = C.hierarchical_softmax_layer(
        h_input, num_output_classes, h_target_class, h_target_output_in_class,
        minibatch_size, w1, b1, w2s, b2s)

    a = np.reshape(np.arange(minibatch_size * input_dim, dtype=np.float32),
                   (minibatch_size, input_dim))
    labels = np.reshape(np.arange(minibatch_size, dtype=np.float32),
                        (minibatch_size, 1)) % num_output_classes
    target_labels = labels // n_outputs_per_class
    target_output_in_labels = labels % n_outputs_per_class
    val_z = h_z.eval({
        h_input: a,
        h_target_class: target_labels,
        h_target_output_in_class: target_output_in_labels
    })
    val_class_probs = class_probs.eval({
        h_input: a,
        h_target_class: target_labels,
        h_target_output_in_class: target_output_in_labels
    })
    val_all_probs = [
        x.eval({
            h_input: a,
            h_target_class: target_labels,
            h_target_output_in_class: target_output_in_labels
        }) for x in all_probs
    ]

    expected_z = [[0.0313047], [0.00323934], [0.99006385]]
    expected_class_probs = [[0.04346574, 0.95653421], [0.0204236, 0.97957635],
                            [0.0094756, 0.99052447]]
    expected_all_probs = [[[0.0313047, 0.01216104], [0.01718426, 0.00323934],
                           [0.00868148, 0.00079412]],
                          [[5.82283854e-01, 3.74250382e-01],
                           [9.62925494e-01, 1.66507624e-02],
                           [9.90063846e-01, 4.60594223e-04]]]

    assert np.allclose(expected_z, val_z)
    assert np.allclose(expected_class_probs, val_class_probs)
    assert np.allclose(expected_all_probs, val_all_probs)
Example #8
def linear_layer(input_var, num_classes):
    num_features = input_var.shape[0]
    weight_param = C.parameter(shape=(num_features, num_classes),
                               init=C.glorot_normal())
    bias_param = C.parameter(shape=(num_classes), init=C.glorot_normal())
    return C.times(input_var, weight_param) + bias_param
def hierarchical_softmax_layer(input_var,
                               label_index,
                               label_dim,
                               label_classes=None):
    '''
    A two-layer hierarchical softmax function:

    Args:
        input_var: Variable with shape: [#,*](dim_x)
        label_index: index of the label's category: [#,*](1)
        label_dim: number of label categories
        label_classes: number of classes the labels are grouped into
    Returns:
        output_prob: the probability of the given label [#,*](1)
        class_probs: the probabilities of all label classes [#,*](label_classes)
        all_probs: a list (one entry per class) with the probabilities of the outputs in that class
    '''
    input_dim = input_var.shape[0]

    if not label_classes:
        label_classes = int(np.ceil(np.sqrt(float(label_dim))))

    n_outputs_per_class = int(np.ceil(label_dim / label_classes))

    target_class = C.floor((label_index + 0.5) / n_outputs_per_class)
    target_output_in_class = C.round(label_index -
                                     target_class * n_outputs_per_class)

    w1 = parameter(shape=(input_dim, label_classes),
                   init=C.glorot_normal(),
                   name='hsoftmax_w1')
    b1 = parameter(shape=(label_classes),
                   init=C.glorot_normal(),
                   name='hsoftmax_b1')
    w2s = parameter(shape=(label_classes, input_dim, n_outputs_per_class),
                    init=C.glorot_normal(),
                    name='hsoftmax_w2s')
    b2s = parameter(shape=(label_classes, n_outputs_per_class),
                    init=C.glorot_normal(),
                    name='hsoftmax_b2s')

    class_probs = softmax(b1 + times(input_var, w1))

    # TODO: fix the bug in backprop for sparse, and use sparse embedding to accelerate
    target_class_one_hot = C.one_hot(target_class,
                                     num_classes=label_classes,
                                     sparse_output=False)
    w2 = C.reshape(C.times(target_class_one_hot, w2s, output_rank=2),
                   [input_dim, -1])
    b2 = C.reshape(times(target_class_one_hot, b2s, output_rank=1), [-1])
    probs_in_class = softmax(b2 + times(input_var, w2))

    prob_in_class = C.times_transpose(
        C.one_hot(target_output_in_class,
                  num_classes=n_outputs_per_class,
                  sparse_output=False), probs_in_class)
    class_prob = C.times_transpose(
        C.one_hot(target_class, num_classes=label_classes,
                  sparse_output=False), class_probs)
    output_prob = prob_in_class * class_prob

    # this is for calculating all the outputs' probabilities
    all_probs = []
    for i in range(label_classes):
        ci = C.constant(i)
        ci_one_hot = C.one_hot(ci,
                               num_classes=label_classes,
                               sparse_output=False)
        w2a = C.times(ci_one_hot, w2s, output_rank=2)
        b2a = C.times(ci_one_hot, b2s, output_rank=1)
        probs_in_classa = C.softmax(b2a + times(input_var, w2a))
        class_proba = C.times_transpose(ci_one_hot, class_probs)
        output_proba = probs_in_classa * class_proba
        all_probs.append(output_proba)

    return output_prob, class_probs, all_probs
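
Because each of the two softmaxes normalises to one, the per-class output probabilities collected in all_probs should together sum to one over the whole label set, since p(y|x) = p(class|x) * p(y|class, x). The following sanity-check sketch uses made-up sizes and random inputs; it assumes the same imports as the snippet above (numpy as np, cntk as C, and parameter/times/softmax from cntk).

import numpy as np
import cntk as C
from cntk import parameter, times, softmax  # bare names used inside the layer

feat = C.input_variable(8)   # hypothetical feature dimension
idx = C.input_variable(1)    # label index input
prob, cls_probs, all_probs = hierarchical_softmax_layer(feat, idx, 20)

x = np.random.rand(2, 8).astype(np.float32)
y = np.array([[3], [17]], dtype=np.float32)
print(prob.eval({feat: x, idx: y}))   # probability of the observed labels

# The all_probs outputs depend only on the features, one block per class.
per_class = [p.eval({feat: x}) for p in all_probs]
total = np.concatenate(per_class, axis=-1).sum(axis=-1)
print(total)   # expected to be close to [1. 1.]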