def test_stop_gradient():
    x = C.sequence.input_variable(shape=(2,), sequence_axis=C.Axis("B"), needs_gradient=True)
    y = C.sequence.input_variable(shape=(2,), sequence_axis=C.Axis("B"), needs_gradient=True)
    z = C.element_times(x, y)
    w = z + C.stop_gradient(z)
    a = np.reshape(np.float32([0.25, 0.5, 0.1, 1]), (1, 2, 2))
    b = np.reshape(np.float32([-1.25, 1.5, 0.1, -1]), (1, 2, 2))
    bwd, fwd = w.forward({x: a, y: b}, [w.output], set([w.output]))
    value = list(fwd.values())[0]
    expected = np.multiply(a, b) * 2
    assert np.allclose(value, expected)
    grad = w.backward(bwd, {w.output: np.ones_like(value)}, set([x, y]))
    assert np.allclose(grad[x], b)
    assert np.allclose(grad[y], a)

    # test stop_gradient with a function as input whose arguments should have no gradients (zeros are returned)
    w = C.stop_gradient(z)
    bwd, fwd = w.forward({x: a, y: b}, [w.output], set([w.output]))
    value = list(fwd.values())[0]
    expected = np.multiply(a, b)
    assert np.allclose(value, expected)
    grad = w.backward(bwd, {w.output: np.ones_like(value)}, set([x, y]))
    # there should be no gradients backward to x and y
    assert np.allclose(grad[x], np.zeros_like(b))
    assert np.allclose(grad[y], np.zeros_like(a))

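A smaller self-contained sketch of the behaviour the test exercises (the variable names and values below are illustrative, not taken from the test): `stop_gradient` passes values through unchanged in the forward pass but contributes zero gradient in the backward pass.

import numpy as np
import cntk as C

p = C.input_variable(1, needs_gradient=True)
f = C.square(p) + C.stop_gradient(C.square(p))   # both branches contribute to the value

data = np.array([[3.0]], dtype=np.float32)
print(f.eval({p: data}))                  # 18.0 = 9 + 9
print(f.grad({p: data}, wrt=[p]))         # 6.0 = 2 * p; the stopped branch adds no gradient
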
def triangular_matrix_seq(mode: int = 1):
    X = C.placeholder(1)
    ones = C.ones_like(X[0])
    perm_1 = C.layers.Recurrence(C.plus, return_full_state=True)(ones)
    perm_2 = C.layers.Recurrence(C.plus, go_backwards=True, return_full_state=True)(ones)

    arr_1 = C.sequence.unpack(perm_1, 0, True)
    arr_2 = C.sequence.unpack(perm_2, 0, True)

    mat = C.times_transpose(arr_1, arr_2)
    mat_c = arr_1 * arr_2

    diagonal_mat = mat - mat_c

    final_mat = diagonal_mat
    if mode == 0:
        final_mat = C.equal(final_mat, 0)
    elif mode == 1:
        final_mat = C.less_equal(final_mat, 0)
    elif mode == 2:
        final_mat = C.less(final_mat, 0)
    elif mode == -1:
        final_mat = C.greater_equal(final_mat, 0)
    elif mode == -2:
        final_mat = C.greater(final_mat, 0)

    result = C.as_block(final_mat, [(X, X)], 'triangular_matrix')

    return C.stop_gradient(result)

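A hypothetical way to apply the block above to concrete data (the input shape and sequence length are assumptions): evaluated on a length-4 sequence it produces a 4x4 triangular 0/1 mask, and the `stop_gradient` wrapper keeps the mask out of the backward pass.

import numpy as np
import cntk as C

x = C.sequence.input_variable(1)
mask = triangular_matrix_seq(mode=1)(x)    # substitute the block's placeholder with a real input

seq = np.ones((4, 1), dtype=np.float32)    # one sequence of length 4
print(mask.eval({x: [seq]})[0])            # a 4x4 triangular 0/1 matrix
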
def positional_encoding(token_dims: int, discount_factor: float = 0.99):
    X = C.placeholder(token_dims, name='positional_encoding')
    encoder = C.layers.Recurrence(C.element_times, initial_state=1,
                                  return_full_state=True)(C.ones_like(X) * discount_factor)
    return C.stop_gradient(C.as_block(encoder, [(X, X)], 'positional_encoding', 'positional_encoding_'))

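A hypothetical application of the encoder above (the token dimensionality and the test sequence are assumptions): step t of the output equals discount_factor ** (t + 1), broadcast across the token dimensions, and `stop_gradient` keeps the encoding out of the backward pass.

import numpy as np
import cntk as C

x = C.sequence.input_variable(4)
pe = positional_encoding(4)(x)                # substitute the block's placeholder with a real input

tokens = np.zeros((3, 4), dtype=np.float32)   # one sequence of three 4-dimensional tokens
print(pe.eval({x: [tokens]}))                 # rows of 0.99, 0.9801, 0.970299 (approximately)
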
def masking(input, labels):
    # `is_onehot_encoded` is a module-level flag expected to be defined by the caller.
    if not is_onehot_encoded:
        mask = ct.reshape(ct.one_hot(ct.reshape(ct.argmax(labels, axis=0), shape=(-1,)), 10),
                          shape=(10, 1, 1))
        mask = ct.stop_gradient(mask)
    else:
        mask = ct.reshape(labels, shape=(10, 1, 1))

    mask = ct.splice(*([mask] * 16), axis=1)
    return ct.reshape(ct.element_times(input, mask), shape=(-1,))

def Loss(self):
    # Evaluate old actions and values:
    logprobs, state_value, dist_entropy = self.policy.evaluate()

    # Find the ratio (pi_theta / pi_theta_old), i.e. importance sampling:
    c_old_logprobs = C.input_variable(logprobs.shape, name='old_log_probs')
    ratios = C.exp(logprobs - C.stop_gradient(c_old_logprobs))

    c_rewards = C.input_variable(1, name='rewards')
    advantages = c_rewards - C.stop_gradient(state_value)

    # Find the surrogate loss:
    surr1 = ratios * advantages
    surr2 = C.clip(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
    neglog_loss = -C.element_min(surr1, surr2)
    entropy_loss = -0.01 * dist_entropy
    actor_loss = C.reduce_mean(neglog_loss + entropy_loss)
    critic_loss = 0.5 * C.reduce_mean(C.square(state_value - c_rewards))
    loss = actor_loss + critic_loss

    chunk = {
        'neglog_loss': neglog_loss,
        'entropy_loss': entropy_loss,
        'actor_loss': actor_loss,
        'critic_loss': critic_loss
    }

    trainer = C.Trainer(
        loss, (loss, None),
        C.adam(loss.parameters,
               C.learning_parameter_schedule_per_sample(self.lr),
               C.momentum_schedule_per_sample(self.betas[0]),
               variance_momentum=C.momentum_schedule_per_sample(self.betas[1])))
    # Alternative trainer with a higher learning rate:
    # trainer = C.Trainer(loss, (loss, None),
    #                     C.adam(loss.parameters, C.learning_parameter_schedule(10),
    #                            C.momentum_schedule(0.9),
    #                            variance_momentum=C.momentum_schedule(0.999)))

    return loss, chunk, trainer

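A standalone NumPy check of the clipped-surrogate term built above (the numbers and the eps_clip value are assumptions, not taken from the class): the ratio is clipped to [1 - eps_clip, 1 + eps_clip] and the per-sample loss is the negated element-wise minimum of the clipped and unclipped terms.

import numpy as np

eps_clip = 0.2                                         # assumed value of self.eps_clip
ratios = np.array([0.5, 1.0, 1.5], dtype=np.float32)
advantages = np.array([1.0, -1.0, 1.0], dtype=np.float32)

surr1 = ratios * advantages
surr2 = np.clip(ratios, 1 - eps_clip, 1 + eps_clip) * advantages
print(-np.minimum(surr1, surr2))                       # [-0.5, 1.0, -1.2]
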
def DigitCaps(input, num_capsules, dim_out_vector, routings=3, name='DigitCaps'):
    '''
    Function to create an instance of a digit capsule.

    Args:
        input: Input Tensor
        num_capsules (int): Number of output capsules
        dim_out_vector (int): Number of dimensions of the capsule output vector
        routings (int, optional): The number of routing iterations
        name (str, optional): The name of the Function instance in the network.
    '''
    # Note: the shapes below are hard-coded for 1152 primary capsules of 8 dimensions
    # and 10 output capsules of 16 dimensions; `Squash` is the capsule squashing
    # non-linearity defined elsewhere in the project.

    # Learnable parameters
    W = ct.Parameter(shape=(1152, 10, 16, 8), init=ct.normal(0.01), name=name + '_Weights')

    # Reshape the input for broadcasting on all output capsules
    input = ct.reshape(input, (1152, 1, 1, 8), name='reshape_input')

    # Output shape = [#][1152, 10, 16, 1]
    u_hat = ct.reduce_sum(W * input, axis=3)

    # We don't need gradients on the routing iterations
    u_hat_stopped = ct.stop_gradient(u_hat, name='stop_gradient')

    # All the routing logits (Bij) are initialized to zero for each routing.
    Bij = ct.Constant(np.zeros((1152, 10, 1, 1), dtype=np.float32))

    # line 3: for r iterations do
    for r_iter in range(routings):
        # line 4: for all capsule i in layer l: ci ← softmax(bi) => Cij
        # Output shape = [#][1152, 10, 1, 1]
        Cij = ct.softmax(Bij, axis=1)

        # At the last iteration, use `u_hat` so that gradients flow in from the following graph
        if r_iter == routings - 1:
            # line 5: for all capsule j in layer (l + 1): sj ← sum(cij * u_hat)
            # Output shape = [#][1152, 10, 16, 1]
            Sj = ct.reduce_sum(ct.element_times(Cij, u_hat, 'weighted_u_hat'), axis=0)
            # line 6: for all capsule j in layer (l + 1): vj ← squash(sj)
            # Output shape = [#][1, 10, 16, 1]
            Vj = Squash(Sj)
        elif r_iter < routings - 1:
            # line 5: for all capsule j in layer (l + 1): sj ← sum(cij * u_hat)
            # Output shape = [#][1152, 10, 16, 1]
            Sj = ct.reduce_sum(ct.element_times(Cij, u_hat_stopped), axis=0)
            # line 6: for all capsule j in layer (l + 1): vj ← squash(sj)
            # Output shape = [#][1, 10, 16, 1]
            Vj = Squash(Sj)
            # line 7: for all capsule i in layer l and capsule j in layer (l + 1): bij ← bij + u_hat(j|i) * vj
            # Output shape = [#][1, 10, 1, 16]
            Vj_Transpose = ct.transpose(ct.reshape(Vj, (1, 10, 16, 1)), (0, 1, 3, 2), name='Vj_Transpose')
            # Output shape = [#][1152, 10, 1, 1]
            UV = ct.reduce_sum(ct.reshape(u_hat_stopped, (1152, 10, 1, 16)) * Vj_Transpose, axis=3)
            Bij += UV

    # Output shape = [#][10, 16, 1]
    Vj = ct.reshape(Vj, (10, 16, 1), name='digit_caps_output')

    return Vj

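A hypothetical wiring sketch for the layer above (the input variable and its shape are assumptions, chosen to match the hard-coded reshape to 1152 capsules of 8 dimensions; it also assumes the project's `Squash` non-linearity is in scope):

import cntk as ct

primary = ct.input_variable((1152, 8), name='primary_caps')   # assumed PrimaryCaps-style output
digit_caps = DigitCaps(primary, num_capsules=10, dim_out_vector=16, routings=3)
print(digit_caps.shape)                                       # expected: (10, 16, 1)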