def test_stop_gradient():
    x = C.sequence.input_variable(shape=(2,), sequence_axis=C.Axis("B"), needs_gradient=True)
    y = C.sequence.input_variable(shape=(2,), sequence_axis=C.Axis("B"), needs_gradient=True)
    z = C.element_times(x, y)
    w = z + C.stop_gradient(z)
    a = np.reshape(np.float32([0.25, 0.5, 0.1, 1]), (1, 2, 2))
    b = np.reshape(np.float32([-1.25, 1.5, 0.1, -1]), (1, 2, 2))
    bwd, fwd = w.forward({x: a, y: b}, [w.output], set([w.output]))
    value = list(fwd.values())[0]
    expected = np.multiply(a, b) * 2
    assert np.allclose(value, expected)
    grad = w.backward(bwd, {w.output: np.ones_like(value)}, set([x, y]))
    assert np.allclose(grad[x], b)
    assert np.allclose(grad[y], a)

    # test stop_gradient with a function as input whose arguments should have no gradients (zeros are returned)
    w = C.stop_gradient(z)
    bwd, fwd = w.forward({x: a, y: b}, [w.output], set([w.output]))
    value = list(fwd.values())[0]
    expected = np.multiply(a, b)
    assert np.allclose(value, expected)
    grad = w.backward(bwd, {w.output: np.ones_like(value)}, set([x, y]))
    # there should be no gradients backward to x and y
    assert np.allclose(grad[x], np.zeros_like(b))
    assert np.allclose(grad[y], np.zeros_like(a))

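A smaller self-contained sketch of the behaviour the test exercises (the variable names and values below are illustrative, not taken from the test): `stop_gradient` passes values through unchanged in the forward pass but contributes zero gradient in the backward pass.

import numpy as np
import cntk as C

p = C.input_variable(1, needs_gradient=True)
f = C.square(p) + C.stop_gradient(C.square(p))   # both branches contribute to the value

data = np.array([[3.0]], dtype=np.float32)
print(f.eval({p: data}))                  # 18.0 = 9 + 9
print(f.grad({p: data}, wrt=[p]))         # 6.0 = 2 * p; the stopped branch adds no gradient
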
def triangular_matrix_seq(mode: int = 1):
    X = C.placeholder(1)
    ones = C.ones_like(X[0])
    perm_1 = C.layers.Recurrence(C.plus, return_full_state=True)(ones)
    perm_2 = C.layers.Recurrence(C.plus, go_backwards=True, return_full_state=True)(ones)

    arr_1 = C.sequence.unpack(perm_1, 0, True)
    arr_2 = C.sequence.unpack(perm_2, 0, True)

    mat = C.times_transpose(arr_1, arr_2)
    mat_c = arr_1 * arr_2

    diagonal_mat = mat - mat_c

    final_mat = diagonal_mat
    if mode == 0:
        final_mat = C.equal(final_mat, 0)
    elif mode == 1:
        final_mat = C.less_equal(final_mat, 0)
    elif mode == 2:
        final_mat = C.less(final_mat, 0)
    elif mode == -1:
        final_mat = C.greater_equal(final_mat, 0)
    elif mode == -2:
        final_mat = C.greater(final_mat, 0)

    result = C.as_block(final_mat, [(X, X)], 'triangular_matrix')

    return C.stop_gradient(result)

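A hypothetical way to apply the block above to concrete data (the input shape and sequence length are assumptions): evaluated on a length-4 sequence it produces a 4x4 triangular 0/1 mask, and the `stop_gradient` wrapper keeps the mask out of the backward pass.

import numpy as np
import cntk as C

x = C.sequence.input_variable(1)
mask = triangular_matrix_seq(mode=1)(x)    # substitute the block's placeholder with a real input

seq = np.ones((4, 1), dtype=np.float32)    # one sequence of length 4
print(mask.eval({x: [seq]})[0])            # a 4x4 triangular 0/1 matrix
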
def positional_encoding(token_dims: int, discount_factor: float = 0.99):
    X = C.placeholder(token_dims, name='positional_encoding')
    encoder = C.layers.Recurrence(C.element_times, initial_state=1,
                                  return_full_state=True)(C.ones_like(X) * discount_factor)
    return C.stop_gradient(C.as_block(encoder, [(X, X)], 'positional_encoding', 'positional_encoding_'))

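A hypothetical application of the encoder above (the token dimensionality and the test sequence are assumptions): step t of the output equals discount_factor ** (t + 1), broadcast across the token dimensions, and `stop_gradient` keeps the encoding out of the backward pass.

import numpy as np
import cntk as C

x = C.sequence.input_variable(4)
pe = positional_encoding(4)(x)                # substitute the block's placeholder with a real input

tokens = np.zeros((3, 4), dtype=np.float32)   # one sequence of three 4-dimensional tokens
print(pe.eval({x: [tokens]}))                 # rows of 0.99, 0.9801, 0.970299 (approximately)
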
def masking(input, labels):
    # `is_onehot_encoded` is a module-level flag expected to be defined by the caller.
    if not is_onehot_encoded:
        mask = ct.reshape(ct.one_hot(ct.reshape(ct.argmax(labels, axis=0), shape=(-1,)), 10),
                          shape=(10, 1, 1))
        mask = ct.stop_gradient(mask)
    else:
        mask = ct.reshape(labels, shape=(10, 1, 1))

    mask = ct.splice(*([mask] * 16), axis=1)
    return ct.reshape(ct.element_times(input, mask), shape=(-1,))

def Loss(self):
    # Evaluate old actions and values:
    logprobs, state_value, dist_entropy = self.policy.evaluate()

    # Find the ratio (pi_theta / pi_theta_old), i.e. importance sampling:
    c_old_logprobs = C.input_variable(logprobs.shape, name='old_log_probs')
    ratios = C.exp(logprobs - C.stop_gradient(c_old_logprobs))

    c_rewards = C.input_variable(1, name='rewards')
    advantages = c_rewards - C.stop_gradient(state_value)

    # Find the surrogate loss:
    surr1 = ratios * advantages
    surr2 = C.clip(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
    neglog_loss = -C.element_min(surr1, surr2)
    entropy_loss = -0.01 * dist_entropy
    actor_loss = C.reduce_mean(neglog_loss + entropy_loss)
    critic_loss = 0.5 * C.reduce_mean(C.square(state_value - c_rewards))
    loss = actor_loss + critic_loss

    chunk = {
        'neglog_loss': neglog_loss,
        'entropy_loss': entropy_loss,
        'actor_loss': actor_loss,
        'critic_loss': critic_loss
    }

    trainer = C.Trainer(
        loss, (loss, None),
        C.adam(loss.parameters,
               C.learning_parameter_schedule_per_sample(self.lr),
               C.momentum_schedule_per_sample(self.betas[0]),
               variance_momentum=C.momentum_schedule_per_sample(self.betas[1])))
    # Alternative trainer with a higher learning rate:
    # trainer = C.Trainer(loss, (loss, None),
    #                     C.adam(loss.parameters, C.learning_parameter_schedule(10),
    #                            C.momentum_schedule(0.9),
    #                            variance_momentum=C.momentum_schedule(0.999)))

    return loss, chunk, trainer

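A standalone NumPy check of the clipped-surrogate term built above (the numbers and the eps_clip value are assumptions, not taken from the class): the ratio is clipped to [1 - eps_clip, 1 + eps_clip] and the per-sample loss is the negated element-wise minimum of the clipped and unclipped terms.

import numpy as np

eps_clip = 0.2                                         # assumed value of self.eps_clip
ratios = np.array([0.5, 1.0, 1.5], dtype=np.float32)
advantages = np.array([1.0, -1.0, 1.0], dtype=np.float32)

surr1 = ratios * advantages
surr2 = np.clip(ratios, 1 - eps_clip, 1 + eps_clip) * advantages
print(-np.minimum(surr1, surr2))                       # [-0.5, 1.0, -1.2]
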
def DigitCaps(input, num_capsules, dim_out_vector, routings=3, name='DigitCaps'):
    '''
    Function to create an instance of a digit capsule.

    Args:
        input: Input Tensor
        num_capsules (int): Number of output capsules
        dim_out_vector (int): Number of dimensions of the capsule output vector
        routings (int, optional): The number of routing iterations
        name (str, optional): The name of the Function instance in the network.
    '''
    # Note: the shapes below are hard-coded for 1152 primary capsules of 8 dimensions
    # and 10 output capsules of 16 dimensions; `Squash` is the capsule squashing
    # non-linearity defined elsewhere in the project.

    # Learnable parameters
    W = ct.Parameter(shape=(1152, 10, 16, 8), init=ct.normal(0.01), name=name + '_Weights')

    # Reshape the input for broadcasting on all output capsules
    input = ct.reshape(input, (1152, 1, 1, 8), name='reshape_input')

    # Output shape = [#][1152, 10, 16, 1]
    u_hat = ct.reduce_sum(W * input, axis=3)

    # We don't need gradients on the routing iterations
    u_hat_stopped = ct.stop_gradient(u_hat, name='stop_gradient')

    # All the routing logits (Bij) are initialized to zero for each routing.
    Bij = ct.Constant(np.zeros((1152, 10, 1, 1), dtype=np.float32))

    # line 3: for r iterations do
    for r_iter in range(routings):
        # line 4: for all capsule i in layer l: ci ← softmax(bi) => Cij
        # Output shape = [#][1152, 10, 1, 1]
        Cij = ct.softmax(Bij, axis=1)

        # At the last iteration, use `u_hat` so that gradients flow in from the following graph
        if r_iter == routings - 1:
            # line 5: for all capsule j in layer (l + 1): sj ← sum(cij * u_hat)
            # Output shape = [#][1152, 10, 16, 1]
            Sj = ct.reduce_sum(ct.element_times(Cij, u_hat, 'weighted_u_hat'), axis=0)
            # line 6: for all capsule j in layer (l + 1): vj ← squash(sj)
            # Output shape = [#][1, 10, 16, 1]
            Vj = Squash(Sj)
        elif r_iter < routings - 1:
            # line 5: for all capsule j in layer (l + 1): sj ← sum(cij * u_hat)
            # Output shape = [#][1152, 10, 16, 1]
            Sj = ct.reduce_sum(ct.element_times(Cij, u_hat_stopped), axis=0)
            # line 6: for all capsule j in layer (l + 1): vj ← squash(sj)
            # Output shape = [#][1, 10, 16, 1]
            Vj = Squash(Sj)
            # line 7: for all capsule i in layer l and capsule j in layer (l + 1): bij ← bij + u_hat(j|i) * vj
            # Output shape = [#][1, 10, 1, 16]
            Vj_Transpose = ct.transpose(ct.reshape(Vj, (1, 10, 16, 1)), (0, 1, 3, 2), name='Vj_Transpose')
            # Output shape = [#][1152, 10, 1, 1]
            UV = ct.reduce_sum(ct.reshape(u_hat_stopped, (1152, 10, 1, 16)) * Vj_Transpose, axis=3)
            Bij += UV

    # Output shape = [#][10, 16, 1]
    Vj = ct.reshape(Vj, (10, 16, 1), name='digit_caps_output')

    return Vj

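A hypothetical wiring sketch for the layer above (the input variable and its shape are assumptions, chosen to match the hard-coded reshape to 1152 capsules of 8 dimensions; it also assumes the project's `Squash` non-linearity is in scope):

import cntk as ct

primary = ct.input_variable((1152, 8), name='primary_caps')   # assumed PrimaryCaps-style output
digit_caps = DigitCaps(primary, num_capsules=10, dim_out_vector=16, routings=3)
print(digit_caps.shape)                                       # expected: (10, 16, 1)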