def test_disconnected_paths(self):
    """Check when differentiating through ``disconnected_grad`` must fail.

    A gradient whose only route to the requested input crosses a
    ``disconnected_grad`` node has to raise ``DisconnectedInputError``;
    a gradient that still has a direct route must be computed normally.
    """
    tensor = theano.tensor
    # Random sample drawn exactly as before; the checks below do not use it.
    _sample = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)

    x = tensor.matrix('x')

    # The only route to x is blocked: this MUST raise DisconnectedInputError.
    # (The original note says gradients.py also emits a warning here.)
    blocked_cost = gradient.disconnected_grad(x).sum()
    with self.assertRaises(gradient.DisconnectedInputError):
        gradient.grad(blocked_cost, x)

    # x is also reachable directly: this MUST NOT raise.
    mixed_cost = (x + gradient.disconnected_grad(x)).sum()
    gradient.grad(mixed_cost, x)

    a = tensor.matrix('a')
    b = tensor.matrix('b')
    y = a + gradient.disconnected_grad(b)

    # b only enters y through disconnected_grad: this MUST raise.
    cost_wrt_b = y.sum()
    with self.assertRaises(gradient.DisconnectedInputError):
        gradient.grad(cost_wrt_b, b)

    # a enters y directly: this MUST NOT raise.
    gradient.grad(y.sum(), a)
def test_disconnected_paths(self):
    """Verify that gradients through a disconnected path raise an exception.

    Two situations are exercised twice each: a cost that reaches the
    requested variable only through ``disconnected_grad`` (must raise
    ``DisconnectedInputError``) and a cost that also reaches it directly
    (must not raise).
    """
    tensor = theano.tensor
    # Random sample drawn exactly as before; the checks below do not use it.
    _sample = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)

    x = tensor.matrix("x")

    # MUST raise: x is only reachable through disconnected_grad.
    # (The original note says gradients.py also emits a warning here.)
    with pytest.raises(gradient.DisconnectedInputError):
        blocked_cost = gradient.disconnected_grad(x).sum()
        gradient.grad(blocked_cost, x)

    # MUST NOT raise: the plain ``x`` term keeps a connected path.
    connected_cost = (x + gradient.disconnected_grad(x)).sum()
    gradient.grad(connected_cost, x)

    a = tensor.matrix("a")
    b = tensor.matrix("b")
    y = a + gradient.disconnected_grad(b)

    # MUST raise: b only contributes through disconnected_grad.
    with pytest.raises(gradient.DisconnectedInputError):
        gradient.grad(y.sum(), b)

    # MUST NOT raise: a contributes directly.
    gradient.grad(y.sum(), a)
def compute_hessian(self, objective, argument):
    """
    Compile a function computing the directional derivative of the gradient
    (which is equal to the Hessian multiplied by a direction).

    Parameters
    ----------
    objective
        Symbolic cost that is differentiated with ``T.grad``.
    argument
        Either a single symbolic variable, or a list/tuple of symbolic
        variables (the "product manifold" case).

    Returns
    -------
    callable
        ``hess(x, a)``. In the single-variable case this is the compiled
        Theano function taking the point ``x`` and the direction ``a``.
        In the list/tuple case ``x`` and ``a`` are sequences with one
        entry per variable, and a list with one result per variable is
        produced.
    """
    is_product_manifold = isinstance(argument, (list, tuple))
    if is_product_manifold:
        # Normalise to a list: a tuple would make ``argument + A`` below
        # raise TypeError, because ``A`` is always built as a list.
        argument = list(argument)

    g = T.grad(objective, argument)

    # Create a new tensor A, which has the same type (i.e. same
    # dimensionality) as argument.
    if not is_product_manifold:
        A = argument.type()
    else:
        A = [arg.type() for arg in argument]

    # First attempt efficient 'R-op', this directly calculates the
    # directional derivative of the gradient.
    try:
        R = T.Rop(g, argument, A)
    except NotImplementedError:
        # Implementation based on
        # tensorflow.python.ops.gradients_impl._hessian_vector_product
        if not is_product_manifold:
            # A is treated as a constant so that only g is differentiated.
            proj = T.sum(g * disconnected_grad(A))
            R = T.grad(proj, argument)
        else:
            proj = [
                T.sum(g_elem * disconnected_grad(a_elem))
                for g_elem, a_elem in zip(g, A)
            ]
            # One row per projection, one column per argument; entries
            # are None where a projection does not depend on an argument.
            proj_grad = [
                T.grad(proj_elem, argument,
                       disconnected_inputs="ignore",
                       return_disconnected="None")
                for proj_elem in proj
            ]
            # Regroup per argument and add up the connected contributions.
            proj_grad_stack = [
                T.stacklists([c for c in column if c is not None])
                for column in zip(*proj_grad)
            ]
            R = [T.sum(stack, axis=0) for stack in proj_grad_stack]

    if not is_product_manifold:
        hess = theano.function([argument, A], R, on_unused_input="warn")
    else:
        hess_prod = theano.function(argument + A, R,
                                    on_unused_input="warn")

        def hess(x, a):
            # Accept any mix of lists and tuples for points/directions.
            return hess_prod(*(list(x) + list(a)))

    return hess
def __step(img, prev_bbox, state, timestep):
    # One recurrent step: convolve the frame, update a GRU-style state,
    # predict a bounding box, then refine the classifier parameters with a
    # few unrolled gradient steps on the feature/probability maps seen so far.
    # Relies on many names from the enclosing scope (filters, GRU weights,
    # image sizes, shared maps) that are not visible in this block.
    # NOTE(review): presumably used as a theano.scan step, given the
    # (outputs, updates-dict) return value -- confirm against the caller.

    # Convolutional features of the current frame.
    conv1 = conv2d(img, conv1_filters, subsample=(conv1_stride, conv1_stride), border_mode='half')
    act1 = NN.relu(conv1)
    flat1 = TT.reshape(act1, (-1, conv1_output_dim))

    # GRU-style update driven by the flattened features and the previous box.
    gru_in = TT.concatenate([flat1, prev_bbox], axis=1)
    gru_z = NN.sigmoid(TT.dot(gru_in, Wz) + TT.dot(state, Uz) + bz)
    gru_r = NN.sigmoid(TT.dot(gru_in, Wr) + TT.dot(state, Ur) + br)
    gru_h_ = TT.tanh(TT.dot(gru_in, Wg) + TT.dot(gru_r * state, Ug) + bg)
    gru_h = (1 - gru_z) * state + gru_z * gru_h_

    # Box prediction squashed by tanh, then rescaled by the image size into
    # a centre (cx, cy) and an extent (w, h).
    bbox = TT.tanh(TT.dot(gru_h, W_fc2) + b_fc2)
    bbox_cx = ((bbox[:, 2] + bbox[:, 0]) / 2 + 1) / 2 * img_row
    bbox_cy = ((bbox[:, 3] + bbox[:, 1]) / 2 + 1) / 2 * img_col
    bbox_w = TT.abs_(bbox[:, 2] - bbox[:, 0]) / 2 * img_row
    bbox_h = TT.abs_(bbox[:, 3] - bbox[:, 1]) / 2 * img_col

    # Soft per-axis masks: values are clipped to the range [1e-4, 1] and
    # decrease with the distance from the box centre.
    x = TT.arange(img_row, dtype=T.config.floatX)
    y = TT.arange(img_col, dtype=T.config.floatX)
    mx = TT.maximum(TT.minimum(-TT.abs_(x.dimshuffle('x', 0) - bbox_cx.dimshuffle(0, 'x')) + bbox_w.dimshuffle(0, 'x') / 2., 1), 1e-4)
    my = TT.maximum(TT.minimum(-TT.abs_(y.dimshuffle('x', 0) - bbox_cy.dimshuffle(0, 'x')) + bbox_h.dimshuffle(0, 'x') / 2., 1), 1e-4)
    # Outer product of the two axis masks gives the 2-D box mask.
    bbox_mask = mx.dimshuffle(0, 1, 'x') * my.dimshuffle(0, 'x', 1)

    # Working copies of the classifier parameters, refined in the loop below.
    new_cls1_f = cls_f
    new_cls1_b = cls_b

    # Masked features for this step are written into slot `timestep` of the
    # running maps; no gradient flows back through the stored maps.
    mask = act1 * bbox_mask.dimshuffle(0, 'x', 1, 2)
    new_featmaps = TG.disconnected_grad(TT.set_subtensor(featmaps[:, timestep], mask))
    new_featmaps.name = 'new_featmaps'
    new_probmaps = TG.disconnected_grad(TT.set_subtensor(probmaps[:, timestep], bbox_mask))
    new_probmaps.name = 'new_probmaps'

    # Training data for the classifier: every slot up to and including
    # the current timestep.
    train_featmaps = TG.disconnected_grad(new_featmaps[:, :timestep+1].reshape(((timestep + 1) * batch_size, conv1_nr_filters, img_row, img_col)))
    train_featmaps.name = 'train_featmaps'
    train_probmaps = TG.disconnected_grad(new_probmaps[:, :timestep+1])
    train_probmaps.name = 'train_probmaps'

    # Five unrolled gradient-descent steps (step size 0.1) on the classifier
    # filters and bias, using binary cross-entropy against the stored masks.
    for _ in range(0, 5):
        train_convmaps = conv2d(train_featmaps, new_cls1_f, subsample=(cls1_stride, cls1_stride), border_mode='half').reshape((batch_size, timestep + 1, batch_size,
                                                                                                                       img_row, img_col))
        train_convmaps.name = 'train_convmaps'
        # Keep, for each (sample, step) pair, the output channel whose index
        # equals the sample index.
        train_convmaps_selected = train_convmaps[TT.arange(batch_size).repeat(timestep+1), TT.tile(TT.arange(timestep+1), batch_size), TT.arange(batch_size).repeat(timestep+1)].reshape((batch_size, timestep+1, img_row, img_col))
        train_convmaps_selected.name = 'train_convmaps_selected'
        train_predmaps = NN.sigmoid(train_convmaps_selected + new_cls1_b.dimshuffle(0, 'x', 'x', 'x'))
        train_loss = NN.binary_crossentropy(train_predmaps, train_probmaps).mean()
        train_grad_cls1_f, train_grad_cls1_b = T.grad(train_loss, [new_cls1_f, new_cls1_b])
        new_cls1_f -= train_grad_cls1_f * 0.1
        new_cls1_b -= train_grad_cls1_b * 0.1

    # Step outputs, plus a mapping from the outer variables to their new
    # values (all wrapped in disconnected_grad).
    return (bbox, gru_h, timestep + 1, mask, bbox_mask), {cls_f: TG.disconnected_grad(new_cls1_f), cls_b: TG.disconnected_grad(new_cls1_b), featmaps: TG.disconnected_grad(new_featmaps), probmaps: TG.disconnected_grad(new_probmaps)}
def compute_activations(self, input_data, do_round=True):
    """Propagate ``input_data`` through every layer and record each stage.

    For each ``(w, b, k)`` triple from ``self.ws``, ``self.bs`` and
    ``self.get_scales()`` the layer input is multiplied by ``k``. With
    ``do_round`` the rounding error is added back through
    ``disconnected_grad``, so the forward value is rounded while the
    gradient ignores the rounding. The result is divided by ``k`` again
    before the affine map and nonlinearity.

    Returns a list with one dict per layer holding the keys ``'input'``,
    ``'scaled_input'``, ``'eta'`` (``None`` when ``do_round`` is false),
    ``'spikes'`` and ``'output'``.
    """
    records = []
    current = input_data
    layer_params = zip(self.ws, self.bs, self.get_scales())
    for index, (w, b, k) in enumerate(layer_params):
        scaled = current * k
        if do_round:
            eta = tt.round(scaled) - scaled
            spikes = scaled + disconnected_grad(eta)
        else:
            eta = None
            spikes = scaled

        # Every layer but the last uses the hidden activation.
        if index < len(self.ws) - 1:
            activation_name = self.hidden_activations
        else:
            activation_name = self.output_activation
        nonlinearity = get_named_activation_function(activation_name)

        pre_activation = (spikes / k).dot(w) + b
        output = nonlinearity(pre_activation)

        records.append({
            'input': current,
            'scaled_input': scaled,
            'eta': eta,
            'spikes': spikes,
            'output': output,
        })
        current = output
    return records
def __init__(self, rewards_getter, seq2seq):
    """Build the policy-gradient loss and compile the training step.

    Args:
        rewards_getter (BeLikeXRewards): supplies ``get(...)`` for rewards
            and ``input_vars`` for the compiled function.
        seq2seq (seq2seq.Seq2Seq): model providing the ``gentrain``,
            ``gentest`` and ``enc`` parts used below.
    """
    self.rewards_getter = rewards_getter
    self.s2s = seq2seq
    train_gen = self.s2s.gentrain

    # Reward of the generated sequence, with the greedy one as baseline.
    self.rewards = rewards_getter.get(train_gen.words_seq)
    self.baseline = rewards_getter.get(train_gen.words_seq_greedy)

    # [batch_size,]; treated as a constant when differentiating the loss.
    self.advantage = disconnected_grad(self.rewards - self.baseline)
    assert self.advantage.ndim == 1, "WHAT IS WRONG WITH ADVANTAGE FUNCTION???"

    # [batch_size*n_steps, n_tokens]
    predicted_probas = train_gen.predicted_probas
    row_index = T.arange(predicted_probas.shape[0])
    chosen_tokens = train_gen.words_seq[:, :-1].ravel()
    self.action_probs = predicted_probas[row_index, chosen_tokens]
    # [batch_size, n_steps]
    self.action_probs = self.action_probs.reshape((self.advantage.shape[0], -1))

    self.weights = self.s2s.gentest.weights

    policy_term = (-self.advantage[:, None] * self.action_probs).mean()
    self.loss = policy_term + train_gen.llh_loss * self.LLH_ALPHA

    raw_grads = T.grad(self.loss, self.weights)
    self.pg_grads = lasagne.updates.total_norm_constraint(
        raw_grads, Config.TOTAL_NORM_GRAD_CLIP)
    self.pg_updates = lasagne.updates.adam(self.pg_grads, self.weights)

    step_inputs = self.rewards_getter.input_vars + [
        self.s2s.enc.input_phrase,
        self.s2s.gentrain.reference_answers,
    ]
    step_updates = (self.pg_updates
                    + self.s2s.gentrain.recurrence.get_automatic_updates()
                    + self.s2s.gentrain.recurrence_greedy_updates)
    self.train_step = theano.function(
        step_inputs,
        [self.loss, self.rewards.mean()],
        updates=step_updates,
        on_unused_input='warn')
def _compute_nary_hessian_vector_product(self, gradients, arguments):
    """Returns a function accepting `2 * len(arguments)` arguments to
    compute a Hessian-vector product of a multivariate function.

    The first half of the compiled function's inputs are ``arguments``;
    the second half are fresh variables of matching types that play the
    role of the direction.

    Notes
    -----
    The fallback is based on TensorFlow's '_hessian_vector_product'
    function in 'tensorflow.python.ops.gradients_impl'.
    """
    directions = [argument.type() for argument in arguments]
    try:
        Rop = T.Rop(gradients, arguments, directions)
    except NotImplementedError:
        # Project every gradient onto its (constant) direction ...
        projections = [
            T.sum(grad_part * disconnected_grad(direction))
            for grad_part, direction in zip(gradients, directions)
        ]
        # ... differentiate each projection w.r.t. all arguments, getting
        # None where a projection does not depend on an argument ...
        per_projection = [
            T.grad(projection, arguments,
                   disconnected_inputs="ignore",
                   return_disconnected="None")
            for projection in projections
        ]
        # ... and, per argument, add up the connected contributions.
        Rop = []
        for per_argument in zip(*per_projection):
            connected = [term for term in per_argument if term is not None]
            Rop.append(T.sum(T.stacklists(connected), axis=0))
    function_inputs = list(arguments) + list(directions)
    return self._compile_function_without_warnings(function_inputs, Rop)
def virtual_adversarial_training(predict_fn, inputs, logits, epsilon,
                                 num_iterations=1, xi=1e-6):
    """Return the KL loss between clean and adversarially perturbed logits.

    The perturbation comes from ``virtual_adversarial_perturbation``; the
    clean ``logits`` are wrapped in ``disconnected_grad`` so the loss is
    only differentiated through the perturbed prediction.
    """
    perturbation = virtual_adversarial_perturbation(
        predict_fn, inputs, logits, epsilon, num_iterations, xi)
    perturbed_logits = predict_fn(inputs + perturbation)
    fixed_logits = gradient.disconnected_grad(logits)
    return kl_with_logits(fixed_logits, perturbed_logits)
def build_model(self, p):
    """Create the Q networks and compile the training / prediction functions.

    ``p`` is a parameter dict read for ``'input_shape'``, ``'discount'``,
    ``'num_actions'`` and ``'learning_rate'``. Sets ``self.Q_model``,
    ``self.Q_old_model``, ``self.training_func`` and ``self.Q_func``.
    """
    state_in = Input(p['input_shape'], name='input_state')
    action_in = Input((1, ), name='input_action', dtype='int32')
    reward_in = Input((1, ), name='input_reward')
    terminal_in = Input((1, ), name='input_terminate', dtype='int32')
    next_state_in = Input(p['input_shape'], name='input_next_sate')

    self.Q_model = self.build_cnn_model(p)
    # Second network ("Q hat in paper"), started from the same weights.
    self.Q_old_model = self.build_cnn_model(p, False)
    self.Q_old_model.set_weights(self.Q_model.get_weights())

    q_current = self.Q_model(state_in)  # batch * actions
    # The original author notes that disconnected_grad is not necessary here.
    q_next = disconnected_grad(self.Q_old_model(next_state_in))

    # Target: reward plus discounted best next value, zeroed on terminals.
    best_next = K.max(q_next, axis=1, keepdims=True)
    target = reward_in + p['discount'] * (1 - terminal_in) * best_next  # batch * 1

    # Pick out the Q value of the action actually taken.
    action_ids = Tht.arange(p['num_actions']).reshape((1, -1))
    taken = K.equal(action_ids, action_in.reshape((-1, 1)))
    q_taken = K.sum(q_current * taken, axis=1).reshape((-1, 1))

    loss = K.sum((q_taken - target)**2)  # sum could also be mean()
    optimizer = adam(p['learning_rate'])
    trainable = self.Q_model.trainable_weights
    update = optimizer.get_updates(trainable, [], loss)

    self.training_func = K.function(
        [state_in, action_in, reward_in, terminal_in, next_state_in],
        loss, updates=update)
    self.Q_func = K.function([state_in], q_current)
def build_model(self, p):
    """Build both Q networks plus the compiled training and Q-value functions.

    Reads ``'input_shape'``, ``'discount'``, ``'num_actions'`` and
    ``'learning_rate'`` from ``p`` and stores the results on ``self``
    (``Q_model``, ``Q_old_model``, ``training_func``, ``Q_func``).
    """
    input_shape = p['input_shape']
    states = Input(input_shape, name='input_state')
    actions = Input((1,), name='input_action', dtype='int32')
    rewards = Input((1,), name='input_reward')
    terminals = Input((1,), name='input_terminate', dtype='int32')
    next_states = Input(input_shape, name='input_next_sate')

    self.Q_model = self.build_cnn_model(p)
    self.Q_old_model = self.build_cnn_model(p, False)  # Q hat in paper
    self.Q_old_model.set_weights(self.Q_model.get_weights())  # Q' = Q

    # batch * actions
    q_values = self.Q_model(states)
    # disconnected gradient is not necessary (original author's note)
    old_q_next = disconnected_grad(self.Q_old_model(next_states))

    # batch * 1
    not_terminal = 1 - terminals
    y = rewards + p['discount'] * not_terminal * K.max(
        old_q_next, axis=1, keepdims=True)

    # Boolean mask marking, per row, the column of the chosen action.
    action_mask = K.equal(
        Tht.arange(p['num_actions']).reshape((1, -1)),
        actions.reshape((-1, 1)))
    selected_q = K.sum(q_values * action_mask, axis=1).reshape((-1, 1))

    squared_error = (selected_q - y) ** 2
    loss = K.sum(squared_error)  # sum could also be mean()

    update = adam(p['learning_rate']).get_updates(
        self.Q_model.trainable_weights, [], loss)
    self.training_func = K.function(
        [states, actions, rewards, terminals, next_states],
        loss, updates=update)
    self.Q_func = K.function([states], q_values)
def test_connection_pattern(self):
    """The op behind ``disconnected_grad`` must report ``[[False]]``."""
    matrix_input = theano.tensor.matrix('x')
    apply_node = gradient.disconnected_grad(matrix_input).owner
    pattern = apply_node.op.connection_pattern(apply_node)
    assert pattern == [[False]]
def test_op_removed(self):
    """The disconnected-grad op must not remain in the compiled graph."""
    x = theano.tensor.matrix("x")
    compiled = theano.function([x], x * gradient.disconnected_grad(x))
    # Compare against ``gradient.disconnected_grad_`` (the op instance);
    # ``gradient.disconnected_grad`` is only a wrapper function.
    remaining_ops = [node.op for node in compiled.maker.fgraph.toposort()]
    assert gradient.disconnected_grad_ not in remaining_ops
def _encode(self, application_call, text, mask, def_embs=None,
            def_map=None, text_name=None):
    """Embed ``text`` and run the encoder RNN over it.

    Word ids at or above ``self._num_input_words`` are handled in one of
    two ways: without ``self._random_unk`` they are replaced by
    ``self._vocab.unk`` before lookup; with it they keep their own
    embedding, but that embedding is wrapped in ``disconnected_grad``.
    When ``text_name`` is given, an ``unk_ratio`` auxiliary variable is
    attached to ``application_call``. When ``def_embs`` is given, the
    embeddings are passed through ``self._combiner``.
    """
    if not self._random_unk:
        known = tensor.lt(text, self._num_input_words)
        unknown = tensor.ge(text, self._num_input_words)
        text = (known * text + unknown * self._vocab.unk)

    if text_name:
        ratio = unk_ratio(text, mask, self._vocab.unk)
        application_call.add_auxiliary_variable(
            ratio, name='{}_unk_ratio'.format(text_name))

    embs = self._lookup.apply(text)

    if self._random_unk:
        trainable_part = tensor.lt(text, self._num_input_words)[:, :, None] * embs
        frozen_part = (tensor.ge(text, self._num_input_words)[:, :, None]
                       * disconnected_grad(embs))
        embs = (trainable_part + frozen_part)

    if def_embs:
        embs = self._combiner.apply(embs, mask, def_embs, def_map)
    add_role(embs, EMBEDDINGS)

    # The fork and RNN run on the flipped layout; flip the result back.
    forked = self._encoder_fork.apply(flip01(embs))
    rnn_outputs = self._encoder_rnn.apply(forked, mask=mask.T)
    encoded = flip01(rnn_outputs[0])
    return encoded
def test_connection_pattern(self):
    """``disconnected_grad`` declares its output disconnected from its input."""
    tensor = theano.tensor
    x = tensor.matrix("x")
    owner = gradient.disconnected_grad(x).owner
    expected = [[False]]
    assert owner.op.connection_pattern(owner) == expected
def __init__(self, weights, neurons_topology,
             cr_adjusting_sigma=1.5, cr_adjusting_sigma_decay=0.9,
             cr_learning_rate=0.005, cr_learning_rate_decay=0.9,
             **kwargs):
    """Extend the base SOM with a cluster-refining update function.

    The two ``cr_*`` start values are stored as Theano shared variables and
    the two decay factors as plain attributes. The extra cost is the
    distance of each neuron to the data point, weighted by an affinity
    ``exp(-distance / sigma**2)`` that is held constant for the gradient.
    Remaining keyword arguments go to the parent constructor.
    """
    super(ClusterRefiningSOM, self).__init__(weights, neurons_topology,
                                             **kwargs)

    self.cr_adjusting_sigma = theano.shared(cr_adjusting_sigma)
    self.cr_adjusting_sigma_decay = cr_adjusting_sigma_decay
    self.cr_learning_rate = theano.shared(cr_learning_rate)
    self.cr_learning_rate_decay = cr_learning_rate_decay

    sigma_squared = (self.cr_adjusting_sigma)**2
    self.affinities_to_data_point = T.exp(
        -self.distance_from_y_row / sigma_squared)

    # The affinities only weight the distances; they are not differentiated.
    constant_affinities = G.disconnected_grad(self.affinities_to_data_point)
    self.smoothed_distances_from_data_point = T.mul(
        self.distance_from_y_row, constant_affinities)
    self.cr_affinity_cost_scal = (
        self.smoothed_distances_from_data_point.sum())

    self.cr_updates = sgd(self.cr_affinity_cost_scal,
                          [self.W_shar_mat],
                          learning_rate=self.cr_learning_rate)
    self.cr_update_neurons = theano.function(
        [self.x_row],
        self.cr_affinity_cost_scal,
        updates=self.cr_updates)
def value_recur(self, ngs_jv, ngt_jv, b_jm1v, b_jm1N):
    """Compute the value ``g_jv`` from n-gram features and previous beliefs.

    A zero row is appended to both embedding tables before the lookup.
    The previous belief terms ``b_jm1v`` and ``b_jm1N`` enter through
    ``disconnected_grad``, so no gradient flows back through them.
    """
    # padding dummy: one extra all-zero row at the end of each table
    zero_row_s = T.zeros_like(self.Wfbs[-1:, :])
    zero_row_t = T.zeros_like(self.Wfbt[-1:, :])
    padded_src = T.concatenate([self.Wfbs, zero_row_s], axis=0)
    padded_tar = T.concatenate([self.Wfbt, zero_row_t], axis=0)

    # get ngram embedding: sum of the selected rows
    src_embedding = T.sum(padded_src[ngs_jv, :], axis=0)
    tar_embedding = T.sum(padded_tar[ngt_jv, :], axis=0)

    # calculate g value
    recurrent_term = G.disconnected_grad(b_jm1v) * self.Wrec
    none_term = G.disconnected_grad(b_jm1N) * self.Wnon
    hidden = T.nnet.sigmoid(
        src_embedding + tar_embedding + recurrent_term + none_term + self.B0)
    return T.dot(self.Whb, hidden)
def test_op_removed(self):
    """After compilation no node may still use the disconnected-grad op."""
    x = theano.tensor.matrix('x')
    y = x * gradient.disconnected_grad(x)
    f = theano.function([x], y)
    # The comparison needs ``gradient.disconnected_grad_`` (the op);
    # ``gradient.disconnected_grad`` itself is a wrapper function.
    compiled_ops = []
    for node in f.maker.fgraph.toposort():
        compiled_ops.append(node.op)
    assert gradient.disconnected_grad_ not in compiled_ops
def cost(self, given_x, application_call):
    """Computes the loss function.

    Runs ``self.n_inference_steps`` updates of the hidden state ``h`` via
    ``self.langevin_update`` and, for every step, adds two squared-residual
    terms (``J_h`` and ``J_x``) to the total cost.

    Parameters
    ----------
    given_x : tensor variable
        Batch of given visible states from dataset.

    Returns
    -------
    The accumulated (not per-iteration) cost, passed through ``self.pp``.

    Notes
    -----
    The `application_call` argument is an effect of the `application`
    decorator and isn't visible to users. It's used internally to set
    an updates dictionary for `h` that's discoverable by
    `ComputationGraph`.

    ``total_cost`` is only bound inside the loop, so this method assumes
    ``self.n_inference_steps >= 1``.

    """
    x = given_x
    # Start from the stored hidden state plus scaled Gaussian noise.
    h_prev = self.h + self.initial_noise * self.theano_rng.normal(size=self.h.shape, dtype=self.h.dtype)
    h = h_next = h_prev
    old_energy = self.pp(self.energy(x, h).sum(), "old_energy", 1)
    for iteration in range(self.n_inference_steps):
        # Shift the window: h_prev <- h, h <- h_next, then compute a new h_next.
        h_prev = h
        h = h_next
        # The update itself is treated as a constant for differentiation.
        h_next = self.pp(
            disconnected_grad(self.langevin_update(self.pp(x, "x", 3), self.pp(h_next, "h", 2))), "h_next", 2
        )
        new_energy = self.pp(self.energy(x, h_next).sum(), "new_energy", 1)
        # delta_energy is only routed through self.pp; it is not used further.
        delta_energy = self.pp(old_energy - new_energy, "delta_energy", 1)
        old_energy = new_energy
        # Residual between the new state and h_prev shifted by epsilon times
        # the energy gradient at h_prev.
        h_prediction_residual = (
            h_next - self.pp(h_prev, "h_prev", 3) + self.epsilon * tensor.grad(self.energy(x, h_prev).sum(), h_prev)
        )
        J_h = self.pp((h_prediction_residual * h_prediction_residual).sum(axis=1).mean(axis=0), "J_h", 1)
        # Residual on the visible side: energy gradient w.r.t. the given x.
        x_prediction_residual = self.pp(tensor.grad(self.energy(given_x, h_prev).sum(), given_x), "x_residual", 2)
        J_x = self.pp((x_prediction_residual * x_prediction_residual).sum(axis=1).mean(axis=0), "J_x", 1)
        if self.debug > 1:
            application_call.add_auxiliary_variable(J_x, name="J_x" + str(iteration))
            application_call.add_auxiliary_variable(J_h, name="J_h" + str(iteration))
        if iteration == 0:
            total_cost = J_h + J_x
        else:
            total_cost = total_cost + J_h + J_x
    # Only exported as an auxiliary variable below; the return value is total_cost.
    per_iteration_cost = total_cost / self.n_inference_steps
    # Register the final hidden state as the update for self.h.
    updates = OrderedDict([(self.h, h_next)])
    application_call.updates = dict_union(application_call.updates, updates)
    if self.debug > 0:
        application_call.add_auxiliary_variable(per_iteration_cost, name="per_iteration_cost")
    if self.debug > 1:
        # Multiplying by one yields new variables that can carry the given names.
        application_call.add_auxiliary_variable(self.Wxh * 1.0, name="Wxh")
        application_call.add_auxiliary_variable(self.Whh * 1.0, name="Whh")
        application_call.add_auxiliary_variable(self.Wxx * 1.0, name="Wxx")
        application_call.add_auxiliary_variable(self.b * 1, name="b")
        application_call.add_auxiliary_variable(self.c * 1, name="c")
    return self.pp(total_cost, "total_cost")
def value_recur(self, ngs_jv, ngt_jv, b_jm1v, b_jm1N):
    """Return ``g_jv`` for one value from its source/target n-gram indices.

    Both embedding tables get an additional zero row before indexing.
    ``b_jm1v`` and ``b_jm1N`` are wrapped in ``disconnected_grad`` and
    therefore act as constants during differentiation.
    """
    # padding dummy
    Wfbs = T.concatenate(
        [self.Wfbs, T.zeros_like(self.Wfbs[-1:, :])], axis=0)
    Wfbt = T.concatenate(
        [self.Wfbt, T.zeros_like(self.Wfbt[-1:, :])], axis=0)

    # get ngram embedding
    fembs_v = T.sum(Wfbs[ngs_jv, :], axis=0)
    fembt_v = T.sum(Wfbt[ngt_jv, :], axis=0)

    # calculate g value: build the pre-activation term by term
    pre_activation = fembs_v + fembt_v
    pre_activation = pre_activation + G.disconnected_grad(b_jm1v) * self.Wrec
    pre_activation = pre_activation + G.disconnected_grad(b_jm1N) * self.Wnon
    pre_activation = pre_activation + self.B0

    g_jv = T.dot(self.Whb, T.nnet.sigmoid(pre_activation))
    return g_jv
def mean_interp_pad(x, padding):
    """Pad ``x`` with an upsampled local-average border.

    ``x`` is average-pooled with a ``2 * padding + 1`` window and stride
    one, and the result is upsampled by the fractional ratio
    ``(size + 2 * padding) / (size - 2 * padding)`` per spatial axis. The
    original ``x`` is then written into the centre. The upsampled tensor
    is wrapped in ``disconnected_grad``.

    ``padding`` is an int (same for both axes) or a pair.
    NOTE(review): indexing ``x.shape[2]`` / ``x.shape[3]`` assumes the two
    spatial axes are the last of a 4-D input -- confirm with callers.
    """
    if isinstance(padding, int):
        padding = (padding, padding)
    else:
        padding = tuple(padding)

    window = tuple(np.array(padding) * 2 + 1)
    rows, cols = x.shape[2], x.shape[3]
    # (numerator, denominator) of the upsampling ratio per axis
    resize = ((rows + 2 * padding[0], rows - 2 * padding[0]),
              (cols + 2 * padding[1], cols - 2 * padding[1]))

    averaged = pool(x, window, (1, 1), mode='average_exc_pad')
    upsampled = nn.utils.frac_bilinear_upsampling(averaged, resize)
    z = G.disconnected_grad(upsampled)

    _, _, h, w = z.shape
    centre = z[:, :, padding[0]:h - padding[0], padding[1]:w - padding[1]]
    return T.set_subtensor(centre, x)
def value_recur(self, vsrcpos_jsv, vtarpos_jsv, ssrcpos_jsv, starpos_jsv,
                b_jm1v, b_jm1N, ngms_j, ngmt_jm1, uttms_j, uttmt_jm1):
    """Compute ``g_jv`` from source/target n-gram and utterance features.

    For each side, the rows of the n-gram matrix selected by the two
    position arguments are summed and concatenated with that side's
    utterance vector. The previous beliefs ``b_jm1v`` / ``b_jm1N`` go
    through ``disconnected_grad``, so they contribute no gradient.
    """
    def _gather(ngram_matrix, positions):
        # Sum of the rows selected by ``positions``.
        return T.sum(ngram_matrix[positions, :], axis=0)

    # source features
    source_parts = [_gather(ngms_j, ssrcpos_jsv),
                    _gather(ngms_j, vsrcpos_jsv),
                    uttms_j]
    src_jsv = T.concatenate(source_parts, axis=0)

    # target features
    target_parts = [_gather(ngmt_jm1, starpos_jsv),
                    _gather(ngmt_jm1, vtarpos_jsv),
                    uttmt_jm1]
    tar_jsv = T.concatenate(target_parts, axis=0)

    # update g_jv
    hidden_input = (T.dot(src_jsv, self.Wfbs) + T.dot(tar_jsv, self.Wfbt) +
                    G.disconnected_grad(b_jm1v) * self.Wrec +
                    G.disconnected_grad(b_jm1N) * self.Wnon +
                    self.B0)
    return T.dot(self.Whb, T.nnet.sigmoid(hidden_input))
def gather_end_points(inputs_var, *args, **kwargs):
    """Collect the network outputs for ``inputs_var`` in a dict.

    Keys: ``'logits'`` (output of ``net``), ``'predictions'`` (argmax over
    axis 1, wrapped in ``disconnected_grad``) and ``'prob'`` (softmax of
    the logits). Extra positional arguments are accepted but unused;
    keyword arguments go to ``lasagne.layers.get_output``.
    """
    logits = lasagne.layers.get_output(net, inputs=inputs_var, **kwargs)
    hard_predictions = T.argmax(logits, axis=1)
    return {
        'logits': logits,
        'predictions': gradient.disconnected_grad(hard_predictions),
        'prob': T.nnet.softmax(logits),
    }
def build_functions(self):
    """Build the model and compile ``self.value_fn`` and ``self.train_fn``.

    With ``self.is_building_mlp`` the state is a pair of inputs (CNN part
    and NN part); otherwise it is a single CNN input. The next-state
    values are wrapped in ``disconnected_grad`` so the target is a
    constant for the gradient.
    """
    action = Input(shape=(1, ), dtype='int32')
    reward = Input(shape=(1, ), dtype='float32')
    terminal = Input(shape=(1, ), dtype='int32')

    if self.is_building_mlp:
        cnn_state = Input(shape=self.cnn_input_size)
        nn_state = Input(shape=self.nn_input_size)
        state = [cnn_state, nn_state]
        cnn_next_state = Input(shape=self.cnn_input_size)
        nn_next_state = Input(shape=self.nn_input_size)
        next_state = [cnn_next_state, nn_next_state]
        train_inputs = [cnn_state, nn_state, cnn_next_state, nn_next_state,
                        action, reward, terminal]
    else:
        state = Input(shape=self.cnn_input_size)
        next_state = Input(shape=self.cnn_input_size)
        train_inputs = [state, next_state, action, reward, terminal]

    log_debug = self.log["debug"]
    log_debug("State : " + str(state))
    log_debug("NState : " + str(next_state))

    self.build_cnn_model()

    if self.is_building_mlp:
        self.value_fn = K.function(state, self.model(state))
        state_values = self.model(state)
        next_values = disconnected_grad(self.model(next_state))
    else:
        self.value_fn = K.function([state], self.model(state))
        state_values = self.model([state])
        next_values = disconnected_grad(self.model([next_state]))

    # Target: reward plus discounted best next value, zeroed on terminals.
    best_next = next_values.max(axis=1, keepdims=True)
    future_value = (1 - terminal) * best_next
    target = reward + self.discount * future_value
    cost = ((state_values[:, action] - target)**2).mean()

    optimizer = RMSprop(lr=self.lr)
    updates = optimizer.get_updates(self.model.trainable_weights, [], cost)
    self.train_fn = K.function(train_inputs, cost, updates=updates)
def test_grad(self):
    """Gradients must treat ``disconnected_grad(...)`` factors as constants.

    Each case pairs an expression with the gradient expected when the
    wrapped factor is held constant; both are compiled and compared on
    one random matrix.
    """
    tensor = theano.tensor
    sample = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)

    x = tensor.matrix("x")
    block = gradient.disconnected_grad

    cases = [
        (x * block(x), x),
        (x * block(tensor.exp(x)), tensor.exp(x)),
        (x ** 2 * block(x), 2 * x ** 2),
    ]

    for expression, expected in cases:
        symbolic_grad = gradient.grad(expression.sum(), x)
        # gradient according to theano
        actual_fn = theano.function([x], symbolic_grad,
                                    on_unused_input="ignore")
        # desired gradient
        expected_fn = theano.function([x], expected,
                                      on_unused_input="ignore")
        assert np.allclose(actual_fn(sample), expected_fn(sample))
def test_grad(self):
    """Compare Theano's gradient with the hand-derived one for three cases.

    In every expression the ``disconnected_grad`` factor is expected to
    behave like a constant, which fixes the reference gradient.
    """
    T = theano.tensor
    values = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)
    x = T.matrix("x")

    def _matches(expr, reference):
        # Compile Theano's gradient and the reference; compare on ``values``.
        theano_grad = gradient.grad(expr.sum(), x)
        computed = theano.function([x], theano_grad, on_unused_input="ignore")
        desired = theano.function([x], reference, on_unused_input="ignore")
        return np.allclose(computed(values), desired(values))

    expressions_gradients = [
        (x * gradient.disconnected_grad(x), x),
        (x * gradient.disconnected_grad(T.exp(x)), T.exp(x)),
        (x**2 * gradient.disconnected_grad(x), 2 * x**2),
    ]
    for expr, expr_grad in expressions_gradients:
        assert _matches(expr, expr_grad)
def value_recur(self, vsrcpos_jsv, vtarpos_jsv, ssrcpos_jsv, starpos_jsv,
                b_jm1v, b_jm1N, ngms_j, ngmt_jm1, uttms_j, uttmt_jm1):
    """Return ``g_jv`` built from source and target feature vectors.

    Each feature vector concatenates two summed n-gram row selections with
    an utterance vector. ``b_jm1v`` and ``b_jm1N`` are passed through
    ``disconnected_grad`` and thus receive no gradient.
    """
    # source features
    source_slot_emb = T.sum(ngms_j[ssrcpos_jsv, :], axis=0)
    source_value_emb = T.sum(ngms_j[vsrcpos_jsv, :], axis=0)
    source_features = T.concatenate(
        [source_slot_emb, source_value_emb, uttms_j], axis=0)

    # target features
    target_slot_emb = T.sum(ngmt_jm1[starpos_jsv, :], axis=0)
    target_value_emb = T.sum(ngmt_jm1[vtarpos_jsv, :], axis=0)
    target_features = T.concatenate(
        [target_slot_emb, target_value_emb, uttmt_jm1], axis=0)

    # update g_jv
    source_term = T.dot(source_features, self.Wfbs)
    target_term = T.dot(target_features, self.Wfbt)
    recurrent_term = G.disconnected_grad(b_jm1v) * self.Wrec
    none_term = G.disconnected_grad(b_jm1N) * self.Wnon
    activation = T.nnet.sigmoid(
        source_term + target_term + recurrent_term + none_term + self.B0)
    g_jv = T.dot(self.Whb, activation)
    return g_jv
def fgm(x, predictions, y=None, eps=0.3, ord=np.inf, clip_min=None,
        clip_max=None):
    """
    Theano implementation of the Fast Gradient Sign method.

    :param x: the input placeholder
    :param predictions: the model's output tensor
    :param y: the output placeholder. Use None (the default) to avoid the
              label leaking effect.
    :param eps: the epsilon (input variation parameter)
    :param ord: (optional) Order of the norm (mimics Numpy).
                Possible values: np.inf (other norms not implemented yet).
    :param clip_min: optional parameter that can be used to set a minimum
                     value for components of the example returned
    :param clip_max: optional parameter that can be used to set a maximum
                     value for components of the example returned
    :return: a tensor for the adversarial example
    """
    warnings.warn("cleverhans support for Theano is deprecated and "
                  "will be dropped on 2017-11-08.")
    assert ord == np.inf, "Theano implementation not available for this norm."
    eps = np.asarray(eps, dtype=floatX)

    if y is None:
        # No labels given: use the model's own arg-max predictions as
        # ground truth (avoids label leaking); ties share the mass.
        row_max = T.max(predictions, axis=1, keepdims=True)
        is_max = T.cast(T.eq(predictions, row_max), utils_th.floatX)
        y = is_max / T.sum(is_max, 1, keepdims=True)

    # Loss and its gradient with respect to the input
    loss = utils_th.model_loss(y, predictions, mean=True)
    input_grad = T.grad(loss, x)

    # Step of size eps along the sign of the gradient
    perturbation = eps * T.sgn(input_grad)

    # The adversarial example is held constant for further differentiation
    adv_x = gradient.disconnected_grad(x + perturbation)

    # Clip only when both bounds are given
    if clip_min is None or clip_max is None:
        return adv_x
    return T.clip(adv_x, clip_min, clip_max)
def _compute_unary_hessian_vector_product(self, gradient, argument):
    """Returns a function accepting two arguments to compute a
    Hessian-vector product of a scalar-valued unary function.

    The first input is ``argument`` itself; the second is a fresh variable
    of the same type that plays the role of the direction.
    """
    direction = argument.type()
    try:
        product = T.Rop(gradient, argument, direction)
    except NotImplementedError:
        # Fallback: differentiate <gradient, direction> with the direction
        # held constant.
        projection = T.sum(gradient * disconnected_grad(direction))
        product = T.grad(projection, argument)
    function_inputs = [argument, direction]
    return self._compile_function_without_warnings(function_inputs, product)
def vatm(model, x, predictions, eps, num_iterations=1, xi=1e-6,
         clip_min=None, clip_max=None, seed=12345):
    """
    Theano implementation of the perturbation method used for virtual
    adversarial training: https://arxiv.org/abs/1507.00677

    :param model: the model which returns the network unnormalized logits
    :param x: the input placeholder
    :param predictions: the model's unnormalized output tensor
    :param eps: the epsilon (input variation parameter)
    :param num_iterations: the number of iterations
    :param xi: the finite difference parameter
    :param clip_min: optional parameter that can be used to set a minimum
                     value for components of the example returned
    :param clip_max: optional parameter that can be used to set a maximum
                     value for components of the example returned
    :param seed: the seed for random generator
    :return: a tensor for the adversarial example
    """
    eps = np.asarray(eps, dtype=floatX)
    xi = np.asarray(xi, dtype=floatX)

    # Start from a random direction.
    random_stream = RandomStreams(seed=seed)
    d = random_stream.normal(size=x.shape, dtype=x.dtype)

    for _ in range(num_iterations):
        # Small normalised probe; the KL gradient w.r.t. it is the new direction.
        d = xi * utils_th.l2_batch_normalize(d)
        probe_logits = model(x + d)
        divergence = utils_th.kl_with_logits(predictions, probe_logits)
        d = gradient.disconnected_grad(T.grad(divergence.sum(), d))

    d = eps * utils_th.l2_batch_normalize(d)
    adv_x = gradient.disconnected_grad(x + d)

    # Clip only when both bounds are given.
    if clip_min is None or clip_max is None:
        return adv_x
    return T.clip(adv_x, clip_min, clip_max)
def virtual_adversarial_perturbation(predict_fn, inputs, logits, epsilon,
                                     num_iterations=1, xi=1e-6,
                                     seed=12345):
    """Return a perturbation of scale ``epsilon`` for ``inputs``.

    Starting from a seeded normal sample, each iteration rescales the
    direction to ``xi`` times its normalised value, evaluates the KL
    between ``logits`` and the prediction at the probed input, and takes
    the gradient of that KL w.r.t. the probe (held constant through
    ``disconnected_grad``) as the next direction.
    """
    epsilon = floatX(epsilon)
    xi = floatX(xi)

    stream = RandomStreams(seed=seed)
    direction = stream.normal(size=inputs.shape, dtype=inputs.dtype)

    for _ in range(num_iterations):
        direction = xi * normalize_perturbation(direction)
        probed_logits = predict_fn(inputs + direction)
        divergence = kl_with_logits(logits, probed_logits)
        kl_grad = T.grad(divergence.sum(), direction)
        direction = gradient.disconnected_grad(kl_grad)

    return epsilon * normalize_perturbation(direction)
def generate_adv_example(embedded, loss, perturb_scale):
    """Shift ``embedded`` along the normalised gradient of ``loss``.

    ``embedded`` is [n_examples, input_length, feature_dim]. The step is
    ``perturb_scale * sqrt(input_length * feature_dim)`` times the gradient
    scaled by ``_scale_unit_l2``; the gradient is held constant through
    ``disconnected_grad``.
    """
    loss_grad = gradient.disconnected_grad(gradient.grad(loss, embedded))

    # ``strictly_positive`` is >= 1 everywhere, so dividing it by itself
    # yields a tensor of ones whose per-example sum is the element count.
    strictly_positive = embedded + T.max(T.abs_(embedded)) + 1.0
    ones = strictly_positive / strictly_positive
    grad_dim = ones.sum(axis=(1, 2)).mean(axis=0)  # grad dim for each example
    sqrt_grad_dim = T.sqrt(grad_dim)  # sqrt(input_length * emb_dim)

    unit_grad = _scale_unit_l2(loss_grad)
    return embedded + perturb_scale * sqrt_grad_dim * unit_grad
def costs(self, application_call, prediction, prediction_mask,
          groundtruth, groundtruth_mask, **inputs):
    """Return the REINFORCE-style cost for the predicted sequences.

    Rewards from ``self.reward_brick`` (optionally augmented with an
    entropy bonus) are accumulated into future rewards and centred by
    ``self.value_prediction`` baselines computed from gradient-blocked
    states. Several intermediate quantities are attached to
    ``application_call`` as auxiliary variables.
    """
    # The baseline predictor must not send gradients into the states.
    states = disconnected_grad(inputs['states'])
    merged = self.merge(**dict_subset(inputs, self.merge_names))

    # Masked, negated scores of the predicted tokens (called
    # log-probabilities by the original author)
    scores = self.all_scores(prediction, merged)
    log_probs = -scores * prediction_mask

    # Per-token rewards
    per_token = self.reward_brick.apply(
        prediction, prediction_mask, groundtruth, groundtruth_mask)
    rewards = per_token.sum(axis=-1)

    application_call.add_auxiliary_variable(log_probs, name='log_probs')

    # Encourage entropy by adding negated log-probs to the rewards
    if self.entropy_coof:
        rewards += self.entropy_coof * disconnected_grad(-log_probs)

    # Reverse cumulative sum: reward from each step until the end
    future_rewards = rewards[::-1].cumsum(axis=0)[::-1]

    baselines = self.value_prediction.apply(states)[:, :, 0]
    application_call.add_auxiliary_variable(baselines, name='baselines')

    # Baseline error (squared, masked, summed over the first axis)
    centered_future_rewards = future_rewards - baselines
    masked_error = centered_future_rewards * disconnected_grad(prediction_mask)
    baseline_errors = (masked_error**2).sum(axis=0)
    application_call.add_auxiliary_variable(baseline_errors,
                                            name='baseline_errors')

    # The gradient of this will be the REINFORCE 1-sample
    # gradient estimate
    weighted = (disconnected_grad(centered_future_rewards) * log_probs *
                prediction_mask)
    costs = weighted.sum(axis=0)

    # Auxiliary variables for intermediate steps of the computation
    application_call.add_auxiliary_variable(rewards, name='rewards')
    application_call.add_auxiliary_variable(log_probs.copy(),
                                            name='prediction_log_probs')
    return costs
def build_functions(self):
    """Build the model and compile ``self.value_fn`` and ``self.train_fn``.

    The training cost is the mean squared difference between the indexed
    state values and a target of reward plus discounted best next-state
    value. The next-state values are held constant through
    ``disconnected_grad``.
    """
    state = Input(shape=self.state_size)
    next_state = Input(shape=self.state_size)
    action = Input(shape=(1, ), dtype='int32')
    reward = Input(shape=(1, ), dtype='float32')
    terminal = Input(shape=(1, ), dtype='int32')

    self.build_model()
    self.value_fn = K.function([state], self.model(state))

    state_values = self.model(state)
    next_values = disconnected_grad(self.model(next_state))

    # Future value is zeroed on terminal transitions.
    best_next = next_values.max(axis=1, keepdims=True)
    future_value = (1 - terminal) * best_next
    target = reward + self.discount * future_value

    cost = ((state_values[:, action] - target)**2).mean()

    optimizer = RMSprop(0.0001)
    updates = optimizer.get_updates(self.model.trainable_weights, [], cost)
    self.train_fn = K.function(
        [state, next_state, action, reward, terminal],
        cost, updates=updates)
def build_functions(self):
    """Compile the value function and the training function.

    ``self.value_fn`` maps a state batch to the model output;
    ``self.train_fn`` takes ``[S, NS, A, R, T]`` and returns the cost while
    applying RMSprop updates to the model's trainable weights.
    """
    inputs = {
        'S': Input(shape=self.state_size),
        'NS': Input(shape=self.state_size),
        'A': Input(shape=(1,), dtype='int32'),
        'R': Input(shape=(1,), dtype='float32'),
        'T': Input(shape=(1,), dtype='int32'),
    }

    self.build_model()
    self.value_fn = K.function([inputs['S']], self.model(inputs['S']))

    values = self.model(inputs['S'])
    # Next-state values act as constants in the gradient.
    next_values = disconnected_grad(self.model(inputs['NS']))

    future_value = (1 - inputs['T']) * next_values.max(axis=1, keepdims=True)
    discounted_future_value = self.discount * future_value
    target = inputs['R'] + discounted_future_value
    cost = ((values[:, inputs['A']] - target)**2).mean()

    opt = RMSprop(0.0001)
    params = self.model.trainable_weights
    updates = opt.get_updates(params, [], cost)

    ordered_inputs = [inputs[key] for key in ('S', 'NS', 'A', 'R', 'T')]
    self.train_fn = K.function(ordered_inputs, cost, updates=updates)
def __init__(self, weights, neurons_topology,
             learning_rate=0.1, learning_rate_decay=0.985,
             collaboration_sigma=1.0, collaboration_sigma_decay=0.95,
             verbosity=2):
    """Set up the shared state and compile the neuron-update function.

    ``weights`` and ``neurons_topology`` become Theano shared variables
    (``W_shar_mat`` and ``D_shar_mat``). The cost is the squared distance
    of every neuron to the exemplar, weighted by a constant affinity
    ``exp(-topological_distance / sigma**2)`` to the closest neuron.
    """
    self._verbosity = verbosity
    self._history = []
    self.neurons_number = weights.shape[0]

    self.W_shar_mat = theano.shared(weights)
    self.D_shar_mat = theano.shared(neurons_topology)
    self.collaboration_sigma = theano.shared(collaboration_sigma)
    self.collaboration_sigma_decay = collaboration_sigma_decay

    self.x_row = T.vector("exemplar")
    self.x_mat = T.matrix("batch")

    self.learning_rate = theano.shared(learning_rate)
    self.learning_rate_decay = learning_rate_decay

    # Squared distance of every neuron (row of W) to the exemplar.
    difference = T.sub(self.W_shar_mat, self.x_row)
    self.distance_from_y_row = ((difference**2).sum(axis=1))
    self.closest_neuron_idx = T.argmin(self.distance_from_y_row)

    # Affinity of each neuron to the winner, from the topology matrix.
    self.distances_from_closest_neuron = self.D_shar_mat[
        self.closest_neuron_idx]
    sigma_squared = (self.collaboration_sigma)**2
    self.affinities_to_closest_neuron = T.exp(
        -self.distances_from_closest_neuron / sigma_squared)

    # Affinities weight the distances but are not differentiated.
    constant_affinities = G.disconnected_grad(
        self.affinities_to_closest_neuron)
    self.smoothed_distances_from_closest_neuron = T.mul(
        self.distance_from_y_row, constant_affinities)

    self.cost_scal = self.smoothed_distances_from_closest_neuron.sum()
    self.updates = sgd(self.cost_scal, [self.W_shar_mat],
                       learning_rate=self.learning_rate)
    self.update_neurons = theano.function([self.x_row],
                                          self.cost_scal,
                                          updates=self.updates)
def __init__(self, weights, neurons_topology, relaxing_factor=-0.5,
             **kwargs):
    """Add a winner-relaxing term to the parent cost and recompile.

    The relaxing member is the sum of the smoothed distances minus the
    winner's own entry. It is held constant through ``disconnected_grad``
    and multiplies the winner's weight row. ``self.updates`` and
    ``self.update_neurons`` are rebuilt from the extended cost.
    """
    super(WinnerRelaxingSOM, self).__init__(weights, neurons_topology,
                                            **kwargs)
    self.wr_relaxing_factor = relaxing_factor

    smoothed = self.smoothed_distances_from_closest_neuron
    self.wr_relaxing_member = (
        smoothed.sum() - smoothed[self.closest_neuron_idx])

    winner_weights = self.W_shar_mat[self.closest_neuron_idx]
    relaxing_term = T.mul(
        winner_weights, G.disconnected_grad(self.wr_relaxing_member)).sum()
    self.cost_scal += (self.wr_relaxing_factor * self.learning_rate *
                       relaxing_term)

    self.updates = sgd(self.cost_scal, [self.W_shar_mat],
                       learning_rate=self.learning_rate)
    self.update_neurons = theano.function([self.x_row],
                                          self.cost_scal,
                                          updates=self.updates)
def margin_sensitivity(inputs, logits, labels, num_outputs, ord=2):
    """Compute margin sensitivity (proposed regularization).

    For every example the output whose input-Jacobian differs most from the
    Jacobian of the labelled output is selected (with its gradient
    disconnected).  The result is the reduced difference between the input
    gradients of the labelled logit and of that selected logit.

    Parameters
    ----------
    inputs : symbolic tensor
        Network inputs.  The reshape below indexes ``inputs.shape[1..3]``,
        so a 4-D tensor is expected.
    logits : symbolic matrix
        Indexed as ``logits[batch_indices, labels]``.
    labels : symbolic integer vector
        One label index per example.
    num_outputs : int
        Forwarded to ``jacobian`` as ``num_outputs``.
    ord : {2, np.inf}
        ``2`` reduces with a sum of squares, ``np.inf`` with a sum of
        absolute values.

    Returns
    -------
    symbolic tensor
        The difference of the two gradients reduced over every axis except
        the first.

    Raises
    ------
    ValueError
        If ``ord`` is neither ``2`` nor ``np.inf``.
    """
    # Explicit check rather than ``assert`` so that it survives ``python -O``.
    if ord not in (2, np.inf):
        raise ValueError("ord must be 2 or np.inf, got %r" % (ord,))

    def _reduce(diff, axes):
        # Sum of squares for ord == 2, sum of absolute values for ord == inf.
        magnitude = diff ** 2 if ord == 2 else T.abs_(diff)
        return T.sum(magnitude, axis=axes)

    batch_size = inputs.shape[0]
    batch_indices = T.arange(batch_size)

    # shape (per the original author's note):
    # labels, batch, channels, height, width
    jac = jacobian(logits, inputs, num_outputs=num_outputs, pack_dim=0)

    # basically jac_labels = jac[labels, batch_indices]
    jac_flt = jac.reshape(
        (-1, inputs.shape[1], inputs.shape[2], inputs.shape[3]))
    jac_labels_flt = jac_flt[labels * batch_size + batch_indices]
    jac_labels = jac_labels_flt.reshape(inputs.shape)

    # Difference between every output's Jacobian and the labelled one.
    w = jac - T.shape_padaxis(jac_labels, axis=0)
    dist = _reduce(w, list(range(2, inputs.ndim + 1)))

    # The selected index is treated as a constant during differentiation.
    selected = gradient.disconnected_grad(T.argmax(dist, axis=0))

    corrects = logits[batch_indices, labels]
    others = logits[batch_indices, selected]
    corrects_grad = T.grad(corrects.sum(), inputs)
    others_grad = T.grad(others.sum(), inputs)

    return _reduce(corrects_grad - others_grad,
                   list(range(1, inputs.ndim)))
def cost(self, application_call, **kwargs):
    """Return the combined forward, backward and auxiliary L2 cost.

    ``inputs_mask``, ``labels`` and ``labels_mask`` are popped from
    ``kwargs``; everything that is left is handed to ``self.bottom.apply``.
    Two generators are evaluated: ``self.generators[0]`` on the labels as
    given and ``self.generators[1]`` on the time-reversed labels and
    encoder outputs.  The forward states are mapped through
    ``self.forward_to_backward`` and compared (squared error) against the
    backward states, which are wrapped in ``disconnected_grad`` so this
    term does not propagate gradient into them.

    Two auxiliary variables are registered on ``application_call``:
    ``l2_cost_aux`` and ``costs_forward_aux``.

    ``labels_mask`` may be ``None``; in that case no masking is applied to
    the states before the L2 comparison.

    Raises
    ------
    KeyError
        If ``inputs_mask``, ``labels`` or ``labels_mask`` is missing from
        ``kwargs``.
    """
    # pop inputs we know about
    inputs_mask = kwargs.pop('inputs_mask')
    labels = kwargs.pop('labels')
    labels_mask = kwargs.pop('labels_mask')
    # Compare with None explicitly: the truth value of a symbolic tensor is
    # not a reliable "is it provided" test.
    has_labels_mask = labels_mask is not None

    # the rest is for bottom
    bottom_processed = self.bottom.apply(**kwargs)
    encoded, encoded_mask = self.encoder.apply(input_=bottom_processed,
                                               mask=inputs_mask)
    encoded = self.top.apply(encoded)

    outs_forward = self.generators[0].evaluate(
        labels, labels_mask,
        attended=encoded, attended_mask=encoded_mask)
    costs_forward, states_forward, _, _, _, _ = outs_forward

    # The backward generator sees everything reversed along the first axis.
    outs_backward = self.generators[1].evaluate(
        labels[::-1],
        labels_mask[::-1] if has_labels_mask else None,
        attended=encoded[::-1], attended_mask=encoded_mask[::-1])
    costs_backward, states_backward, _, _, _, _ = outs_backward
    # Flip the results back so they line up with the forward pass.
    costs_backward = costs_backward[::-1]
    states_backward = states_backward[::-1]

    # Apply forward_to_backward on a 2-D view, then restore the shape.
    states_shape = states_forward.shape
    backward_predicted = self.forward_to_backward.apply(
        states_forward.reshape((states_shape[0] * states_shape[1], -1)))
    backward_predicted = backward_predicted.reshape(states_shape)

    # The backward states are a fixed target for the auxiliary cost.
    states_backward = gradient.disconnected_grad(states_backward)

    if has_labels_mask:
        broadcast_mask = labels_mask[:, :, None]
        backward_predicted = backward_predicted * broadcast_mask
        states_backward = states_backward * broadcast_mask

    l2_cost = ((backward_predicted - states_backward) ** 2).mean(axis=2)
    l2_cost.name = 'l2_cost_aux'
    application_call.add_auxiliary_variable(
        l2_cost.sum(axis=0).mean().copy(name='l2_cost_aux'))

    costs_forward_aux = (costs_forward.sum(axis=0).mean()).copy(
        name='costs_forward_aux')
    application_call.add_auxiliary_variable(costs_forward_aux)

    return costs_forward + costs_backward + 1.5 * l2_cost
def fast_gradient_perturbation(inputs, logits, labels=None, epsilon=0.3,
                               ord=np.inf):
    """Return an ``epsilon``-scaled perturbation along the loss gradient.

    The gradient of the summed categorical cross-entropy between ``logits``
    and ``labels`` is taken with respect to ``inputs`` and then shaped
    according to ``ord``:

    * ``np.inf`` -- the sign of the gradient,
    * ``1`` -- the gradient divided by its per-example sum of absolute
      values,
    * ``2`` -- the gradient divided by its per-example Euclidean norm.

    The result is multiplied by ``epsilon`` and wrapped in
    ``disconnected_grad``.

    NOTE(review): for ``ord`` 1 and 2 the normaliser has no epsilon guard,
    so an all-zero gradient yields a division by zero.

    Raises
    ------
    ValueError
        If ``labels`` is ``None`` or ``ord`` is not ``np.inf``, ``1`` or
        ``2``.
    """
    # Validate before building any part of the graph.
    if labels is None:
        raise ValueError("labels must be provided")
    if ord not in (np.inf, 1, 2):
        # Previously an unsupported ord fell through every branch and
        # failed later with an unbound local variable.
        raise ValueError("ord must be np.inf, 1 or 2, got %r" % (ord,))

    epsilon = floatX(epsilon)

    nll = categorical_crossentropy(logits, labels)
    grad = T.grad(nll.sum(), inputs, consider_constant=[labels])

    if ord == np.inf:
        perturbation = T.sgn(grad)
    else:
        # Normalise per example: reduce over every axis except the first.
        sum_ind = list(range(1, inputs.ndim))
        if ord == 1:
            norm = T.sum(T.abs_(grad), axis=sum_ind, keepdims=True)
        else:
            norm = T.sqrt(T.sum(grad ** 2, axis=sum_ind, keepdims=True))
        perturbation = grad / norm

    perturbation *= epsilon
    return gradient.disconnected_grad(perturbation)
def __step(img, prev_bbox, prev_att, state, prev_conf, prev_sugg, prev_W, prev_b, prev_pos, prev_neg, timestep):
    """Run one recurrent step: predict a box, score it, refit the classifier.

    The step
      1. runs a convolution over ``img`` and feeds the flattened features,
         the previous box, confidence and suggestion through a GRU-style
         update to obtain the new hidden state ``gru_h``;
      2. predicts ``bbox`` (tanh output) and ``att`` from ``gru_h``;
      3. scores the crop at ``bbox`` with the linear classifier
         ``(prev_W, prev_b)`` to obtain ``conf``;
      4. samples 17 boxes around ``bbox`` and averages those whose raw score
         is positive into the suggestion ``sugg``;
      5. writes the features of 9 "positive" and 8 "negative" samples into
         the slices of ``prev_pos`` / ``prev_neg`` selected by ``timestep``;
      6. refits ``(W, b)`` with a scan of at most 1000 gradient steps that
         stops once the loss drops below 0.01.

    Returns the tuple
    ``(bbox, att, gru_h, conf, sugg, new_W, new_b, pos, neg, timestep + 1)``.

    Many names used here (``img_col``, ``img_row``, ``NUM_N``,
    ``batch_size``, ``conv1_filters``, the GRU and fully connected
    parameters, ``batch_multicrop``, ``__filterbank`` ...) are not defined
    in this block and must be provided by the enclosing scope.
    """
    # Box centre in the box's own coordinates (the ``(c + 1) / 2`` rescaling
    # below suggests [-1, 1] -- TODO confirm).
    cx = (prev_bbox[:, 2] + prev_bbox[:, 0]) / 2.
    cy = (prev_bbox[:, 3] + prev_bbox[:, 1]) / 2.
    sigma = TT.exp(prev_att[:, 0]) * (max(img_col, img_row) / 2)
    fract = TT.exp(prev_att[:, 1])
    amplifier = TT.exp(prev_att[:, 2])

    eps = 1e-8

    # Centre and stride rescaled by the image size.
    abs_cx = (cx + 1) / 2. * (img_col - 1)
    abs_cy = (cy + 1) / 2. * (img_row - 1)
    abs_stride = (fract * (max(img_col, img_row) - 1)) * ((1. / (NUM_N - 1.)) if NUM_N > 1 else 0)

    FX, FY = __filterbank(abs_cx, abs_cy, abs_stride, sigma)
    unnormalized_mask = (FX.dimshuffle(0, 'x', 1, 'x', 2) * FY.dimshuffle(0, 1, 'x', 2, 'x')).sum(axis=2).sum(axis=1)
    # Normalisation is currently disabled; the disabled expression was:
    #   / (unnormalized_mask.sum(axis=2).sum(axis=1) + eps).dimshuffle(0, 'x', 'x')
    mask = unnormalized_mask

    # NOTE(review): ``mask`` (and ``amplifier`` / ``eps``) are not used below;
    # the image goes into the convolution unmasked.
    masked_img = img

    conv1 = conv2d(masked_img, conv1_filters, subsample=(conv1_stride, conv1_stride))
    act1 = TT.tanh(conv1)
    flat1 = TT.reshape(act1, (-1, conv1_output_dim))

    # GRU-style recurrent update.
    gru_in = TT.concatenate([flat1, prev_bbox, prev_conf.reshape((batch_size, 1)), prev_sugg], axis=1)
    gru_z = NN.sigmoid(TT.dot(gru_in, Wz) + TT.dot(state, Uz) + bz)
    gru_r = NN.sigmoid(TT.dot(gru_in, Wr) + TT.dot(state, Ur) + br)
    gru_h_ = TT.tanh(TT.dot(gru_in, Wg) + TT.dot(gru_r * state, Ug) + bg)
    gru_h = (1 - gru_z) * state + gru_z * gru_h_

    bbox = TT.tanh(TT.dot(gru_h, W_fc2) + b_fc2)
    att = TT.dot(gru_h, W_fc3) + b_fc3

    def batch_dot(a, b):
        # Per-example matrix product written with broadcasting and a sum.
        return (a.dimshuffle(0, 1, 2, 'x') * b.dimshuffle(0, 'x', 1, 2)).sum(axis=2)

    def bounding(bbox):
        # NOTE(review): columns 0 and 2 are only bounded below by -1 and
        # columns 1 and 3 only above by 1 -- confirm that this asymmetric
        # clipping is intended.
        return TT.stack([TT.maximum(bbox[:, 0], -1),
                         TT.minimum(bbox[:, 1], 1),
                         TT.maximum(bbox[:, 2], -1),
                         TT.minimum(bbox[:, 3], 1)], axis=1)

    def sample_positives(bbox):
        # The box itself plus eight variants with one edge moved by a
        # quarter of the box extent (9 boxes).
        x0 = bbox[:, 0]
        y0 = bbox[:, 1]
        x1 = bbox[:, 2]
        y1 = bbox[:, 3]
        return TT.stack([bounding(TT.as_tensor([x0, y0, x1, y1]).T),
                         bounding(TT.as_tensor([x0 * 0.75 + x1 * 0.25, y0, x1, y1]).T),
                         bounding(TT.as_tensor([x0, y0 * 0.75 + y1 * 0.25, x1, y1]).T),
                         bounding(TT.as_tensor([x0, y0, x1 * 0.75 + x0 * 0.25, y1]).T),
                         bounding(TT.as_tensor([x0, y0, x1, y1 * 0.75 + y0 * 0.25]).T),
                         bounding(TT.as_tensor([x0 * 1.25 - x1 * 0.25, y0, x1, y1]).T),
                         bounding(TT.as_tensor([x0, y0 * 1.25 - y1 * 0.25, x1, y1]).T),
                         bounding(TT.as_tensor([x0, y0, x1 * 1.25 - x0 * 0.25, y1]).T),
                         bounding(TT.as_tensor([x0, y0, x1, y1 * 1.25 - y0 * 0.25]).T),
                         ], axis=1)

    def sample_negatives(bbox):
        # Eight variants with edges moved by half of the box extent.
        x0 = bbox[:, 0]
        y0 = bbox[:, 1]
        x1 = bbox[:, 2]
        y1 = bbox[:, 3]
        return TT.stack([bounding(TT.as_tensor([x0 * 0.5 + x1 * 0.5, y0, x1, y1]).T),
                         bounding(TT.as_tensor([x0, y0 * 0.5 + y1 * 0.5, x1, y1]).T),
                         bounding(TT.as_tensor([x0, y0, x1 * 0.5 + x0 * 0.5, y1]).T),
                         bounding(TT.as_tensor([x0, y0, x1, y1 * 0.5 + y0 * 0.5]).T),
                         bounding(TT.as_tensor([x0 * 1.5 - x1 * 0.5, y0, x1 * 0.5 + x0 * 0.5, y1]).T),
                         bounding(TT.as_tensor([x0, y0 * 1.5 - y1 * 0.5, x1, y1 * 0.5 + y0 * 0.5]).T),
                         bounding(TT.as_tensor([x0 * 0.5 + x1 * 0.5, y0, x1 * 1.5 - x0 * 0.5, y1]).T),
                         bounding(TT.as_tensor([x0, y0 * 0.5 + y1 * 0.5, x1, y1 * 1.5 - y0 * 0.5]).T),
                         ], axis=1)

    def sample_around(bbox):
        # Positives first, negatives second: 9 + 8 = 17 boxes per example.
        return TT.concatenate([sample_positives(bbox), sample_negatives(bbox)], axis=1)

    # Confidence of the predicted box under the previous classifier.
    crop = batch_multicrop(bbox.dimshuffle(0, 'x', 1), img)
    feat = conv2d(crop.reshape((batch_size, 1, img_row, img_col)), conv1_filters, subsample=(conv1_stride, conv1_stride)).reshape((batch_size, 1, -1))
    conf = NN.sigmoid(batch_dot(feat, prev_W) + TT.addbroadcast(prev_b, 1))

    # Suggestion: mean of the sampled boxes whose raw score is positive.
    nr_samples = 17
    sugg_bbox = sample_around(bbox)     # (batch_size, nr_samples, 4)
    sugg_crop = batch_multicrop(sugg_bbox, img)
    sugg_feat = conv2d(sugg_crop.reshape((batch_size * nr_samples, 1, img_row, img_col)), conv1_filters, subsample=(conv1_stride, conv1_stride)).reshape((batch_size, nr_samples, -1))
    sugg_conf = batch_dot(sugg_feat, prev_W) + TT.addbroadcast(prev_b, 1)
    # Debug output of the dtypes, written as calls so the block is valid on
    # both Python 2 and Python 3.
    print(sugg_conf.dtype)
    sugg_pos = TT.cast(sugg_conf > 0, T.config.floatX)
    print(sugg_pos.dtype)
    # NOTE(review): when no sample of an example scores > 0 the denominator
    # below is zero.
    sugg = TG.disconnected_grad((sugg_bbox * TT.patternbroadcast(sugg_pos, [False, False, True])).sum(axis=1) / TT.patternbroadcast(sugg_pos.sum(axis=1), [False, True]))

    def classify(x, W, b):
        # x: (batch_size, samples_per_batch, feature_per_sample)
        return NN.sigmoid(batch_dot(x, W) + TT.addbroadcast(b, 1))

    def update_step(W, b, x, y, alpha=1):
        # One gradient step on the mean squared error; the scan stops early
        # once the loss falls below 0.01.
        y_hat = classify(x, W, b)
        loss = ((y_hat - y) ** 2).mean()
        g = T.grad(loss, [W, b])
        return (W - alpha * g[0], b - alpha * g[1], loss), T.scan_module.until(loss < 0.01)

    # Store this step's positive sample features in their timestep slot.
    nr_samples = 9
    pos_bbox = sample_positives(bbox)
    pos_crop = batch_multicrop(pos_bbox, img)
    pos_feat = conv2d(pos_crop.reshape((batch_size * nr_samples, 1, img_row, img_col)), conv1_filters, subsample=(conv1_stride, conv1_stride)).reshape((batch_size, nr_samples, -1))
    pos = TG.disconnected_grad(TT.set_subtensor(prev_pos[:, (nr_samples*timestep):(nr_samples*(timestep+1))], pos_feat))

    # Same for the negative sample features.
    nr_samples = 8
    neg_bbox = sample_negatives(bbox)
    neg_crop = batch_multicrop(neg_bbox, img)
    neg_feat = conv2d(neg_crop.reshape((batch_size * nr_samples, 1, img_row, img_col)), conv1_filters, subsample=(conv1_stride, conv1_stride)).reshape((batch_size, nr_samples, -1))
    neg = TG.disconnected_grad(TT.set_subtensor(prev_neg[:, (nr_samples*timestep):(nr_samples*(timestep+1))], neg_feat))

    # Refit the classifier.  Targets are +1 for the positive features and -1
    # for the negative ones.
    # NOTE(review): the slices ``:9*timestep`` / ``:8*timestep`` stop before
    # the features written for the current timestep -- confirm intended.
    update_scan, _ = T.scan(fn=update_step,
                            outputs_info=[prev_W, prev_b, None],
                            non_sequences=[TT.concatenate([pos[:, :9*timestep], neg[:, :8*timestep]], axis=1),
                                           TT.concatenate([TT.ones((batch_size, 9*timestep, 1)), -TT.ones((batch_size, 8*timestep, 1))], axis=1)],
                            n_steps=1000)
    # The refitted W is wrapped in disconnected_grad and b in zero_grad.
    new_W, new_b = TG.disconnected_grad(update_scan[0][-1]), TG.zero_grad(update_scan[1][-1])

    return bbox, att, gru_h, TT.unbroadcast(conf, 1), sugg, new_W, TT.unbroadcast(new_b, 1), pos, neg, timestep + 1
def dg2(x):
    """Return ``x`` wrapped in ``disconnected_grad`` twice."""
    once = disconnected_grad(x)
    twice = disconnected_grad(once)
    return twice
def approx_grad(self, Xvec, mcw):
    """Return the Jacobian of a re-estimated parameter vector w.r.t. ``Xvec``.

    ``Xvec`` is reshaped to ``(-1, self.ndim)``.  Component weights come
    from a softmax over the component-wise log-probabilities, and weighted
    means and covariances are re-estimated from them.  The result is the
    first element of ``jacobian(...)`` of the concatenated
    ``(means, covars, weights)`` vector, with ``mcw``, ``Xvec``, the softmax
    weights and their sums passed as ``consider_constant``.
    """
    X = Xvec.reshape((-1, self.ndim))
    means, covars, weights, _ = self.split_params(mcw)
    log_prob = calc_log_prob_gmm_componetwise(X, means, covars, weights)
    w = T.nnet.softmax(log_prob)
    s_w = T.sum(w, 0)
    # The 0.0001 keeps the denominators away from zero.
    w_means = T.sum(w[:, :, None] * X[:, None, :], 0) / (s_w[:, None] + 0.0001)
    w_covars = T.sum(w[:, :, None] * ((w_means[None, :, :] - X[:, None, :]) ** 2), 0) / (s_w[:, None] + 0.0001)
    w_mcw = T.concatenate((w_means.flatten(), w_covars.flatten(), weights))
    return jacobian(w_mcw, [Xvec], consider_constant=[mcw, Xvec, w, s_w])[0]


def grad(self, inputs, output_grads):
    """Return the gradient of this op with respect to its single input.

    ``inputs`` must be a one-element sequence ``(Yvec,)``.  It used to be
    unpacked in the signature, which is a syntax error on Python 3
    (PEP 3113); unpacking in the body behaves the same for positional
    callers.

    With ``self.use_approx_grad`` set, the Jacobian from ``approx_grad`` is
    used.  Otherwise a linear system is built and solved, and every row of
    the solution except the last is used, wrapped in ``disconnected_grad``.
    """
    (Yvec,) = inputs
    Yvec = gradient.disconnected_grad(Yvec)
    mcw_vec = GMMOp(self.gm_num, self.ndim, self.gmm)(Yvec)
    if self.use_approx_grad:
        return [output_grads[0].dot(self.approx_grad(Yvec, mcw_vec))]
    else:
        lam = Yvec.shape[0] // self.ndim
        mcwl_vec = T.concatenate((mcw_vec, lam.reshape((1,))))
        N, M = self.build_linear_system(Yvec, mcwl_vec)
        dX = self.solve_linear_system(N, M)
        return [output_grads[0].dot(gradient.disconnected_grad(dX[0:dX.shape[0] - 1, :]))]


def get_gmm(X, gm_num, ndims, use_approx_grad=False, covariance_type='diag'):
    # NOTE(review): only the single-component branch is visible here and
    # nothing is returned; the rest of this function appears to be missing.
    if gm_num == 1:
        # One component: column-wise mean and variance (plus 1e-8), weight 1.
        means = T.mean(X, 0).reshape((1, -1))
        covars = (T.std(X, 0) ** 2).reshape((1, -1)) + 1e-8
        weights = T.ones(1)
def encode(self, belief_t, degree_t, intent_t, masked_source_t,
           masked_source_len_t, masked_target_t, masked_target_len_t,
           utt_group_t, sample_t=None):
    """Build the prior, the posterior and the sampled action embedding.

    ``belief_t`` is concatenated along axis 0 and wrapped in
    ``disconnected_grad``.  The index ``z_t`` is ``utt_group_t`` whenever
    ``utt_group_t < self.dl - 1``.  Otherwise it is a multinomial sample
    from the posterior (wrapped in ``disconnected_grad``) when
    ``self.sample_mode == 'posterior'``, or the supplied ``sample_t`` in any
    other mode.

    Returns
    -------
    tuple
        ``(actEmb_t, prior_t[0], posterior_t[0], z_t, b_t, posterior_t)``,
        where ``b_t`` is the output of ``self.baseline.encode``.
    """
    # prepare belief state vector
    belief_t = G.disconnected_grad(T.concatenate(belief_t, axis=0))

    # ---- prior parameterisation ----
    hidden_t = T.tanh(
        T.dot(belief_t, self.Ws1) +
        T.dot(degree_t, self.Ws2) +
        T.dot(intent_t, self.Ws3))
    prior_t = T.nnet.softmax(
        T.dot(T.tanh(T.dot(hidden_t, self.Wp1) + self.bp1), self.Wp2))

    # ---- posterior parameterisation ----
    # response encoding
    target_intent_t = bidirectional_encode(
        self.tfEncoder, self.tbEncoder,
        masked_target_t, masked_target_len_t)
    source_intent_t = bidirectional_encode(
        self.sfEncoder, self.sbEncoder,
        masked_source_t, masked_source_len_t)

    # scores before softmax layer
    q_logit_t = T.dot(
        T.tanh(
            T.dot(belief_t, self.Wq1) +
            T.dot(degree_t, self.Wq2) +
            T.dot(source_intent_t, self.Wq3) +
            T.dot(target_intent_t, self.Wq4)),
        self.Wq5)

    # The posterior is computed once; it is used for sampling (posterior
    # mode) and is also returned.
    posterior_t = T.nnet.softmax(q_logit_t)

    use_ground_truth = T.lt(utt_group_t, self.dl - 1)
    if self.sample_mode == 'posterior':
        # sampling from a scaled posterior
        print('\t\tSampling from posterior ...')
        sampled = G.disconnected_grad(
            T.argmax(
                self.srng.multinomial(
                    pvals=posterior_t, dtype='float32')[0]))
        z_t = T.switch(use_ground_truth, utt_group_t, sampled)
    else:
        # choose to use the current sample or ground truth
        print('\t\tSampling from prior ...')
        z_t = T.switch(use_ground_truth, utt_group_t, sample_t)

    # put sample into decoder to decode
    hidden_t = T.nnet.sigmoid(self.Wd2[z_t, :] + self.bd1) * hidden_t
    actEmb_t = T.tanh(T.dot(
        T.concatenate([T.tanh(self.Wd1[z_t, :]), hidden_t], axis=0),
        self.Wd3)).dimshuffle('x', 0)

    # compute baseline estimate
    b_t = self.baseline.encode(
        belief_t, degree_t, source_intent_t, target_intent_t)

    return actEmb_t, prior_t[0], posterior_t[0], z_t, b_t, posterior_t