def vtovMBall(self, VsampM):
    """
    computes visible unit outputs given visible unit inputs (single MCMC iteration)
    multiple parallel MCMC iterations using rows of the input matrix

    args:
        VsampM (T.matrix): rows of this matrix are visible unit inputs

    return:
        ahtovMBres (T.matrix): rows of this matrix are visible unit outputs after a single MCMC iteration
    """
    # v to h part
    aVomg = T.matrix(name="Vomg", dtype=theano.config.floatX)
    avtohMBres = T.matrix(name="vtohMBres", dtype=theano.config.floatX)
    aT_HP = T.matrix(name="T_HP", dtype=theano.config.floatX)

    aVomg = T.dot(T.mul(T.fill(VsampM, T.exp(-self.T_z)), VsampM), self.T_omega)
    aT_Hp = T.nnet.ultra_fast_sigmoid(T.fill(aVomg, self.T_b) + aVomg)
    avtohMBres = self.T_rng.binomial(size=aT_Hp.shape, p=aT_Hp,
                                     dtype=theano.config.floatX)

    # h to v part:
    aT_omgH = T.matrix(name="T_omgH", dtype=theano.config.floatX)
    aT_means = T.matrix(name="T_means", dtype=theano.config.floatX)
    ahtovMBres = T.matrix(name="htovMBres", dtype=theano.config.floatX)

    aT_omgH = T.transpose(T.dot(self.T_omega, T.transpose(avtohMBres)))
    aT_means = T.fill(aT_omgH, self.T_a) + aT_omgH
    ahtovMBres = self.T_rng.normal(size=aT_means.shape, avg=aT_means,
                                   std=T.fill(aT_means, T.sqrt(T.exp(self.T_z))),
                                   dtype=theano.config.floatX)

    return [ahtovMBres, avtohMBres, aT_Hp, aT_means]
def test_exp_over_1_plus_exp(self):
    m = self.get_mode(excluding=['local_elemwise_fusion'])
    x = T.dvector()

    # tests exp_over_1_plus_exp
    f = theano.function([x], T.exp(x) / (1 + T.exp(x)), mode=m)
    theano.printing.debugprint(f)
    assert [node.op for node in f.maker.env.toposort()] == [sigmoid]

    # tests inv_1_plus_exp
    f = theano.function([x], T.fill(x, 1.0) / (1 + T.exp(-x)), mode=m)
    theano.printing.debugprint(f)
    assert [node.op for node in f.maker.env.toposort()] == [sigmoid]

    # tests inv_1_plus_exp with neg
    f = theano.function([x], T.fill(x, -1.0) / (1 + T.exp(-x)), mode=m)
    assert [node.op for node in f.maker.env.toposort()] == [sigmoid,
                                                            theano.tensor.inplace.neg_inplace]

    # tests double inv_1_plus_exp with neg
    # (-1)(exp(x)) / (1+exp(x))(1+exp(-x))
    # = (-1)/(1+exp(-x)) * exp(x)/(1+exp(x))
    # = - (sigm(x) * sigm(x))
    f = theano.function([x], (T.fill(x, -1.0) * T.exp(x)) /
                        ((1 + T.exp(x)) * (1 + T.exp(-x))), mode=m)
    theano.printing.debugprint(f)
    assert [node.op for node in f.maker.env.toposort()] == [sigmoid, T.mul,
                                                            theano.tensor.inplace.neg_inplace]
def _gen_exprs(self, inpt):
    """Return the expressions of the recognition model."""
    P = self.parameters.gen
    n_layers = len(self.n_hiddens_gen)
    hidden_to_hiddens = [getattr(P, 'hidden_to_hidden_%i' % i)
                         for i in range(n_layers - 1)]
    hidden_biases = [getattr(P, 'hidden_bias_%i' % i)
                     for i in range(n_layers)]
    initial_hidden_means = [getattr(P, 'initial_hidden_means_%i' % i)
                            for i in range(n_layers)]
    initial_hidden_vars = [getattr(P, 'initial_hidden_vars_%i' % i) ** 2 + 1e-4
                           for i in range(n_layers)]
    recurrents = [getattr(P, 'recurrent_%i' % i) for i in range(n_layers)]

    p_dropout_inpt = T.zeros_like(inpt[:, :, :self.n_latent])
    p_dropout_inpt = T.fill(p_dropout_inpt, self.p_dropout_inpt)

    p_dropout_shortcut = T.zeros_like(inpt[:, :, self.n_latent:])
    p_dropout_shortcut = T.fill(p_dropout_shortcut, self.p_dropout_inpt)

    p_dropout_inpt = T.concatenate([p_dropout_inpt, p_dropout_shortcut], axis=2)

    p_dropouts = [p_dropout_inpt] + self.p_dropout_hiddens
    if self.p_dropout_hidden_to_out is None:
        p_dropouts.append(self.p_dropout_hiddens[-1])
    else:
        p_dropouts.append(self.p_dropout_hidden_to_out)

    exprs = vprnn.exprs(
        inpt, T.zeros_like(inpt), P.in_to_hidden, hidden_to_hiddens,
        P.hidden_to_out, hidden_biases, [1 for _ in hidden_biases],
        initial_hidden_means, initial_hidden_vars, recurrents, P.out_bias,
        1, self.gen_transfers, self.assumptions.statify_visible,
        p_dropouts=p_dropouts)

    return exprs
def dlogp(inputs, gradients):
    g_logp, = gradients
    cov, delta = inputs
    g_logp.tag.test_value = floatX(1.)
    n, k = delta.shape

    chol_cov = cholesky(cov)
    diag = tt.nlinalg.diag(chol_cov)
    ok = tt.all(diag > 0)

    chol_cov = tt.switch(ok, chol_cov, tt.fill(chol_cov, 1))
    delta_trans = solve_lower(chol_cov, delta.T).T

    inner = n * tt.eye(k) - tt.dot(delta_trans.T, delta_trans)
    g_cov = solve_upper(chol_cov.T, inner)
    g_cov = solve_upper(chol_cov.T, g_cov.T)

    tau_delta = solve_upper(chol_cov.T, delta_trans.T)
    g_delta = tau_delta.T

    g_cov = tt.switch(ok, g_cov, -np.nan)
    g_delta = tt.switch(ok, g_delta, -np.nan)

    return [-0.5 * g_cov * g_logp, -g_delta * g_logp]
def _step2(ctx_, state_, hs_, Cs_):
    hs, Cs = [], []
    token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
    msk_ = tensor.fill((tensor.zeros_like(token_idxs, dtype="float32")), 1)
    msk_ = msk_.dimshuffle('x', 0)
    state_below0 = self.de_lookuptable[token_idxs].reshape(
        (1, encoderInputs.shape[1], self.de_hidden_size))
    for i, lstm in enumerate(self.decoder_lstm_layers):
        h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])  # mind msk
        hs += h[-1],
        Cs += C[-1],
        state_below0 = h

    hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(Cs)
    state_below0 = state_below0.reshape(
        (encoderInputs.shape[1], self.de_hidden_size))
    state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)

    newpred = tensor.dot(state_below0, self.linear)
    state_below = tensor.nnet.softmax(newpred)
    return state_below, hs, Cs
def likelihoodFunc(v, size, graphMatrix, value):
    v = tt.fill(tt.ones(size), v)  # create a vector of the null Bernoulli probs
    penalty = tt.dot(graphMatrix, value)  # matrix multiplication to get how many correlated features are "on"
    penalty = 5 - 4 * tt.exp(-penalty)
    v = v * penalty
    ll = tt.sum(tt.log(tt.pow(v, value)))
    return ll
def _step2(ctx_, state_, hs_, Cs_):
    # print ctx_.shape, state_.shape, hs_.shape, Cs_.shape
    hs, Cs = [], []
    token_idxs = T.cast(state_.argmax(axis=-1), "int32")
    msk_ = T.fill((T.zeros_like(token_idxs, dtype="float32")), 1)
    msk_ = msk_.dimshuffle('x', 0)
    state_below0 = self.de_lookuptable[token_idxs].reshape(
        (1, ctx_.shape[0], self.de_hidden_size))
    for i, lstm in enumerate(self.decoder_lstm_layers):
        h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])  # mind msk
        hs += h[-1],
        Cs += C[-1],
        state_below0 = h

    hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs)
    state_below0 = state_below0.reshape((ctx_.shape[0], self.de_hidden_size))
    state_below0 = T.concatenate([ctx_, state_below0], axis=1)

    newpred = T.dot(state_below0, self.linear) + self.linear_bias[None, :]
    state_below = T.nnet.softmax(newpred)

    extra_p = T.zeros_like(hs[:, :, 0])
    state_below = T.concatenate([state_below, extra_p.T], axis=1)

    return state_below, hs, Cs
def _step2(ctx_, state_, hs_, Cs_):
    ### ctx_: b x h
    ### state_: b x h
    ### hs_: 1 x b x h    the first dimension is the number of the decoder layers
    ### Cs_: 1 x b x h    the first dimension is the number of the decoder layers
    hs, Cs = [], []
    token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
    msk_ = tensor.fill((tensor.zeros_like(token_idxs, dtype="float32")), 1)
    msk_ = msk_.dimshuffle('x', 0)
    state_below0 = self.de_lookuptable[token_idxs].reshape(
        (1, ctx_.shape[0], self.de_hidden_size))
    for i, lstm in enumerate(self.decoder_lstm_layers):
        h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])  # mind msk
        hs += h[-1],
        Cs += C[-1],
        state_below0 = h

    hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(Cs)
    state_below0 = state_below0.reshape((ctx_.shape[0], self.de_hidden_size))
    state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)

    newpred = tensor.dot(state_below0, self.linear) + self.linear_bias[None, :]
    state_below = tensor.nnet.softmax(newpred)

    ##### the beginning symbol probability is 0
    extra_p = tensor.zeros_like(hs[:, :, 0])
    state_below = tensor.concatenate([state_below, extra_p.T], axis=1)

    return state_below, hs, Cs
def output_probabilistic(self, m_x, v_x):
    m_linear = T.dot(m_x, self.m_W[0, :, :]) + T.tile(self.m_b[0, :, :], [m_x.shape[0], 1])
    v_linear = T.dot(m_x**2, self.v_W[0, :, :]) + T.dot(v_x, self.m_W[0, :, :]**2) + \
        T.dot(v_x, self.v_W[0, :, :]) + T.tile(self.v_b[0, :, :], [m_x.shape[0], 1])

    if not self.output_layer:
        # We compute the mean and variance after the ReLU activation
        alpha = m_linear / T.sqrt(v_linear)
        gamma = Network_layer.gamma(-alpha)
        gamma_robust = -alpha - 1.0 / alpha + 2.0 / alpha**3
        gamma_final = T.switch(T.lt(-alpha, T.fill(alpha, 30)), gamma, gamma_robust)

        v_aux = m_linear + T.sqrt(v_linear) * gamma_final

        m_a = Network_layer.n_cdf(alpha) * v_aux
        v_a = m_a * v_aux * Network_layer.n_cdf(-alpha) + \
            Network_layer.n_cdf(alpha) * v_linear * (1 - gamma_final * (gamma_final + alpha))

        return (m_a, v_a)
    else:
        return (m_linear, v_linear)
def _FindB_best(lPLcl, lPprev, dVLcl):
    srtLcl = tensor.argsort(-lPLcl)
    srtLcl = srtLcl[:beam_size]
    deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.))
    deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
    lProbBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                       lPLcl[srtLcl] + lPprev, deltaVec)
    xWIdxBest = ifelse(tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                       srtLcl, tensor.zeros_like(srtLcl))
    return lProbBest, xWIdxBest
def transform_targets(targets):
    """Transform targets into a format suitable for passing to cost()."""
    reshaped = T.shape_padleft(targets)
    blanks = T.fill(reshaped, _BLANK)
    result = T.concatenate([blanks, reshaped]).dimshuffle(1, 0, 2).reshape(
        (2 * targets.shape[0], targets.shape[1]))
    result = T.concatenate([result, T.shape_padleft(result[0])])
    return result
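# The CTC-style layout that transform_targets builds (a blank before every label plus a
# trailing blank) can be checked with a small NumPy mirror. This is only an illustration:
# _BLANK = 0 is a hypothetical blank index, not taken from the original code.
import numpy as np

_BLANK = 0  # hypothetical blank index, for illustration only

def transform_targets_np(targets):
    # targets: (target_len, batch) integer label matrix
    reshaped = targets[None, ...]                     # mirrors T.shape_padleft
    blanks = np.full_like(reshaped, _BLANK)           # mirrors T.fill(reshaped, _BLANK)
    interleaved = np.concatenate([blanks, reshaped])  # (2, target_len, batch)
    interleaved = interleaved.transpose(1, 0, 2).reshape(2 * targets.shape[0],
                                                         targets.shape[1])
    return np.concatenate([interleaved, interleaved[:1]])  # append one more blank row

targets = np.array([[1, 4],
                    [2, 5],
                    [3, 6]])   # 3 labels, batch of 2
print(transform_targets_np(targets).T)
# [[0 1 0 2 0 3 0]
#  [0 4 0 5 0 6 0]]   -> blank, l1, blank, l2, blank, l3, blank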
def energyFnMB(self, VM, HM):
    """
    evaluates the energy functions of the RBM given row vector(s) of v and h

    args:
        VM (T.matrix): rows of visible layer values
        HM (T.matrix): rows of hidden layer values

    return:
        a row Theano vector, elements being E(v_row, h_row)
    """
    T_bh = T.dot(HM, self.T_b)
    T_omghv = T.transpose(T.sum(T.mul(T.dot(T.mul(T.fill(VM, T.exp(-self.T_z)), VM),
                                            self.T_omega), HM),
                                axis=1, acc_dtype=theano.config.floatX))
    T_Vsqr = T.mul(VM - T.fill(VM, self.T_a), VM - T.fill(VM, self.T_a))
    T_VsqrOmg = T.transpose(T.sum(T.mul(T.fill(T_Vsqr, np.float32(0.5) * T.exp(-self.T_z)), T_Vsqr),
                                  axis=1, acc_dtype=theano.config.floatX))
    return -T_VsqrOmg + T_omghv + T_bh
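# A minimal NumPy sketch of the same row-wise value returned by energyFnMB, assuming
# (as the snippet suggests) that T_a, T_b, T_omega and T_z are the visible bias, hidden
# bias, weight matrix and per-visible log-variance of a Gaussian-Bernoulli RBM. This is
# only a mirror for inspection, not part of the original class.
import numpy as np

def energy_np(VM, HM, a, b, omega, z):
    """One scalar per (v_row, h_row) pair, mirroring energyFnMB's return value."""
    inv_var = np.exp(-z)                                  # exp(-z) plays the role of 1/sigma^2
    bh = HM @ b                                           # hidden bias term
    omghv = ((inv_var * VM) @ omega * HM).sum(axis=1)     # visible-hidden coupling
    vsqr_omg = (0.5 * inv_var * (VM - a) ** 2).sum(axis=1)  # quadratic visible term
    return -vsqr_omg + omghv + bh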
def my_crf_accuracy(energies):
    assert energies.ndim == 4

    def inner_function(energies_one_step, prior_pi, prior_pointer):
        """
        :param energies_one_step: [batch_size, t, t]
        :param prior_pi: [batch_size, t]
        :param prior_pointer: [batch_size, t]
        :return:
        """
        prior_pi_shuffled = prior_pi.dimshuffle(0, 1, 'x')
        pi_t = T.max(prior_pi_shuffled + energies_one_step, axis=1)
        pointer_t = T.argmax(prior_pi_shuffled + energies_one_step, axis=1)
        return [pi_t, pointer_t]

    def back_pointer(pointer, pointer_tp1):
        """
        :param pointer: [batch, t]
        :param pointer_tp1: [batch,]
        :return:
        """
        return pointer[T.arange(pointer.shape[0]), pointer_tp1]

    # Input should be provided as (n_batch, n_time_steps, num_labels, num_labels)
    # but scan requires the iterable dimension to be first
    # So, we need to dimshuffle to (n_time_steps, n_batch, num_labels, num_labels)
    energies_shuffled = energies.dimshuffle(1, 0, 2, 3)
    # pi at time 0 is the last row at time 0, but we need to remove the last column, which is the pad symbol.
    pi_time0 = energies_shuffled[0, :, -1, :-1]

    # the last row and column is the tag for the pad symbol; reduce these two dimensions by 1 to remove that.
    # now the shape of energies_shuffled is [n_time_steps, n_batch, t, t] where t = num_labels - 1.
    energies_shuffled = energies_shuffled[:, :, :-1, :-1]

    initials = [pi_time0, T.cast(T.fill(pi_time0, -1), 'int64')]

    [pis, pointers], _ = theano.scan(fn=inner_function, outputs_info=initials,
                                     sequences=[energies_shuffled[1:]])
    pi_n = pis[-1]
    pointer_n = T.argmax(pi_n, axis=1)

    back_pointers, _ = theano.scan(fn=back_pointer, outputs_info=pointer_n,
                                   sequences=[pointers], go_backwards=True)

    # prediction shape [batch_size, length]
    prediction_revered = T.concatenate(
        [pointer_n.dimshuffle(0, 'x'), back_pointers.dimshuffle(1, 0)], axis=1)
    prediction = prediction_revered[:, T.arange(prediction_revered.shape[1] - 1, -1, -1)]
    return prediction
def __init__(self, mean, var, rng=None):
    self.mean = mean
    # This allows using var with shape (1, 1, n)
    self.var = T.fill(mean, var)
    self.stt = T.concatenate((mean, self.var), -1)
    self.maximum = self.mean
    super(DiagGauss, self).__init__(rng)
def chain_crf_loss(energies, targets, masks):
    """
    compute minus log likelihood of chain crf as chain crf loss.
    :param energies: Theano 4D tensor
        energies of each step. the shape is [batch_size, n_time_steps, num_labels, num_labels],
        where the pad label index is at last.
    :param targets: Theano 2D tensor
        targets in the shape [batch_size, n_time_steps]
    :param masks: Theano 2D tensor
        masks in the shape [batch_size, n_time_steps]
    :return: Theano 1D tensor
        an expression for minus log likelihood loss.
    """
    assert energies.ndim == 4
    assert targets.ndim == 2
    assert masks.ndim == 2

    def inner_function(energies_one_step, targets_one_step, mask_one_step,
                       prior_partition, prev_label, tg_energy):
        """
        :param energies_one_step: [batch_size, t, t]
        :param targets_one_step: [batch_size]
        :param prior_partition: [batch_size, t]
        :param prev_label: [batch_size]
        :param tg_energy: [batch_size]
        :return:
        """
        partition_shuffled = prior_partition.dimshuffle(0, 1, 'x')
        partition_t = T.switch(mask_one_step.dimshuffle(0, 'x'),
                               theano_logsumexp(energies_one_step + partition_shuffled, axis=1),
                               prior_partition)

        return [partition_t, targets_one_step,
                tg_energy + energies_one_step[T.arange(energies_one_step.shape[0]),
                                              prev_label, targets_one_step]]

    # Input should be provided as (n_batch, n_time_steps, num_labels, num_labels)
    # but scan requires the iterable dimension to be first
    # So, we need to dimshuffle to (n_time_steps, n_batch, num_labels, num_labels)
    energies_shuffled = energies.dimshuffle(1, 0, 2, 3)
    targets_shuffled = targets.dimshuffle(1, 0)
    masks_shuffled = masks.dimshuffle(1, 0)

    # initials should be energies_shuffled[0, :, -1, :]
    init_label = T.cast(T.fill(energies[:, 0, 0, 0], -1), 'int32')
    energy_time0 = energies_shuffled[0]
    target_time0 = targets_shuffled[0]
    initials = [energies_shuffled[0, :, -1, :], target_time0,
                energy_time0[T.arange(energy_time0.shape[0]), init_label, target_time0]]

    [partitions, _, target_energies], _ = theano.scan(
        fn=inner_function, outputs_info=initials,
        sequences=[energies_shuffled[1:], targets_shuffled[1:], masks_shuffled[1:]])

    partition = partitions[-1]
    target_energy = target_energies[-1]
    loss = theano_logsumexp(partition, axis=1) - target_energy
    return loss
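# The forward recursion behind the partition term above can be illustrated with a small,
# unbatched NumPy sketch (no masking, same start-from-the-pad-row convention).
# chain_crf_nll_np is a hypothetical helper written only for illustration.
import numpy as np
from scipy.special import logsumexp

def chain_crf_nll_np(energies, targets):
    """energies: (n_steps, t, t) scores e[step, prev_label, label]; targets: (n_steps,) ints."""
    alpha = energies[0, -1, :]              # forward log-scores, started from the pad row
    gold = energies[0, -1, targets[0]]      # score of the gold path
    for i in range(1, len(targets)):
        alpha = logsumexp(alpha[:, None] + energies[i], axis=0)
        gold += energies[i, targets[i - 1], targets[i]]
    return logsumexp(alpha) - gold          # minus log likelihood of the gold path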
def MRR_loss(y_true, y_pred):
    '''
    Training data have to be
        Xloop, Xtap, target = utils.MakeTrainingDataRank(Loop, Tap)
    Batch size has to be 40
    '''
    comp = T.zeros_like(y_true)
    comp = T.fill(comp, T.mean(y_pred[T.argmax(y_true)]))
    Rank = T.sum(T.gt(comp, y_pred))
    # T.dot(y_pred[39].T, T.ones_like(y_true).T)
    return Rank + T.mean(y_true) * 0 + T.mean(y_pred) * 0
def output_deterministic(self, output_previous):
    # We add an additional input with value 1
    output_previous_with_bias = \
        T.concatenate([output_previous, T.alloc(1, 1)], 0) / T.sqrt(self.n_inputs)

    # We compute the mean and variance after the linear operation
    a = T.dot(self.w, output_previous_with_bias)

    if (self.non_linear):
        # We compute the ReLU activation
        a = T.switch(T.lt(a, T.fill(a, 0)), T.fill(a, 0), a)

    return a
def vtohMB(self, VsampM):
    """
    computes hidden unit outputs given visible unit outputs ("half" a MCMC iteration)
    computes in parallel given input rows of visible units

    args:
        VsampM (T.matrix): rows of visible unit outputs

    returns:
        a T.matrix, rows of hidden unit outputs
    """
    Vomg = T.matrix(name="Vomg", dtype=theano.config.floatX)
    vtohMBres = T.matrix(name="vtohMBres", dtype=theano.config.floatX)
    T_HP = T.matrix(name="T_HP", dtype=theano.config.floatX)

    Vomg = T.dot(T.mul(T.fill(VsampM, T.exp(-self.T_z)), VsampM), self.T_omega)
    T_Hp = T.nnet.ultra_fast_sigmoid(T.fill(Vomg, self.T_b) + Vomg)
    vtohMBres = self.T_rng.binomial(size=T_Hp.shape, p=T_Hp,
                                    dtype=theano.config.floatX)
    return vtohMBres
def _gen_exprs(self, inpt):
    """Return the expressions of the recognition model."""
    P = self.parameters.gen

    n_layers = len(self.n_hiddens_gen)
    hidden_to_hiddens = [getattr(P, 'hidden_to_hidden_%i' % i)
                         for i in range(n_layers - 1)]
    hidden_biases = [getattr(P, 'hidden_bias_%i' % i)
                     for i in range(n_layers)]
    initial_hidden_means = [getattr(P, 'initial_hidden_means_%i' % i)
                            for i in range(n_layers)]
    initial_hidden_vars = [getattr(P, 'initial_hidden_vars_%i' % i)
                           for i in range(n_layers)]
    recurrents = [getattr(P, 'recurrent_%i' % i)
                  for i in range(n_layers)]

    shortcut_size = self.n_hiddens_recog[-1]

    p_dropout_inpt = T.zeros_like(inpt[:, :, :self.n_latent])
    p_dropout_inpt = T.fill(p_dropout_inpt, self.p_dropout_inpt)

    p_dropout_shortcut = T.zeros_like(inpt[:, :, self.n_latent:])
    p_dropout_shortcut = T.fill(p_dropout_shortcut, self.p_dropout_inpt)

    p_dropout_inpt = T.concatenate([p_dropout_inpt, p_dropout_shortcut], axis=2)

    p_dropouts = [p_dropout_inpt] + self.p_dropout_hiddens
    if self.p_dropout_hidden_to_out is None:
        p_dropouts.append(self.p_dropout_hiddens[-1])
    else:
        p_dropouts.append(self.p_dropout_hidden_to_out)

    exprs = vprnn.exprs(
        inpt, T.zeros_like(inpt), P.in_to_hidden, hidden_to_hiddens,
        P.hidden_to_out, hidden_biases, [1 for _ in hidden_biases],
        initial_hidden_means, initial_hidden_vars, recurrents, P.out_bias,
        1, self.gen_transfers, self.assumptions.statify_visible,
        p_dropouts=p_dropouts)

    return exprs
def htovMB(self, HsampM):
    """
    computes visible unit outputs given hidden unit inputs ("half" a MCMC iteration)
    computes in parallel given input rows of hidden units

    args:
        HsampM (T.matrix): rows of hidden unit inputs

    returns:
        a T.matrix, rows of visible unit outputs
    """
    T_omgH = T.matrix(name="T_omgH", dtype=theano.config.floatX)
    T_means = T.matrix(name="T_means", dtype=theano.config.floatX)
    htovMBres = T.matrix(name="htovMBres", dtype=theano.config.floatX)

    T_omgH = T.transpose(T.dot(self.T_omega, T.transpose(HsampM)))
    T_means = T.fill(T_omgH, self.T_a) + T_omgH
    htovMBres = self.T_rng.normal(size=T_means.shape, avg=T_means,
                                  std=T.fill(T_means, T.sqrt(T.exp(self.T_z))),
                                  dtype=theano.config.floatX)
    return htovMBres
def _log_likelihood(self, x_vars, means):
    """
    This function computes the symbolic log-likelihood for a diagonal gaussian
    defined by the given means and a fixed sigma.
    :param x_vars:
    :param means:
    :return:
    """
    std = T.fill(T.zeros_like(means), self.policy.sigma)
    zs = (x_vars - means) / std
    return -T.sum(T.log(std), axis=-1) \
        - 0.5 * T.sum(T.square(zs), axis=-1) \
        - 0.5 * means.shape[-1] * np.log(2 * np.pi)
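# For reference, a NumPy version of the same diagonal-Gaussian log-likelihood, assuming a
# scalar sigma, cross-checked against scipy.stats.multivariate_normal. A sketch for
# inspection only, not part of the original policy class.
import numpy as np
from scipy.stats import multivariate_normal

def diag_gauss_loglik_np(x, means, sigma):
    z = (x - means) / sigma
    d = means.shape[-1]
    return (-d * np.log(sigma)
            - 0.5 * np.sum(z ** 2, axis=-1)
            - 0.5 * d * np.log(2 * np.pi))

x = np.array([0.3, -1.2]); mu = np.array([0.0, -1.0]); sigma = 0.5
ref = multivariate_normal(mean=mu, cov=sigma ** 2 * np.eye(2)).logpdf(x)
assert np.isclose(diag_gauss_loglik_np(x, mu, sigma), ref)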
def CalculateCosineS(self, options, ctx=None, proj_h=None, h_mask=None):
    r = options['r']
    fill_matrix = tensor.ones_like(h_mask) - h_mask
    norm_ctx = ctx.norm(2, 2)
    norm_proj = (proj_h + fill_matrix[:, :, None]).norm(2, 2) * h_mask
    mul_cp = (ctx * proj_h).sum(2)
    cos_cp = mul_cp / (norm_ctx * norm_proj + fill_matrix)
    r_ = tensor.zeros_like(cos_cp)
    r_ = tensor.fill(r_, r)
    exp_cp = tensor.exp(cos_cp * r_) * h_mask
    p = exp_cp / (exp_cp.sum(0)[None, :] + tensor.min(fill_matrix, axis=0)[None, :])
    return p
def _r_loss(self, preds, y):
    """
    :param preds: (n_batch, T) this variable stores the predictions for one path of size T
    :param y: (n_batch, ) this variable stores the targets
    :return:
    """
    # y_rep: (n_batch, T, )
    y_rep = T.stack([T.fill(T.zeros((self.policy.n_steps)), y[b])
                     for b in xrange(self.policy.n_batch)], axis=0)
    # return T.nnet.binary_crossentropy(probs[:, :, 1], y_rep).mean(axis=[0,1])
    return T.neq(preds, y_rep).mean(axis=[0, 1])
def CalculateCosine(self, options, ctx=None, proj_h=None, ctx_mask=None):
    r = options['r']
    fill_matrix = tensor.ones_like(ctx_mask) - ctx_mask
    norm_ctx = (ctx + fill_matrix[:, :, None]).norm(2, 2) * ctx_mask
    norm_proj = proj_h.norm(2, 1)
    mul_cp = (ctx * proj_h[None, :, :]).sum(2)
    cos_cp = mul_cp / (norm_ctx * norm_proj[None, :] + fill_matrix)
    r_ = tensor.zeros_like(cos_cp)
    r_ = tensor.fill(r_, r)
    exp_cp = tensor.exp(cos_cp * r_) * ctx_mask
    p = exp_cp / (exp_cp.sum(0)[None, :] + tensor.min(fill_matrix, axis=0)[None, :])
    prob_max = p.argmax(0)
    return p, prob_max
def test_1msigmoid(self):
    if not register_local_1msigmoid:
        return
    m = self.get_mode()
    x = T.fmatrix()

    # tests exp_over_1_plus_exp
    f = theano.function([x], 1 - T.exp(x) / (1 + T.exp(x)), mode=m)
    assert [node.op for node in f.maker.fgraph.toposort()] == [tensor.neg,
                                                               sigmoid_inplace]

    # tests inv_1_plus_exp
    f = theano.function([x], 1 - T.fill(x, 1.0) / (1 + T.exp(-x)), mode=m)
    assert [node.op for node in f.maker.fgraph.toposort()] == [tensor.neg,
                                                               sigmoid_inplace]
def get_pi_from_v(self, Q):
    if self.v_to_pi == 'greedy' or self.v_to_pi == 'e-greedy':
        greedy_actions = T.argmax(Q, axis=-1)
        greedy_pi = T.extra_ops.to_one_hot(greedy_actions, nb_class=5, dtype='int32')
        if self.v_to_pi == 'greedy':
            return greedy_pi
        else:
            return T.fill(Q, self.epsilon / 5) + (1 - self.epsilon) * greedy_pi
    elif self.v_to_pi == 'softmax':
        return T.nnet.softmax(Q)
    else:
        raise Exception()
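# The 'e-greedy' branch above spreads epsilon uniformly over the 5 actions and puts the
# remaining mass on the greedy action. A hypothetical NumPy illustration of that policy:
import numpy as np

def epsilon_greedy_pi(Q, epsilon, n_actions=5):
    """Action probabilities: epsilon spread uniformly, the rest on the argmax."""
    greedy = np.eye(n_actions)[np.argmax(Q, axis=-1)]        # one-hot greedy policy
    return np.full_like(greedy, epsilon / n_actions) + (1 - epsilon) * greedy

Q = np.array([[1.0, 3.0, 2.0, 0.0, -1.0]])
print(epsilon_greedy_pi(Q, epsilon=0.1))
# [[0.02 0.92 0.02 0.02 0.02]]   -> each row sums to 1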
def CalculateCosine_webS(self, options, ctx=None, proj_h=None, mask_x=None):
    r = options['r']
    norm_ctx = ctx.norm(2, 2)
    norm_proj = proj_h.norm(2, 2)
    mul_cp = (ctx * proj_h).sum(2)
    cos_cp = mul_cp / (norm_ctx * norm_proj + 0.0001)
    r_ = tensor.zeros_like(cos_cp)
    r_ = tensor.fill(r_, r)
    exp_cp = tensor.exp(cos_cp * r_)
    exp_cp_ = exp_cp * (mask_x.reshape(
        [mask_x.shape[0], ctx.shape[0], ctx.shape[1]]).max(0))
    p = exp_cp_ / (exp_cp_.sum(0)[None, :] + 0.0001)
    return p
def _step2(diag_, state_, hs_, Cs_):
    hs, Cs = [], []
    token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
    msk_ = tensor.fill((tensor.zeros_like(token_idxs, dtype="float32")), 1)
    msk_ = msk_.dimshuffle('x', 0)
    state_below0 = self.de_lookuptable[token_idxs].reshape(
        (1, encoderInputs.shape[1], self.de_hidden_size))
    for i, lstm in enumerate(self.decoder_lstm_layers):
        h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])  # mind msk
        hs += h[-1],
        Cs += C[-1],
        state_below0 = h

    hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(Cs)
    state_below0 = state_below0.reshape(
        (encoderInputs.shape[1], self.de_hidden_size))

    attn_index = tensor.nonzero(diag_, True)
    attn_value = tensor.nonzero_values(diag_)

    en_context = Encoder_shuffle[:, attn_index[0], :]
    attn_context = Encoder_shuffle_re[:, attn_index[0], :]
    attn_weight = tensor.batched_dot(attn_context, state_below0)
    attn_weight = tensor.nnet.softmax(attn_weight)
    # attn_weight *= (encoderMask.dimshuffle(1,0))
    attn_weight *= (attn_value.dimshuffle('x', 0))
    ## attn_weight = attn_weight/(tensor.sum(attn_weight, axis=1).dimshuffle(0,'x'))
    ####### ctx_ : (b, h)
    ctx_ = tensor.sum(en_context * attn_weight[:, :, None], axis=1)

    state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)

    newpred = tensor.dot(state_below0, self.linear) + self.linear_bias[None, :]
    state_below = tensor.nnet.softmax(newpred)

    ##### the beginning symbol probability is 0
    extra_p = tensor.zeros_like(hs[:, :, 0])
    state_below = tensor.concatenate([state_below, extra_p.T], axis=1)

    return state_below, hs, Cs
def crf_loss(energies, targets, masks):
    assert energies.ndim == 4
    assert targets.ndim == 2
    assert masks.ndim == 2

    def inner_function(energies_one_step, targets_one_step, mask_one_step,
                       prior_partition, prev_label, tg_energy):
        partition_shuffled = prior_partition.dimshuffle(0, 1, 'x')
        partition_t = T.switch(
            mask_one_step.dimshuffle(0, 'x'),
            theano_logsumexp(energies_one_step + partition_shuffled, axis=1),
            prior_partition)

        return [partition_t, targets_one_step,
                tg_energy + energies_one_step[T.arange(energies_one_step.shape[0]),
                                              prev_label, targets_one_step]]

    # Input should be provided as (n_batch, n_time_steps, num_labels, num_labels)
    # but scan requires the iterable dimension to be first
    # So, we need to dimshuffle to (n_time_steps, n_batch, num_labels, num_labels)
    energies_shuffled = energies.dimshuffle(1, 0, 2, 3)
    targets_shuffled = targets.dimshuffle(1, 0)
    masks_shuffled = masks.dimshuffle(1, 0)

    # initials should be energies_shuffled[0, :, -1, :]
    init_label = T.cast(T.fill(energies[:, 0, 0, 0], -1), 'int32')
    energy_time0 = energies_shuffled[0]
    target_time0 = targets_shuffled[0]
    initials = [energies_shuffled[0, :, -1, :], target_time0,
                energy_time0[T.arange(energy_time0.shape[0]), init_label, target_time0]]

    [partitions, _, target_energies], _ = theano.scan(
        fn=inner_function, outputs_info=initials,
        sequences=[energies_shuffled[1:], targets_shuffled[1:], masks_shuffled[1:]])

    partition = partitions[-1]
    target_energy = target_energies[-1]
    loss = theano_logsumexp(partition, axis=1) - target_energy
    return loss
def __init__(self, input, input_sm, vocab_size, emb_dim, local_context_size, global_context_size):
    # initialize W_emb
    global rng
    global init_range

    if pretrain_file:
        linear_W_emb = load_pretrain_emb(pretrain_file)
        print "* Using pretrained linear_W_emb ..."
        assert(len(linear_W_emb) == vocab_size)
    else:
        linear_W_emb = np.asarray(rng.uniform(
            low=-init_range, high=init_range, size=(vocab_size, emb_dim)),
            dtype=theano.config.floatX)

    # shared variables
    self.W_emb = theano.shared(value=linear_W_emb, name='W_emb')

    # stack vectors
    input = T.cast(input, 'int32')

    # output is a matrix where each row corresponds to a context_size embedding vector,
    # and the row number equals the batch size
    # output dimensions: batch_size * ((context_size + 1) * emb_dim)
    output_local = self.W_emb[input[:, :local_context_size].flatten()].reshape(
        (input.shape[0], local_context_size * emb_dim))  # self.W_emb.shape[1]

    # define symbolic functions for calculating the mean of sentences
    W = T.matrix('W')
    eos_vector = T.vector('eos_vector')
    eos_vector = T.fill(T.zeros_like(input[0, local_context_size:]),
                        io_vocab.VocabConstants.EOS_INDEX)

    def weighted_sentence(sentence, W, eos_vector):
        sent_len = T.sum(T.neq(sentence, eos_vector))
        return T.mean(W[sentence[:sent_len]], axis=0)

    output_global, updates = theano.scan(fn=weighted_sentence,
                                         outputs_info=None,
                                         sequences=input[:, local_context_size:],
                                         non_sequences=[self.W_emb, eos_vector])

    # concatenate local output and global output to form the output matrix
    self.output = T.concatenate([output_local, output_global], axis=1)

    # params is the word embedding matrix
    self.params = [self.W_emb]
def test_1msigmoid(self):
    if not register_local_1msigmoid:
        return
    m = theano.config.mode
    if m == 'FAST_COMPILE':
        m = 'FAST_RUN'
    x = T.fmatrix()

    # tests exp_over_1_plus_exp
    f = theano.function([x], 1 - T.exp(x) / (1 + T.exp(x)), mode=m)
    theano.printing.debugprint(f)
    assert [node.op for node in f.maker.env.toposort()] == [tensor.neg,
                                                            sigmoid_inplace]

    # tests inv_1_plus_exp
    f = theano.function([x], 1 - T.fill(x, 1.0) / (1 + T.exp(-x)), mode=m)
    theano.printing.debugprint(f)
    assert [node.op for node in f.maker.env.toposort()] == [tensor.neg,
                                                            sigmoid_inplace]
def crf_loss(energies, targets, masks):
    assert energies.ndim == 4
    assert targets.ndim == 2
    assert masks.ndim == 2

    def inner_function(energies_one_step, targets_one_step, mask_one_step,
                       prior_partition, prev_label, tg_energy):
        partition_shuffled = prior_partition.dimshuffle(0, 1, 'x')
        partition_t = T.switch(
            mask_one_step.dimshuffle(0, 'x'),
            theano_logsumexp(energies_one_step + partition_shuffled, axis=1),
            prior_partition)

        return [partition_t, targets_one_step,
                tg_energy + energies_one_step[T.arange(energies_one_step.shape[0]),
                                              prev_label, targets_one_step]]

    energies_shuffled = energies.dimshuffle(1, 0, 2, 3)
    targets_shuffled = targets.dimshuffle(1, 0)
    masks_shuffled = masks.dimshuffle(1, 0)

    init_label = T.cast(T.fill(energies[:, 0, 0, 0], -1), 'int32')
    energy_time0 = energies_shuffled[0]
    target_time0 = targets_shuffled[0]
    initials = [energies_shuffled[0, :, -1, :], target_time0,
                energy_time0[T.arange(energy_time0.shape[0]), init_label, target_time0]]

    [partitions, _, target_energies], _ = theano.scan(
        fn=inner_function, outputs_info=initials,
        sequences=[energies_shuffled[1:], targets_shuffled[1:], masks_shuffled[1:]])

    partition = partitions[-1]
    target_energy = target_energies[-1]
    loss = theano_logsumexp(partition, axis=1) - target_energy
    return loss
def test_1msigmoid(self):
    if not register_local_1msigmoid:
        return
    m = self.get_mode()
    x = tt.fmatrix()

    # tests exp_over_1_plus_exp
    f = theano.function([x], 1 - tt.exp(x) / (1 + tt.exp(x)), mode=m)
    assert check_stack_trace(f, ops_to_check=[tt.neg, sigmoid_inplace])
    assert [node.op for node in f.maker.fgraph.toposort()] == [
        tt.neg,
        sigmoid_inplace,
    ]

    # tests inv_1_plus_exp
    f = theano.function([x], 1 - tt.fill(x, 1.0) / (1 + tt.exp(-x)), mode=m)
    assert check_stack_trace(f, ops_to_check=[tt.neg, sigmoid_inplace])
    assert [node.op for node in f.maker.fgraph.toposort()] == [
        tt.neg,
        sigmoid_inplace,
    ]
def output_probabilistic(self, m_w_previous, v_w_previous):
    # We add an additional deterministic input with mean 1 and variance 0
    m_w_previous_with_bias = \
        T.concatenate([m_w_previous, T.alloc(1, 1)], 0)
    v_w_previous_with_bias = \
        T.concatenate([v_w_previous, T.alloc(0, 1)], 0)

    # We compute the mean and variance after the linear operation
    m_linear = T.dot(self.m_w, m_w_previous_with_bias) / T.sqrt(self.n_inputs)
    v_linear = (T.dot(self.v_w, v_w_previous_with_bias) +
                T.dot(self.m_w**2, v_w_previous_with_bias) +
                T.dot(self.v_w, m_w_previous_with_bias**2)) / self.n_inputs

    if (self.non_linear):
        # We compute the mean and variance after the ReLU activation
        alpha = m_linear / T.sqrt(v_linear)
        gamma = Network_layer.gamma(-alpha)
        gamma_robust = -alpha - 1.0 / alpha + 2.0 / alpha**3
        gamma_final = T.switch(T.lt(-alpha, T.fill(alpha, 30)), gamma, gamma_robust)

        v_aux = m_linear + T.sqrt(v_linear) * gamma_final

        m_a = Network_layer.n_cdf(alpha) * v_aux
        v_a = m_a * v_aux * Network_layer.n_cdf(-alpha) + \
            Network_layer.n_cdf(alpha) * v_linear * \
            (1 - gamma_final * (gamma_final + alpha))

        return (m_a, v_a)
    else:
        return (m_linear, v_linear)
def _collect_samples(self, y):
    """
    This function collects N samples of size T using the current policy.
    :param y:
    :return: locations (n_batch, N, T, 2), probabilities (n_batch, N, T, n_classes),
        rewards (n_batch, N, T, ) and returns (n_batch, N, T, )
    """
    means = []
    locs = []
    probs = []
    returns = []
    preds = []

    # Reshape target labels to match the classification outputs along each path of length T
    y_rep = T.stack([T.fill(T.zeros((self.policy.n_steps)), y[b])
                     for b in xrange(self.policy.n_batch)], axis=0)

    for _ in xrange(self.policy.N):
        loc_means_t, locs_t, _, x_ts, p_ts = self.policy.step_forward()
        locs.append(locs_t)
        means.append(loc_means_t)
        probs.append(p_ts)

        pred = np.argmax(p_ts, axis=2)
        preds.append(pred)

        rewards = self._acc_score(pred, y_rep)
        returns.append(cumsum(rewards, axis=1))

    locs = T.stack(locs).dimshuffle(1, 0, *range(2, T.stack(locs).ndim))
    means = T.stack(means).dimshuffle(1, 0, *range(2, T.stack(means).ndim))
    preds = T.stack(preds).dimshuffle(1, 0, *range(2, T.stack(preds).ndim))
    returns = T.stack(returns).dimshuffle(1, 0, *range(2, T.stack(returns).ndim))

    return locs, means, preds, returns
def myMask(input_t, mask, binarize=False, clip=True):
    """
    Same as myMaskArr but with theano tensors.

    It performs the following:
        1) masks *input_t* with *mask*, takes only values bigger than `settings.THRESHOLD`  # no more
        2) divides each entry by the maximum value in the resulting tensor
        3) if *binarize* is True, sets 1.0 - EPS(0) in the maximum value of each column
           and EPS(0) in any other position of the column

    This uses EPS(0) to avoid nans in the cross-entropy loss function.
    If *clip* is True, then the returned tensor will be clipped between EPS(0) and 1.0 - EPS(0)

    RETURNS: theano tensor
    """
    assert (input_t.ndim == 4 and mask.ndim == 4 and binarize) or (not binarize), \
        "input and mask MUST be 4D tensors to the end of binarization"

    masked = mask * input_t
    # masked = masked * (masked > settings.THRESHOLD)
    # masked = abs(input_t * mask)
    #
    # normalized = masked
    # normalized = masked / masked.max()
    if binarize:
        binarized = T.fill(masked, EPS(0))
        max_rows = masked.argmax(axis=2)
        max_cols = T.arange(masked.shape[3])
        normalized = T.set_subtensor(binarized[0, 0, max_rows, max_cols], 1.0 - EPS(0))
        returned = normalized
    elif clip:
        returned = masked.clip(EPS(0), 1 - EPS(0))
    else:
        returned = masked
    return returned
def test_exp_over_1_plus_exp(self):
    m = self.get_mode(excluding=["local_elemwise_fusion"])
    x = tt.vector()
    data = np.random.rand(54).astype(config.floatX)

    backup = config.warn__identify_1pexp_bug
    config.warn__identify_1pexp_bug = False
    try:
        # tests exp_over_1_plus_exp
        f = theano.function([x], tt.exp(x) / (1 + tt.exp(x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid]
        f(data)
        f = theano.function([x], tt.exp(x) / (2 + tt.exp(x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)
        f = theano.function([x], tt.exp(x) / (1 - tt.exp(x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)
        f = theano.function([x], tt.exp(x + 1) / (1 + tt.exp(x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)

        # tests inv_1_plus_exp
        f = theano.function([x], tt.fill(x, 1.0) / (1 + tt.exp(-x)), mode=m)
        # todo: solve issue #4589 first
        # assert check_stack_trace(f, ops_to_check=sigmoid)
        assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid]
        f(data)
        f = theano.function([x], tt.fill(x, 1.0) / (2 + tt.exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)
        f = theano.function([x], tt.fill(x, 1.0) / (1 - tt.exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)
        f = theano.function([x], tt.fill(x, 1.1) / (1 + tt.exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)

        # tests inv_1_plus_exp with neg
        f = theano.function([x], tt.fill(x, -1.0) / (1 + tt.exp(-x)), mode=m)
        # todo: solve issue #4589 first
        # assert check_stack_trace(
        #     f, ops_to_check=[sigmoid, neg_inplace])
        assert [node.op for node in f.maker.fgraph.toposort()] == [
            sigmoid,
            neg_inplace,
        ]
        f(data)
        f = theano.function([x], tt.fill(x, -1.0) / (1 - tt.exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            neg_inplace,
        ]
        f(data)
        f = theano.function([x], tt.fill(x, -1.0) / (2 + tt.exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            neg_inplace,
        ]
        f(data)
        f = theano.function([x], tt.fill(x, -1.1) / (1 + tt.exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            neg_inplace,
        ]
        f(data)

        # tests double inv_1_plus_exp with neg
        # (-1)(exp(x)) / (1+exp(x))(1+exp(-x))
        # = (-1)/(1+exp(-x)) * exp(x)/(1+exp(x))
        # = - (sigm(x) * sigm(x))
        f = theano.function(
            [x],
            (tt.fill(x, -1.0) * tt.exp(x)) / ((1 + tt.exp(x)) * (1 + tt.exp(-x))),
            mode=m,
        )
        # todo: solve issue #4589 first
        # assert check_stack_trace(f, ops_to_check=[sigmoid, tt.mul])
        assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid, tt.mul]
        f(data)
        f = theano.function(
            [x],
            (tt.fill(x, -1.1) * tt.exp(x)) / ((1 + tt.exp(x)) * (1 + tt.exp(-x))),
            mode=m,
        )
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            tt.mul,
            neg_inplace,
        ]
        f(data)
        f = theano.function(
            [x],
            (tt.fill(x, -1.0) * tt.exp(x)) / ((2 + tt.exp(x)) * (1 + tt.exp(-x))),
            mode=m,
        )
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            tt.mul,
            neg_inplace,
        ]
        f(data)
        f = theano.function(
            [x],
            (tt.fill(x, -1.0) * tt.exp(x)) / ((1 + tt.exp(x)) * (2 + tt.exp(-x))),
            mode=m,
        )
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            tt.mul,
            neg_inplace,
        ]
        f(data)
        f = theano.function(
            [x],
            (tt.fill(x, -1.0) * tt.exp(x)) / ((1 + tt.exp(x)) * (1 + tt.exp(x))),
            mode=m,
        )
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            tt.mul,
            neg_inplace,
        ]
        f(data)
        f = theano.function(
            [x],
            (tt.fill(x, -1.0) * tt.exp(x)) / ((1 + tt.exp(x)) * (2 + tt.exp(-x))),
            mode=m,
        )
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid,
            tt.mul,
            neg_inplace,
        ]
        f(data)
    finally:
        # Restore config option.
        config.warn__identify_1pexp_bug = backup
def MvNormalLogp():
    """Compute the log pdf of a multivariate normal distribution.

    This should be used in MvNormal.logp once Theano#5908 is released.

    Parameters
    ----------
    cov : tt.matrix
        The covariance matrix.
    delta : tt.matrix
        Array of deviations from the mean.
    """
    cov = tt.matrix('cov')
    cov.tag.test_value = floatX(np.eye(3))
    delta = tt.matrix('delta')
    delta.tag.test_value = floatX(np.zeros((2, 3)))

    solve_lower = tt.slinalg.Solve(A_structure='lower_triangular')
    solve_upper = tt.slinalg.Solve(A_structure='upper_triangular')
    cholesky = Cholesky(lower=True, on_error='nan')

    n, k = delta.shape
    n, k = f(n), f(k)
    chol_cov = cholesky(cov)
    diag = tt.nlinalg.diag(chol_cov)
    ok = tt.all(diag > 0)

    chol_cov = tt.switch(ok, chol_cov, tt.fill(chol_cov, 1))
    delta_trans = solve_lower(chol_cov, delta.T).T

    result = n * k * tt.log(f(2) * np.pi)
    result += f(2) * n * tt.sum(tt.log(diag))
    result += (delta_trans ** f(2)).sum()
    result = f(-.5) * result
    logp = tt.switch(ok, result, -np.inf)

    def dlogp(inputs, gradients):
        g_logp, = gradients
        cov, delta = inputs

        g_logp.tag.test_value = floatX(1.)
        n, k = delta.shape

        chol_cov = cholesky(cov)
        diag = tt.nlinalg.diag(chol_cov)
        ok = tt.all(diag > 0)

        chol_cov = tt.switch(ok, chol_cov, tt.fill(chol_cov, 1))
        delta_trans = solve_lower(chol_cov, delta.T).T

        inner = n * tt.eye(k) - tt.dot(delta_trans.T, delta_trans)
        g_cov = solve_upper(chol_cov.T, inner)
        g_cov = solve_upper(chol_cov.T, g_cov.T)

        tau_delta = solve_upper(chol_cov.T, delta_trans.T)
        g_delta = tau_delta.T

        g_cov = tt.switch(ok, g_cov, -np.nan)
        g_delta = tt.switch(ok, g_delta, -np.nan)

        return [-0.5 * g_cov * g_logp, -g_delta * g_logp]

    return theano.OpFromGraph(
        [cov, delta], [logp], grad_overrides=dlogp, inline=True)
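# The log density assembled above (Cholesky factor, lower-triangular solve, summed
# quadratic form) can be sanity-checked numerically. mvnormal_logp_np below is a
# hypothetical NumPy/SciPy mirror written only for illustration, not part of the library.
import numpy as np
from scipy.linalg import cholesky, solve_triangular
from scipy.stats import multivariate_normal

def mvnormal_logp_np(cov, delta):
    """Summed MVN log density over the rows of delta (deviations from the mean)."""
    n, k = delta.shape
    chol = cholesky(cov, lower=True)
    delta_trans = solve_triangular(chol, delta.T, lower=True).T
    return -0.5 * (n * k * np.log(2 * np.pi)
                   + 2 * n * np.sum(np.log(np.diag(chol)))
                   + np.sum(delta_trans ** 2))

cov = np.array([[2.0, 0.3], [0.3, 1.0]])
delta = np.array([[0.1, -0.2], [1.5, 0.4], [-0.7, 0.9]])
ref = multivariate_normal(mean=np.zeros(2), cov=cov).logpdf(delta).sum()
assert np.isclose(mvnormal_logp_np(cov, delta), ref)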
def chain_crf_accuracy(energies, targets):
    """
    decode crf and compute accuracy
    :param energies: Theano 4D tensor
        energies of each step. the shape is [batch_size, n_time_steps, num_labels, num_labels],
        where the pad label index is at last.
    :param targets: Theano 2D tensor
        targets in the shape [batch_size, n_time_steps]
    :return: the decoded predictions and an elementwise indicator of agreement with targets.
    """
    assert energies.ndim == 4
    assert targets.ndim == 2

    def inner_function(energies_one_step, prior_pi, prior_pointer):
        """
        :param energies_one_step: [batch_size, t, t]
        :param prior_pi: [batch_size, t]
        :param prior_pointer: [batch_size, t]
        :return:
        """
        prior_pi_shuffled = prior_pi.dimshuffle(0, 1, 'x')
        pi_t = T.max(prior_pi_shuffled + energies_one_step, axis=1)
        pointer_t = T.argmax(prior_pi_shuffled + energies_one_step, axis=1)
        return [pi_t, pointer_t]

    def back_pointer(pointer, pointer_tp1):
        """
        :param pointer: [batch, t]
        :param pointer_tp1: [batch,]
        :return:
        """
        return pointer[T.arange(pointer.shape[0]), pointer_tp1]

    # Input should be provided as (n_batch, n_time_steps, num_labels, num_labels)
    # but scan requires the iterable dimension to be first
    # So, we need to dimshuffle to (n_time_steps, n_batch, num_labels, num_labels)
    energies_shuffled = energies.dimshuffle(1, 0, 2, 3)
    # pi at time 0 is the last row at time 0, but we need to remove the last column, which is the pad symbol.
    pi_time0 = energies_shuffled[0, :, -1, :-1]

    # the last row and column is the tag for the pad symbol; reduce these two dimensions by 1 to remove that.
    # now the shape of energies_shuffled is [n_time_steps, n_batch, t, t] where t = num_labels - 1.
    energies_shuffled = energies_shuffled[:, :, :-1, :-1]

    initials = [pi_time0, T.cast(T.fill(pi_time0, -1), 'int64')]

    [pis, pointers], _ = theano.scan(fn=inner_function, outputs_info=initials,
                                     sequences=[energies_shuffled[1:]])
    pi_n = pis[-1]
    pointer_n = T.argmax(pi_n, axis=1)

    back_pointers, _ = theano.scan(fn=back_pointer, outputs_info=pointer_n,
                                   sequences=[pointers], go_backwards=True)

    # prediction shape [batch_size, length]
    prediction_revered = T.concatenate([pointer_n.dimshuffle(0, 'x'),
                                        back_pointers.dimshuffle(1, 0)], axis=1)
    prediction = prediction_revered[:, T.arange(prediction_revered.shape[1] - 1, -1, -1)]
    return prediction, T.eq(prediction, targets)
def _InitializeModelThatPredictsCharsMultiSoftmax(self, learning_rate, num_softmaxes=5):
    image_input = T.tensor4('image_input')
    print("num_of_softmax: " + str(num_softmaxes))
    # prediction_layer = self._BuildModelToPredictFirstChar(image_input)
    prediction_layer = self._BuildModelToPredictCharsMultiSoftmax(
        image_input, num_softmaxes=num_softmaxes)

    target_chars_input = T.imatrix('target_chars_input')
    target_chars = target_chars_input[:, :num_softmaxes].reshape(shape=(-1,))

    # Create a loss expression for training, using cross-entropy loss.
    prediction = lasagne.layers.get_output(prediction_layer)
    l_loss = lasagne.objectives.categorical_crossentropy(prediction, target_chars)
    loss = l_loss.mean()

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum.
    params = lasagne.layers.get_all_params(prediction_layer, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate, momentum=0.9)
    # updates = lasagne.updates.adagrad(loss, params, learning_rate=0.0001)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(prediction_layer, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_chars)
    test_loss = test_loss.mean()

    predicted_chars = T.argmax(test_prediction, axis=1)
    correctly_predicted_chars = T.eq(predicted_chars, target_chars)
    # An expression for the classification accuracy:
    test_acc = T.mean(correctly_predicted_chars, dtype=theano.config.floatX)

    predicted_chars = predicted_chars.reshape(shape=(-1, num_softmaxes))
    correctly_predicted_chars = correctly_predicted_chars.reshape(shape=(-1, num_softmaxes))
    num_chars_matched = T.sum(correctly_predicted_chars, axis=1,
                              dtype=theano.config.floatX)
    seq_test_acc = T.mean(T.eq(num_chars_matched, T.fill(num_chars_matched, num_softmaxes)),
                          dtype=theano.config.floatX)
    test_prediction = test_prediction.reshape(shape=(-1, num_softmaxes, len(self.CHARS)))

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function(
        [image_input, target_chars_input], loss, updates=updates,
        allow_input_downcast=True)

    # Compile a second function computing the prediction, validation loss and accuracy:
    test_fn = theano.function([image_input, target_chars_input],
                              [test_loss, test_acc, seq_test_acc],
                              allow_input_downcast=True)

    # Compile a third function computing the prediction.
    inference_fn = theano.function([image_input],
                                   [predicted_chars, test_prediction],
                                   allow_input_downcast=True)

    return prediction_layer, train_fn, test_fn, inference_fn
def _InitializeModelThatPredictsAllChars(
        self, learning_rate, bidirectional_rnn=False, use_mask_input=False,
        lstm_layer_units=256):
    image_input = T.tensor4('image_input')
    num_rnn_steps = self.num_rnn_steps
    target_chars_input = T.imatrix('target_chars')
    target_chars = target_chars_input[:, :num_rnn_steps]
    target_chars = target_chars.reshape(shape=(-1,))

    mask_input_input = None
    mask_input = None
    if use_mask_input:
        mask_input_input = T.imatrix('mask_input')
        mask_input = mask_input_input[:, :num_rnn_steps]
        # mask_input = mask_input.reshape(shape=(-1,))
    prediction_layer, l_cnn, l_lstm = self._BuildModelToPredictAllChars(
        image_input, num_rnn_steps=num_rnn_steps, mask_input=mask_input,
        bidirectional_rnn=bidirectional_rnn, lstm_layer_units=lstm_layer_units)

    # Create a loss expression for training, using cross-entropy loss.
    # prediction = lasagne.layers.get_output(prediction_layer)
    prediction, l_cnn, l_lstm = tuple(
        lasagne.layers.get_output([prediction_layer, l_cnn, l_lstm]))
    l_loss = lasagne.objectives.categorical_crossentropy(prediction, target_chars)
    if use_mask_input:
        l_loss = l_loss.reshape(shape=(-1, num_rnn_steps))
        l_loss *= mask_input
        loss = l_loss.sum() / mask_input.sum()
    else:
        loss = l_loss.mean()

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum.
    params = lasagne.layers.get_all_params(prediction_layer, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate, momentum=0.9)
    # updates = lasagne.updates.adagrad(loss, params, learning_rate=0.001)
    grads = theano.grad(loss, params)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(prediction_layer, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_chars)
    test_loss = test_loss.mean()

    predicted_chars = T.argmax(test_prediction, axis=1)
    correctly_predicted_chars = T.eq(predicted_chars, target_chars)
    # An expression for the classification accuracy:
    test_acc = T.mean(correctly_predicted_chars, dtype=theano.config.floatX)

    predicted_chars = predicted_chars.reshape(shape=(-1, num_rnn_steps))
    correctly_predicted_chars = correctly_predicted_chars.reshape(shape=(-1, num_rnn_steps))
    num_chars_matched = T.sum(correctly_predicted_chars, axis=1,
                              dtype=theano.config.floatX)
    seq_test_acc = T.mean(T.eq(num_chars_matched, T.fill(num_chars_matched, num_rnn_steps)),
                          dtype=theano.config.floatX)
    test_prediction = test_prediction.reshape(shape=(-1, num_rnn_steps, len(self.CHARS)))

    mask_input_vec = [mask_input_input] if use_mask_input else []

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function(
        [image_input, target_chars_input] + mask_input_vec, loss,
        updates=updates, allow_input_downcast=True)

    # Compile a second function computing the prediction, validation loss and accuracy:
    test_fn = theano.function([image_input, target_chars_input] + mask_input_vec,
                              [test_loss, test_acc, seq_test_acc],
                              allow_input_downcast=True)

    # Compile a third function computing the prediction.
    inference_fn = theano.function([image_input] + mask_input_vec,
                                   [predicted_chars, test_prediction],
                                   allow_input_downcast=True)

    return prediction_layer, train_fn, test_fn, inference_fn
def __init__(self, voca_size, hidden_size, lstm_layers_num, learning_rate=0.2):
    self.voca_size = voca_size
    self.hidden_size = hidden_size
    self.lstm_layers_num = lstm_layers_num
    self.learning_rate = learning_rate
    self._train = None
    self._utter = None
    self.params = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []
    self.hos = []
    self.Cos = []

    encoderInputs, encoderMask = tensor.imatrices(2)
    decoderInputs, decoderMask, decoderTarget = tensor.imatrices(3)

    self.lookuptable = theano.shared(
        name="Encoder LookUpTable",
        value=utils.init_norm(self.voca_size, self.hidden_size),
        borrow=True
    )
    self.linear = theano.shared(
        name="Linear",
        value=utils.init_norm(self.hidden_size, self.voca_size),
        borrow=True
    )
    self.params += [self.lookuptable, self.linear]  # concatenate

    # (max_sent_size, batch_size, hidden_size)
    state_below = self.lookuptable[encoderInputs.flatten()].reshape(
        (encoderInputs.shape[0], encoderInputs.shape[1], self.hidden_size))
    for _ in range(self.lstm_layers_num):
        enclstm = LSTM(self.hidden_size)
        self.encoder_lstm_layers += enclstm,  # append
        self.params += enclstm.params         # concatenate
        hs, Cs = enclstm.forward(state_below, encoderMask)
        self.hos += hs[-1],
        self.Cos += Cs[-1],
        state_below = hs

    state_below = self.lookuptable[decoderInputs.flatten()].reshape(
        (decoderInputs.shape[0], decoderInputs.shape[1], self.hidden_size))
    for i in range(self.lstm_layers_num):
        declstm = LSTM(self.hidden_size)
        self.decoder_lstm_layers += declstm,  # append
        self.params += declstm.params         # concatenate
        ho, Co = self.hos[i], self.Cos[i]
        state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)
    decoder_lstm_outputs = state_below

    ei, em, di, dm, dt = tensor.imatrices(5)  # place holders
    #####################################################
    #####################################################
    linear_outputs = tensor.dot(decoder_lstm_outputs, self.linear)
    softmax_outputs, updates = theano.scan(
        fn=lambda x: tensor.nnet.softmax(x),
        sequences=[linear_outputs],
    )

    def _NLL(pred, y, m):
        return -m * tensor.log(pred[tensor.arange(decoderInputs.shape[1]), y])

    costs, updates = theano.scan(fn=_NLL,
                                 sequences=[softmax_outputs, decoderTarget, decoderMask])
    loss = costs.sum() / decoderMask.sum()

    gparams = [tensor.grad(loss, param) for param in self.params]
    updates = [(param, param - self.learning_rate * gparam)
               for param, gparam in zip(self.params, gparams)]

    self._train = theano.function(
        inputs=[ei, em, di, dm, dt],
        outputs=[loss, costs],
        updates=updates,
        givens={encoderInputs: ei, encoderMask: em,
                decoderInputs: di, decoderMask: dm, decoderTarget: dt}
    )
    #####################################################
    #####################################################
    hs0, Cs0 = tensor.as_tensor_variable(self.hos, name="hs0"), \
        tensor.as_tensor_variable(self.Cos, name="Cs0")
    token_idxs = tensor.fill(tensor.zeros_like(decoderInputs, dtype="int32"),
                             utils.idx_start)
    msk = tensor.fill((tensor.zeros_like(decoderInputs, dtype="int32")), 1)

    def _step(token_idxs, hs_, Cs_):
        hs, Cs = [], []
        state_below = self.lookuptable[token_idxs].reshape(
            (decoderInputs.shape[0], decoderInputs.shape[1], self.hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below, msk, hs_[i], Cs_[i])  # mind msk
            hs += h[-1],
            Cs += C[-1],
            state_below = h
        hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(Cs)
        next_token_idx = tensor.cast(
            tensor.dot(state_below, self.linear).argmax(axis=-1), "int32")
        return next_token_idx, hs, Cs

    outputs, updates = theano.scan(
        fn=_step,
        outputs_info=[token_idxs, hs0, Cs0],
        n_steps=utils.max_sent_size
    )
    listof_token_idx = outputs[0]
    self._utter = theano.function(
        inputs=[ei, em, di],
        outputs=listof_token_idx,
        givens={encoderInputs: ei, encoderMask: em, decoderInputs: di}
        # givens={encoderInputs: ei, encoderMask: em}
    )
def test_exp_over_1_plus_exp(self):
    m = self.get_mode(excluding=['local_elemwise_fusion'])
    x = T.vector()
    data = numpy.random.rand(54).astype(config.floatX)

    backup = config.warn.identify_1pexp_bug
    config.warn.identify_1pexp_bug = False
    try:
        # tests exp_over_1_plus_exp
        f = theano.function([x], T.exp(x) / (1 + T.exp(x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid]
        f(data)
        f = theano.function([x], T.exp(x) / (2 + T.exp(x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)
        f = theano.function([x], T.exp(x) / (1 - T.exp(x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)
        f = theano.function([x], T.exp(x + 1) / (1 + T.exp(x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)

        # tests inv_1_plus_exp
        f = theano.function([x], T.fill(x, 1.0) / (1 + T.exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid]
        f(data)
        f = theano.function([x], T.fill(x, 1.0) / (2 + T.exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)
        f = theano.function([x], T.fill(x, 1.0) / (1 - T.exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)
        f = theano.function([x], T.fill(x, 1.1) / (1 + T.exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [sigmoid]
        f(data)

        # tests inv_1_plus_exp with neg
        f = theano.function([x], T.fill(x, -1.0) / (1 + T.exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] == [
            sigmoid, theano.tensor.inplace.neg_inplace]
        f(data)
        f = theano.function([x], T.fill(x, -1.0) / (1 - T.exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid, theano.tensor.inplace.neg_inplace]
        f(data)
        f = theano.function([x], T.fill(x, -1.0) / (2 + T.exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid, theano.tensor.inplace.neg_inplace]
        f(data)
        f = theano.function([x], T.fill(x, -1.1) / (1 + T.exp(-x)), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid, theano.tensor.inplace.neg_inplace]
        f(data)

        # tests double inv_1_plus_exp with neg
        # (-1)(exp(x)) / (1+exp(x))(1+exp(-x))
        # = (-1)/(1+exp(-x)) * exp(x)/(1+exp(x))
        # = - (sigm(x) * sigm(x))
        f = theano.function([x], (T.fill(x, -1.0) * T.exp(x)) /
                            ((1 + T.exp(x)) * (1 + T.exp(-x))), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] == [sigmoid, T.mul]
        f(data)
        f = theano.function([x], (T.fill(x, -1.1) * T.exp(x)) /
                            ((1 + T.exp(x)) * (1 + T.exp(-x))), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid, T.mul, theano.tensor.inplace.neg_inplace]
        f(data)
        f = theano.function([x], (T.fill(x, -1.0) * T.exp(x)) /
                            ((2 + T.exp(x)) * (1 + T.exp(-x))), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid, T.mul, theano.tensor.inplace.neg_inplace]
        f(data)
        f = theano.function([x], (T.fill(x, -1.0) * T.exp(x)) /
                            ((1 + T.exp(x)) * (2 + T.exp(-x))), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid, T.mul, theano.tensor.inplace.neg_inplace]
        f(data)
        f = theano.function([x], (T.fill(x, -1.0) * T.exp(x)) /
                            ((1 + T.exp(x)) * (1 + T.exp(x))), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid, T.mul, theano.tensor.inplace.neg_inplace]
        f(data)
        f = theano.function([x], (T.fill(x, -1.0) * T.exp(x)) /
                            ((1 + T.exp(x)) * (2 + T.exp(-x))), mode=m)
        assert [node.op for node in f.maker.fgraph.toposort()] != [
            sigmoid, T.mul, theano.tensor.inplace.neg_inplace]
        f(data)
    finally:
        # Restore config option.
        config.warn.identify_1pexp_bug = backup