def grad(self, inputs, g_outputs):
    """Gradient of this Op w.r.t. its single input `rho`.

    `rho` scales the dense weight matrix self.Wd; A = I - rho*W.
    This presumably backs a log-determinant Op: d/d(rho) log|A| =
    tr(A^{-1} * (-W)), which the elementwise product + sum below computes.
    TODO confirm against the Op's forward pass (not visible here).
    """
    (rho, ) = inputs
    (gz,) = g_outputs
    A = self.Id - tt.mul(rho, self.Wd)
    # transpose of the inverse so that an elementwise product with -W,
    # summed, yields the trace term
    dinv = tt.nlinalg.matrix_inverse(A).T
    out = tt.mul(dinv, - self.Wd)
    # chain rule: weight by incoming gradient gz, reduce, return as 1-d tensor
    return [tt.as_tensor(tt.sum(tt.mul(out, gz)), ndim=1)]
def NLL(self, y, useMeanOnly=False, sampleWeight=None):
    """Negative log-likelihood of targets y (2-d: batch x variables).

    Uses cos/sin of the error, so y and self.mean are presumably angles
    (a bivariate von-Mises-like density with variance self.sigma_sqr and
    correlation self.corr) — TODO confirm against the model definition.
    When useMeanOnly is set (or the parameters are absent), variance
    defaults to 1 and correlation to 0.
    Returns the mean NLL, or the sampleWeight-weighted mean if given.
    """
    assert (y.ndim == 2)
    pi = numpy.pi
    err = y - self.mean
    cos_err = 1. - T.cos(err)
    if useMeanOnly==True or (self.sigma_sqr is None):
        sig_sqr = T.ones_like(cos_err)
    else:
        sig_sqr = self.sigma_sqr
    # quadratic-like term: sum of scaled (1 - cos) errors per sample
    e = T.sum(cos_err/sig_sqr, axis=1, keepdims=True )
    sig = T.sqrt(sig_sqr)
    sin_err = T.sin(err)
    # cross term: product of normalized sin errors across variables
    f = T.prod(sin_err/sig, axis=1, keepdims=True)
    if useMeanOnly==True or (self.corr is None):
        rho = T.zeros_like(f)
    else:
        rho = self.corr
    g = e - T.mul(rho, f)
    rho_sqr = T.sqr(rho)
    h = g/(1 - rho_sqr )
    # normalization constant plus log-variance and log(1-rho^2) terms
    nll = h + numpy.log(2*pi) + T.sum(T.log(sig_sqr), axis=1, keepdims=True)/2. + T.log(1 - rho_sqr)/2.
    if sampleWeight is None:
        return T.mean(nll)
    return T.sum(T.mul(nll, sampleWeight) )/T.sum(sampleWeight)
def beta_div(X, W, H, beta):
    """Compute beta divergence D(X|WH)

    Parameters
    ----------
    X : Theano tensor
        data
    W : Theano tensor
        Bases
    H : Theano tensor
        activation matrix
    beta : Theano scalar

    Returns
    -------
    div : Theano scalar
        beta divergence D(X|WH)

    Special cases: beta=2 is (half) squared Euclidean distance, beta=0 is
    Itakura-Saito, beta=1 is generalized Kullback-Leibler; otherwise the
    generic beta-divergence formula is used.
    """
    div = ifelse(
        T.eq(beta, 2),
        # beta = 2: Euclidean
        T.sum(1. / 2 * T.power(X - T.dot(H, W), 2)),
        ifelse(
            T.eq(beta, 0),
            # beta = 0: Itakura-Saito
            T.sum(X / T.dot(H, W) - T.log(X / T.dot(H, W)) - 1),
            ifelse(
                T.eq(beta, 1),
                # beta = 1: generalized KL
                T.sum(T.mul(X, (T.log(X) - T.log(T.dot(H, W)))) + T.dot(H, W) - X),
                # generic beta divergence
                T.sum(1. / (beta * (beta - 1.)) * (T.power(X, beta) + (beta - 1.) * T.power(T.dot(H, W), beta) - beta * T.power(T.mul(X, T.dot(H, W)), (beta - 1)))))))
    return div
def ConvByPattern(x, patterns, mask=None):
    """Convolve x with a bank of patterns, optionally zero-weighting masked
    border rows/columns, and return the normalized response.

    x is channels-last (batch, rows, cols, channels) — it is dimshuffled to
    channels-first for conv2d and back afterwards. patterns has its last
    axis as the output-filter axis (moved to front for the filter tensor).
    """
    W = np.transpose(patterns, (3, 0, 1, 2))
    # 'half' border mode keeps the spatial size of the input
    out2 = T.nnet.conv2d(x.dimshuffle(0, 3, 1, 2), W, filter_shape=W.shape, border_mode='half')
    if mask is not None:
        ## mask has shape (batchSize, #rows_to_be_masked, nCols)
        ## a subtensor of out2 along the horiz direction
        out2_sub_horiz = out2[:, :, :mask.shape[1], :]
        mask_horiz = mask.dimshuffle(0, 'x', 1, 2)
        out3 = T.set_subtensor(out2_sub_horiz, T.mul(out2_sub_horiz, mask_horiz))
        ## a subtensor of out3 along the vertical direction
        # same number of leading columns is masked, using the transposed mask
        out3_sub_vertical = out3[:, :, :, :mask.shape[1]]
        mask_vertical = mask.dimshuffle(0, 'x', 2, 1)
        y = T.set_subtensor(out3_sub_vertical, T.mul(out3_sub_vertical, mask_vertical))
    else:
        y = out2
    y = y.dimshuffle(0, 2, 3, 1)
    # normalize by the pattern's spatial footprint
    return y / np.prod(patterns.shape[1:3])
def t_forward_step(self, mask, cur_w_in_sig, pre_out_sig, pre_cell_sig, w_ifco, b_ifco, ln_b1, ln_s1, ln_b2, ln_s2, ln_b3, ln_s3, t_n_out):
    """One scan step of a layer-normalized LSTM.

    mask flags valid timesteps: where mask==0 the previous hidden/cell
    states are carried through unchanged (padding handling).
    The preactivation is sliced into [input | forget | cell | output]
    gate blocks of width t_n_out.
    Returns [new_output, new_cell_state].
    """
    # layer-norm the input projection and the recurrent projection separately
    cur_w_in_sig_ln = self.ln(cur_w_in_sig, ln_b1, ln_s1)
    pre_w_out_sig = T.dot(pre_out_sig, w_ifco)
    pre_w_out_sig_ln = self.ln(pre_w_out_sig, ln_b2, ln_s2)
    preact = T.add(cur_w_in_sig_ln, pre_w_out_sig_ln, b_ifco)
    inner_act = self.activation  # T.nnet.hard_sigmoid #T.tanh # T.nnet.hard_sigmoid T.tanh
    gate_act = self.sigmoid()  # T.nnet.hard_sigmoid #T.nnet.sigmoid
    # Input Gate
    ig_t1 = gate_act(preact[:, 0:t_n_out])
    # Forget Gate
    fg_t1 = gate_act(preact[:, 1 * t_n_out:2 * t_n_out])
    # Cell State
    cs_t1 = T.add(T.mul(fg_t1, pre_cell_sig), T.mul(ig_t1, inner_act(preact[:, 2 * t_n_out:3 * t_n_out])))
    mask = T.addbroadcast(mask, 1)
    cs_t1 = mask * cs_t1 + (1. - mask) * pre_cell_sig
    # layer-norm the cell state before it feeds the output
    cs_t1_ln = self.ln(cs_t1, ln_b3, ln_s3)
    # Output Gate
    og_t1 = gate_act(preact[:, 3 * t_n_out:4 * t_n_out])
    # Output LSTM
    out_sig = T.mul(og_t1, inner_act(cs_t1_ln))
    out_sig = mask * out_sig + (1. - mask) * pre_out_sig
    return [out_sig, cs_t1]
def create_weight_update_functions(self):
    """Build the theano function that applies one SGD step to all layers.

    error_gradients/errors are stored last-layer-first, hence the
    -(i+1) indexing to pair them with weights[i]/biases[i].
    `g` is presumably a GPU/cast wrapper applied to each update
    expression — TODO confirm its definition elsewhere in the file.
    NOTE(review): the bias update uses self.errors[-(i+1)] directly
    (not a column sum) — confirm the error tensors already have the
    bias shape.
    """
    updates = []
    for i in range(len(self.error_gradients)):
        # new_w = w - grad * alpha * batch_size_divisor
        updates.append((self.weights[i], g(T.sub(self.weights[i], T.mul(T.mul(self.error_gradients[-(i+1)], self.alpha), self.batch_size_divisor)))))
        updates.append((self.biases[i], g(T.sub(self.biases[i], T.mul(T.mul(self.errors[-(i+1)], self.alpha), self.batch_size_divisor)))))
    self.update_weight_function = function(inputs=[self.idx, self.alpha], updates=updates)
def get_cost_updates(self, corruption_level, learning_rate):
    """Compute the cost and parameter updates for one training step of the
    denoising autoencoder (dA).

    Cost = cross-entropy reconstruction error + L2 weight decay
    (self.lamda) + KL-divergence sparsity penalty (self.beta, self.rho).
    Returns (cost, updates) for use in a theano function.
    """
    tilde_x = self.get_corrupted_input(self.x, corruption_level)
    y = self.get_hidden_values(tilde_x)
    z = self.get_reconstructed_input(y)
    # Calculate cross-entropy cost (as alternative to MSE) of the reconstruction of the minibatch.
    L = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
    # Calculate weight decay term to prevent overfitting
    weight_decay = 0.5 * self.lamda * (T.sum(T.mul(self.W, self.W)) + T.sum(T.mul(self.W_prime, self.W_prime)))
    # average activation of each hidden unit over the minibatch
    rho_hat = T.sum(y, axis=1) / tilde_x.shape[1]
    # KL divergence sparsity term
    KL_divergence = self.beta * T.sum(self.rho * T.log(self.rho / rho_hat) + (1-self.rho) * T.log((1 - self.rho)/(1-rho_hat)))
    # Calculate overall errors
    cost = T.mean(L) + weight_decay + KL_divergence
    # Compute the gradients of the cost of the `dA` with respect
    # to its parameters
    gparams = T.grad(cost, self.params)
    # Generate the list of updates
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(self.params, gparams)
    ]
    return (cost, updates)
def recurrence(self, inp, prev_hidden, prev_cell):
    """
    LSTM.recurrence(input_, prev_hidden, prev_cell) -> hidden, cell (batchsize x hidden_size)

    Produces the new hidden and cell state, acting as a single computation
    step of an LSTM

    @param input_: a batchsize x input_size matrix that represents the new
                   data to input into the LSTM
    @param prev_hidden: a batchsize x hidden_size matrix that represents the
                        previous hidden state of the network
    @param prev_cell: a batchsize x hidden_size matrix that represents the
                      previous cell state of the network
    """
    forget = T.nnet.sigmoid(T.dot(inp, self.weights["f:x"]) +
                            T.dot(prev_hidden, self.weights["f:h"]) +
                            self.weights["f:b"])
    input_ = T.nnet.sigmoid(T.dot(inp, self.weights["i:x"]) +
                            T.dot(prev_hidden, self.weights["i:h"]) +
                            self.weights["i:b"])
    output = T.nnet.sigmoid(T.dot(inp, self.weights["o:x"]) +
                            T.dot(prev_hidden, self.weights["o:h"]) +
                            self.weights["o:b"])
    cell = T.mul(forget, prev_cell) + T.mul(input_, T.tanh(T.dot(inp, self.weights["c:x"]) +
                                                           T.dot(prev_hidden, self.weights["c:h"]) +
                                                           self.weights["c:b"]))
    # NOTE(review): a standard LSTM would use T.tanh(cell) here; this
    # multiplies the raw cell state — confirm this variant is intended.
    hidden = T.mul(output, cell)
    return hidden, cell
def H_beta_sub(X, W, Wsub, H, Hsub, beta):
    """Update group activation with beta divergence

    Parameters
    ----------
    X : Theano tensor
        data
    W : Theano tensor
        Bases
    Wsub : Theano tensor
        group Bases
    H : Theano tensor
        activation matrix
    Hsub : Theano tensor
        group activation matrix
    beta : Theano scalar

    Returns
    -------
    H : Theano tensor
        Updated version of the activations

    Standard NMF multiplicative update: the beta=2 branch is the cheap
    Euclidean special case; the general branch uses powers of the current
    approximation T.dot(H, W.T).
    """
    up = ifelse(T.eq(beta, 2),
                (T.dot(X, Wsub)) / (T.dot(T.dot(H, W.T), Wsub)),
                (T.dot(T.mul(T.power(T.dot(H, W.T), (beta - 2)), X), Wsub)) / (T.dot(T.power(T.dot(H, W.T), (beta-1)), Wsub)))
    return T.mul(Hsub, up)
def get_cost_updates(self, corruption_level, learning_rate):
    """Compute the cost and parameter updates for one training step of the
    denoising autoencoder (dA).

    Cost = cross-entropy reconstruction + L2 weight decay (self.lamda) +
    KL sparsity penalty (self.beta, self.rho). Returns (cost, updates).
    """
    tilde_x = self.get_corrupted_input(self.x, corruption_level)
    y = self.get_hidden_values(tilde_x)
    z = self.get_reconstructed_input(y)
    # Calculate cross-entropy cost (as alternative to MSE) of the reconstruction of the minibatch.
    L = -T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
    # Calculate weight decay term to prevent overfitting
    weight_decay = 0.5 * self.lamda * (T.sum(T.mul(self.W, self.W)) + T.sum(T.mul(self.W_prime, self.W_prime)))
    # average hidden activation over the minibatch
    rho_hat = T.sum(y, axis=1) / tilde_x.shape[1]
    # KL divergence sparsity term
    KL_divergence = self.beta * T.sum(self.rho * T.log(self.rho / rho_hat) + (1 - self.rho) * T.log((1 - self.rho) / (1 - rho_hat)))
    # Calculate overall errors
    cost = T.mean(L) + weight_decay + KL_divergence
    # Compute the gradients of the cost of the `dA` with respect
    # to its parameters
    gparams = T.grad(cost, self.params)
    # Generate the list of updates
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(self.params, gparams)]
    return (cost, updates)
def t_forward_step(self, mask, cur_w_in_sig, pre_out_sig, pre_cell_sig, w_ifco, b_ifco, t_n_out):
    """One scan step of a (plain, un-normalized) LSTM.

    cur_w_in_sig is the already-projected input, sliced like the recurrent
    preactivation into [input | forget | cell | output] blocks of width
    t_n_out. mask carries previous states through padded timesteps.
    Returns [new_output, new_cell_state].
    """
    ifco = T.add(T.dot(pre_out_sig, w_ifco), b_ifco)
    inner_act = self.activation
    gate_act = self.sigmoid()
    # Input Gate
    ig_t1 = gate_act(T.add(ifco[:, 0:t_n_out], cur_w_in_sig[:, 0:t_n_out]))
    # Forget Gate
    fg_t1 = gate_act(T.add(ifco[:, 1 * t_n_out:2 * t_n_out], cur_w_in_sig[:, 1 * t_n_out:2 * t_n_out]))
    # Cell State
    cs_t1 = T.add(T.mul(fg_t1, pre_cell_sig), T.mul(ig_t1, inner_act(T.add(ifco[:, 2 * t_n_out:3 * t_n_out], cur_w_in_sig[:, 2 * t_n_out:3 * t_n_out]))))
    mask = T.addbroadcast(mask, 1)
    # functionality: cs_t1 = T.switch(mask , cs_t1, pre_cell_sig)
    cs_t1 = mask * cs_t1 + (1. - mask) * pre_cell_sig
    # Output Gate
    og_t1 = gate_act(T.add(ifco[:, 3 * t_n_out:4 * t_n_out], cur_w_in_sig[:, 3 * t_n_out:4 * t_n_out]))
    # Output LSTM
    out_sig = T.mul(og_t1, inner_act(cs_t1))
    out_sig = mask * out_sig + (1. - mask) * pre_out_sig
    return [out_sig, cs_t1]
def log_loss(self, y, L_input):
    """Summed negative log-likelihood over three library subsets.

    Builds partial target distributions for the "tomm5" and "symprx"
    libraries via self.partial_versus (positions-of-interest vs. competing
    positions), then sums the cross-entropy of each subset's predictions
    against its targets. Each term is guarded by T.switch so empty index
    vectors (self.L_*) contribute 0 instead of producing NaNs.

    Fix: the original ended with an UNTERMINATED triple-quoted string
    (commented-out code placed after the return), which is a syntax error;
    that unreachable block has been removed.
    """
    y_train_standard = y

    # tomm5: positions 55-84 versus positions 30-54 and 185
    #tomm5_pos_range = numpy.concatenate([numpy.arange(30, 80), numpy.arange(185, 186)]).tolist()
    #y_train_tomm5 = self.partial_pos(y, tomm5_pos_range, y.shape[0])
    tomm5_range = numpy.arange(55, 85).tolist()
    tomm5_range_versus = numpy.concatenate([numpy.arange(30, 55), numpy.arange(185, 186)]).tolist()
    y_train_tomm5 = self.partial_versus(y, tomm5_range, tomm5_range_versus, y.shape[0])

    # symprx: positions 60-79 versus positions 145-164 and 185
    symprx_range = numpy.arange(60, 80).tolist()
    symprx_range_versus = numpy.concatenate([numpy.arange(145, 165), numpy.arange(185, 186)]).tolist()
    y_train_symprx = self.partial_versus(y, symprx_range, symprx_range_versus, y.shape[0])

    return - (
        T.switch(self.L_standard.shape[0] > 0,
                 T.sum(T.sum(T.mul(T.log(self.p_train_standard[self.L_standard, :]), y_train_standard[self.L_standard, :]), axis=1)), 0)
        + T.switch(self.L_tomm5.shape[0] > 0,
                   T.sum(T.sum(T.mul(T.log(self.p_train_tomm5[self.L_tomm5, :]), y_train_tomm5[self.L_tomm5, :]), axis=1)), 0)
        + T.switch(self.L_symprx.shape[0] > 0,
                   T.sum(T.sum(T.mul(T.log(self.p_train_symprx[self.L_symprx, :]), y_train_symprx[self.L_symprx, :]), axis=1)), 0))
def beta_div(X, W, H, beta):
    """Compute beta divergence D(X|WH)

    Parameters
    ----------
    X : Theano tensor
        data
    W : Theano tensor
        Bases
    H : Theano tensor
        activation matrix
    beta : Theano scalar

    Returns
    -------
    div : Theano scalar
        beta divergence D(X|WH)

    beta=2 -> (half) squared Euclidean; beta=0 -> Itakura-Saito;
    beta=1 -> generalized KL; otherwise the generic formula.
    """
    div = ifelse(
        T.eq(beta, 2),
        T.sum(1. / 2 * T.power(X - T.dot(H, W), 2)),
        ifelse(
            T.eq(beta, 0),
            T.sum(X / T.dot(H, W) - T.log(X / T.dot(H, W)) - 1),
            ifelse(
                T.eq(beta, 1),
                T.sum(T.mul(X, (T.log(X) - T.log(T.dot(H, W)))) + T.dot(H, W) - X),
                T.sum(1. / (beta * (beta - 1.)) * (T.power(X, beta) + (beta - 1.) * T.power(T.dot(H, W), beta) - beta * T.power(T.mul(X, T.dot(H, W)), (beta - 1)))))))
    return div
def beta_H_groupSparse(X, W, H, beta, l_sp, start, stop):
    """Update activation with beta divergence

    Parameters
    ----------
    X : Theano tensor
        data
    W : Theano tensor
        Bases
    H : Theano tensor
        activation matrix
    beta : Theano scalar

    Returns
    -------
    H : Theano tensor
        Updated version of the activations

    Multiplicative update with a group-sparsity penalty of weight l_sp:
    the scan builds, group by group (column ranges [start_i, stop_i)),
    the matrix of columns normalized by their group L2 norm; that matrix
    enters the denominator as the penalty gradient.
    """
    results, _ = theano.scan(
        fn=lambda start_i, stop_i, prior_results, H:
            T.set_subtensor(prior_results[:, start_i:stop_i].T,
                            H[:, start_i:stop_i].T / H[:, start_i:stop_i].norm(2, axis=1)).T,
        outputs_info=T.zeros_like(H),
        sequences=[start, stop],
        non_sequences=H)
    # last scan output has every group's normalized block filled in
    cst = results[-1]
    up = ifelse(T.eq(beta, 2),
                (T.dot(X, W)) / (T.dot(T.dot(H, W.T), W) + l_sp * cst),
                (T.dot(T.mul(T.power(T.dot(H, W.T), (beta - 2)), X), W)) / (T.dot(T.power(T.dot(H, W.T), (beta-1)), W) + l_sp * cst))
    return T.mul(H, up)
def beta_H_Sparse(X, W, H, beta, l_sp):
    """Update activation with beta divergence

    Parameters
    ----------
    X : Theano tensor
        data
    W : Theano tensor
        Bases
    H : Theano tensor
        activation matrix
    beta : Theano scalar

    Returns
    -------
    H : Theano tensor
        Updated version of the activations

    Multiplicative update with an L1 sparsity penalty: l_sp is added to
    the denominator (the gradient of an L1 term on nonnegative H).
    """
    up = ifelse(T.eq(beta, 2),
                (T.dot(X, W)) / (T.dot(T.dot(H, W.T), W) + l_sp),
                (T.dot(T.mul(T.power(T.dot(H, W.T), (beta - 2)), X), W)) / (T.dot(T.power(T.dot(H, W.T), (beta-1)), W) + l_sp))
    return T.mul(H, up)
def W_beta_sub_withcst(X, W, Wsub, H, Hsub, beta, sum_grp, lambda_grp, card_grp):
    """Update group activation with beta divergence

    Parameters
    ----------
    X : Theano tensor
        data
    W : Theano tensor
        Bases
    Wsub : Theano tensor
        group Bases
    H : Theano tensor
        activation matrix
    Hsub : Theano tensor
        group activation matrix
    beta : Theano scalar

    Returns
    -------
    H : Theano tensor
        Updated version of the activations

    Multiplicative update for the group bases Wsub with a group-coupling
    constraint: lambda_grp weights a term pulling each basis towards the
    group mean (sum_grp / card_grp).
    """
    up = ifelse(T.eq(beta, 2),
                (T.dot(X.T, Hsub) + lambda_grp * sum_grp) / (T.dot(T.dot(H, W.T).T, Hsub) + lambda_grp * card_grp * Wsub),
                (T.dot(T.mul(T.power(T.dot(H, W.T), (beta - 2)), X).T, Hsub) + lambda_grp * sum_grp) / (T.dot(T.power(T.dot(H, W.T), (beta-1)).T, Hsub) + lambda_grp * card_grp * Wsub))
    return T.mul(Wsub, up)
def get_output_for(self, inputs, **kwargs):
    """Cosine similarity of the two input tensors along ``self.axis``.

    Each norm is clipped to [self.tol, 1000.] so the division is safe
    even for (near-)zero vectors.
    """
    left, right = inputs[0], inputs[1]

    def clipped_norm(v):
        # L2 norm along the reduction axis, bounded away from zero
        return T.clip(T.sqrt(T.sum(T.sqr(v), axis=self.axis)), self.tol, 1000.)

    dot_product = T.sum(T.mul(left, right), axis=self.axis)
    return dot_product / T.mul(clipped_norm(left), clipped_norm(right))
def create_backprop_gradient_functions(self):
    """Build symbolic backprop errors and weight gradients, layer by layer.

    Walks from the output layer backwards; self.errors and
    self.error_gradients therefore end up ordered last-layer-first
    (consumers index them with -(i+1)). `g` is presumably a GPU/cast
    wrapper — TODO confirm its definition elsewhere in the file.
    """
    self.errors =[]
    self.error_gradients = []
    error_function = None
    error_gradient = None
    for i in range(len(self.weights)):
        if len(self.errors) == 0:
            #this is the last layer of the net: The error is X - t because of
            #the combination of softmax and cross entropy cost function
            error_function = g(T.sub(self.feedforward, self.t[self.idx]))
            self.errors.append(error_function)
            error_gradient = g(T.dot(self.z[-2].T, self.errors[i]))
            error_gradient = self.apply_L2_penalties_error_gradients(error_gradient, -1)
            self.error_gradients.append(error_gradient)
        elif (len(self.weights) - 1) == i:
            #this involves the input X instead of z-values as it is the first weights that
            #need to be updated
            self.errors.append(g(T.mul(T.dot(self.errors[-1], self.weights[1].T), self.layers[1].activation_derivative(self.z[0]))))
            error_gradient = g(T.dot(self.X[self.idx].T, self.errors[-1]))
            #error_gradient = self.apply_L2_penalties_error_gradients(error_gradient, 0)
            self.error_gradients.append(error_gradient)
        else:
            # hidden layer: propagate error through the next layer's weights
            # and the activation derivative at this layer
            self.errors.append(g(T.mul(T.dot(self.errors[-1], self.weights[-i].T), self.layers[-(i+1)].activation_derivative(self.z[-(i+1)]))))
            error_gradient = g(T.dot(self.z[-(i+2)].T, self.errors[-1]))
            #error_gradient = self.apply_L2_penalties_error_gradients(error_gradient, -(i+1))
            self.error_gradients.append(error_gradient)
def minus_corr(u, v):
    """Negative Pearson correlation between u and v.

    Negated so it can be minimized directly as a loss; a perfect positive
    correlation yields -1.
    """
    u_centered = T.sub(u, T.mean(u))
    v_centered = T.sub(v, T.mean(v))
    covariance = T.sum(T.mul(u_centered, v_centered))
    scale = T.sqrt(T.mul(T.sum(T.sqr(u_centered)), T.sum(T.sqr(v_centered))))
    correlation = T.true_div(covariance, scale)
    return T.neg(correlation)
def pearson_correlation(x, y):
    """Pearson linear correlation coefficient (PLCC) between x and y.

    A small epsilon (1e-10) keeps the denominator nonzero.
    """
    print('Using PLCC metric')
    dx = x - T.mean(x)
    dy = y - T.mean(y)
    covariance = T.sum(T.mul(dx, dy))
    scale = T.mul(T.sqrt(T.sum(T.square(dx))), T.sqrt(T.sum(T.sqr(dy)))) + 1e-10
    return covariance / scale
def create_momentum_weight_update_functions(self):
    """Build the theano function updating the momentum buffers.

    new_momentum = batch_size_divisor * (M * momentum - alpha * gradient);
    error_gradients are stored last-layer-first, hence the -(i+1) index.
    `g` is presumably a GPU/cast wrapper — TODO confirm.
    """
    momentum_updates = []
    for i in range(len(self.H.L.momentum_weights)):
        momentum_updates.append(
            (self.H.L.momentum_weights[i],
             g(T.mul(self.batch_size_divisor, T.sub(T.mul(self.M, self.H.L.momentum_weights[i]), T.mul(self.alpha, self.error_gradients[-(i+1)]))))))
    self.H.L.momentum_update_function = function(inputs=[self.idx, self.M, self.alpha], updates=momentum_updates)
def grad(self, inputs, g_outputs):
    """Gradient w.r.t. `rho` using a truncated-series inverse (sparse variant).

    Instead of a dense matrix_inverse, (I - rho*W)^{-1} is approximated by
    the 3-term Neumann series I + rho*W + rho^2*W^2 + rho^3*W^3 built from
    the precomputed sparse powers self.W, self.WW, self.WWW.
    """
    (rho, ) = inputs
    (gz,) = g_outputs
    A = self.Id - tt.mul(rho, self.Wd)
    # Neumann-series approximation of A^{-1} via sparse scalar-times-matrix
    dinv = self.I + ts.mul_s_d(self.W, rho)
    dinv += ts.mul_s_d(self.WW, rho**2)
    dinv += ts.mul_s_d(self.WWW, rho**3)
    out = tt.mul(dinv, - self.Wd)
    # chain rule with incoming gradient gz, reduced to a 1-d tensor
    return [tt.as_tensor(tt.sum(tt.mul(out, gz)), ndim=1)]
def lmul_T(self, x):
    """Multiply x by the transpose of this linear operator: dot(x, W.T).

    x's shape is split into (CC, RR) blocks; the trailing RR dims are
    flattened for the matrix product, then the result is reshaped to
    CC + col_shape().
    """
    CC, RR = self.split_right_shape(tuple(x.shape), T=True)
    x_WT = theano.dot(
        x.reshape((tensor.mul(*CC), tensor.mul(*RR))),
        self._W.T)
    cshape = self.col_shape()
    yshp = tensor.stack(*(CC + cshape))
    rval = x_WT.reshape(yshp, ndim=len(CC) + len(cshape))
    return rval
def f1_score(self, y):
    """F1 score, precision and recall for binary predictions self.y_pred
    against labels y. Returns [f1_score, precision, recall].
    """
    n_total = y.shape[0]
    ones = T.ones(self.y_pred.shape)
    # predicted positives: entries of y_pred equal to 1
    n_pred_pos = T.sum(T.eq(ones, self.y_pred))
    # true positives: y_pred + y == 2 exactly where both are 1
    twos = T.add(ones, T.ones(self.y_pred.shape))
    n_true_pos = T.sum(T.eq(T.add(self.y_pred, y), twos))
    precision = T.true_div(n_true_pos, n_pred_pos)
    recall = T.true_div(n_true_pos, n_total)
    # harmonic mean of precision and recall
    f1 = T.mul(2.0, T.true_div(T.mul(precision, recall), T.add(precision, recall)))
    return [f1, precision, recall]
def lmul(self, x):
    """Multiply x by this linear operator: dot(x, A).

    x's shape is split into (RR, CC) blocks; the trailing CC dims are
    flattened for the matrix product, then the result is reshaped to
    RR + row_shape().
    """
    # dot(x, A)
    RR, CC = self.split_left_shape(tuple(x.shape), T=False)
    xW = theano.dot(
        x.reshape((tensor.mul(*RR), tensor.mul(*CC))),
        self._W)
    rshape = self.row_shape()
    yshp = tensor.stack(*(RR + rshape))
    rval = xW.reshape(yshp, ndim=len(RR) + len(rshape))
    return rval
def __objective_triple(self, triple):
    """
    form the objective function value of a triple
    :param triple: (entity_l, entity_r, relation)
    :return: L1 norm of the difference between the relation-projected
             left and right entity embeddings
    """
    l_index, r_index, relation_index = triple
    # NOTE(review): T.mul is ELEMENTWISE; models of this family usually
    # project with T.dot(matrix, vector) — confirm elementwise is intended.
    return T.nlinalg.norm(T.mul(self.Relation_L[relation_index, :, :], self.Entity[:, l_index]) -
                          T.mul(self.Relation_R[relation_index, :, :], self.Entity[:, r_index]), ord=1)
def set_dropout(self, dropout, activation_function):
    """Return the training-time activation, wrapped with a dropout mask
    when dropout is enabled; also sets self.activation_cv_dropout (the
    cross-validation variant) as a side effect.
    """
    if dropout > 0:
        def train_activation(X):
            # training path: multiply by the stochastic dropout mask
            return T.mul(activation_function(X), self.dropout_function)

        def cv_activation(X):
            # evaluation path: multiply by the deterministic CV mask
            return T.mul(activation_function(X), self.dropout_function_cv)

        self.activation_cv_dropout = cv_activation
        return train_activation
    # no dropout: both paths are the plain activation
    self.activation_cv_dropout = activation_function
    return activation_function
def square_dist(self, X, Z):
    """Pairwise squared Euclidean distances between rows of X and Z.

    Uses the expansion ||x - z||^2 = ||x||^2 - 2 x.z + ||z||^2.
    If Z is None, distances are computed within X. Note: unlike the
    sibling variants, this one does not clip small negative values
    arising from floating-point cancellation.
    """
    X = tt.mul(X, 1.0)
    Xs = tt.sum(tt.square(X), 1)
    if Z is None:
        return -2.0 * tt.dot(X, tt.transpose(X)) +\
            (tt.reshape(Xs, (-1, 1)) + tt.reshape(Xs, (1, -1)))
    else:
        Z = tt.mul(Z, 1.0)
        Zs = tt.sum(tt.square(Z), 1)
        return -2.0 * tt.dot(X, tt.transpose(Z)) +\
            (tt.reshape(Xs, (-1, 1)) + tt.reshape(Zs, (1, -1)))
def __init__(self, nnet, dataset=None, learning_rate=0.01, beta=0.0, sparsity=0.01, weight_decay=0.0):
    """Build training/cost functions for a sparse autoencoder-style net.

    Cost = mean squared error + KL sparsity penalty (beta, sparsity)
    + L2 regularization on the first/third parameter tensors.
    NOTE(review): `input` and `batch_size` are free names resolved from
    an enclosing scope (or the builtin `input`) — confirm where they are
    defined. This is Python 2 code (print statement).
    """
    if len(dataset) < 2:
        print "Error dataset must contain tuple (train_data,train_target)"
    train_data, train_target = dataset
    target = T.matrix('y')
    square_error = T.mean(0.5 * T.sum(T.pow(target - nnet.output, 2), axis=1))
    # mean activation of the first hidden layer, per unit
    avg_activate = T.mean(nnet.hiddenLayer[0].output, axis=0)
    # KL(sparsity || avg_activate) summed over hidden units
    sparsity_penalty = beta * T.sum(
        T.mul(T.log(sparsity / avg_activate), sparsity) +
        T.mul(T.log((1 - sparsity) / T.sub(1, avg_activate)), (1 - sparsity)))
    regularization = 0.5 * weight_decay * (
        T.sum(T.pow(nnet.params[0], 2)) + T.sum(T.pow(nnet.params[2], 2)))
    cost = square_error + sparsity_penalty + regularization
    gparams = [T.grad(cost, param) for param in nnet.params]
    new_params = [
        param - (learning_rate * gparam)
        for param, gparam in zip(nnet.params, gparams)
    ]
    updates = [(param, new_param) for param, new_param in zip(nnet.params, new_params)]
    index = T.lscalar()
    self.train = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            input: train_data[index * batch_size:(index + 1) * batch_size],
            target: train_target[index * batch_size:(index + 1) * batch_size]
        })
    self.cost = theano.function(inputs=[],
                                outputs=cost,
                                givens={
                                    input: train_data,
                                    target: train_target
                                })
def __init__(self, x, marginal_flag=None):
    """Bernoulli density node with a learnable success probability.

    The single shared parameter is squashed through a sigmoid to give
    p = P(x == 1); pdf = p^x * (1-p)^(1-x).
    NOTE(review): the `**= marginal_flag` on pdf_marg raises the density
    to that power — presumably a marginalization weight; confirm intent.
    """
    self._params = theano.shared(np.random.randn())
    self.name = x.name
    self._prob_succes = T.nnet.sigmoid(self._params)
    self.pdf = T.mul((self._prob_succes**x), (1.0 - self._prob_succes)**(1 - x))
    self.pdf_function = theano.function([x], self.pdf)
    self.pdf_marg = T.mul((self._prob_succes**x), (1.0 - self._prob_succes)**(1 - x))
    if marginal_flag:
        self.pdf_marg **= marginal_flag
def sequence_iteration(self, output, mask, use_dropout=0, dropout_value=0.5):
    """Apply the linear output layer across a sequence, masking padded
    steps (masked positions are replaced by the small constant 1e-6).
    """
    linear_o = T.add(T.dot(output, self.t_w_out), self.t_b_out)
    mask = T.addbroadcast(mask, 2)  # to do nesseccary?
    # keep real steps, substitute 1e-6 at padded steps
    masked = T.mul(mask, linear_o) + T.mul((1. - mask), 1e-6)
    return masked  # result
def square_dist(self, X, Xs):
    """Pairwise squared Euclidean distances after scaling by 1/lengthscale.

    Uses ||x - z||^2 = ||x||^2 - 2 x.z + ||z||^2 on the rescaled inputs;
    clips at 0 to remove small negative values from floating-point
    cancellation. If Xs is None, distances are computed within X.
    """
    X = tt.mul(X, 1.0 / self.ls)
    X2 = tt.sum(tt.square(X), 1)
    if Xs is None:
        sqd = (-2.0 * tt.dot(X, tt.transpose(X)) +
               (tt.reshape(X2, (-1, 1)) + tt.reshape(X2, (1, -1))))
    else:
        Xs = tt.mul(Xs, 1.0 / self.ls)
        Xs2 = tt.sum(tt.square(Xs), 1)
        sqd = (-2.0 * tt.dot(X, tt.transpose(Xs)) +
               (tt.reshape(X2, (-1, 1)) + tt.reshape(Xs2, (1, -1))))
    return tt.clip(sqd, 0.0, np.inf)
def square_dist(self, X, Z):
    """Pairwise squared Euclidean distances after scaling by 1/lengthscales.

    Same expansion as the sibling variant (||x||^2 - 2 x.z + ||z||^2),
    clipped at 0 against floating-point cancellation. If Z is None,
    distances are computed within X.
    """
    X = tt.mul(X, 1.0 / self.lengthscales)
    Xs = tt.sum(tt.square(X), 1)
    if Z is None:
        sqd = -2.0 * tt.dot(X, tt.transpose(X)) +\
            (tt.reshape(Xs, (-1, 1)) + tt.reshape(Xs, (1, -1)))
    else:
        Z = tt.mul(Z, 1.0 / self.lengthscales)
        Zs = tt.sum(tt.square(Z), 1)
        sqd = -2.0 * tt.dot(X, tt.transpose(Z)) +\
            (tt.reshape(Xs, (-1, 1)) + tt.reshape(Zs, (1, -1)))
    return tt.clip(sqd, 0.0, np.inf)
def set_dropout(self, dropout, activation_function):
    """Return the training activation, wrapped with a dropout mask when
    dropout > 0; also sets self.activation_cv_dropout (the CV-time
    variant) as a side effect.
    """
    action_with_drop = None
    if dropout > 0:
        # training path multiplies by the stochastic dropout mask;
        # CV path multiplies by the deterministic CV mask
        action_with_drop = lambda X: T.mul(activation_function(X), self.dropout_function)
        self.activation_cv_dropout = lambda X: T.mul(activation_function(X), self.dropout_function_cv)
    else:
        action_with_drop = activation_function
        self.activation_cv_dropout = activation_function
    return action_with_drop
def get_output_for(self, input, **kwargs):
    """Highway-layer output: t * g + (1 - t) * input, where t is a sigmoid
    transform gate and g the nonlinear transform of the (flattened) input.
    """
    leading = self.num_leading_axes
    if leading < 0:
        leading += input.ndim
    if input.ndim > leading + 1:
        # flatten trailing axes (into (n+1)-tensor for num_leading_axes=n)
        input = input.flatten(leading + 1)
    transform_gate = lasagne.nonlinearities.sigmoid(T.dot(input, self.W_t) + self.b_t)
    transformed = self.nonlinearity(T.dot(input, self.W_h) + self.b_h)
    carry = T.mul(1 - transform_gate, input)
    return T.mul(transform_gate, transformed) + carry
def sequence_iteration(self, output, mask, use_dropout=0, dropout_value=0.5):
    """Softmax output layer applied over a sequence, masking padded steps
    (masked positions are replaced by the small constant 1e-6).
    """
    dot_product = T.dot(output, self.t_w_out)
    net_o = T.add(dot_product, self.t_b_out)
    # softmax along the feature axis (axis=2), computed manually
    ex_net = T.exp(net_o)
    sum_net = T.sum(ex_net, axis=2, keepdims=True)
    softmax_o = ex_net / sum_net
    mask = T.addbroadcast(mask, 2)  # to do nesseccary?
    output = T.mul(mask, softmax_o) + T.mul((1. - mask), 1e-6)
    return output  # result
def beta_div(X, W, H, beta):
    """Compute the beta divergence D(X|WH).

    Special cases: beta=2 is (half) squared Euclidean distance, beta=0 is
    Itakura-Saito, beta=1 is generalized Kullback-Leibler; any other beta
    uses the generic beta-divergence formula.
    """
    approx = T.dot(H, W)
    euclidean = T.sum(1. / 2 * T.power(X - approx, 2))
    itakura_saito = T.sum(X / approx - T.log(X / approx) - 1)
    kullback_leibler = T.sum(T.mul(X, (T.log(X) - T.log(approx))) + approx - X)
    generic = T.sum(1. / (beta * (beta - 1.)) *
                    (T.power(X, beta) +
                     (beta - 1.) * T.power(approx, beta) -
                     beta * T.power(T.mul(X, approx), (beta - 1))))
    return ifelse(T.eq(beta, 2), euclidean,
                  ifelse(T.eq(beta, 0), itakura_saito,
                         ifelse(T.eq(beta, 1), kullback_leibler, generic)))
def NLL(self, y, useMeanOnly=False, sampleWeight=None):
    """Negative log-likelihood of targets y under a Gaussian model.

    Univariate branch (self.n_variables == 1) uses self.sigma_sqr when
    available; the multivariate branch additionally uses the correlation
    self.corr in a bivariate-Gaussian-style formula. useMeanOnly (or
    missing parameters) falls back to unit variance / zero correlation.
    Returns the mean NLL, or the sampleWeight-weighted mean if given.

    Fixes vs. original:
    - sig_sqr fallback used T.ones_like(e) before `e` was assigned
      (UnboundLocalError at graph-build time); now uses err_sqr, which
      has the required per-variable shape.
    - rho was taken from T.corr (no such attribute in theano.tensor);
      the correlation lives on self, matching the sibling NLL method.
    """
    assert (y.ndim == 2)
    pi = numpy.pi
    if self.n_variables == 1:
        e = T.sqr(y - self.mean) / 2.
        nll = numpy.log(2 * pi) / 2.
        if useMeanOnly or (self.sigma_sqr is None):
            nll = nll + e
        else:
            e = e / self.sigma_sqr
            nll = nll + e + T.log(self.sigma_sqr) / 2.
    else:
        err = y - self.mean
        err_sqr = T.sqr(err)
        if useMeanOnly or (self.sigma_sqr is None):
            # BUG FIX: was T.ones_like(e) with `e` not yet defined
            sig_sqr = T.ones_like(err_sqr)
        else:
            sig_sqr = self.sigma_sqr
        # normalization: log-variance and 2*pi constant per variable
        nll = T.sum(T.log(sig_sqr) + numpy.log(2 * pi), axis=1, keepdims=True) / 2.
        e = T.sum(err_sqr / sig_sqr, axis=1, keepdims=True)
        sig = T.sqrt(sig_sqr)
        f = T.prod(err / sig, axis=1, keepdims=True)
        if useMeanOnly or (self.corr is None):
            rho = T.zeros_like(e)
        else:
            # BUG FIX: was T.corr; the correlation parameter is on self
            rho = self.corr
        g = e - T.mul(rho, f) * 2.
        rho_sqr = T.sqr(rho)
        h = g / (2 * (1 - rho_sqr))
        nll = nll + h + T.log(1 - rho_sqr) / 2.
    if sampleWeight is None:
        return T.mean(nll)
    return T.sum(T.mul(nll, sampleWeight)) / T.sum(sampleWeight)
def __init():
    """Compile the cosine-affinity function and stash it in globals().

    affinity(i, j) = (cos(row_i, row_j) + 1) / 2, mapping cosine
    similarity into [0, 1]. The compiled function is stored under the
    key '__affinty_fun' (sic) for module-level lazy initialization.
    """
    dataset = T.matrix("dataset", dtype=config.globalFloatType())
    trans_dataset = T.transpose(dataset)
    # raw pairwise dot products
    dot_mul = T.dot(dataset, trans_dataset)
    # per-row L2 norms
    l2 = T.sqrt(T.sum(T.square(dataset), axis=1))
    # p =printing.Print("l2")
    # l2 = p(l2)
    l2_inv2 = T.inv(l2).dimshuffle(['x', 0])
    # p =printing.Print("l2_inv2")
    # l2_inv2 = p(l2_inv2)
    l2_inv1 = T.transpose(l2_inv2)
    # p =printing.Print("l2_inv1")
    # l2_inv1 = p(l2_inv1)
    # outer product of inverse norms: 1 / (||x_i|| * ||x_j||)
    l2_inv = T.dot(l2_inv1, l2_inv2)
    # p =printing.Print("l2_inv")
    # l2_inv = p(l2_inv)
    affinty = (T.mul(dot_mul, l2_inv) + 1) / 2
    globals()['__affinty_fun'] = theano.function(
        [dataset],
        [affinty],
        allow_input_downcast=True
    )
def __call__(self, X):
    """RBF (Gaussian) kernel with the median-distance bandwidth heuristic,
    plus its derivative term — the SVGD kernel computation.

    Returns (Kxy, dxkxy) where Kxy is the kernel matrix and dxkxy the
    gradient term sum_j grad_x k(x_j, x).
    """
    XY = X.dot(X.T)
    x2 = tt.sum(X**2, axis=1).dimshuffle(0, 'x')
    X2e = tt.repeat(x2, X.shape[0], axis=1)
    # pairwise squared distances via ||x||^2 + ||y||^2 - 2 x.y
    H = X2e + X2e.T - 2. * XY
    V = tt.sort(H.flatten())
    length = V.shape[0]
    # median distance
    m = tt.switch(
        tt.eq((length % 2), 0),
        # if even vector
        tt.mean(V[((length // 2) - 1):((length // 2) + 1)]),
        # if odd vector
        V[length // 2])
    # bandwidth: h = median^2 / log(n + 1) (here m already holds squared distances)
    h = .5 * m / tt.log(floatX(H.shape[0]) + floatX(1))
    # RBF
    Kxy = tt.exp(-H / h / 2.0)
    # Derivative
    dxkxy = -tt.dot(Kxy, X)
    sumkxy = tt.sum(Kxy, axis=-1, keepdims=True)
    dxkxy = tt.add(dxkxy, tt.mul(X, sumkxy)) / h
    return Kxy, dxkxy
def create_cost_function(self):
    """Build the symbolic cost expression for the configured cost type.

    Only cross-entropy is handled here; note that self.cost_function is
    overwritten in place, changing it from an enum value to a tensor.
    NOTE(review): the sum is over axis=0 (batch axis) before the mean —
    confirm this matches the layout of self.t / self.feedforward.
    """
    cost_function = None
    if self.cost_function == Cost_function.cross_entropy:
        cost_function = -T.mean(T.sum(T.mul(self.t[self.idx], T.log(self.feedforward)), axis=0))
    self.cost_function = cost_function
def output_error(self, input_sequence, true_output, mask):
    """Masked mean categorical cross-entropy over a sequence.

    The per-step losses are broadcast against the mask so padded steps
    contribute nothing; the sum is normalized by the number of real steps.
    """
    step_losses = T.nnet.categorical_crossentropy(input_sequence, true_output)
    masked_losses = T.mul(step_losses.dimshuffle(0, 1, 'x'), mask)
    return T.sum(masked_losses) / T.sum(mask)
def t_forward_step(self, mask, rzup_in_sig, h_pre, b_rzup, u_rz, u_up, ln_b1, ln_s1, ln_b2, ln_s2, ln_b3, ln_s3, t_n_out):
    """One scan step of a layer-normalized GRU.

    rzup_in_sig is the already-projected input for [reset | update |
    candidate] blocks of width t_n_out; u_rz/u_up are the recurrent
    weights for the gates and candidate. mask carries the previous
    hidden state through padded timesteps. Returns the new hidden state.
    """
    signal_act = self.activation
    gate_act = self.sigmoid()
    # layer-norm the input projection, then add the bias
    rzup_in_sig_ln = self.ln(rzup_in_sig, ln_b1, ln_s1)
    rzup_b_in_sig_ln = T.add(rzup_in_sig_ln, b_rzup)
    # recurrent projection for reset/update gates, layer-normalized
    preact = T.dot(h_pre, u_rz)
    preact_ln = self.ln(preact, ln_b2, ln_s2)
    # reset and update gates
    r = gate_act(T.add(rzup_b_in_sig_ln[:, 0:t_n_out], preact_ln[:, 0:t_n_out]))
    z = gate_act(T.add(rzup_b_in_sig_ln[:, t_n_out:2 * t_n_out], preact_ln[:, t_n_out:2 * t_n_out]))
    # candidate recurrent projection, layer-normalized, gated by r
    preactx = T.dot(h_pre, u_up)
    preactx_ln = self.ln(preactx, ln_b3, ln_s3)
    h_pre_r_ln = T.mul(preactx_ln, r)
    h_update = signal_act(T.add(rzup_b_in_sig_ln[:, 2*t_n_out:3*t_n_out], h_pre_r_ln))
    # interpolate between candidate and previous state via update gate
    h_new = T.add((1.-z) * h_update, z * h_pre)
    mask = T.addbroadcast(mask, 1)
    out_sig = T.add(mask * h_new, (1. - mask) * h_pre)
    return out_sig
def output_error(self, input_sequence, true_output, mask):
    """Mean cross-entropy over the valid (unmasked) positions of a
    sequence: padded steps are zeroed by the mask and excluded from the
    normalizer.
    """
    ce = T.nnet.categorical_crossentropy(input_sequence, true_output)
    weighted = T.mul(ce.dimshuffle(0, 1, 'x'), mask)
    total = T.sum(weighted)
    count = T.sum(mask)
    return total / count
def rbf_kernel(X):
    """RBF kernel matrix with the median-distance bandwidth heuristic and
    its SVGD derivative term.

    Returns (kxy, dxkxy): kernel matrix and sum_j grad_x k(x_j, x).
    """
    XY = T.dot(X, X.T)
    x2 = T.sum(X**2, axis=1).dimshuffle(0, 'x')
    X2e = T.repeat(x2, X.shape[0], axis=1)
    # pairwise squared distances
    H = X2e + X2e.T - 2. * XY
    V = H.flatten()
    # median distance
    h = T.switch(T.eq((V.shape[0] % 2), 0),
                 # if even vector
                 T.mean(T.sort(V)[((V.shape[0] // 2) - 1): ((V.shape[0] // 2) + 1)]),
                 # if odd vector
                 T.sort(V)[V.shape[0] // 2])
    # bandwidth heuristic: h = median^2 / log(n + 1)
    h = .5 * h / T.log(T.cast(H.shape[0] + 1., theano.config.floatX))
    # compute the rbf kernel
    kxy = T.exp(-H / h / 2.0)
    # derivative term for SVGD
    dxkxy = -T.dot(kxy, X)
    sumkxy = T.sum(kxy, axis=1).dimshuffle(0, 'x')
    dxkxy = T.add(dxkxy, T.mul(X, sumkxy)) / h
    return kxy, dxkxy
def __init__(self, mu, sigma, random_state=None): super(MultivariateNormal, self).__init__(mu=mu, sigma=sigma, random_state=random_state, optimizer=None) # XXX: The SDP-ness of sigma should be check upon changes # ndim self.ndim_ = self.mu.shape[0] self.make_(self.ndim_, "ndim_func_", args=[]) # pdf L = linalg.cholesky(self.sigma) sigma_det = linalg.det(self.sigma) # XXX: compute from L instead sigma_inv = linalg.matrix_inverse(self.sigma) # XXX: idem self.pdf_ = ( (1. / T.sqrt((2. * np.pi) ** self.ndim_ * T.abs_(sigma_det))) * T.exp(-0.5 * T.sum(T.mul(T.dot(self.X - self.mu, sigma_inv), self.X - self.mu), axis=1))).ravel() self.make_(self.pdf_, "pdf") # -log pdf self.nnlf_ = -T.log(self.pdf_) # XXX: for sure this can be better self.make_(self.nnlf_, "nnlf") # self.rvs_ self.make_(T.dot(L, self.X.T).T + self.mu, "rvs_func_")
def ctc_loss(y_true, y_pred):
    """CTC loss via the forward-backward algorithm.

    NOTE(review): `epsilon` and `recurrence_relation` are free names from
    the enclosing module — confirm their definitions. The label length n
    is recovered from the padding convention 2*(T - #blanks) + 1.
    """
    def path_probs(predict, y_sym):
        # forward recursion over allowed label transitions (rr)
        pred_y = predict[:, y_sym]
        rr = recurrence_relation(y_sym.shape[0])

        def step(p_curr, p_prev, rr):
            return p_curr * T.dot(p_prev, rr)

        probabilities, _ = theano.scan(
            step,
            sequences=[pred_y],
            outputs_info=[T.eye(y_sym.shape[0])[0]],
            non_sequences=[rr]
        )
        return probabilities

    y_sym_a = T.argmax(y_true, axis=-1)
    # effective padded label length
    n = T.cast(T.add(T.mul(2, y_true.shape[0] - T.sum(y_true[:, -1])), 1), 'int16')
    y_sym = T.cast(y_sym_a[:n], 'int16')
    # clip to avoid log(0) / division by zero
    y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon)
    forward_probs = path_probs(y_pred, y_sym)
    backward_probs = path_probs(y_pred[::-1], y_sym[::-1])[::-1, ::-1]
    # combine forward and backward passes; divide out the doubly-counted emission
    probs = forward_probs * backward_probs / y_pred[:, y_sym]
    total_probs = T.sum(probs)
    #total_probs=T.sum(forward_probs[-1,-2:])
    return -T.log(total_probs)
def negative_log_likelihood(self, y,misClassCost): """Return the mean of the negative log-likelihood of the prediction of this model under a given target distribution. .. math:: \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ \ell (\theta=\{W,b\}, \mathcal{D}) :type y: theano.tensor.TensorType :param y: corresponds to a vector that gives for each example the correct label Note: we use the mean instead of the sum so that the learning rate is less dependent on the batch size """ # start-snippet-2 # y.shape[0] is (symbolically) the number of rows in y, i.e., # number of examples (call it n) in the minibatch # T.arange(y.shape[0]) is a symbolic vector which will contain # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of # Log-Probabilities (call it LP) with one row per example and # one column per class LP[T.arange(y.shape[0]),y] is a vector # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is # the mean (across minibatch examples) of the elements in v, # i.e., the mean log-likelihood across the minibatch. return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y] + T.mul(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y],misClassCost) )
def recon_from(self, s):
    """Gaussian-RBF reconstruction of inputs from codes s.

    Creates the center matrix self.mu (n_in x n_out) and per-input
    precision matrices self.D (initialized to identities), registers them
    as parameters, and sets self.recon[b, i] = exp(-(s_b - mu_i)^T D_i
    (s_b - mu_i)) via the batched einsum-style computation below.
    """
    # s = (bs, n_out)
    self.mu = theano.shared(rng.uniform(0, 1, size=(self.n_in, self.n_out)), name='mu')
    D = numpy.zeros((self.n_in, self.n_out, self.n_out))
    for i in range(self.n_in):
        numpy.fill_diagonal(D[i], 1)
    self.D = theano.shared(D, name='D')
    self.params += [self.mu, self.D]
    """ r = []
    for i in range(self.n_in):
        k = (s-self.mu[i].reshape((1,self.n_out)))
        l = T.dot(k, self.D[i])
        # v = T.exp(-T.dot(l, k.T)).diagonal()
        # but the dot is expensive for nothing since we're only taking the diagonal
        v = T.exp(-T.mul(l, k).sum(axis=1))
        r.append(v)
    recon = T.as_tensor_variable(r).T"""
    # K[i, b, :] = s_b - mu_i
    K = s.dimshuffle('x', 0, 1) - self.mu.dimshuffle(0, 'x', 1)
    #numpy.sum(a[:,:,:,numpy.newaxis]*b[:,numpy.newaxis,:,:],axis=-2)
    # L[i, b, :] = K[i, b, :] . D[i] (batched matrix-vector product)
    L = T.sum(K.dimshuffle(0, 1, 2, 'x')*self.D.dimshuffle(0, 'x', 1, 2), axis=-2)
    V = T.exp(-T.mul(L, K).sum(axis=2))
    R = V.T
    self.recon = R
def loss_t(self):
    """Normalized squared reconstruction error of the ratings/targets.

    # equiv to sum_i || Xi^T U g( U^T Xi r_i) - r_i ||^2
    # X is [d,m,n]
    # I is [m,d]
    """
    I = self.decode_t()
    # per-sample predicted values: contract decoded features against X
    Rhat = TT.sum(TT.mul(self.X_t.T, I), axis=2).T
    # mean squared error against targets R_t, with a reconstruction bias
    return TT.sum((Rhat - self.R_t - self.bias_recon_t) ** 2) / (self.m * self.n)