def output(self, X):
    # TODO: activation for ReLU.
    if self.activation == 'sigmoid':
        return 1 / (1 + T.exp(-T.dot(X, self.w) - self.b))
    elif self.activation == 'tanh':
        return T.tanh(T.dot(X, self.w) + self.b)
def forwardPass(self, x):
    # Sample from the visible layer
    # Get the mask that is used for the visible units
    if self.visibleDropout in [1.0, 1]:
        currentLayerValues = x
    else:
        dropoutMask = self.theanoRng.binomial(n=1, p=self.visibleDropout,
                                              size=x.shape,
                                              dtype=theanoFloat)
        currentLayerValues = x * dropoutMask

    for stage in xrange(self.nrWeights - 1):
        w = self.weights[stage]
        b = self.biases[stage]
        linearSum = T.dot(currentLayerValues, w) + b
        # dropout: give the next layer only some of the units from this layer
        if self.hiddenDropout in [1.0, 1]:
            currentLayerValues = self.activationFunction.deterministic(linearSum)
        else:
            dropoutMaskHidden = self.theanoRng.binomial(n=1, p=self.hiddenDropout,
                                                        size=linearSum.shape,
                                                        dtype=theanoFloat)
            currentLayerValues = dropoutMaskHidden * self.activationFunction.deterministic(linearSum)

    # Last layer operations, no dropout in the output
    w = self.weights[self.nrWeights - 1]
    b = self.biases[self.nrWeights - 1]
    linearSum = T.dot(currentLayerValues, w) + b
    currentLayerValues = self.classificationActivationFunction.deterministic(linearSum)

    return currentLayerValues
def _step(x_, h_, c_, pred_, prob_):
    h_a = []
    c_a = []
    for it in range(self.n_levels):
        preact = T.dot(h_[it], self.U[it])
        preact += T.dot(x_, self.W[it]) + self.b[it]

        i = T.nnet.sigmoid(_slice(preact, 0, self.n_dim))
        f = T.nnet.sigmoid(_slice(preact, 1, self.n_dim))
        o = T.nnet.sigmoid(_slice(preact, 2, self.n_dim))
        c = T.tanh(_slice(preact, 3, self.n_dim))

        c = f * c_[it] + i * c
        h = o * T.tanh(c)

        h_a.append(h)
        c_a.append(c)
        x_ = h

    q = T.dot(h, self.L) + self.b0
    prob = T.nnet.softmax(q)
    pred = T.argmax(prob, axis=1)

    return T.stack(h_a).squeeze(), T.stack(c_a).squeeze(), pred, prob
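# For reference, a sketch of the standard LSTM update that the gate code above
# implements for each level (i = input gate, f = forget gate, o = output gate):
#   i_t = sigmoid(W_i x_t + U_i h_{t-1} + b_i)
#   f_t = sigmoid(W_f x_t + U_f h_{t-1} + b_f)
#   o_t = sigmoid(W_o x_t + U_o h_{t-1} + b_o)
#   c~_t = tanh(W_c x_t + U_c h_{t-1} + b_c)
#   c_t = f_t * c_{t-1} + i_t * c~_t
#   h_t = o_t * tanh(c_t)
# Here all four gates are packed into one `preact` matrix and separated with
# `_slice`; that is only a layout choice, not a different model.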
def __init__(self, input=tensor.dvector('input'),
             target=tensor.dvector('target'),
             n_input=1, n_hidden=1, n_output=1, lr=1e-3, **kw):
    super(NNet, self).__init__(**kw)

    self.input = input
    self.target = target
    self.lr = shared(lr, 'learning_rate')
    self.w1 = shared(numpy.zeros((n_hidden, n_input)), 'w1')
    self.w2 = shared(numpy.zeros((n_output, n_hidden)), 'w2')
    # print self.lr.type

    self.hidden = sigmoid(tensor.dot(self.w1, self.input))
    self.output = tensor.dot(self.w2, self.hidden)
    self.cost = tensor.sum((self.output - self.target)**2)

    self.sgd_updates = {
        self.w1: self.w1 - self.lr * tensor.grad(self.cost, self.w1),
        self.w2: self.w2 - self.lr * tensor.grad(self.cost, self.w2)}

    self.sgd_step = pfunc(
        params=[self.input, self.target],
        outputs=[self.output, self.cost],
        updates=self.sgd_updates)

    self.compute_output = pfunc([self.input], self.output)
    self.output_from_hidden = pfunc([self.hidden], self.output)
def __init__(self, input, nrLayers, weights, biases,
             visibleDropout, hiddenDropout,
             activationFunction, classificationActivationFunction):
    self.input = input

    self.classificationWeights = classificationWeightsFromTestWeights(
        weights,
        visibleDropout=visibleDropout,
        hiddenDropout=hiddenDropout)

    nrWeights = nrLayers - 1

    currentLayerValues = input

    for stage in xrange(nrWeights - 1):
        w = self.classificationWeights[stage]
        b = biases[stage]
        linearSum = T.dot(currentLayerValues, w) + b
        currentLayerValues = activationFunction.deterministic(linearSum)

    self.lastHiddenActivations = currentLayerValues

    w = self.classificationWeights[nrWeights - 1]
    b = biases[nrWeights - 1]
    linearSum = T.dot(currentLayerValues, w) + b
    currentLayerValues = classificationActivationFunction.deterministic(linearSum)

    self.output = currentLayerValues
def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
    self.inpt = inpt.reshape((mini_batch_size, self.n_in))
    self.output = softmax((1 - self.p_dropout) * T.dot(self.inpt, self.w) + self.b)
    self.y_out = T.argmax(self.output, axis=1)
    self.inpt_dropout = dropout_layer(
        inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout)
    self.output_dropout = softmax(T.dot(self.inpt_dropout, self.w) + self.b)
def generate(self, h_, c_, x_):
    h_a = []
    c_a = []
    for it in range(self.n_levels):
        preact = T.dot(x_, self.W[it])
        preact += T.dot(h_[it], self.U[it]) + self.b[it]

        i = T.nnet.sigmoid(self.slice(preact, 0, self.n_dim))
        f = T.nnet.sigmoid(self.slice(preact, 1, self.n_dim))
        o = T.nnet.sigmoid(self.slice(preact, 2, self.n_dim))
        c = T.tanh(self.slice(preact, 3, self.n_dim))

        c = f * c_[it] + i * c
        h = o * T.tanh(c)

        h_a.append(h)
        c_a.append(c)
        x_ = h

    q = T.dot(h, self.L) + self.b0
    # mask = T.concatenate([T.alloc(np_floatX(1.), q.shape[0] - 1),
    #                       T.alloc(np_floatX(0.), 1)])
    prob = T.nnet.softmax(q / 1)

    return prob, T.stack(h_a).squeeze(), T.stack(c_a)[0].squeeze()
def _step(self, x, mask, M_tm1, wr_tm1, ww_tm1, *args): # read if self.inner_rnn == 'lstm': h_tm1 = args[0:2][::-1] # (cell_tm1, h_tm1) else: h_tm1 = args[0:1] # (h_tm1, ) k_read, beta_read, g_read, gamma_read, s_read = self._get_controller_output( h_tm1[-1], self.W_k_read, self.b_k_read, self.W_c_read, self.b_c_read, self.W_s_read, self.b_s_read) wc_read = self._get_content_w(beta_read, k_read, M_tm1) wr_t = self._get_location_w(g_read, s_read, self.C, gamma_read, wc_read, wr_tm1, mask) M_read = self._read(wr_t, M_tm1) # update controller h_t = _update_controller(self, x, h_tm1, M_read, mask) # write k_write, beta_write, g_write, gamma_write, s_write = self._get_controller_output( h_t[-1], self.W_k_write, self.b_k_write, self.W_c_write, self.b_c_write, self.W_s_write, self.b_s_write) wc_write = self._get_content_w(beta_write, k_write, M_tm1) ww_t = self._get_location_w(g_write, s_write, self.C, gamma_write, wc_write, ww_tm1, mask) e = T.nnet.sigmoid(T.dot(h_t[-1], self.W_e) + self.b_e) a = T.tanh(T.dot(h_t[-1], self.W_a) + self.b_a) M_t = self._write(ww_t, e, a, M_tm1, mask) return (M_t, wr_t, ww_t) + h_t
def free_energy_at_beta(model, samples, beta, pa_bias=None,
                        marginalize_odd=True):
    """
    Computes the free-energy of the sample `samples`, for model p_k.

    Inputs
    ------
    samples: list of theano.shared
        Shared variables representing a sample of each layer.
    beta: T.scalar
        Inverse temperature beta_k of model p_k at which to measure the
        free-energy.

    Returns
    -------
    Symbolic variable, free-energy of sample `samples`, at inv. temp beta.
    """
    keep_idx = numpy.arange(not marginalize_odd, model.depth, 2)
    marg_idx = numpy.arange(marginalize_odd, model.depth, 2)

    # contribution of biases
    fe = 0.
    for i in keep_idx:
        fe -= T.dot(samples[i], model.bias[i]) * beta

    # contribution of the marginalized (analytically summed-out) layers
    for i in marg_idx:
        from_im1 = T.dot(samples[i-1], model.W[i]) if i >= 1 else 0.
        from_ip1 = T.dot(samples[i+1], model.W[i+1].T) if i < model.depth-1 else 0.
        net_input = (from_im1 + from_ip1 + model.bias[i]) * beta
        fe -= T.sum(T.nnet.softplus(net_input), axis=1)

    fe -= T.dot(samples[not marginalize_odd], pa_bias) * (1. - beta)

    return fe
def recurrent_step(self, x_c_t, x_i_t, x_f_t, x_o_t, h_tm1, c_tm1,
                   U_h_c, U_h_i, U_h_f, U_h_o):
    """
    Performs one computation step over time.
    """
    # new memory content c_tilde
    c_tilde = self.hidden_activation_func(x_c_t + T.dot(h_tm1, U_h_c))
    # input gate
    i_t = self.inner_hidden_activation_func(x_i_t + T.dot(h_tm1, U_h_i))
    # forget gate
    f_t = self.inner_hidden_activation_func(x_f_t + T.dot(h_tm1, U_h_f))
    # new memory content
    c_t = f_t*c_tm1 + i_t*c_tilde
    # output gate
    o_t = self.inner_hidden_activation_func(x_o_t + T.dot(h_tm1, U_h_o))
    # new hiddens
    h_t = o_t*self.hidden_activation_func(c_t)
    # return the hiddens and memory content
    return h_t, c_t
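# A minimal sketch of how recurrent_step could be driven by theano.scan.
# `layer`, the precomputed input projections x_c_seq ... x_o_seq (time on the
# leading axis) and the initial states h0/c0 are assumptions here, not part of
# the original snippet.
(h_seq, c_seq), scan_updates = theano.scan(
    fn=layer.recurrent_step,
    sequences=[x_c_seq, x_i_seq, x_f_seq, x_o_seq],
    outputs_info=[h0, c0],
    non_sequences=[U_h_c, U_h_i, U_h_f, U_h_o])
# h_seq[-1] is then the hidden state after the last time step.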
def __init__(self, rng, input, n_in, n_out, n_component):
    self.input = input

    W_value = rng.normal(0.0, 1.0/numpy.sqrt(n_in), size=(n_in, n_out*n_component))
    self.W_mu = theano.shared(value=numpy.asarray(W_value, dtype=theano.config.floatX),
                              name='W_mu', borrow=True)
    self.W_sigma = theano.shared(value=numpy.asarray(W_value.copy(), dtype=theano.config.floatX),
                                 name='W_sigma', borrow=True)

    W_mix_value = rng.normal(0.0, 1.0/numpy.sqrt(n_in), size=(n_in, n_component))
    self.W_mix = theano.shared(value=numpy.asarray(W_mix_value, dtype=theano.config.floatX),
                               name='W_mix', borrow=True)

    self.mu = T.dot(self.input, self.W_mu)  # assume linear output for mean vectors
    self.sigma = T.nnet.softplus(T.dot(self.input, self.W_sigma))  # + 0.0001
    #self.sigma = T.exp(T.dot(self.input, self.W_sigma))  # + 0.0001
    self.mix = T.nnet.softmax(T.dot(self.input, self.W_mix))

    self.delta_W_mu = theano.shared(value=numpy.zeros((n_in, n_out*n_component),
                                                      dtype=theano.config.floatX),
                                    name='delta_W_mu')
    self.delta_W_sigma = theano.shared(value=numpy.zeros((n_in, n_out*n_component),
                                                         dtype=theano.config.floatX),
                                       name='delta_W_sigma')
    self.delta_W_mix = theano.shared(value=numpy.zeros((n_in, n_component),
                                                       dtype=theano.config.floatX),
                                     name='delta_W_mix')

    self.params = [self.W_mu, self.W_sigma, self.W_mix]
    self.delta_params = [self.delta_W_mu, self.delta_W_sigma, self.delta_W_mix]
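# For context (not in the original snippet): with mu, sigma and mix as computed
# above, a mixture density network models the target density as a mixture of
# n_component Gaussians,
#   p(t | x) = sum_k mix_k(x) * N(t | mu_k(x), sigma_k(x)^2),
# and is trained by minimizing the negative log of this likelihood. The softmax
# keeps the mixture weights positive and summing to one, and the softplus keeps
# the standard deviations positive.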
def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma, cov_total, mean_total): sigma2 = tt.square(sigma) Kuu = cov_total(Xu) Kuf = cov_total(Xu, X) Luu = cholesky(stabilize(Kuu)) A = solve_lower(Luu, Kuf) Qffd = tt.sum(A * A, 0) if self.approx == "FITC": Kffd = cov_total(X, diag=True) Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2 else: # VFE or DTC Lamd = tt.ones_like(Qffd) * sigma2 A_l = A / Lamd L_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_l, tt.transpose(A))) r = y - mean_total(X) r_l = r / Lamd c = solve_lower(L_B, tt.dot(A, r_l)) Kus = self.cov_func(Xu, Xnew) As = solve_lower(Luu, Kus) mu = self.mean_func(Xnew) + tt.dot(tt.transpose(As), solve_upper(tt.transpose(L_B), c)) C = solve_lower(L_B, As) if diag: Kss = self.cov_func(Xnew, diag=True) var = Kss - tt.sum(tt.square(As), 0) + tt.sum(tt.square(C), 0) if pred_noise: var += sigma2 return mu, var else: cov = (self.cov_func(Xnew) - tt.dot(tt.transpose(As), As) + tt.dot(tt.transpose(C), C)) if pred_noise: cov += sigma2 * tt.identity_like(cov) return mu, stabilize(cov)
def compileFunctions(self, x_image_global, examples, ib, B, K, corrupt):
    if x_image_global is None:
        x_image_global = self.x

    if corrupt == 0.0:
        self.x_c = self.x
    else:
        self.x_c = self.theano_rng.binomial(
            size=self.x.shape, n=1, p=1-corrupt,
            dtype=theano.config.floatX) * self.x

    self.h = self.g(T.dot(self.x_c, self.W_hl) + self.b_hl)
    self.x_r = self.o(T.dot(self.h, self.W_ol) + self.b_ol)

    self.params = [self.W_hl, self.b_hl, self.b_ol]

    self.cost = \
        (- T.sum(
            self.x * T.log(self.x_r) + (1 - self.x) * T.log(1 - self.x_r),
            axis=(0, 1)))

    gparams = T.grad(self.cost, self.params)
    updates = [
        (param, param - K * gparam)
        for param, gparam in zip(self.params, gparams)
    ]

    fun_train = theano.function(
        inputs=[ib],
        outputs=(self.cost, self.x_r, self.x_c),
        updates=updates,
        givens={
            x_image_global: examples[ib*B: (ib+1)*B]
        }
    )
    return fun_train
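# A minimal usage sketch (assumed, not from the original): `ae` is an instance
# of the autoencoder class, `examples` a shared dataset and `n_examples` its
# number of rows; `ib` is the symbolic minibatch index used in the `givens`
# above, so training just iterates over minibatch indices.
ib = T.lscalar('ib')
fun_train = ae.compileFunctions(None, examples, ib, B=100, K=0.01, corrupt=0.3)
for epoch in range(10):
    for batch_index in range(n_examples // 100):
        cost, x_r, x_c = fun_train(batch_index)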
def mlp(insize, hiddensize, outsize, transferfunc='tanh', outfunc='id'):
    P = util.ParameterSet(
        inweights=(insize, hiddensize),
        hiddenbias=hiddensize,
        outweights=(hiddensize, outsize),
        outbias=outsize)
    P.randomize(1e-4)

    inpt = T.matrix('inpt')
    hidden_in = T.dot(inpt, P.inweights)
    hidden_in += P.hiddenbias
    nonlinear = transfermap[transferfunc]
    hidden = nonlinear(hidden_in)
    output_in = T.dot(hidden, P.outweights)
    output_in += P.outbias
    output = transfermap[outfunc](output_in)

    exprs = {'inpt': inpt,
             'hidden-in': hidden_in,
             'hidden': hidden,
             'output-in': output_in,
             'output': output}
    return exprs, P
def _build_marginal_likelihood_logp(self, y, X, Xu, sigma):
    sigma2 = tt.square(sigma)
    Kuu = self.cov_func(Xu)
    Kuf = self.cov_func(Xu, X)
    Luu = cholesky(stabilize(Kuu))
    A = solve_lower(Luu, Kuf)
    Qffd = tt.sum(A * A, 0)
    if self.approx == "FITC":
        Kffd = self.cov_func(X, diag=True)
        Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2
        trace = 0.0
    elif self.approx == "VFE":
        Lamd = tt.ones_like(Qffd) * sigma2
        trace = ((1.0 / (2.0 * sigma2)) *
                 (tt.sum(self.cov_func(X, diag=True)) -
                  tt.sum(tt.sum(A * A, 0))))
    else:  # DTC
        Lamd = tt.ones_like(Qffd) * sigma2
        trace = 0.0
    A_l = A / Lamd
    L_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_l, tt.transpose(A)))
    r = y - self.mean_func(X)
    r_l = r / Lamd
    c = solve_lower(L_B, tt.dot(A, r_l))
    constant = 0.5 * X.shape[0] * tt.log(2.0 * np.pi)
    logdet = 0.5 * tt.sum(tt.log(Lamd)) + tt.sum(tt.log(tt.diag(L_B)))
    quadratic = 0.5 * (tt.dot(r, r_l) - tt.dot(c, c))
    return -1.0 * (constant + logdet + quadratic + trace)
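# For context (a reading of the code above, not text from the original source):
# with Q_ff = Kfu Kuu^{-1} Kuf and Lambda the diagonal correction (FITC uses
# diag(K_ff - Q_ff) + sigma^2, VFE and DTC use sigma^2 I), this returns the
# approximate log marginal likelihood
#   log N(y | m(X), Q_ff + Lambda)  -  trace_term,
# where the VFE trace term is (1 / (2 sigma^2)) * tr(K_ff - Q_ff). The
# log-determinant and quadratic form are evaluated through the Cholesky factors
# Luu and L_B rather than by forming the full N x N covariance.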
def __init__(self, rng, input, n_in, n_out, activation, W=None, b=None,
             use_bias=False):

    self.input = input
    self.activation = activation

    if W is None:
        if activation.func_name == "ReLU":
            W_values = numpy.asarray(0.01 * rng.standard_normal(size=(n_in, n_out)),
                                     dtype=theano.config.floatX)
        else:
            W_values = numpy.asarray(rng.uniform(low=-numpy.sqrt(6. / (n_in + n_out)),
                                                 high=numpy.sqrt(6. / (n_in + n_out)),
                                                 size=(n_in, n_out)),
                                     dtype=theano.config.floatX)
        W = theano.shared(value=W_values, name='W')

    if b is None:
        b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
        b = theano.shared(value=b_values, name='b')

    self.W = W
    self.b = b

    if use_bias:
        lin_output = T.dot(input, self.W) + self.b
    else:
        lin_output = T.dot(input, self.W)

    self.output = (lin_output if activation is None else activation(lin_output))

    # parameters of the model
    if use_bias:
        self.params = [self.W, self.b]
    else:
        self.params = [self.W]
def _construct_mom_stuff(self): """ Construct the cost function for the moment-matching "regularizer". """ a = self.mom_mix_rate dist_mean = self.GN.dist_mean dist_cov = self.GN.dist_cov # Get the generated sample observations for this batch, transformed # linearly into the desired space for moment matching... X_b = T.dot(self.GN.output, self.mom_match_proj) # Get their mean batch_mean = T.mean(X_b, axis=0) # Get the updated generator distribution mean new_mean = ((1.0 - a[0]) * self.GN.dist_mean) + (a[0] * batch_mean) # Use the mean to get the updated generator distribution covariance X_b_minus_mean = X_b - new_mean # Whelp, I guess this line needs the cast... for some reason... batch_cov = T.dot(X_b_minus_mean.T, X_b_minus_mean) / T.cast(X_b.shape[0], 'floatX') new_cov = ((1.0 - a[0]) * self.GN.dist_cov) + (a[0] * batch_cov) # Get the cost for deviation from the target distribution's moments mean_err = new_mean - self.target_mean cov_err = (new_cov - self.target_cov) mm_cost = self.mom_match_weight[0] * \ (T.sum(mean_err**2.0) + T.sum(cov_err**2.0)) # Construct the updates for the running estimates of the generator # distribution's first and second-order moments. mom_updates = OrderedDict() mom_updates[self.GN.dist_mean] = new_mean mom_updates[self.GN.dist_cov] = new_cov return [mm_cost, mom_updates]
def infer_H_hat_two_sided(self, H_hat_below, W_below, H_hat_above, W_above, b):

    bottom_up = T.dot(H_hat_below, W_below)
    top_down = T.dot(H_hat_above, W_above.T)

    total = bottom_up + top_down + b

    H_hat = T.nnet.sigmoid(total)

    return H_hat
def __init():
    dataset = T.matrix("dataset", dtype=config.globalFloatType())
    trans_dataset = T.transpose(dataset)
    dot_mul = T.dot(dataset, trans_dataset)
    l2 = T.sqrt(T.sum(T.square(dataset), axis=1))
    # p = printing.Print("l2")
    # l2 = p(l2)
    l2_inv2 = T.inv(l2).dimshuffle(['x', 0])
    # p = printing.Print("l2_inv2")
    # l2_inv2 = p(l2_inv2)
    l2_inv1 = T.transpose(l2_inv2)
    # p = printing.Print("l2_inv1")
    # l2_inv1 = p(l2_inv1)
    l2_inv = T.dot(l2_inv1, l2_inv2)
    # p = printing.Print("l2_inv")
    # l2_inv = p(l2_inv)
    affinty = (T.mul(dot_mul, l2_inv) + 1) / 2
    globals()['__affinty_fun'] = theano.function(
        [dataset],
        [affinty],
        allow_input_downcast=True
    )
def gibbs_vhv(self, v_sample):
    h_activation_score = T.dot(v_sample, self.W) + self.h_bias
    h_activation_probs, h_sample, h_updates = self.h.sample(h_activation_score)

    v_activation_score = T.dot(h_sample, self.W.T) + self.v_bias
    v_activation_probs, v_sample, v_updates = self.v.sample(v_activation_score)

    return h_activation_score, h_activation_probs, h_sample, \
        v_activation_score, v_activation_probs, v_sample
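# A minimal CD-k sketch built on gibbs_vhv (assumed usage, not from the original
# snippet; `rbm`, `v0` and `k` are placeholders). Each call extends the symbolic
# Gibbs chain by one v -> h -> v step, and the last returned value is the new
# visible sample.
v = v0
for _ in range(k):
    v = rbm.gibbs_vhv(v)[-1]  # keep only the new visible sample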
def eq_log_pstar_vgh(self, g_hat, h_hat, s1_hat, s0_hat, v): """ Computes the expectation (under the variational distribution q(g,h)=q(g)q(h)) of the log un-normalized probability, i.e. log p^*(g,h,s,v) :param g_hat: T.matrix of shape (batch_size, n_g) :param h_hat: T.matrix of shape (batch_size, n_h) :param v : T.matrix of shape (batch_size, n_v) """ from_v = self.from_v(v) from_h = self.from_h(h_hat) from_g = self.from_g(g_hat) # center variables cg_hat = g_hat - self.cg if self.flags['center_g'] else g_hat ch_hat = h_hat - self.ch if self.flags['center_h'] else h_hat # compute expectation of various s-quantities s_hat = self.s_hat(ch_hat, s1_hat, s0_hat) ss_hat = self.s_hat(ch_hat, s1_hat**2 + 1./self.alpha_prec, s0_hat**2 + 1./self.alpha_prec) lq = 0. lq += T.sum(from_v * self._mu * from_h, axis=1) lq += T.sum(from_v * s1_hat * from_h, axis=1) lq -= 0.5 * T.sum(self.alpha_prec * ss_hat, axis=1) lq -= T.sum(0.5 * self.lambd_prec * v**2, axis=1) lq += T.sum(self.alpha_prec * from_g * s_hat, axis=1) lq += T.dot(cg_hat, self.gbias) lq += T.dot(ch_hat, self.hbias) return T.mean(lq), [g_hat, h_hat, s_hat, ss_hat, s1_hat, s0_hat, v]
def _compile_func():
    beta = T.vector('beta')
    b = T.scalar('b')
    X = T.matrix('X')
    y = T.vector('y')
    C = T.scalar('C')
    params = [beta, b, X, y, C]
    cost = 0.5 * (T.dot(beta, beta) + b * b) + C * T.sum(
        T.nnet.softplus(
            -T.dot(T.diag(y), T.dot(X, beta) + b)
        )
    )
    # Function computing in one go the cost, its gradient
    # with regard to beta and with regard to the bias.
    cost_grad = theano.function(params, [
        cost,
        T.grad(cost, beta),
        T.grad(cost, b)
    ])

    # Function for computing element-wise sigmoid, used for
    # prediction.
    log_predict = theano.function(
        [beta, b, X],
        T.nnet.sigmoid(b + T.dot(X, beta)),
        on_unused_input='warn'
    )

    return (cost_grad, log_predict)
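# For context (a reading of the code above, not from the original source): with
# labels y_i in {-1, +1}, the compiled cost is the L2-regularized logistic loss
#   0.5 * (||beta||^2 + b^2) + C * sum_i log(1 + exp(-y_i * (x_i . beta + b))),
# since softplus(z) = log(1 + exp(z)) and T.diag(y) flips the sign of the margin
# for negative examples.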
def factors(self, w, x, z, A): if self.data == 'binary': def f_xi(zi, xi): pi = T.nnet.sigmoid(T.dot(w['wx'], zi) + T.dot(w['bx'], A)) # pi = p(X_i=1) logpxi = - T.nnet.binary_crossentropy(pi, xi).sum(axis=0, keepdims=True)# logpxi = log p(X_i=x_i) #logpxi = T.log(pi*xi+(1-pi)*(1-xi)).sum(axis=0, keepdims=True) return logpxi elif self.data == 'gaussian': def f_xi(zi, xi): x_mean = T.dot(w['wx'], zi) + T.dot(w['bx'], A) x_logvar = T.dot(2*w['logsdx'], A) return ap.logpdfs.normal2(xi, x_mean, x_logvar).sum(axis=0, keepdims=True) else: raise Exception() # Factors of X and Z logpx = 0 logpz = 0 sd = T.dot(T.exp(w['logsd']), A) for i in range(self.n_steps): if i == 0: logpz += logpdfs.standard_normal(z['z'+str(i)]).sum(axis=0, keepdims=True) if i > 0: mean = T.tanh(T.dot(w['wz'], z['z'+str(i-1)]) + T.dot(w['bz'], A)) logpz += logpdfs.normal(z['z'+str(i)], mean, sd).sum(axis=0, keepdims=True) logpxi = f_xi(z['z'+str(i)], x['x'+str(i)]) logpx += logpxi # joint() = logp(x,z,w) = logp(x|z) + logp(z) + logp(w) + C # This is a proper scalar function logpw = 0 for i in w: logpw += logpdfs.normal(w[i], 0, self.prior_sd).sum() # logp(w) return logpw, logpx, logpz, {}
def forward_prop(self, F, S):
    # We assume F is a m x n matrix (m rows, n columns)
    # and S is a 1 x o where o is our output size.
    # Our weight matrix (self.w) will be n x o.

    # Resize our bias to be appropriate size (batch_size x o)
    resized_bias = T.extra_ops.repeat(self.bh, F.shape[0], axis=0)

    # Combine our input data (F) with our weight matrix and bias.
    recurrent_gate = T.dot(F, self.wx)  # T.nnet.sigmoid(T.dot(F,self.wx))

    # Resize the state value to have batch_size x output_size shape
    weighted_state = T.dot(S, self.wh)
    hidden_state = T.extra_ops.repeat(weighted_state, F.shape[0], axis=0)

    # Combine the recurrent_gate with our resized hidden state
    # Should I use T.tanh on the hidden_state?
    output = T.nnet.sigmoid(recurrent_gate + hidden_state + resized_bias)

    # This will average the values across the batch_size and
    # return a vector of size 1 x o (output_size)
    new_state = T.mean(hidden_state, axis=0)
    new_state = new_state.reshape((1, self.y))

    # Cast the output
    output_cast = T.cast(output, theano.config.floatX)

    return new_state, output_cast
def model(X, w1, w2, w3, Max_Pooling_Shape, p_drop_conv, p_drop_hidden):
    l1 = T.flatten(
        dropout(max_pool_2d(rectify(conv2d(X, w1, border_mode="valid")),
                            Max_Pooling_Shape), p_drop_conv),
        outdim=2)
    l2 = dropout(rectify(T.dot(l1, w2)), p_drop_hidden)
    pyx = softmax(T.dot(l2, w3))
    return pyx
def __init__(self, rng, train_input, test_input, n_in, n_out):
    # self.input = input.flatten(2)

    self.W = theano.shared(
        value=numpy.asarray(
            rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ),
            dtype=theano.config.floatX
        ),
        name='W',
        borrow=True
    )
    self.b = theano.shared(
        value=numpy.zeros((n_out,), dtype=theano.config.floatX),
        name='b',
        borrow=True
    )

    p = 0.5
    tmp_output = T.nnet.relu(T.dot(train_input.flatten(2), self.W) + self.b)
    srng = RandomStreams(rng.randint(1234))
    mask = (srng.uniform(size=tmp_output.shape) < p) / p
    self.train_output = tmp_output * mask

    self.test_output = T.nnet.relu(T.dot(test_input.flatten(2), self.W) + self.b)

    self.params = [self.W, self.b]
def __init__(self, rng, input1, input2, n_in, n_out):
    self.input1 = input1.flatten(2)
    self.input2 = input2.flatten(2)

    self.W = theano.shared(
        value=numpy.asarray(
            rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ),
            dtype='float32'
        ),
        name='W',
        borrow=True
    )
    self.b = theano.shared(
        value=numpy.zeros((n_out,), dtype='float32'),
        name='b',
        borrow=True
    )

    lin_output1 = T.dot(self.input1, self.W) + self.b
    lin_output2 = T.dot(self.input2, self.W) + self.b
    self.output1 = T.nnet.relu(lin_output1)
    self.output2 = T.nnet.relu(lin_output2)
    self.similarity = self.similarity_func(self.output1, self.output2)

    self.params = [self.W, self.b]
def rbm_fe(rbm_params, v, b):
    (weights, visbias, hidbias) = rbm_params
    vis_term = b * tensor.dot(v, visbias)
    hid_act = b * (tensor.dot(v, weights) + hidbias)
    fe = -vis_term - tensor.sum(tensor.log(1 + tensor.exp(hid_act)),
                                axis=1)
    return fe
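# For context (a reading of the code above, not from the original source): at
# inverse temperature b this is the standard RBM free energy
#   F_b(v) = -b * v . c  -  sum_j log(1 + exp(b * (v W + d)_j)),
# with c the visible biases and d the hidden biases; the log(1 + exp(.)) term
# comes from summing out the binary hidden units analytically.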
def pred_t(input_voc_t, weight_tm1, memory_tm1): rawinput_t = self.embedding[input_voc_t] input_t = T.dot(rawinput_t,self.input_w) read_m = T.dot(weight_tm1, memory_tm1) read_t = T.dot(read_m,self.read_w) controller_input = activation(input_t+read_t+self.input_b) hid = self.controller.getY(controller_input) output = T.nnet.softmax(T.dot(hid, self.output_w)+self.output_b) result = T.switch(T.eq(input_voc_t, 0),T.argmax(output,axis=1), theano.shared(0)) #test = controller_input memory_inter = memory_tm1 weight_inter = weight_tm1 for head in self.heads: weight_inter, erase, add= head.emit_new_weight(hid, weight_inter, memory_inter) #write to memory weight_tdim = weight_inter.dimshuffle((0, 'x')) erase_dim = erase.dimshuffle(('x', 0)) add_dim = add.dimshuffle(('x', 0)) M_erased = memory_inter*(1-(weight_tdim*erase_dim)) memory_inter = M_erased+(weight_tdim*add_dim) #testing = weight_tm1 #testing2 = rawinput_t memory_t = memory_inter weight_t = weight_inter return weight_t, memory_t, output,result
def get_pred_prob(self):
    z1 = T.dot(self.input, self.W1) + self.b1
    a1 = T.tanh(z1)
    z2 = T.dot(a1, self.W2) + self.b2
    y_hat = T.nnet.softmax(z2)  # output probabilities
    return y_hat
def factors(self, x, z, A): v = self.v w = self.w ''' z is unused x['x'] is the data The names of dict z[...] may be confusing here: the latent variable z is not included in the dict z[...], but implicitely computed from epsilon and parameters in w. z is computed with g(.) from eps and variational parameters let logpx be the generative model density: log p(x|z) where z=g(.) let logpz be the prior of Z plus the entropy of q(z|x): logp(z) + H_q(z|x) So the lower bound L(x) = logpx + logpz let logpv and logpw be the (prior) density of the parameters ''' def f_softplus(x): return T.log(T.exp(x) + 1) # - np.log(2) def f_rectlin(x): return x * (x > 0) def f_rectlin2(x): return x * (x > 0) + 0.01 * x nonlinear = { 'tanh': T.tanh, 'sigmoid': T.nnet.sigmoid, 'softplus': f_softplus, 'rectlin': f_rectlin, 'rectlin2': f_rectlin2 } nonlinear_q = nonlinear[self.nonlinear_q] nonlinear_p = nonlinear[self.nonlinear_p] #rng = rng_curand.CURAND_RandomStreams(0) import theano.tensor.shared_randomstreams rng = theano.tensor.shared_randomstreams.RandomStreams(0) # Compute q(z|x,y) # # it seems that z = f(v['w0x'] * x + v['w0y'] * y + b) # hidden_q = [ nonlinear_q( T.dot(v['w0x'], x['x']) + T.dot(v['w0y'], x['y']) + T.dot(v['b0'], A)) ] for i in range(1, len(self.n_hidden_q)): hidden_q.append( nonlinear_q( T.dot(v['w' + str(i)], hidden_q[-1]) + T.dot(v['b' + str(i)], A))) q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A) if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg': q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot( v['logvar_b'], A) else: raise Exception() # function for distribution q(z|x) theanofunc = lazytheanofunc('warn', mode='FAST_RUN') self.dist_qz['z'] = theanofunc([x['x'], x['mean_prior'], x['y']] + [A], [q_mean, q_logvar]) # Compute virtual sample eps = rng.normal(size=q_mean.shape, dtype='float32') _z = q_mean + T.exp(0.5 * q_logvar) * eps # Compute log p(x|z) # # log p(x | z, y) # It seems that x = f((w0y * y + w0z * z) + b0) # hidden_p = [ nonlinear_p( T.dot(w['w0y'], x['y']) + T.dot(w['w0z'], _z) + T.dot(w['b0'], A)) ] for i in range(1, len(self.n_hidden_p)): hidden_p.append( nonlinear_p( T.dot(w['w' + str(i)], hidden_p[-1]) + T.dot(w['b' + str(i)], A))) if self.dropout: hidden_p[-1] *= 2. 
* (rng.uniform(size=hidden_p[-1].shape, dtype='float32') > .5) if self.type_px == 'bernoulli': p = T.nnet.sigmoid( T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)) _logpx = -T.nnet.binary_crossentropy(p, x['x']) self.dist_px['x'] = theanofunc([x['y'], _z] + [A], p) elif self.type_px == 'gaussian': x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A) x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot( w['out_logvar_b'], A) _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar) self.dist_px['x'] = theanofunc([x['y'], _z] + [A], [x_mean, x_logvar]) elif self.type_px == 'laplace': x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A) x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot( w['out_logvar_b'], A) _logpx = ap.logpdfs.laplace(x['x'], x_mean, x_logvar) self.dist_px['x'] = theanofunc([x['y'], _z] + [A], [x_mean, x_logvar]) else: raise Exception("") # Note: logpx is a row vector (one element per sample) logpx = T.dot(shared32(np.ones((1, self.n_x))), _logpx) # logpx = log p(x|z,w) # log p(y) (prior of y) #_logpy = w['logpy'] #if self.uniform_y: _logpy *= 0 #py_model = T.nnet.softmax(T.dot(_logpy, A).T).T #logpy = (- T.nnet.categorical_crossentropy(py_model.T, x['y'].T).T).reshape((1,-1)) #logpx += logpy #self.dist_px['y'] = theanofunc([A], py_model) # log p(z) (prior of z) # # E_q[log(p(z))] # if self.type_pz == 'gaussianmarg': logpz = -0.5 * (np.log(2 * np.pi) + ( (q_mean - x['mean_prior'])**2 + T.exp(q_logvar))).sum( axis=0, keepdims=True) elif self.type_pz == 'gaussian': logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True) elif self.type_pz == 'mog': pz = 0 for i in range(self.n_mixture): pz += T.exp( ap.logpdfs.normal2(_z, T.dot(w['mog_mean' + str(i)], A), T.dot(w['mog_logvar' + str(i)], A))) logpz = T.log(pz).sum(axis=0, keepdims=True) - self.n_z * np.log( float(self.n_mixture)) elif self.type_pz == 'laplace': logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True) elif self.type_pz == 'studentt': logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']), A)).sum(axis=0, keepdims=True) else: raise Exception("Unknown type_pz") # loq q(z|x) (entropy of z) # # E_q[-log(q)] # if self.type_qz == 'gaussianmarg': logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum( axis=0, keepdims=True) elif self.type_qz == 'gaussian': logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0, keepdims=True) else: raise Exception() # Note: logpv and logpw are a scalars def f_prior(_w, prior_sd=self.prior_sd): return ap.logpdfs.normal(_w, 0, prior_sd).sum() logpv = 0 logpv += f_prior(v['w0x']) logpv += f_prior(v['w0y']) for i in range(1, len(self.n_hidden_q)): logpv += f_prior(v['w' + str(i)]) logpv += f_prior(v['mean_w']) if self.type_qz in ['gaussian', 'gaussianmarg']: logpv += f_prior(v['logvar_w']) logpw = 0 logpw += f_prior(w['w0y']) logpw += f_prior(w['w0z']) for i in range(1, len(self.n_hidden_p)): logpw += f_prior(w['w' + str(i)]) logpw += f_prior(w['out_w']) if self.type_px in ['sigmoidgaussian', 'gaussian', 'laplace']: logpw += f_prior(w['out_logvar_w']) if self.type_pz == 'studentt': logpw += f_prior(w['logv']) #return logpv, logpw, logpx, logpz, logqz return logpx, logpz, logqz
def _get_fertility(self, c):
    fertility = T.nnet.sigmoid(T.dot(c, self.W_cov_fertility) + self.b_cov_fertility) * self.max_fertility
    fertility = fertility.reshape((c.shape[0], c.shape[1]))
    return fertility
net = build_model()

# loading pretrained weights
model = pickle.load(open('./blvc_googlenet.pkl'))
lasagne.layers.set_all_param_values(net['prob'], model['param values'])

googlenet_features = lasagne.layers.get_output(net['pool5/7x7_s1'], X)

# add a mlp on top of this
W = theano.shared(
    numpy.random.uniform(low=-0.1, high=0.1, size=(1024, 10)).astype(numpy.float32),
    'linear_weights')
b = theano.shared(numpy.zeros(10).astype(numpy.float32))
all_parameters = [W, b]

output = tensor.dot(googlenet_features, W) + b
pred = tensor.nnet.softmax(output)

loss = categorical_crossentropy(pred, targets).mean()
loss.name = 'loss'
loss_test = categorical_crossentropy(pred, targets).mean()
loss_test.name = 'loss_test'

error = tensor.neq(tensor.argmax(pred, axis=1),
                   tensor.argmax(targets, axis=1)).mean()
error.name = 'error'
error_test = tensor.neq(tensor.argmax(pred, axis=1),
                        tensor.argmax(targets, axis=1)).mean()
error_test.name = 'error_test'
def dot(x, y):
    return T.dot(x, y)
def build_read(M_curr, weight_curr):
    return T.dot(weight_curr, M_curr)
def rnn_step(_x_tm1, _h_tm1, _W_x, W_h):
    return T.nnet.sigmoid(T.dot(_x_tm1, _W_x.T) + T.dot(_h_tm1, W_h.T))
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 30 11:11:12 2017

Define function Examples for Matrix

@author: DELL
"""
import theano
import theano.tensor as T

a = T.matrix()
b = T.matrix()
c = a * b
d = T.dot(a, b)

F1 = theano.function([a, b], c)
F2 = theano.function([a, b], d)

A = [[1, 2], [3, 4]]
B = [[2, 4], [6, 8]]
C = [[1, 2], [3, 4], [5, 6]]

print(F1(A, B))
print(F2(C, B))
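# The same arithmetic checked with plain numpy (this check is an addition, not
# part of the original example): F1 is elementwise multiplication, F2 is a
# matrix product.
import numpy as np
A = np.array([[1, 2], [3, 4]])
B = np.array([[2, 4], [6, 8]])
C = np.array([[1, 2], [3, 4], [5, 6]])
print(A * B)     # [[ 2  8] [18 32]]
print(C.dot(B))  # [[14 20] [30 44] [46 68]]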
# declare the x,y
x = T.dmatrix("x")
y = T.dvector("y")
learning_rate = T.dscalar("lr")

# declare the weight w and b
w = theano.shared(value=numpy.random.rand(feat), name="w")
b = theano.shared(value=0., name="b")
print("initialized weights \n")
print(w.get_value())
print(b.get_value())

# build the graph
output = 1/(1+T.exp(-T.dot(x, w)-b))
prediction = output > 0.5
cross_entropy = -y * T.log(output) - (1-y)*T.log(1-output)
loss = cross_entropy.mean() + 0.01*(w**2).sum()
gradW, gradb = T.grad(loss, [w, b])

# train function
train = theano.function(inputs=[x, y, learning_rate],
                        outputs=[prediction, cross_entropy, loss, learning_rate],
                        updates=((w, w-learning_rate*gradW), (b, b-learning_rate*gradb)))
# predict function
predict = theano.function(inputs=[x], outputs=prediction)

for i in range(training_step):
    if (i < 1000):
        learning_rate = 0.1
    else:
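# The training loop above is cut off in the original. A plausible completion is
# sketched below purely as an illustration (the decayed 0.01 rate, the dataset
# D = (inputs, labels) and the call signature are assumptions): the learning
# rate is the third input of `train`, so the value chosen per iteration is
# simply passed in on each call.
for i in range(training_step):
    lr_value = 0.1 if i < 1000 else 0.01
    pred, xent, current_loss, _ = train(D[0], D[1], lr_value)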
def mlp_pred(non_linearity):
    Z = [T.dot(X, W) for W in model.W1]
    H = map(non_linearity, Z)
    Z = [T.dot(h, W) for h, W in safe_izip(H, model.W2)]
    pred = sum(Z)
    return pred
def setup_outputs(self, input):
    lin_output = T.dot(input, self.W) + self.b
    self.output = (lin_output if self.activation is None
                   else self.activation(lin_output))
def __init__(self, input, n_in, n_out, W=None, b=None, prob_constraint_on=None): """ Initialize the parameters of the logistic regression :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie :type prob_constraint_on: boolean :param prob_constraint_on: whether we use the probability constraints or not """ # initialize weight matrix W if W is None: self.W = theano.shared(value=np.zeros((n_in, n_out), dtype=theano.config.floatX), name='W') else: self.W = W # initialize bias b if b is None: self.b = theano.shared(value=np.zeros((n_out, ), dtype=theano.config.floatX), name='b') else: self.b = b # compute prediction # the linear output lin_output = T.dot(input, self.W) + self.b if prob_constraint_on == None: #### we do not use those probability constraints self.y_pred = Sigmoid(lin_output) elif prob_constraint_on == "top": #### We first predict the probability of each class using softmax. # We then weight those probabilities by multiplying them by the # probability of their parent in the Galaxy Zoo Decision Tree. # class 1 prob_Class1 = SoftMax(lin_output[:, 0:3]) # class 2 prob_Class2 = SoftMax(lin_output[:, 3:5]) # weight these probabilities using the probability of class 1.2 prob_Class2 *= T.shape_padright(prob_Class1[:, 1]) # class 3 prob_Class3 = SoftMax(lin_output[:, 5:7]) # weight these probabilities using the probability of class 2.2 prob_Class3 *= T.shape_padright(prob_Class2[:, 1]) # class 4 prob_Class4 = SoftMax(lin_output[:, 7:9]) # weight these probabilities using the probability of class 2.2 prob_Class4 *= T.shape_padright(prob_Class2[:, 1]) # class 5 prob_Class5 = SoftMax(lin_output[:, 9:13]) # weight these probabilities using the probability of class 2.2 prob_Class5 *= T.shape_padright(prob_Class2[:, 1]) # class 6 prob_Class6 = SoftMax(lin_output[:, 13:15]) # class 7 prob_Class7 = SoftMax(lin_output[:, 15:18]) # weight these probabilities using the probability of class 1.1 prob_Class7 *= T.shape_padright(prob_Class1[:, 0]) # class 8 prob_Class8 = SoftMax(lin_output[:, 18:25]) # weight these probabilities using the probability of class 6.1 prob_Class8 *= T.shape_padright(prob_Class6[:, 0]) # class 9 prob_Class9 = SoftMax(lin_output[:, 25:28]) # weight these probabilities using the probability of class 2.1 prob_Class9 *= T.shape_padright(prob_Class2[:, 0]) # class 10 prob_Class10 = SoftMax(lin_output[:, 28:31]) # weight these probabilities using the probability of class 4.1 prob_Class10 *= T.shape_padright(prob_Class4[:, 0]) # class 11 prob_Class11 = SoftMax(lin_output[:, 31:37]) # weight these probabilities using the probability of class 4.1 prob_Class11 *= T.shape_padright(prob_Class4[:, 0]) # concatenate all the probabilities into a single tensor variable self.y_pred = T.concatenate([ prob_Class1, prob_Class2, prob_Class3, prob_Class4, prob_Class5, prob_Class6, prob_Class7, prob_Class8, prob_Class9, prob_Class10, prob_Class11 ], axis=1) elif prob_constraint_on == "down": #### we use those probability constraints # the following probabilities should sum up to 1, so we use SoftMax # to predict all of them ind1 = [2, 8, 15, 16, 17, 25, 26, 27, 31, 32, 33, 34, 35, 36] p1 = SoftMax(lin_output[:, ind1]) prob_Class1_3 = p1[:, 0] prob_Class4_2 = p1[:, 1] prob_Class7 = p1[:, 2:5] prob_Class9 = p1[:, 
5:8] prob_Class11 = p1[:, 8:14] prob_Class4_1 = T.sum(prob_Class11, axis=1) prob_Class2_1 = T.sum(prob_Class9, axis=1) prob_Class2_2 = prob_Class4_1 + prob_Class4_2 prob_Class1_1 = T.sum(prob_Class7, axis=1) prob_Class1_2 = prob_Class2_1 + prob_Class2_2 prob_Class1 = T.concatenate([ T.shape_padright(prob_Class1_1), T.shape_padright(prob_Class1_2), T.shape_padright(prob_Class1_3) ], axis=1) prob_Class2 = T.concatenate([ T.shape_padright(prob_Class2_1), T.shape_padright(prob_Class2_2) ], axis=1) prob_Class4 = T.concatenate([ T.shape_padright(prob_Class4_1), T.shape_padright(prob_Class4_2) ], axis=1) # the following probabilities should sum up to 1, so we use SoftMax # to predict all of them ind2 = [14, 18, 19, 20, 21, 24, 23, 24] p2 = SoftMax(lin_output[:, ind2]) prob_Class6_2 = p2[:, 0] prob_Class8 = p2[:, 1:8] prob_Class6_1 = T.sum(prob_Class8, axis=1) prob_Class6 = T.concatenate([ T.shape_padright(prob_Class6_1), T.shape_padright(prob_Class6_2) ], axis=1) # for the following probabilities, we resort to the same strategy in # the "top" option # class 3 prob_Class3 = SoftMax(lin_output[:, 5:7]) # weight these probabilities using the probability of class 2.2 prob_Class3 *= T.shape_padright(prob_Class2[:, 1]) # class 5 prob_Class5 = SoftMax(lin_output[:, 9:13]) # weight these probabilities using the probability of class 2.2 prob_Class5 *= T.shape_padright(prob_Class2[:, 1]) # class 10 prob_Class10 = SoftMax(lin_output[:, 28:31]) # weight these probabilities using the probability of class 4.1 prob_Class10 *= T.shape_padright(prob_Class4[:, 0]) # concatenate all the probabilities into a single tensor variable self.y_pred = T.concatenate([ prob_Class1, prob_Class2, prob_Class3, prob_Class4, prob_Class5, prob_Class6, prob_Class7, prob_Class8, prob_Class9, prob_Class10, prob_Class11 ], axis=1) # parameters of the model self.params = [self.W, self.b]
def recurrence(_x, i_m1, i_m2):
    ati = T.dot(_x, Ws[0])
    _m1 = T.maximum(i_m1, ati)
    ati = i_m1 + T.dot(_x, Ws[1])
    _m2 = T.maximum(i_m2, ati)
    return [_m1, _m2]
def general_unitary_RNN(n_input, n_hidden, n_output, input_type='real', out_every_t=False, loss_function='CE'): # STEPH: hey, it's mine! copying proclivity towards boilerplate from rest # of code: this is derived from complex_RNN! np.random.seed(1234) rng = np.random.RandomState(1234) # TODO: all from here (requires some engineering thoughts) # TODO TODO TODO # Initialize parameters: theta, V_re, V_im, hidden_bias, U, out_bias, h_0 V = initialize_matrix(n_input, 2 * n_hidden, 'V', rng) U = initialize_matrix(2 * n_hidden, n_output, 'U', rng) # STEPH: U was previously known as out_mat hidden_bias = theano.shared(np.asarray(rng.uniform(low=-0.01, high=0.01, size=(n_hidden, )), dtype=theano.config.floatX), name='hidden_bias') # STEPH: hidden bias is simply initialised differently in this case reflection = initialize_matrix(2, 2 * n_hidden, 'reflection', rng) # STEPH: part of recurrence (~W) out_bias = theano.shared(np.zeros((n_output, ), dtype=theano.config.floatX), name='out_bias') theta = theano.shared(np.asarray(rng.uniform(low=-np.pi, high=np.pi, size=(3, n_hidden)), dtype=theano.config.floatX), name='theta') # STEPH: theta is used in recurrence several times (~W) bucket = np.sqrt(3. / 2 / n_hidden) h_0 = theano.shared(np.asarray(rng.uniform(low=-bucket, high=bucket, size=(1, 2 * n_hidden)), dtype=theano.config.floatX), name='h_0') # STEPH: special way of initialising hidden state parameters = [V, U, hidden_bias, reflection, out_bias, theta, h_0] x, y = initialize_data_nodes(loss_function, input_type, out_every_t) index_permute = np.random.permutation(n_hidden) # STEPH: permutation used in recurrence (~W) index_permute_long = np.concatenate( (index_permute, index_permute + n_hidden)) # STEPH: do the same permutation to both real and imaginary parts swap_re_im = np.concatenate((np.arange(n_hidden, 2 * n_hidden), np.arange(n_hidden))) # STEPH: this is a permutation which swaps imaginary and real indices # define the recurrence used by theano.scan def recurrence(x_t, y_t, h_prev, cost_prev, acc_prev, theta, V, hidden_bias, out_bias, U): # Compute hidden linear transform # STEPH: specific set of transformations, sliiightly not that important step1 = times_diag(h_prev, n_hidden, theta[0, :], swap_re_im) step2 = do_fft(step1, n_hidden) step3 = times_reflection(step2, n_hidden, reflection[0, :]) step4 = vec_permutation(step3, index_permute_long) step5 = times_diag(step4, n_hidden, theta[1, :], swap_re_im) step6 = do_ifft(step5, n_hidden) step7 = times_reflection(step6, n_hidden, reflection[1, :]) step8 = times_diag(step7, n_hidden, theta[2, :], swap_re_im) hidden_lin_output = step8 # STEPH: hidden_lin_output isn't complex enough to have its own name # in the other models # Compute data linear transform if loss_function == 'CE': data_lin_output = V[T.cast(x_t, 'int32')] else: data_lin_output = T.dot(x_t, V) # Total linear output lin_output = hidden_lin_output + data_lin_output # Apply non-linearity ---------------------------- # scale RELU nonlinearity modulus = T.sqrt(lin_output**2 + lin_output[:, swap_re_im]**2) # STEPH: I think this comes to twice the modulus... # TODO: check that rescale = T.maximum( modulus + T.tile(hidden_bias, [2]).dimshuffle('x', 0), 0.) 
/ (modulus + 1e-5) h_t = lin_output * rescale if out_every_t: lin_output = T.dot(h_t, U) + out_bias.dimshuffle('x', 0) cost_t, acc_t = compute_cost_t(lin_output, loss_function, y_t) else: cost_t = theano.shared(np.float32(0.0)) acc_t = theano.shared(np.float32(0.0)) return h_t, cost_t, acc_t # compute hidden states # STEPH: the same as in tanhRNN, here (except U ~ out_mat) h_0_batch = T.tile(h_0, [x.shape[1], 1]) non_sequences = [theta, V, hidden_bias, out_bias, U] if out_every_t: sequences = [x, y] else: sequences = [ x, T.tile(theano.shared(np.zeros((1, 1), dtype=theano.config.floatX)), [x.shape[0], 1, 1]) ] outputs_info = [ h_0_batch, theano.shared(np.float32(0.0)), theano.shared(np.float32(0.0)) ] [hidden_states, cost_steps, acc_steps], updates = theano.scan(fn=recurrence, sequences=sequences, non_sequences=non_sequences, outputs_info=outputs_info) if not out_every_t: lin_output = T.dot(hidden_states[-1, :, :], U) + out_bias.dimshuffle( 'x', 0) costs = compute_cost_t(lin_output, loss_function, y) else: cost = cost_steps.mean() accuracy = acc_steps.mean() costs = [cost, accuracy] return [x, y], parameters, costs
def __init__(self, rng, input1, input2, n_in, n_out, W=None, b=None, activation=T.tanh): """ Typical hidden layer of a MLP: units are fully-connected and have sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) and the bias vector b is of shape (n_out,). NOTE : The nonlinearity used here is tanh Hidden unit activation is given by: tanh(dot(input,W) + b) :type rng: numpy.random.RandomState :param rng: a random number generator used to initialize weights :type input1: theano.tensor.dmatrix :param input1: a symbolic tensor of shape ( n_in) :type input2: theano.tensor.dmatrix :param input2: a symbolic tensor of shape ( n_in) :type n_in: int :param n_in: dimensionality of input :type n_out: int :param n_out: number of hidden units :type activation: theano.Op or function :param activation: Non linearity to be applied in the hidden layer """ self.input1 = input1 self.input2 = input2 # `W` is initialized with `W_values` which is uniformely sampled # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden)) # for tanh activation function # the output of uniform if converted using asarray to dtype # theano.config.floatX so that the code is runable on GPU # Note : optimal initialization of weights is dependent on the # activation function used (among other things). # For example, results presented in [Xavier10] suggest that you # should use 4 times larger initial weights for sigmoid # compared to tanh # We have no info for other function, so we use the same as # tanh. if W is None: W_values = np.asarray(rng.uniform( low=-np.sqrt(6. / (n_in + n_out)), high=np.sqrt(6. / (n_in + n_out)), size=(n_in, n_out)), dtype=theano.config.floatX) if activation == theano.tensor.nnet.sigmoid: W_values *= 4 W = theano.shared(value=W_values, name='W', borrow=True) if b is None: b_values = np.zeros((n_out,), dtype=theano.config.floatX) b = theano.shared(value=b_values, name='b', borrow=True) self.W = W self.b = b lin_output1 = T.dot(input1, self.W) + self.b self.output1 = (lin_output1 if activation is None else activation(lin_output1)) lin_output2 = T.dot(input2, self.W) + self.b self.output2 = (lin_output2 if activation is None else activation(lin_output2)) # parameters of the model self.params = [self.W, self.b]
def times_reflection(input, n_hidden, reflection): # comments here are Steph working through the maths # OK so the equation they give is: # (I - 2 outer(v, v*)/|v|**2) h # (v is the reflection, h is the input) # this gets us to: (using Einstein notation) # h_i - (2/|v|**2) v_i v*_j h_j # Looking at the final few lines of this function, what we would like to # show is: (since :n_hidden is the imaginary part of the output tensor) # re(v_i v*_j h_j) = d - c # im(v_i v*_j h_j) = a + b # # v = mu + i nu # h = alpha + i beta # v_i v*_j h_j = (mu_i + i nu_i) (mu_j - i nu_j) (alpha_j + i beta_j) # = (mu_i mu_j - i mu_i nu_j + i nu_i mu_j + nu_i nu_j) (alpha_j + i beta_j) # = (mu_i mu_j alpha_j + i mu_i mu_j beta_j + # -i mu_i nu_j alpha_j + mu_i nu_j beta_j + # i nu_i mu_j alpha_j - nu_i mu_j beta_j + # nu_i nu_j alpha_j + i nu_i nu_j beta_j) = K # # What an expression! # Let's split it up: # re(K) = (mu_i mu_j alpha_j + mu_i nu_j beta_j + # -nu_i mu_j beta_j + nu_i nu_j alpha_j) # im(K) = (mu_i mu_j beta_j - mu_i nu_j alpha_j + # + nu_i mu_j alpha_j + nu_i nu_j beta_j) # # Now let's replace the scalar parts (the repeated js...) # αμ = alpha_j mu_j # αν = alpha_j nu_j # βμ = beta_j mu_j # βν = beta_j nu_j # # re(K) = (mu_i αμ + mu_i βν - nu_i βμ + nu_i αν ) # im(K) = (mu_i βμ - mu_i αν + nu_i αμ + nu_i βν ) # # Simplifying further... # # re(K) = mu_i ( αμ + βν ) - nu_i ( βμ - αν ) = nope - nope # im(K) = mu_i ( βμ - αν ) + nu_i ( αμ + βν ) = nope + nope # # Jumping ahead (see below) to the definitions of a, b, c, d... # # a = mu_i ( αμ - βν ) # b = nu_i ( αν + βμ ) # c = nu_i ( αμ - βν ) # d = mu_i ( αν + βμ ) # # And so: # d - c = mu_i ( αν + βμ ) - nu_i ( αμ - βν ) # a + b = mu_i ( αμ - βν ) + nu_i ( αν + βμ ) # # ... huh, what is going on? # ... double-checking my maths! # ... double-checking their maths! # ... looks OK? # ... will need to TRIPLE-check my maths when it's not 1am. # # Possibility: when they used a * in the paper, they meant *transpose* # and not *conjugate transpose*... # # This would result in... # # v_i v_j h_j = (mu_i + i nu_i) (mu_j + i nu_j) (alpha_j + i beta_j) # = (mu_i mu_j + i mu_i nu_j + i nu_i mu_j - nu_i nu_j) (alpha_j + i beta_j) # = (mu_i mu_j alpha_j + i mu_i mu_j beta_j + # + i mu_i nu_j alpha_j - mu_i nu_j beta_j + # i nu_i mu_j alpha_j - nu_i mu_j beta_j + # - nu_i nu_j alpha_j - i nu_i nu_j beta_j) = J # # re(J) = (mu_i mu_j alpha_j - mu_i nu_j beta_j + # - nu_i mu_j beta_j - nu_i nu_j alpha_j) # im(J) = (mu_i mu_j beta_j + mu_i nu_j alpha_j + # nu_i mu_j alpha_j - nu_i nu_j beta_j) # # Replacing scalar parts... # re(J) = mu_i αμ - mu_i βν - nu_i βμ - nu_i αν # im(J) = mu_i βμ + mu_i αν + nu_i αμ - nu_i βν # # Further simplifying... # # re(J) = mu_i ( αμ - βν ) - nu_i ( βμ + αν ) = a - b # im(J) = mu_i ( βμ + αν ) + nu_i ( αμ - βν ) = d + c # # ... closer but NOT THE SAME # WHAT IS GOING ON HERE? input_re = input[:, :n_hidden] # alpha input_im = input[:, n_hidden:] # beta reflect_re = reflection[:n_hidden] # mu reflect_im = reflection[n_hidden:] # nu vstarv = (reflection**2).sum() # (the following things are roughly scalars) # (they actually are as long as the batch size, e.g. 
input[0]) input_re_reflect_re = T.dot(input_re, reflect_re) # αμ input_re_reflect_im = T.dot(input_re, reflect_im) # αν input_im_reflect_re = T.dot(input_im, reflect_re) # βμ input_im_reflect_im = T.dot(input_im, reflect_im) # βν # a = T.outer(input_re_reflect_re - input_im_reflect_im, reflect_re) # outer(αμ - βν, mu) b = T.outer(input_re_reflect_im + input_im_reflect_re, reflect_im) # outer(αν + βμ, nu) c = T.outer(input_re_reflect_re - input_im_reflect_im, reflect_im) # outer(αμ - βν, nu) d = T.outer(input_re_reflect_im + input_im_reflect_re, reflect_re) # outer(αν + βμ, mu) output = input output = T.inc_subtensor(output[:, :n_hidden], -2. / vstarv * (a + b)) output = T.inc_subtensor(output[:, n_hidden:], -2. / vstarv * (d - c)) return output
def __init__(self, rng, input, n_in, n_out, activation=Tanh, use_bias=True, W=None, b=None): """ Typical hidden layer of a MLP: units are fully-connected and have sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) and the bias vector b is of shape (n_out,). NOTE : The nonlinearity used here is tanh Hidden unit activation is given by: tanh(dot(input,W) + b) :type rng: np.random.RandomState :param rng: a random number generator used to initialize weights :type input: theano.tensor.dmatrix :param input: a symbolic tensor of shape (n_examples, n_in) :type n_in: int :param n_in: dimensionality of input :type n_out: int :param n_out: number of hidden units :type activation: theano.Op or function :param activation: Non linearity to be applied in the hidden layer """ self.input = input self.activation = activation if W is None: W_values = np.asarray(0.01 * rng.standard_normal(size=(n_in, n_out)), dtype=theano.config.floatX) self.W = theano.shared(value=W_values, name='W', borrow=True) else: self.W = W if b is None: if activation == ReLU: # for ReLU, we initialize bias as constant 1 as suggested in # the dropout and ImageNet paper b_values = np.ones((n_out, ), dtype=theano.config.floatX) else: b_values = np.zeros((n_out, ), dtype=theano.config.floatX) self.b = theano.shared(value=b_values, name='b', borrow=True) else: self.b = b if use_bias: lin_output = T.dot(input, self.W) + self.b else: lin_output = T.dot(input, self.W) self.output = (lin_output if activation is None else activation(lin_output)) # parameters of the model if use_bias: self.params = [self.W, self.b] else: self.params = [self.W]
def tanhRNN(n_input, n_hidden, n_output, input_type='real', out_every_t=False, loss_function='CE'): np.random.seed(1234) rng = np.random.RandomState(1234) # STEPH: initialising np's generic RNG and a specific rng identically # uncertain why but maybe we'll find out soon x, y = initialize_data_nodes(loss_function, input_type, out_every_t) inputs = [x, y] h_0 = theano.shared(np.zeros((1, n_hidden), dtype=theano.config.floatX)) V = initialize_matrix(n_input, n_hidden, 'V', rng) W = initialize_matrix(n_hidden, n_hidden, 'W', rng) # STEPH: W is the weights of the recurrence (can tell cause of its shape!) out_mat = initialize_matrix(n_hidden, n_output, 'out_mat', rng) hidden_bias = theano.shared( np.zeros((n_hidden, ), dtype=theano.config.floatX)) out_bias = theano.shared(np.zeros((n_output, ), dtype=theano.config.floatX)) parameters = [h_0, V, W, out_mat, hidden_bias, out_bias] def recurrence(x_t, y_t, h_prev, cost_prev, acc_prev, V, W, hidden_bias, out_mat, out_bias): # all of this is to get the hidden state, and possibly cost/accuracy if loss_function == 'CE': data_lin_output = V[x_t] # STEPH: uncertain why this is named thusly # STEPH: in CE case, the data is just an index, I guess... # basically, an indicator vector # I think this may be confounded with the experimental setup # CE appears in ? else: data_lin_output = T.dot(x_t, V) # STEPH: 'as normal', folding the data from the sequence in h_t = T.tanh( T.dot(h_prev, W) + data_lin_output + hidden_bias.dimshuffle('x', 0)) # STEPH: dimshuffle (theano) here, makes row out of 1d vector, N -> 1xN if out_every_t: lin_output = T.dot(h_t, out_mat) + out_bias.dimshuffle('x', 0) cost_t, acc_t = compute_cost_t(lin_output, loss_function, y_t) else: # STEPH: no cost/accuracy until the end! cost_t = theano.shared(np.float32(0.0)) acc_t = theano.shared(np.float32(0.0)) return h_t, cost_t, acc_t non_sequences = [V, W, hidden_bias, out_mat, out_bias] # STEPH: naming due to scan (theano); these are 'fixed' values in scan h_0_batch = T.tile(h_0, [x.shape[1], 1]) # STEPH: tile (theano) repeats input x according to pattern # pattern is number of times to tile in each direction if out_every_t: sequences = [x, y] else: # STEPH: the 'y' here is just... a bunch of weirdly-shaped zeros? sequences = [ x, T.tile(theano.shared(np.zeros((1, 1), dtype=theano.config.floatX)), [x.shape[0], 1, 1]) ] # STEPH: sequences here are the input we loop over... outputs_info = [ h_0_batch, theano.shared(np.float32(0.0)), theano.shared(np.float32(0.0)) ] # STEPH: naming due to scan, these are initialisation values... see return # value of recurrence: h_t, cost_t, acc_t... [hidden_states, cost_steps, acc_steps], updates = theano.scan(fn=recurrence, sequences=sequences, non_sequences=non_sequences, outputs_info=outputs_info) # STEPH: remembering how to do scan! # outputs_info: contains initialisation, naming is bizarre, whatever # non_sequences: unchanging variables # sequences: tensors to be looped over # so fn receives (sequences, previous output, non_sequences): # this seems to square with the order of arguments in 'recurrence' # TODO: read scan more carefully to confirm this if not out_every_t: lin_output = T.dot(hidden_states[-1, :, :], out_mat) + out_bias.dimshuffle('x', 0) costs = compute_cost_t(lin_output, loss_function, y) # STEPH: cost is computed off the final hidden state else: cost = cost_steps.mean() accuracy = acc_steps.mean() costs = [cost, accuracy] return inputs, parameters, costs
def orthogonal_RNN(n_input, n_hidden, n_output, input_type='real', out_every_t=False, loss_function='CE', basis=None): np.random.seed(1234) rng = np.random.RandomState(1234) x, y = initialize_data_nodes(loss_function, input_type, out_every_t) inputs = [x, y] # ---- encoder ---- # V = initialize_matrix(n_input, n_hidden, 'V', rng) # ---- decoder ---- # U = initialize_matrix(n_hidden, n_output, 'U', rng) out_bias = theano.shared(np.zeros((n_output, ), dtype=theano.config.floatX), name='out_bias') # ---- hidden part ---- # dim_of_lie_algebra = n_hidden * (n_hidden - 1) / 2 lambdas = theano.shared(np.asarray(rng.uniform( low=-1, high=1, size=(dim_of_lie_algebra, )), dtype=theano.config.floatX), name='lambdas') # warning: symbolic_basis is expensive, memory-wise! if basis is None: symbolic_basis = theano.shared(np.asarray( rng.normal(size=(dim_of_lie_algebra, n_hidden, n_hidden)), dtype=theano.config.floatX), name='symbolic_basis') else: symbolic_basis = theano.shared(basis, name='symbolic_basis') # here it is! #O = T.expm(T.dot(lambdas, symbolic_basis)) # YOLO #O = T.tensordot(lambdas, symbolic_basis, axes=[0, 0]) #O = lambdas[0]*symbolic_basis[0] + lambdas[10]*symbolic_basis[10] O = lambdas[dim_of_lie_algebra - 1] * symbolic_basis[0] #lambdas[n_hidden*(n_hidden-1)/2 -1]*symbolic_basis[n_hidden*(n_hidden-1)/2 -1] # RIDICULOUS HACK THEANO IS WEIRD #for k in xrange(1, n_hidden*(n_hidden-1)/2): # O += lambdas[k]*symbolic_basis[k] # pdb.set_trace() #O = T.eye(n_hidden, n_hidden) # END YOLO # TODO: check maths on bucket bucket = np.sqrt(3. / 2 / n_hidden) h_0 = theano.shared(np.asarray(rng.uniform(low=-bucket, high=bucket, size=(1, n_hidden)), dtype=theano.config.floatX), name='h_0') hidden_bias = theano.shared(np.asarray(rng.uniform(low=-0.01, high=0.01, size=(n_hidden, )), dtype=theano.config.floatX), name='hidden_bias') # ---- all the parameters! ---- # parameters = [V, U, out_bias, lambdas, h_0, hidden_bias] def recurrence(x_t, y_t, h_prev, cost_prev, acc_prev, V, O, hidden_bias, out_bias, U): if loss_function == 'CE': # STEPH: why is this cast here??? data_lin_output = V[T.cast(x_t, 'int32')] else: data_lin_output = T.dot(x_t, V) h_t = T.nnet.relu( T.dot(h_prev, O) + data_lin_output + hidden_bias.dimshuffle('x', 0)) if out_every_t: lin_output = T.dot(h_t, U) + out_bias.dimshuffle('x', 0) cost_t, acc_t = compute_cost_t(lin_output, loss_function, y_t) else: cost_t = theano.shared(np.float32(0.0)) acc_t = theano.shared(np.float32(0.0)) return h_t, cost_t, acc_t # compute hidden states h_0_batch = T.tile(h_0, [x.shape[1], 1]) non_sequences = [V, O, hidden_bias, out_bias, U] if out_every_t: sequences = [x, y] else: sequences = [ x, T.tile(theano.shared(np.zeros((1, 1), dtype=theano.config.floatX)), [x.shape[0], 1, 1]) ] outputs_info = [ h_0_batch, theano.shared(np.float32(0.0)), theano.shared(np.float32(0.0)) ] [hidden_states, cost_steps, acc_steps], updates = theano.scan(fn=recurrence, sequences=sequences, non_sequences=non_sequences, outputs_info=outputs_info) if not out_every_t: lin_output = T.dot(hidden_states[-1, :, :], U) + out_bias.dimshuffle( 'x', 0) costs = compute_cost_t(lin_output, loss_function, y) else: cost = cost_steps.mean() accuracy = acc_steps.mean() costs = [cost, accuracy] return inputs, parameters, costs
def __init__(self, numpy_rng, theano_rng=None, input=None, n_visible=784,
             n_hidden=500, W=None, bhid=None, bvis=None, learning_rate=0.1,
             corruption_level=0.3):
    """Initialize the dA class by specifying the number of visible units
    (the dimension d of the input), the number of hidden units (the
    dimension d' of the latent or hidden space) and the corruption level.
    The constructor also receives symbolic variables for the input, weights
    and biases. Such symbolic variables are useful when, for example, the
    input is the result of some computation, or when the weights are shared
    between the dA and an MLP layer. When dealing with SdAs this always
    happens: the dA on layer 2 gets as input the output of the dA on
    layer 1, and the weights of the dA are used in the second stage of
    training to construct an MLP.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator used to generate weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given, one is
                       generated based on a seed drawn from `numpy_rng`

    :type input: theano.tensor.TensorType
    :param input: a symbolic description of the input, or None for a
                  standalone dA

    :type n_visible: int
    :param n_visible: number of visible units

    :type n_hidden: int
    :param n_hidden: number of hidden units

    :type W: theano.tensor.TensorType
    :param W: Theano variable pointing to a set of weights that should be
              shared between the dA and another architecture; if the dA
              should be standalone, set this to None

    :type bhid: theano.tensor.TensorType
    :param bhid: Theano variable pointing to a set of bias values (for the
                 hidden units) that should be shared between the dA and
                 another architecture; if the dA should be standalone, set
                 this to None

    :type bvis: theano.tensor.TensorType
    :param bvis: Theano variable pointing to a set of bias values (for the
                 visible units) that should be shared between the dA and
                 another architecture; if the dA should be standalone, set
                 this to None

    :type corruption_level: float
    :param corruption_level: the amount of input corruption to use; should
                             be between 0 and 1
    """
    self.n_visible = n_visible
    self.n_hidden = n_hidden
    self.learning_rate = learning_rate
    self.corruption_level = corruption_level

    # create a Theano random generator that gives symbolic random values
    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # note: W' was written as `W_prime` and b' as `b_prime`
    if not W:
        # W is initialized with `initial_W`, which is uniformly sampled
        # from -4*sqrt(6./(n_visible+n_hidden)) to
        # 4*sqrt(6./(n_hidden+n_visible)); the output of uniform is
        # converted using asarray to dtype theano.config.floatX so that
        # the code is runnable on GPU
        initial_W = numpy.asarray(numpy_rng.uniform(
            low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
            high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
            size=(n_visible, n_hidden)), dtype=theano.config.floatX)
        W = theano.shared(value=initial_W, name='W', borrow=True)

    if not bvis:
        bvis = theano.shared(value=numpy.zeros(n_visible,
                                               dtype=theano.config.floatX),
                             borrow=True)

    if not bhid:
        bhid = theano.shared(value=numpy.zeros(n_hidden,
                                               dtype=theano.config.floatX),
                             name='b', borrow=True)

    self.W = W
    # b corresponds to the bias of the hidden units
    self.b = bhid
    # b_prime corresponds to the bias of the visible units
    self.b_prime = bvis
    # tied weights, therefore W_prime is W transpose
    self.W_prime = self.W.T
    self.theano_rng = theano_rng

    # if no input is given, generate a variable representing the input
    if input is None:
        # we use a matrix because we expect a minibatch of several
        # examples, each example being a row
        self.x = T.matrix(name='input')
    else:
        self.x = input

    self.params = [self.W, self.b, self.b_prime]
    self.hidden = T.nnet.sigmoid(T.dot(self.x, self.W) + self.b)
    self.reconstructed = T.nnet.sigmoid(T.dot(self.hidden, self.W_prime) +
                                        self.b_prime)
    # cross-entropy reconstruction cost, kept commented out in favour of
    # the squared error below:
    # self.reconstructed_L = -T.sum(self.x * T.log(self.reconstructed) +
    #                               (1 - self.x) * T.log(1 - self.reconstructed),
    #                               axis=1)
    self.reconstructed_L = T.sum((self.x - self.reconstructed) ** 2, axis=1)
    dummy = self.x - self.b_prime
    # energy-like score: softplus of the hidden activations minus half the
    # squared distance of the input from the visible bias
    self.F = T.sum(T.nnet.softplus(self.hidden), axis=1) - \
        0.5 * T.sum(dummy * dummy, axis=1)
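# A minimal usage sketch, not part of the class above: assuming the
# enclosing class is named dA and the usual numpy / theano / T imports of
# this file, the squared reconstruction error can be turned into an SGD
# training step over self.params in the standard way.
da = dA(numpy_rng=numpy.random.RandomState(123), n_visible=784, n_hidden=500)
cost = T.mean(da.reconstructed_L)
grads = T.grad(cost, da.params)
updates = [(p, p - da.learning_rate * g) for p, g in zip(da.params, grads)]
train_da = theano.function([da.x], cost, updates=updates)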
def LSTM(n_input, n_hidden, n_output, input_type='real', out_every_t=False,
         loss_function='CE'):
    np.random.seed(1234)
    rng = np.random.RandomState(1234)

    # STEPH: i for input, f for forget, c for candidate, o for output
    W_i = initialize_matrix(n_input, n_hidden, 'W_i', rng)
    W_f = initialize_matrix(n_input, n_hidden, 'W_f', rng)
    W_c = initialize_matrix(n_input, n_hidden, 'W_c', rng)
    W_o = initialize_matrix(n_input, n_hidden, 'W_o', rng)
    U_i = initialize_matrix(n_hidden, n_hidden, 'U_i', rng)
    U_f = initialize_matrix(n_hidden, n_hidden, 'U_f', rng)
    U_c = initialize_matrix(n_hidden, n_hidden, 'U_c', rng)
    U_o = initialize_matrix(n_hidden, n_hidden, 'U_o', rng)
    # STEPH: note that U is not out_mat as it was in complex_RNN
    V_o = initialize_matrix(n_hidden, n_hidden, 'V_o', rng)
    b_i = theano.shared(np.zeros((n_hidden, ), dtype=theano.config.floatX))
    b_f = theano.shared(np.ones((n_hidden, ), dtype=theano.config.floatX))
    b_c = theano.shared(np.zeros((n_hidden, ), dtype=theano.config.floatX))
    b_o = theano.shared(np.zeros((n_hidden, ), dtype=theano.config.floatX))
    h_0 = theano.shared(np.zeros((1, n_hidden), dtype=theano.config.floatX))
    state_0 = theano.shared(np.zeros((1, n_hidden), dtype=theano.config.floatX))
    out_mat = initialize_matrix(n_hidden, n_output, 'out_mat', rng)
    out_bias = theano.shared(np.zeros((n_output, ), dtype=theano.config.floatX))
    parameters = [W_i, W_f, W_c, W_o, U_i, U_f, U_c, U_o, V_o,
                  b_i, b_f, b_c, b_o, h_0, state_0, out_mat, out_bias]

    x, y = initialize_data_nodes(loss_function, input_type, out_every_t)

    def recurrence(x_t, y_t, h_prev, state_prev, cost_prev, acc_prev,
                   W_i, W_f, W_c, W_o, U_i, U_f, U_c, U_o, V_o,
                   b_i, b_f, b_c, b_o, out_mat, out_bias):
        if loss_function == 'CE':
            x_t_W_i = W_i[x_t]
            x_t_W_c = W_c[x_t]
            x_t_W_f = W_f[x_t]
            x_t_W_o = W_o[x_t]
        else:
            x_t_W_i = T.dot(x_t, W_i)
            x_t_W_c = T.dot(x_t, W_c)
            x_t_W_f = T.dot(x_t, W_f)
            x_t_W_o = T.dot(x_t, W_o)
        input_t = T.nnet.sigmoid(x_t_W_i + T.dot(h_prev, U_i) +
                                 b_i.dimshuffle('x', 0))
        # STEPH: save candidate?
        candidate_t = T.tanh(x_t_W_c + T.dot(h_prev, U_c) +
                             b_c.dimshuffle('x', 0))
        forget_t = T.nnet.sigmoid(x_t_W_f + T.dot(h_prev, U_f) +
                                  b_f.dimshuffle('x', 0))
        # STEPH: forget previous state?
        state_t = input_t * candidate_t + forget_t * state_prev
        # STEPH: so we can both save the input and not forget the previous, OK
        output_t = T.nnet.sigmoid(x_t_W_o + T.dot(h_prev, U_o) +
                                  T.dot(state_t, V_o) +
                                  b_o.dimshuffle('x', 0))
        # TODO: (STEPH) double-check maths, here!
        h_t = output_t * T.tanh(state_t)
        # STEPH: same as other models...
        if out_every_t:
            lin_output = T.dot(h_t, out_mat) + out_bias.dimshuffle('x', 0)
            cost_t, acc_t = compute_cost_t(lin_output, loss_function, y_t)
        else:
            cost_t = theano.shared(np.float32(0.0))
            acc_t = theano.shared(np.float32(0.0))
        return h_t, state_t, cost_t, acc_t

    non_sequences = [W_i, W_f, W_c, W_o, U_i, U_f, U_c, U_o, V_o,
                     b_i, b_f, b_c, b_o, out_mat, out_bias]
    # STEPH: same as tanhRNN, etc... the scan part is generally duplicated!
    h_0_batch = T.tile(h_0, [x.shape[1], 1])
    state_0_batch = T.tile(state_0, [x.shape[1], 1])
    if out_every_t:
        sequences = [x, y]
    else:
        sequences = [x,
                     T.tile(theano.shared(np.zeros((1, 1),
                                                   dtype=theano.config.floatX)),
                            [x.shape[0], 1, 1])]
    outputs_info = [h_0_batch, state_0_batch,
                    theano.shared(np.float32(0.0)),
                    theano.shared(np.float32(0.0))]
    [hidden_states, states, cost_steps, acc_steps], updates = theano.scan(
        fn=recurrence,
        sequences=sequences,
        non_sequences=non_sequences,
        outputs_info=outputs_info)

    if not out_every_t:
        lin_output = T.dot(hidden_states[-1, :, :], out_mat) + \
            out_bias.dimshuffle('x', 0)
        costs = compute_cost_t(lin_output, loss_function, y)
    else:
        cost = cost_steps.mean()
        accuracy = acc_steps.mean()
        costs = [cost, accuracy]
    return [x, y], parameters, costs
def get_hidden_values(self, input):
    """Computes the values of the hidden layer."""
    return T.nnet.sigmoid(T.dot(input, self.W) + self.b)
def IRNN(n_input, n_hidden, n_output, input_type='real', out_every_t=False,
         loss_function='CE'):
    # STEPH: this differs from tanhRNN in two places, see below
    np.random.seed(1234)
    rng = np.random.RandomState(1234)

    x, y = initialize_data_nodes(loss_function, input_type, out_every_t)
    inputs = [x, y]

    h_0 = theano.shared(np.zeros((1, n_hidden), dtype=theano.config.floatX))
    V = initialize_matrix(n_input, n_hidden, 'V', rng)
    W = theano.shared(np.identity(n_hidden, dtype=theano.config.floatX))
    # STEPH: W differs from that of tanhRNN: this is just the identity!
    out_mat = initialize_matrix(n_hidden, n_output, 'out_mat', rng)
    hidden_bias = theano.shared(np.zeros((n_hidden, ),
                                         dtype=theano.config.floatX))
    out_bias = theano.shared(np.zeros((n_output, ), dtype=theano.config.floatX))
    parameters = [h_0, V, W, out_mat, hidden_bias, out_bias]

    def recurrence(x_t, y_t, h_prev, cost_prev, acc_prev,
                   V, W, hidden_bias, out_mat, out_bias):
        if loss_function == 'CE':
            data_lin_output = V[x_t]
        else:
            data_lin_output = T.dot(x_t, V)
        h_t = T.nnet.relu(T.dot(h_prev, W) + data_lin_output +
                          hidden_bias.dimshuffle('x', 0))
        # STEPH: differs from tanhRNN: here we have relu, there they had tanh
        if out_every_t:
            lin_output = T.dot(h_t, out_mat) + out_bias.dimshuffle('x', 0)
            cost_t, acc_t = compute_cost_t(lin_output, loss_function, y_t)
        else:
            cost_t = theano.shared(np.float32(0.0))
            acc_t = theano.shared(np.float32(0.0))
        return h_t, cost_t, acc_t

    non_sequences = [V, W, hidden_bias, out_mat, out_bias]
    h_0_batch = T.tile(h_0, [x.shape[1], 1])
    if out_every_t:
        sequences = [x, y]
    else:
        sequences = [x,
                     T.tile(theano.shared(np.zeros((1, 1),
                                                   dtype=theano.config.floatX)),
                            [x.shape[0], 1, 1])]
    outputs_info = [h_0_batch,
                    theano.shared(np.float32(0.0)),
                    theano.shared(np.float32(0.0))]
    [hidden_states, cost_steps, acc_steps], updates = theano.scan(
        fn=recurrence,
        sequences=sequences,
        non_sequences=non_sequences,
        outputs_info=outputs_info)

    if not out_every_t:
        lin_output = T.dot(hidden_states[-1, :, :], out_mat) + \
            out_bias.dimshuffle('x', 0)
        costs = compute_cost_t(lin_output, loss_function, y)
    else:
        cost = cost_steps.mean()
        accuracy = acc_steps.mean()
        costs = [cost, accuracy]
    return inputs, parameters, costs
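# A hedged sketch of how the (inputs, parameters, costs) triples returned
# by IRNN, LSTM and orthogonal_RNN above are typically consumed: take the
# gradient of the scalar cost (the first element of costs, under the
# conventions used in these constructors), build SGD updates over the
# parameter list, and compile a Theano function over the symbolic inputs.
# The constructor arguments below are placeholders, not values from the
# original experiments.
inputs, parameters, costs = IRNN(n_input=3, n_hidden=100, n_output=1,
                                 input_type='real', out_every_t=False,
                                 loss_function='MSE')
learning_rate = np.float32(0.001)
gradients = T.grad(costs[0], parameters)
updates = [(p, p - learning_rate * g) for p, g in zip(parameters, gradients)]
train_step = theano.function(inputs, costs, updates=updates)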
def build_model(alpha, beta, tparams, options):
    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    x_zheng = tensor.matrix('x_zheng', dtype='int32')
    x_zheng_mask = tensor.matrix('x_zheng_mask', dtype=config.floatX)
    x_ni = tensor.matrix('x_ni', dtype='int32')
    x_ni_mask = tensor.matrix('x_ni_mask', dtype=config.floatX)
    y = tensor.vector('y', dtype='int32')

    n_timesteps = x_zheng.shape[0]
    n_samples = x_zheng.shape[1]

    emb_zheng = tparams['Wemb'][x_zheng.flatten()].reshape(
        [n_timesteps, n_samples, options['dim_proj']])
    proj1 = get_layer(options['encoder'])[1](tparams, emb_zheng, options,
                                             prefix='lstm_zheng',
                                             mask=x_zheng_mask)
    if options['encoder'] == 'lstm':
        # mean pooling over time, ignoring the padded positions
        proj_zheng = (proj1 * x_zheng_mask[:, :, None]).sum(axis=0)
        proj_zheng = proj_zheng / x_zheng_mask.sum(axis=0)[:, None]

    emb_ni = tparams['Wemb'][x_ni.flatten()].reshape(
        [n_timesteps, n_samples, options['dim_proj']])
    proj2 = get_layer(options['encoder'])[1](tparams, emb_ni, options,
                                             prefix='lstm_ni',
                                             mask=x_ni_mask)
    if options['encoder'] == 'lstm':
        proj_ni = (proj2 * x_ni_mask[:, :, None]).sum(axis=0)
        proj_ni = proj_ni / x_ni_mask.sum(axis=0)[:, None]

    proj = tensor.concatenate((proj_zheng, proj_ni), axis=1)
    if options['use_dropout']:
        proj = dropout_layer(proj, use_noise, trng)

    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])
    # bias moved outside the dot so these match the form of `pred` above
    pred_zheng = tensor.nnet.softmax(
        tensor.dot(proj_zheng, tparams['U_zheng']) + tparams['b'])
    pred_ni = tensor.nnet.softmax(
        tensor.dot(proj_ni, tparams['U_ni']) + tparams['b'])

    f_pred_prob = theano.function([x_zheng, x_zheng_mask, x_ni, x_ni_mask],
                                  pred, name='f_pred_prob')
    f_pred = theano.function([x_zheng, x_zheng_mask, x_ni, x_ni_mask],
                             pred.argmax(axis=1), name='f_pred')
    f_proj = theano.function([x_zheng, x_zheng_mask, x_ni, x_ni_mask],
                             proj, name='f_proj')

    off = 1e-8
    if pred.dtype == 'float16':
        off = 1e-6

    cost1 = -tensor.log(pred[tensor.arange(n_samples), y] + off).mean()
    cost2 = -tensor.log(pred_zheng[tensor.arange(n_samples), y] + off).mean()
    cost3 = -tensor.log(pred_ni[tensor.arange(n_samples), y] + off).mean()
    cost4 = tensor.sum(tensor.square(proj_zheng - proj_ni), axis=1).mean()
    cost = alpha * (cost1 + cost2 + cost3) + beta * cost4

    return (use_noise, x_zheng, x_zheng_mask, x_ni, x_ni_mask, y,
            f_pred_prob, f_pred, cost1, cost2, cost3, cost4, cost, f_proj)
def get_reconstructed_input(self, hidden):
    """Computes the reconstructed input given the values of the hidden
    layer."""
    return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime)
def step(TT_x_t, TT_h_tm1, TT_Wxh, TT_Whh, TT_Why):
    TT_h_t = TT.tanh(TT.dot(TT_x_t, TT_Wxh) + TT.dot(TT_h_tm1, TT_Whh))
    TT_y_t = TT.tanh(TT.dot(TT_h_t, TT_Why))
    return TT_h_t, TT_y_t
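# A small sketch, assuming the TT alias for theano.tensor used above, of how
# `step` would be unrolled over time with theano.scan: x is a
# (time, batch, n_in) tensor, the hidden state is threaded through
# outputs_info, and the weight matrices are passed as non_sequences.
# The sizes and initialisation here are illustrative only.
import numpy
import theano
import theano.tensor as TT

n_in, n_hid, n_out = 3, 50, 2
TT_x = TT.tensor3('x')
TT_h0 = TT.zeros((TT_x.shape[1], n_hid), dtype=theano.config.floatX)
TT_Wxh = theano.shared(0.01 * numpy.random.randn(n_in, n_hid).astype(theano.config.floatX))
TT_Whh = theano.shared(0.01 * numpy.random.randn(n_hid, n_hid).astype(theano.config.floatX))
TT_Why = theano.shared(0.01 * numpy.random.randn(n_hid, n_out).astype(theano.config.floatX))
[TT_h_seq, TT_y_seq], _ = theano.scan(step,
                                      sequences=TT_x,
                                      outputs_info=[TT_h0, None],
                                      non_sequences=[TT_Wxh, TT_Whh, TT_Why])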
def forward(self, input):
    return T.dot(input, self.W) + self.b
def GESD(sum_uni_l, sum_uni_r):
    # Geometric mean of Euclidean distance and Sigmoid Dot product (GESD)
    eucli = 1 / (1 + T.sum((sum_uni_l - sum_uni_r) ** 2))
    kernel = 1 / (1 + T.exp(-(T.dot(sum_uni_l, sum_uni_r.T) + 1)))
    return (eucli * kernel).reshape((1, 1))
corruption_level = 0.1
training_epochs = 25
learning_rate = 0.1
batch_size = 128

W1 = init_weights(28 * 28, 900)
b1 = init_bias(900)
b1_prime = init_bias(28 * 28)
W1_prime = W1.transpose()
W2 = init_weights(900, 10)
b2 = init_bias(10)

# corrupt the input with masking noise, then encode and decode it
tilde_x = theano_rng.binomial(size=x.shape, n=1, p=1 - corruption_level,
                              dtype=theano.config.floatX) * x
y1 = T.nnet.sigmoid(T.dot(tilde_x, W1) + b1)
z1 = T.nnet.sigmoid(T.dot(y1, W1_prime) + b1_prime)
cost1 = -T.mean(T.sum(x * T.log(z1) + (1 - x) * T.log(1 - z1), axis=1))

params1 = [W1, b1, b1_prime]
grads1 = T.grad(cost1, params1)
updates1 = [(param1, param1 - learning_rate * grad1)
            for param1, grad1 in zip(params1, grads1)]
train_da1 = theano.function(inputs=[x], outputs=cost1, updates=updates1,
                            allow_input_downcast=True)

# softmax classifier stacked on the hidden code y1
p_y2 = T.nnet.softmax(T.dot(y1, W2) + b2)
y2 = T.argmax(p_y2, axis=1)
cost2 = T.mean(T.nnet.categorical_crossentropy(p_y2, d))
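# A hedged continuation, not from the original snippet: the second stage
# would typically train the softmax layer (optionally together with W1 and
# b1) on cost2 with the same SGD pattern as train_da1; x and d are assumed
# to be the symbolic input matrix and target vector defined earlier in the
# script.
params2 = [W1, b1, W2, b2]
grads2 = T.grad(cost2, params2)
updates2 = [(param2, param2 - learning_rate * grad2)
            for param2, grad2 in zip(params2, grads2)]
train_softmax = theano.function(inputs=[x, d], outputs=cost2,
                                updates=updates2, allow_input_downcast=True)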
def Sigmoid(sum_uni_l, sum_uni_r):
    # sigmoid (tanh-based) kernel over the dot product of the two vectors
    dot = T.dot(sum_uni_l, sum_uni_r.T)
    return T.tanh(1.0 * dot + 1).reshape((1, 1))
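# A small hedged usage sketch: compile the GESD and Sigmoid similarity
# measures defined above into one function over a pair of symbolic vectors.
# The variable names and dtypes are illustrative only.
import theano
import theano.tensor as T

v_l = T.fvector('v_l')
v_r = T.fvector('v_r')
similarities = theano.function([v_l, v_r],
                               [GESD(v_l, v_r), Sigmoid(v_l, v_r)],
                               allow_input_downcast=True)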
def model(X, w_h, w_o):
    h = T.nnet.sigmoid(T.dot(X, w_h))
    pyx = T.nnet.softmax(T.dot(h, w_o))
    return pyx
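# A minimal usage sketch, assuming MNIST-sized inputs and one-hot targets
# (both assumptions, not from the original code): compile `model` into
# train and predict functions with cross-entropy and plain SGD.
import numpy as np
import theano
import theano.tensor as T

X = T.fmatrix('X')
Y = T.fmatrix('Y')
w_h = theano.shared(np.asarray(0.01 * np.random.randn(784, 625),
                               dtype=theano.config.floatX))
w_o = theano.shared(np.asarray(0.01 * np.random.randn(625, 10),
                               dtype=theano.config.floatX))
py_x = model(X, w_h, w_o)
cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
params = [w_h, w_o]
updates = [(p, p - 0.05 * g) for p, g in zip(params, T.grad(cost, params))]
train = theano.function([X, Y], cost, updates=updates,
                        allow_input_downcast=True)
predict = theano.function([X], T.argmax(py_x, axis=1),
                          allow_input_downcast=True)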