def srr_step_func(ss, zi_zmuv, p_masks, q_masks, si, mi_p, mi_q):
    # transform the current belief state into an observation
    si_as_x = self._from_si_to_x(si)
    full_grad = T.log(1.0 + T.exp(ss)) * (self.x_out - si_as_x)
    # get the masked belief state and gradient for primary policy
    xi_for_p = (mi_p * self.x_out) + ((1.0 - mi_p) * si_as_x)
    grad_for_p = mi_p * full_grad
    # update the guide policy's revelation mask
    new_to_q = (1.0 - mi_q) * q_masks
    mip1_q = mi_q + new_to_q
    # get the masked belief state and gradient for guide policy
    # xi_for_q = (mip1_q * self.x_out) + ((1.0 - mip1_q) * si_as_x)
    xi_for_q = xi_for_p
    grad_for_q = mip1_q * full_grad
    # get samples of next zi, according to the primary policy
    zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(
        T.horizontal_stack(xi_for_p, grad_for_p),
        do_samples=False)
    zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
    # get samples of next zi, according to the guide policy
    zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
        T.horizontal_stack(xi_for_q, grad_for_q),
        do_samples=False)
    zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
    # make zi samples that can be switched between zi_p and zi_q
    zi = (self.train_switch[0] * zi_q) + ((1.0 - self.train_switch[0]) * zi_p)
    # compute relevant KLds for this step
    kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, zi_p_mean, zi_p_logvar)  # KL(q || p)
    kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, zi_q_mean, zi_q_logvar)  # KL(p || q)
    kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0, 0.0)                # KL(p || N(0, I))
    # compute next si, given sampled zi (i.e. update the belief state)
    hydra_out = self.p_sip1_given_zi.apply(zi)
    si_step = hydra_out[0]
    if self.step_type == "jump":
        # jump steps always do a full swap of belief state
        sip1 = si_step
    else:
        # additive steps adjust the belief state like an LSTM
        write_gate = T.nnet.sigmoid(2.0 + hydra_out[1])
        erase_gate = T.nnet.sigmoid(2.0 + hydra_out[2])
        sip1 = (erase_gate * si) + (write_gate * si_step)
    # update the primary policy's revelation mask
    new_to_p = (1.0 - mi_p) * p_masks
    mip1_p = mi_p + new_to_p
    # compute NLL only for the newly revealed values
    nlli = self._construct_nll_costs(sip1, self.x_out, new_to_p)
    # each loop iteration produces the following values:
    #   sip1: belief state at end of current step
    #   mip1_p: revealed values mask to use in next step (primary)
    #   mip1_q: revealed values mask to use in next step (guide)
    #   nlli: NLL for values revealed at end of current step
    #   kldi_q2p: KL(q || p) for the current step
    #   kldi_p2q: KL(p || q) for the current step
    #   kldi_p2g: KL(p || N(0,I)) for the current step
    return sip1, mip1_p, mip1_q, nlli, kldi_q2p, kldi_p2q, kldi_p2g
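# The step functions in this collection call a `gaussian_kld` helper that is not
# shown here. Below is a minimal sketch of such a helper, assuming diagonal
# Gaussians parameterized by per-dimension means and log-variances; this is the
# standard closed-form KL divergence, not necessarily the exact implementation
# used by the snippets above.
import theano.tensor as T

def gaussian_kld(mu_left, logvar_left, mu_right, logvar_right):
    """Elementwise KL( N(mu_left, exp(logvar_left)) || N(mu_right, exp(logvar_right)) )."""
    gauss_klds = 0.5 * (logvar_right - logvar_left +
                        (T.exp(logvar_left) / T.exp(logvar_right)) +
                        ((mu_left - mu_right)**2.0 / T.exp(logvar_right)) - 1.0)
    return gauss_klds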
def __call__(self, v, output_type='h'):
    if hasattr(self, 'layer1_model'):
        h1 = self.l1model.h_given_v(v)
        s1 = self.l1model.s_given_hv(h1, v)
    if hasattr(self, 'layer2_model'):
        # preprocessor for input data
        h2 = self.l2model.h_given_vx(h1, s1)
        s2 = self.l2model.s_given_vxh(h1, s1, h2)
        h3 = self.h_given_vx(h2, s2)
        return T.horizontal_stack(h1, h2, h3)
    else:
        h2 = self.h_given_vx(h1, s1)
        return T.horizontal_stack(h1, h2)
def __call__(self, v, output_type='fg+fh'):
    print 'Building representation with %s' % output_type
    [g, h, s] = self.e_step(v, n_steps=self.pos_mf_steps)
    atoms = {
        'g_s': T.dot(g, self.Wg),  # g in s-space
        'h_s': T.dot(h, self.Wh),  # h in s-space
        's_g': T.sqrt(T.dot(s**2, self.Wg.T)),
        's_h': T.sqrt(T.dot(s**2, self.Wh.T)),
        's_g__h': T.sqrt(T.dot(s**2 * T.dot(h, self.Wh), self.Wg.T)),
        's_h__g': T.sqrt(T.dot(s**2 * T.dot(g, self.Wg), self.Wh.T)),
    }
    output_prods = {
        ## factored representations
        'g': g,
        'h': h,
        'gs': g * atoms['s_g'],
        'hs': h * atoms['s_h'],
        's_g': atoms['s_g'],
        's_h': atoms['s_h'],
        ## unfactored representations
        'sg_s': atoms['g_s'] * s,
        'sh_s': atoms['h_s'] * s,
    }
    toks = output_type.split('+')
    output = output_prods[toks[0]]
    for tok in toks[1:]:
        output = T.horizontal_stack(output, output_prods[tok])
    return output
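# The '+'-separated output_type convention used by several __call__ methods in
# this collection just looks up named representation blocks and stacks them
# column-wise. A stripped-down, self-contained illustration of that selection
# logic with dummy matrices (the names 'g', 'h', 's' and the dummy expressions
# below are placeholders, not the models' real representations):
import numpy as np
import theano
import theano.tensor as T

v = T.dmatrix('v')
output_prods = {'g': v * 2.0, 'h': v + 1.0, 's': T.sqrt(T.abs_(v))}

def build_output(output_type):
    toks = output_type.split('+')
    output = output_prods[toks[0]]
    for tok in toks[1:]:
        output = T.horizontal_stack(output, output_prods[tok])
    return theano.function([v], output)

f = build_output('g+h+s')
print f(np.ones((2, 3))).shape  # (2, 9)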
def get_eem_predict_function(metric_name):
    W = T.dmatrix('W')
    X = T.dmatrix('X')
    beta = T.dmatrix('beta')
    m_plus = T.dmatrix('m_plus')
    m_minus = T.dmatrix('m_minus')
    sigma_plus = T.dmatrix('sigma_plus')
    sigma_minus = T.dmatrix('sigma_minus')
    H = metric_theano[metric_name](X, W)

    def gaussian(x, mu, sigma):
        return T.exp(T.power((x - mu[0]), 2) / (-2 * sigma)[0]) / \
            (sigma * T.sqrt(2 * np.pi))[0]

    x = T.dot(H, beta)
    r_plus = gaussian(x, T.dot(beta.T, m_plus), T.dot(T.dot(beta.T, sigma_plus), beta))
    r_minus = gaussian(x, T.dot(beta.T, m_minus), T.dot(T.dot(beta.T, sigma_minus), beta))
    result = T.argmax(T.horizontal_stack(r_minus, r_plus), axis=1)
    eem_predict_function = theano.function(
        [X, W, beta, m_plus, m_minus, sigma_plus, sigma_minus], result)
    return eem_predict_function
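# The decision rule above reduces to an argmax over two horizontally stacked
# likelihood columns: column 0 holds r_minus and column 1 holds r_plus, so the
# argmax yields 0/1 class labels. A tiny self-contained illustration of that
# idiom (the numbers below are made up, not EEM outputs):
import numpy as np
import theano
import theano.tensor as T

r_minus = T.dmatrix('r_minus')   # shape (n, 1)
r_plus = T.dmatrix('r_plus')     # shape (n, 1)
labels = T.argmax(T.horizontal_stack(r_minus, r_plus), axis=1)
pick_class = theano.function([r_minus, r_plus], labels)

print pick_class(np.array([[0.9], [0.1]]), np.array([[0.2], [0.7]]))  # -> [0 1]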
def imp_step_func(zi_zmuv, si):
    si_as_x = self._from_si_to_x(si)
    xi_unmasked = self.x_out
    xi_masked = (self.x_mask * xi_unmasked) + \
                ((1.0 - self.x_mask) * si_as_x)
    grad_unmasked = self.x_out - si_as_x
    grad_masked = (self.x_mask * grad_unmasked) + \
                  ((1.0 - self.x_mask) * self.grad_null)
    # get samples of next zi, according to the global policy
    zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply(
        T.horizontal_stack(xi_masked, grad_masked),
        do_samples=False)
    zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv)
    # get samples of next zi, according to the guide policy
    zi_q_mean, zi_q_logvar = self.q_zi_given_xi.apply(
        T.horizontal_stack(xi_masked, grad_unmasked),
        do_samples=False)
    zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv)
    # make zi samples that can be switched between zi_p and zi_q
    zi = ((self.train_switch[0] * zi_q) +
          ((1.0 - self.train_switch[0]) * zi_p))
    # compute relevant KLds for this step
    kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, zi_p_mean, zi_p_logvar)  # KL(q || p)
    kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, zi_q_mean, zi_q_logvar)  # KL(p || q)
    kldi_p2g = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0, 0.0)                # KL(p || global prior)
    # compute the next si, given the sampled zi
    hydra_out = self.p_sip1_given_zi.apply(zi)
    si_step = hydra_out[0]
    if (self.step_type == 'jump'):
        # jump steps always completely overwrite the current guesses
        sip1 = si_step
    else:
        # additive steps update the current guesses like an LSTM
        write_gate = T.nnet.sigmoid(3.0 + hydra_out[1])
        erase_gate = T.nnet.sigmoid(3.0 + hydra_out[2])
        sip1 = (erase_gate * si) + (write_gate * si_step)
    # compute NLL for the current imputation
    nlli = self._construct_nll_costs(sip1, self.x_out, self.x_mask)
    return sip1, nlli, kldi_q2p, kldi_p2q, kldi_p2g
def fit_cross_cov(self, n_exp=2, n_gauss=2, range_mu=None): """ Fit an analytical covariance to the experimental data. Args: n_exp (int): number of exponential basic functions n_gauss (int): number of gaussian basic functions range_mu: prior mean of the range. Default mean of the lags Returns: pymc.Model: PyMC3 model to be sampled using MCMC """ self.n_exp = n_exp self.n_gauss = n_gauss n_var = self.n_properties df = self.exp_var lags = self.lags # Prior standard deviation for the error of the regression prior_std_reg = df.std(0).max() * 10 # Prior value for the mean of the ranges if not range_mu: range_mu = lags.mean() # pymc3 Model with pm.Model() as model: # model specifications in PyMC3 are wrapped in a with-statement # Define priors sigma = pm.HalfCauchy('sigma', beta=prior_std_reg, testval=1., shape=n_var) psill = pm.Normal('sill', prior_std_reg, sd=.5 * prior_std_reg, shape=(n_exp + n_gauss)) range_ = pm.Normal('range', range_mu, sd=range_mu * .3, shape=(n_exp + n_gauss)) lambda_ = pm.Uniform('weights', 0, 1, shape=(n_var * (n_exp + n_gauss))) # Exponential covariance exp = pm.Deterministic('exp', # (lambda_[:n_exp*n_var]* psill[:n_exp] * (1. - T.exp(T.dot(-lags.as_matrix().reshape((len(lags), 1)), (range_[:n_exp].reshape((1, n_exp)) / 3.) ** -1)))) gauss = pm.Deterministic('gaus', psill[n_exp:] * (1. - T.exp(T.dot(-lags.as_matrix().reshape((len(lags), 1)) ** 2, (range_[n_exp:].reshape((1, n_gauss)) * 4 / 7.) ** -2)))) # We stack the basic functions in the same matrix and tile it to match the number of properties we have func = pm.Deterministic('func', T.tile(T.horizontal_stack(exp, gauss), (n_var, 1, 1))) # We weight each basic function and sum them func_w = pm.Deterministic("func_w", T.sum(func * lambda_.reshape((n_var, 1, (n_exp + n_gauss))), axis=2)) for e, cross in enumerate(df.columns): # Likelihoods pm.Normal(cross + "_like", mu=func_w[e], sd=sigma[e], observed=df[cross].as_matrix()) return model
def __init__(self, inpt, in_sz, n_classes, tied=False):
    if tied:
        b = share(init_wts(n_classes - 1))
        w = share(init_wts(in_sz, n_classes - 1))
        w1 = tt.horizontal_stack(w, tt.zeros((in_sz, 1)))
        b1 = tt.concatenate((b, tt.zeros(1)))
        self.output = tt.dot(inpt, w1) + b1
    else:
        b = share(init_wts(n_classes))
        w = share(init_wts(in_sz, n_classes))
        self.output = tt.dot(inpt, w) + b
    self.params = [w, b]
def __init__(self, theta):
    self.x = T.dmatrix('x')
    self.y = T.dmatrix('y')
    self.n = self.x.shape[0]
    self.theta = theano.shared(theta)
    self.a = T.horizontal_stack((self.x.dot(self.theta)).reshape([self.n, 1]),
                                T.zeros([self.n, 1]))
    self.prob = T.nnet.softmax(self.a)  # T.exp(self.log_prob)
    self.l = T.dscalar('l')
    self.cost = -(T.sum(T.log(T.nnet.softmax(self.a)) * self.y) / self.n) + \
                self.l * T.sum(self.theta[2:]**2) / self.n
    self.grad = T.grad(self.cost, wrt=self.theta)
    self.hessian = theano.gradient.hessian(self.cost, self.theta)
    self.pred = self.prob > .5
    self.predict = theano.function(inputs=[self.x], outputs=[self.pred])
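# This constructor (like the tied layer two snippets above) appends a zero column
# to the logits, so in the two-class case softmax([a, 0]) reduces to
# [sigmoid(a), 1 - sigmoid(a)]. A short numerical check of that equivalence,
# written as a stand-alone sketch rather than part of either class:
import numpy as np
import theano
import theano.tensor as T

a = T.dmatrix('a')  # one free logit per row
two_class = T.nnet.softmax(T.horizontal_stack(a, T.zeros_like(a)))
as_sigmoid = T.nnet.sigmoid(a)
check = theano.function([a], [two_class, as_sigmoid])

probs, sig = check(np.array([[0.5], [-1.0]]))
print np.allclose(probs[:, 0:1], sig)  # True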
def get_output_for(self, inputs, **kwargs):
    """Compute diffusion convolutional activation of inputs."""
    Apow = T.horizontal_stack(*inputs[:-1])
    X = inputs[-1]
    Apow_dot_X = T.dot(Apow, X)
    Apow_dot_X_times_W = Apow_dot_X * self.W
    out = self.nonlinearity(Apow_dot_X_times_W)
    return out
def __call__(self, v, output_type='g+h'): print 'Building representation with %s' % output_type init_state = OrderedDict() init_state['g'] = T.ones( (v.shape[0], self.n_g)) * T.nnet.sigmoid(self.gbias) init_state['h'] = T.ones( (v.shape[0], self.n_h)) * T.nnet.sigmoid(self.hbias) [g, h, s2_1, s2_0, v, pos_counter] = self.pos_phase( v, init_state, n_steps=self.pos_steps) s = s2_1 atoms = { 'g_s': self.from_g(g), # g in s-space 'h_s': self.from_h(h), # h in s-space 's_g': T.sqrt(self.to_g(s**2)), 's_h': T.sqrt(self.to_h(s**2)), 's_g__h': T.sqrt(self.to_g(s**2 * self.from_h(h))), 's_h__g': T.sqrt(self.to_h(s**2 * self.from_g(g))), } output_prods = { ## factored representations 'g': g, 'h': h, 'gh': (g.dimshuffle(0, 1, 'x') * h.dimshuffle(0, 'x', 1)).flatten( ndim=2), 'gs': g * atoms['s_g'], 'hs': h * atoms['s_h'], 's_g': atoms['s_g'], 's_h': atoms['s_h'], ## unfactored representations 'sg_s': atoms['g_s'] * s, 'sh_s': atoms['h_s'] * s, } toks = output_type.split('+') output = output_prods[toks[0]] for tok in toks[1:]: output = T.horizontal_stack(output, output_prods[tok]) return output
def imp_step_func(zi_zmuv, si): si_as_x = self.obs_transform(si) xi_masked = (self.x_mask * self.x_out) + \ ((1.0 - self.x_mask) * si_as_x) #grad_ll = self.x_out - xi_masked # get samples of next zi, according to the global policy zi_p_mean, zi_p_logvar = self.p_zi_given_xi.apply( \ xi_masked, do_samples=False) zi_p = zi_p_mean + (T.exp(0.5 * zi_p_logvar) * zi_zmuv) # get samples of next zi, according to the guide policy zi_q_mean, zi_q_logvar = self.q_zi_given_x_xi.apply( \ T.horizontal_stack(xi_masked, self.x_out), \ do_samples=False) zi_q = zi_q_mean + (T.exp(0.5 * zi_q_logvar) * zi_zmuv) if self.use_osm_mode: zi = zi_p # compute relevant KLds for this step kldi_q2p = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0, 0.0) kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, 0.0, 0.0) else: # make zi samples that can be switched between zi_p and zi_q zi = ((self.train_switch[0] * zi_q) + \ ((1.0 - self.train_switch[0]) * zi_p)) # compute relevant KLds for this step kldi_q2p = gaussian_kld(zi_q_mean, zi_q_logvar, \ zi_p_mean, zi_p_logvar) kldi_p2q = gaussian_kld(zi_p_mean, zi_p_logvar, \ zi_q_mean, zi_q_logvar) # compute the next si, given the sampled zi hydra_out = self.p_xip1_given_zi.apply(zi) si_step = hydra_out[0] if (self.step_type == 'jump'): # jump steps always do a full swap (like standard VAE) sip1 = si_step else: # additive steps adjust the current guesses incrementally write_gate = T.nnet.sigmoid(2.0 + hydra_out[1]) erase_gate = T.nnet.sigmoid(2.0 + hydra_out[2]) # LSTM-style update sip1 = (erase_gate * si) + (write_gate * si_step) # normal update (this was used in workshop papers) #sip1 = si + si_step # compute NLL for the current imputation nlli = self._construct_nll_costs(sip1, self.x_out, 0.0*self.x_mask) return sip1, nlli, kldi_q2p, kldi_p2q
def get_output_for(self, input, **kwargs):
    distances = conv_pairwise_distance(input, self.V)
    similarities = T.exp(-distances / T.abs_(self.gamma))
    norm = T.sum(similarities, 1).reshape(
        (similarities.shape[0], 1, similarities.shape[2], similarities.shape[3]))
    membership = similarities / (norm + self.eps)
    histogram = T.mean(membership, axis=(2, 3))
    if self.spatial_level == 1:
        pivot1, pivot2 = membership.shape[2] / 2, membership.shape[3] / 2
        h1 = T.mean(membership[:, :, :pivot1, :pivot2], axis=(2, 3))
        h2 = T.mean(membership[:, :, :pivot1, pivot2:], axis=(2, 3))
        h3 = T.mean(membership[:, :, pivot1:, :pivot2], axis=(2, 3))
        h4 = T.mean(membership[:, :, pivot1:, pivot2:], axis=(2, 3))
        # Pyramid is not used in the paper
        # histogram = T.horizontal_stack(h1, h2, h3, h4)
        histogram = T.horizontal_stack(histogram, h1, h2, h3, h4)
    return histogram
def get_output_for(self, inputs, deterministic=False, **kwargs):
    # extract inputs
    H1, H2 = inputs
    # running average projection matrix update
    if not deterministic:
        # compute batch mean
        mean1 = T.mean(H1, axis=0)
        mean2 = T.mean(H2, axis=0)
        # running average updates of means
        mean1 = (floatX(1.0 - self.alpha) * self.mean1 + self.alpha * mean1)
        running_mean1 = theano.clone(self.mean1, share_inputs=False)
        running_mean1.default_update = mean1
        mean1 += 0 * running_mean1
        mean2 = (floatX(1.0 - self.alpha) * self.mean2 + self.alpha * mean2)
        running_mean2 = theano.clone(self.mean2, share_inputs=False)
        running_mean2.default_update = mean2
        mean2 += 0 * running_mean2
        # hidden representations
        H1bar = H1 - mean1
        H2bar = H2 - mean2
    # use means of layer
    else:
        # hidden representations
        H1bar = H1 - self.mean1
        H2bar = H2 - self.mean2
    # re-project data
    lv1_cca = H1bar.dot(self.U)
    lv2_cca = H2bar.dot(self.V)
    output = T.horizontal_stack(lv1_cca, lv2_cca)
    return output
def fit_cross_cov(df, lags, n_exp=2, n_gaus=2, range_mu=None): n_var = df.columns.shape[0] n_basis_f = n_var * (n_exp + n_gaus) prior_std_reg = df.std(0).max() * 10 # if not range_mu: range_mu = lags.mean() # Because is a experimental variogram I am not going to have outliers nugget_max = df.values.max() # print(n_basis_f, n_var*n_exp, nugget_max, range_mu, prior_std_reg) # pymc3 Model with pm.Model() as model: # model specifications in PyMC3 are wrapped in a with-statement # Define priors sigma = pm.HalfCauchy('sigma', beta=prior_std_reg, testval=1., shape=n_var) psill = pm.Normal('sill', prior_std_reg, sd=.5 * prior_std_reg, shape=(n_exp + n_gaus)) range_ = pm.Normal('range', range_mu, sd=range_mu * .3, shape=(n_exp + n_gaus)) # nugget = pm.Uniform('nugget', 0, nugget_max, shape=n_var) lambda_ = pm.Uniform('weights', 0, 1, shape=(n_var * (n_exp + n_gaus))) # Exponential covariance exp = pm.Deterministic('exp', # (lambda_[:n_exp*n_var]* psill[:n_exp] * (1. - T.exp(T.dot(-lags.as_matrix().reshape((len(lags), 1)), (range_[:n_exp].reshape((1, n_exp)) / 3.) ** -1)))) gaus = pm.Deterministic('gaus', psill[n_exp:] * (1. - T.exp(T.dot(-lags.as_matrix().reshape((len(lags), 1)) ** 2, (range_[n_exp:].reshape((1, n_gaus)) * 4 / 7.) ** -2)))) func = pm.Deterministic('func', T.tile(T.horizontal_stack(exp, gaus), (n_var, 1, 1))) func_w = pm.Deterministic("func_w", T.sum(func * lambda_.reshape((n_var, 1, (n_exp + n_gaus))), axis=2)) # nugget.reshape((n_var,1))) for e, cross in enumerate(df.columns): # Likelihoods pm.Normal(cross + "_like", mu=func_w[e], sd=sigma[e], observed=df[cross].as_matrix()) return model
def __call__(self, v, output_type='g+h', mean_field=True): print 'Building representation with %s' % output_type init_state = OrderedDict() init_state['g'] = T.ones((v.shape[0],self.n_g)) * T.nnet.sigmoid(self.gbias) init_state['h'] = T.ones((v.shape[0],self.n_h)) * T.nnet.sigmoid(self.hbias) init_state['l'] = T.ones((v.shape[0],self.n_l)) * T.nnet.softmax(self.lbias) [g, h, l] = self.pos_phase(v, init_state, n_steps=self.pos_steps, mean_field=mean_field) s = self.s_given_ghv(g, h, v) atoms = { 'g_s' : T.dot(g, self.Wg), # g in s-space 'h_s' : T.dot(h, self.Wh), # h in s-space 's_g' : T.sqrt(T.dot(s**2, self.Wg.T)), 's_h' : T.sqrt(T.dot(s**2, self.Wh.T)), 's_g__h' : T.sqrt(T.dot(s**2 * T.dot(h, self.Wh), self.Wg.T)), 's_h__g' : T.sqrt(T.dot(s**2 * T.dot(g, self.Wg), self.Wh.T)) } output_prods = { ## factored representations 'g' : g, 'h' : h, 'gh' : (g.dimshuffle(0,1,'x') * h.dimshuffle(0,'x',1)).flatten(ndim=2), 'gs': g * atoms['s_g'], 'hs': h * atoms['s_h'], 's_g': atoms['s_g'], 's_h': atoms['s_h'], ## unfactored representations 'sg_s' : atoms['g_s'] * s, 'sh_s' : atoms['h_s'] * s, } toks = output_type.split('+') output = output_prods[toks[0]] for tok in toks[1:]: output = T.horizontal_stack(output, output_prods[tok]) return output
def __call__(self, v, output_type='h'):
    print 'Building representation with %s' % output_type
    init_state = OrderedDict()
    h = self.h_given_v(v)
    s = self.s_given_hv(h, v)
    atoms = {
        'h_s': self.from_h(h),           # h in s-space
        's_h': T.sqrt(self.to_h(s**2)),
    }
    output_prods = {
        'h': h,
        's': s,
        'hs': h * atoms['s_h'],
    }
    toks = output_type.split('+')
    output = output_prods[toks[0]]
    for tok in toks[1:]:
        output = T.horizontal_stack(output, output_prods[tok])
    return output
def apply(self, x_t, h_tm1, c_tm1):
    """
    Propagate the current input x_t and the previous exposed state and
    memory state h_tm1/c_tm1 through this LSTM layer.
    """
    hd = self.hid_dim
    # merge exogenous (i.e. x_t) and endogenous (i.e. h_tm1) inputs
    joint_input = T.horizontal_stack(x_t, h_tm1)
    joint_output = T.dot(joint_input, self.W_all) + self.b_all
    jo_T = joint_output.T
    # compute transformed input to the layer
    g_t = T.tanh(jo_T[:, 0:(1*hd)].T)
    # compute input gate
    i_t = T.nnet.sigmoid(jo_T[:, (1*hd):(2*hd)].T)
    # compute forget gate
    f_t = T.nnet.sigmoid(jo_T[:, (2*hd):(3*hd)].T)
    # compute output gate
    o_t = T.nnet.sigmoid(jo_T[:, (3*hd):(4*hd)].T)
    # compute updated memory state
    c_t = (f_t * c_tm1) + (i_t * g_t)
    # compute updated exposed state
    h_t = (o_t * T.tanh(c_t))
    return h_t, c_t
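# The apply() above merges x_t and h_tm1 with horizontal_stack so that a single
# matrix multiply produces all four gate pre-activations. Below is a
# self-contained sketch of the same pattern driven by theano.scan; the
# dimensions, random weights, and slicing layout are placeholders chosen for the
# example, not values taken from the layer above.
import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX
in_dim, hid_dim, batch = 3, 4, 2
W_all = theano.shared(np.asarray(0.1 * np.random.randn(in_dim + hid_dim, 4 * hid_dim),
                                 dtype=floatX))
b_all = theano.shared(np.zeros(4 * hid_dim, dtype=floatX))

def lstm_step(x_t, h_tm1, c_tm1):
    # one dot product gives all gate pre-activations, then slice per gate
    joint = T.dot(T.horizontal_stack(x_t, h_tm1), W_all) + b_all
    g_t = T.tanh(joint[:, 0:hid_dim])
    i_t = T.nnet.sigmoid(joint[:, hid_dim:2 * hid_dim])
    f_t = T.nnet.sigmoid(joint[:, 2 * hid_dim:3 * hid_dim])
    o_t = T.nnet.sigmoid(joint[:, 3 * hid_dim:4 * hid_dim])
    c_t = (f_t * c_tm1) + (i_t * g_t)
    h_t = o_t * T.tanh(c_t)
    return h_t, c_t

x_seq = T.tensor3('x_seq')  # (n_steps, batch, in_dim)
h0 = T.zeros((batch, hid_dim), dtype=floatX)
c0 = T.zeros((batch, hid_dim), dtype=floatX)
(h_seq, c_seq), _ = theano.scan(lstm_step, sequences=x_seq, outputs_info=[h0, c0])
run = theano.function([x_seq], h_seq, allow_input_downcast=True)
print run(np.random.randn(5, batch, in_dim)).shape  # (5, 2, 4)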
def __call__(self, v, output_type='g+h', mean_field=True):
    print 'Building representation with %s' % output_type
    init_state = OrderedDict()
    init_state['g'] = T.ones((v.shape[0], self.n_g)) * T.nnet.sigmoid(self.gbias)
    init_state['h'] = T.ones((v.shape[0], self.n_h)) * T.nnet.sigmoid(self.hbias)
    [g, h, pos_counter] = self.pos_phase(v, init_state,
                                         n_steps=self.pos_steps,
                                         mean_field=mean_field)
    atoms = {
        'g_s': T.dot(g, self.Wg),  # g in s-space
        'h_s': T.dot(h, self.Wh),  # h in s-space
    }
    output_prods = {
        'g': g,
        'h': h,
        'gh': (g.dimshuffle(0, 1, 'x') * h.dimshuffle(0, 'x', 1)).flatten(ndim=2),
    }
    toks = output_type.split('+')
    output = output_prods[toks[0]]
    for tok in toks[1:]:
        output = T.horizontal_stack(output, output_prods[tok])
    return output
def ir_step_func(hi_zmuv, sim1):
    # get variables used throughout this refinement step
    sim1_obs = self.obs_transform(sim1)  # transform state -> obs
    grad_ll = self.x_out - sim1_obs
    # get samples of next hi, conditioned on current si
    hi_p_mean, hi_p_logvar = self.p_hi_given_si.apply(
        sim1_obs, do_samples=False)
    # now we build the model for variational hi given si
    hi_q_mean, hi_q_logvar = self.q_hi_given_x_si.apply(
        T.horizontal_stack(grad_ll, sim1_obs),
        do_samples=False)
    hi_q = (T.exp(0.5 * hi_q_logvar) * hi_zmuv) + hi_q_mean
    hi_p = (T.exp(0.5 * hi_p_logvar) * hi_zmuv) + hi_p_mean
    # make hi samples that can be switched between hi_p and hi_q
    hi = ((self.train_switch[0] * hi_q) +
          ((1.0 - self.train_switch[0]) * hi_p))
    # p_sip1_given_si_hi is conditioned on si and hi.
    ig_vals, fg_vals, in_vals = self.p_sip1_given_si_hi.apply(hi)
    # get the transformed values (for an LSTM style update)
    i_gate = 1.0 * T.nnet.sigmoid(ig_vals + 2.0)
    f_gate = 1.0 * T.nnet.sigmoid(fg_vals + 2.0)
    # perform an LSTM-like update of the state sim1 -> si
    si = (in_vals * i_gate) + (sim1 * f_gate)
    # compute generator NLL for this step
    nlli = self.log_prob_func(self.x_out, self.obs_transform(si))
    # compute relevant KLds for this step
    kldi_q2p = gaussian_kld(hi_q_mean, hi_q_logvar, hi_p_mean, hi_p_logvar)
    kldi_p2q = gaussian_kld(hi_p_mean, hi_p_logvar, hi_q_mean, hi_q_logvar)
    return si, nlli, kldi_q2p, kldi_p2q
def __init__(self, rng=None, \ x_in=None, x_out=None, \ p_h_given_z=None, \ p_x_given_h=None, \ q_z_given_x=None, \ q_h_given_z_x=None, \ x_dim=None, \ z_dim=None, \ h_dim=None, \ params=None, \ shared_param_dicts=None): # setup a rng for this GIPair self.rng = RandStream(rng.randint(100000)) # grab the user-provided parameters self.params = params self.x_type = self.params['x_type'] assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian')) if 'obs_transform' in self.params: assert((self.params['obs_transform'] == 'sigmoid') or \ (self.params['obs_transform'] == 'none')) if self.params['obs_transform'] == 'sigmoid': self.obs_transform = lambda x: T.nnet.sigmoid(x) else: self.obs_transform = lambda x: x else: self.obs_transform = lambda x: T.nnet.sigmoid(x) if self.x_type == 'bernoulli': self.obs_transform = lambda x: T.nnet.sigmoid(x) self.shared_param_dicts = shared_param_dicts # record the dimensions of various spaces relevant to this model self.x_dim = x_dim self.z_dim = z_dim self.h_dim = h_dim # grab handles to the relevant InfNets self.q_z_given_x = q_z_given_x self.q_h_given_z_x = q_h_given_z_x self.p_h_given_z = p_h_given_z self.p_x_given_h = p_x_given_h # record the symbolic variables that will provide inputs to the # computation graph created to describe this MultiStageModel self.x_in = x_in self.x_out = x_out # setup switching variable for changing between sampling/training zero_ary = to_fX( np.zeros((1,)) ) self.train_switch = theano.shared(value=zero_ary, name='tsm_train_switch') self.set_train_switch(1.0) if self.shared_param_dicts is None: # initialize "optimizable" parameters specific to this MSM init_vec = to_fX( np.zeros((1,self.z_dim)) ) self.p_z_mean = theano.shared(value=init_vec, name='tsm_p_z_mean') self.p_z_logvar = theano.shared(value=init_vec, name='tsm_p_z_logvar') self.obs_logvar = theano.shared(value=zero_ary, name='tsm_obs_logvar') self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar) self.shared_param_dicts = {} self.shared_param_dicts['p_z_mean'] = self.p_z_mean self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar self.shared_param_dicts['obs_logvar'] = self.obs_logvar else: self.p_z_mean = self.shared_param_dicts['p_z_mean'] self.p_z_logvar = self.shared_param_dicts['p_z_logvar'] self.obs_logvar = self.shared_param_dicts['obs_logvar'] self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar) ############################################## # Setup the TwoStageModels main computation. 
# ############################################## print("Building TSM...") # samples of "hidden" latent state (from both p and q) z_q_mean, z_q_logvar, z_q = \ self.q_z_given_x.apply(self.x_in, do_samples=True) z_p_mean = self.p_z_mean.repeat(z_q.shape[0], axis=0) z_p_logvar = self.p_z_logvar.repeat(z_q.shape[0], axis=0) zmuv = self.rng.normal(size=z_q.shape, avg=0.0, std=1.0, \ dtype=theano.config.floatX) z_p = (T.exp(0.5*z_p_logvar) * zmuv) + z_p_mean self.z = (self.train_switch[0] * z_q) + \ ((1.0 - self.train_switch[0]) * z_p) # compute relevant KLds for this step self.kld_z_q2p = gaussian_kld(z_q_mean, z_q_logvar, \ z_p_mean, z_p_logvar) self.kld_z_p2q = gaussian_kld(z_p_mean, z_p_logvar, \ z_q_mean, z_q_logvar) # samples of "hidden" latent state (from both p and q) h_p_mean, h_p_logvar, h_p = self.p_h_given_z.apply(self.z) h_q_mean, h_q_logvar, h_q = self.q_h_given_z_x.apply( \ T.horizontal_stack(h_p_mean, h_p_logvar, self.x_out)) self.h = (self.train_switch[0] * h_q) + \ ((1.0 - self.train_switch[0]) * h_p) # compute relevant KLds for this step self.kld_h_q2p = gaussian_kld(h_q_mean, h_q_logvar, \ h_p_mean, h_p_logvar) self.kld_h_p2q = gaussian_kld(h_p_mean, h_p_logvar, \ h_q_mean, h_q_logvar) # p_x_given_h generates an observation x conditioned on the "hidden" # latent variables h. self.x_gen, _ = self.p_x_given_h.apply(self.h, do_samples=False) ###################################################################### # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE # ###################################################################### # shared var learning rate for generator and inferencer zero_ary = to_fX( np.zeros((1,)) ) self.lr = theano.shared(value=zero_ary, name='tsm_lr') # shared var momentum parameters for generator and inferencer self.mom_1 = theano.shared(value=zero_ary, name='tsm_mom_1') self.mom_2 = theano.shared(value=zero_ary, name='tsm_mom_2') # init parameters for controlling learning dynamics self.set_sgd_params() # init shared var for weighting nll of data given posterior sample self.lam_nll = theano.shared(value=zero_ary, name='tsm_lam_nll') self.set_lam_nll(lam_nll=1.0) # init shared var for weighting prior kld against reconstruction self.lam_kld_q2p = theano.shared(value=zero_ary, name='tsm_lam_kld_q2p') self.lam_kld_p2q = theano.shared(value=zero_ary, name='tsm_lam_kld_p2q') self.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0) # init shared var for controlling l2 regularization on params self.lam_l2w = theano.shared(value=zero_ary, name='tsm_lam_l2w') self.set_lam_l2w(1e-5) # get optimizable parameters belonging to the TwoStageModel self_params = [self.obs_logvar] #+ [self.p_z_mean, self.p_z_logvar] # get optimizable parameters belonging to the underlying networks child_params = [] child_params.extend(self.q_z_given_x.mlp_params) child_params.extend(self.q_h_given_z_x.mlp_params) child_params.extend(self.p_h_given_z.mlp_params) child_params.extend(self.p_x_given_h.mlp_params) # make a joint list of all optimizable parameters self.joint_params = self_params + child_params ################################# # CONSTRUCT THE KLD-BASED COSTS # ################################# self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \ (self.lam_kld_p2q[0] * self.kld_z_p2q) self.kld_h = (self.lam_kld_q2p[0] * self.kld_h_q2p) + \ (self.lam_kld_p2q[0] * self.kld_h_p2q) self.kld_costs = T.sum(self.kld_z, axis=1) + \ T.sum(self.kld_h, axis=1) # compute "mean" (rather than per-input) costs self.kld_cost = T.mean(self.kld_costs) 
################################# # CONSTRUCT THE NLL-BASED COSTS # ################################# self.nll_costs = self._construct_nll_costs(self.x_out) self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs) ######################################## # CONSTRUCT THE REST OF THE JOINT COST # ######################################## param_reg_cost = self._construct_reg_costs() self.reg_cost = self.lam_l2w[0] * param_reg_cost self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost ############################## # CONSTRUCT A PER-INPUT COST # ############################## self.obs_costs = self.nll_costs + self.kld_costs # get the gradient of the joint cost for all optimizable parameters print("Computing gradients of self.joint_cost...") self.joint_grads = OrderedDict() grad_list = T.grad(self.joint_cost, self.joint_params) for i, p in enumerate(self.joint_params): self.joint_grads[p] = grad_list[i] # construct the updates for the generator and inferencer networks all_updates = get_adam_updates(params=self.joint_params, \ grads=self.joint_grads, alpha=self.lr, \ beta1=self.mom_1, beta2=self.mom_2, \ mom2_init=1e-3, smoothing=1e-4, max_grad_norm=5.0) self.joint_updates = OrderedDict() for k in all_updates: self.joint_updates[k] = all_updates[k] # Construct a function for jointly training the generator/inferencer print("Compiling training function...") self.train_joint = self._construct_train_joint() print("Compiling free-energy sampler...") self.compute_fe_terms = self._construct_compute_fe_terms() print("Compiling open-loop model sampler...") self.sample_from_prior = self._construct_sample_from_prior() return
def learnAndPredict(Ti, C, TOList): rng = np.random.RandomState(SEED) learning_rate = learning_rate0 print np.mean(Ti[1000,:]) aminW = np.amin(Ti[:1000,:]) amaxW = np.amax(Ti[:1000,:]) Ti[:1000,:] = (Ti[:1000,:] - aminW) / (amaxW - aminW) astdW = np.std(Ti[:1000,:]) ameanW = np.mean(Ti[:1000,:]) Ti[:1000,:] = (Ti[:1000,:] - ameanW) / astdW aminacW = np.amin(Ti[1000,:]) amaxacW = np.amax(Ti[1000,:]) print aminW, amaxW, aminacW, amaxacW Ti[1000,:] = (Ti[1000,:] - aminacW) / (amaxacW - aminacW) astdacW = np.std(Ti[1000,:]) ameanacW = np.mean(Ti[1000,:]) Ti[1000,:] = (Ti[1000,:] - ameanacW) / astdacW ile__ = len(TOList) ileList = np.zeros(ile__) for titer in range(len(TOList)): print np.mean(TOList[titer][1000,:]) TOList[titer][:1000,:] = (TOList[titer][:1000,:] - aminW)/(amaxW - aminW) TOList[titer][:1000,:] = (TOList[titer][:1000,:] - ameanW)/astdW TOList[titer][1000,:] = (TOList[titer][1000,:] - aminacW)/(amaxacW - aminacW) TOList[titer][1000,:] = (TOList[titer][1000,:] - ameanacW)/astdacW _, ileList[titer] = TOList[titer].shape _, ile = Ti.shape N = NN data = []; yyy = []; need = 1; BYL = {}; j= 0; dwa = 0; ONES = []; ZEROS = [] for i in range(NN): for j in range(NN): if i!= j: if C[i][j]==1: ONES.append((i,j)) else: ZEROS.append((i,j)) Nones = len(ONES) rng.shuffle(ONES) Nzeros = len(ZEROS) print Nones print Nzeros Needed = NUM_TRAIN/2 onesPerPair = Needed / Nones + 1 onesIter = 0 jj = 0 while jj < NUM_TRAIN: if jj%300000 == 0: print jj/300000, need = 1 - need if need == 1: pairNo = onesIter % Nones ppp = onesIter / Nones s,t = ONES[pairNo] shift = rng.randint(0, ile - L) onesIter += 1 if need == 0: zer = rng.randint(Nzeros) s,t = ZEROS[zer] del ZEROS[zer] Nzeros -= 1 shift = rng.randint(0, ile - L) x = np.hstack(( Ti[s][shift:shift+L], Ti[t][shift:shift+L], Ti[1000][shift:shift+L])) y = C[s][t] data.append(x); yyy.append(y) jj+=1 data = np.array(data, dtype=theano.config.floatX) is_train = np.array( ([0]*96 + [1,1,2,2]) * (NUM_TRAIN / 100)) yyy = np.array(yyy) train_set_x0, train_set_y0 = np.array(data[is_train==0]), yyy[is_train==0] test_set_x, test_set_y = np.array(data[is_train==1]), yyy[is_train==1] valid_set_x, valid_set_y = np.array(data[is_train==2]), yyy[is_train==2] n_train_batches = len(train_set_y0) / batch_size n_valid_batches = len(valid_set_y) / batch_size n_test_batches = len(test_set_y) / batch_size epoch = T.scalar() index = T.lscalar() x = T.matrix('x') inone2 = T.matrix('inone2') y = T.ivector('y') print '... 
building the model' #-------- my layers ------------------- #--------------------- layer0_input = x.reshape((batch_size, 1, 3, L)) Cx = 5 layer0 = ConvolutionalLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 3, L), filter_shape=(nkerns[0], 1, 2, Cx), poolsize=(1, 1), fac = 0) ONE = (3 - 2 + 1) / 1 L2 = (L - Cx + 1) / 1 #--------------------- Cx2 = 5 layer1 = ConvolutionalLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], ONE, L2), filter_shape=(nkerns[1], nkerns[0], 2, Cx2), poolsize=(1, 1), activation=ReLU, fac = 0) ONE = (ONE - 2 + 1) /1 L3 = (L2 - Cx2 + 1) /1 #--------------------- Cx3 = 1 layer1b = ConvolutionalLayer(rng, input=layer1.output, image_shape=(batch_size, nkerns[1], ONE, L3), filter_shape=(nkerns[2], nkerns[1], 1, Cx3), poolsize=(1, POOL), activation=ReLU, fac = 0) ONE = (ONE - 1 + 1) /1 L4 = (L3 - Cx3 + 1) /POOL REGx = 100 #--------------------- layer2_input = layer1b.output.flatten(2) print layer2_input.shape use_b = False layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[2]*L4 , n_out=REGx, activation=T.tanh, use_bias = use_b) layer3 = LogisticRegression(input=layer2.output, n_in=REGx, n_out=2) cost = layer3.negative_log_likelihood(y) out_x2 = theano.shared(np.asarray(np.zeros((N,L)), dtype=theano.config.floatX)) inone2 = theano.shared(np.asarray(np.zeros((1,L)), dtype=theano.config.floatX)) inone3 = theano.shared(np.asarray(np.zeros((1,L)), dtype=theano.config.floatX)) inone4 = theano.shared(np.asarray(np.zeros((1,L)), dtype=theano.config.floatX)) test_set_x = theano.shared(np.asarray(test_set_x, dtype=theano.config.floatX)) train_set_x = theano.shared(np.asarray(train_set_x0, dtype=theano.config.floatX)) train_set_y = T.cast(theano.shared(np.asarray(train_set_y0, dtype=theano.config.floatX)), 'int32') test_set_y = T.cast(theano.shared(np.asarray(test_set_y, dtype=theano.config.floatX)), 'int32') valid_set_y = T.cast(theano.shared(np.asarray(valid_set_y, dtype=theano.config.floatX)), 'int32') valid_set_x = theano.shared(np.asarray(valid_set_x, dtype=theano.config.floatX)) test_model = theano.function([index], layer3.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size]}) validate_model = theano.function([index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size]}) mom_start = 0.5; mom_end = 0.98; mom_epoch_interval = n_epochs * 1.0 #### @@@@@@@@@@@ class_params0 = [layer3, layer2, layer1, layer1b, layer0] class_params = [ param for layer in class_params0 for param in layer.params ] gparams = [] for param in class_params: gparam = T.grad(cost, param) gparams.append(gparam) gparams_mom = [] for param in class_params: gparam_mom = theano.shared(np.zeros(param.get_value(borrow=True).shape, dtype=theano.config.floatX)) gparams_mom.append(gparam_mom) mom = ifelse(epoch < mom_epoch_interval, mom_start*(1.0 - epoch/mom_epoch_interval) + mom_end*(epoch/mom_epoch_interval), mom_end) updates = OrderedDict() for gparam_mom, gparam in zip(gparams_mom, gparams): updates[gparam_mom] = mom * gparam_mom - (1. 
- mom) * learning_rate * gparam for param, gparam_mom in zip(class_params, gparams_mom): stepped_param = param + updates[gparam_mom] squared_filter_length_limit = 15.0 if param.get_value(borrow=True).ndim == 2: col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0)) desired_norms = T.clip(col_norms, 0, T.sqrt(squared_filter_length_limit)) scale = desired_norms / (1e-7 + col_norms) updates[param] = stepped_param * scale else: updates[param] = stepped_param output = cost train_model = theano.function(inputs=[epoch, index], outputs=output, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size]}) keep = theano.function([index], layer3.errorsFull(y), givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size]}, on_unused_input='warn') timer = time.clock() print "finished reading", (timer - start_time0) /60. , "minutes " # TRAIN MODEL # print '... training' validation_frequency = n_train_batches; best_params = None; best_validation_loss = np.inf best_iter = 0; test_score = 0.; epochc = 0; while (epochc < n_epochs): epochc = epochc + 1 learning_rate = learning_rate0 * (1.2 - ((1.0 * epochc)/n_epochs)) for minibatch_index in xrange(n_train_batches): iter = (epochc - 1) * n_train_batches + minibatch_index cost_ij = train_model(epochc, minibatch_index) if (iter + 1) % validation_frequency == 0: validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = np.mean(validation_losses) print(' %i) err %.2f ' % (epochc, this_validation_loss/10)), L, nkerns, REGx, "|", Cx, Cx2, Cx3, batch_size if this_validation_loss < best_validation_loss or epochc % 30 == 0: best_validation_loss = this_validation_loss best_iter = iter test_losses = [test_model(i) for i in xrange(n_test_batches)] test_score = np.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epochc, minibatch_index + 1, n_train_batches, test_score/10)) ############ timel = time.clock() print "finished learning", (timel - timer) /60. , "minutes " ppm = theano.function([index], layer3.pred_proba_mine(), givens={ x: T.horizontal_stack(T.tile(inone2, (batch_size ,1)), out_x2[index * batch_size: (index + 1) * batch_size], T.tile(inone3, (batch_size ,1))), y: train_set_y[0 * (batch_size): (0 + 1) * (batch_size)] }, on_unused_input='warn') NONZERO = (N*N-N) gc.collect() RESList = [np.zeros((N,N)) for it in range(ile__)] for __net in range(ile__): TO = TOList[__net] ileO = ileList[__net] RES = RESList[__net] shift = 0.1 DELTAshift = (ileO-L) / (Q-1) print "DELTAshift:", DELTAshift for q in range (Q): dataO = []; print (q+1),"/", Q , " ", out_x2.set_value(np.asarray(np.array(TO[:,shift:shift+L]), dtype=theano.config.floatX)) PARTIAL = np.zeros((N,N)) inone3.set_value(np.asarray(np.array(TO[1000][shift:shift+L]).reshape(1,L), dtype=theano.config.floatX)) for i in range(N): inone2.set_value(np.asarray(np.array(TO[i][shift:shift+L]).reshape(1,L), dtype=theano.config.floatX)) p = [ppm(ii) for ii in xrange( N / batch_size)] for pos in range(N): if pos != i: PARTIAL[i][pos] += p[pos / batch_size][pos % batch_size][1] for i in range(N): for j in range(N): RES[i][j] += PARTIAL[i][j] shift += DELTAshift print "Finished", __net RESList[__net] = RES/np.max(RES) gc.collect() end_time = time.clock() print "finished predicting", (end_time - timel) /60. 
, "minutes ", str(nkerns), "using SEED = ", SEED print('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time0) / 60.)) return RESList
def train_conv_net( datasets, rel_tr, rel_te, rel_de, hlen, U, # yluo: embedding matrix fnres, img_w=300, filter_hs=[3, 4, 5], hidden_units=[100, 2], # hidden_units[1] is number of classes dropout_rate=[0.5], shuffle_batch=True, n_epochs=25, batch_size=50, # yluo: how many sentences to extract to compute gradient lr_decay=0.95, conv_non_linear="relu", activations=[Iden], sqr_norm_lim=9, non_static=True, relname=None): """ Train a simple conv net img_h = sentence length (padded where necessary) img_w = word vector length (300 for word2vec) filter_hs = filter window sizes hidden_units = [x,y] x is the number of feature maps (per filter window), and y is the penultimate layer sqr_norm_lim = s^2 in the paper lr_decay = adadelta decay parameter """ hrel_tr = make_rel_hash(rel_tr) hrel_te = make_rel_hash(rel_te) hrel_de = make_rel_hash(rel_de) rng = np.random.RandomState() img_h_tot = len( datasets[0][0] ) - 2 # SS: exclude 2 dimensions: (iid, y). compa1 and compa2 are included pad = max(filter_hs) - 1 filter_w = img_w # yluo: what does different feature maps correspond to? feature_maps = hidden_units[0] filter_shapes = [] for filter_h in filter_hs: # yluo: what does 1 in the filter shape mean? # (number of filters, num input feature maps, filter height, filter width) # how to interpet different filters? filter_shapes.append((feature_maps, 1, filter_h, filter_w)) parameters = [("image shape", img_h_tot, img_w), ("filter shape", filter_shapes), ("hidden_units", hidden_units), ("dropout", dropout_rate), ("batch_size", batch_size), ("non_static", non_static), ("learn_decay", lr_decay), ("conv_non_linear", conv_non_linear), ("non_static", non_static), ("sqr_norm_lim", sqr_norm_lim), ("shuffle_batch", shuffle_batch)] print parameters #define model architecture index = T.lscalar() # x = T.matrix('x') c1 = T.matrix('c1') c2 = T.matrix('c2') prec = T.matrix('prec') mid = T.matrix('mid') succ = T.matrix('succ') y = T.ivector('y') iid = T.vector('iid') compa1 = T.vector('compa1') # compatibility1 of c1/c2 compa2 = T.vector('compa2') # compatibility2 of c1/c2 semclass1 = T.vector('semclass1') # semclass of a "predicate" semclass2 = T.vector('semclass2') # semclass of a "predicate" semclass3 = T.vector('semclass3') # semclass of a "predicate" semclass4 = T.vector('semclass4') # semclass of a "predicate" semclass5 = T.vector('semclass5') # semclass of a "predicate" #pr = theano.printing.Print("COMPA")(compa) Words = theano.shared(value=U, name="Words") zero_vec_tensor = T.vector() zero_vec = np.zeros(img_w) set_zero = theano.function([zero_vec_tensor], updates=[ (Words, T.set_subtensor(Words[0, :], zero_vec_tensor)) ], allow_input_downcast=True) c1_input = Words[T.cast(c1.flatten(), dtype="int32")].reshape( (c1.shape[0], 1, c1.shape[1], Words.shape[1])) # reshape to 3d array # Words[T.cast(c1.flatten(),dtype="int32")] >>> len c1 flattened*emb_dim # c1_input >>> n_insts * 1 * n_ws_per_inst * emb_dim c2_input = Words[T.cast(c2.flatten(), dtype="int32")].reshape( (c2.shape[0], 1, c2.shape[1], Words.shape[1])) # reshape to 3d array prec_input = Words[T.cast(prec.flatten(), dtype="int32")].reshape( (prec.shape[0], 1, prec.shape[1], Words.shape[1])) # reshape to 3d array mid_input = Words[T.cast(mid.flatten(), dtype="int32")].reshape( (mid.shape[0], 1, mid.shape[1], Words.shape[1])) # reshape to 3d array succ_input = Words[T.cast(succ.flatten(), dtype="int32")].reshape( (succ.shape[0], 1, succ.shape[1], Words.shape[1])) # reshape to 3d array layer0_input = { 'c1': c1_input, 'c2': c2_input, 'prec': prec_input, 
'mid': mid_input, 'succ': succ_input } conv_layers = [] layer1_inputs = [] for i in xrange(len(filter_hs)): for seg in hlen.keys(): # used hlen as a global var, to fix filter_shape = filter_shapes[i] img_h = hlen[seg] + 2 * pad pool_size = (img_h - filter_h + 1, img_w - filter_w + 1) conv_layer = LeNetConvPoolLayer(rng, input=layer0_input[seg], image_shape=(batch_size, 1, img_h, img_w), filter_shape=filter_shape, poolsize=pool_size, non_linear=conv_non_linear) layer1_input = conv_layer.output.flatten( 2) # yluo: 2 dimensions >>> conv_layers.append(conv_layer) # yluo: layer 0 layer1_inputs.append(layer1_input) # yluo: 3 dimensions layer1_input = T.concatenate( layer1_inputs, 1) # yluo: 2 dimensions >>> n_insts * concat_dim? layer1_input = T.horizontal_stack( layer1_input, compa1.reshape((compa1.shape[0], 1)), compa2.reshape((compa2.shape[0], 1)), semclass1.reshape((semclass1.shape[0], 1)), semclass2.reshape((semclass2.shape[0], 1)), semclass3.reshape((semclass3.shape[0], 1)), semclass4.reshape((semclass4.shape[0], 1)), semclass5.reshape((semclass5.shape[0], 1))) hidden_units[0] = feature_maps * len(filter_hs) * len( hlen ) + 2 + 5 # compa: plus 2 (we have two compa feats); semclass: plus 5 classifier = MLPDropout(rng, input=layer1_input, layer_sizes=hidden_units, activations=activations, dropout_rates=dropout_rate) #define parameters of the model and update functions using adadelta params = classifier.params for conv_layer in conv_layers: params += conv_layer.params if non_static: #if word vectors are allowed to change, add them as model parameters params += [Words] cost = classifier.negative_log_likelihood(y) dropout_cost = classifier.dropout_negative_log_likelihood(y) grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6, sqr_norm_lim) #shuffle dataset and assign to mini batches. 
if dataset size is not a multiple of mini batches, replicate , stochastic gradient descent #extra data (at random) tr_size = datasets[0].shape[0] de_size = datasets[2].shape[0] hi_seg = datasets[3] print(hi_seg) c1s, c1e = hi_seg['c1'] c2s, c2e = hi_seg['c2'] mids, mide = hi_seg['mid'] precs, prece = hi_seg['prec'] succs, succe = hi_seg['succ'] yi = hi_seg['y'] idi = hi_seg['iid'] compa1i = hi_seg['compa1'] compa2i = hi_seg['compa2'] semclass1i = hi_seg['semclass1'] semclass2i = hi_seg['semclass2'] semclass3i = hi_seg['semclass3'] semclass4i = hi_seg['semclass4'] semclass5i = hi_seg['semclass5'] if tr_size % batch_size > 0: extra_data_num = batch_size - tr_size % batch_size train_set = rng.permutation(datasets[0]) extra_data = train_set[:extra_data_num] new_data = np.append(datasets[0], extra_data, axis=0) else: new_data = datasets[0] new_data = rng.permutation(new_data) n_batches = new_data.shape[0] / batch_size #n_train_batches = int(np.round(n_batches*0.9)) n_train_batches = n_batches if de_size % batch_size > 0: extra_data_num = batch_size - de_size % batch_size dev_set = rng.permutation(datasets[2]) extra_data = dev_set[:extra_data_num] new_data_de = np.append(datasets[2], extra_data, axis=0) else: new_data_de = datasets[2] new_data_de = rng.permutation(new_data_de) n_dev_batches = new_data_de.shape[0] / batch_size #divide train set into train/val sets c1_te = datasets[1][:, c1s:c1e] c2_te = datasets[1][:, c2s:c2e] prec_te = datasets[1][:, precs:prece] mid_te = datasets[1][:, mids:mide] succ_te = datasets[1][:, succs:succe] test_set = datasets[1] y_te = np.asarray(test_set[:, yi], "int32") compa1_te = np.asarray(test_set[:, compa1i], "float32") compa2_te = np.asarray(test_set[:, compa2i], "float32") semclass1_te = np.asarray(test_set[:, semclass1i], "float32") semclass2_te = np.asarray(test_set[:, semclass2i], "float32") semclass3_te = np.asarray(test_set[:, semclass3i], "float32") semclass4_te = np.asarray(test_set[:, semclass4i], "float32") semclass5_te = np.asarray(test_set[:, semclass5i], "float32") train_set = new_data[:n_train_batches * batch_size, :] dev_set = new_data_de[:n_dev_batches * batch_size:, :] x_tr, y_tr = shared_dataset((train_set[:, :img_h_tot], train_set[:, -1])) x_de, y_de = shared_dataset((dev_set[:, :img_h_tot], dev_set[:, -1])) iid_tr = train_set[:, idi].flatten() iid_de = dev_set[:, idi].flatten() iid_te = test_set[:, idi].flatten() print('len iid_de %d' % (len(iid_de))) #compile theano functions to get train/val/test errors dev_model = theano.function( [index], classifier.preds(y), givens={ c1: x_de[index * batch_size:(index + 1) * batch_size, c1s:c1e], c2: x_de[index * batch_size:(index + 1) * batch_size, c2s:c2e], prec: x_de[index * batch_size:(index + 1) * batch_size, precs:prece], mid: x_de[index * batch_size:(index + 1) * batch_size, mids:mide], succ: x_de[index * batch_size:(index + 1) * batch_size, succs:succe], compa1: x_de[index * batch_size:(index + 1) * batch_size, compa1i], compa2: x_de[index * batch_size:(index + 1) * batch_size, compa2i], semclass1: x_de[index * batch_size:(index + 1) * batch_size, semclass1i], semclass2: x_de[index * batch_size:(index + 1) * batch_size, semclass2i], semclass3: x_de[index * batch_size:(index + 1) * batch_size, semclass3i], semclass4: x_de[index * batch_size:(index + 1) * batch_size, semclass4i], semclass5: x_de[index * batch_size:(index + 1) * batch_size, semclass5i], y: y_de[index * batch_size:(index + 1) * batch_size], }, allow_input_downcast=True, on_unused_input='warn') # this test_model is batch test model 
for train test_model = theano.function( [index], classifier.errors(y), givens={ c1: x_tr[index * batch_size:(index + 1) * batch_size, c1s:c1e], c2: x_tr[index * batch_size:(index + 1) * batch_size, c2s:c2e], prec: x_tr[index * batch_size:(index + 1) * batch_size, precs:prece], mid: x_tr[index * batch_size:(index + 1) * batch_size, mids:mide], succ: x_tr[index * batch_size:(index + 1) * batch_size, succs:succe], compa1: x_tr[index * batch_size:(index + 1) * batch_size, compa1i], compa2: x_tr[index * batch_size:(index + 1) * batch_size, compa2i], semclass1: x_tr[index * batch_size:(index + 1) * batch_size, semclass1i], semclass2: x_tr[index * batch_size:(index + 1) * batch_size, semclass2i], semclass3: x_tr[index * batch_size:(index + 1) * batch_size, semclass3i], semclass4: x_tr[index * batch_size:(index + 1) * batch_size, semclass4i], semclass5: x_tr[index * batch_size:(index + 1) * batch_size, semclass5i], y: y_tr[index * batch_size:(index + 1) * batch_size] }, allow_input_downcast=True) train_model = theano.function( [index], cost, updates=grad_updates, givens={ c1: x_tr[index * batch_size:(index + 1) * batch_size, c1s:c1e], c2: x_tr[index * batch_size:(index + 1) * batch_size, c2s:c2e], prec: x_tr[index * batch_size:(index + 1) * batch_size, precs:prece], mid: x_tr[index * batch_size:(index + 1) * batch_size, mids:mide], succ: x_tr[index * batch_size:(index + 1) * batch_size, succs:succe], compa1: x_tr[index * batch_size:(index + 1) * batch_size, compa1i], compa2: x_tr[index * batch_size:(index + 1) * batch_size, compa2i], semclass1: x_tr[index * batch_size:(index + 1) * batch_size, semclass1i], semclass2: x_tr[index * batch_size:(index + 1) * batch_size, semclass2i], semclass3: x_tr[index * batch_size:(index + 1) * batch_size, semclass3i], semclass4: x_tr[index * batch_size:(index + 1) * batch_size, semclass4i], semclass5: x_tr[index * batch_size:(index + 1) * batch_size, semclass5i], y: y_tr[index * batch_size:(index + 1) * batch_size] }, allow_input_downcast=True) test_pred_layers = [] test_size = len(y_te) c1_te_input = Words[T.cast(c1.flatten(), dtype="int32")].reshape( (c1_te.shape[0], 1, c1_te.shape[1], Words.shape[1])) c2_te_input = Words[T.cast(c2.flatten(), dtype="int32")].reshape( (c2_te.shape[0], 1, c2_te.shape[1], Words.shape[1])) prec_te_input = Words[T.cast(prec.flatten(), dtype="int32")].reshape( (prec_te.shape[0], 1, prec_te.shape[1], Words.shape[1])) mid_te_input = Words[T.cast(mid.flatten(), dtype="int32")].reshape( (mid_te.shape[0], 1, mid_te.shape[1], Words.shape[1])) succ_te_input = Words[T.cast(succ.flatten(), dtype="int32")].reshape( (succ_te.shape[0], 1, succ_te.shape[1], Words.shape[1])) test_layer0_input = { 'c1': c1_te_input, 'c2': c2_te_input, 'prec': prec_te_input, 'mid': mid_te_input, 'succ': succ_te_input } cl_id = 0 # conv layer id for i in xrange(len(filter_hs)): for seg in hlen.keys(): conv_layer = conv_layers[cl_id] test_layer0_output = conv_layer.predict( test_layer0_input[seg], test_size ) ## doesn't seeem to matter if just use layer0_input here test_pred_layers.append(test_layer0_output.flatten(2)) cl_id += 1 test_layer1_input = T.concatenate(test_pred_layers, 1) #test_layer1_input = T.horizontal_stack(test_layer1_input, compa_te.reshape((compa_te.shape[0], 1))) test_layer1_input = T.horizontal_stack( test_layer1_input, compa1.reshape((compa1.shape[0], 1)), compa2.reshape((compa2.shape[0], 1)), semclass1.reshape((semclass1.shape[0], 1)), semclass2.reshape((semclass2.shape[0], 1)), semclass3.reshape((semclass3.shape[0], 1)), 
semclass4.reshape((semclass4.shape[0], 1)), semclass5.reshape((semclass5.shape[0], 1))) test_y_pred = classifier.predict(test_layer1_input) test_error = T.mean(T.neq(test_y_pred, y)) test_model_all = theano.function([ c1, c2, prec, mid, succ, compa1, compa2, semclass1, semclass2, semclass3, semclass4, semclass5 ], test_y_pred, allow_input_downcast=True) #start training over mini-batches print '... training' epoch = 0 best_dev_perf = 0 test_perf = 0 cost_epoch = 0 while (epoch < n_epochs): start_time = time.time() epoch = epoch + 1 if shuffle_batch: for minibatch_index in rng.permutation(range(n_train_batches)): cost_epoch = train_model(minibatch_index) set_zero(zero_vec) else: for minibatch_index in xrange(n_train_batches): cost_epoch = train_model(minibatch_index) set_zero(zero_vec) train_losses = [ np.mean(test_model(i)) for i in xrange(n_train_batches) ] train_perf = 1 - np.mean(train_losses) dev_preds = np.asarray([]) for i in xrange(n_dev_batches): dev_sb_preds = dev_model(i) y_sb = y_de[i * batch_size:(i + 1) * batch_size].eval() dev_sb_errors = dev_sb_preds != y_sb err_ind = [j for j, x in enumerate(dev_sb_errors) if x == 1] dev_sb = iid_de[i * batch_size:(i + 1) * batch_size] dev_preds = np.append(dev_preds, dev_sb_preds) dev_perf = 1 - np.mean(y_de.eval() != dev_preds) dev_cm = su.confMat(y_de.eval(), dev_preds, hidden_units[1]) (dev_pres, dev_recs, dev_f1s, dev_mipre, dev_mirec, dev_mif) = su.cmPRF(dev_cm, ncstart=1) print( 'epoch: %i, training time: %.2f secs, train perf: %.2f %%, dev_mipre: %.2f %%, dev_mirec: %.2f %%, dev_mif: %.2f %%' % (epoch, time.time() - start_time, train_perf * 100., dev_mipre * 100., dev_mirec * 100., dev_mif * 100.)) if dev_mif >= best_dev_perf: best_dev_perf = dev_mif test_pred = test_model_all(c1_te, c2_te, prec_te, mid_te, succ_te, compa1_te, compa2_te, semclass1_te, semclass2_te, semclass3_te, semclass4_te, semclass5_te) test_preds = extract_preds(rel_te, test_pred, relname) test_errors = test_pred != y_te err_ind = [j for j, x in enumerate(test_errors) if x == 1] test_cm = su.confMat(y_te, test_pred, hidden_units[1]) print('\n'.join([ ''.join(['{:10}'.format(int(item)) for item in row]) for row in test_cm ])) (pres, recs, f1s, mipre, mirec, mif) = su.cmPRF(test_cm, ncstart=1) mipre_de = dev_mipre mirec_de = dev_mirec mif_de = dev_mif print('mipre %s, mirec %s, mif %s' % (mipre, mirec, mif)) cPickle.dump([y_te, test_pred], open(fnres, "wb")) return (mipre, mirec, mif, mipre_de, mirec_de, mif_de, test_cm, test_preds)
def get_output_for(self, inputs, deterministic=False, **kwargs): # extract inputs H1, H2 = inputs # train set size m = H1.shape[0].astype(theano.config.floatX) # running average projection matrix update if not deterministic: # compute batch mean mean1 = T.mean(H1, axis=0) mean2 = T.mean(H2, axis=0) # running average updates of means mean1 = (floatX(1.0 - self.alpha) * self.mean1 + self.alpha * mean1) running_mean1 = theano.clone(self.mean1, share_inputs=False) running_mean1.default_update = mean1 mean1 += 0 * running_mean1 mean2 = (floatX(1.0 - self.alpha) * self.mean2 + self.alpha * mean2) running_mean2 = theano.clone(self.mean2, share_inputs=False) running_mean2.default_update = mean2 mean2 += 0 * running_mean2 # hidden representations H1bar = H1 - mean1 H2bar = H2 - mean2 # transpose to formulas in paper H1bar = H1bar.T H2bar = H2bar.T # cross-covariance S12 = (1.0 / (m - 1)) * T.dot(H1bar, H2bar.T) # covariance 1 S11 = (1.0 / (m - 1)) * T.dot(H1bar, H1bar.T) S11 = S11 + self.r1 * T.identity_like(S11) # covariance 2 S22 = (1.0 / (m - 1)) * T.dot(H2bar, H2bar.T) S22 = S22 + self.r2 * T.identity_like(S22) # running average updates of statistics S12 = (floatX(1.0 - self.alpha) * self.S12 + self.alpha * S12) running_S12 = theano.clone(self.S12, share_inputs=False) running_S12.default_update = S12 S12 += 0 * running_S12 S11 = (floatX(1.0 - self.alpha) * self.S11 + self.alpha * S11) running_S11 = theano.clone(self.S11, share_inputs=False) running_S11.default_update = S11 S11 += 0 * running_S11 S22 = (floatX(1.0 - self.alpha) * self.S22 + self.alpha * S22) running_S22 = theano.clone(self.S22, share_inputs=False) running_S22.default_update = S22 S22 += 0 * running_S22 S21 = S12.T # theano optimized version of paper S11c = T.slinalg.cholesky(S11) S11ci = T.nlinalg.matrix_inverse(S11c) S11_inv = T.nlinalg.matrix_inverse(S11) S22c = T.slinalg.cholesky(S22) S22ci = T.nlinalg.matrix_inverse(S22c) S22_inv = T.nlinalg.matrix_inverse(S22) # compute correlation (regularized) M1 = S11ci.dot(S12).dot(S22_inv).dot(S21).dot(S11ci.T) M2 = S22ci.dot(S21).dot(S11_inv).dot(S12).dot(S22ci.T) M1 += self.rT * T.identity_like(M1) M2 += self.rT * T.identity_like(M2) # compute eigen decomposition E1, E = T.nlinalg.eigh(M1) _, F = T.nlinalg.eigh(M2) # maximize correlation E1 = T.clip(E1, 1e-7, 1.0) E1 = T.sqrt(E1) self.loss = -T.mean(E1) * self.wl self.corr = E1 # compute projection matrices U = S11ci.T.dot(E) V_prime = S22ci.T.dot(F) # project data lv1_cca = H1bar.T.dot(U) lv2_cca = H2bar.T.dot(V_prime) # workaround to flip axis of projection vector def compute_corr(d, lv1_cca, lv2_cca): CX = lv1_cca[:, d].T.dot(lv2_cca[:, d]) C1 = lv1_cca[:, d].T.dot(lv1_cca[:, d]) C2 = lv2_cca[:, d].T.dot(lv2_cca[:, d]) c = CX / (T.sqrt(C1) * T.sqrt(C2)) return T.sgn(c) dims = T.arange(0, lv1_cca.shape[1]) corrs, _ = theano.scan(fn=compute_corr, outputs_info=None, sequences=[dims], non_sequences=[lv1_cca, lv2_cca]) # fix projection matrix and reproject data V = V_prime * corrs # some casting is required here U = T.cast(U, 'float32') V = T.cast(V, 'float32') # update of projection matrices running_U = theano.clone(self.U, share_inputs=False) running_U.default_update = U U += floatX(0) * running_U running_V = theano.clone(self.V, share_inputs=False) running_V.default_update = V V += floatX(0) * running_V # use projections of layer else: # hidden representations H1bar = H1 - self.mean1 H2bar = H2 - self.mean2 # transpose to formulas in paper H1bar = H1bar.T H2bar = H2bar.T U, V = self.U, self.V # re-project data lv1_cca = 
H1bar.T.dot(U) lv2_cca_fixed = H2bar.T.dot(V) output = T.horizontal_stack(lv1_cca, lv2_cca_fixed) return output
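# Added illustration (my own, not part of the layer above): the running
# mean/covariance bookkeeping uses the Lasagne batch-norm trick of attaching a
# default_update to a clone of a shared variable and keeping that clone in the
# graph via "+ 0 *". A stripped-down sketch of just that mechanism:
import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX
alpha = 0.1

batch = T.matrix('batch')
mean_shared = theano.shared(np.zeros(4, dtype=floatX), name='running_mean')

batch_mean = T.mean(batch, axis=0)
blended = (np.asarray(1.0 - alpha, dtype=floatX) * mean_shared +
           np.asarray(alpha, dtype=floatX) * batch_mean)

running = theano.clone(mean_shared, share_inputs=False)
running.default_update = blended   # applied whenever the clone appears in a compiled graph
blended = blended + 0 * running    # keeps the clone in the graph without changing the value

centered = batch - blended

f = theano.function([batch], centered)
f(np.random.randn(8, 4).astype(floatX))
print(mean_shared.get_value())     # should have moved toward the first batch mean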
def __call__(self, v):
    return T.horizontal_stack(*self.psamples[1:])
def __init__(self, rng=None, x_in=None, \ p_s0_obs_given_z_obs=None, p_hi_given_si=None, p_sip1_given_si_hi=None, \ p_x_given_si_hi=None, q_z_given_x=None, q_hi_given_x_si=None, \ obs_dim=None, z_rnn_dim=None, z_obs_dim=None, h_dim=None, \ model_init_obs=True, model_init_rnn=True, ir_steps=2, \ params=None): # setup a rng for this GIPair self.rng = RandStream(rng.randint(100000)) # TODO: implement functionality for working with "latent" si assert (p_x_given_si_hi is None) # decide whether to initialize from a model or from a "constant" self.model_init_obs = model_init_obs self.model_init_rnn = model_init_rnn # grab the user-provided parameters self.params = params self.x_type = self.params['x_type'] assert ((self.x_type == 'bernoulli') or (self.x_type == 'gaussian')) if 'obs_transform' in self.params: assert((self.params['obs_transform'] == 'sigmoid') or \ (self.params['obs_transform'] == 'none')) if self.params['obs_transform'] == 'sigmoid': self.obs_transform = lambda x: T.nnet.sigmoid(x) else: self.obs_transform = lambda x: x else: self.obs_transform = lambda x: T.nnet.sigmoid(x) if self.x_type == 'bernoulli': self.obs_transform = lambda x: T.nnet.sigmoid(x) # record the dimensions of various spaces relevant to this model self.obs_dim = obs_dim self.rnn_dim = z_rnn_dim self.z_dim = z_rnn_dim + z_obs_dim self.z_rnn_dim = z_rnn_dim self.z_obs_dim = z_obs_dim self.h_dim = h_dim self.ir_steps = ir_steps # record the symbolic variables that will provide inputs to the # computation graph created to describe this MultiStageModel self.x = x_in self.batch_reps = T.lscalar() # setup switching variable for changing between sampling/training zero_ary = np.zeros((1, )).astype(theano.config.floatX) self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch') self.set_train_switch(1.0) # setup a weight for pulling priors over hi given si towards a # shared global prior -- e.g. zero mean and unit variance. self.kzg_weight = theano.shared(value=zero_ary, name='msm_kzg_weight') self.set_kzg_weight(0.1) # this weight balances l1 vs. l2 penalty on posterior KLds self.l1l2_weight = theano.shared(value=zero_ary, name='msm_l1l2_weight') self.set_l1l2_weight(1.0) ############################# # Setup self.z and self.s0. # ############################# print("Building MSM step 0...") obs_scale = 0.0 rnn_scale = 0.0 if self.model_init_obs: # initialize obs state from generative model obs_scale = 1.0 if self.model_init_rnn: # initialize rnn state from generative model rnn_scale = 1.0 self.q_z_given_x = q_z_given_x.shared_param_clone(rng=rng, Xd=self.x) self.z = self.q_z_given_x.output self.z_rnn = self.z[:, :self.z_rnn_dim] self.z_obs = self.z[:, self.z_rnn_dim:] self.p_s0_obs_given_z_obs = p_s0_obs_given_z_obs.shared_param_clone( \ rng=rng, Xd=self.z_obs) _s0_obs_model = self.p_s0_obs_given_z_obs.output_mean _s0_obs_const = self.p_s0_obs_given_z_obs.mu_layers[-1].b self.s0_obs = (obs_scale * _s0_obs_model) + \ ((1.0 - obs_scale) * _s0_obs_const) _s0_rnn_model = self.z_rnn _s0_rnn_const = self.q_z_given_x.mu_layers[-1].b[:self.z_rnn_dim] self.s0_rnn = (rnn_scale * _s0_rnn_model) + \ ((1.0 - rnn_scale) * _s0_rnn_const) self.s0_jnt = T.horizontal_stack(self.s0_obs, self.s0_rnn) self.output_logvar = self.p_s0_obs_given_z_obs.sigma_layers[-1].b self.bounded_logvar = 8.0 * T.tanh((1.0 / 8.0) * self.output_logvar) ############################################################### # Setup the iterative refinement loop, starting from self.s0. 
# ############################################################### self.p_hi_given_si = [] # holds p_hi_given_si for each i self.p_sip1_given_si_hi = [] # holds p_sip1_given_si_hi for each i self.q_hi_given_x_si = [] # holds q_hi_given_x_si for each i self.si = [self.s0_jnt] # holds si for each i self.hi = [] # holds hi for each i for i in range(self.ir_steps): print("Building MSM step {0:d}...".format(i + 1)) _si = self.si[i] si_obs = _si[:, :self.obs_dim] si_rnn = _si[:, self.obs_dim:] # get samples of next hi, conditioned on current si self.p_hi_given_si.append( \ p_hi_given_si.shared_param_clone(rng=rng, \ Xd=T.horizontal_stack( \ self.obs_transform(si_obs), si_rnn))) hi_p = self.p_hi_given_si[i].output # now we build the model for variational hi given si grad_ll = self.x - self.obs_transform(si_obs) self.q_hi_given_x_si.append(\ q_hi_given_x_si.shared_param_clone(rng=rng, \ Xd=T.horizontal_stack( \ grad_ll, self.obs_transform(si_obs), si_rnn))) hi_q = self.q_hi_given_x_si[i].output # make hi samples that can be switched between hi_p and hi_q self.hi.append( ((self.train_switch[0] * hi_q) + \ ((1.0 - self.train_switch[0]) * hi_p)) ) # p_sip1_given_si_hi is conditioned on hi and the "rnn" part of si. self.p_sip1_given_si_hi.append( \ p_sip1_given_si_hi.shared_param_clone(rng=rng, \ Xd=T.horizontal_stack(self.hi[i], si_rnn))) # construct the update from si_obs/si_rnn to sip1_obs/sip1_rnn sip1_obs = si_obs + self.p_sip1_given_si_hi[i].output_mean sip1_rnn = si_rnn sip1_jnt = T.horizontal_stack(sip1_obs, sip1_rnn) # record the updated state of the generative process self.si.append(sip1_jnt) # check that input/output dimensions of our models agree self._check_model_shapes() ###################################################################### # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE # ###################################################################### # shared var learning rate for generator and inferencer zero_ary = np.zeros((1, )).astype(theano.config.floatX) self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1') self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2') # shared var momentum parameters for generator and inferencer self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1') self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2') self.it_count = theano.shared(value=zero_ary, name='msm_it_count') # init parameters for controlling learning dynamics self.set_sgd_params() # init shared var for weighting nll of data given posterior sample self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll') self.set_lam_nll(lam_nll=1.0) # init shared var for weighting prior kld against reconstruction self.lam_kld_1 = theano.shared(value=zero_ary, name='msm_lam_kld_1') self.lam_kld_2 = theano.shared(value=zero_ary, name='msm_lam_kld_2') self.set_lam_kld(lam_kld_1=1.0, lam_kld_2=1.0) # init shared var for controlling l2 regularization on params self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w') self.set_lam_l2w(1e-5) # Grab all of the "optimizable" parameters in "group 1" self.group_1_params = [] self.group_1_params.extend(self.q_z_given_x.mlp_params) self.group_1_params.extend(self.p_s0_obs_given_z_obs.mlp_params) # Grab all of the "optimizable" parameters in "group 2" self.group_2_params = [] for i in range(self.ir_steps): self.group_2_params.extend(self.q_hi_given_x_si[i].mlp_params) self.group_2_params.extend(self.p_hi_given_si[i].mlp_params) self.group_2_params.extend(self.p_sip1_given_si_hi[i].mlp_params) # Make a joint list 
of parameters group 1/2 self.joint_params = self.group_1_params + self.group_2_params ################################# # CONSTRUCT THE KLD-BASED COSTS # ################################# self.kld_z, self.kld_hi_cond, self.kld_hi_glob = \ self._construct_kld_costs() self.kld_cost = (self.lam_kld_1[0] * T.mean(self.kld_z)) + \ (self.lam_kld_2[0] * (T.mean(self.kld_hi_cond) + \ (self.kzg_weight[0] * T.mean(self.kld_hi_glob)))) ################################# # CONSTRUCT THE NLL-BASED COSTS # ################################# self.nll_costs = self._construct_nll_costs() self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs) ######################################## # CONSTRUCT THE REST OF THE JOINT COST # ######################################## param_reg_cost = self._construct_reg_costs() self.reg_cost = self.lam_l2w[0] * param_reg_cost self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost # Get the gradient of the joint cost for all optimizable parameters self.joint_grads = OrderedDict() for p in self.joint_params: self.joint_grads[p] = T.grad(self.joint_cost, p) # Construct the updates for the generator and inferencer networks self.group_1_updates = get_param_updates(params=self.group_1_params, \ grads=self.joint_grads, alpha=self.lr_1, \ beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \ mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0) self.group_2_updates = get_param_updates(params=self.group_2_params, \ grads=self.joint_grads, alpha=self.lr_2, \ beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \ mom2_init=1e-3, smoothing=1e-8, max_grad_norm=10.0) self.joint_updates = OrderedDict() for k in self.group_1_updates: self.joint_updates[k] = self.group_1_updates[k] for k in self.group_2_updates: self.joint_updates[k] = self.group_2_updates[k] # Construct a function for jointly training the generator/inferencer print("Compiling training function...") self.train_joint = self._construct_train_joint() self.compute_post_klds = self._construct_compute_post_klds() self.compute_fe_terms = self._construct_compute_fe_terms() self.sample_from_prior = self._construct_sample_from_prior() # make easy access points for some interesting parameters self.inf_1_weights = self.q_z_given_x.shared_layers[0].W self.gen_1_weights = self.p_s0_obs_given_z_obs.mu_layers[-1].W self.inf_2_weights = self.q_hi_given_x_si[0].shared_layers[0].W self.gen_2_weights = self.p_sip1_given_si_hi[0].mu_layers[-1].W self.gen_inf_weights = self.p_hi_given_si[0].shared_layers[0].W return
def model_mimlcnn(datasets, Wordv, PF1v, PF2v, img_h, WForATData, linearW): """ 模型建模. :param datasets: 放进来的数据集. :param Wordv: "Word/Token - Embedding" 矩阵. :param PF1v: "PositionFeature1 - Embedding" 矩阵 :param PF2v: "PositionFeature2 - Embedding" 矩阵 :param img_h: Padding之后的句子长度. :param RForAT: 关系 - embedding矩阵 :param WForAT: attention算每个句子权值时需要用到的矩阵 :return: 建模之后的所有模型 """ # 1. 确定超参 logging.info('-------------- Model Settings --------------------') # if word embedding is initialized with 'word2vec', then 'length' is set by the dimension of the word vector automatically; # if initialized with 'rand', then the specified value of 'length' is used. # is_static = False w2v_static = conf.getboolean('word_vector', 'is_static') # pfv_length = 5 wordv_length = 50 img_W = 50 + 5 * 2 = 60 image_shape = (None, 1, img_h, _IMG_W) cp1_filter_shapes = [] cp1_pool_sizes = [] cp2_filter_shape = None cp2_pool_size = None assert len(_CP1_FILTER_HS) == 1 # use_stacked_cp = False if not _USE_STACK_CP: # _CP1_FILTER_HS = [3] for filter_h in _CP1_FILTER_HS: # cp1_n_filters = 230 # _IMG_W = _LEN_WORDV + 2 * _LEN_PFV _CP1_FILTER_W = _IMG_W # 230,1,3,60 # 每次抽取3个句子的特征 cp1_filter_shapes.append( (_CP1_N_FILTERS, 1, filter_h, _CP1_FILTER_W)) # filter完后的行数,86,1 cp1_pool_sizes.append( (img_h - filter_h + 1, _IMG_W - _CP1_FILTER_W + 1)) else: cp1_filter_shapes.append( (_CP1_N_FILTERS, 1, _CP1_FILTER_HS[0], _CP1_FILTER_W)) cp1_pool_sizes.append(_CP1_POOL_SIZE_4SCP) cp2_filter_shape = [_CP2_N_FILTERS, _CP1_N_FILTERS, _CP2_FILTER_H, 1] cp1_fm_img_h = image_shape[2] - _CP1_FILTER_HS[0] + 1 cp2_img_h = int(np.ceil(cp1_fm_img_h / float(cp1_pool_sizes[0][0]))) cp2_pool_size = [cp2_img_h - _CP2_FILTER_H + 1, 1] logging.info('| - image_shape: {0}'.format(image_shape)) logging.info('| - cp1_filter_shapes: {0}'.format(cp1_filter_shapes)) logging.info('| - cp1_non_linear: {0}'.format(_CP1_NON_LINEAR)) logging.info('| - cp1_pool_sizes: {0}'.format(cp1_pool_sizes)) if _USE_STACK_CP: logging.info('| - cp2_filter_shape: {0}'.format(cp2_filter_shape)) logging.info('| - cp2_non_linear: {0}'.format(_CP2_NON_LINEAR)) logging.info('| - cp2_pool_sizes: {0}'.format(cp2_pool_size)) logging.info('| - initial mlp_shape: {0}'.format(_MLP_SHAPE)) logging.info('| - dropout_rates: {0}'.format(_DROPOUT_RATES)) logging.info('| - batch_size: {0}'.format(_BATCH_SIZE)) logging.info('| - word_embedding_length: {0}'.format(_LEN_WORDV)) logging.info('| - word_embedding_initialization: {0}'.format( conf.get('word_vector', 'initialization'))) logging.info('| - word_embedding_static: {0}'.format(w2v_static)) logging.info('| - shuffle_batch: {0}'.format(_SHUFFLE_BATCH)) logging.info('| - lr_decay: {0}'.format(_LR_DECAY)) logging.info('| - sqr_norm_lim: {0}'.format(_SQR_NORM_LIM)) logging.info('| - learning_rate: {0}'.format(_LR)) logging.info('| - cost_type: {0}'.format(conf.get('mode', 'cost_type'))) logging.info('| - pr_margin: {0}'.format( conf.getfloat('mode', 'pr_margin'))) logging.info('| - score_margin: {0}'.format( conf.getfloat('mode', 'score_margin'))) logging.info('| - prediction_type: larger than 0.5 for each label') logging.info('--------------------------------------------------') # 2. 
计算模型的输入 logging.info(' - Defining model variables for one mini-batch') bch_idx = T.scalar('batch_idx', dtype='int32') # bch_idx.tag.test_value = 1 xs = T.matrix('xs', dtype='int32') # 3个句子 # 不要被test_value欺骗,xs是句子数×88维的 xs.tag.test_value = np.asarray( [[0, 0, 0, 0, 3, 2, 3, 7, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 2, 4, 1, 8, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 3, 1, 6, 5, 3, 0, 0, 0, 0, 0, 0]], dtype='int32') pfinfos = T.matrix('pfinfos', dtype='int32') pfinfos.tag.test_value = np.array([[4, 2, 1], [5, 1, 3], [5, 0, 1]]) # p0_in_PFv = 52 p0_in_PFv = conf.getint("settings", "p0_in_PFv") # padding位置信息 def cal_padded_sentpf(pfinfo_m): slen = pfinfo_m[0] e1i = pfinfo_m[1] e2i = pfinfo_m[2] pf1 = T.arange(p0_in_PFv - e1i, p0_in_PFv + (slen - e1i), dtype='int32') pf2 = T.arange(p0_in_PFv - e2i, p0_in_PFv + (slen - e2i), dtype='int32') # 调整到最小1,最大101,长度与句子长度相同 clipped_pf1 = T.clip(pf1, 1, 101) clipped_pf2 = T.clip(pf2, 1, 101) # _N_PAD_HEAD = 4 pad_head = T.zeros(shape=(_N_PAD_HEAD, ), dtype='int32') pad_tail = T.zeros(shape=(img_h - _N_PAD_HEAD - slen, ), dtype='int32') # 把两端列表拼成一段列表 pf1_padded = T.concatenate([pad_head, clipped_pf1, pad_tail]) # 三部分相加=pad后长度. pf2_padded = T.concatenate([pad_head, clipped_pf2, pad_tail]) return pf1_padded, pf2_padded # 句子长度,实体1实体2位置 三种信息的padding,padding后向量长度为88,头和尾里面数字是0,实体1和实体2相对位置部分的数字最小是1最大是101 # 返回的是[实体1的位置padding],[实体2的位置padding],都是88维,前4维都为空,中间长度就是句子长度,剩下的也是0 (pf1s, pf2s), _ = theano.scan(fn=cal_padded_sentpf, sequences=[pfinfos]) e1is = pfinfos[:, 1] e2is = pfinfos[:, 2] # 每次传进来的连续x_m片段都是index从0开始的, 而ep2m依旧是按照数据集中所有的x_m来定位的. 所以在这里让ep2m中的所有元素减一个初始值, 让xs的index和ep2m的每个起始位置对应上. ep2m_raw = T.matrix('ep2m_raw', dtype='int32') ep2m_raw.tag.test_value = np.asarray([[25, 27], [27, 28]], dtype='int32') ep2m = ep2m_raw - ep2m_raw[0][0] ep2m.tag.test_value = np.asarray([[0, 2], [2, 3]], dtype='int32') ys = T.matrix('ys', dtype='int32') # _N_RELATIONS = 26 yl1 = [0] * _N_RELATIONS yl1[2] = 1 yl2 = [0] * _N_RELATIONS yl2[5] = 1 ys.tag.test_value = np.asarray([yl1, yl2], dtype='int32') # 3. 定义模型结构, 定义输出, 定义损失 assert pf1s.dtype == 'int32' and pf2s.dtype == 'int32' and xs.dtype == 'int32' _use_my_input = True if _use_my_input: # 1. 我的拼接方法 # 看到这终于明白了,Wordv是用word2Vec初始好的词向量 # 以维度为1连接传入数据,一个句子本来是向量,现在转成了矩阵 # Wordv[xs.flatten()] = [50维的词向量] # pf1s = fltn_vec_stk = T.horizontal_stack(Wordv[xs.flatten()], PF1v[pf1s.flatten()], PF2v[pf2s.flatten()]) # 句子数×1×88×60 cp_layer_input = fltn_vec_stk.reshape( (xs.shape[0], 1, xs.shape[1], _IMG_W)) else: # 2. 
Zeng的拼接方法 input_words = Wordv[xs.flatten()].reshape( (xs.shape[0], 1, xs.shape[1], _LEN_WORDV)) input_pf1s = PF1v[pf1s.flatten()].reshape( (pf1s.shape[0], 1, pf1s.shape[1], _LEN_PFV)) input_pf2s = PF2v[pf2s.flatten()].reshape( (pf2s.shape[0], 1, pf2s.shape[1], _LEN_PFV)) cp_layer_input = T.concatenate([input_words, input_pf1s, input_pf2s], axis=3) logging.info(' - Defining and assembling CP layer') cp_params = [] # 句子数×1×88×60 # cp_layer_input = # 从这里出一个60维的向量,放在卷积层后面 # input = 1×88×60 def atData(input, left, right): sentence = input[0] min = T.switch(T.lt(left, right), left, right) max = T.switch(T.lt(left, right), right, left) sentenceHead = sentence[:(min + _N_PAD_HEAD)] sentenceMiddle = sentence[(min + _N_PAD_HEAD + 1):(max + _N_PAD_HEAD)] sentenceTail = sentence[(max + _N_PAD_HEAD + 1):] # 去掉了两个entityPair # 86×60 newSentence = T.vertical_stack(sentenceHead, sentenceMiddle, sentenceTail) leftEntity = sentence[min + _N_PAD_HEAD] rightEntity = sentence[max + _N_PAD_HEAD] LRConnect = T.concatenate([leftEntity, rightEntity]) def AtLayerData(LRConnect, newSentenceCon): def forEveryWord(word): temp = T.concatenate([word, LRConnect]) # return T.concatenate(temp, rightEntity) return temp # 将两个entitypair加在了每个句子的后面 # 86×180 sentenceAfAdd, _ = theano.scan(forEveryWord, sequences=newSentenceCon) eForWord = T.dot(sentenceAfAdd, WForATData) aForWord = T.nnet.softmax(eForWord)[0] def mulWeight(word, weight): return word * weight # 86×60 newSRep, _ = theano.scan(mulWeight, sequences=[newSentence, aForWord]) # 1×60 finalSRep = T.sum(newSRep, axis=0) return T.dot(finalSRep, linearW) finalSRep, _ = theano.scan(AtLayerData, outputs_info=LRConnect, non_sequences=newSentence, n_steps=NUMBER_DATA) return finalSRep[-1] myobser1, _ = theano.scan(atData, sequences=[cp_layer_input, e1is, e2is]) # No CNN # cp_out = 句子数×690 myobser1 = 句子数×120 # new_cp_out = 句子数×120 new_cp_out = myobser1 # **************** # *****源代码****** # ***************** # # def ep_max_pooling(ep_mr, csmp_input): # 取出来的照样是句子数×120的矩阵 input_41ep = csmp_input[ep_mr[0]:ep_mr[1]] # Cross-sentence Max-pooling max_pooling_out = T.max(input_41ep, axis=0) # 返回的就是 Entity-pair Representation return max_pooling_out logging.info(' - Aassembling second Max-Pooling layer') # Entity-pair Representation的列表 # 例子数×(690+120) sec_maxPooling_out, _ = theano.scan(fn=ep_max_pooling, sequences=ep2m, non_sequences=new_cp_out) logging.info(' - Defining MLP layer') if not _USE_STACK_CP and _USE_PIECEWISE_POOLING_41CP: _MLP_SHAPE[0] = 2 * _IMG_W logging.info( " - MLP shape changes to {0}, because of piecewise max-pooling". format(_MLP_SHAPE)) mlp_layer = MLPDropout(rng, layer_sizes=_MLP_SHAPE, activations=_MLP_ACTIVATIONS, dropout_rates=_DROPOUT_RATES) mlp_layer.feed(sec_maxPooling_out, input_shape=(_BATCH_SIZE, _MLP_SHAPE[0])) dropout_score_batch = mlp_layer.dropout_layers[-1].score score_batch = mlp_layer.layers[-1].score dropout_p_ygx_batch = T.nnet.sigmoid(dropout_score_batch) p_ygx_batch = T.nnet.sigmoid(score_batch) obz_lr_masks = mlp_layer.lrmask predictions = predict_relations(p_ygx_batch) pred_pscores = p_ygx_batch # ************************** # *****加入attention机制****** # ************************** # 针对1个关系 # def forEveryExample(ep_mr, csmp_input): # # 取出来的照样是句子数×690的矩阵 # # 这些句子是对应一个实体对的 # # input_41ep = csmp_input[ep_mr[0]: ep_mr[1]] # # def forEverySentence(item): # temp = T.dot(item, WForAT) # # ???? 
change this # re = T.dot(temp, RForAT[0]) # return re # # slist, noup = theano.scan(forEverySentence, sequences=input_41ep) # # aForRj = T.nnet.softmax(slist)[0] # # def mulWeight(sentence, weight): # return sentence * weight # # newSRep, noup = theano.scan(mulWeight, sequences=[input_41ep, aForRj]) # # finalresult = T.sum(newSRep, axis=0) # # # return finalresult # return finalresult # # # # Entity-pair Representation的列表 # my_sec_add_out, _ = theano.scan(fn=forEveryExample, sequences=ep2m, non_sequences=cp_out) # # logging.info(' - Defining MLP layer') # # _USE_STACK_CP = False # # _USE_PIECEWISE_POOLING_41CP = True # if not _USE_STACK_CP and _USE_PIECEWISE_POOLING_41CP: # # _MLP_SHAPE = [230, 26] # _MLP_SHAPE[0] *= 3 # # _MLP_SHAPE = [690, 26] # logging.info(" - MLP shape changes to {0}, because of piecewise max-pooling".format(_MLP_SHAPE)) # # # _MLP_SHAPE = [690,26] _MLP_ACTIVATIONS = [Iden] dropout_rates = [0.5] # mlp_layer = MLPDropout(rng, layer_sizes=_MLP_SHAPE, activations=_MLP_ACTIVATIONS, dropout_rates=_DROPOUT_RATES) # # input_shape = (batch_size = 50,_MLP_SHAPE[0] = 690) # mlp_layer.feed(my_sec_add_out, input_shape=(_BATCH_SIZE, _MLP_SHAPE[0])) # # # 针对26个关系 # # logging.info(' - Defining MLP layer') # # assert _USE_STACK_CP == False # assert _USE_PIECEWISE_POOLING_41CP == True # # if not _USE_STACK_CP and _USE_PIECEWISE_POOLING_41CP: # # _MLP_SHAPE = [230, 26] # _MLP_SHAPE[0] *= 3 # # _MLP_SHAPE = [690, 26] # logging.info(" - MLP shape changes to {0}, because of piecewise max-pooling".format(_MLP_SHAPE)) # # # _MLP_SHAPE = [690,26] _MLP_ACTIVATIONS = [Iden] dropout_rates = [0.5] # # # my_mlp_layer = MyMLPDropout(rng, layer_sizes=[[_MLP_SHAPE[0] + _IMG_W * 2, 2]], activations=_MLP_ACTIVATIONS, # dropout_rates=_DROPOUT_RATES) # # def forEveryRelation(idx, ep2m, cp_out): # def forEveryExample(ep_mr, csmp_input): # # 取出来的照样是句子数×(690+120)的矩阵 # # 这些句子是对应一个实体对的 # # input_41ep = csmp_input[ep_mr[0]: ep_mr[1]] # # def attentionLayer(R, input_41ep_out): # def forEverySentence(item): # temp = T.dot(item, WForAT) # # ???? 
change this # re = T.dot(temp, R) # return re # # # slist就是ei # slist, noup = theano.scan(forEverySentence, sequences=input_41ep_out) # # aForRj = T.nnet.softmax(slist)[0] # # def mulWeight(sentence, weight): # return sentence * weight # # # 句子数×(690+120) # newSRep, noup = theano.scan(mulWeight, sequences=[input_41ep_out, aForRj]) # # # 1×(690+120) # finalresult = T.sum(newSRep, axis=0) # # return finalresult # # # AT层数×1×(690+120) # newSRepAf, _ = theano.scan(attentionLayer, outputs_info=RForAT[idx], # non_sequences=input_41ep, n_steps=NUMBER) # # # finalresult = T.sum(newSRepAf[-1], axis=0) # # # return finalresult # # 一次做完吧 # # return newSRepAf[-1] # # # (50, (690+120)) # my_sec_add_out, _ = theano.scan(fn=forEveryExample, sequences=ep2m, non_sequences=[cp_out]) # # return my_sec_add_out # # idx = T.ivector() # # ok = (26,50,(690+120)) # ok, up = theano.scan(forEveryRelation, sequences=[idx], # non_sequences=[ep2m, new_cp_out]) # # # (26, 50, (690 + 120)) # normalre, dropoutre = my_mlp_layer.feed(idx, ok, # input_shape=(_N_RELATIONS, _BATCH_SIZE, (_MLP_SHAPE[0] + 2 * _IMG_W))) logging.info(' - Cost, params and grads ...') # 用这个更新权重 # 计算损失的时候没有用到前面的score,score就是乘出来的26维的向量 # 第二个参数是把score用sigmoid做归一化得出来的结果 dropout_cost = compute_cost(dropout_p_ygx_batch, ys) cost = compute_cost(p_ygx_batch, ys) op_params = [] params = [] op_params += [Wordv] # is_static = False if not w2v_static: # if word vectors are allowed to change, add them as model hyper_parameters params += [Wordv] op_params += [PF1v, PF2v] params += [PF1v, PF2v] op_params += cp_params params += cp_params op_params += mlp_layer.params params += mlp_layer.params op_params += [WForATData, linearW] params += [WForATData, linearW] # op_params += [WForAT, RForAT] # params += [WForAT, RForAT] logging.info('Params to update: ' + str(', '.join([param.name for param in params]))) logging.info('Params to output: ' + str(', '.join([op_param.name for op_param in op_params]))) # 5. 权重更新方式. grad_updates = lasagne.updates.adadelta(dropout_cost, params) # 6. 
定义theano_function train_x_m, train_PFinfo_m, train_ep2m, train_y, test_x_m, test_PFinfo_m, test_ep2m, test_y = datasets logging.info('Compiling train_update_model...') output_list = [cost, dropout_cost] if conf.getboolean('mode', 'output_details'): output_list += ([obz_lr_masks, p_ygx_batch] + op_params) train_update_model = theano.function( [bch_idx], output_list, updates=grad_updates, name='train_update_model', givens={ xs: get_1batch_x_m(bch_idx, train_x_m, train_ep2m), pfinfos: get_1batch_x_m(bch_idx, train_PFinfo_m, train_ep2m), ep2m_raw: train_ep2m[bch_idx * _BATCH_SIZE:(bch_idx + 1) * _BATCH_SIZE], ys: train_y[bch_idx * _BATCH_SIZE:(bch_idx + 1) * _BATCH_SIZE], }, ) # }, on_unused_input='warn') logging.info('Compiling set_zero function ...') Wordv_0sline = T.vector("Wordv_0sline", dtype=theano.config.floatX) PFv_0sline = T.vector("PFv_0sline", dtype=theano.config.floatX) set_zero = theano.function( [Wordv_0sline, PFv_0sline], updates=[(Wordv, T.set_subtensor(Wordv[0, :], Wordv_0sline)), (PF1v, T.set_subtensor(PF1v[0, :], PFv_0sline)), (PF2v, T.set_subtensor(PF2v[0, :], PFv_0sline))]) logging.info('Compiling trainset_error_model ...') trainset_error_model = theano.function( [bch_idx], [predictions, pred_pscores], givens={ xs: get_1batch_x_m(bch_idx, train_x_m, train_ep2m), pfinfos: get_1batch_x_m(bch_idx, train_PFinfo_m, train_ep2m), ep2m_raw: train_ep2m[bch_idx * _BATCH_SIZE:(bch_idx + 1) * _BATCH_SIZE], }) logging.info('Compiling testset_error_model ...') testset_error_model = theano.function( [bch_idx], [predictions, pred_pscores], givens={ xs: get_1batch_x_m(bch_idx, test_x_m, test_ep2m), pfinfos: get_1batch_x_m(bch_idx, test_PFinfo_m, test_ep2m), ep2m_raw: test_ep2m[bch_idx * _BATCH_SIZE:(bch_idx + 1) * _BATCH_SIZE], }) init_LR_W = mlp_layer.dropout_layers[-1].W.get_value() init_LR_b = mlp_layer.dropout_layers[-1].b.get_value() return train_update_model, trainset_error_model, testset_error_model, set_zero, init_LR_W, init_LR_b
import theano
import theano.tensor as T
import numpy as np

a = T.imatrix()
b = T.imatrix()
ok = T.horizontal_stack(a, b)
myfunc = theano.function([a, b], ok)

a_init = np.reshape(np.arange(10, dtype='int32'), (2, 5))
b_init = np.reshape(np.arange(10, 20, dtype='int32'), (2, 5))
ok = myfunc(a_init, b_init)
print ok
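# Added note (not from the original example): for 2-D inputs, T.horizontal_stack
# is simply concatenation along axis=1, and T.vertical_stack concatenates along
# axis=0. A self-contained check:
import numpy as np
import theano
import theano.tensor as T

a = T.imatrix()
b = T.imatrix()
hcat = theano.function([a, b], T.concatenate([a, b], axis=1))
vcat = theano.function([a, b], T.vertical_stack(a, b))

a_init = np.reshape(np.arange(10, dtype='int32'), (2, 5))
b_init = np.reshape(np.arange(10, 20, dtype='int32'), (2, 5))
print(hcat(a_init, b_init).shape)  # (2, 10)
print(vcat(a_init, b_init).shape)  # (4, 5)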
def __init__(self, rng=None, x_in=None, \ p_s0_obs_given_z_obs=None, p_hi_given_si=None, p_sip1_given_si_hi=None, \ p_x_given_si_hi=None, q_z_given_x=None, q_hi_given_x_si=None, \ obs_dim=None, z_dim=None, h_dim=None, \ model_init_obs=True, ir_steps=2, \ params=None): # setup a rng for this GIPair self.rng = RandStream(rng.randint(100000)) # TODO: implement functionality for working with "latent" si assert(p_x_given_si_hi is None) # decide whether to initialize from a model or from a "constant" self.model_init_obs = model_init_obs # grab the user-provided parameters self.params = params self.x_type = self.params['x_type'] assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian')) if 'obs_transform' in self.params: assert((self.params['obs_transform'] == 'sigmoid') or \ (self.params['obs_transform'] == 'none')) if self.params['obs_transform'] == 'sigmoid': self.obs_transform = lambda x: T.nnet.sigmoid(x) else: self.obs_transform = lambda x: x else: self.obs_transform = lambda x: T.nnet.sigmoid(x) if self.x_type == 'bernoulli': self.obs_transform = lambda x: T.nnet.sigmoid(x) # record the dimensions of various spaces relevant to this model self.obs_dim = obs_dim self.z_dim = z_dim self.h_dim = h_dim self.ir_steps = ir_steps # record the symbolic variables that will provide inputs to the # computation graph created to describe this MultiStageModel self.x = x_in self.batch_reps = T.lscalar() # setup switching variable for changing between sampling/training zero_ary = np.zeros((1,)).astype(theano.config.floatX) self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch') self.set_train_switch(1.0) # setup a weight for pulling priors over hi given si towards a # shared global prior -- e.g. zero mean and unit variance. self.kzg_weight = theano.shared(value=zero_ary, name='msm_kzg_weight') self.set_kzg_weight(0.1) # this weight balances l1 vs. l2 penalty on posterior KLds self.l1l2_weight = theano.shared(value=zero_ary, name='msm_l1l2_weight') self.set_l1l2_weight(1.0) # this parameter controls dropout rate in the generator read function self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate') self.set_drop_rate(0.0) ############################# # Setup self.z and self.s0. # ############################# print("Building MSM step 0...") obs_scale = 0.0 if self.model_init_obs: # initialize obs state from generative model obs_scale = 1.0 self.q_z_given_x = q_z_given_x.shared_param_clone(rng=rng, Xd=self.x) self.z = self.q_z_given_x.output self.p_s0_obs_given_z_obs = p_s0_obs_given_z_obs.shared_param_clone( \ rng=rng, Xd=self.z) _s0_obs_model = self.p_s0_obs_given_z_obs.output_mean _s0_obs_const = self.p_s0_obs_given_z_obs.mu_layers[-1].b self.s0_obs = (obs_scale * _s0_obs_model) + \ ((1.0 - obs_scale) * _s0_obs_const) self.output_logvar = self.p_s0_obs_given_z_obs.sigma_layers[-1].b self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.output_logvar) ############################################################### # Setup the iterative refinement loop, starting from self.s0. 
# ############################################################### self.p_hi_given_si = [] # holds p_hi_given_si for each i self.p_sip1_given_si_hi = [] # holds p_sip1_given_si_hi for each i self.q_hi_given_x_si = [] # holds q_hi_given_x_si for each i self.si = [self.s0_obs] # holds si for each i self.hi = [] # holds hi for each i for i in range(self.ir_steps): print("Building MSM step {0:d}...".format(i+1)) si_obs = self.si[i] # get samples of next hi, conditioned on current si self.p_hi_given_si.append( \ p_hi_given_si.shared_param_clone(rng=rng, \ Xd=self.obs_transform(si_obs))) hi_p = self.p_hi_given_si[i].output # now we build the model for variational hi given si grad_ll = self.x - self.obs_transform(si_obs) self.q_hi_given_x_si.append(\ q_hi_given_x_si.shared_param_clone(rng=rng, \ Xd=T.horizontal_stack( \ grad_ll, self.obs_transform(si_obs)))) hi_q = self.q_hi_given_x_si[i].output # make hi samples that can be switched between hi_p and hi_q self.hi.append( ((self.train_switch[0] * hi_q) + \ ((1.0 - self.train_switch[0]) * hi_p)) ) # p_sip1_given_si_hi is conditioned on hi. self.p_sip1_given_si_hi.append( \ p_sip1_given_si_hi.shared_param_clone(rng=rng, \ Xd=self.hi[i])) # construct the update from si_obs to sip1_obs sip1_obs = si_obs + self.p_sip1_given_si_hi[i].output_mean # record the updated state of the generative process self.si.append(sip1_obs) ###################################################################### # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE # ###################################################################### # shared var learning rate for generator and inferencer zero_ary = np.zeros((1,)).astype(theano.config.floatX) self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1') self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2') # shared var momentum parameters for generator and inferencer self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1') self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2') # init parameters for controlling learning dynamics self.set_sgd_params() # init shared var for weighting nll of data given posterior sample self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll') self.set_lam_nll(lam_nll=1.0) # init shared var for weighting prior kld against reconstruction self.lam_kld_1 = theano.shared(value=zero_ary, name='msm_lam_kld_1') self.lam_kld_2 = theano.shared(value=zero_ary, name='msm_lam_kld_2') self.set_lam_kld(lam_kld_1=1.0, lam_kld_2=1.0) # init shared var for controlling l2 regularization on params self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w') self.set_lam_l2w(1e-5) # Grab all of the "optimizable" parameters in "group 1" self.group_1_params = [] self.group_1_params.extend(self.q_z_given_x.mlp_params) self.group_1_params.extend(self.p_s0_obs_given_z_obs.mlp_params) # Grab all of the "optimizable" parameters in "group 2" self.group_2_params = [] for i in range(self.ir_steps): self.group_2_params.extend(self.q_hi_given_x_si[i].mlp_params) self.group_2_params.extend(self.p_hi_given_si[i].mlp_params) self.group_2_params.extend(self.p_sip1_given_si_hi[i].mlp_params) # Make a joint list of parameters group 1/2 self.joint_params = self.group_1_params + self.group_2_params ################################# # CONSTRUCT THE KLD-BASED COSTS # ################################# self.kld_z, self.kld_hi_cond, self.kld_hi_glob = \ self._construct_kld_costs() self.kld_cost = (self.lam_kld_1[0] * T.mean(self.kld_z)) + \ (self.lam_kld_2[0] * (T.mean(self.kld_hi_cond) + \ 
(self.kzg_weight[0] * T.mean(self.kld_hi_glob)))) ################################# # CONSTRUCT THE NLL-BASED COSTS # ################################# self.nll_costs = self._construct_nll_costs() self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs) ######################################## # CONSTRUCT THE REST OF THE JOINT COST # ######################################## param_reg_cost = self._construct_reg_costs() self.reg_cost = self.lam_l2w[0] * param_reg_cost self.joint_cost = self.nll_cost + self.kld_cost + self.reg_cost # Get the gradient of the joint cost for all optimizable parameters print("Computing gradients of self.joint_cost...") self.joint_grads = OrderedDict() grad_list = T.grad(self.joint_cost, self.joint_params) for i, p in enumerate(self.joint_params): self.joint_grads[p] = grad_list[i] # Construct the updates for the generator and inferencer networks self.group_1_updates = get_adam_updates(params=self.group_1_params, \ grads=self.joint_grads, alpha=self.lr_1, \ beta1=self.mom_1, beta2=self.mom_2, \ mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0) self.group_2_updates = get_adam_updates(params=self.group_2_params, \ grads=self.joint_grads, alpha=self.lr_2, \ beta1=self.mom_1, beta2=self.mom_2, \ mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0) self.joint_updates = OrderedDict() for k in self.group_1_updates: self.joint_updates[k] = self.group_1_updates[k] for k in self.group_2_updates: self.joint_updates[k] = self.group_2_updates[k] # Construct a function for jointly training the generator/inferencer print("Compiling training function...") self.train_joint = self._construct_train_joint() self.compute_post_klds = self._construct_compute_post_klds() self.compute_fe_terms = self._construct_compute_fe_terms() self.sample_from_prior = self._construct_sample_from_prior() # make easy access points for some interesting parameters self.inf_1_weights = self.q_z_given_x.shared_layers[0].W self.gen_1_weights = self.p_s0_obs_given_z_obs.mu_layers[-1].W self.inf_2_weights = self.q_hi_given_x_si[0].shared_layers[0].W self.gen_2_weights = self.p_sip1_given_si_hi[0].mu_layers[-1].W self.gen_inf_weights = self.p_hi_given_si[0].shared_layers[0].W return
def __init__(self, rng=None, \ x_in=None, x_out=None, \ p_s_given_z=None, \ p_h_given_s=None, \ p_x_given_s_h=None, \ q_z_given_x=None, \ q_h_given_x_s=None, \ x_dim=None, \ z_dim=None, \ s_dim=None, \ h_dim=None, \ params=None, \ shared_param_dicts=None): # setup a rng for this GIPair self.rng = RandStream(rng.randint(100000)) # grab the user-provided parameters self.params = params self.x_type = self.params['x_type'] assert((self.x_type == 'bernoulli') or (self.x_type == 'gaussian')) if 'obs_transform' in self.params: assert((self.params['obs_transform'] == 'sigmoid') or \ (self.params['obs_transform'] == 'none')) if self.params['obs_transform'] == 'sigmoid': self.obs_transform = lambda x: T.nnet.sigmoid(x) else: self.obs_transform = lambda x: x else: self.obs_transform = lambda x: T.nnet.sigmoid(x) if self.x_type == 'bernoulli': self.obs_transform = lambda x: T.nnet.sigmoid(x) self.shared_param_dicts = shared_param_dicts # record the dimensions of various spaces relevant to this model self.x_dim = x_dim self.z_dim = z_dim self.s_dim = s_dim self.h_dim = h_dim # grab handles to the relevant InfNets self.q_z_given_x = q_z_given_x self.q_h_given_x_s = q_h_given_x_s self.p_s_given_z = p_s_given_z self.p_h_given_s = p_h_given_s self.p_x_given_s_h = p_x_given_s_h # record the symbolic variables that will provide inputs to the # computation graph created to describe this MultiStageModel self.x_in = x_in self.x_out = x_out self.batch_reps = T.lscalar() # setup switching variable for changing between sampling/training zero_ary = to_fX( np.zeros((1,)) ) self.train_switch = theano.shared(value=zero_ary, name='msm_train_switch') self.set_train_switch(1.0) # setup a variable for controlling dropout noise self.drop_rate = theano.shared(value=zero_ary, name='msm_drop_rate') self.set_drop_rate(0.0) # this weight balances l1 vs. l2 penalty on posterior KLds self.lam_kld_l1l2 = theano.shared(value=zero_ary, name='msm_lam_kld_l1l2') self.set_lam_kld_l1l2(1.0) if self.shared_param_dicts is None: # initialize "optimizable" parameters specific to this MSM init_vec = to_fX( np.zeros((self.z_dim,)) ) self.p_z_mean = theano.shared(value=init_vec, name='msm_p_z_mean') self.p_z_logvar = theano.shared(value=init_vec, name='msm_p_z_logvar') init_vec = to_fX( np.zeros((self.x_dim,)) ) self.obs_logvar = theano.shared(value=zero_ary, name='msm_obs_logvar') self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar) self.shared_param_dicts = {} self.shared_param_dicts['p_z_mean'] = self.p_z_mean self.shared_param_dicts['p_z_logvar'] = self.p_z_logvar self.shared_param_dicts['obs_logvar'] = self.obs_logvar else: self.p_z_mean = self.shared_param_dicts['p_z_mean'] self.p_z_logvar = self.shared_param_dicts['p_z_logvar'] self.obs_logvar = self.shared_param_dicts['obs_logvar'] self.bounded_logvar = 8.0 * T.tanh((1.0/8.0) * self.obs_logvar) # get a drop mask that drops things with probability p drop_scale = 1. / (1. - self.drop_rate[0]) drop_rnd = self.rng.uniform(size=self.x_out.shape, \ low=0.0, high=1.0, dtype=theano.config.floatX) drop_mask = drop_scale * (drop_rnd > self.drop_rate[0]) ############################################## # Setup the TwoStageModels main computation. 
# ############################################## print("Building TSM...") # samples of "first" latent state drop_x = drop_mask * self.x_in z_q_mean, z_q_logvar, self.z = \ self.q_z_given_x.apply(drop_x, do_samples=True) # compute relevant KLds for this step self.kld_z_q2ps = gaussian_kld(z_q_mean, z_q_logvar, \ self.p_z_mean, self.p_z_logvar) self.kld_z_p2qs = gaussian_kld(self.p_z_mean, self.p_z_logvar, \ z_q_mean, z_q_logvar) # transform "first" latent state into "second" latent state self.s, _ = self.p_s_given_z.apply(self.z, do_samples=False) # get samples of h, conditioned on current s h_p_mean, h_p_logvar, h_p = self.p_h_given_s.apply( \ self.s, do_samples=True) # get variational samples of h, given s and x_out h_q_mean, h_q_logvar, h_q = self.q_h_given_x_s.apply( \ T.horizontal_stack(self.x_out, self.s), \ do_samples=True) # make h samples that can be switched between h_p and h_q self.h = (self.train_switch[0] * h_q) + \ ((1.0 - self.train_switch[0]) * h_p) # compute relevant KLds for this step self.kld_h_q2ps = gaussian_kld(h_q_mean, h_q_logvar, \ h_p_mean, h_p_logvar) self.kld_h_p2qs = gaussian_kld(h_p_mean, h_p_logvar, \ h_q_mean, h_q_logvar) # p_x_given_s_h is conditioned on s and h. self.x_gen, _ = self.p_x_given_s_h.apply( \ T.horizontal_stack(self.s, self.h), \ do_samples=False) ###################################################################### # ALL SYMBOLIC VARS NEEDED FOR THE OBJECTIVE SHOULD NOW BE AVAILABLE # ###################################################################### # shared var learning rate for generator and inferencer zero_ary = to_fX( np.zeros((1,)) ) self.lr_1 = theano.shared(value=zero_ary, name='msm_lr_1') self.lr_2 = theano.shared(value=zero_ary, name='msm_lr_2') # shared var momentum parameters for generator and inferencer self.mom_1 = theano.shared(value=zero_ary, name='msm_mom_1') self.mom_2 = theano.shared(value=zero_ary, name='msm_mom_2') # init parameters for controlling learning dynamics self.set_sgd_params() # init shared var for weighting nll of data given posterior sample self.lam_nll = theano.shared(value=zero_ary, name='msm_lam_nll') self.set_lam_nll(lam_nll=1.0) # init shared var for weighting prior kld against reconstruction self.lam_kld_z = theano.shared(value=zero_ary, name='msm_lam_kld_z') self.lam_kld_q2p = theano.shared(value=zero_ary, name='msm_lam_kld_q2p') self.lam_kld_p2q = theano.shared(value=zero_ary, name='msm_lam_kld_p2q') self.set_lam_kld(lam_kld_z=1.0, lam_kld_q2p=0.7, lam_kld_p2q=0.3) # init shared var for controlling l2 regularization on params self.lam_l2w = theano.shared(value=zero_ary, name='msm_lam_l2w') self.set_lam_l2w(1e-5) # Grab all of the "optimizable" parameters in "group 1" self.group_1_params = [] self.group_1_params.extend(self.q_z_given_x.mlp_params) self.group_1_params.extend(self.q_h_given_x_s.mlp_params) # Grab all of the "optimizable" parameters in "group 2" self.group_2_params = [self.p_z_mean, self.p_z_logvar] self.group_2_params.extend(self.p_s_given_z.mlp_params) self.group_2_params.extend(self.p_h_given_s.mlp_params) self.group_2_params.extend(self.p_x_given_s_h.mlp_params) # Make a joint list of parameters group 1/2 self.joint_params = self.group_1_params + self.group_2_params ################################# # CONSTRUCT THE KLD-BASED COSTS # ################################# self.kld_z_q2p, self.kld_z_p2q, self.kld_h_q2p, self.kld_h_p2q = \ self._construct_kld_costs(p=1.0) self.kld_z = (self.lam_kld_q2p[0] * self.kld_z_q2p) + \ (self.lam_kld_p2q[0] * self.kld_z_p2q) self.kld_h = 
(self.lam_kld_q2p[0] * self.kld_h_q2p) + \ (self.lam_kld_p2q[0] * self.kld_h_p2q) self.kld_costs = (self.lam_kld_z[0] * self.kld_z) + self.kld_h # now do l2 KLd costs self.kl2_z_q2p, self.kl2_z_p2q, self.kl2_h_q2p, self.kl2_h_p2q = \ self._construct_kld_costs(p=2.0) self.kl2_z = (self.lam_kld_q2p[0] * self.kl2_z_q2p) + \ (self.lam_kld_p2q[0] * self.kl2_z_p2q) self.kl2_h = (self.lam_kld_q2p[0] * self.kl2_h_q2p) + \ (self.lam_kld_p2q[0] * self.kl2_h_p2q) self.kl2_costs = (self.lam_kld_z[0] * self.kl2_z) + self.kl2_h # compute joint l1/l2 KLd cost self.kld_l1l2_costs = (self.lam_kld_l1l2[0] * self.kld_costs) + \ ((1.0 - self.lam_kld_l1l2[0]) * self.kl2_costs) # compute "mean" (rather than per-input) costs self.kld_cost = T.mean(self.kld_costs) self.kl2_cost = T.mean(self.kl2_costs) self.kld_l1l2_cost = T.mean(self.kld_l1l2_costs) ################################# # CONSTRUCT THE NLL-BASED COSTS # ################################# self.nll_costs = self._construct_nll_costs(self.x_out) self.nll_cost = self.lam_nll[0] * T.mean(self.nll_costs) ######################################## # CONSTRUCT THE REST OF THE JOINT COST # ######################################## param_reg_cost = self._construct_reg_costs() self.reg_cost = self.lam_l2w[0] * param_reg_cost self.joint_cost = self.nll_cost + self.kld_l1l2_cost + self.reg_cost ############################## # CONSTRUCT A PER-INPUT COST # ############################## self.obs_costs = self.nll_costs + self.kld_l1l2_costs # Get the gradient of the joint cost for all optimizable parameters print("Computing gradients of self.joint_cost...") self.joint_grads = OrderedDict() grad_list = T.grad(self.joint_cost, self.joint_params) for i, p in enumerate(self.joint_params): self.joint_grads[p] = grad_list[i] # Construct the updates for the generator and inferencer networks self.group_1_updates = get_adam_updates(params=self.group_1_params, \ grads=self.joint_grads, alpha=self.lr_1, \ beta1=self.mom_1, beta2=self.mom_2, \ mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0) self.group_2_updates = get_adam_updates(params=self.group_2_params, \ grads=self.joint_grads, alpha=self.lr_2, \ beta1=self.mom_1, beta2=self.mom_2, \ mom2_init=1e-3, smoothing=1e-5, max_grad_norm=10.0) self.joint_updates = OrderedDict() for k in self.group_1_updates: self.joint_updates[k] = self.group_1_updates[k] for k in self.group_2_updates: self.joint_updates[k] = self.group_2_updates[k] # Construct a function for jointly training the generator/inferencer print("Compiling training function...") self.train_joint = self._construct_train_joint() print("Compiling free-energy sampler...") self.compute_fe_terms = self._construct_compute_fe_terms() print("Compiling open-loop model sampler...") self.sample_from_prior = self._construct_sample_from_prior() print("Compiling data-guided model sampler...") self.sample_from_input = self._construct_sample_from_input() # make easy access points for some interesting parameters self.gen_gen_weights = self.p_x_given_s_h.mu_layers[-1].W return
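# Added sketch (my assumption about the helper, not the project's actual
# implementation): the gaussian_kld calls above compute the closed-form KL
# divergence between diagonal Gaussians parameterized by means and log-variances.
import theano.tensor as T

def gaussian_kld_sketch(mu_q, logvar_q, mu_p, logvar_p):
    """KL(N(mu_q, exp(logvar_q)) || N(mu_p, exp(logvar_p))).

    The project's helper may return per-dimension values rather than the
    per-example sum; this sketch sums over dimensions, giving shape (batch,).
    """
    kld_per_dim = 0.5 * ((logvar_p - logvar_q) +
                         (T.exp(logvar_q) + (mu_q - mu_p) ** 2) / T.exp(logvar_p) -
                         1.0)
    return T.sum(kld_per_dim, axis=1)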
def predict_symbolic(self, mx, Sx, unroll_scan=False): idims = self.D odims = self.E Ms = self.sr.shape[1] sf2M = (self.hyp[:, idims]**2)/tt.cast(Ms, floatX) sn2 = self.hyp[:, idims+1]**2 # TODO this should just fallback to the method from the SSGP class if Sx is None: # first check if we received a vector [D] or a matrix [nxD] if mx.ndim == 1: mx = mx[None, :] srdotx = self.sr.dot(self.X.T).transpose(0,2,1) phi_x = tt.concatenate([tt.sin(srdotx), tt.cos(srdotx)], 2) M = (phi_x*self.beta_ss[:, None, :]).sum(-1) phi_x_L = tt.stack([ solve_lower_triangular(self.Lmm[i], phi_x[i].T) for i in range(odims)]) S = sn2[:, None]*(1 + (sf2M[:, None])*(phi_x_L**2).sum(-2)) + 1e-6 return M, S # precompute some variables srdotx = self.sr.dot(mx) srdotSx = self.sr.dot(Sx) srdotSxdotsr = tt.sum(srdotSx*self.sr, 2) e = tt.exp(-0.5*srdotSxdotsr) cos_srdotx = tt.cos(srdotx) sin_srdotx = tt.sin(srdotx) cos_srdotx_e = cos_srdotx*e sin_srdotx_e = sin_srdotx*e # compute the mean vector mphi = tt.horizontal_stack(sin_srdotx_e, cos_srdotx_e) # E x 2*Ms M = tt.sum(mphi*self.beta_ss, 1) # input output covariance mx_c = mx.dimshuffle(0, 'x') sin_srdotx_e_r = sin_srdotx_e.dimshuffle(0, 'x', 1) cos_srdotx_e_r = cos_srdotx_e.dimshuffle(0, 'x', 1) srdotSx_tr = srdotSx.transpose(0, 2, 1) c = tt.concatenate([mx_c*sin_srdotx_e_r + srdotSx_tr*cos_srdotx_e_r, mx_c*cos_srdotx_e_r - srdotSx_tr*sin_srdotx_e_r], axis=2) # E x D x 2*Ms beta_ss_r = self.beta_ss.dimshuffle(0, 'x', 1) # input output covariance (notice this is not premultiplied by the # input covariance inverse) V = tt.sum(c*beta_ss_r, 2).T - tt.outer(mx, M) srdotSxdotsr_c = srdotSxdotsr.dimshuffle(0, 1, 'x') srdotSxdotsr_r = srdotSxdotsr.dimshuffle(0, 'x', 1) M2 = tt.zeros((odims, odims)) # initialize indices triu_indices = np.triu_indices(odims) indices = [tt.as_index_variable(idx) for idx in triu_indices] def second_moments(i, j, M2, beta, iA, sn2, sf2M, sr, srdotSx, srdotSxdotsr_c, srdotSxdotsr_r, sin_srdotx, cos_srdotx, *args): # compute the second moments of the spectrum feature vectors siSxsj = srdotSx[i].dot(sr[j].T) # Ms x Ms sijSxsij = -0.5*(srdotSxdotsr_c[i] + srdotSxdotsr_r[j]) em = tt.exp(sijSxsij+siSxsj) # MsxMs ep = tt.exp(sijSxsij-siSxsj) # MsxMs si = sin_srdotx[i] # Msx1 ci = cos_srdotx[i] # Msx1 sj = sin_srdotx[j] # Msx1 cj = cos_srdotx[j] # Msx1 sicj = tt.outer(si, cj) # MsxMs cisj = tt.outer(ci, sj) # MsxMs sisj = tt.outer(si, sj) # MsxMs cicj = tt.outer(ci, cj) # MsxMs sm = (sicj-cisj)*em sp = (sicj+cisj)*ep cm = (sisj+cicj)*em cp = (cicj-sisj)*ep # Populate the second moment matrix of the feature vector Q_up = tt.concatenate([cm-cp, sm+sp], axis=1) Q_lo = tt.concatenate([sp-sm, cm+cp], axis=1) Q = tt.concatenate([Q_up, Q_lo], axis=0) # Compute the second moment of the output m2 = 0.5*matrix_dot(beta[i], Q, beta[j].T) m2 = theano.ifelse.ifelse( tt.eq(i, j), m2 + sn2[i]*(1.0 + sf2M[i]*tt.sum(self.iA[i]*Q)) + 1e-6, m2) M2 = tt.set_subtensor(M2[i, j], m2) return M2 nseq = [self.beta_ss, self.iA, sn2, sf2M, self.sr, srdotSx, srdotSxdotsr_c, srdotSxdotsr_r, sin_srdotx, cos_srdotx, self.Lmm] if unroll_scan: from lasagne.utils import unroll_scan [M2_] = unroll_scan(second_moments, indices, [M2], nseq, len(triu_indices[0])) updts = {} else: M2_, updts = theano.scan(fn=second_moments, sequences=indices, outputs_info=[M2], non_sequences=nseq, allow_gc=False, name="%s>M2_scan" % (self.name)) M2 = M2_[-1] M2 = M2 + tt.triu(M2, k=1).T S = M2 - tt.outer(M, M) return M, S, V
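# Added illustration (simplified, my own variable names): the horizontal stack
# of sin/cos responses above is the usual sparse-spectrum / random Fourier
# feature construction. A self-contained sketch:
import numpy as np
import theano
import theano.tensor as tt

floatX = theano.config.floatX

X = tt.matrix('X')   # (n, D) inputs
W = tt.matrix('W')   # (Ms, D) sampled spectral frequencies

proj = X.dot(W.T)                                       # (n, Ms)
phi = tt.horizontal_stack(tt.sin(proj), tt.cos(proj))   # (n, 2*Ms) features

make_features = theano.function([X, W], phi)
x_np = np.random.randn(5, 3).astype(floatX)
w_np = np.random.randn(10, 3).astype(floatX)
print(make_features(x_np, w_np).shape)  # (5, 20)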
def test_TransMatConjugateStep_subtensors():
    # Confirm that Dirichlet/non-Dirichlet mixed rows can be parsed
    with pm.Model():
        d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1])
        d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1])

        p_0_rv = tt.as_tensor([0, 0, 1])
        p_1_rv = tt.zeros(3)
        p_1_rv = tt.set_subtensor(p_1_rv[[0, 2]], d_0_rv)
        p_2_rv = tt.zeros(3)
        p_2_rv = tt.set_subtensor(p_2_rv[[1, 2]], d_1_rv)

        P_tt = tt.stack([p_0_rv, p_1_rv, p_2_rv])
        P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt))
        DiscreteMarkovChain("S_t", P_rv, np.r_[1, 0, 0], shape=(10,))

        transmat = TransMatConjugateStep(P_rv)

    assert transmat.row_remaps == {0: 1, 1: 2}
    exp_slices = {0: np.r_[0, 2], 1: np.r_[1, 2]}
    assert exp_slices.keys() == transmat.row_slices.keys()
    assert all(
        np.array_equal(transmat.row_slices[i], exp_slices[i])
        for i in exp_slices.keys())

    # Same thing, just with some manipulations of the transition matrix
    with pm.Model():
        d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1])
        d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1])

        p_0_rv = tt.as_tensor([0, 0, 1])
        p_1_rv = tt.zeros(3)
        p_1_rv = tt.set_subtensor(p_1_rv[[0, 2]], d_0_rv)
        p_2_rv = tt.zeros(3)
        p_2_rv = tt.set_subtensor(p_2_rv[[1, 2]], d_1_rv)

        P_tt = tt.horizontal_stack(p_0_rv[..., None], p_1_rv[..., None],
                                   p_2_rv[..., None])
        P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt.T))
        DiscreteMarkovChain("S_t", P_rv, np.r_[1, 0, 0], shape=(10,))

        transmat = TransMatConjugateStep(P_rv)

    assert transmat.row_remaps == {0: 1, 1: 2}
    exp_slices = {0: np.r_[0, 2], 1: np.r_[1, 2]}
    assert exp_slices.keys() == transmat.row_slices.keys()
    assert all(
        np.array_equal(transmat.row_slices[i], exp_slices[i])
        for i in exp_slices.keys())

    # Use an observed `DiscreteMarkovChain` and check the conjugate results
    with pm.Model():
        d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1])
        d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1])

        p_0_rv = tt.as_tensor([0, 0, 1])
        p_1_rv = tt.zeros(3)
        p_1_rv = tt.set_subtensor(p_1_rv[[0, 2]], d_0_rv)
        p_2_rv = tt.zeros(3)
        p_2_rv = tt.set_subtensor(p_2_rv[[1, 2]], d_1_rv)

        P_tt = tt.horizontal_stack(p_0_rv[..., None], p_1_rv[..., None],
                                   p_2_rv[..., None])
        P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt.T))
        DiscreteMarkovChain("S_t", P_rv, np.r_[1, 0, 0], shape=(4,),
                            observed=np.r_[0, 1, 0, 2])

        transmat = TransMatConjugateStep(P_rv)
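# Added check (my own, not part of the test): the two constructions used above
# agree, namely stacking rows with tt.stack versus horizontally stacking column
# vectors and transposing.
import numpy as np
import theano
import theano.tensor as tt

r0 = tt.vector('r0')
r1 = tt.vector('r1')
r2 = tt.vector('r2')

P_rows = tt.stack([r0, r1, r2])
P_cols = tt.horizontal_stack(r0[..., None], r1[..., None], r2[..., None]).T

f = theano.function([r0, r1, r2], [P_rows, P_cols], allow_input_downcast=True)
A, B = f(np.r_[0.0, 0.0, 1.0], np.r_[0.5, 0.0, 0.5], np.r_[0.0, 0.3, 0.7])
print(np.allclose(A, B))  # True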
def learnAndPredict(Ti, C, TOList): rng = np.random.RandomState(SEED) learning_rate = learning_rate0 print np.mean(Ti[1000, :]) aminW = np.amin(Ti[:1000, :]) amaxW = np.amax(Ti[:1000, :]) Ti[:1000, :] = (Ti[:1000, :] - aminW) / (amaxW - aminW) astdW = np.std(Ti[:1000, :]) ameanW = np.mean(Ti[:1000, :]) Ti[:1000, :] = (Ti[:1000, :] - ameanW) / astdW aminacW = np.amin(Ti[1000, :]) amaxacW = np.amax(Ti[1000, :]) print aminW, amaxW, aminacW, amaxacW Ti[1000, :] = (Ti[1000, :] - aminacW) / (amaxacW - aminacW) astdacW = np.std(Ti[1000, :]) ameanacW = np.mean(Ti[1000, :]) Ti[1000, :] = (Ti[1000, :] - ameanacW) / astdacW ile__ = len(TOList) ileList = np.zeros(ile__) for titer in range(len(TOList)): print np.mean(TOList[titer][1000, :]) TOList[titer][:1000, :] = (TOList[titer][:1000, :] - aminW) / (amaxW - aminW) TOList[titer][:1000, :] = (TOList[titer][:1000, :] - ameanW) / astdW TOList[titer][1000, :] = (TOList[titer][1000, :] - aminacW) / (amaxacW - aminacW) TOList[titer][1000, :] = (TOList[titer][1000, :] - ameanacW) / astdacW _, ileList[titer] = TOList[titer].shape _, ile = Ti.shape N = NN data = [] yyy = [] need = 1 BYL = {} j = 0 dwa = 0 ONES = [] ZEROS = [] for i in range(NN): for j in range(NN): if i != j: if C[i][j] == 1: ONES.append((i, j)) else: ZEROS.append((i, j)) Nones = len(ONES) rng.shuffle(ONES) Nzeros = len(ZEROS) print Nones print Nzeros Needed = NUM_TRAIN / 2 onesPerPair = Needed / Nones + 1 onesIter = 0 jj = 0 while jj < NUM_TRAIN: if jj % 300000 == 0: print jj / 300000, need = 1 - need if need == 1: pairNo = onesIter % Nones ppp = onesIter / Nones s, t = ONES[pairNo] shift = rng.randint(0, ile - L) onesIter += 1 if need == 0: zer = rng.randint(Nzeros) s, t = ZEROS[zer] del ZEROS[zer] Nzeros -= 1 shift = rng.randint(0, ile - L) x = np.hstack((Ti[s][shift:shift + L], Ti[t][shift:shift + L], Ti[1000][shift:shift + L])) y = C[s][t] data.append(x) yyy.append(y) jj += 1 data = np.array(data, dtype=theano.config.floatX) is_train = np.array(([0] * 96 + [1, 1, 2, 2]) * (NUM_TRAIN / 100)) yyy = np.array(yyy) train_set_x0, train_set_y0 = np.array( data[is_train == 0]), yyy[is_train == 0] test_set_x, test_set_y = np.array(data[is_train == 1]), yyy[is_train == 1] valid_set_x, valid_set_y = np.array( data[is_train == 2]), yyy[is_train == 2] n_train_batches = len(train_set_y0) / batch_size n_valid_batches = len(valid_set_y) / batch_size n_test_batches = len(test_set_y) / batch_size epoch = T.scalar() index = T.lscalar() x = T.matrix('x') inone2 = T.matrix('inone2') y = T.ivector('y') print '... 
building the model'
#-------- my layers -------------------
#---------------------
layer0_input = x.reshape((batch_size, 1, 3, L))
Cx = 5
layer0 = ConvolutionalLayer(rng, input=layer0_input,
                            image_shape=(batch_size, 1, 3, L),
                            filter_shape=(nkerns[0], 1, 2, Cx),
                            poolsize=(1, 1), fac=0)
ONE = (3 - 2 + 1) / 1
L2 = (L - Cx + 1) / 1
#---------------------
Cx2 = 5
layer1 = ConvolutionalLayer(rng, input=layer0.output,
                            image_shape=(batch_size, nkerns[0], ONE, L2),
                            filter_shape=(nkerns[1], nkerns[0], 2, Cx2),
                            poolsize=(1, 1), activation=ReLU, fac=0)
ONE = (ONE - 2 + 1) / 1
L3 = (L2 - Cx2 + 1) / 1
#---------------------
Cx3 = 1
layer1b = ConvolutionalLayer(rng, input=layer1.output,
                             image_shape=(batch_size, nkerns[1], ONE, L3),
                             filter_shape=(nkerns[2], nkerns[1], 1, Cx3),
                             poolsize=(1, POOL), activation=ReLU, fac=0)
ONE = (ONE - 1 + 1) / 1
L4 = (L3 - Cx3 + 1) / POOL
REGx = 100
#---------------------
layer2_input = layer1b.output.flatten(2)
print layer2_input.shape
use_b = False
layer2 = HiddenLayer(rng, input=layer2_input,
                     n_in=nkerns[2] * L4, n_out=REGx,
                     activation=T.tanh, use_bias=use_b)
layer3 = LogisticRegression(input=layer2.output, n_in=REGx, n_out=2)
cost = layer3.negative_log_likelihood(y)

out_x2 = theano.shared(
    np.asarray(np.zeros((N, L)), dtype=theano.config.floatX))
inone2 = theano.shared(
    np.asarray(np.zeros((1, L)), dtype=theano.config.floatX))
inone3 = theano.shared(
    np.asarray(np.zeros((1, L)), dtype=theano.config.floatX))
inone4 = theano.shared(
    np.asarray(np.zeros((1, L)), dtype=theano.config.floatX))
test_set_x = theano.shared(
    np.asarray(test_set_x, dtype=theano.config.floatX))
train_set_x = theano.shared(
    np.asarray(train_set_x0, dtype=theano.config.floatX))
train_set_y = T.cast(
    theano.shared(np.asarray(train_set_y0, dtype=theano.config.floatX)),
    'int32')
test_set_y = T.cast(
    theano.shared(np.asarray(test_set_y, dtype=theano.config.floatX)),
    'int32')
valid_set_y = T.cast(
    theano.shared(np.asarray(valid_set_y, dtype=theano.config.floatX)),
    'int32')
valid_set_x = theano.shared(
    np.asarray(valid_set_x, dtype=theano.config.floatX))

test_model = theano.function(
    [index], layer3.errors(y),
    givens={
        x: test_set_x[index * batch_size:(index + 1) * batch_size],
        y: test_set_y[index * batch_size:(index + 1) * batch_size]
    })
validate_model = theano.function(
    [index], layer3.errors(y),
    givens={
        x: valid_set_x[index * batch_size:(index + 1) * batch_size],
        y: valid_set_y[index * batch_size:(index + 1) * batch_size]
    })

mom_start = 0.5
mom_end = 0.98
mom_epoch_interval = n_epochs * 1.0

#### @@@@@@@@@@@
class_params0 = [layer3, layer2, layer1, layer1b, layer0]
class_params = [param for layer in class_params0 for param in layer.params]

gparams = []
for param in class_params:
    gparam = T.grad(cost, param)
    gparams.append(gparam)

gparams_mom = []
for param in class_params:
    gparam_mom = theano.shared(
        np.zeros(param.get_value(borrow=True).shape,
                 dtype=theano.config.floatX))
    gparams_mom.append(gparam_mom)

mom = ifelse(
    epoch < mom_epoch_interval,
    mom_start * (1.0 - epoch / mom_epoch_interval) +
    mom_end * (epoch / mom_epoch_interval),
    mom_end)

updates = OrderedDict()
for gparam_mom, gparam in zip(gparams_mom, gparams):
    updates[gparam_mom] = mom * gparam_mom - (1. - mom) * learning_rate * gparam
for param, gparam_mom in zip(class_params, gparams_mom):
    stepped_param = param + updates[gparam_mom]
    squared_filter_length_limit = 15.0
    if param.get_value(borrow=True).ndim == 2:
        col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
        desired_norms = T.clip(col_norms, 0,
                               T.sqrt(squared_filter_length_limit))
        scale = desired_norms / (1e-7 + col_norms)
        updates[param] = stepped_param * scale
    else:
        updates[param] = stepped_param

output = cost
train_model = theano.function(
    inputs=[epoch, index], outputs=output, updates=updates,
    givens={
        x: train_set_x[index * batch_size:(index + 1) * batch_size],
        y: train_set_y[index * batch_size:(index + 1) * batch_size]
    })
keep = theano.function(
    [index], layer3.errorsFull(y),
    givens={
        x: train_set_x[index * batch_size:(index + 1) * batch_size],
        y: train_set_y[index * batch_size:(index + 1) * batch_size]
    },
    on_unused_input='warn')

timer = time.clock()
print "finished reading", (timer - start_time0) / 60., "minutes "

# TRAIN MODEL #
print '... training'
validation_frequency = n_train_batches
best_params = None
best_validation_loss = np.inf
best_iter = 0
test_score = 0.
epochc = 0

while (epochc < n_epochs):
    epochc = epochc + 1
    learning_rate = learning_rate0 * (1.2 - ((1.0 * epochc) / n_epochs))
    for minibatch_index in xrange(n_train_batches):
        iter = (epochc - 1) * n_train_batches + minibatch_index
        cost_ij = train_model(epochc, minibatch_index)
        if (iter + 1) % validation_frequency == 0:
            validation_losses = [validate_model(i)
                                 for i in xrange(n_valid_batches)]
            this_validation_loss = np.mean(validation_losses)
            print(' %i) err %.2f ' % (epochc, this_validation_loss / 10)), \
                L, nkerns, REGx, "|", Cx, Cx2, Cx3, batch_size
            if this_validation_loss < best_validation_loss or epochc % 30 == 0:
                best_validation_loss = this_validation_loss
                best_iter = iter
                test_losses = [test_model(i) for i in xrange(n_test_batches)]
                test_score = np.mean(test_losses)
                print((' epoch %i, minibatch %i/%i, test error of best '
                       'model %f %%') %
                      (epochc, minibatch_index + 1, n_train_batches,
                       test_score / 10))

############
timel = time.clock()
print "finished learning", (timel - timer) / 60., "minutes "

ppm = theano.function(
    [index], layer3.pred_proba_mine(),
    givens={
        x: T.horizontal_stack(
            T.tile(inone2, (batch_size, 1)),
            out_x2[index * batch_size:(index + 1) * batch_size],
            T.tile(inone3, (batch_size, 1))),
        y: train_set_y[0 * (batch_size):(0 + 1) * (batch_size)]
    },
    on_unused_input='warn')

NONZERO = (N * N - N)
gc.collect()
RESList = [np.zeros((N, N)) for it in range(ile__)]
for __net in range(ile__):
    TO = TOList[__net]
    ileO = ileList[__net]
    RES = RESList[__net]
    shift = 0.1
    DELTAshift = (ileO - L) / (Q - 1)
    print "DELTAshift:", DELTAshift
    for q in range(Q):
        dataO = []
        print (q + 1), "/", Q, " ",
        out_x2.set_value(
            np.asarray(np.array(TO[:, shift:shift + L]),
                       dtype=theano.config.floatX))
        PARTIAL = np.zeros((N, N))
        inone3.set_value(
            np.asarray(np.array(TO[1000][shift:shift + L]).reshape(1, L),
                       dtype=theano.config.floatX))
        for i in range(N):
            inone2.set_value(
                np.asarray(np.array(TO[i][shift:shift + L]).reshape(1, L),
                           dtype=theano.config.floatX))
            p = [ppm(ii) for ii in xrange(N / batch_size)]
            for pos in range(N):
                if pos != i:
                    PARTIAL[i][pos] += p[pos / batch_size][pos % batch_size][1]
        for i in range(N):
            for j in range(N):
                RES[i][j] += PARTIAL[i][j]
        shift += DELTAshift
    print "Finished", __net
    RESList[__net] = RES / np.max(RES)
    gc.collect()

end_time = time.clock()
print "finished predicting", (end_time - timel) / 60., "minutes ", \
    str(nkerns), "using SEED = ", SEED
print('The code for file ' + os.path.split(__file__)[1] +
      ' ran for %.2fm' % ((end_time - start_time0) / 60.))
return RESList
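# A minimal numpy sketch of the column-norm ("max-norm") constraint applied to
# 2D weight matrices after each momentum step in the training code above; the
# 15.0 limit mirrors squared_filter_length_limit, everything else (shapes,
# function name) is illustrative only.
import numpy as np

def clip_column_norms(W, squared_limit=15.0, eps=1e-7):
    # rescale any column whose squared L2 norm exceeds the limit
    col_norms = np.sqrt(np.sum(W ** 2, axis=0))
    desired = np.clip(col_norms, 0, np.sqrt(squared_limit))
    return W * (desired / (eps + col_norms))

W = np.random.randn(128, 100) * 2.0
W = clip_column_norms(W)
assert np.all(np.sum(W ** 2, axis=0) <= 15.0 + 1e-3)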
    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_input,
                         n_in=50 * ((l1ims - 4) / 2) ** 2,
                         n_out=500, activation=T.tanh)
    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=2)
else:
    # Output (14,14) -> (5, 5)
    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. matrix of rasterized images).
    # This will generate a matrix of shape (20, 32 * 4 * 4) = (20, 512)
    layer2_input = T.horizontal_stack(layer1.output.flatten(2), x_extra)
    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_input,
                         n_in=50 * ((l1ims - 4) / 2) ** 2 + ExtraColumns,
                         n_out=500, activation=T.tanh)
    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=2)

model = [layer0, layer1, layer2, layer3]

# the cost we minimize during training is the negative log likelihood of
# the model plus the regularization terms (L1 and L2); cost is expressed
# here symbolically
cost = layer3.negative_log_likelihood(y)
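# A numpy shape check for the n_in used above when extra feature columns are
# stacked onto the flattened conv output. The concrete values of l1ims and
# ExtraColumns, and the assumption that l1ims is the input image side length
# for a 5x5 filter followed by 2x2 pooling over 50 maps, are hypothetical.
import numpy as np

batch_size, l1ims, ExtraColumns = 20, 12, 7
conv_feats = np.zeros((batch_size, 50, (l1ims - 4) // 2, (l1ims - 4) // 2))
flat = conv_feats.reshape(batch_size, -1)          # like layer1.output.flatten(2)
x_extra = np.zeros((batch_size, ExtraColumns))
layer2_in = np.hstack([flat, x_extra])             # like T.horizontal_stack(...)
assert layer2_in.shape[1] == 50 * ((l1ims - 4) // 2) ** 2 + ExtraColumns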
def get_output_for(self, inputs, deterministic=False, **kwargs):
    # extract inputs
    H1, H2 = inputs
    # train set size
    m = H1.shape[0].astype(theano.config.floatX)
    # running average projection matrix update
    if not deterministic:
        # compute batch mean
        mean1 = T.mean(H1, axis=0)
        mean2 = T.mean(H2, axis=0)
        # running average updates of means
        mean1 = (floatX(1.0 - self.alpha) * self.mean1 + self.alpha * mean1)
        running_mean1 = theano.clone(self.mean1, share_inputs=False)
        running_mean1.default_update = mean1
        mean1 += 0 * running_mean1
        mean2 = (floatX(1.0 - self.alpha) * self.mean2 + self.alpha * mean2)
        running_mean2 = theano.clone(self.mean2, share_inputs=False)
        running_mean2.default_update = mean2
        mean2 += 0 * running_mean2
        # hidden representations
        H1bar = H1 - mean1
        H2bar = H2 - mean2
        # transpose to formulas in paper
        H1bar = H1bar.T
        H2bar = H2bar.T
        # cross-covariance
        S12 = (1.0 / (m - 1)) * T.dot(H1bar, H2bar.T)
        # covariance 1
        S11 = (1.0 / (m - 1)) * T.dot(H1bar, H1bar.T)
        S11 = S11 + self.r1 * T.identity_like(S11)
        # covariance 2
        S22 = (1.0 / (m - 1)) * T.dot(H2bar, H2bar.T)
        S22 = S22 + self.r2 * T.identity_like(S22)
        # running average updates of statistics
        S12 = (floatX(1.0 - self.alpha) * self.S12 + self.alpha * S12)
        running_S12 = theano.clone(self.S12, share_inputs=False)
        running_S12.default_update = S12
        S12 += 0 * running_S12
        S11 = (floatX(1.0 - self.alpha) * self.S11 + self.alpha * S11)
        running_S11 = theano.clone(self.S11, share_inputs=False)
        running_S11.default_update = S11
        S11 += 0 * running_S11
        S22 = (floatX(1.0 - self.alpha) * self.S22 + self.alpha * S22)
        running_S22 = theano.clone(self.S22, share_inputs=False)
        running_S22.default_update = S22
        S22 += 0 * running_S22
        # theano-compatible formulation of paper
        d, A = T.nlinalg.eigh(S11)
        S11si = (A * np.reciprocal(np.sqrt(d))).dot(A.T)  # = S11^-.5
        d, A = T.nlinalg.eigh(S22)
        S22si = (A * np.reciprocal(np.sqrt(d))).dot(A.T)  # = S22^-.5
        # compute TT' and T'T (regularized)
        Tnp = S11si.dot(S12).dot(S22si)
        M1 = Tnp.dot(Tnp.T)
        M2 = Tnp.T.dot(Tnp)
        M1 += self.rT * T.identity_like(M1)
        M2 += self.rT * T.identity_like(M2)
        # compute eigen decomposition
        E1, E = T.nlinalg.eigh(M1)
        _, F = T.nlinalg.eigh(M2)
        # maximize correlation
        E1 = T.clip(E1, 1e-7, 1.0)
        E1 = T.sqrt(E1)
        self.loss = -T.mean(E1) * self.wl
        self.corr = E1
        # compute projection matrices
        U = S11si.dot(E)
        V = S22si.dot(F)
        # flip signs of projections to match
        # (needed because we do two decompositions as opposed to a SVD)
        s = T.sgn(U.T.dot(S12).dot(V).diagonal())
        U *= s
        # update of projection matrices
        running_U = theano.clone(self.U, share_inputs=False)
        running_U.default_update = U
        U += floatX(0) * running_U
        running_V = theano.clone(self.V, share_inputs=False)
        running_V.default_update = V
        V += floatX(0) * running_V
    # use projections of layer
    else:
        # hidden representations
        H1bar = H1 - self.mean1
        H2bar = H2 - self.mean2
        # transpose to formulas in paper
        H1bar = H1bar.T
        H2bar = H2bar.T
        U, V = self.U, self.V
    # re-project data
    lv1_cca = H1bar.T.dot(U)
    lv2_cca_fixed = H2bar.T.dot(V)
    output = T.horizontal_stack(lv1_cca, lv2_cca_fixed)
    return output
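# A numpy sketch of the correlation objective computed in the layer above: the
# canonical correlations are the square roots of the eigenvalues of T T', with
# T = S11^-1/2 S12 S22^-1/2 (equivalently, the singular values of T). The data
# sizes and regularizer 1e-3 are illustrative, not values from the layer.
import numpy as np

rng = np.random.RandomState(0)
H1, H2 = rng.randn(500, 8), rng.randn(500, 8)
H1, H2 = H1 - H1.mean(0), H2 - H2.mean(0)
m = H1.shape[0]
S12 = H1.T.dot(H2) / (m - 1)
S11 = H1.T.dot(H1) / (m - 1) + 1e-3 * np.eye(8)
S22 = H2.T.dot(H2) / (m - 1) + 1e-3 * np.eye(8)

def inv_sqrt(S):
    # symmetric inverse square root via eigendecomposition, as in the layer
    d, A = np.linalg.eigh(S)
    return (A * (1.0 / np.sqrt(d))).dot(A.T)

Tnp = inv_sqrt(S11).dot(S12).dot(inv_sqrt(S22))
evals = np.linalg.eigvalsh(Tnp.dot(Tnp.T))
corr = np.sqrt(np.clip(evals, 1e-7, 1.0))          # mirrors E1 above
# the unclipped square roots match the singular values of Tnp (up to ordering)
assert np.allclose(np.sort(np.sqrt(np.abs(evals))),
                   np.sort(np.linalg.svd(Tnp, compute_uv=False)))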
def hstack(tensors):
    return T.horizontal_stack(*tensors)
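# A minimal usage sketch for the hstack() helper above, assuming Theano is
# importable; the variable names a and b are illustrative only.
import theano
import theano.tensor as T

a = T.matrix('a')
b = T.matrix('b')
ab = hstack([a, b])                 # same as T.horizontal_stack(a, b)
f = theano.function([a, b], ab)
# applied to a (2, 3) and a (2, 2) matrix, f returns a (2, 5) matrix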
def __init__(self, rng=None, \
        Xd=None, Yd=None, Xc=None, Xm=None, \
        g_net=None, i_net=None, p_net=None, \
        data_dim=None, prior_dim=None, label_dim=None, \
        batch_size=None, \
        params=None, shared_param_dicts=None):
    # TODO: refactor for use with "encoded" inferencer/generator
    assert(not (i_net.use_encoder or g_net.use_encoder))
    # setup a rng for this GITrip
    self.rng = RandStream(rng.randint(100000))
    # setup the prior distribution over the categorical variable
    if params is None:
        self.params = {}
    else:
        self.params = params
    # record the dimensionality of the data handled by this GITrip
    self.data_dim = data_dim
    self.label_dim = label_dim
    self.prior_dim = prior_dim
    self.batch_size = batch_size
    # create a mask for disabling and/or reweighting input dimensions
    row_mask = np.ones((self.data_dim,)).astype(theano.config.floatX)
    self.input_mask = theano.shared(value=row_mask, name='git_input_mask')
    # record the symbolic variables that will provide inputs to the
    # computation graph created to describe this GITrip
    self.Xd = self.input_mask * Xd
    self.Yd = Yd
    self.Xc = Xc
    self.Xm = Xm
    # construct a vertically-repeated identity matrix for marginalizing
    # over possible values of the categorical latent variable.
    Ic = np.vstack([np.identity(label_dim) for i in range(batch_size)])
    self.Ic = theano.shared(value=Ic.astype(theano.config.floatX), name='git_Ic')
    # create "shared-parameter" clones of the continuous and categorical
    # inferencers that this GITrip will be built on.
    self.IN = i_net.shared_param_clone(rng=rng, \
            Xd=self.Xd, Xc=self.Xc, Xm=self.Xm)
    self.PN = p_net.shared_param_clone(rng=rng, Xd=self.Xd)
    # create symbolic variables for the approximate posteriors over the
    # continuous and categorical latent variables
    self.Xp = self.IN.output
    self.Yp = safe_softmax(self.PN.output_spawn[0])
    self.Yp_proto = safe_softmax(self.PN.output_proto)
    # create a symbolic variable structured to allow easy "marginalization"
    # over possible settings of the categorical latent variable. the left
    # matrix (i.e. self.Ic) comprises batch_size copies of the label_dim
    # dimensional identity matrix stacked on top of each other, and the
    # right matrix comprises a single sample from the approximate posterior
    # over the continuous latent variables for each of batch_size examples
    # with each sample repeated label_dim times.
    self.XYp = T.horizontal_stack(self.Ic, T.repeat(self.Xp, \
            self.label_dim, axis=0))
    # pipe the "convenient marginalization" matrix into a shared parameter
    # clone of the generator network
    self.GN = g_net.shared_param_clone(rng=rng, Xp=self.XYp)
    # capture a handle for sampled reconstructions from the generator
    self.Xg = self.GN.output
    # we will be assuming one proto-net in the pseudo-ensemble represented
    # by self.PN, and either one or two spawn-nets for that proto-net.
    assert(len(self.PN.proto_nets) == 1)
    assert((len(self.PN.spawn_nets) == 1) or \
            (len(self.PN.spawn_nets) == 2))
    # output of the generator and input to the inferencer should both be
    # equal to self.data_dim
    assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
    assert(self.data_dim == self.IN.shared_layers[0].in_dim)
    assert(self.data_dim == self.PN.proto_nets[0][0].in_dim)
    # mu/sigma outputs of self.IN should be equal to prior_dim, output of
    # self.PN should be equal to label_dim, and input of self.GN should be
    # equal to prior_dim + label_dim
    assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
    assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)
    assert(self.label_dim == self.PN.proto_nets[0][-1].out_dim)
    assert((self.prior_dim + self.label_dim) == self.GN.mlp_layers[0].in_dim)
    # determine whether this GITrip is a clone or an original
    if shared_param_dicts is None:
        # This is not a clone, and we will need to make a dict for
        # referring to some important shared parameters.
        self.shared_param_dicts = {}
        self.is_clone = False
    else:
        # This is a clone, and its layer parameters can be found by
        # referring to the given param dict (i.e. shared_param_dicts).
        self.shared_param_dicts = shared_param_dicts
        self.is_clone = True
    if not self.is_clone:
        # shared var learning rate for generator and inferencer
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.lr_gn = theano.shared(value=zero_ary, name='git_lr_gn')
        self.lr_in = theano.shared(value=zero_ary, name='git_lr_in')
        self.lr_pn = theano.shared(value=zero_ary, name='git_lr_pn')
        # shared var momentum parameters for generator and inferencer
        self.mo_gn = theano.shared(value=zero_ary, name='git_mo_gn')
        self.mo_in = theano.shared(value=zero_ary, name='git_mo_in')
        self.mo_pn = theano.shared(value=zero_ary, name='git_mo_pn')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_nll = theano.shared(value=zero_ary, name='git_lam_nll')
        self.set_lam_nll(lam_nll=1.0)
        # init shared var for weighting posterior KL-div from prior
        self.lam_kld = theano.shared(value=zero_ary, name='git_lam_kld')
        self.set_lam_kld(lam_kld=1.0)
        # init shared var for weighting semi-supervised classification
        self.lam_cat = theano.shared(value=zero_ary, name='git_lam_cat')
        self.set_lam_cat(lam_cat=0.0)
        # init shared var for weighting ensemble agreement regularization
        self.lam_pea = theano.shared(value=zero_ary, name='git_lam_pea')
        self.set_lam_pea(lam_pea=0.0)
        # init shared var for weighting entropy regularization on the
        # inferred posteriors over the categorical variable of interest
        self.lam_ent = theano.shared(value=zero_ary, name='git_lam_ent')
        self.set_lam_ent(lam_ent=0.0)
        # init shared var for weighting dirichlet regularization on the
        # inferred posteriors over the categorical variable of interest
        self.lam_dir = theano.shared(value=zero_ary, name='git_lam_dir')
        self.set_lam_dir(lam_dir=0.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='git_lam_l2w')
        self.set_lam_l2w(lam_l2w=1e-3)
        # record shared parameters that are to be shared among clones
        self.shared_param_dicts['git_lr_gn'] = self.lr_gn
        self.shared_param_dicts['git_lr_in'] = self.lr_in
        self.shared_param_dicts['git_lr_pn'] = self.lr_pn
        self.shared_param_dicts['git_mo_gn'] = self.mo_gn
        self.shared_param_dicts['git_mo_in'] = self.mo_in
        self.shared_param_dicts['git_mo_pn'] = self.mo_pn
        self.shared_param_dicts['git_lam_nll'] = self.lam_nll
        self.shared_param_dicts['git_lam_kld'] = self.lam_kld
        self.shared_param_dicts['git_lam_cat'] = self.lam_cat
        self.shared_param_dicts['git_lam_pea'] = self.lam_pea
        self.shared_param_dicts['git_lam_ent'] = self.lam_ent
        self.shared_param_dicts['git_lam_dir'] = self.lam_dir
        self.shared_param_dicts['git_lam_l2w'] = self.lam_l2w
        self.shared_param_dicts['git_input_mask'] = self.input_mask
    else:
        # use some shared parameters that are shared among all clones of
        # some "base" GITrip
        self.lr_gn = self.shared_param_dicts['git_lr_gn']
        self.lr_in = self.shared_param_dicts['git_lr_in']
        self.lr_pn = self.shared_param_dicts['git_lr_pn']
        self.mo_gn = self.shared_param_dicts['git_mo_gn']
        self.mo_in = self.shared_param_dicts['git_mo_in']
        self.mo_pn = self.shared_param_dicts['git_mo_pn']
        self.lam_nll = self.shared_param_dicts['git_lam_nll']
        self.lam_kld = self.shared_param_dicts['git_lam_kld']
        self.lam_cat = self.shared_param_dicts['git_lam_cat']
        self.lam_pea = self.shared_param_dicts['git_lam_pea']
        self.lam_ent = self.shared_param_dicts['git_lam_ent']
        self.lam_dir = self.shared_param_dicts['git_lam_dir']
        self.lam_l2w = self.shared_param_dicts['git_lam_l2w']
        self.input_mask = self.shared_param_dicts['git_input_mask']
    # Grab the full set of "optimizable" parameters from the generator
    # and inferencer networks that we'll be working with.
    self.gn_params = [p for p in self.GN.mlp_params]
    self.in_params = [p for p in self.IN.mlp_params]
    self.pn_params = [p for p in self.PN.proto_params]
    ###################################
    # CONSTRUCT THE COSTS TO OPTIMIZE #
    ###################################
    self.data_nll_cost = self.lam_nll[0] * self._construct_data_nll_cost()
    self.post_kld_cost = self.lam_kld[0] * self._construct_post_kld_cost()
    self.post_cat_cost = self.lam_cat[0] * self._construct_post_cat_cost()
    self.post_pea_cost = self.lam_pea[0] * self._construct_post_pea_cost()
    self.post_ent_cost = self.lam_ent[0] * self._construct_post_ent_cost()
    self.post_dir_cost = self.lam_dir[0] * self._construct_post_dir_cost()
    self.other_reg_costs = self._construct_other_reg_cost()
    self.other_reg_cost = self.other_reg_costs[0]
    self.joint_cost = self.data_nll_cost + self.post_kld_cost + self.post_cat_cost + \
            self.post_pea_cost + self.post_ent_cost + self.post_dir_cost + \
            self.other_reg_cost
    # Initialize momentums for mini-batch SGD updates. All parameters need
    # to be safely nestled in their lists by now.
    self.joint_moms = OrderedDict()
    self.gn_moms = OrderedDict()
    self.in_moms = OrderedDict()
    self.pn_moms = OrderedDict()
    for p in self.gn_params:
        p_mo = np.zeros(p.get_value(borrow=True).shape) + 5.0
        self.gn_moms[p] = theano.shared(value=p_mo.astype(theano.config.floatX))
        self.joint_moms[p] = self.gn_moms[p]
    for p in self.in_params:
        p_mo = np.zeros(p.get_value(borrow=True).shape) + 5.0
        self.in_moms[p] = theano.shared(value=p_mo.astype(theano.config.floatX))
        self.joint_moms[p] = self.in_moms[p]
    for p in self.pn_params:
        p_mo = np.zeros(p.get_value(borrow=True).shape) + 5.0
        self.pn_moms[p] = theano.shared(value=p_mo.astype(theano.config.floatX))
        self.joint_moms[p] = self.pn_moms[p]
    # Now, we need to construct updates for inferencers and the generator
    self.joint_updates = OrderedDict()
    self.gn_updates = OrderedDict()
    self.in_updates = OrderedDict()
    self.pn_updates = OrderedDict()
    self.grad_sq_sums = []
    #######################################
    # Construct updates for the generator #
    #######################################
    for var in self.gn_params:
        # these updates are for trainable params in the generator net...
        # first, get gradient of cost w.r.t. var
        var_grad = T.grad(self.joint_cost, var, \
                consider_constant=[self.GN.dist_mean, self.GN.dist_cov]).clip(-1.0, 1.0)
        #var_grad = ifelse(T.any(T.isnan(nan_grad)), T.zeros_like(nan_grad), nan_grad)
        #self.grad_sq_sums.append(T.sum(var_grad**2.0))
        # get the momentum for this var
        var_mom = self.gn_moms[var]
        # update the momentum for this var using its grad
        self.gn_updates[var_mom] = (self.mo_gn[0] * var_mom) + \
                ((1.0 - self.mo_gn[0]) * (var_grad**2.0))
        self.joint_updates[var_mom] = self.gn_updates[var_mom]
        # make basic update to the var
        var_new = var - (self.lr_gn[0] * (var_grad / T.sqrt(var_mom + 1e-2)))
        self.gn_updates[var] = var_new
        # add this var's update to the joint updates too
        self.joint_updates[var] = self.gn_updates[var]
    ###################################################
    # Construct updates for the continuous inferencer #
    ###################################################
    for var in self.in_params:
        # these updates are for trainable params in the inferencer net...
        # first, get gradient of cost w.r.t. var
        var_grad = T.grad(self.joint_cost, var, \
                consider_constant=[self.GN.dist_mean, self.GN.dist_cov]).clip(-1.0, 1.0)
        #var_grad = ifelse(T.any(T.isnan(nan_grad)), T.zeros_like(nan_grad), nan_grad)
        #self.grad_sq_sums.append(T.sum(var_grad**2.0))
        # get the momentum for this var
        var_mom = self.in_moms[var]
        # update the momentum for this var using its grad
        self.in_updates[var_mom] = (self.mo_in[0] * var_mom) + \
                ((1.0 - self.mo_in[0]) * (var_grad**2.0))
        self.joint_updates[var_mom] = self.in_updates[var_mom]
        # make basic update to the var
        var_new = var - (self.lr_in[0] * (var_grad / T.sqrt(var_mom + 1e-2)))
        self.in_updates[var] = var_new
        # add this var's update to the joint updates too
        self.joint_updates[var] = self.in_updates[var]
    ####################################################
    # Construct updates for the categorical inferencer #
    ####################################################
    for var in self.pn_params:
        # these updates are for trainable params in the inferencer net...
        # first, get gradient of cost w.r.t. var
        var_grad = T.grad(self.joint_cost, var, \
                consider_constant=[self.GN.dist_mean, self.GN.dist_cov]).clip(-1.0, 1.0)
        #var_grad = ifelse(T.any(T.isnan(nan_grad)), T.zeros_like(nan_grad), nan_grad)
        #self.grad_sq_sums.append(T.sum(var_grad**2.0))
        # get the momentum for this var
        var_mom = self.pn_moms[var]
        # update the momentum for this var using its grad
        self.pn_updates[var_mom] = (self.mo_pn[0] * var_mom) + \
                ((1.0 - self.mo_pn[0]) * (var_grad**2.0))
        self.joint_updates[var_mom] = self.pn_updates[var_mom]
        # make basic update to the var
        var_new = var - (self.lr_pn[0] * (var_grad / T.sqrt(var_mom + 1e-2)))
        self.pn_updates[var] = var_new
        # add this var's update to the joint updates too
        self.joint_updates[var] = self.pn_updates[var]
    # Record the sum of squared gradients (for NaN checking)
    self.grad_sq_sum = T.sum(self.grad_sq_sums)
    # Construct batch-based training functions for the generator and
    # inferencer networks, as well as a joint training function.
    #self.train_gn = self._construct_train_gn()
    #self.train_in = self._construct_train_in()
    self.train_joint = self._construct_train_joint()
    return
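# A small numpy sketch (with hypothetical sizes) of the "convenient
# marginalization" layout built in the constructor above: batch_size stacked
# identity matrices on the left, and each row of Xp repeated label_dim times on
# the right, mirroring T.horizontal_stack(self.Ic, T.repeat(self.Xp, label_dim, axis=0)).
import numpy as np

label_dim, batch_size, prior_dim = 3, 2, 4                           # illustrative
Ic = np.vstack([np.identity(label_dim) for _ in range(batch_size)])  # (6, 3)
Xp = np.random.randn(batch_size, prior_dim)                          # (2, 4)
XYp = np.hstack([Ic, np.repeat(Xp, label_dim, axis=0)])              # (6, 7)
# row k pairs the one-hot label (k % label_dim) with sample (k // label_dim)
assert XYp.shape == (batch_size * label_dim, label_dim + prior_dim)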
def get_output_for(self, inputs, deterministic=False, **kwargs):
    # extract inputs
    H1, H2 = inputs
    # train set size
    m = H1.shape[0].astype(theano.config.floatX)
    # running average projection matrix update
    if not deterministic:
        # compute batch mean
        mean1 = T.mean(H1, axis=0)
        mean2 = T.mean(H2, axis=0)
        # running average updates of means
        mean1 = (floatX(1.0 - self.alpha) * self.mean1 + self.alpha * mean1)
        running_mean1 = theano.clone(self.mean1, share_inputs=False)
        running_mean1.default_update = mean1
        mean1 += 0 * running_mean1
        mean2 = (floatX(1.0 - self.alpha) * self.mean2 + self.alpha * mean2)
        running_mean2 = theano.clone(self.mean2, share_inputs=False)
        running_mean2.default_update = mean2
        mean2 += 0 * running_mean2
        # hidden representations
        H1bar = H1 - mean1
        H2bar = H2 - mean2
        # transpose to correlation format
        H1bar = H1bar.T
        H2bar = H2bar.T
        # cross-covariance
        S12 = (1.0 / (m - 1)) * T.dot(H1bar, H2bar.T)
        # covariance 1
        S11 = (1.0 / (m - 1)) * T.dot(H1bar, H1bar.T)
        S11 = S11 + self.r1 * T.identity_like(S11)
        # covariance 2
        S22 = (1.0 / (m - 1)) * T.dot(H2bar, H2bar.T)
        S22 = S22 + self.r2 * T.identity_like(S22)
        # theano-compatible formulation of paper
        d, A = T.nlinalg.eigh(S11)
        S11si = (A * np.reciprocal(np.sqrt(d))).dot(A.T)  # = S11^-.5
        d, A = T.nlinalg.eigh(S22)
        S22si = (A * np.reciprocal(np.sqrt(d))).dot(A.T)  # = S22^-.5
        # compute TT' and T'T (regularized)
        Tnp = S11si.dot(S12).dot(S22si)
        M1 = Tnp.dot(Tnp.T)
        M2 = Tnp.T.dot(Tnp)
        M1 += self.rT * T.identity_like(M1)
        M2 += self.rT * T.identity_like(M2)
        # compute eigen decomposition
        E1, E = T.nlinalg.eigh(M1)
        _, F = T.nlinalg.eigh(M2)
        # compute correlation
        E1 = T.clip(E1, 1e-7, 1.0)
        E1 = T.sqrt(E1)
        self.corr = E1
        # transpose back to network format
        H1bar = H1bar.T
        H2bar = H2bar.T
    # use means of layer
    else:
        # hidden representations
        H1bar = H1 - self.mean1
        H2bar = H2 - self.mean2
    # re-project data
    lv1_cca = H1bar.dot(self.U)
    lv2_cca = H2bar.dot(self.V)
    output = T.horizontal_stack(lv1_cca, lv2_cca)
    return output
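# A numpy sketch (illustrative shapes, stand-in means and projections) of the
# deterministic branch above: center each view with the stored means, project
# with U and V, and concatenate the two projections, as T.horizontal_stack does
# for the layer output.
import numpy as np

H1, H2 = np.random.randn(10, 8), np.random.randn(10, 8)
mean1, mean2 = H1.mean(0), H2.mean(0)                  # stand-ins for self.mean1/2
U, V = np.random.randn(8, 8), np.random.randn(8, 8)    # stand-ins for self.U/V
lv1_cca = (H1 - mean1).dot(U)
lv2_cca = (H2 - mean2).dot(V)
output = np.hstack([lv1_cca, lv2_cca])                 # shape (10, 16)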