def __init__(self, get_optimizer, theano_warning='raise'):
    v = self.v
    w = self.w
    theanofunction = lazytheanofunc('warn', mode='FAST_RUN')
    theanofunction_silent = lazytheanofunc('ignore', mode='FAST_RUN')

    # Create theano expressions
    x, z = ndict.ordereddicts(self.variables())
    self.var_x, self.var_z = x, z

    # Helper variables
    A = T.fmatrix('A')
    self.var_A = A

    # Get gradient symbols
    allvars = list(x.values()) + list(z.values()) + [A]  # note: '+' concatenates lists

    # TODO: more beautiful/standardized way of setting distributions
    # (should be even simpler than this)
    self.dist_qz = {}
    self.dist_px = {}
    self.dist_pz = {}

    logpx, logpz, logqz = self.factors(x, z, A)

    if get_optimizer is None:
        # Fallback: identity updates (each parameter is mapped to itself), i.e. no optimization.
        def get_optimizer(w, g):
            from collections import OrderedDict
            updates = OrderedDict()
            for i in w:
                updates[w[i]] = w[i]
            return updates

    # Log-likelihood lower bound
    self.f_L = theanofunction(allvars, [logpx, logpz, logqz])
    L = (logpx + logpz - logqz).sum()
    g = T.grad(L, list(v.values()) + list(w.values()))
    gv = dict(zip(v.keys(), g[0:len(v)]))
    gw = dict(zip(w.keys(), g[len(v):len(v) + len(w)]))
    updates = get_optimizer(v, gv)
    updates.update(get_optimizer(w, gw))

    #self.profmode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker())
    #self.f_evalAndUpdate = theano.function(allvars, [logpx + logpz - logqz], updates=updates_w, mode=self.profmode)
    #theano.printing.debugprint(self.f_evalAndUpdate)

    self.f_eval = theanofunction(allvars, [logpx + logpz - logqz])
    self.f_evalAndUpdate = theanofunction(allvars, [logpx + logpz - logqz], updates=updates)
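# --- Illustrative sketch (not part of the model code): a real get_optimizer. ---
# The fallback above maps every parameter to itself, so f_evalAndUpdate learns nothing
# unless a proper optimizer is passed in. A minimal plain-SGD get_optimizer, assuming
# the parameters are Theano shared variables and using a hypothetical learning rate,
# could look like this (gradient ascent, since L is a lower bound to be maximized):
from collections import OrderedDict

def make_sgd_optimizer(learning_rate=0.01):
    def get_optimizer(w, g):
        # w: dict of shared parameters; g: dict of gradient expressions (same keys)
        updates = OrderedDict()
        for i in w:
            updates[w[i]] = w[i] + learning_rate * g[i]
        return updates
    return get_optimizer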
def __init__(self, theano_warning='raise'):
    theanofunction = lazytheanofunc('warn', mode='FAST_RUN')
    theanofunction_silent = lazytheanofunc('ignore', mode='FAST_RUN')

    # Create theano expressions
    v, w, x, z = ndict.ordereddicts(self.variables())
    self.var_v, self.var_w, self.var_x, self.var_z = v, w, x, z

    # Helper variables
    A = T.dmatrix('A')
    self.var_A = A

    # Get gradient symbols
    allvars = list(v.values()) + list(w.values()) + list(x.values()) + list(z.values()) + [A]  # note: '+' concatenates lists

    # TODO: more beautiful/standardized way of setting distributions
    # (should be even simpler than this)
    self.dist_qz = {}
    self.dist_px = {}
    self.dist_pz = {}

    logpv, logpw, logpx, logpz, logqz = self.factors(v, w, x, z, A)

    # Log-likelihood lower bound
    self.f_L = theanofunction(allvars, [logpx, logpz, logqz])
    L = (logpx + logpz - logqz).sum()
    dL_dw = T.grad(L, list(v.values()) + list(w.values()))
    self.f_dL_dw = theanofunction(allvars, [logpx, logpz, logqz] + dL_dw)

    weights = T.dmatrix()
    dL_weighted_dw = T.grad((weights * (logpx + logpz - logqz)).sum(),
                            list(v.values()) + list(w.values()))
    self.f_dL_weighted_dw = theanofunction(
        allvars + [weights],
        [logpx + logpz - logqz, weights * (logpx + logpz - logqz)] + dL_weighted_dw)

    # prior
    dlogpw_dw = T.grad(logpv + logpw, list(v.values()) + list(w.values()),
                       disconnected_inputs='ignore')
    self.f_logpw = theanofunction(list(v.values()) + list(w.values()), [logpv, logpw])
    self.f_dlogpw_dw = theanofunction(list(v.values()) + list(w.values()),
                                      [logpv, logpw] + dlogpw_dw)
def factors(self, w, x, z, A):
    # Define logp(w)
    logpw = 0
    for i in range(len(self.n_units) - 1):
        logpw += ap.logpdfs.normal(w['w' + str(i)], 0, self.prior_sd).sum()
        logpw += ap.logpdfs.normal(w['b' + str(i)], 0, self.prior_sd).sum()

    if self.nonlinearity == 'tanh':
        f = T.tanh
    elif self.nonlinearity == 'sigmoid':
        f = T.nnet.sigmoid
    elif self.nonlinearity == 'softplus':
        f = T.nnet.softplus
    else:
        raise Exception("Unknown nonlinearity " + self.nonlinearity)

    # Define logp(x)
    hiddens = [T.dot(w['w0'], x['x']) + T.dot(w['b0'], A)]
    for i in range(1, len(self.n_units) - 1):
        hiddens.append(T.dot(w['w' + str(i)], f(hiddens[-1])) + T.dot(w['b' + str(i)], A))
    self.p = T.nnet.softmax(hiddens[-1].T).T
    self.entropy = T.nnet.categorical_crossentropy(self.p.T, self.p.T).T
    logpx = (-T.nnet.categorical_crossentropy(self.p.T, x['y'].T).T).reshape((1, -1))

    # function for distribution p(y|x)
    theanofunc = lazytheanofunc('ignore', mode='FAST_RUN')
    self.dist_px['y'] = theanofunc([x['x']] + list(w.values()) + [A], self.p)

    logpz = 0 * A
    return logpw, logpx, logpz
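# --- Illustrative sketch (not part of the model code) ---
# In factors() above, logpx is minus the categorical cross-entropy between the
# softmax output and the 1-of-K coded labels; for one-hot targets this equals
# log p(y|x). A small NumPy check of that identity (hypothetical 3-class example):
import numpy as np

logits = np.array([2.0, 0.5, -1.0])         # unnormalized class scores, one sample
p = np.exp(logits) / np.exp(logits).sum()   # softmax probabilities
y = np.array([0.0, 1.0, 0.0])               # 1-of-K coded label (true class = 1)

cross_entropy = -(y * np.log(p)).sum()      # categorical cross-entropy
assert np.isclose(-cross_entropy, np.log(p[1]))   # equals log p(y|x)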
def factors(self, v, w, x, z, A):
    '''
    z['eps'] are the independent epsilons (Gaussian with unit variance)
    x['x'] is the data
    x['y'] is categorical data (1-of-K coded)

    The names of dict z[...] may be confusing here: the latent variable z is
    not included in the dict z[...], but implicitly computed from epsilon and
    parameters in w. z is computed with g(.) from eps and variational parameters.

    let logpx be the generative model density: log p(y) + log p(x|y,z) where z=g(.)
    let logpz be the prior of Z plus the entropy of q(z|x,y): log p(z) + H_q(z|x)
    So the lower bound L(x) = logpx + logpz

    let logpv and logpw be the (prior) density of the parameters
    '''

    def f_softplus(x): return T.log(T.exp(x) + 1)  # - np.log(2)
    def f_rectlin(x): return x * (x > 0)
    def f_rectlin2(x): return x * (x > 0) + 0.01 * x
    nonlinear = {'tanh': T.tanh, 'sigmoid': T.nnet.sigmoid, 'softplus': f_softplus,
                 'rectlin': f_rectlin, 'rectlin2': f_rectlin2}
    nonlinear_q = nonlinear[self.nonlinear_q]
    nonlinear_p = nonlinear[self.nonlinear_p]

    # Compute q(z|x)
    hidden_q = [nonlinear_q(T.dot(v['w0'], x['x']) + T.dot(v['b0'], A))]
    for i in range(1, len(self.n_hidden_q)):
        hidden_q.append(nonlinear_q(T.dot(v['w' + str(i)], hidden_q[-1]) + T.dot(v['b' + str(i)], A)))

    q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
    if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
        q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(v['logvar_b'], A)
    else:
        raise Exception("Unknown type_qz")

    # function for distribution q(z|x)
    theanofunc = lazytheanofunc('ignore', mode='FAST_RUN')
    self.dist_qz['z'] = theanofunc([x['x']] + list(v.values()) + [A], [q_mean, q_logvar])

    # Compute virtual sample
    _z = q_mean + T.exp(0.5 * q_logvar) * z['eps']

    # Compute log p(x|z)
    hidden_p = [nonlinear_p(T.dot(w['w0'], _z) + T.dot(w['b0'], A))]
    for i in range(1, len(self.n_hidden_p)):
        hidden_p.append(nonlinear_p(T.dot(w['w' + str(i)], hidden_p[-1]) + T.dot(w['b' + str(i)], A)))

    if self.type_px == 'bernoulli':
        px = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
        _logpx = -T.nnet.binary_crossentropy(px, x['x'])
        self.dist_px['x'] = theanofunc([_z] + list(w.values()) + [A], px)
    elif self.type_px == 'gaussian' or self.type_px == 'sigmoidgaussian':
        x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
        if self.type_px == 'sigmoidgaussian':
            x_mean = T.nnet.sigmoid(x_mean)
        x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
        _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
        self.dist_px['x'] = theanofunc([_z] + list(w.values()) + [A], [x_mean, x_logvar])
    else:
        raise Exception("Unknown type_px")

    # Note: logpx is a row vector (one element per sample)
    logpx = T.dot(np.ones((1, self.n_x)), _logpx)  # logpx = log p(x|z,w) + log p(y|x,w)

    # log p(y|x)
    _logpy = T.dot(w['out_w_y'], hidden_q[-1]) + T.dot(w['out_b_y'], A)
    py = T.nnet.softmax(_logpy.T).T
    logpy = (-T.nnet.categorical_crossentropy(py.T, x['y'].T).T).reshape((1, -1))
    self.dist_px['y'] = theanofunc([x['x']] + list(v.values()) + list(w.values()) + [A], py)

    # log p(z) (prior of z)
    if self.type_pz == 'gaussianmarg':
        logpz = -0.5 * (np.log(2 * np.pi) + (q_mean**2 + T.exp(q_logvar))).sum(axis=0, keepdims=True)
    elif self.type_pz == 'gaussian':
        logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'laplace':
        logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'studentt':
        logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']), A)).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_pz")

    # log q(z|x) (entropy of z)
    if self.type_qz == 'gaussianmarg':
        logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0, keepdims=True)
    elif self.type_qz == 'gaussian':
        logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_qz")

    # Note: logpv and logpw are scalars
    def f_prior(_w, prior_sd=self.prior_sd):
        return ap.logpdfs.normal(_w, 0, prior_sd).sum()

    logpv = 0
    logpv += f_prior(v['w0'])
    for i in range(1, len(self.n_hidden_q)):
        logpv += f_prior(v['w' + str(i)])
    logpv += f_prior(v['mean_w'])
    if self.type_qz in ['gaussian', 'gaussianmarg']:
        logpv += f_prior(v['logvar_w'])

    logpw = 0
    logpw += f_prior(w['w0'])
    for i in range(1, len(self.n_hidden_p)):
        logpw += f_prior(w['w' + str(i)])
    logpw += f_prior(w['out_w'])
    logpw += f_prior(w['out_w_y'])
    if self.type_px in ['sigmoidgaussian', 'gaussian']:
        logpw += f_prior(w['out_logvar_w'])
    if self.type_pz == 'studentt':
        logpw += f_prior(w['logv'])

    # Custom grad, for when only 'x' is given (and not 'y')
    theanofunction = lazytheanofunc('warn', mode='FAST_RUN')
    dL2_dw = T.grad((logpx + logpz - logqz).sum(), list(v.values()) + list(w.values()),
                    disconnected_inputs='ignore')
    allvars = list(self.var_v.values()) + list(self.var_w.values()) + [x['x']] + list(z.values()) + [A]
    self.f_dL2_dw = theanofunction(allvars, [logpx, logpz, logqz] + dL2_dw)

    return logpv, logpw, logpx + logpy, logpz, logqz
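# --- Illustrative sketch (not part of the model code) ---
# For the 'gaussianmarg' branches above, logpz and logqz are E_q[log p(z)] and
# E_q[log q(z|x)] in closed form, so logpz - logqz is exactly minus the analytic
# KL divergence KL(q(z|x) || N(0, I)). NumPy check for one latent dimension
# (illustrative numbers only):
import numpy as np

mu, logvar = 0.7, -0.3                                        # q(z|x) = N(mu, exp(logvar))
logpz = -0.5 * (np.log(2 * np.pi) + mu**2 + np.exp(logvar))   # as in the code, per dimension
logqz = -0.5 * (np.log(2 * np.pi) + 1 + logvar)
kl = 0.5 * (mu**2 + np.exp(logvar) - 1 - logvar)              # analytic KL(q || N(0, 1))
assert np.isclose(logpz - logqz, -kl)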
def factors(self, x, z, A):
    v = self.v
    w = self.w
    '''
    z is unused
    x['x'] is the data

    The names of dict z[...] may be confusing here: the latent variable z is
    not included in the dict z[...], but implicitly computed from epsilon and
    parameters in w. z is computed with g(.) from eps and variational parameters.

    let logpx be the generative model density: log p(x|z) where z=g(.)
    let logpz be the prior of Z plus the entropy of q(z|x): log p(z) + H_q(z|x)
    So the lower bound L(x) = logpx + logpz

    let logpv and logpw be the (prior) density of the parameters
    '''

    def f_softplus(x): return T.log(T.exp(x) + 1)  # - np.log(2)
    def f_rectlin(x): return x * (x > 0)
    def f_rectlin2(x): return x * (x > 0) + 0.01 * x
    nonlinear = {'tanh': T.tanh, 'sigmoid': T.nnet.sigmoid, 'softplus': f_softplus,
                 'rectlin': f_rectlin, 'rectlin2': f_rectlin2}
    nonlinear_q = nonlinear[self.nonlinear_q]
    nonlinear_p = nonlinear[self.nonlinear_p]

    #rng = rng_curand.CURAND_RandomStreams(0)
    import theano.tensor.shared_randomstreams
    rng = theano.tensor.shared_randomstreams.RandomStreams(0)

    # Compute q(z|x,y)
    hidden_q = [nonlinear_q(T.dot(v['w0x'], x['x']) + T.dot(v['w0y'], x['y']) + T.dot(v['b0'], A))]
    for i in range(1, len(self.n_hidden_q)):
        hidden_q.append(nonlinear_q(T.dot(v['w' + str(i)], hidden_q[-1]) + T.dot(v['b' + str(i)], A)))

    q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
    if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
        q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(v['logvar_b'], A)
    else:
        raise Exception("Unknown type_qz")

    # function for distribution q(z|x)
    theanofunc = lazytheanofunc('warn', mode='FAST_RUN')
    self.dist_qz['z'] = theanofunc([x['x'], x['y']] + [A], [q_mean, q_logvar])

    # Compute virtual sample
    eps = rng.normal(size=q_mean.shape, dtype='float32')
    _z = q_mean + T.exp(0.5 * q_logvar) * eps

    # Compute log p(x|z)
    hidden_p = [nonlinear_p(T.dot(w['w0y'], x['y']) + T.dot(w['w0z'], _z) + T.dot(w['b0'], A))]
    for i in range(1, len(self.n_hidden_p)):
        hidden_p.append(nonlinear_p(T.dot(w['w' + str(i)], hidden_p[-1]) + T.dot(w['b' + str(i)], A)))

    if self.dropout:
        hidden_p[-1] *= 2. * (rng.uniform(size=hidden_p[-1].shape, dtype='float32') > .5)

    if self.type_px == 'bernoulli':
        p = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
        _logpx = -T.nnet.binary_crossentropy(p, x['x'])
        self.dist_px['x'] = theanofunc([x['y'], _z] + [A], p)
    elif self.type_px == 'gaussian':
        x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
        x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
        _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
        self.dist_px['x'] = theanofunc([x['y'], _z] + [A], [x_mean, x_logvar])
    elif self.type_px == 'laplace':
        x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
        x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
        _logpx = ap.logpdfs.laplace(x['x'], x_mean, x_logvar)
        self.dist_px['x'] = theanofunc([x['y'], _z] + [A], [x_mean, x_logvar])
    else:
        raise Exception("Unknown type_px")

    # Note: logpx is a row vector (one element per sample)
    logpx = T.dot(shared32(np.ones((1, self.n_x))), _logpx)  # logpx = log p(x|z,w)

    # log p(y) (prior of y)
    #_logpy = w['logpy']
    #if self.uniform_y: _logpy *= 0
    #py_model = T.nnet.softmax(T.dot(_logpy, A).T).T
    #logpy = (- T.nnet.categorical_crossentropy(py_model.T, x['y'].T).T).reshape((1,-1))
    #logpx += logpy
    #self.dist_px['y'] = theanofunc([A], py_model)

    # log p(z) (prior of z)
    if self.type_pz == 'gaussianmarg':
        logpz = -0.5 * (np.log(2 * np.pi) + (q_mean**2 + T.exp(q_logvar))).sum(axis=0, keepdims=True)
    elif self.type_pz == 'gaussian':
        logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'mog':
        pz = 0
        for i in range(self.n_mixture):
            pz += T.exp(ap.logpdfs.normal2(_z, T.dot(w['mog_mean' + str(i)], A), T.dot(w['mog_logvar' + str(i)], A)))
        logpz = T.log(pz).sum(axis=0, keepdims=True) - self.n_z * np.log(float(self.n_mixture))
    elif self.type_pz == 'laplace':
        logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'studentt':
        logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']), A)).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_pz")

    # log q(z|x) (entropy of z)
    if self.type_qz == 'gaussianmarg':
        logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0, keepdims=True)
    elif self.type_qz == 'gaussian':
        logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_qz")

    # Note: logpv and logpw are scalars
    def f_prior(_w, prior_sd=self.prior_sd):
        return ap.logpdfs.normal(_w, 0, prior_sd).sum()

    logpv = 0
    logpv += f_prior(v['w0x'])
    logpv += f_prior(v['w0y'])
    for i in range(1, len(self.n_hidden_q)):
        logpv += f_prior(v['w' + str(i)])
    logpv += f_prior(v['mean_w'])
    if self.type_qz in ['gaussian', 'gaussianmarg']:
        logpv += f_prior(v['logvar_w'])

    logpw = 0
    logpw += f_prior(w['w0y'])
    logpw += f_prior(w['w0z'])
    for i in range(1, len(self.n_hidden_p)):
        logpw += f_prior(w['w' + str(i)])
    logpw += f_prior(w['out_w'])
    if self.type_px in ['sigmoidgaussian', 'gaussian', 'laplace']:
        logpw += f_prior(w['out_logvar_w'])
    if self.type_pz == 'studentt':
        logpw += f_prior(w['logv'])

    #return logpv, logpw, logpx, logpz, logqz
    return logpx, logpz, logqz
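# --- Illustrative sketch (not part of the model code) ---
# The "virtual sample" above is the reparameterization trick: rather than sampling
# z ~ N(q_mean, exp(q_logvar)) directly, unit-variance noise eps is sampled and z is
# computed deterministically from it, so gradients flow through q_mean and q_logvar.
# NumPy version of the same transformation (illustrative values, (n_z, n_batch) layout):
import numpy as np

rng = np.random.RandomState(0)
q_mean = np.array([[0.5], [-1.0]])
q_logvar = np.array([[0.2], [-0.4]])
eps = rng.normal(size=q_mean.shape)          # eps ~ N(0, I)
z = q_mean + np.exp(0.5 * q_logvar) * eps    # z ~ N(q_mean, exp(q_logvar))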
def __init__(self, theano_warning='raise', hessian=True):
    theanofunction = lazytheanofunc('warn', mode='FAST_RUN')
    theanofunction_silent = lazytheanofunc('ignore', mode='FAST_RUN')

    # Create theano expressions
    w, x, z = ndict.ordereddicts(self.variables())
    self.var_w, self.var_x, self.var_z = w, x, z

    # Helper variables
    A = T.dmatrix('A')
    self.var_A = A

    # Get gradient symbols
    self.allvars = list(w.values()) + list(x.values()) + list(z.values()) + [A]  # note: '+' concatenates lists
    self.allvars_keys = list(w.keys()) + list(x.keys()) + list(z.keys()) + ['A']

    if False:
        # Put test values
        # needs fully implemented gen_xz(), which is not always the case
        # Also, the FD has no test values
        theano.config.compute_test_value = 'raise'
        _w = self.init_w()
        for i in _w:
            w[i].tag.test_value = _w[i]
        _x, _z, _ = self.gen_xz(_w, {}, {}, 10)
        _x, _z = self.xz_to_theano(_x, _z)
        for i in _x:
            x[i].tag.test_value = _x[i]
        for i in _z:
            z[i].tag.test_value = _z[i]

    # TODO: more beautiful/standardized way of setting distributions
    # (should be even simpler than this)
    self.dist_px = {}
    self.dist_pz = {}

    logpw, logpx, logpz = self.factors(w, x, z, A)
    self.var_logpw, self.var_logpx, self.var_logpz = logpw, logpx, logpz

    # Complete-data likelihood estimate
    logpxz = logpx.sum() + logpz.sum()
    self.f_logpxz = theanofunction(self.allvars, [logpx, logpz])

    dlogpxz_dwz = T.grad(logpxz, list(w.values()) + list(z.values()))
    self.f_dlogpxz_dwz = theanofunction(self.allvars, [logpx, logpz] + dlogpxz_dwz)
    #self.f_dlogpxz_dw = theanofunction(allvars, [logpxz] + dlogpxz_dw)
    #self.f_dlogpxz_dz = theanofunction(allvars, [logpxz] + dlogpxz_dz)

    # prior
    dlogpw_dw = T.grad(logpw, list(w.values()), disconnected_inputs='ignore')
    self.f_logpw = theanofunction(list(w.values()), logpw)
    self.f_dlogpw_dw = theanofunction(list(w.values()), [logpw] + dlogpw_dw)

    if False:
        # MC-LIKELIHOOD
        logpx_max = logpx.max()
        logpxmc = T.log(T.exp(logpx - logpx_max).mean()) + logpx_max
        self.f_logpxmc = theanofunction(self.allvars, logpxmc)
        dlogpxmc_dw = T.grad(logpxmc, list(w.values()), disconnected_inputs=theano_warning)
        self.f_dlogpxmc_dw = theanofunction(self.allvars, [logpxmc] + dlogpxmc_dw)

    if True and len(z) > 0:
        # Fisher divergence (FD)
        gz = T.grad(logpxz, list(z.values()))
        gz2 = [T.dmatrix() for _ in gz]
        fd = 0
        for i in range(len(gz)):
            fd += T.sum((gz[i] - gz2[i])**2)
        dfd_dw = T.grad(fd, list(w.values()))
        self.f_dfd_dw = theanofunction(self.allvars + gz2, [logpx, logpz, fd] + dfd_dw)

    if False and hessian:
        # Hessian of logpxz wrt z (works best with n_batch=1)
        hessian_z = theano.gradient.hessian(logpxz, z_concat)
        self.f_hessian_z = theanofunction(self.allvars, hessian_z)
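# --- Illustrative sketch (not part of the model code) ---
# The (disabled) MC-LIKELIHOOD block above uses the max-shifted log-sum-exp trick:
# log mean_i exp(logpx_i) is computed as log mean_i exp(logpx_i - m) + m with
# m = max_i logpx_i, which avoids underflow for very negative log-likelihoods.
import numpy as np

logpx = np.array([-1050.0, -1052.0, -1049.5])     # per-sample log-likelihood estimates
m = logpx.max()
logpxmc = np.log(np.exp(logpx - m).mean()) + m    # stable; exp() only sees values <= 0
assert np.isfinite(logpxmc)                       # naive np.exp(-1050.0) underflows to 0.0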
def factors(self, x, z, A):
    v = self.v
    w = self.w
    '''
    z is unused
    x['x'] is the data

    The names of dict z[...] may be confusing here: the latent variable z is
    not included in the dict z[...], but implicitly computed from epsilon and
    parameters in w. z is computed with g(.) from eps and variational parameters.

    let logpx be the generative model density: log p(x|z) where z=g(.)
    let logpz be the prior of Z plus the entropy of q(z|x): log p(z) + H_q(z|x)
    So the lower bound L(x) = logpx + logpz

    let logpv and logpw be the (prior) density of the parameters
    '''

    # Compute q(z|x)
    hidden_q = [x['x']]

    def f_softplus(x): return T.log(T.exp(x) + 1)  # - np.log(2)
    def f_rectlin(x): return x * (x > 0)
    def f_rectlin2(x): return x * (x > 0) + 0.01 * x
    nonlinear = {'tanh': T.tanh, 'sigmoid': T.nnet.sigmoid, 'softplus': f_softplus,
                 'rectlin': f_rectlin, 'rectlin2': f_rectlin2}
    nonlinear_q = nonlinear[self.nonlinear_q]
    nonlinear_p = nonlinear[self.nonlinear_p]

    #rng = rng_curand.CURAND_RandomStreams(0)
    import theano.tensor.shared_randomstreams
    rng = theano.tensor.shared_randomstreams.RandomStreams(0)

    # TOTAL HACK
    #hidden_q.append(nonlinear_q(T.dot(v['scale0'], A) * T.dot(w['out_w'].T, hidden_q[-1]) + T.dot(v['b0'], A)))
    #hidden_q.append(nonlinear_q(T.dot(v['scale1'], A) * T.dot(w['w1'].T, hidden_q[-1]) + T.dot(v['b1'], A)))

    for i in range(len(self.n_hidden_q)):
        hidden_q.append(nonlinear_q(T.dot(v['w' + str(i)], hidden_q[-1]) + T.dot(v['b' + str(i)], A)))
        if self.dropout:
            hidden_q[-1] *= 2. * (rng.uniform(size=hidden_q[-1].shape, dtype='float32') > .5)

    q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
    if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
        q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(v['logvar_b'], A)
    else:
        raise Exception("Unknown type_qz")

    # function for distribution q(z|x)
    theanofunc = lazytheanofunc('warn', mode='FAST_RUN')
    self.dist_qz['z'] = theanofunc([x['x']] + [A], [q_mean, q_logvar])

    # Compute virtual sample
    eps = rng.normal(size=q_mean.shape, dtype='float32')
    _z = q_mean + T.exp(0.5 * q_logvar) * eps

    # Compute log p(x|z)
    hidden_p = [_z]
    for i in range(len(self.n_hidden_p)):
        hidden_p.append(nonlinear_p(T.dot(w['w' + str(i)], hidden_p[-1]) + T.dot(w['b' + str(i)], A)))
        if self.dropout:
            hidden_p[-1] *= 2. * (rng.uniform(size=hidden_p[-1].shape, dtype='float32') > .5)

    if self.type_px == 'bernoulli':
        p = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
        _logpx = -T.nnet.binary_crossentropy(p, x['x'])
        self.dist_px['x'] = theanofunc([_z] + [A], p)
    elif self.type_px == 'gaussian':
        x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
        x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
        _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
        self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
    elif self.type_px == 'bounded01':
        x_mean = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
        x_logvar = T.dot(w['out_logvar_b'], A)
        _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
        # Make it a mixture between uniform and Gaussian
        w_unif = T.nnet.sigmoid(T.dot(w['out_unif'], A))
        _logpx = T.log(w_unif + (1 - w_unif) * T.exp(_logpx))
        self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
    else:
        raise Exception("Unknown type_px")

    # Note: logpx is a row vector (one element per sample)
    logpx = T.dot(shared32(np.ones((1, self.n_x))), _logpx)  # logpx = log p(x|z,w)

    # log p(z) (prior of z)
    if self.type_pz == 'gaussianmarg':
        logpz = -0.5 * (np.log(2 * np.pi) + (q_mean**2 + T.exp(q_logvar))).sum(axis=0, keepdims=True)
    elif self.type_pz == 'gaussian':
        logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'mog':
        pz = 0
        for i in range(self.n_mixture):
            pz += T.exp(ap.logpdfs.normal2(_z, T.dot(w['mog_mean' + str(i)], A), T.dot(w['mog_logvar' + str(i)], A)))
        logpz = T.log(pz).sum(axis=0, keepdims=True) - self.n_z * np.log(float(self.n_mixture))
    elif self.type_pz == 'laplace':
        logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'studentt':
        logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']), A)).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_pz")

    # log q(z|x) (entropy of z)
    if self.type_qz == 'gaussianmarg':
        logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0, keepdims=True)
    elif self.type_qz == 'gaussian':
        logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_qz")

    # [new part] Fisher divergence of latent variables
    if self.var_smoothing > 0:
        dlogq_dz = T.grad(logqz.sum(), _z)  # gives error when using gaussianmarg instead of gaussian
        dlogp_dz = T.grad((logpx + logpz).sum(), _z)
        FD = 0.5 * ((dlogq_dz - dlogp_dz)**2).sum(axis=0, keepdims=True)
        # [end new part]
        logqz -= self.var_smoothing * FD

    # Note: logpv and logpw are scalars
    if True:
        def f_prior(_w, prior_sd=self.prior_sd):
            return ap.logpdfs.normal(_w, 0, prior_sd).sum()
    else:
        def f_prior(_w, prior_sd=self.prior_sd):
            return ap.logpdfs.standard_laplace(_w / prior_sd).sum()

    return logpx, logpz, logqz
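# --- Illustrative sketch (not part of the model code) ---
# The var_smoothing block above penalizes the squared difference between the scores
# (gradients of the log-densities with respect to z) of q(z|x) and of the model
# density, a pointwise Fisher-divergence term subtracted from logqz. Simplified
# NumPy sketch for one latent value, using only the prior term of the model score:
import numpy as np

def gaussian_score(z, mu, var):
    # d/dz log N(z; mu, var)
    return -(z - mu) / var

z = 0.3
dlogq_dz = gaussian_score(z, mu=0.5, var=np.exp(-0.2))   # score of q(z|x), illustrative params
dlogp_dz = gaussian_score(z, mu=0.0, var=1.0)            # score of the prior N(0, 1)
FD = 0.5 * (dlogq_dz - dlogp_dz) ** 2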
def __init__(self, get_optimizer, theano_warning='raise'):
    v = self.v
    w = self.w
    theanofunction = lazytheanofunc('warn', mode='FAST_RUN')
    theanofunction_silent = lazytheanofunc('ignore', mode='FAST_RUN')

    # Create theano expressions
    x, z = ndict.ordereddicts(self.variables())
    self.var_x, self.var_z = x, z

    # Helper variables
    A = T.fmatrix('A')
    self.var_A = A

    '''
    # Get gradient symbols
    print 'model, x'
    for (d, xx) in x.items():
        print d
        print xx.shape
    print x.values()
    '''

    allvars = list(x.values()) + list(z.values()) + [A]  # note: '+' concatenates lists

    # TODO: more beautiful/standardized way of setting distributions
    # (should be even simpler than this)
    self.dist_qz = {}
    self.dist_px = {}
    self.dist_pz = {}

    factors = self.factors(x, z, A)
    if len(factors) == 3:
        (logpx, logpz, logqz) = factors
        cost = 0
        sparsity_penalty = 0
    else:
        (logpx, logpz, logqz, cost, sparsity_penalty) = factors

    if get_optimizer is None:
        # Fallback: identity updates (each parameter is mapped to itself), i.e. no optimization.
        def get_optimizer(w, g):
            from collections import OrderedDict
            updates = OrderedDict()
            for i in w:
                updates[w[i]] = w[i]
            return updates

    # Log-likelihood lower bound
    self.f_L = theanofunction(allvars, [logpx, logpz, logqz, cost, sparsity_penalty])
    L = (logpx + logpz - logqz).sum() - cost - sparsity_penalty
    g = T.grad(L, list(v.values()) + list(w.values()))
    gv = dict(zip(v.keys(), g[0:len(v)]))
    gw = dict(zip(w.keys(), g[len(v):len(v) + len(w)]))
    updates = get_optimizer(v, gv)
    updates.update(get_optimizer(w, gw))

    #self.profmode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker())
    #self.f_evalAndUpdate = theano.function(allvars, [logpx + logpz - logqz], updates=updates_w, mode=self.profmode)
    #theano.printing.debugprint(self.f_evalAndUpdate)

    self.f_eval_test = theanofunction(allvars, [logpx + logpz - logqz, logpx, logpz, -logqz])
    self.f_eval = theanofunction(allvars, [logpx + logpz - logqz])
    self.f_evalAndUpdate = theanofunction(allvars, [logpx + logpz - logqz], updates=updates)
    self.f_eval_for_classcondition_prior = theanofunction(allvars, [logpx - logqz])
def factors(self, x, z, A):
    v = self.v  # parameters of recognition model.
    w = self.w  # parameters of generative model.
    '''
    z is unused
    x['x'] is the data

    The names of dict z[...] may be confusing here: the latent variable z is
    not included in the dict z[...], but implicitly computed from epsilon and
    parameters in w. z is computed with g(.) from eps and variational parameters.

    let logpx be the generative model density: log p(x|z) where z=g(.)
    let logpz be the prior of Z plus the entropy of q(z|x): log p(z) + H_q(z|x)
    So the lower bound L(x) = logpx + logpz

    let logpv and logpw be the (prior) density of the parameters
    '''

    # Compute q(z|x)
    hidden_q = [x['x']]
    hidden_q_s = [x['x']]

    def f_softplus(x): return T.log(T.exp(x) + 1)  # - np.log(2)
    def f_rectlin(x): return x * (x > 0)
    def f_rectlin2(x): return x * (x > 0) + 0.01 * x
    nonlinear = {'tanh': T.tanh, 'sigmoid': T.nnet.sigmoid, 'softplus': f_softplus,
                 'rectlin': f_rectlin, 'rectlin2': f_rectlin2}
    nonlinear_q = nonlinear[self.nonlinear_q]
    nonlinear_p = nonlinear[self.nonlinear_p]

    #rng = rng_curand.CURAND_RandomStreams(0)
    import theano.tensor.shared_randomstreams
    rng = theano.tensor.shared_randomstreams.RandomStreams(0)

    # TOTAL HACK
    #hidden_q.append(nonlinear_q(T.dot(v['scale0'], A) * T.dot(w['out_w'].T, hidden_q[-1]) + T.dot(v['b0'], A)))
    #hidden_q.append(nonlinear_q(T.dot(v['scale1'], A) * T.dot(w['w1'].T, hidden_q[-1]) + T.dot(v['b1'], A)))

    for i in range(len(self.n_hidden_q)):
        hidden_q.append(nonlinear_q(T.dot(v['w' + str(i)], hidden_q[-1]) + T.dot(v['b' + str(i)], A)))
        hidden_q_s.append(T.nnet.sigmoid(T.dot(v['w' + str(i)], hidden_q_s[-1]) + T.dot(v['b' + str(i)], A)))
        if self.dropout:
            hidden_q[-1] *= 2. * (rng.uniform(size=hidden_q[-1].shape, dtype='float32') > .5)
            hidden_q_s[-1] *= 2. * (rng.uniform(size=hidden_q_s[-1].shape, dtype='float32') > .5)

    '''
    print 'mm_model'
    for (d, xx) in x.items():
        print d
    '''
    #print 'x', x['mean_prior'].type
    #print 'T', (T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)).type

    if not self.train_residual:
        q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
    else:
        q_mean = x['mean_prior'] + T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
    #q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)

    if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
        q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(v['logvar_b'], A)
    else:
        raise Exception("Unknown type_qz")

    ell = cast32(self.ell)
    self.param_c = shared32(0)
    sv = self.sv
    a_a = cast32(self.average_activation)
    s_w = cast32(self.sparsity_weight)

    def activate():
        res = 0
        if self.super_to_mean:
            lenw = len(v['W'].get_value())
            res += T.dot(v['W'][:-1, :].T, q_mean)
            res += T.dot(v['W'][lenw - 1:lenw, :].T, A)
        else:
            lenw = len(v['W'].get_value())
            for (hi, hidden) in enumerate(hidden_q[1 + sv:]):
                res += T.dot(v['W'][sum(self.n_hidden_q[sv:sv + hi]):sum(self.n_hidden_q[sv:sv + hi + 1]), :].T, hidden)
            res += T.dot(v['W'][lenw - 1:lenw, :].T, A)
        return res

    predy = T.argmax(activate(), axis=0)

    # function for distribution q(z|x)
    theanofunc = lazytheanofunc('warn', mode='FAST_RUN')
    self.dist_qz['z'] = theanofunc([x['x'], x['mean_prior']] + [A], [q_mean, q_logvar])
    self.dist_qz['hidden'] = theanofunc([x['x'], x['mean_prior']] + [A], hidden_q[1:])
    self.dist_qz['predy'] = theanofunc([x['x'], x['mean_prior']] + [A], predy)

    # compute cost (posterior regularization).
    true_resp = (activate() * x['y']).sum(axis=0, keepdims=True)
    T.addbroadcast(true_resp, 0)
    cost = self.param_c * (ell * (1 - x['y']) + activate() - true_resp).max(axis=0).sum() \
        + self.Lambda * (v['W'] * v['W']).sum()

    # compute the sparsity penalty
    sparsity_penalty = 0
    for i in range(1, len(hidden_q_s)):
        sparsity_penalty += (a_a * T.log(a_a / (hidden_q_s[i].mean(axis=1)))
                             + (1 - a_a) * T.log((1 - a_a) / (1 - (hidden_q_s[i].mean(axis=1))))).sum(axis=0)
    sparsity_penalty *= s_w

    # Compute virtual sample
    eps = rng.normal(size=q_mean.shape, dtype='float32')
    _z = q_mean + T.exp(0.5 * q_logvar) * eps

    # Compute log p(x|z)
    hidden_p = [_z]
    for i in range(len(self.n_hidden_p)):
        hidden_p.append(nonlinear_p(T.dot(w['w' + str(i)], hidden_p[-1]) + T.dot(w['b' + str(i)], A)))
        if self.dropout:
            hidden_p[-1] *= 2. * (rng.uniform(size=hidden_p[-1].shape, dtype='float32') > .5)

    if self.type_px == 'bernoulli':
        p = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
        _logpx = -T.nnet.binary_crossentropy(p, x['x'])
        self.dist_px['x'] = theanofunc([_z] + [A], p)
    elif self.type_px == 'gaussian':
        x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
        x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
        _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
        self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
    elif self.type_px == 'bounded01':
        x_mean = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
        x_logvar = T.dot(w['out_logvar_b'], A)
        _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
        # Make it a mixture between uniform and Gaussian
        w_unif = T.nnet.sigmoid(T.dot(w['out_unif'], A))
        _logpx = T.log(w_unif + (1 - w_unif) * T.exp(_logpx))
        self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
    else:
        raise Exception("Unknown type_px")

    # Note: logpx is a row vector (one element per sample)
    logpx = T.dot(shared32(np.ones((1, self.n_x))), _logpx)  # logpx = log p(x|z,w)

    # log p(z) (prior of z)
    if self.type_pz == 'gaussianmarg':
        if not self.train_residual:
            logpz = -0.5 * (np.log(2 * np.pi * self.sigma_square)
                            + ((q_mean - x['mean_prior'])**2 + T.exp(q_logvar)) / self.sigma_square).sum(axis=0, keepdims=True)
        else:
            logpz = -0.5 * (np.log(2 * np.pi * self.sigma_square)
                            + (q_mean**2 + T.exp(q_logvar)) / self.sigma_square).sum(axis=0, keepdims=True)
    elif self.type_pz == 'gaussian':
        logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'mog':
        pz = 0
        for i in range(self.n_mixture):
            pz += T.exp(ap.logpdfs.normal2(_z, T.dot(w['mog_mean' + str(i)], A), T.dot(w['mog_logvar' + str(i)], A)))
        logpz = T.log(pz).sum(axis=0, keepdims=True) - self.n_z * np.log(float(self.n_mixture))
    elif self.type_pz == 'laplace':
        logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'studentt':
        logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']), A)).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_pz")

    # log q(z|x) (entropy of z)
    if self.type_qz == 'gaussianmarg':
        logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0, keepdims=True)
    elif self.type_qz == 'gaussian':
        logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_qz")

    # [new part] Fisher divergence of latent variables
    if self.var_smoothing > 0:
        dlogq_dz = T.grad(logqz.sum(), _z)  # gives error when using gaussianmarg instead of gaussian
        dlogp_dz = T.grad((logpx + logpz).sum(), _z)
        FD = 0.5 * ((dlogq_dz - dlogp_dz)**2).sum(axis=0, keepdims=True)
        # [end new part]
        logqz -= self.var_smoothing * FD

    # Note: logpv and logpw are scalars
    if True:
        def f_prior(_w, prior_sd=self.prior_sd):
            return ap.logpdfs.normal(_w, 0, prior_sd).sum()
    else:
        def f_prior(_w, prior_sd=self.prior_sd):
            return ap.logpdfs.standard_laplace(_w / prior_sd).sum()

    return logpx, logpz, logqz, cost, sparsity_penalty
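# --- Illustrative sketch (not part of the model code) ---
# The sparsity penalty above is, per hidden unit, the KL divergence between a
# Bernoulli with target mean activation a_a and a Bernoulli with the unit's
# empirical mean activation over the minibatch, summed over units and scaled by
# sparsity_weight. NumPy version of the per-unit term (illustrative values):
import numpy as np

a_a = 0.1                                  # target average activation
rho_hat = np.array([0.08, 0.25, 0.12])     # empirical mean activations of three units
kl = a_a * np.log(a_a / rho_hat) + (1 - a_a) * np.log((1 - a_a) / (1 - rho_hat))
sparsity_penalty = 0.5 * kl.sum()          # 0.5 stands in for sparsity_weight here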
def factors(self, v, w, x, z, A):
    '''
    z['eps'] is the independent epsilons (Gaussian with unit variance)
    x['x'] is the data
    x['y'] is categorical data (1-of-K coded)

    The names of list z[...] may be confusing here: the latent variable z is
    not included in the list z[...], but implicitly computed from epsilon and
    parameters in w. z is computed with g(.) from eps and variational parameters.

    let logpx be the generative model density: log p(y) + log p(x|y,z) where z=g(.)
    let logpz be the prior of Z plus the entropy of q(z|x,y): log p(z) + H_q(z|x)
    So the lower bound L(x) = logpx + logpz

    let logpv and logpw be the (prior) density of the parameters
    '''

    def f_softplus(x): return T.log(T.exp(x) + 1)  # - np.log(2)
    def f_rectlin(x): return x * (x > 0)
    def f_rectlin2(x): return x * (x > 0) + 0.01 * x
    nonlinear = {'tanh': T.tanh, 'sigmoid': T.nnet.sigmoid, 'softplus': f_softplus,
                 'rectlin': f_rectlin, 'rectlin2': f_rectlin2}
    nonlinear_q = nonlinear[self.nonlinear_q]
    nonlinear_p = nonlinear[self.nonlinear_p]

    # Compute q(z|x,y)
    hidden_q = [nonlinear_q(T.dot(v['w0x'], x['x']) + T.dot(v['w0y'], x['y']) + T.dot(v['b0'], A))]
    for i in range(1, len(self.n_hidden_q)):
        hidden_q.append(nonlinear_q(T.dot(v['w' + str(i)], hidden_q[-1]) + T.dot(v['b' + str(i)], A)))

    q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
    if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
        q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(v['logvar_b'], A)
    else:
        raise Exception("Unknown type_qz")

    # function for distribution q(z|x)
    theanofunc = lazytheanofunc('ignore', mode='FAST_RUN')
    self.dist_qz['z'] = theanofunc([x['x'], x['y']] + list(v.values()) + [A], [q_mean, q_logvar])

    # Compute virtual sample
    _z = q_mean + T.exp(0.5 * q_logvar) * z['eps']

    # Compute log p(x|y,z)
    hidden_p = [nonlinear_p(T.dot(w['w0y'], x['y']) + T.dot(w['w0z'], _z) + T.dot(w['b0'], A))]
    for i in range(1, len(self.n_hidden_p)):
        hidden_p.append(nonlinear_p(T.dot(w['w' + str(i)], hidden_p[-1]) + T.dot(w['b' + str(i)], A)))

    if self.type_px == 'bernoulli':
        p = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
        _logpx = -T.nnet.binary_crossentropy(p, x['x'])
        self.dist_px['x'] = theanofunc([x['y'], _z] + list(w.values()) + [A], p)
    elif self.type_px == 'gaussian' or self.type_px == 'sigmoidgaussian':
        x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
        if self.type_px == 'sigmoidgaussian':
            x_mean = T.nnet.sigmoid(x_mean)
        x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
        _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
        self.dist_px['x'] = theanofunc([x['y'], _z] + list(w.values()) + [A], [x_mean, x_logvar])
    else:
        raise Exception("Unknown type_px")

    # Note: logpx is a row vector (one element per sample)
    logpx = T.dot(np.ones((1, self.n_x)), _logpx)  # logpx = log p(y|w) + log p(x|z,w)

    # log p(y) (prior of y)
    _logpy = w['logpy']
    if self.uniform_y:
        _logpy *= 0
    py_model = T.nnet.softmax(T.dot(_logpy, A).T).T
    logpy = (-T.nnet.categorical_crossentropy(py_model.T, x['y'].T).T).reshape((1, -1))
    logpx += logpy
    self.dist_px['y'] = theanofunc(list(w.values()) + [A], py_model)

    # log p(z) (prior of z)
    if self.type_pz == 'gaussianmarg':
        logpz = -0.5 * (np.log(2 * np.pi) + (q_mean**2 + T.exp(q_logvar))).sum(axis=0, keepdims=True)
    elif self.type_pz == 'gaussian':
        logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'laplace':
        logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'studentt':
        logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']), A)).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_pz")

    # log q(z|x) (entropy of z)
    if self.type_qz == 'gaussianmarg':
        logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0, keepdims=True)
    elif self.type_qz == 'gaussian':
        logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_qz")

    # Note: logpv and logpw are scalars
    def f_prior(_w, prior_sd=self.prior_sd):
        return ap.logpdfs.normal(_w, 0, prior_sd).sum()

    logpv = 0
    logpv += f_prior(v['w0x'])
    logpv += f_prior(v['w0y'])
    for i in range(1, len(self.n_hidden_q)):
        logpv += f_prior(v['w' + str(i)])
    logpv += f_prior(v['mean_w'])
    if self.type_qz in ['gaussian', 'gaussianmarg']:
        logpv += f_prior(v['logvar_w'])

    logpw = 0
    logpw += f_prior(w['w0y'])
    logpw += f_prior(w['w0z'])
    for i in range(1, len(self.n_hidden_p)):
        logpw += f_prior(w['w' + str(i)])
    logpw += f_prior(w['out_w'])
    if self.type_px in ['sigmoidgaussian', 'gaussian']:
        logpw += f_prior(w['out_logvar_w'])
    if self.type_pz == 'studentt':
        logpw += f_prior(w['logv'])

    return logpv, logpw, logpx, logpz, logqz
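# --- Illustrative sketch (not part of the model code) ---
# log p(y) above is a learned categorical prior: the parameter vector w['logpy'] is
# broadcast over the minibatch (assuming A is the usual 1 x n_batch matrix of ones
# used to replicate biases) and pushed through a softmax; with uniform_y the
# parameters are zeroed, giving the uniform prior 1/K. NumPy check for K = 10:
import numpy as np

K = 10
logpy_param = np.zeros(K)                             # uniform_y: parameters zeroed
py = np.exp(logpy_param) / np.exp(logpy_param).sum()
assert np.allclose(np.log(py), -np.log(K))            # log p(y) = log(1/K) for every class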
def factors(self, v, w, x, z, A):
    '''
    z['eps'] is the independent epsilons (Gaussian with unit variance)
    x['x'] is the data

    The names of dict z[...] may be confusing here: the latent variable z is
    not included in the dict z[...], but implicitly computed from epsilon and
    parameters in w. z is computed with g(.) from eps and variational parameters.

    let logpx be the generative model density: log p(x|z) where z=g(.)
    let logpz be the prior of Z plus the entropy of q(z|x): log p(z) + H_q(z|x)
    So the lower bound L(x) = logpx + logpz

    let logpv and logpw be the (prior) density of the parameters
    '''

    # Compute q(z|x)
    hidden_q = [x['x']]

    def f_softplus(x): return T.log(T.exp(x) + 1)  # - np.log(2)
    def f_rectlin(x): return x * (x > 0)
    def f_rectlin2(x): return x * (x > 0) + 0.01 * x
    nonlinear = {'tanh': T.tanh, 'sigmoid': T.nnet.sigmoid, 'softplus': f_softplus,
                 'rectlin': f_rectlin, 'rectlin2': f_rectlin2}
    nonlinear_q = nonlinear[self.nonlinear_q]
    nonlinear_p = nonlinear[self.nonlinear_p]

    for i in range(len(self.n_hidden_q)):
        hidden_q.append(nonlinear_q(T.dot(v['w' + str(i)], hidden_q[-1]) + T.dot(v['b' + str(i)], A)))

    q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
    if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
        q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(v['logvar_b'], A)
    else:
        raise Exception("Unknown type_qz")

    # function for distribution q(z|x)
    theanofunc = lazytheanofunc('warn', mode='FAST_RUN')
    self.dist_qz['z'] = theanofunc([x['x']] + list(v.values()) + [A], [q_mean, q_logvar])

    # Compute virtual sample
    _z = q_mean + T.exp(0.5 * q_logvar) * z['eps']

    # Compute log p(x|z)
    hidden_p = [_z]
    for i in range(len(self.n_hidden_p)):
        hidden_p.append(nonlinear_p(T.dot(w['w' + str(i)], hidden_p[-1]) + T.dot(w['b' + str(i)], A)))

    if self.type_px == 'bernoulli':
        p = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
        _logpx = -T.nnet.binary_crossentropy(p, x['x'])
        self.dist_px['x'] = theanofunc([_z] + list(w.values()) + [A], p)
    elif self.type_px == 'gaussian':
        x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
        x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
        _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
        self.dist_px['x'] = theanofunc([_z] + list(w.values()) + [A], [x_mean, x_logvar])
    elif self.type_px == 'bounded01':
        x_mean = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
        x_logvar = T.dot(w['out_logvar_b'], A)
        _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
        # Make it a mixture between uniform and Gaussian
        w_unif = T.nnet.sigmoid(T.dot(w['out_unif'], A))
        _logpx = T.log(w_unif + (1 - w_unif) * T.exp(_logpx))
        self.dist_px['x'] = theanofunc([_z] + list(w.values()) + [A], [x_mean, x_logvar])
    else:
        raise Exception("Unknown type_px")

    # Note: logpx is a row vector (one element per sample)
    logpx = T.dot(np.ones((1, self.n_x)), _logpx)  # logpx = log p(x|z,w)

    # log p(z) (prior of z)
    if self.type_pz == 'gaussianmarg':
        logpz = -0.5 * (np.log(2 * np.pi) + (q_mean**2 + T.exp(q_logvar))).sum(axis=0, keepdims=True)
    elif self.type_pz == 'gaussian':
        logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'laplace':
        logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'studentt':
        logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']), A)).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_pz")

    # log q(z|x) (entropy of z)
    if self.type_qz == 'gaussianmarg':
        logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0, keepdims=True)
    elif self.type_qz == 'gaussian':
        logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_qz")

    # [new part] Fisher divergence of latent variables
    if self.var_smoothing > 0:
        dlogq_dz = T.grad(logqz.sum(), _z)  # gives error when using gaussianmarg instead of gaussian
        dlogp_dz = T.grad((logpx + logpz).sum(), _z)
        FD = 0.5 * ((dlogq_dz - dlogp_dz)**2).sum(axis=0, keepdims=True)
        # [end new part]
        logqz -= self.var_smoothing * FD

    # Note: logpv and logpw are scalars
    if True:
        def f_prior(_w, prior_sd=self.prior_sd):
            return ap.logpdfs.normal(_w, 0, prior_sd).sum()
    else:
        def f_prior(_w, prior_sd=self.prior_sd):
            return ap.logpdfs.standard_laplace(_w / prior_sd).sum()

    logpv = 0
    for i in range(len(self.n_hidden_q)):
        logpv += f_prior(v['w' + str(i)])
    logpv += f_prior(v['mean_w'])
    if self.type_qz in ['gaussian', 'gaussianmarg']:
        logpv += f_prior(v['logvar_w'])

    logpw = 0
    for i in range(len(self.n_hidden_p)):
        logpw += f_prior(w['w' + str(i)])
    logpw += f_prior(w['out_w'])
    if self.type_px == 'gaussian':
        logpw += f_prior(w['out_logvar_w'])
    if self.type_pz == 'studentt':
        logpw += f_prior(w['logv'])

    return logpv, logpw, logpx, logpz, logqz
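# --- Illustrative sketch (not part of the model code) ---
# logpv and logpw above are isotropic Gaussian log-priors over the parameters: each
# weight contributes log N(w; 0, prior_sd^2), and f_prior sums this over a whole
# matrix. NumPy version of the per-matrix term (illustrative weights):
import numpy as np

prior_sd = 0.1
W = np.array([[0.05, -0.02], [0.01, 0.03]])
logp_W = (-0.5 * np.log(2 * np.pi * prior_sd**2) - W**2 / (2 * prior_sd**2)).sum()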
def factors(self, x, z, A):
    v = self.v  # parameters of recognition model
    w = self.w  # parameters of generative model
    '''
    z is unused; x['x'] is the data.

    The name of the dict z[...] may be confusing here: the latent variable z is
    not included in the dict z[...], but implicitly computed from epsilon and
    the variational parameters, i.e. z = g(eps, .).

    let logpx be the generative model density: log p(x|z) where z = g(.)
    let logpz be the log-prior of z and logqz the log-density of q(z|x),
    so the lower bound is L(x) = logpx + logpz - logqz
    (the -logqz term contributes the entropy of q(z|x)).

    let logpv and logpw be the (prior) densities of the parameters.
    '''

    # Compute q(z|x)
    hidden_q = [x['x']]
    hidden_q_s = [x['x']]

    def f_softplus(x):
        return T.log(T.exp(x) + 1)  # - np.log(2)

    def f_rectlin(x):
        return x * (x > 0)

    def f_rectlin2(x):
        return x * (x > 0) + 0.01 * x

    nonlinear = {
        'tanh': T.tanh,
        'sigmoid': T.nnet.sigmoid,
        'softplus': f_softplus,
        'rectlin': f_rectlin,
        'rectlin2': f_rectlin2
    }
    nonlinear_q = nonlinear[self.nonlinear_q]
    nonlinear_p = nonlinear[self.nonlinear_p]

    #rng = rng_curand.CURAND_RandomStreams(0)
    import theano.tensor.shared_randomstreams
    rng = theano.tensor.shared_randomstreams.RandomStreams(0)

    # TOTAL HACK
    #hidden_q.append(nonlinear_q(T.dot(v['scale0'], A) * T.dot(w['out_w'].T, hidden_q[-1]) + T.dot(v['b0'], A)))
    #hidden_q.append(nonlinear_q(T.dot(v['scale1'], A) * T.dot(w['w1'].T, hidden_q[-1]) + T.dot(v['b1'], A)))

    # Recognition MLP: hidden_q uses the configured nonlinearity; hidden_q_s is a
    # sigmoid copy used only for the sparsity penalty below.
    for i in range(len(self.n_hidden_q)):
        hidden_q.append(nonlinear_q(T.dot(v['w' + str(i)], hidden_q[-1]) + T.dot(v['b' + str(i)], A)))
        hidden_q_s.append(T.nnet.sigmoid(T.dot(v['w' + str(i)], hidden_q_s[-1]) + T.dot(v['b' + str(i)], A)))
        if self.dropout:
            hidden_q[-1] *= 2. * (rng.uniform(size=hidden_q[-1].shape, dtype='float32') > .5)
            hidden_q_s[-1] *= 2. * (rng.uniform(size=hidden_q_s[-1].shape, dtype='float32') > .5)

    if not self.train_residual:
        q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
    else:
        q_mean = x['mean_prior'] + T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)

    if self.type_qz in ('gaussian', 'gaussianmarg'):
        q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(v['logvar_b'], A)
    else:
        raise Exception('Unknown type_qz')

    ell = cast32(self.ell)
    self.param_c = shared32(0)
    sv = self.sv
    a_a = cast32(self.average_activation)
    s_w = cast32(self.sparsity_weight)

    def activate():
        # Linear classifier on top of the recognition features (the last row of W acts on A).
        res = 0
        lenw = len(v['W'].get_value())
        if self.super_to_mean:
            res += T.dot(v['W'][:-1, :].T, q_mean)
            res += T.dot(v['W'][lenw - 1:lenw, :].T, A)
        else:
            for (hi, hidden) in enumerate(hidden_q[1 + sv:]):
                res += T.dot(v['W'][sum(self.n_hidden_q[sv:sv + hi]):sum(self.n_hidden_q[sv:sv + hi + 1]), :].T, hidden)
            res += T.dot(v['W'][lenw - 1:lenw, :].T, A)
        return res

    predy = T.argmax(activate(), axis=0)

    # functions for distribution q(z|x)
    theanofunc = lazytheanofunc('warn', mode='FAST_RUN')
    self.dist_qz['z'] = theanofunc([x['x'], x['mean_prior']] + [A], [q_mean, q_logvar])
    self.dist_qz['hidden'] = theanofunc([x['x'], x['mean_prior']] + [A], hidden_q[1:])
    self.dist_qz['predy'] = theanofunc([x['x'], x['mean_prior']] + [A], predy)

    # Compute cost (posterior regularization): a hinge-style loss on the true-class
    # response plus an L2 penalty on the classifier weights.
    true_resp = (activate() * x['y']).sum(axis=0, keepdims=True)
    true_resp = T.addbroadcast(true_resp, 0)
    cost = self.param_c * (ell * (1 - x['y']) + activate() - true_resp).max(axis=0).sum() \
        + self.Lambda * (v['W'] * v['W']).sum()

    # Compute the sparsity penalty: KL between the target activation a_a and the
    # mean sigmoid activation of each recognition layer.
    sparsity_penalty = 0
    for i in range(1, len(hidden_q_s)):
        sparsity_penalty += (a_a * T.log(a_a / (hidden_q_s[i].mean(axis=1))) +
                             (1 - a_a) * T.log((1 - a_a) / (1 - (hidden_q_s[i].mean(axis=1))))).sum(axis=0)
    sparsity_penalty *= s_w

    # Compute virtual sample (reparameterization trick)
    eps = rng.normal(size=q_mean.shape, dtype='float32')
    _z = q_mean + T.exp(0.5 * q_logvar) * eps

    # Compute log p(x|z)
    hidden_p = [_z]
    for i in range(len(self.n_hidden_p)):
        hidden_p.append(nonlinear_p(T.dot(w['w' + str(i)], hidden_p[-1]) + T.dot(w['b' + str(i)], A)))
        if self.dropout:
            hidden_p[-1] *= 2. * (rng.uniform(size=hidden_p[-1].shape, dtype='float32') > .5)

    if self.type_px == 'bernoulli':
        p = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
        _logpx = -T.nnet.binary_crossentropy(p, x['x'])
        self.dist_px['x'] = theanofunc([_z] + [A], p)
    elif self.type_px == 'gaussian':
        x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
        x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
        _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
        self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
    elif self.type_px == 'bounded01':
        x_mean = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
        x_logvar = T.dot(w['out_logvar_b'], A)
        _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
        # Make it a mixture between uniform and Gaussian
        w_unif = T.nnet.sigmoid(T.dot(w['out_unif'], A))
        _logpx = T.log(w_unif + (1 - w_unif) * T.exp(_logpx))
        self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
    else:
        raise Exception("Unknown type_px")

    # Note: logpx is a row vector (one element per sample)
    logpx = T.dot(shared32(np.ones((1, self.n_x))), _logpx)  # logpx = log p(x|z,w)

    # log p(z) (prior of z)
    if self.type_pz == 'gaussianmarg':
        if not self.train_residual:
            logpz = -0.5 * (np.log(2 * np.pi * self.sigma_square) +
                            ((q_mean - x['mean_prior'])**2 + T.exp(q_logvar)) / self.sigma_square).sum(axis=0, keepdims=True)
        else:
            logpz = -0.5 * (np.log(2 * np.pi * self.sigma_square) +
                            (q_mean**2 + T.exp(q_logvar)) / self.sigma_square).sum(axis=0, keepdims=True)
    elif self.type_pz == 'gaussian':
        logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'mog':
        pz = 0
        for i in range(self.n_mixture):
            pz += T.exp(ap.logpdfs.normal2(_z, T.dot(w['mog_mean' + str(i)], A), T.dot(w['mog_logvar' + str(i)], A)))
        logpz = T.log(pz).sum(axis=0, keepdims=True) - self.n_z * np.log(float(self.n_mixture))
    elif self.type_pz == 'laplace':
        logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'studentt':
        logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']), A)).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_pz")

    # log q(z|x) (entropy of z)
    if self.type_qz == 'gaussianmarg':
        logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0, keepdims=True)
    elif self.type_qz == 'gaussian':
        logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0, keepdims=True)
    else:
        raise Exception('Unknown type_qz')

    # [new part] Fisher divergence of latent variables
    if self.var_smoothing > 0:
        dlogq_dz = T.grad(logqz.sum(), _z)  # gives error when using gaussianmarg instead of gaussian
        dlogp_dz = T.grad((logpx + logpz).sum(), _z)
        FD = 0.5 * ((dlogq_dz - dlogp_dz)**2).sum(axis=0, keepdims=True)
        logqz -= self.var_smoothing * FD
    # [end new part]

    # Note: logpv and logpw are scalars; f_prior is the parameter prior
    # (currently the Gaussian branch; the Laplace branch is kept for reference).
    if True:
        def f_prior(_w, prior_sd=self.prior_sd):
            return ap.logpdfs.normal(_w, 0, prior_sd).sum()
    else:
        def f_prior(_w, prior_sd=self.prior_sd):
            return ap.logpdfs.standard_laplace(_w / prior_sd).sum()

    return logpx, logpz, logqz, cost, sparsity_penalty
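The 'gaussianmarg' branches above pair an analytically marginalized prior term with an analytic entropy term. As a sanity check, the following small numpy sketch (illustrative shapes only, assuming mean_prior = 0 and sigma_square = 1, which need not match the model's actual settings) verifies that logpz - logqz then reduces to the familiar closed-form negative KL divergence between q(z|x) and a standard-normal prior.

import numpy as np

# Illustrative posterior statistics with the same (n_z, n_batch) layout as q_mean / q_logvar.
n_z, n_batch = 4, 3
rnd = np.random.RandomState(0)
q_mean = rnd.randn(n_z, n_batch)
q_logvar = rnd.randn(n_z, n_batch)

# 'gaussianmarg' terms, assuming mean_prior = 0 and sigma_square = 1:
logpz = -0.5 * (np.log(2 * np.pi) + q_mean**2 + np.exp(q_logvar)).sum(axis=0, keepdims=True)
logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0, keepdims=True)

# Their difference is -KL(q(z|x) || N(0, I)) in its usual closed form:
neg_kl = 0.5 * (1 + q_logvar - q_mean**2 - np.exp(q_logvar)).sum(axis=0, keepdims=True)
assert np.allclose(logpz - logqz, neg_kl)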
def factors(self, x, z, A):
    v = self.v  # parameters of recognition model
    w = self.w  # parameters of generative model
    '''
    z is unused; x['x'] is the data.

    The name of the dict z[...] may be confusing here: the latent variable z is
    not included in the dict z[...], but implicitly computed from epsilon and
    the variational parameters, i.e. z = g(eps, .).

    let logpx be the generative model density: log p(x|z) where z = g(.)
    let logpz be the log-prior of z and logqz the log-density of q(z|x),
    so the lower bound is L(x) = logpx + logpz - logqz
    (the -logqz term contributes the entropy of q(z|x)).

    let logpv and logpw be the (prior) densities of the parameters.
    '''

    # Compute q(z|x)
    hidden_q = [x['x']]

    def f_softplus(x):
        return T.log(T.exp(x) + 1)  # - np.log(2)

    def f_rectlin(x):
        return x * (x > 0)

    def f_rectlin2(x):
        return x * (x > 0) + 0.01 * x

    nonlinear = {
        'tanh': T.tanh,
        'sigmoid': T.nnet.sigmoid,
        'softplus': f_softplus,
        'rectlin': f_rectlin,
        'rectlin2': f_rectlin2
    }
    nonlinear_q = nonlinear[self.nonlinear_q]
    nonlinear_p = nonlinear[self.nonlinear_p]

    #rng = rng_curand.CURAND_RandomStreams(0)
    import theano.tensor.shared_randomstreams
    rng = theano.tensor.shared_randomstreams.RandomStreams(0)

    # TOTAL HACK
    #hidden_q.append(nonlinear_q(T.dot(v['scale0'], A) * T.dot(w['out_w'].T, hidden_q[-1]) + T.dot(v['b0'], A)))
    #hidden_q.append(nonlinear_q(T.dot(v['scale1'], A) * T.dot(w['w1'].T, hidden_q[-1]) + T.dot(v['b1'], A)))

    for i in range(len(self.n_hidden_q)):
        hidden_q.append(nonlinear_q(T.dot(v['w' + str(i)], hidden_q[-1]) + T.dot(v['b' + str(i)], A)))
        if self.dropout:
            hidden_q[-1] *= 2. * (rng.uniform(size=hidden_q[-1].shape, dtype='float32') > .5)

    if not self.train_residual:
        q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
    else:
        q_mean = x['mean_prior'] + T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)

    if self.type_qz in ('gaussian', 'gaussianmarg'):
        q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(v['logvar_b'], A)
    else:
        raise Exception('Unknown type_qz')

    # functions for distribution q(z|x)
    theanofunc = lazytheanofunc('warn', mode='FAST_RUN')
    self.dist_qz['z'] = theanofunc([x['x'], x['mean_prior']] + [A], [q_mean, q_logvar])
    self.dist_qz['hidden'] = theanofunc([x['x'], x['mean_prior']] + [A], hidden_q[1:])

    # Compute virtual sample (reparameterization trick)
    eps = rng.normal(size=q_mean.shape, dtype='float32')
    _z = q_mean + T.exp(0.5 * q_logvar) * eps

    # Compute log p(x|z)
    hidden_p = [_z]
    for i in range(len(self.n_hidden_p)):
        hidden_p.append(nonlinear_p(T.dot(w['w' + str(i)], hidden_p[-1]) + T.dot(w['b' + str(i)], A)))
        if self.dropout:
            hidden_p[-1] *= 2. * (rng.uniform(size=hidden_p[-1].shape, dtype='float32') > .5)

    if self.type_px == 'bernoulli':
        p = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
        _logpx = -T.nnet.binary_crossentropy(p, x['x'])
        self.dist_px['x'] = theanofunc([_z] + [A], p)
    elif self.type_px == 'gaussian':
        x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
        x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
        _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
        self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
    elif self.type_px == 'bounded01':
        x_mean = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
        x_logvar = T.dot(w['out_logvar_b'], A)
        _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
        # Make it a mixture between uniform and Gaussian
        w_unif = T.nnet.sigmoid(T.dot(w['out_unif'], A))
        _logpx = T.log(w_unif + (1 - w_unif) * T.exp(_logpx))
        self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
    elif self.type_px == 'exponential':
        log_x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
        _logpx = ap.logpdfs.exp(x['x'], log_x_mean)
        self.dist_px['x'] = theanofunc([_z] + [A], [log_x_mean])
    elif self.type_px == 'mixture':
        # Per-dimension mixture of likelihoods; the index lists below are
        # dataset-specific feature groups.
        x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
        x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
        normal_list = np.asarray([1, 6, 10, 14, 18])
        exponential_list = np.asarray([0, 3, 5, 9, 13, 17, 21, 22, 23, 24, 25, 26, 27])
        uniform_list = np.asarray([2, 4, 7, 11, 15, 19])
        threemodal_list = np.asarray([8, 12, 16, 20])
        _logpx_normal = ap.logpdfs.normal2(x['x'][normal_list, :], x_mean[normal_list, :], x_logvar[normal_list, :])
        _logpx_exponential = ap.logpdfs.exp(x['x'][exponential_list, :], x_mean[exponential_list, :])
        _logpx = _logpx_normal + _logpx_exponential
        self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
    else:
        raise Exception("Unknown type_px")

    # Note: logpx is a row vector (one element per sample)
    logpx = T.dot(shared32(np.ones((1, self.n_x))), _logpx)  # logpx = log p(x|z,w)

    # log p(z) (prior of z)
    if self.type_pz == 'gaussianmarg':
        if not self.train_residual:
            logpz = -0.5 * (np.log(2 * np.pi * self.sigma_square) +
                            ((q_mean - x['mean_prior'])**2 + T.exp(q_logvar)) / self.sigma_square).sum(axis=0, keepdims=True)
        else:
            logpz = -0.5 * (np.log(2 * np.pi * self.sigma_square) +
                            (q_mean**2 + T.exp(q_logvar)) / self.sigma_square).sum(axis=0, keepdims=True)
    elif self.type_pz == 'gaussian':
        logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'mog':
        pz = 0
        for i in range(self.n_mixture):
            pz += T.exp(ap.logpdfs.normal2(_z, T.dot(w['mog_mean' + str(i)], A), T.dot(w['mog_logvar' + str(i)], A)))
        logpz = T.log(pz).sum(axis=0, keepdims=True) - self.n_z * np.log(float(self.n_mixture))
    elif self.type_pz == 'laplace':
        logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
    elif self.type_pz == 'studentt':
        logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']), A)).sum(axis=0, keepdims=True)
    else:
        raise Exception("Unknown type_pz")

    # log q(z|x) (entropy of z)
    if self.type_qz == 'gaussianmarg':
        logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0, keepdims=True)
    elif self.type_qz == 'gaussian':
        logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0, keepdims=True)
    else:
        raise Exception('Unknown type_qz')

    # [new part] Fisher divergence of latent variables
    if self.var_smoothing > 0:
        dlogq_dz = T.grad(logqz.sum(), _z)  # gives error when using gaussianmarg instead of gaussian
        dlogp_dz = T.grad((logpx + logpz).sum(), _z)
        FD = 0.5 * ((dlogq_dz - dlogp_dz)**2).sum(axis=0, keepdims=True)
        logqz -= self.var_smoothing * FD
    # [end new part]

    # Note: logpv and logpw are scalars; f_prior is the parameter prior
    # (currently the Gaussian branch; the Laplace branch is kept for reference).
    if True:
        def f_prior(_w, prior_sd=self.prior_sd):
            return ap.logpdfs.normal(_w, 0, prior_sd).sum()
    else:
        def f_prior(_w, prior_sd=self.prior_sd):
            return ap.logpdfs.standard_laplace(_w / prior_sd).sum()

    return logpx, logpz, logqz
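A note on the "logpx is a row vector" comment: the T.dot with a (1, n_x) vector of ones is simply a column sum that collapses the per-dimension log-likelihoods into one value per sample. A minimal numpy sketch (illustrative shapes only) showing the equivalence:

import numpy as np

# Illustrative per-dimension log-likelihoods, one column per sample.
n_x, n_batch = 5, 3
_logpx = np.random.RandomState(1).randn(n_x, n_batch)

logpx_dot = np.dot(np.ones((1, n_x)), _logpx)  # as in the code above
logpx_sum = _logpx.sum(axis=0, keepdims=True)  # equivalent column sum
assert np.allclose(logpx_dot, logpx_sum)       # shape (1, n_batch): one value per sample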
def __init__(self, get_optimizer, theano_warning='raise'):
    v = self.v
    w = self.w

    theanofunction = lazytheanofunc('warn', mode='FAST_RUN')
    theanofunction_silent = lazytheanofunc('ignore', mode='FAST_RUN')

    # Create theano expressions
    x, z = ndict.ordereddicts(self.variables())
    self.var_x, self.var_z, = x, z

    # Helper variables
    A = T.fmatrix('A')
    self.var_A = A

    # Get gradient symbols
    allvars = list(x.values()) + list(z.values()) + [A]  # note: '+' concatenates lists

    # TODO: more beautiful/standardized way of setting distributions
    # (should be even simpler than this)
    self.dist_qz = {}
    self.dist_px = {}
    self.dist_pz = {}

    # factors() returns either the plain lower-bound terms or, for the supervised
    # variant above, the additional classification cost and sparsity penalty.
    factors = self.factors(x, z, A)
    if len(factors) == 3:
        (logpx, logpz, logqz) = factors
        cost = 0
        sparsity_penalty = 0
    else:
        (logpx, logpz, logqz, cost, sparsity_penalty) = factors

    if get_optimizer is None:
        # Default: a no-op "optimizer" that maps each parameter to itself.
        def get_optimizer(w, g):
            from collections import OrderedDict
            updates = OrderedDict()
            for i in w:
                updates[w[i]] = w[i]
            return updates

    # Log-likelihood lower bound
    self.f_L = theanofunction(allvars, [logpx, logpz, logqz, cost, sparsity_penalty])
    L = (logpx + logpz - logqz).sum() - cost - sparsity_penalty
    g = T.grad(L, list(v.values()) + list(w.values()))
    gv = dict(zip(list(v.keys()), g[0:len(v)]))
    gw = dict(zip(list(w.keys()), g[len(v):len(v) + len(w)]))
    updates = get_optimizer(v, gv)
    updates.update(get_optimizer(w, gw))

    #self.profmode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker())
    #self.f_evalAndUpdate = theano.function(allvars, [logpx + logpz - logqz], updates=updates_w, mode=self.profmode)
    #theano.printing.debugprint(self.f_evalAndUpdate)

    self.f_eval_test = theanofunction(allvars, [logpx + logpz - logqz, logpx, logpz, -logqz])
    self.f_eval = theanofunction(allvars, [logpx + logpz - logqz])
    self.f_evalAndUpdate = theanofunction(allvars, [logpx + logpz - logqz], updates=updates)
    self.f_eval_for_classcondition_prior = theanofunction(allvars, [logpx - logqz])
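When no optimizer is supplied, the constructor falls back to the no-op update rule above, so the parameters never move. To illustrate the expected interface (a callable taking a dict of shared parameters and a matching dict of gradient expressions and returning an OrderedDict of updates), here is a minimal sketch of a get_optimizer factory that performs plain gradient ascent on the lower bound L; the factory, its name, and the learning rate are assumptions for illustration, not part of this codebase, which may supply its own optimizers.

import numpy as np
import theano
from collections import OrderedDict

def make_sgd_optimizer(learning_rate=1e-3):
    # Hypothetical factory. `w` maps names to Theano shared variables and `g` maps
    # the same names to gradients of L; since L is a lower bound being maximized,
    # the update ascends it.
    lr = np.asarray(learning_rate, dtype=theano.config.floatX)  # keep update dtype consistent
    def get_optimizer(w, g):
        updates = OrderedDict()
        for i in w:
            updates[w[i]] = w[i] + lr * g[i]
        return updates
    return get_optimizer

# Illustrative usage (assuming a concrete model class built on this __init__):
# model = SomeModel(get_optimizer=make_sgd_optimizer(1e-3))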