def test_mean_H_given_V(self):
    """
    Check that log P(h | v) is consistent with energy differences.

    For two hidden configurations h_1 and h_2 paired with the same v:

        P(h_1 | v) / P(h_2 | v) = a
        => exp(-E(v, h_1)) / exp(-E(v, h_2)) = a
        => exp(E(v, h_2) - E(v, h_1)) = a
        => E(v, h_2) - E(v, h_1) = log(a)

    and also log P(h_1 | v) - log P(h_2 | v) = log(a).
    """
    tolerance = 1e-6
    rng = N.random.RandomState([1, 2, 3])
    num_rows = 5

    # Every row of `visible` holds the same random visible vector.
    visible = as_floatX(N.zeros((num_rows, self.nv)) + rng.randn(self.nv))
    # Random binary hidden configurations, one per row.
    hidden = as_floatX(rng.randn(num_rows, self.nh) > 0.)

    log_probs = self.log_P_H_given_V_func(hidden, visible)
    energies = self.E_func(visible, hidden)

    # Compare every unordered pair of rows.
    for first in xrange(num_rows):
        for second in xrange(first + 1, num_rows):
            log_ratio = log_probs[first] - log_probs[second]
            energy_gap = energies[second] - energies[first]
            assert abs(energy_gap - log_ratio) < tolerance
def test_mean_H_given_V(self):
    """
    Verify that conditional log-probabilities of h given v agree with
    the energy function on pairwise odds.

    If P(h_1 | v) / P(h_2 | v) = a, then
    exp(-E(v, h_1)) / exp(-E(v, h_2)) = a, hence
    E(v, h_2) - E(v, h_1) = log(a); likewise
    log P(h_1 | v) - log P(h_2 | v) = log(a).
    """
    rng = N.random.RandomState([1, 2, 3])
    num_configs = 5
    tolerance = 1e-6

    # One random visible vector, repeated on every row.
    shared_v = rng.randn(nv)
    visible = as_floatX(N.zeros((num_configs, nv)) + shared_v)
    # Random binary hidden states.
    hidden = as_floatX(rng.randn(num_configs, nh) > 0.)

    log_probs = log_P_H_given_V_func(hidden, visible)
    energies = E_func(visible, hidden)

    pairs = [(i, j)
             for i in xrange(num_configs)
             for j in xrange(i + 1, num_configs)]
    for i, j in pairs:
        log_a = log_probs[i] - log_probs[j]
        delta_e = energies[j] - energies[i]
        assert abs(delta_e - log_a) < tolerance
def test_triangle_code():
    """
    Compare ``triangle_code`` against a NumPy reference computation.

    The reference takes the Euclidean distance from each example to each
    centroid and returns max(0, per-example mean distance - distance).
    """
    rng = np.random.RandomState([20, 18, 9])
    num_examples, num_features, num_centroids = 5, 6, 7

    X = as_floatX(rng.randn(num_examples, num_features))
    D = as_floatX(rng.randn(num_centroids, num_features))

    # ||x - d||^2 expanded as -2 x.d + ||d||^2 + ||x||^2
    centroid_sq_norms = np.sum(D ** 2, axis=1)
    example_sq_norms = np.sum(X ** 2, axis=1)
    cross_term = -2.0 * np.dot(X, D.T)
    sq_distance = (cross_term + centroid_sq_norms +
                   np.atleast_2d(example_sq_norms).T)
    distance = np.sqrt(sq_distance)

    mean_distance = np.mean(distance, axis=1)
    expected = np.maximum(
        0.0, mean_distance.reshape(mean_distance.size, 1) - distance)

    # Symbolic version evaluated on the same inputs.
    Xv = T.matrix()
    Dv = T.matrix()
    symbolic_code = triangle_code(X=Xv, centroids=Dv)
    compute_code = function([Xv, Dv], symbolic_code)
    actual = compute_code(X, D)

    assert np.allclose(expected, actual)
def test_d_negent_h_d_h(self):
    r"""
    Tests that the gradient of the negative entropy of h with respect to
    \hat{h} matches my analytical version of it.

    The symbolic gradient of ``-entropy_h(H_hat).sum()`` is compared with
    the closed form ``log(h) - log(1 - h)`` on values of h drawn
    uniformly from (0.001, 0.999).

    Raises
    ------
    AssertionError
        If the two gradients differ beyond ``np.allclose`` tolerance; the
        message carries (min, mean, max) of both gradients and of their
        absolute difference.
    """
    model = self.model
    ip = model.e_step

    X = self.X
    assert X.shape[0] == self.m

    # H is drawn before S so the model's RNG stream is consumed in the
    # same order as before. np.asarray(..., dtype=...) replaces np.cast,
    # which no longer exists in NumPy 2.0.
    H = np.asarray(model.rng.uniform(0.001, .999, (self.m, self.N)),
                   dtype=config.floatX)
    S = np.asarray(model.rng.uniform(-5., 5., (self.m, self.N)),
                   dtype=config.floatX)

    H_var = T.matrix(name='H_var')
    H_var.tag.test_value = H
    S_var = T.matrix(name='S_var')
    S_var.tag.test_value = S

    # NOTE(review): the results of these two calls are not used by the
    # check below; the calls are kept in case they are meant to exercise
    # the inference procedure. Confirm and drop if they are pure.
    ip.infer_var_s0_hat()
    ip.infer_var_s1_hat()

    negent = - model.entropy_h(H_hat=H_var).sum()
    assert len(negent.type.broadcastable) == 0

    grad_H = T.grad(negent, H_var)
    # S_var does not appear in the graph, hence on_unused_input='ignore'.
    grad_func = function([H_var, S_var], grad_H, on_unused_input='ignore')
    grad_theano = grad_func(H, S)

    one = as_floatX(1.)
    term1 = T.log(H_var)
    term2 = -T.log(one - H_var)
    analytical = term1 + term2

    grad_analytical = function([H_var, S_var], analytical,
                               on_unused_input='ignore')(H, S)

    if not np.allclose(grad_theano, grad_analytical):
        ad = np.abs(grad_theano - grad_analytical)
        raise AssertionError(
            'grad theano: %s; grad analytical: %s; abs diff: %s' % (
                (grad_theano.min(), grad_theano.mean(), grad_theano.max()),
                (grad_analytical.min(), grad_analytical.mean(),
                 grad_analytical.max()),
                (ad.min(), ad.mean(), ad.max())))
def test_d_negent_h_d_h(self):
    r"""
    Tests that the gradient of the negative entropy of h with respect to
    \hat{h} matches my analytical version of it.

    The symbolic gradient of ``-entropy_h(H_hat).sum()`` is compared with
    the closed form ``log(h) - log(1 - h)`` on values of h drawn
    uniformly from (0.001, 0.999).

    Raises
    ------
    AssertionError
        If the two gradients differ beyond ``np.allclose`` tolerance; the
        message carries (min, mean, max) of both gradients and of their
        absolute difference.
    """
    model = self.model
    ip = model.e_step

    X = self.X
    assert X.shape[0] == self.m

    # H is drawn before S so the model's RNG stream is consumed in the
    # same order as before. np.asarray(..., dtype=...) replaces np.cast,
    # which no longer exists in NumPy 2.0.
    H = np.asarray(model.rng.uniform(0.001, .999, (self.m, self.N)),
                   dtype=config.floatX)
    S = np.asarray(model.rng.uniform(-5., 5., (self.m, self.N)),
                   dtype=config.floatX)

    H_var = T.matrix(name='H_var')
    H_var.tag.test_value = H
    S_var = T.matrix(name='S_var')
    S_var.tag.test_value = S

    # NOTE(review): the results of these two calls are not used by the
    # check below; the calls are kept in case they are meant to exercise
    # the inference procedure. Confirm and drop if they are pure.
    ip.infer_var_s0_hat()
    ip.infer_var_s1_hat()

    negent = - model.entropy_h(H_hat=H_var).sum()
    assert len(negent.type.broadcastable) == 0

    grad_H = T.grad(negent, H_var)
    # S_var does not appear in the graph, hence on_unused_input='ignore'.
    grad_func = function([H_var, S_var], grad_H, on_unused_input='ignore')
    grad_theano = grad_func(H, S)

    one = as_floatX(1.)
    term1 = T.log(H_var)
    term2 = -T.log(one - H_var)
    analytical = term1 + term2

    grad_analytical = function([H_var, S_var], analytical,
                               on_unused_input='ignore')(H, S)

    if not np.allclose(grad_theano, grad_analytical):
        ad = np.abs(grad_theano - grad_analytical)
        # The diagnostics go into the exception message instead of
        # Python 2-only print statements.
        raise AssertionError(
            'grad theano: %s; grad analytical: %s; abs diff: %s' % (
                (grad_theano.min(), grad_theano.mean(), grad_theano.max()),
                (grad_analytical.min(), grad_analytical.mean(),
                 grad_analytical.max()),
                (ad.min(), ad.mean(), ad.max())))
def test_convolutional_compatible():
    """
    VAE allows convolutional encoding networks
    """
    # Encoder: reshape the flat input into a 4x4 single-channel image,
    # then apply one convolutional rectified-linear layer.
    to_conv_space = SpaceConverter(
        layer_name='conv2d_converter',
        output_space=Conv2DSpace(shape=[4, 4], num_channels=1)
    )
    conv_layer = ConvRectifiedLinear(
        layer_name='h',
        output_channels=2,
        kernel_shape=[2, 2],
        kernel_stride=[1, 1],
        pool_shape=[1, 1],
        pool_stride=[1, 1],
        pool_type='max',
        irange=0.01
    )
    encoder = MLP(layers=[to_conv_space, conv_layer])
    decoder = MLP(layers=[Linear(layer_name='h', dim=16, irange=0.01)])

    vae = VAE(
        nvis=16,
        prior=DiagonalGaussianPrior(),
        conditional=BernoulliVector(mlp=decoder, name='conditional'),
        posterior=DiagonalGaussian(mlp=encoder, name='posterior'),
        nhid=16
    )

    # Compile the lower bound and run it once on random data; the test
    # passes if nothing raises.
    X = T.matrix('X')
    bound = vae.log_likelihood_lower_bound(X, num_samples=10)
    compute_bound = theano.function(inputs=[X], outputs=bound)

    rng = make_np_rng(default_seed=11223)
    batch = as_floatX(rng.uniform(size=(10, 16)))
    compute_bound(batch)
def test_unit_norm(self):
    """
    Test that using sqrt_bias = 0.0 and use_std = False results in
    vectors having unit norm
    """
    # Largest tolerated deviation of any row norm from 1.
    tol = 3e-5

    num_examples = 5
    num_features = 10

    rng = np.random.RandomState([1, 2, 3])
    X = as_floatX(rng.randn(num_examples, num_features))
    dataset = DenseDesignMatrix(X=X)

    # the setting of subtract_mean is not relevant to the test
    # the test only applies when sqrt_bias = 0.0 and use_std = False
    preprocessor = GlobalContrastNormalization(subtract_mean=False,
                                               sqrt_bias=0.0,
                                               use_std=False)
    dataset.apply_preprocessor(preprocessor)

    result = dataset.get_design_matrix()
    # L2 norm of each example (row).
    norms = np.sqrt(np.square(result).sum(axis=1))
    max_norm_error = np.abs(norms - 1.).max()
    assert max_norm_error < tol
def gibbs_step_for_v(self, v, rng):
    """
    Run one Gibbs step starting from `v`: sample h given v, then s, then
    a new v given (h, s).

    Returns the new visible sample together with ``locals()``. Because
    the local namespace is returned, the local variable names used in
    this function are part of its result and must not be renamed;
    `batch_size` is deleted just before returning so that it is not
    included.

    NOTE(review): the shapes used for the s and v noise hard-code a
    batch size of 16 (see the "THEANO HACK" lines) instead of using
    `batch_size`; inputs with a different number of rows presumably do
    not work -- confirm before reusing.
    """
    # Sometimes, the number of examples in the data set is not a
    # multiple of self.batch_size.
    batch_size = v.shape[0]

    # sample h given v
    h_mean = self.mean_h_given_v(v)
    h_mean_shape = (batch_size, self.nhid)
    h_sample = as_floatX(rng.uniform(size=h_mean_shape) < h_mean)

    # sample s given (v,h)
    s_mu, s_var = self.mean_var_s_given_v_h1(v)
    #s_mu_shape = (batch_size, self.nslab)
    s_mu_shape = (16, self.nslab) # @dave: THEANO HACK (bugfix for rita2)
    s_sample = s_mu + rng.normal(size=s_mu_shape) * tensor.sqrt(s_var)
    #s_sample=(s_sample.reshape()*h_sample.dimshuffle(0,1,'x')).flatten(2)

    # sample v given (s,h)
    v_mean, v_var = self.mean_var_v_given_h_s(h_sample, s_sample)
    #v_mean_shape = (batch_size, self.nvis)
    v_mean_shape = (16, int(self.nvis)) # @dave: THEANO HACK (bugfix for rita2)
    v_sample = rng.normal(size=v_mean_shape) * tensor.sqrt(v_var) + v_mean

    # Keep batch_size out of the returned locals() dictionary.
    del batch_size
    return v_sample, locals()
def get_gradients(self, model, data, ** kwargs):
    """
    Build the gradient of the sparsity term with respect to the second
    parameter returned by ``model.get_params()``.

    Parameters
    ----------
    model : object
        Must provide ``propup(v)`` and ``get_params()``; the latter must
        yield exactly three parameters (unpacked here as W, c, b).
    data : tensor_like
        Batch passed to ``model.propup``.
    kwargs : dict
        Accepted and ignored.

    Returns
    -------
    gradients : OrderedDict
        Maps the parameter ``c`` to ``(1 / self.p) * gc``.
    updates : OrderedDict
        Always empty.
    """
    v = data
    mean_matrix = model.propup(v)

    # Gap between the target self.p and the mean of propup(v) over
    # axis 0.
    part_j = self.p - mean_matrix.mean(axis=0)
    part_i1_matrix = mean_matrix * (1. - mean_matrix)

    # The gradient on W is deliberately not returned. Its disabled form
    # was: part_i = T.dot(v.T, part_i1_matrix), multiplied by the row
    # vector part_j and scaled by -2 * v.shape[0] (the gradient produced
    # by the sparsity term, not including lambda_).
    part_j2 = part_i1_matrix.mean(axis=0)
    gc = -2. * part_j * part_j2

    # Unpacking enforces that the model exposes exactly three parameters.
    _W, c, _b = list(model.get_params())

    # 1. / self.p keeps the scaling a float division even when self.p is
    # an int under Python 2.
    gradients = OrderedDict([(c, as_floatX(1. / self.p * gc))])
    updates = OrderedDict()
    return gradients, updates
def test_convolutional_compatible():
    """
    VAE allows convolutional encoding networks
    """
    # The encoder converts the flat input to a 4x4 one-channel image and
    # applies a single convolutional rectified-linear layer.
    encoder_layers = [
        SpaceConverter(
            layer_name="conv2d_converter",
            output_space=Conv2DSpace(shape=[4, 4], num_channels=1),
        ),
        ConvRectifiedLinear(
            layer_name="h",
            output_channels=2,
            kernel_shape=[2, 2],
            kernel_stride=[1, 1],
            pool_shape=[1, 1],
            pool_stride=[1, 1],
            pool_type="max",
            irange=0.01,
        ),
    ]
    encoder = MLP(layers=encoder_layers)
    decoder = MLP(layers=[Linear(layer_name="h", dim=16, irange=0.01)])

    prior = DiagonalGaussianPrior()
    conditional = BernoulliVector(mlp=decoder, name="conditional")
    posterior = DiagonalGaussian(mlp=encoder, name="posterior")
    vae = VAE(nvis=16, prior=prior, conditional=conditional,
              posterior=posterior, nhid=16)

    # Smoke test: compiling and evaluating the bound must not raise.
    X = T.matrix("X")
    f = theano.function(
        inputs=[X],
        outputs=vae.log_likelihood_lower_bound(X, num_samples=10),
    )
    rng = make_np_rng(default_seed=11223)
    f(as_floatX(rng.uniform(size=(10, 16))))
def setup(self):
    """
    We use a small predefined 8x5 matrix for
    which we know the ZCA transform.
    """
    rows = [
        [-10.0, 3.0, 19.0, 9.0, -15.0],
        [7.0, 26.0, 26.0, 26.0, -3.0],
        [17.0, -17.0, -37.0, -36.0, -11.0],
        [19.0, 15.0, -2.0, 5.0, 9.0],
        [-3.0, -8.0, -35.0, -25.0, -8.0],
        [-18.0, 3.0, 4.0, 15.0, 14.0],
        [5.0, -4.0, -5.0, -7.0, -11.0],
        [23.0, 22.0, 15.0, 20.0, 12.0],
    ]
    self.X = np.array(rows)

    # One target value (all ones) per example.
    targets = as_floatX(np.ones((8, 1)))
    self.dataset = DenseDesignMatrix(X=as_floatX(self.X), y=targets)

    # One fewer component than there are features.
    num_features = self.dataset.get_design_matrix().shape[1]
    self.num_components = num_features - 1
def get_monitoring_channels(self, data):
    """
    Return a single monitoring channel, ``'perplexity'``.

    `data` must unpack into two elements. The channel value is
    ``10 ** (nll / log(10))`` where ``nll = self.nll(data)``.
    """
    X, Y = data

    negative_log_likelihood = self.nll(data)
    perplexity = as_floatX(10 ** (negative_log_likelihood / np.log(10)))

    return OrderedDict([('perplexity', perplexity)])
def create_colors(n_colors):
    """
    Create an array of n_colors

    Parameters
    ----------
    n_colors : int
        The number of colors to create

    Returns
    -------
    colors_rgb : np.array
        An array of shape (n_colors, 3) in RGB format
    """
    # Hues i / n_colors for i = 0 .. n_colors - 1
    hues = as_floatX(np.arange(n_colors))
    hues *= 1. / n_colors

    # HSV triples: varying hue, saturation 1, value 0.75
    hsv = np.ones((n_colors, 3))
    hsv[:, 0] = hues
    hsv[:, 2] *= .75

    # hsv_to_rgb is given a (1, n_colors, 3) array; the leading axis is
    # dropped again afterwards.
    as_image = hsv.reshape((1, ) + hsv.shape)
    rgb_image = matplotlib.colors.hsv_to_rgb(as_image)
    return rgb_image[0]
def learning_rate_updates(self):
    """
    Compute the updates related to annealing the learning rate, and the
    per-parameter learning rates.

    Returns
    -------
    ups : dict
        Maps `self.annealed` to the annealed learning rate expression
        and `self.iteration` to `self.iteration + 1`.
    learn_rates : list
        One entry per parameter, in the order of `self.params`: the
        annealed rate times `self.learning_rates[p]`.
    """
    # Annealing coefficient: min(base_lr, anneal_start / (iteration + 1)),
    # or simply base_lr when no annealing is configured.
    if self.anneal_start is not None:
        frac = self.anneal_start / (self.iteration + 1.)
        # base_lr acts as the maximum learning rate.
        annealed = tensor.minimum(as_floatX(frac), self.base_lr)
    else:
        annealed = sharedX(self.base_lr)

    ups = {
        self.annealed: annealed,
        self.iteration: self.iteration + 1,
    }

    learn_rates = []
    for param in self.params:
        learn_rates.append(annealed * self.learning_rates[param])
    return ups, learn_rates
def cost(self, Y, q_h):
    """
    Return the negated mean over rows of ``sum(Y * log_softmax(score))``.

    The scores come from ``self.score(q_h)``; the row maximum is
    subtracted before exponentiating for numerical stability.
    """
    scores = self.score(q_h)
    row_max = scores.max(axis=1).dimshuffle(0, 'x')
    shifted = scores - row_max

    log_normalizer = T.log(T.exp(shifted).sum(axis=1).dimshuffle(0, 'x'))
    log_prob = shifted - log_normalizer

    # Per-row log-probability weighted by Y.
    log_prob_of = (Y * log_prob).sum(axis=1)
    assert log_prob_of.ndim == 1

    mean_log_prob = as_floatX(log_prob_of.mean())
    return - mean_log_prob
def cost_from_X(self, data):
    """
    Return the negated mean over rows of ``sum(Y * log_softmax(score))``
    for ``data = (X, Y)``, with scores taken from ``self.score(X)``.
    """
    X, Y = data

    scores = self.score(X)
    # Subtract the row maximum before exponentiating (numerical
    # stability).
    stabilized = scores - scores.max(axis=1).dimshuffle(0, 'x')
    log_partition = T.log(T.exp(stabilized).sum(axis=1).dimshuffle(0, 'x'))
    log_prob = stabilized - log_partition

    log_prob_of = (Y * log_prob).sum(axis=1)
    assert log_prob_of.ndim == 1

    return - as_floatX(log_prob_of.mean())
def test_free_energy(self):
    """
    Check the free energy against brute-force enumeration.

    The energy is evaluated on all 2**nh binary hidden configurations
    for one visible vector, and -log(sum(exp(-E))) is compared with the
    value returned by ``F_func``.
    """
    rng = N.random.RandomState([1, 2, 3])
    num_configs = 2 ** nh

    # The same random visible vector on every row.
    visible = as_floatX(N.zeros((num_configs, nv)) + rng.randn(nv))
    F, = F_func(visible[0:1, :])

    # Row i of `hidden` holds the binary digits of i.
    hidden = as_floatX(N.zeros((num_configs, nh)))
    for config_index in xrange(num_configs):
        for unit in xrange(nh):
            bit = 2 ** unit
            hidden[config_index, unit] = (config_index & bit) / bit

    energies = E_func(visible, hidden)
    brute_force_F = -N.log(N.exp(-energies).sum())
    assert abs(F - brute_force_F) < 1e-6
def theano_norms(W):
    """
    Return the L2 norm of each column of W.

    Both W and the return value are symbolic theano variables. A
    constant of 1e-8 is added to each sum of squares before the square
    root is taken.
    """
    epsilon = as_floatX(1e-8)
    column_sq_sums = T.sqr(W).sum(axis=0)
    return T.sqrt(epsilon + column_sq_sums)
def test_score(self):
    """
    Check that ``score_func`` and ``generic_score_func`` agree on random
    input.
    """
    num_examples = 10
    rng = N.random.RandomState([1, 2, 3])
    visible = as_floatX(rng.randn(num_examples, nv))

    model_score = score_func(visible)
    generic_score = generic_score_func(visible)
    assert N.allclose(model_score, generic_score)
def test_free_energy(self):
    """
    Check the free energy against brute-force enumeration.

    The energy is evaluated on all 2 ** self.nh binary hidden
    configurations for one visible vector, and -log(sum(exp(-E))) is
    compared with the value returned by ``self.F_func``.
    """
    rng = N.random.RandomState([1, 2, 3])
    num_hidden = self.nh
    num_configs = 2 ** num_hidden

    # The same random visible vector on every row.
    visible = as_floatX(N.zeros((num_configs, self.nv)) +
                        rng.randn(self.nv))
    F, = self.F_func(visible[0:1, :])

    # Row i of `hidden` holds the binary digits of i.
    hidden = as_floatX(N.zeros((num_configs, num_hidden)))
    for row in xrange(num_configs):
        for col in xrange(num_hidden):
            mask = 2 ** col
            hidden[row, col] = (row & mask) / mask

    energies = self.E_func(visible, hidden)
    enumerated_F = -N.log(N.exp(-energies).sum())
    assert abs(F - enumerated_F) < 1e-6
def test_score(self):
    """
    Check that ``self.score_func`` and ``self.generic_score_func`` agree
    on random input.
    """
    num_examples = 10
    rng = N.random.RandomState([1, 2, 3])
    visible = as_floatX(rng.randn(num_examples, self.nv))

    from_model = self.score_func(visible)
    from_gradient = self.generic_score_func(visible)
    assert N.allclose(from_model, from_gradient)
def setUpClass(cls):
    """
    Build the shared fixtures: a GRBM_Type_1 energy function with random
    parameters and the compiled theano functions used by the tests.
    """
    cls.test_m = 2

    cls.rng = N.random.RandomState([1, 2, 3])
    cls.nv = 3
    cls.nh = 4

    # Weights
    cls.vW = cls.rng.randn(cls.nv, cls.nh)
    cls.W = sharedX(cls.vW)

    # Visible bias
    cls.vbv = as_floatX(cls.rng.randn(cls.nv))
    cls.bv = T.as_tensor_variable(cls.vbv)
    cls.bv.tag.test_value = cls.vbv

    # Hidden bias. The test value must be the numeric array vbh, not the
    # symbolic variable itself.
    cls.vbh = as_floatX(cls.rng.randn(cls.nh))
    cls.bh = T.as_tensor_variable(cls.vbh)
    cls.bh.tag.test_value = cls.vbh

    # Sigma
    cls.vsigma = as_floatX(cls.rng.uniform(0.1, 5))
    cls.sigma = T.as_tensor_variable(cls.vsigma)
    cls.sigma.tag.test_value = cls.vsigma

    cls.E = GRBM_Type_1(transformer=MatrixMul(cls.W), bias_vis=cls.bv,
                        bias_hid=cls.bh, sigma=cls.sigma)

    # Symbolic inputs with test values attached.
    cls.V = T.matrix()
    cls.V.tag.test_value = as_floatX(cls.rng.rand(cls.test_m, cls.nv))
    cls.H = T.matrix()
    cls.H.tag.test_value = as_floatX(cls.rng.rand(cls.test_m, cls.nh))

    # Compiled functions
    cls.E_func = function([cls.V, cls.H], cls.E([cls.V, cls.H]))
    cls.F_func = function([cls.V], cls.E.free_energy(cls.V))
    cls.log_P_H_given_V_func = \
        function([cls.H, cls.V], cls.E.log_P_H_given_V(cls.H, cls.V))
    cls.score_func = function([cls.V], cls.E.score(cls.V))

    # Generic score: negative gradient of the summed free energy with
    # respect to V.
    cls.F_of_V = cls.E.free_energy(cls.V)
    cls.dummy = T.sum(cls.F_of_V)
    cls.negscore = T.grad(cls.dummy, cls.V)
    cls.score = -cls.negscore

    cls.generic_score_func = function([cls.V], cls.score)
def get_monitoring_channels(self, data):
    """
    Return monitoring channels: min / mean / max of the row and column
    norms of `self.W`, of the column norms of `self.b` and `self.C`, and
    the perplexity derived from ``self.cost_from_X(data)``.

    `data` must unpack into two elements.
    """
    X, Y = data

    def norms_along(param, axis):
        # L2 norms of `param` reduced over `axis`.
        return T.sqrt(T.sqr(param).sum(axis=axis))

    W_context = self.W
    row_norms_W_context = norms_along(W_context, 1)
    col_norms_W_context = norms_along(W_context, 0)
    col_norms_b = norms_along(self.b, 0)
    col_norms_c = norms_along(self.C, 0)

    rval = OrderedDict()
    rval['W_context_row_norms_min'] = row_norms_W_context.min()
    rval['W_context_row_norms_mean'] = row_norms_W_context.mean()
    rval['W_context_row_norms_max'] = row_norms_W_context.max()
    rval['W_context_col_norms_min'] = col_norms_W_context.min()
    rval['W_context_col_norms_mean'] = col_norms_W_context.mean()
    rval['W_context_col_norms_max'] = col_norms_W_context.max()
    # Channels for a separate target matrix are not reported.
    rval['b_col_norms_min'] = col_norms_b.min()
    rval['b_col_norms_mean'] = col_norms_b.mean()
    rval['b_col_norms_max'] = col_norms_b.max()
    rval['c_col_norms_min'] = col_norms_c.min()
    rval['c_col_norms_mean'] = col_norms_c.mean()
    rval['c_col_norms_max'] = col_norms_c.max()

    nll = self.cost_from_X(data)
    rval['perplexity'] = as_floatX(10 ** (nll / np.log(10)))
    return rval
def normalize_image(img):
    """
    Converts an image into the format used by ``read()``.

    The image is converted to RGBA and returned as a floatX array of
    shape (rows, cols, 4) with values scaled into [0, 1].

    Raises
    ------
    ValueError
        If the image mode is 'LAB' or 'HSV'.
    """
    if img.mode in ('LAB', 'HSV'):
        raise ValueError('%s image mode is not supported' % img.mode)

    rgba = img.convert('RGBA')
    imarray = as_floatX(numpy.array(rgba)) / 255.0

    # Sanity checks on the value range and layout.
    in_range = numpy.all(imarray >= 0.0) and numpy.all(imarray <= 1.0)
    assert in_range
    assert len(imarray.shape) == 3
    assert imarray.shape[2] == 4

    return imarray
def test(store_inverse):
    """
    Check that ``ZCA.inverse`` undoes the preprocessing applied when the
    dataset is constructed, for the given `store_inverse` setting.
    """
    rng = np.random.RandomState([1, 2, 3])
    original = as_floatX(rng.randn(15, 10))

    # The dataset is given a copy so `original` stays available for
    # comparison.
    zca = ZCA(store_inverse=store_inverse)
    dataset = DenseDesignMatrix(X=copy.copy(original),
                                preprocessor=zca,
                                fit_preprocessor=True)

    whitened = dataset.get_design_matrix()
    recovered = zca.inverse(whitened)
    assert_allclose(original, recovered)
def setUpClass(cls):
    """
    Build the shared fixtures: a GRBM_Type_1 energy function with random
    parameters and the compiled theano functions used by the tests.
    """
    cls.test_m = 2

    cls.rng = N.random.RandomState([1, 2, 3])
    cls.nv = 3
    cls.nh = 4

    # Weights
    cls.vW = cls.rng.randn(cls.nv, cls.nh)
    cls.W = sharedX(cls.vW)

    # Visible bias
    cls.vbv = as_floatX(cls.rng.randn(cls.nv))
    cls.bv = T.as_tensor_variable(cls.vbv)
    cls.bv.tag.test_value = cls.vbv

    # Hidden bias. The test value must be the numeric array vbh, not the
    # symbolic variable itself.
    cls.vbh = as_floatX(cls.rng.randn(cls.nh))
    cls.bh = T.as_tensor_variable(cls.vbh)
    cls.bh.tag.test_value = cls.vbh

    # Sigma
    cls.vsigma = as_floatX(cls.rng.uniform(0.1, 5))
    cls.sigma = T.as_tensor_variable(cls.vsigma)
    cls.sigma.tag.test_value = cls.vsigma

    cls.E = GRBM_Type_1(transformer=MatrixMul(cls.W), bias_vis=cls.bv,
                        bias_hid=cls.bh, sigma=cls.sigma)

    # Symbolic inputs with test values attached.
    cls.V = T.matrix()
    cls.V.tag.test_value = as_floatX(cls.rng.rand(cls.test_m, cls.nv))
    cls.H = T.matrix()
    cls.H.tag.test_value = as_floatX(cls.rng.rand(cls.test_m, cls.nh))

    # Compiled functions
    cls.E_func = function([cls.V, cls.H], cls.E([cls.V, cls.H]))
    cls.F_func = function([cls.V], cls.E.free_energy(cls.V))
    cls.log_P_H_given_V_func = \
        function([cls.H, cls.V], cls.E.log_P_H_given_V(cls.H, cls.V))
    cls.score_func = function([cls.V], cls.E.score(cls.V))

    # Generic score: negative gradient of the summed free energy with
    # respect to V.
    cls.F_of_V = cls.E.free_energy(cls.V)
    cls.dummy = T.sum(cls.F_of_V)
    cls.negscore = T.grad(cls.dummy, cls.V)
    cls.score = - cls.negscore

    cls.generic_score_func = function([cls.V], cls.score)
def nll(self, data):
    """
    Return the negated mean log-probability of the targets in
    ``data = (X, Y)`` under a softmax over ``self.score(X)``.

    Y is expanded with ``OneHotFormatter(self.dict_size)`` and then
    reshaped to keep only axes 0 and 2 of the formatted result.
    """
    X, Y = data

    scores = self.score(X)
    # Subtract the row maximum before exponentiating (numerical
    # stability).
    shifted = scores - scores.max(axis=1).dimshuffle(0, 'x')
    log_norm = T.log(T.exp(shifted).sum(axis=1).dimshuffle(0, 'x'))
    log_prob = shifted - log_norm

    # presumably the formatter output is (batch, 1, dict_size) --
    # TODO confirm
    one_hot = OneHotFormatter(self.dict_size).theano_expr(Y)
    one_hot = one_hot.reshape((one_hot.shape[0], one_hot.shape[2]))

    log_prob_of = (one_hot * log_prob).sum(axis=1)
    assert log_prob_of.ndim == 1

    return - as_floatX(log_prob_of.mean())
def test_alpha_jump(self):
    """
    Tests that alpha is where I think it should be.

    The expected alpha is 1 / (mean_sq_s - 2 * mu * mean_hs +
    mu**2 * mean_h), computed from the stored statistics and the
    model's mu, and compared with the model's current alpha.

    Raises
    ------
    Exception
        If the largest absolute difference exceeds `self.tol`.
    """
    stats = self.stats

    mean_h = stats.d['mean_h']
    new_mu = self.model.mu
    mean_hs = stats.d['mean_hs']
    mean_sq_s = stats.d['mean_sq_s']

    one = as_floatX(1.)
    two = as_floatX(2.)

    s_denom1 = mean_sq_s
    s_denom2 = - two * new_mu * mean_hs
    s_denom3 = T.sqr(new_mu) * mean_h
    s_denom = s_denom1 + s_denom2 + s_denom3

    new_alpha = one / s_denom
    new_alpha.name = 'new_alpha'

    f = function([], new_alpha)

    Alphav = f()
    aAlphav = self.model.alpha.get_value()

    diffs = Alphav - aAlphav
    max_diff = np.abs(diffs).max()

    if max_diff > self.tol:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Actual alpha: ')
        print(aAlphav)
        print('Expected alpha: ')
        print(Alphav)
        raise Exception("alpha deviates from its correct value by at most "+str(max_diff))
def test_alpha_jump(self):
    """
    Tests that alpha is where I think it should be.

    The expected alpha is 1 / (mean_sq_s - 2 * mu * mean_hs +
    mu**2 * mean_h), computed from the stored statistics and the
    model's mu, and compared with the model's current alpha.

    Raises
    ------
    Exception
        If the largest absolute difference exceeds `self.tol`.
    """
    stats = self.stats

    mean_h = stats.d['mean_h']
    new_mu = self.model.mu
    mean_hs = stats.d['mean_hs']
    mean_sq_s = stats.d['mean_sq_s']

    one = as_floatX(1.)
    two = as_floatX(2.)

    s_denom1 = mean_sq_s
    s_denom2 = -two * new_mu * mean_hs
    s_denom3 = T.sqr(new_mu) * mean_h
    s_denom = s_denom1 + s_denom2 + s_denom3

    new_alpha = one / s_denom
    new_alpha.name = 'new_alpha'

    f = function([], new_alpha)

    Alphav = f()
    aAlphav = self.model.alpha.get_value()

    diffs = Alphav - aAlphav
    max_diff = np.abs(diffs).max()

    if max_diff > self.tol:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Actual alpha: ')
        print(aAlphav)
        print('Expected alpha: ')
        print(Alphav)
        raise Exception(
            "alpha deviates from its correct value by at most " +
            str(max_diff))
def test_multiple_samples_allowed():
    """
    VAE allows multiple samples per data point
    """
    encoder = MLP(layers=[Linear(layer_name="h", dim=10, irange=0.01)])
    decoder = MLP(layers=[Linear(layer_name="h", dim=10, irange=0.01)])

    vae = VAE(
        nvis=10,
        prior=DiagonalGaussianPrior(),
        conditional=BernoulliVector(mlp=decoder, name="conditional"),
        posterior=DiagonalGaussian(mlp=encoder, name="posterior"),
        nhid=5,
    )

    # Smoke test: the lower bound with 10 samples per data point must
    # compile and evaluate without raising.
    X = T.matrix("X")
    bound = vae.log_likelihood_lower_bound(X, num_samples=10)
    compute_bound = theano.function(inputs=[X], outputs=bound)

    rng = make_np_rng(default_seed=11223)
    batch = as_floatX(rng.uniform(size=(10, 10)))
    compute_bound(batch)
def test_multiple_samples_allowed():
    """
    VAE allows multiple samples per data point
    """
    encoder_layers = [Linear(layer_name='h', dim=10, irange=0.01)]
    encoder = MLP(layers=encoder_layers)
    decoder_layers = [Linear(layer_name='h', dim=10, irange=0.01)]
    decoder = MLP(layers=decoder_layers)

    prior = DiagonalGaussianPrior()
    conditional = BernoulliVector(mlp=decoder, name='conditional')
    posterior = DiagonalGaussian(mlp=encoder, name='posterior')
    vae = VAE(nvis=10, prior=prior, conditional=conditional,
              posterior=posterior, nhid=5)

    # Smoke test: evaluating the bound with num_samples=10 must not
    # raise.
    X = T.matrix('X')
    f = theano.function(
        inputs=[X],
        outputs=vae.log_likelihood_lower_bound(X, num_samples=10)
    )
    rng = make_np_rng(default_seed=11223)
    f(as_floatX(rng.uniform(size=(10, 10))))
def test_zero_image(self):
    """
    Test that an all-zero image does not cause a division by zero:
    the preprocessed design matrix must be finite.
    """
    axes = ['b', 0, 1, 'c']
    zeros = as_floatX(np.zeros((5, 32 * 32 * 3)))

    converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                         axes)
    dataset = DenseDesignMatrix(X=zeros, view_converter=converter)
    dataset.axes = axes

    dataset.apply_preprocessor(LeCunLCN(img_shape=[32, 32]))

    assert isfinite(dataset.get_design_matrix())
def test_channel(self):
    """
    Test that the preprocessor works when given an explicit list of
    channels: the preprocessed design matrix must be finite.
    """
    axes = ['b', 0, 1, 'c']
    rng = np.random.RandomState([1, 2, 3])
    images = as_floatX(rng.randn(5, 32 * 32 * 3))

    converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                         axes)
    dataset = DenseDesignMatrix(X=images, view_converter=converter)
    dataset.axes = axes

    lcn = LeCunLCN(img_shape=[32, 32], channels=[1, 2])
    dataset.apply_preprocessor(lcn)

    assert isfinite(dataset.get_design_matrix())
def test_zero_vector(self):
    """
    Test that passing in the zero vector does not result in
    a divide by 0
    """
    zero_input = as_floatX(np.zeros((1, 1)))
    dataset = DenseDesignMatrix(X=zero_input)

    # sqrt_bias = 0.0 is the setting under test: it is the value for
    # which a division by zero would be a risk.
    gcn = GlobalContrastNormalization(subtract_mean=True,
                                      sqrt_bias=0.0,
                                      use_std=True)
    dataset.apply_preprocessor(gcn)

    result = dataset.get_design_matrix()
    has_nan = np.any(np.isnan(result))
    assert not has_nan
    has_inf = np.any(np.isinf(result))
    assert not has_inf
def test_conditional_encode_conditional_parameters():
    """
    Conditional.encode_conditional_params gives the same outputs as its
    MLP's fprop method
    """
    layer = Linear(layer_name="h", dim=5, irange=0.01, max_col_norm=0.01)
    mlp = MLP(layers=[layer])

    conditional = DummyConditional(mlp=mlp, name="conditional")
    conditional.set_vae(DummyVAE())
    conditional.initialize_parameters(input_space=VectorSpace(dim=5),
                                      ndim=5)

    X = T.matrix("X")
    mlp_Y1, mlp_Y2 = mlp.fprop(X)
    cond_Y1, cond_Y2 = conditional.encode_conditional_params(X)
    f = theano.function([X], [mlp_Y1, mlp_Y2, cond_Y1, cond_Y2])

    batch = as_floatX(numpy.random.uniform(size=(10, 5)))
    out_mlp_1, out_mlp_2, out_cond_1, out_cond_2 = f(batch)
    numpy.testing.assert_allclose(out_mlp_1, out_cond_1)
    numpy.testing.assert_allclose(out_mlp_2, out_cond_2)
def test_grad_alpha(self):
    """
    Tests that the gradient of the log probability with respect to
    alpha matches my analytical derivation.

    The analytical gradient is
    -0.5 * mean_sq_s + mu * mean_hs - 0.5 * mu**2 * mean_h + 0.5 / alpha.

    Raises
    ------
    Exception
        If the largest absolute difference between the two gradients
        exceeds `self.tol`.
    """
    # list(...) so that a list is passed on Python 3 as well, where
    # dict.values() returns a view.
    g = T.grad(self.prob, self.model.alpha,
               consider_constant=list(self.mf_obs.values()))

    mu = self.model.mu
    alpha = self.model.alpha
    half = as_floatX(.5)

    mean_sq_s = self.stats.d['mean_sq_s']
    mean_hs = self.stats.d['mean_hs']
    mean_h = self.stats.d['mean_h']

    term1 = -half * mean_sq_s
    term2 = mu * mean_hs
    term3 = -half * T.sqr(mu) * mean_h
    term4 = half / alpha

    analytical = term1 + term2 + term3 + term4

    f = function([], (g, analytical))

    gv, av = f()

    assert gv.shape == av.shape

    max_diff = np.abs(gv - av).max()

    if max_diff > self.tol:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("gv")
        print(gv)
        print("av")
        print(av)
        raise Exception(
            "analytical gradient on alpha deviates from theano gradient on alpha by up to " + str(max_diff))
def gibbs_step_for_v(self, v, rng):
    """
    Do a round of block Gibbs sampling given visible configuration

    Parameters
    ----------
    v : tensor_like
        Theano symbolic representing the visible unit states for a
        batch of training examples (or negative phase particles),
        with the first dimension indexing training examples and the
        second indexing data dimensions.
    rng : RandomStreams object
        Random number generator to use for sampling the hidden and
        visible units.

    Returns
    -------
    v_sample : tensor_like
        Theano symbolic representing the new visible unit state after
        one round of Gibbs sampling.
    locals : dict
        The local namespace of this function (so the local variable
        names below are part of the returned value and must not be
        renamed). Besides the arguments `self`, `v` and `rng`, it
        contains the following auxiliary state as keys (all symbolics
        except shape tuples):

         * `h_mean`: the returned value from `mean_h_given_v`
         * `h_mean_shape`: shape tuple indicating the size of
           `h_mean` and `h_sample`
         * `h_sample`: the stochastically sampled hidden units
         * `v_mean_shape`: shape tuple indicating the shape of
           `v_mean` and `v_sample`
         * `v_mean`: the returned value from `mean_v_given_h`
         * `v_sample`: the stochastically sampled visible units
    """
    h_mean = self.mean_h_given_v(v)
    # For binary hidden units
    # TODO: factor further to extend to other kinds of hidden units
    #       (e.g. spike-and-slab)
    # Shapes use self.batch_size, not the number of rows of v.
    h_mean_shape = self.batch_size, self.nhid
    h_sample = as_floatX(rng.uniform(size=h_mean_shape) < h_mean)
    v_mean_shape = self.batch_size, self.nvis
    # v_mean is always based on h_sample, not h_mean, because we don't
    # want h transmitting more than one bit of information per unit.
    v_mean = self.mean_v_given_h(h_sample)
    v_sample = self.sample_visibles([v_mean], v_mean_shape, rng)
    return v_sample, locals()
def __init__(self, params, base_lr, anneal_start=None, **kwargs):
    """
    Construct an SGDOptimizer.

    Parameters
    ----------
    params : object or list
        Either a Model object with a .get_params() method, or a list
        of parameters to be optimized.
    base_lr : float
        The base learning rate before annealing or parameter-specific
        scaling.
    anneal_start : int
        Number of steps after which to start annealing the learning
        rate at a 1/t schedule, where t is the number of stochastic
        gradient updates. Stored as a floatX value, or as None when
        not given.

    Raises
    ------
    ValueError
        If `params` is neither iterable nor an object with a callable
        `get_params` attribute.

    Notes
    -----
    The formula to compute the effective learning rate on a parameter
    is: <paramname>_lr * max(0.0, min(base_lr,
    lr_anneal_start/(iteration+1)))

    Parameter-specific learning rates can be set by passing keyword
    arguments <name>_lr, where name is the .name attribute of a given
    parameter.

    Parameter-specific bounding values can be specified by passing
    keyword arguments <param>_clip, which should be a (min, max) pair.
    """
    if hasattr(params, '__iter__'):
        self.params = params
    elif (hasattr(params, 'get_params') and
          hasattr(params.get_params, '__call__')):
        self.params = params.get_params()
    else:
        raise ValueError("SGDOptimizer couldn't figure out what to do "
                         "with first argument: '%s'" % str(params))

    # Identity check: `== None` would be evaluated elementwise if an
    # array were passed.
    if anneal_start is None:
        self.anneal_start = None
    else:
        self.anneal_start = as_floatX(anneal_start)

    # Set up the clipping values
    self.clipping_values = {}
def test_zero_vector(self):
    """
    Test that passing in the zero vector does not result in
    a divide by 0
    """
    zero_input = as_floatX(np.zeros((1, 1)))
    dataset = DenseDesignMatrix(X=zero_input)

    # sqrt_bias = 0.0 is the setting under test: it is the value for
    # which a division by zero would be a risk.
    gcn = GlobalContrastNormalization(subtract_mean=True,
                                      sqrt_bias=0.0,
                                      use_std=True)
    dataset.apply_preprocessor(gcn)

    assert isfinite(dataset.get_design_matrix())
def test_zca():
    """
    Confirm that ZCA.inv_P_ is the correct inverse of ZCA.P_.
    There's a lot else about the ZCA class that could be tested here.
    """
    rng = np.random.RandomState([1, 2, 3])
    data = as_floatX(rng.randn(15, 10))

    zca = ZCA()
    zca.fit(data)

    def is_identity(matrix):
        # True when every entry is within 1e-4 of the identity matrix.
        size = matrix.shape[0]
        deviation = np.abs(np.identity(size, theano.config.floatX) - matrix)
        return (deviation < .0001).all()

    num_features = data.shape[1]
    assert zca.P_.shape == (num_features, num_features)
    # P_ itself must be a non-trivial transform ...
    assert not is_identity(zca.P_)
    # ... that inv_P_ undoes.
    assert is_identity(np.dot(zca.P_, zca.inv_P_))
def test_conditional_encode_conditional_parameters():
    """
    Conditional.encode_conditional_params gives the same outputs as its
    MLP's fprop method
    """
    mlp = MLP(layers=[
        Linear(layer_name='h', dim=5, irange=0.01, max_col_norm=0.01)
    ])
    conditional = DummyConditional(mlp=mlp, name='conditional')
    vae = DummyVAE()
    conditional.set_vae(vae)
    conditional.initialize_parameters(input_space=VectorSpace(dim=5),
                                      ndim=5)

    X = T.matrix('X')
    from_mlp = mlp.fprop(X)
    mlp_Y1, mlp_Y2 = from_mlp
    from_conditional = conditional.encode_conditional_params(X)
    cond_Y1, cond_Y2 = from_conditional

    f = theano.function([X], [mlp_Y1, mlp_Y2, cond_Y1, cond_Y2])
    rval = f(as_floatX(numpy.random.uniform(size=(10, 5))))

    # Outputs 0 and 1 come from the MLP, 2 and 3 from the conditional.
    for mlp_index, cond_index in ((0, 2), (1, 3)):
        numpy.testing.assert_allclose(rval[mlp_index], rval[cond_index])
def sample_visibles(self, params, shape, rng):
    """
    Stochastically sample the visible units given hidden unit
    configurations for a set of training examples.

    Parameters
    ----------
    params : list
        List of the necessary parameters to sample :math:`p(v|h)`. In
        the case of a binary-binary RBM this is a single-element list
        containing the symbolic representing :math:`p(v|h)`, as
        returned by `mean_v_given_h`. Only the first element is used.
    shape : tuple
        Passed as `size` to `rng.uniform`.
    rng : object
        Random number generator providing `uniform(size=...)`.

    Returns
    -------
    vprime : tensor_like
        Theano symbolic representing stochastic samples from
        :math:`p(v|h)`
    """
    v_mean = params[0]
    # A unit is on where the uniform draw falls below its mean.
    uniform_draws = rng.uniform(size=shape)
    is_on = uniform_draws < v_mean
    return as_floatX(is_on)
def test_rgb_yuv():
    """
    Test on a random image that the preprocessor loads and works
    without any error and doesn't result in any nan or inf values
    """
    axes = ['b', 0, 1, 'c']
    rng = np.random.RandomState([1, 2, 3])
    images = as_floatX(rng.randn(5, 32 * 32 * 3))

    converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                         axes)
    dataset = DenseDesignMatrix(X=images, view_converter=converter)
    dataset.axes = axes

    dataset.apply_preprocessor(RGB_YUV())

    assert isfinite(dataset.get_design_matrix())
def test_random_image(self):
    """
    Test on a random image that the preprocessor loads and works
    without any error and doesn't result in any nan or inf values
    """
    axes = ['b', 0, 1, 'c']
    rng = np.random.RandomState([1, 2, 3])
    images = as_floatX(rng.randn(5, 32 * 32 * 3))

    converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                         axes)
    dataset = DenseDesignMatrix(X=images, view_converter=converter)
    dataset.axes = axes

    dataset.apply_preprocessor(LeCunLCN(img_shape=[32, 32]))

    result = dataset.get_design_matrix()
    has_nan = np.any(np.isnan(result))
    assert not has_nan
    has_inf = np.any(np.isinf(result))
    assert not has_inf
def gibbs_step_for_v(self, v, rng):
    """
    Run one Gibbs step starting from `v`: sample h given v, then s, then
    a new v given (h, s).

    Returns the new visible sample together with ``locals()``. Because
    the local namespace is returned, the local variable names used in
    this function are part of its result and must not be renamed;
    `batch_size` is deleted just before returning so that it is not
    included.
    """
    # Sometimes, the number of examples in the data set is not a
    # multiple of self.batch_size.
    batch_size = v.shape[0]

    # sample h given v
    h_mean = self.mean_h_given_v(v)
    h_mean_shape = (batch_size, self.nhid)
    h_sample = as_floatX(rng.uniform(size=h_mean_shape) < h_mean)

    # sample s given (v,h)
    s_mu, s_var = self.mean_var_s_given_v_h1(v)
    s_mu_shape = (batch_size, self.nslab)
    s_sample = s_mu + rng.normal(size=s_mu_shape) * tensor.sqrt(s_var)
    #s_sample=(s_sample.reshape()*h_sample.dimshuffle(0,1,'x')).flatten(2)

    # sample v given (s,h)
    v_mean, v_var = self.mean_var_v_given_h_s(h_sample, s_sample)
    v_mean_shape = (batch_size, self.nvis)
    v_sample = rng.normal(size=v_mean_shape) * tensor.sqrt(v_var) + v_mean

    # Keep batch_size out of the returned locals() dictionary.
    del batch_size
    return v_sample, locals()
def setup(self):
    """
    Create a random 15x10 dataset with one target per example, and set
    the number of components to one fewer than the number of features.
    """
    rng = np.random.RandomState([1, 2, 3])

    # Features are drawn before targets, in that order.
    features = as_floatX(rng.randn(15, 10))
    targets = as_floatX(rng.randn(15, 1))
    self.dataset = DenseDesignMatrix(X=features, y=targets)

    num_features = self.dataset.get_design_matrix().shape[1]
    self.num_components = num_features - 1
def __init__(self, params, base_lr, anneal_start=None, use_adagrad=False,
             **kwargs):
    """
    Construct an SGDOptimizer.

    Parameters
    ----------
    params : object or list
        Either a Model object with a .get_params() method, or a list
        of parameters to be optimized.
    base_lr : float
        The base learning rate before annealing or parameter-specific
        scaling.
    anneal_start : int
        Number of steps after which to start annealing the learning
        rate at a 1/t schedule, where t is the number of stochastic
        gradient updates.
    use_adagrad : bool
        'adagrad' adaptive learning rate scheme is used. If set to
        True, base_lr is used as e0.
    kwargs : dict
        <name>_clip entries are consumed here; everything else is
        forwarded to `learning_rates_setup`.

    Raises
    ------
    ValueError
        If `params` is neither iterable nor an object with a callable
        `get_params` attribute.

    Notes
    -----
    The formula to compute the effective learning rate on a parameter
    is: <paramname>_lr * max(0.0, min(base_lr,
    lr_anneal_start/(iteration+1)))

    Parameter-specific learning rates can be set by passing keyword
    arguments <name>_lr, where name is the .name attribute of a given
    parameter.

    Parameter-specific bounding values can be specified by passing
    keyword arguments <param>_clip, which should be a (min, max) pair.

    Adagrad is recommended with sparse inputs. It normalizes the base
    learning rate of a parameter theta_i by the accumulated 2-norm of
    its gradient: e{ti} = e0 / sqrt( sum_t (dL_t / dtheta_i)^2 )
    """
    if hasattr(params, '__iter__'):
        self.params = params
    elif (hasattr(params, 'get_params') and
          hasattr(params.get_params, '__call__')):
        self.params = params.get_params()
    else:
        raise ValueError("SGDOptimizer couldn't figure out what to do "
                         "with first argument: '%s'" % str(params))

    # Identity check: `== None` would be evaluated elementwise if an
    # array were passed.
    if anneal_start is None:
        self.anneal_start = None
    else:
        self.anneal_start = as_floatX(anneal_start)

    # Create accumulators and epsilon0's
    self.use_adagrad = use_adagrad
    if self.use_adagrad:
        self.accumulators = {}
        self.e0s = {}
        for param in self.params:
            self.accumulators[param] = theano.shared(
                value=as_floatX(0.), name='acc_%s' % param.name)
            self.e0s[param] = as_floatX(base_lr)

    # Set up the clipping values
    self.clipping_values = {}
    # Keep track of names already seen
    clip_names_seen = set()
    for parameter in self.params:
        clip_name = '%s_clip' % parameter.name
        if clip_name in kwargs:
            if clip_name in clip_names_seen:
                # sys.stderr.write replaces the Python 2-only
                # `print >> sys.stderr`; the text is unchanged.
                warning = ('Warning: In SGDOptimizer, '
                           'at least two parameters have the same name. '
                           'Both will be affected by the keyword argument '
                           '%s.' % clip_name)
                sys.stderr.write(warning + '\n')
            clip_names_seen.add(clip_name)
            p_min, p_max = kwargs[clip_name]
            assert p_min <= p_max
            self.clipping_values[parameter] = (p_min, p_max)

    # Check that no ..._clip keyword is being ignored
    for clip_name in clip_names_seen:
        kwargs.pop(clip_name)
    # Iterating the dict directly works on Python 2 and 3 (iterkeys is
    # Python 2 only).
    for kw in kwargs:
        if kw[-5:] == '_clip':
            warning = ('Warning: in SGDOptimizer, '
                       'keyword argument %s will be ignored, '
                       'because no parameter was found with name %s.'
                       % (kw, kw[:-5]))
            sys.stderr.write(warning + '\n')

    self.learning_rates_setup(base_lr, **kwargs)
from theano.compat.six.moves import xrange
import theano.tensor as T
from theano import function
from pylearn2.utils import as_floatX
from pylearn2.utils import sharedX
from pylearn2.linear.matrixmul import MatrixMul

# Module-level fixtures: random parameters for a GRBM_Type_1 energy
# function with nv visible and nh hidden units.
test_m = 2

rng = N.random.RandomState([1, 2, 3])
nv = 3
nh = 4

# Weights
vW = rng.randn(nv, nh)
W = sharedX(vW)

# Visible bias
vbv = as_floatX(rng.randn(nv))
bv = T.as_tensor_variable(vbv)
bv.tag.test_value = vbv

# Hidden bias. The test value must be the numeric array vbh, not the
# symbolic variable itself.
vbh = as_floatX(rng.randn(nh))
bh = T.as_tensor_variable(vbh)
bh.tag.test_value = vbh

# Sigma
vsigma = as_floatX(rng.uniform(0.1, 5))
sigma = T.as_tensor_variable(vsigma)
sigma.tag.test_value = vsigma

E = GRBM_Type_1(transformer=MatrixMul(W), bias_vis=bv,
                bias_hid=bh, sigma=sigma)

V = T.matrix()