def init_parameters(self):
    # marginal precision on visible units
    self.lambd = sharedX(self.iscales['lambd'] * numpy.ones(self.n_v), name='lambd')
    # init scalar norm for each entry of Wv
    sn_val = self.iscales['scalar_norms'] * numpy.ones(self.n_f)
    self.scalar_norms = sharedX(sn_val, name='scalar_norms')
    # init weight matrices
    self.Wv = self.init_weight(1.0, (self.n_v, self.n_f), 'Wv')
    if self.sparse_gmask or self.sparse_hmask:
        assert self.sparse_gmask and self.sparse_hmask
        self.Wg = sharedX(self.sparse_gmask.mask * self.iscales.get('Wg', 1.0), name='Wg')
        self.Wh = sharedX(self.sparse_hmask.mask * self.iscales.get('Wh', 1.0), name='Wh')
    else:
        self.Wg = self.init_weight(1.0, (self.n_g, self.n_f), 'Wg')
        self.Wh = self.init_weight(1.0, (self.n_h, self.n_f), 'Wh')
    # bias parameters of g, h
    self.gbias = sharedX(self.iscales['gbias'] * numpy.ones(self.n_g), name='gbias')
    self.hbias = sharedX(self.iscales['hbias'] * numpy.ones(self.n_h), name='hbias')
    # mean (mu) and precision (alpha) parameters on s
    self.mu = sharedX(self.iscales['mu'] * numpy.ones(self.n_g), name='mu')
    self.alpha = sharedX(self.iscales['alpha'] * numpy.ones(self.n_g), name='alpha')
    # mean (eta) and precision (beta) parameters on t
    self.eta = sharedX(self.iscales['eta'] * numpy.ones(self.n_h), name='eta')
    self.beta = sharedX(self.iscales['beta'] * numpy.ones(self.n_h), name='beta')
    # optional reparametrization of precision parameters
    self.lambd_prec = T.nnet.softplus(self.lambd)
    self.alpha_prec = T.nnet.softplus(self.alpha)
    self.beta_prec = T.nnet.softplus(self.beta)
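# Every snippet in this section allocates parameters through a `sharedX` helper
# that is not defined here. It is assumed to be the usual convenience wrapper
# around theano.shared that casts its argument to theano.config.floatX; a
# minimal sketch under that assumption:
import numpy
import theano

def sharedX(value, name=None, borrow=False):
    """Return a Theano shared variable holding `value` cast to floatX."""
    return theano.shared(numpy.asarray(value, dtype=theano.config.floatX),
                         name=name, borrow=borrow)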
def init_chains(self): """ Allocate shared variable for persistent chain """ # initialize buffers to store inference state self.pos_g = sharedX(numpy.zeros((self.batch_size, self.n_g)), name='pos_g') self.pos_h = sharedX(numpy.zeros((self.batch_size, self.n_h)), name='pos_h') self.pos_s1 = sharedX(numpy.zeros((self.batch_size, self.n_s)), name='pos_s1') self.pos_s0 = sharedX(numpy.zeros((self.batch_size, self.n_s)), name='pos_s0') # initialize visible unit chains scale = numpy.sqrt(1./softplus(self.lambd.get_value())) neg_v = self.rng.normal(loc=0, scale=scale, size=(self.batch_size, self.n_v)) self.neg_v = sharedX(neg_v, name='neg_v') # initialize s-chain loc = self.mu.get_value() scale = numpy.sqrt(1./softplus(self.alpha.get_value())) neg_s = self.rng.normal(loc=loc, scale=scale, size=(self.batch_size, self.n_s)) self.neg_s = sharedX(neg_s, name='neg_s') # initialize binary g-h chains pval_g = sigm(self.gbias.get_value()) pval_h = sigm(self.hbias.get_value()) neg_g = self.rng.binomial(n=1, p=pval_g, size=(self.batch_size, self.n_g)) neg_h = self.rng.binomial(n=1, p=pval_h, size=(self.batch_size, self.n_h)) self.neg_h = sharedX(neg_h, name='neg_h') self.neg_g = sharedX(neg_g, name='neg_g') # other misc. self.pos_counter = sharedX(0., name='pos_counter') self.odd_even = sharedX(0., name='odd_even')
def init_parameters(self):
    # init scalar norm for each entry of Wv
    sn_val = self.iscales['scalar_norms'] * numpy.ones(self.n_s)
    self.scalar_norms = sharedX(sn_val, name='scalar_norms')
    # init weight matrices
    normalize_wv = self.flags['wv_norm'] == 'unit'
    self.Wv = self.init_weight(self.iscales['Wv'], (self.n_v, self.n_s), 'Wv', normalize=normalize_wv)
    if self.sparse_gmask or self.sparse_hmask:
        assert self.sparse_gmask and self.sparse_hmask
        self.Wg = sharedX(self.sparse_gmask.mask * self.iscales.get('Wg', 1.0), name='Wg')
        self.Wh = sharedX(self.sparse_hmask.mask * self.iscales.get('Wh', 1.0), name='Wh')
    else:
        normalize_wg = self.flags['wg_norm'] == 'unit'
        normalize_wh = self.flags['wh_norm'] == 'unit'
        self.Wg = self.init_weight(self.iscales['Wg'], (self.n_g, self.n_s), 'Wg', normalize=normalize_wg)
        self.Wh = self.init_weight(self.iscales['Wh'], (self.n_h, self.n_s), 'Wh', normalize=normalize_wh)
    # avg norm (for wgh_norm='roland')
    norm_wg = numpy.sqrt(numpy.sum(self.Wg.get_value()**2, axis=0)).mean()
    norm_wh = numpy.sqrt(numpy.sum(self.Wh.get_value()**2, axis=0)).mean()
    self.avg_norm_wg = sharedX(norm_wg, name='avg_norm_wg')
    self.avg_norm_wh = sharedX(norm_wh, name='avg_norm_wh')
    # allocate shared variables for bias parameters
    self.gbias = sharedX(self.iscales['gbias'] * numpy.ones(self.n_g), name='gbias')
    self.hbias = sharedX(self.iscales['hbias'] * numpy.ones(self.n_h), name='hbias')
    self.vbias = sharedX(self.iscales['vbias'] * numpy.ones(self.n_v), name='vbias')
    # mean (mu) and precision (alpha) parameters on s
    self.mu = sharedX(self.iscales['mu'] * numpy.ones(self.n_s), name='mu')
    self.alpha = sharedX(self.iscales['alpha'] * numpy.ones(self.n_s), name='alpha')
    self.alpha_prec = T.nnet.softplus(self.alpha)
def __init__(self, conf, numpy_rng, W, Lambda):
    """
    :param W: a LinearTransform instance for the weights.
    :param Lambda: a LinearTransform instance, parametrizing the h-dependent
        precision information regarding visibles.
    """
    self.conf = conf
    self.W = W
    self.Lambda = Lambda
    if Lambda:
        if W.col_shape() != Lambda.col_shape():
            raise ValueError('col_shape mismatch', (W.col_shape(), Lambda.col_shape()))
        if W.row_shape() != Lambda.row_shape():
            raise ValueError('row_shape mismatch', (W.row_shape(), Lambda.row_shape()))
    # Energy term has vW(sh), so...
    h_shp = self.h_shp = W.col_shape()
    s_shp = self.s_shp = W.col_shape()
    v_shp = self.v_shp = W.row_shape()
    logger.info("RBM Shapes h_shp=%s, s_shp=%s, v_shp=%s" % (h_shp, s_shp, v_shp))
    # alpha (precision on slab variables)
    alpha_init = numpy.zeros(s_shp) + conf['alpha0']
    if conf['alpha_irange']:
        alpha_init += (2 * numpy_rng.rand(*s_shp) - 1) * conf['alpha_irange']
    if conf['alpha_logdomain']:
        self.alpha = sharedX(numpy.log(alpha_init), name='alpha')
    else:
        self.alpha = sharedX(alpha_init, name='alpha')
    # mu (mean of slab vars)
    self.mu = sharedX(
        conf['mu0'] + numpy_rng.uniform(size=s_shp, low=-conf['mu_irange'], high=conf['mu_irange']),
        name='mu')
    # b (bias of spike vars)
    self.b = sharedX(
        conf['b0'] + numpy_rng.uniform(size=h_shp, low=-conf['b_irange'], high=conf['b_irange']),
        name='b')
    # B (precision on visible vars)
    if conf['B_full_diag']:
        B_init = numpy.zeros(v_shp) + conf['B0']
    else:
        B_init = numpy.zeros(()) + conf['B0']
    if conf['B_logdomain']:
        B_init = numpy.log(B_init)
    self.B = sharedX(B_init, name='B')
    self._params = [self.mu, self.B, self.b, self.alpha]
def __init__(self, name, hidden_dim, input_dim, init_std):
    super(RNN, self).__init__(name, trainable=True)
    self.hidden_dim = hidden_dim
    self.Wx = sharedX(np.random.randn(input_dim, hidden_dim) * init_std, name=name + '/Wx')
    self.Wh = sharedX(np.random.randn(hidden_dim, hidden_dim) * init_std, name=name + '/Wh')
    self.b = sharedX(np.zeros((hidden_dim)), name=name + '/b')
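# The RNN constructor above only allocates Wx, Wh and b; its step function is
# not shown. A minimal sketch of how a vanilla-RNN recurrence would use these
# parameters -- the function name `rnn_step` and the tanh nonlinearity are
# assumptions for illustration, not part of the original layer:
import theano.tensor as T

def rnn_step(x_t, h_tm1, Wx, Wh, b):
    # one recurrence step: h_t = tanh(x_t Wx + h_{t-1} Wh + b)
    return T.tanh(T.dot(x_t, Wx) + T.dot(h_tm1, Wh) + b)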
def init_parameters(self):
    # init weight matrices
    self.Wv = self.init_weight(self.iscales.get('Wv', 1.0), (self.n_v, self.n_h), 'Wv', normalize=False)
    # allocate shared variables for bias parameters
    self.hbias = sharedX(self.iscales['hbias'] * numpy.ones(self.n_h), name='hbias')
    # diagonal of precision matrix of visible units
    self.lambd = sharedX(self.iscales['lambd'] * numpy.ones(self.n_v), name='lambd')
    self.lambd_prec = T.nnet.softplus(self.lambd)
def init_parameters(self):
    # init weight matrices
    self.Wv = self.init_weight(self.iscales.get('Wv', 1.0), (self.n_v, self.n_h), 'Wv')
    # allocate shared variables for bias parameters
    self.vbias = sharedX(self.iscales['vbias'] * numpy.ones(self.n_v), name='vbias')
    self.hbias = sharedX(self.iscales['hbias'] * numpy.ones(self.n_h), name='hbias')
    self.cv = sharedX(numpy.zeros(self.n_v), name='cv')
    ch = numpy.ones(self.n_h) * (0.5 if self.flags['enable_centering'] else 0.)
    self.ch = sharedX(ch, name='ch')
def init_parameters(self):
    # init weight matrices
    self.Wv = self.init_weight(self.iscales.get('Wv', 1.0), (self.n_v, self.n_h), 'Wv')
    # allocate shared variables for bias parameters
    self.vbias = sharedX(self.iscales['vbias'] * numpy.ones(self.n_v), name='vbias')
    self.hbias = sharedX(self.iscales['hbias'] * numpy.ones(self.n_h), name='hbias')
def __init__(self):
    rng = numpy.random.RandomState(123)
    self.Wv = sharedX(0.1 * rng.randn(14*14, 10), name='Wv')
    self.hbias = sharedX(-1 * numpy.ones(10), name='hbias')
    self.alpha = sharedX(0.1 * rng.rand(10), name='alpha')
    self.mu = sharedX(0.1 * numpy.ones(10), name='mu')
    self.lambd = sharedX(1.0 * numpy.ones(10), name='lambd')
    self.bw_s = 1
    self.n_h = 10
    self.input = T.matrix('input')
def init_params(options):
    params = OrderedDict()
    params['W_users'] = sharedX(normal((options['n_users'], options['n_factors'])), name='W_users')
    params['W_items'] = sharedX(normal((options['n_items'], options['n_factors'])), name='W_items')
    params['b_users'] = sharedX(np.zeros((options['n_users'], 1)), name='b_users')
    params['b_items'] = sharedX(np.zeros((options['n_items'], 1)), name='b_items')
    params['b'] = sharedX(np.zeros(1), name='b')
    return params
def __init__(self, conf, rbm, sampler, visible_batch, clippers):
    self.conf = conf
    self.rbm = rbm
    self.sampler = sampler
    self.visible_batch = visible_batch
    self.iter = sharedX(0, 'iter')
    self.annealing_coef = sharedX(0.0, 'annealing_coef')
    self.lr_dict = lr_dict = {}
    for p in rbm.params():
        lrname = '%s_lr' % p.name
        lr_dict[p] = sharedX(conf.get(lrname, 1.0), lrname)
    self.clippers = clippers
def init_chains(self): """ Allocate shared variable for persistent chain """ # initialize visible unit chains scale = numpy.sqrt(1./softplus(self.lambd.get_value())) neg_v = self.rng.normal(loc=0, scale=scale, size=(self.batch_size, self.n_v)) self.neg_v = sharedX(neg_v, name='neg_v') # initialize s-chain scale = numpy.sqrt(1./softplus(self.alpha.get_value())) neg_s = self.rng.normal(loc=0., scale=scale, size=(self.batch_size, self.n_s)) self.neg_s = sharedX(neg_s, name='neg_s') # initialize binary g-h chains pval_h = sigm(self.hbias.get_value()) neg_h = self.rng.binomial(n=1, p=pval_h, size=(self.batch_size, self.n_h)) self.neg_h = sharedX(neg_h, name='neg_h')
def get_updates(grads, momentum_lambda=None):
    """
    Returns an updates dictionary corresponding to a single step of SGD.

    :param grads: dictionary mapping each shared parameter to its gradient;
        the gradients are expected to already include the learning rate, since
        the applied update is simply `param - grads[param]`.
    :param momentum_lambda: optional momentum coefficient; when given, the
        applied update is an exponential moving average of past gradients,
        e.g. new_grad = (1 - momentum_lambda) * grad + momentum_lambda * old_grad.
    """
    updates = OrderedDict()
    momentum = OrderedDict()
    for (param, gparam) in grads.iteritems():
        if momentum_lambda:
            # create storage for momentum term
            momentum[param] = sharedX(numpy.zeros_like(param.get_value()), name=param.name + '_old')
            new_grad = (1. - momentum_lambda) * gparam + momentum_lambda * momentum[param]
            updates[param] = param - new_grad
            updates[momentum[param]] = new_grad
        else:
            updates[param] = param - gparam
    return updates
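# A minimal usage sketch for get_updates above. The symbols x, cost, params and
# the helper name compile_train_fn are hypothetical placeholders, not part of
# the original code; the point is only to show the expected shape of `grads`.
import theano
import theano.tensor as T
from collections import OrderedDict

def compile_train_fn(x, cost, params, lr=0.01, momentum_lambda=0.9):
    # scale each gradient by the learning rate before handing it to get_updates,
    # since the returned update is simply `param - grads[param]`
    grads = OrderedDict((p, lr * g) for p, g in zip(params, T.grad(cost, params)))
    updates = get_updates(grads, momentum_lambda=momentum_lambda)
    return theano.function([x], cost, updates=updates)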
def load_penn_treebank(path, sequence_length=100, return_raw=False):
    ''' Loads the Penn Treebank dataset

    Parameters
    ----------
    path : str
        The path to the dataset file (.npz).
    sequence_length : int, optional
        All sequences of characters will have the same length.

    References
    ----------
    This dataset comes from https://github.com/GabrielPereyra/norm-rnn/tree/master/data.
    '''
    if not os.path.isfile(path):
        # Download the dataset.
        data_dir, data_file = os.path.split(path)
        ptb_zipfile = os.path.join(data_dir, 'ptb.zip')
        if not os.path.isfile(ptb_zipfile):
            import urllib
            origin = 'https://www.dropbox.com/s/9hwo2392mfgnnlu/ptb.zip?dl=1'  # Marc's dropbox, TODO: put that somewhere else.
            print("Downloading data (2 Mb) from {} ...".format(origin))
            urllib.urlretrieve(origin, ptb_zipfile)

        # Load the dataset and process it.
        print("Processing data ...")
        with zipfile.ZipFile(ptb_zipfile) as f:
            train = "\n".join((l.lstrip() for l in f.read('ptb.train.txt').split('\n')))
            valid = "\n".join((l.lstrip() for l in f.read('ptb.valid.txt').split('\n')))
            test = "\n".join((l.lstrip() for l in f.read('ptb.test.txt').split('\n')))

        chars = list(set(train) | set(valid) | set(test))
        data_size = len(train) + len(valid) + len(test)
        vocab_size = len(chars)
        print("Dataset has {:,} characters ({:,} | {:,} | {:,}), {:,} unique.".format(
            data_size, len(train), len(valid), len(test), vocab_size))
        words = train.split(), valid.split(), test.split()
        n_words = len(words[0]) + len(words[1]) + len(words[2])
        print("Dataset has {:,} words ({:,} | {:,} | {:,}), {:,} unique.".format(
            n_words, len(words[0]), len(words[1]), len(words[2]),
            len(set(words[0]) | set(words[1]) | set(words[2]))))

        chr2idx = {c: i for i, c in enumerate(chars)}
        idx2chr = {i: c for i, c in enumerate(chars)}
        train = np.array([chr2idx[c] for c in train], dtype=np.int8)
        valid = np.array([chr2idx[c] for c in valid], dtype=np.int8)
        test = np.array([chr2idx[c] for c in test], dtype=np.int8)
        np.savez(path, train=train, valid=valid, test=test, chr2idx=chr2idx, idx2chr=idx2chr)

    print("Loading data ...")
    ptb = np.load(path)
    if return_raw:
        return (ptb['train'], ptb['valid'], ptb['test']), ptb['idx2chr'].item()

    # datasets = [_shared_dataset(_split_into_sequences(d, sequence_length)) for d in [ptb['train'], ptb['valid'], ptb['test']]]
    datasets = [utils.sharedX(_split_into_sequences(d, sequence_length))
                for d in [ptb['train'], ptb['valid'], ptb['test']]]
    return datasets, ptb['idx2chr'].item()
def cd_updates(self, pos_v, neg_v, lr, other_cost=0):
    grads = contrastive_grad(self.free_energy_given_v,
                             pos_v, neg_v,
                             wrt=self.params(),
                             other_cost=other_cost)
    stepsizes = lr
    if self.conf.get('momentum', 0.0):
        logger.info('Using momentum %s' % self.conf['momentum'])
        rval = dict(sgd_momentum_updates(
            self.params(), grads,
            stepsizes=stepsizes,
            momentum=self.conf['momentum']))
    else:
        rval = dict(sgd_updates(
            self.params(), grads,
            stepsizes=stepsizes))
    # DEBUG: store grads in shared variables so they can be inspected
    grad_shared_vars = [sharedX(0 * p.get_value(), '') for p in self.params()]
    self.grad_shared_vars = grad_shared_vars
    rval.update(dict(zip(grad_shared_vars, grads)))
    return rval
def orthogonal(shape, scale=1.1):
    """ benanne lasagne ortho init (faster than qr approach) """
    flat_shape = (shape[0], np.prod(shape[1:]))
    a = np.random.normal(0.0, 1.0, flat_shape)
    u, _, v = np.linalg.svd(a, full_matrices=False)
    q = u if u.shape == flat_shape else v  # pick the one with the correct shape
    q = q.reshape(shape)
    return sharedX(scale * q[:shape[0], :shape[1]])
def __call__(self, shape, name=None):
    if len(shape) == 2:
        scale = np.sqrt(2./shape[0])
    elif len(shape) == 4:
        scale = np.sqrt(2./np.prod(shape[1:]))
    else:
        raise NotImplementedError
    return sharedX(np_rng.normal(size=shape, scale=scale), name=name)
def __call__(self, shape, name=None):
    print 'called orthogonal init with shape', shape
    flat_shape = (shape[0], np.prod(shape[1:]))
    a = np_rng.normal(0.0, 1.0, flat_shape)
    u, _, v = np.linalg.svd(a, full_matrices=False)
    q = u if u.shape == flat_shape else v  # pick the one with the correct shape
    q = q.reshape(shape)
    return sharedX(self.scale * q[:shape[0], :shape[1]], name=name)
def train(options, train_data, valid_data, test_data):
    np.random.seed(12345)
    if not os.path.exists(options['saveto']):
        os.makedirs(options['saveto'])

    print 'Building the model...'
    params = init_params(options)
    users_id, items_id, bow, y, y_pred, bow_pred, mse, nll, cost = build_model(options, params)

    print 'Computing gradients...'
    lrt = sharedX(options['lr'])
    grads = T.grad(cost, params.values())
    updates = sgd(params.values(), grads, lrt)

    print 'Compiling theano functions...'
    eval_fn = theano.function([users_id, items_id, y], mse)
    train_fn = theano.function([users_id, items_id, bow, y], [cost, mse, nll], updates=updates)

    print "Training..."
    train_iter = MultiFixDimIterator(*train_data, batch_size=options['batch_size'], shuffle=True)
    valid_iter = MultiFixDimIterator(*valid_data, batch_size=100)
    test_iter = MultiFixDimIterator(*test_data, batch_size=100)
    best_valid = float('inf')
    best_test = float('inf')
    n_batches = np.ceil(train_data[0].shape[0] * 1. / options['batch_size']).astype('int')
    disp_str = ['Train COST', 'Train MSE', 'Train NLL']

    for eidx in range(options['n_epochs']):
        accum_cost, accum_mse, accum_nll = 0., 0., 0.
        for batch in train_iter:
            batch = prepare_batch_data(options, batch)
            b_cost, b_mse, b_nll = train_fn(*batch)
            accum_cost += b_cost
            accum_mse += b_mse
            accum_nll += b_nll
        disp_val = [val / n_batches for val in [accum_cost, accum_mse, accum_nll]]
        res_str = ('[%d] ' % eidx) + ", ".join("%s: %.4f" % (s, v) for s, v in zip(disp_str, disp_val))
        print res_str

        if (eidx + 1) % options['valid_freq'] == 0:
            disp_val = [np.mean([eval_fn(*vbatch) for vbatch in valid_iter]),
                        np.mean([eval_fn(*tbatch) for tbatch in test_iter])]
            res_str = ", ".join("%s: %.4f" % (s, v) for s, v in zip(['Valid MSE', 'Test MSE'], disp_val))
            print res_str
            if best_valid > disp_val[0]:
                best_valid, best_test = disp_val
                dump_params(options['saveto'], eidx, "best_params", params)

    print "Done training..."
    print "Best Valid MSE: %.4f and Test MSE: %.4f" % (best_valid, best_test)
def init_chains(self): """ Allocate shared variable for persistent chain """ # initialize s-chain loc = self.mu.get_value() scale = numpy.sqrt(1./softplus(self.alpha.get_value())) neg_s = self.rng.normal(loc=loc, scale=scale, size=(self.batch_size, self.n_s)) self.neg_s = sharedX(neg_s, name='neg_s') # initialize binary v chains pval_v = sigm(self.vbias.get_value()) neg_v = self.rng.binomial(n=1, p=pval_v, size=(self.batch_size, self.n_v)) self.neg_v = sharedX(neg_v, name='neg_v') # initialize binary h chains pval_h = sigm(self.hbias.get_value()) neg_h = self.rng.binomial(n=1, p=pval_h, size=(self.batch_size, self.n_h)) self.neg_h = sharedX(neg_h, name='neg_h') # moving average values for sparsity self.sp_pos_v = sharedX(neg_v, name='sp_pos_v') self.sp_pos_h = sharedX(neg_h, name='sp_pos_h')
def __call__(self, shape, name=None):
    if shape[0] != shape[1]:
        w = np.zeros(shape)
        o_idxs = np.arange(shape[0])
        i_idxs = np.random.permutation(np.tile(np.arange(shape[1]), shape[0]/shape[1]+1))[:shape[0]]
        w[o_idxs, i_idxs] = self.scale
    else:
        w = np.identity(shape[0]) * self.scale
    return sharedX(w, name=name)
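# The __call__ initializers above (fan-in scaled Gaussian, SVD-based orthogonal,
# permuted-identity) are assumed to be methods of small callable classes that
# hold a `scale` attribute and share a module-level `np_rng`. A hypothetical
# sketch of that wrapper pattern -- the class names below are illustrative, not
# the original API:
import numpy as np

np_rng = np.random.RandomState(42)   # module-level RNG assumed by the initializers

class Normal(object):
    """Hypothetical host class for the fan-in scaled Gaussian __call__ above."""
    # def __call__(self, shape, name=None): ... (as defined above)

class Orthogonal(object):
    """Hypothetical host class for the SVD-based orthogonal __call__ above."""
    def __init__(self, scale=1.1):
        self.scale = scale
    # def __call__(self, shape, name=None): ... (as defined above)

# usage: W_fc = Normal()((784, 256), name='fc1/W')
#        W_rec = Orthogonal(scale=1.0)((256, 256), name='rnn/Wh')
# Each call returns a floatX Theano shared variable (via sharedX) ready to be
# used as a layer parameter.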
def __init__(self, rbm, particles, rng):
    if not hasattr(rng, 'randn'):
        rng = numpy.random.RandomState(rng)
    seed = int(rng.randint(2**30))
    self.rbm = rbm
    self.n_particles = particles.shape[0]
    assert particles.shape[1:] == rbm.v_shp
    self.particles = sharedX(particles, name='particles')
    self.s_rng = RandomStreams(seed)
def get_updates(self, cost, params):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        d = sharedX(p.get_value() * 0.0)
        new_d = self.mm * d - self.lr * (g + self.wd * p)
        updates.append((d, new_d))
        updates.append((p, p + new_d))
    return updates
def get_updates(self, cost, params):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, dx in zip(params, grads):
        # scalar cache: holds the squared L2 norm of the gradient from the
        # previous step; the current step is scaled by 1 / (sqrt(cache) + eps)
        cache = sharedX(0)
        new_d = self.lr * dx / (T.sqrt(cache) + self.eps)
        updates.append((cache, T.sum(dx * dx)))
        updates.append((p, p - new_d))
    return updates
def get_updates(self, cost, params):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, dx in zip(params, grads):
        # scalar cache: holds the L2 norm of the gradient from the previous step
        cache = sharedX(0)
        delta = self.lr * dx / (T.sqrt(cache) + self.eps)
        updates.append((cache, dx.norm(2)))
        updates.append((p, p - delta))
    return updates
def init_chains(self): """ Allocate shared variable for persistent chain """ # initialize visible unit chains scale = numpy.sqrt(1. / softplus(self.lambd.get_value())) neg_v = self.rng.normal(loc=0, scale=scale, size=(self.batch_size, self.n_v)) self.neg_v = sharedX(neg_v, name='neg_v') # initialize s-chain scale = numpy.sqrt(1. / softplus(self.alpha.get_value())) neg_s = self.rng.normal(loc=0., scale=scale, size=(self.batch_size, self.n_s)) self.neg_s = sharedX(neg_s, name='neg_s') # initialize binary g-h chains pval_h = sigm(self.hbias.get_value()) neg_h = self.rng.binomial(n=1, p=pval_h, size=(self.batch_size, self.n_h)) self.neg_h = sharedX(neg_h, name='neg_h')
def init_chains(self): """ Allocate shared variable for persistent chain """ self.neg_ev = sharedX(self.rng.rand(self.batch_size, self.n_v), name='neg_ev') self.neg_h = sharedX(self.rng.rand((self.cratio+1)*self.batch_size, self.n_h), name='neg_h') self.neg_v = sharedX(self.rng.rand((self.cratio+1)*self.batch_size, self.n_v), name='neg_v') self.beta = sharedX(numpy.ones((self.cratio+1)*self.batch_size), name='betas') self.beta_mat = T.shape_padright(self.beta) ### CAST is mostly implemented in numpy ### # Generate range of possible temperatures self._betas = numpy.linspace(1.0, self.min_beta, self.num_beta).astype(floatX) # Chain i is at inverse temperatures betas[beta_idx[i]]. self.beta_idx = self.rng.random_integers(low=0, high=self.num_beta-1, size=(self.cratio * self.batch_size)) self.beta_logw = numpy.zeros(self.num_beta) self.swap_timer = 1 # Beta weights (adaptive weights for WL) self.update_temperatures()
def init_centering(self):
    self.avg_pos_g = sharedX(0.5 * numpy.ones(self.n_g), name='avg_pos_g')
    self.avg_pos_h = sharedX(0.5 * numpy.ones(self.n_h), name='avg_pos_h')
    self.avg_pos_v = sharedX(numpy.zeros(self.n_v), name='avg_pos_v')
    self.avg_pos_g_tm1 = sharedX(0. * numpy.ones(self.n_g), name='avg_pos_g_tm1')
    self.avg_pos_h_tm1 = sharedX(0. * numpy.ones(self.n_h), name='avg_pos_h_tm1')
    self.avg_pos_v_tm1 = sharedX(numpy.zeros(self.n_v), name='avg_pos_v_tm1')
def init_parameters(self):
    assert self.sparse_hmask
    # init scalar norm for each entry of Wv
    sn_val = self.iscales['scalar_norms'] * numpy.ones(self.n_s)
    self.scalar_norms = sharedX(sn_val, name='scalar_norms')
    if self.flags['igo_init']:
        print 'Overriding iscales initialization with 1./sqrt(nv x nh)'
        self.iscales['Wv'] = 1./numpy.sqrt(max(self.n_v, self.n_s))
        self.iscales['Wg'] = 1./numpy.sqrt(max(self.n_g, self.n_s))
        self.iscales['Wh'] = 1./numpy.sqrt(max(self.n_h, self.n_s))
    # Init (visible, slabs) weight matrix.
    self.Wv = self.init_weight(self.iscales['Wv'], (self.n_v, self.n_s), 'Wv',
                               normalize=(self.flags['wv_norm'] == 'unit'))
    # Initialize (slab, hidden) pooling matrix
    self.Wh = sharedX(self.sparse_hmask.mask.T * self.iscales.get('Wh', 1.0), name='Wh')
    # Initialize (slabs, g-unit) weight matrix.
    if self.sparse_gmask:
        self.Wg = sharedX(self.sparse_gmask.mask.T * self.iscales.get('Wg', 1.0), name='Wg')
    else:
        self.Wg = self.init_weight(self.iscales['Wg'], (self.n_s, self.n_g), 'Wg')
    # allocate shared variables for bias parameters
    self.gbias = sharedX(self.iscales['gbias'] * numpy.ones(self.n_g), name='gbias')
    self.hbias = sharedX(self.iscales['hbias'] * numpy.ones(self.n_h), name='hbias')
    self.cg = sharedX(0.5 * numpy.ones(self.n_g), name='cg')
    self.ch = sharedX(0.5 * numpy.ones(self.n_h), name='ch')
    # mean (mu) and precision (alpha) parameters on s
    self.mu = sharedX(self.iscales['mu'] * numpy.ones(self.n_s), name='mu')
    self.alpha = sharedX(self.iscales['alpha'] * numpy.ones(self.n_s), name='alpha')
    self.alpha_prec = T.nnet.softplus(self.alpha)
    # diagonal of precision matrix of visible units
    self.lambd = sharedX(self.iscales['lambd'] * numpy.ones(self.n_v), name='lambd')
    self.lambd_prec = T.nnet.softplus(self.lambd)
def init_parameters(self):
    assert self.sparse_hmask
    # Init (visible, slabs) weight matrix.
    self.Wv = self.init_weight(self.iscales['Wv'], (self.n_v, self.n_s), 'Wv', normalize=True)
    self.norm_wv = T.sqrt(T.sum(self.Wv**2, axis=0))
    self.mu = sharedX(self.iscales['mu'] * numpy.ones(self.n_s), name='mu')
    # Initialize (slab, hidden) pooling matrix
    self.Wh = sharedX(self.sparse_hmask.mask.T * self.iscales.get('Wh', 1.0), name='Wh')
    # allocate shared variables for bias parameters
    self.hbias = sharedX(self.iscales['hbias'] * numpy.ones(self.n_h), name='hbias')
    self.ch = sharedX(0.5 * numpy.ones(self.n_h), name='ch')
    # precision (alpha) parameters on s
    self.alpha = sharedX(self.iscales['alpha'] * numpy.ones(self.n_s), name='alpha')
    self.alpha_prec = T.nnet.softplus(self.alpha)
    # diagonal of precision matrix of visible units
    self.lambd = sharedX(self.iscales['lambd'] * numpy.ones(self.n_v), name='lambd')
    self.lambd_prec = T.nnet.softplus(self.lambd)
def init_parameters_from_model(self, model):
    self.scalar_norms = model.scalar_norms
    self.Wv = model.Wv
    self.Wg = model.Wg
    self.Wh = model.Wh
    self.avg_norm_wg = model.avg_norm_wg
    self.avg_norm_wh = model.avg_norm_wh
    self.gbias = model.gbias
    self.hbias = sharedX(self.iscales['hbias'] * numpy.ones(self.n_h), name='hbias')
    self.vbias = model.vbias
    self.mu = model.mu
    self.alpha = model.alpha
    self.alpha_prec = model.alpha_prec
def __call__(self, shape, name=None):
    w = np.zeros(shape)
    ycenter = shape[2]//2
    xcenter = shape[3]//2
    if shape[0] == shape[1]:
        o_idxs = np.arange(shape[0])
        i_idxs = np.arange(shape[1])
    elif shape[1] < shape[0]:
        o_idxs = np.arange(shape[0])
        i_idxs = np.random.permutation(np.tile(np.arange(shape[1]), shape[0]/shape[1]+1))[:shape[0]]
    w[o_idxs, i_idxs, ycenter, xcenter] = self.scale
    return sharedX(w, name=name)
def get_updates(self, cost, params):
    # RMSProp-style update: keep a running average of squared gradients per parameter
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        cache = sharedX(p.get_value() * 0.0)
        new_cache = self.rho * cache + (1 - self.rho) * g**2
        new_p = p - self.lr * g / (T.sqrt(new_cache) + self.eps)
        updates.append((cache, new_cache))
        updates.append((p, new_p))
    return updates
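# A hypothetical helper showing how the class-based optimizers above are
# typically consumed; `compile_train_step` and its arguments are illustrative
# placeholders, not part of the original code.
import theano

def compile_train_step(inputs, cost, params, optimizer):
    # `optimizer` is any object exposing get_updates(cost, params) as above
    updates = optimizer.get_updates(cost, params)
    return theano.function(inputs, cost, updates=updates)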
def __setattr__(self, name, array):
    params = self.get_dict()
    if name not in params:
        params[name] = sharedX(array, name=name)
    else:
        print "%s already assigned" % name
        if array.shape != params[name].get_value().shape:
            raise ValueError('Shape mismatch for the new value you want to assign '
                             'to %s' % name)
    params[name].set_value(np.asarray(array, dtype=theano.config.floatX), borrow=True)
def init_chains(self): """ Allocate shared variable for persistent chain """ self.neg_g = sharedX(self.rng.rand(self.batch_size, self.n_g), name='neg_g') self.neg_s = sharedX(self.rng.rand(self.batch_size, self.n_g), name='neg_s') self.neg_h = sharedX(self.rng.rand(self.batch_size, self.n_h), name='neg_h') self.neg_t = sharedX(self.rng.rand(self.batch_size, self.n_h), name='neg_t') self.neg_v = sharedX(self.rng.rand(self.batch_size, self.n_v), name='neg_v') self.neg_ev = sharedX(self.rng.rand(self.batch_size, self.n_v), name='neg_ev')
def updates(self, with_s_mu=False):
    new_particles, _locals = self.rbm.gibbs_step_for_v(
        self.particles, self.s_rng, return_locals=True)
    if with_s_mu:
        if not hasattr(self.rbm, 's_sample'):
            shp = (self.n_particles,) + self.rbm.s_shp
            self.rbm.s_sample = sharedX(numpy.zeros(shp), 's_sample')
        return {self.particles: new_particles,
                self.rbm.s_sample: _locals['s_mu']}
    else:
        return {self.particles: new_particles}
def __setattr__(self, name, array):
    params = self.__dict__['params']
    if name not in params:
        params[name] = sharedX(array, name=name)
    else:
        print "%s already assigned" % name
        if array.shape != params[name].get_value().shape:
            raise ValueError('Shape mismatch for the new value you want to assign '
                             'to %s' % name)
    params[name].set_value(np.asarray(array, dtype=theano.config.floatX), borrow=True)
def init_parameters(self):
    assert self.sparse_hmask
    # Init (visible, slabs) weight matrix.
    self.Wv = self.init_weight(self.iscales['Wv'], (self.n_v, self.n_s), 'Wv',
                               normalize=(self.flags['wv_norm'] == 'unit'))
    self.gamma = sharedX(numpy.ones(self.n_s), 'gamma')
    self._Wv = 1./self.gamma * self.Wv
    self.norm_wv = T.sqrt(T.sum(self.Wv**2, axis=0))
    self.mu = sharedX(self.iscales['mu'] * numpy.ones(self.n_s), name='mu')
    self._mu = self.gamma * self.mu
    # Initialize (slab, hidden) pooling matrix
    self.Wh = sharedX(self.sparse_hmask.mask.T * self.iscales.get('Wh', 1.0), name='Wh')
    # Initialize (slabs, g-unit) weight matrix.
    self.Ug = self.init_weight(self.iscales['Ug'], (self.n_s, self.n_s), 'Ug')
    if self.sparse_gmask:
        self.Wg = sharedX(self.sparse_gmask.mask.T * self.iscales.get('Wg', 1.0), name='Wg')
    else:
        self.Wg = self.init_weight(self.iscales['Wg'], (self.n_s, self.n_g), 'Wg')
    self._Wg = T.dot(self.Ug, self.Wg)
    # allocate shared variables for bias parameters
    self.gbias = sharedX(self.iscales['gbias'] * numpy.ones(self.n_g), name='gbias')
    self.hbias = sharedX(self.iscales['hbias'] * numpy.ones(self.n_h), name='hbias')
    self.cg = sharedX(0.5 * numpy.ones(self.n_g), name='cg')
    self.ch = sharedX(0.5 * numpy.ones(self.n_h), name='ch')
    # precision (alpha) parameters on s
    self.alpha = sharedX(self.iscales['alpha'] * numpy.ones(self.n_s), name='alpha')
    self.alpha_prec = T.nnet.softplus(self.alpha)
    # diagonal of precision matrix of visible units
    self.lambd = sharedX(self.iscales['lambd'] * numpy.ones(self.n_v), name='lambd')
    self.lambd_prec = T.nnet.softplus(self.lambd)
def init_samples(self):
    # allocate shared variable for persistent chain
    self.neg_v = sharedX(self.rng.rand(self.batch_size, self.n_v), name='neg_v')
    self.neg_ev = sharedX(self.rng.rand(self.batch_size, self.n_v), name='neg_ev')
    self.neg_s = sharedX(self.rng.rand(self.batch_size, self.n_s), name='neg_s')
    self.neg_h = sharedX(self.rng.rand(self.batch_size, self.n_h), name='neg_h')
    # moving average values for sparsity
    self.sp_pos_v = sharedX(self.rng.rand(1, self.n_v), name='sp_pos_v')
    self.sp_pos_h = sharedX(self.rng.rand(1, self.n_h), name='sp_pos_h')
def init_params(options):
    params = OrderedDict()
    # LF model params
    params['W_users'] = sharedX(normal((options['n_users'], options['n_factors'])), name='W_users')
    params['W_items'] = sharedX(normal((options['n_items'], options['n_factors'])), name='W_items')
    params['b_users'] = sharedX(np.zeros((options['n_users'],)), name='b_users')
    params['b_items'] = sharedX(np.zeros((options['n_items'],)), name='b_items')
    params['b'] = sharedX(0., name='b')
    # distributed BOW params
    params['W_bow'] = sharedX(normal((options['n_factors'], options['vocab_size'])), name='W_bow')
    params['b_bow'] = sharedX(np.zeros((options['vocab_size'],)), name='b_bow')
    return params
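# A minimal sketch of how the latent-factor parameters above are typically
# combined into a rating prediction. The helper name `predict_rating` is an
# assumption for illustration; the actual build_model used with these params
# is not shown in this section.
import theano.tensor as T

def predict_rating(params, users_id, items_id):
    # dot product of user and item factors, plus user/item/global biases
    u = params['W_users'][users_id]   # (batch, n_factors)
    v = params['W_items'][items_id]   # (batch, n_factors)
    return (T.sum(u * v, axis=1)
            + params['b_users'][users_id].flatten()
            + params['b_items'][items_id].flatten()
            + params['b'])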
def init_parameters(self):
    self.n_s = self.n_h * self.bw_s
    self.scalar_norms = sharedX(1.0 * numpy.ones(self.n_s), name='scalar_norms')
    wv_val = self.rng.randn(self.n_v, self.n_s) * self.iscales['Wv']
    self.Wv = sharedX(wv_val, name='Wv')
    self.Wh = numpy.zeros((self.n_h, self.n_s), dtype=floatX)
    for i in xrange(self.n_h):
        self.Wh[i, i*self.bw_s:(i+1)*self.bw_s] = 1.
    # allocate shared variables for bias parameters
    self.hbias = sharedX(self.iscales['hbias'] * numpy.ones(self.n_h), name='hbias')
    self.vbias = sharedX(self.iscales['vbias'] * numpy.ones(self.n_v), name='vbias')
    # mean (mu) and precision (alpha) parameters on s
    self.mu = sharedX(self.iscales['mu'] * numpy.ones(self.n_s), name='mu')
    self.alpha = sharedX(self.iscales['alpha'] * numpy.ones(self.n_s), name='alpha')
    self.alpha_prec = T.nnet.softplus(self.alpha)
def init_chains(self): """ Allocate shared variable for persistent chain """ # initialize s-chain loc = self.mu.get_value() scale = numpy.sqrt(1./softplus(self.alpha.get_value())) neg_s = self.rng.normal(loc=loc, scale=scale, size=(self.batch_size, self.n_s)) self.neg_s = sharedX(neg_s, name='neg_s') # initialize binary g-h-v chains pval_g = sigm(self.gbias.get_value()) pval_h = sigm(self.hbias.get_value()) pval_l = softmax(self.lbias.get_value()) neg_g = self.rng.binomial(n=1, p=pval_g, size=(self.batch_size, self.n_g)) neg_h = self.rng.binomial(n=1, p=pval_h, size=(self.batch_size, self.n_h)) neg_v = self.rng.binomial(n=1, p=pval_v, size=(self.batch_size, self.n_v)) neg_l = self.rng.multinomial(n=1, pvals=pval_l, size=(self.batch_size)) self.neg_h = sharedX(neg_h, name='neg_h') self.neg_g = sharedX(neg_g, name='neg_g') self.neg_v = sharedX(neg_v, name='neg_v') self.neg_l = sharedX(neg_l, name='neg_l') # other misc. self.pos_counter = sharedX(0., name='pos_counter') self.odd_even = sharedX(0., name='odd_even')
def init_parameters(self):
    # init scalar norm for each entry of Wv
    sn_val = self.iscales['scalar_norms'] * numpy.ones(self.n_s)
    self.scalar_norms = sharedX(sn_val, name='scalar_norms')
    # init weight matrices
    self.Wv = self.init_weight(self.iscales.get('Wv', 1.0), (self.n_v, self.n_s), 'Wv',
                               normalize=self.flags['split_norm'])
    if self.sparse_gmask or self.sparse_hmask:
        assert self.sparse_gmask and self.sparse_hmask
        self.Wg = sharedX(self.sparse_gmask.mask * self.iscales.get('Wg', 1.0), name='Wg')
        self.Wh = sharedX(self.sparse_hmask.mask * self.iscales.get('Wh', 1.0), name='Wh')
    else:
        self.Wg = self.init_weight(1.0, (self.n_g, self.n_s), 'Wg')
        self.Wh = self.init_weight(1.0, (self.n_h, self.n_s), 'Wh')
    # allocate shared variables for bias parameters
    self.gbias = sharedX(self.iscales['gbias'] * numpy.ones(self.n_g), name='gbias')
    self.hbias = sharedX(self.iscales['hbias'] * numpy.ones(self.n_h), name='hbias')
    # diagonal of precision matrix of visible units
    self.beta = sharedX(self.iscales['beta'] * numpy.ones(self.n_v), name='beta')
    self.beta_prec = T.nnet.softplus(self.beta)
def __init__(self, numpy_rng=None, theano_rng=None,
             n_h=99, n_v=100, init_from=None,
             min_beta=0.9, num_beta=20, gamma=10, cratio=1, cdelay=0,
             neg_sample_steps=1,
             lr_spec=None, lr_mults={},
             iscales={}, clip_min={}, clip_max={},
             l1={}, l2={}, sp_weight={}, sp_targ={},
             batch_size=13,
             compile=True, debug=False, seed=1241234,
             flags={}, max_updates=5e5, **kwargs):
    """
    :param n_h: number of h-hidden units
    :param n_v: number of visible units
    :param iscales: optional dictionary containing initialization scale for each parameter
    :param neg_sample_steps: number of sampling updates to perform in negative phase.
    :param l1: hyper-parameter controlling amount of L1 regularization
    :param l2: hyper-parameter controlling amount of L2 regularization
    :param batch_size: size of positive and negative phase minibatch
    :param compile: compile sampling and learning functions
    :param seed: seed used to initialize numpy and theano RNGs.
    """
    Model.__init__(self)
    Block.__init__(self)
    assert lr_spec is not None
    for k in ['h']:
        assert k in sp_weight.keys()
    for k in ['h']:
        assert k in sp_targ.keys()
    self.validate_flags(flags)

    self.jobman_channel = None
    self.jobman_state = {}
    self.register_names_to_del(['jobman_channel'])

    ### make sure all parameters are floatX ###
    for (k, v) in l1.iteritems():
        l1[k] = npy_floatX(v)
    for (k, v) in l2.iteritems():
        l2[k] = npy_floatX(v)
    for (k, v) in sp_weight.iteritems():
        sp_weight[k] = npy_floatX(v)
    for (k, v) in sp_targ.iteritems():
        sp_targ[k] = npy_floatX(v)
    for (k, v) in clip_min.iteritems():
        clip_min[k] = npy_floatX(v)
    for (k, v) in clip_max.iteritems():
        clip_max[k] = npy_floatX(v)

    # dump initialization parameters to object
    for (k, v) in locals().iteritems():
        if k != 'self':
            setattr(self, k, v)

    # allocate random number generators
    self.rng = numpy.random.RandomState(seed) if numpy_rng is None else numpy_rng
    self.theano_rng = RandomStreams(self.rng.randint(2**30)) if theano_rng is None else theano_rng

    ############### ALLOCATE PARAMETERS #################
    # allocate symbolic variable for input
    self.input = T.matrix('input')
    self.init_parameters()
    self.init_chains()

    # learning rate, with deferred 1./t annealing
    self.iter = sharedX(0.0, name='iter')
    if lr_spec['type'] == 'anneal':
        num = lr_spec['init'] * lr_spec['start']
        denum = T.maximum(lr_spec['start'], lr_spec['slope'] * self.iter)
        self.lr = T.maximum(lr_spec['floor'], num/denum)
    elif lr_spec['type'] == '1_t':
        self.lr = npy_floatX(lr_spec['num']) / (self.iter + npy_floatX(lr_spec['denum']))
    elif lr_spec['type'] == 'linear':
        lr_start = npy_floatX(lr_spec['start'])
        lr_end = npy_floatX(lr_spec['end'])
        self.lr = lr_start + self.iter * (lr_end - lr_start) / npy_floatX(self.max_updates)
    elif lr_spec['type'] == 'constant':
        self.lr = sharedX(lr_spec['value'], name='lr')
    else:
        raise ValueError('Incorrect value for lr_spec[type]')

    # configure input-space (new pylearn2 feature?)
    self.input_space = VectorSpace(n_v)
    self.output_space = VectorSpace(n_h)
    self.batches_seen = 0    # incremented on every batch
    self.examples_seen = 0   # incremented on every training example
    self.logz = sharedX(0.0, name='logz')
    self.cpu_time = 0
    self.error_record = []

    if compile:
        self.do_theano()

    if init_from:
        raise NotImplementedError()
def init_weight(self, iscale, shape, name, normalize=False, axis=0):
    value = self.rng.normal(size=shape) * iscale
    if normalize:
        value /= numpy.sqrt(numpy.sum(value**2, axis=axis))
    return sharedX(value, name=name)
def init_parameters(self):
    assert self.sparse_hmask
    # init scalar norm for each entry of Wv
    sn_val = self.iscales['scalar_norms'] * numpy.ones(self.n_s)
    self.scalar_norms = sharedX(sn_val, name='scalar_norms')
    if self.flags['igo_init']:
        print 'Overriding iscales initialization with 1./sqrt(nv x nh)'
        self.iscales['Wv'] = 1./numpy.sqrt(max(self.n_v, self.n_s))
        self.iscales['Wg'] = 1./numpy.sqrt(max(self.n_g, self.n_s))
        self.iscales['Wh'] = 1./numpy.sqrt(max(self.n_h, self.n_s))
    # init weight matrices
    self.Wv = self.init_weight(self.iscales['Wv'], (self.n_v, self.n_s), 'Wv')
    self.Wh = sharedX(self.sparse_hmask.mask.T * self.iscales.get('Wh', 1.0), name='Wh')
    if self.sparse_gmask:
        self.Wg = sharedX(self.sparse_gmask.mask.T * self.iscales.get('Wg', 1.0), name='Wg')
    else:
        self.Wg = self.init_weight(self.iscales['Wg'], (self.n_s, self.n_g), 'Wg')
    # avg norm (for wgh_norm='roland')
    norm_wg = numpy.sqrt(numpy.sum(self.Wg.get_value()**2, axis=1)).mean()
    norm_wh = numpy.sqrt(numpy.sum(self.Wh.get_value()**2, axis=0)).mean()
    norm_wv = numpy.sqrt(numpy.sum(self.Wv.get_value()**2, axis=0)).mean()
    self.avg_norm_wg = sharedX(norm_wg, name='avg_norm_wg')
    self.avg_norm_wh = sharedX(norm_wh, name='avg_norm_wh')
    self.avg_norm_wv = sharedX(norm_wv, name='avg_norm_wv')
    # allocate shared variables for bias parameters
    self.gbias = sharedX(self.iscales['gbias'] * numpy.ones(self.n_g), name='gbias')
    self.hbias = sharedX(self.iscales['hbias'] * numpy.ones(self.n_h), name='hbias')
    # mean (mu) and precision (alpha) parameters on s
    self.mu = sharedX(self.iscales['mu'] * numpy.ones(self.n_s), name='mu')
    self.alpha = sharedX(self.iscales['alpha'] * numpy.ones(self.n_s), name='alpha')
    self.alpha_prec = T.nnet.softplus(self.alpha)
    # diagonal of precision matrix of visible units
    self.lambd = sharedX(self.iscales['lambd'] * numpy.ones(self.n_v), name='lambd')
    self.lambd_prec = T.nnet.softplus(self.lambd)
def uniform(shape, scale=0.1, name=None):
    return sharedX(np.random.uniform(low=-scale, high=scale, size=shape), name=name)
def __init__(self, input=None, Wv=None, vbias=None, hbias=None,
             numpy_rng=None, theano_rng=None,
             n_h=100, bw_s=1, n_v=100, init_from=None,
             neg_sample_steps=1,
             lr=None, lr_timestamp=None, lr_mults={},
             iscales={}, clip_min={}, clip_max={}, vbound=5.,
             l1={}, l2={}, orth_lambda=0.,
             var_param_alpha='exp', var_param_beta='linear',
             sp_type='kl', sp_weight={}, sp_targ={},
             batch_size=13, scalar_b=False,
             compile=True, debug=False, seed=1241234,
             my_save_path=None, save_at=None, save_every=None,
             flags={}, max_updates=5e5):
    """
    :param n_h: number of h-hidden units
    :param n_v: number of visible units
    :param iscales: optional dictionary containing initialization scale for each parameter
    :param neg_sample_steps: number of sampling updates to perform in negative phase.
    :param l1: hyper-parameter controlling amount of L1 regularization
    :param l2: hyper-parameter controlling amount of L2 regularization
    :param batch_size: size of positive and negative phase minibatch
    :param compile: compile sampling and learning functions
    :param seed: seed used to initialize numpy and theano RNGs.
    """
    Model.__init__(self)
    Block.__init__(self)
    assert lr is not None
    for k in ['Wv', 'vbias', 'hbias']:
        assert k in iscales.keys()
    iscales.setdefault('mu', 1.)
    iscales.setdefault('alpha', 0.)
    iscales.setdefault('beta', 0.)
    for k in ['h']:
        assert k in sp_weight.keys()
    for k in ['h']:
        assert k in sp_targ.keys()

    self.jobman_channel = None
    self.jobman_state = {}
    self.register_names_to_del(['jobman_channel'])

    ### make sure all parameters are floatX ###
    for (k, v) in l1.iteritems():
        l1[k] = npy_floatX(v)
    for (k, v) in l2.iteritems():
        l2[k] = npy_floatX(v)
    for (k, v) in sp_weight.iteritems():
        sp_weight[k] = npy_floatX(v)
    for (k, v) in sp_targ.iteritems():
        sp_targ[k] = npy_floatX(v)
    for (k, v) in clip_min.iteritems():
        clip_min[k] = npy_floatX(v)
    for (k, v) in clip_max.iteritems():
        clip_max[k] = npy_floatX(v)

    # dump initialization parameters to object
    for (k, v) in locals().iteritems():
        if k != 'self':
            setattr(self, k, v)

    # allocate random number generators
    self.rng = numpy.random.RandomState(seed) if numpy_rng is None else numpy_rng
    self.theano_rng = RandomStreams(self.rng.randint(2**30)) if theano_rng is None else theano_rng

    ############### ALLOCATE PARAMETERS #################
    self.n_s = self.n_h * self.bw_s
    self.wv_norms = sharedX(1.0 * numpy.ones(self.n_s), name='wv_norms')
    if Wv is None:
        wv_val = self.rng.randn(n_v, self.n_s) * iscales['Wv']
        self.Wv = sharedX(wv_val, name='Wv')
    else:
        self.Wv = Wv
    self.Wh = numpy.zeros((self.n_s, self.n_h), dtype=floatX)
    for i in xrange(self.n_h):
        self.Wh[i * bw_s:(i + 1) * bw_s, i] = 1.

    # allocate shared variables for bias parameters
    if hbias is None:
        self.hbias = sharedX(iscales['hbias'] * numpy.ones(n_h), name='hbias')
    else:
        self.hbias = hbias

    # mean (mu) and precision (alpha) parameters on s
    self.mu = sharedX(iscales['mu'] * numpy.ones(self.n_s), name='mu')
    self.alpha = sharedX(iscales['alpha'] * numpy.ones(self.n_s), name='alpha')
    var_param_func = {'exp': T.exp, 'softplus': T.nnet.softplus, 'linear': lambda x: x}
    self.alpha_prec = var_param_func[self.var_param_alpha](self.alpha)

    # diagonal of precision matrix of visible units
    self.vbound = sharedX(vbound, name='vbound')
    self.beta = sharedX(iscales['beta'] * numpy.ones(n_v), name='beta')
    self.beta_prec = var_param_func[self.var_param_beta](self.beta)

    # allocate shared variable for persistent chain
    self.neg_v = sharedX(self.rng.rand(batch_size, n_v), name='neg_v')
    self.neg_ev = sharedX(self.rng.rand(batch_size, n_v), name='neg_ev')
    self.neg_s = sharedX(self.rng.rand(batch_size, self.n_s), name='neg_s')
    self.neg_h = sharedX(self.rng.rand(batch_size, n_h), name='neg_h')

    # moving average values for sparsity
    self.sp_pos_v = sharedX(self.rng.rand(1, self.n_v), name='sp_pos_v')
    self.sp_pos_h = sharedX(self.rng.rand(1, self.n_h), name='sp_pos_h')

    # learning rate, with deferred 1./t annealing
    self.iter = sharedX(0.0, name='iter')
    if lr['type'] == 'anneal':
        num = lr['init'] * lr['start']
        denum = T.maximum(lr['start'], lr['slope'] * self.iter)
        self.lr = T.maximum(lr['floor'], num / denum)
    elif lr['type'] == 'linear':
        lr_start = npy_floatX(lr['start'])
        lr_end = npy_floatX(lr['end'])
        self.lr = lr_start + self.iter * (lr_end - lr_start) / npy_floatX(self.max_updates)
    else:
        raise ValueError('Incorrect value for lr[type]')

    # learning rate multipliers - implemented as shared parameters for GPU
    self.lr_mults_it = {}
    self.lr_mults_shrd = {}
    for (k, v) in lr_mults.iteritems():
        # make sure all learning rate multipliers are float64
        self.lr_mults_it[k] = tools.HyperParamIterator(lr_timestamp, lr_mults[k])
        self.lr_mults_shrd[k] = sharedX(self.lr_mults_it[k].value, name='lr_mults_shrd' + k)

    # allocate symbolic variable for input
    self.input = T.matrix('input') if input is None else input

    # configure input-space (new pylearn2 feature?)
    self.input_space = VectorSpace(n_v)
    self.output_space = VectorSpace(n_h)
    self.batches_seen = 0                # incremented on every batch
    self.examples_seen = 0               # incremented on every training example
    self.force_batch_size = batch_size   # force minibatch size
    self.error_record = []

    if compile:
        self.do_theano()

    #### load layer 1 parameters from file ####
    if init_from:
        self.load_params(init_from)
def __init__(self, numpy_rng=None, theano_rng=None,
             n_h=100, bw_s=1, n_v=100, init_from=None,
             neg_sample_steps=1,
             lr_spec=None, lr_timestamp=None, lr_mults={},
             iscales={}, clip_min={}, clip_max={}, truncation_bound={},
             l1={}, l2={}, orth_lambda=0.,
             var_param_alpha='exp', var_param_lambd='linear',
             sp_type='kl', sp_weight={}, sp_targ={},
             batch_size=13,
             compile=True, debug=False, seed=1241234,
             my_save_path=None, save_at=None, save_every=None,
             flags={}, max_updates=5e5):
    """
    :param n_h: number of h-hidden units
    :param n_v: number of visible units
    :param iscales: optional dictionary containing initialization scale for each parameter
    :param neg_sample_steps: number of sampling updates to perform in negative phase.
    :param l1: hyper-parameter controlling amount of L1 regularization
    :param l2: hyper-parameter controlling amount of L2 regularization
    :param batch_size: size of positive and negative phase minibatch
    :param compile: compile sampling and learning functions
    :param seed: seed used to initialize numpy and theano RNGs.
    """
    Model.__init__(self)
    Block.__init__(self)
    assert lr_spec is not None
    for k in ['Wv', 'hbias']:
        assert k in iscales.keys()
    iscales.setdefault('mu', 1.)
    iscales.setdefault('alpha', 0.)
    iscales.setdefault('lambd', 0.)
    for k in ['h']:
        assert k in sp_weight.keys()
    for k in ['h']:
        assert k in sp_targ.keys()
    self.validate_flags(flags)

    self.jobman_channel = None
    self.jobman_state = {}
    self.register_names_to_del(['jobman_channel'])

    ### make sure all parameters are floatX ###
    for (k, v) in l1.iteritems():
        l1[k] = npy_floatX(v)
    for (k, v) in l2.iteritems():
        l2[k] = npy_floatX(v)
    for (k, v) in sp_weight.iteritems():
        sp_weight[k] = npy_floatX(v)
    for (k, v) in sp_targ.iteritems():
        sp_targ[k] = npy_floatX(v)
    for (k, v) in clip_min.iteritems():
        clip_min[k] = npy_floatX(v)
    for (k, v) in clip_max.iteritems():
        clip_max[k] = npy_floatX(v)

    # dump initialization parameters to object
    for (k, v) in locals().iteritems():
        if k != 'self':
            setattr(self, k, v)

    # allocate random number generators
    self.rng = numpy.random.RandomState(seed) if numpy_rng is None else numpy_rng
    self.theano_rng = RandomStreams(self.rng.randint(2**30)) if theano_rng is None else theano_rng

    # allocate symbolic variable for input
    self.input = T.matrix('input')
    self.init_parameters()
    self.init_chains()

    # learning rate, with deferred 1./t annealing
    self.iter = sharedX(0.0, name='iter')
    if lr_spec['type'] == 'anneal':
        num = lr_spec['init'] * lr_spec['start']
        denum = T.maximum(lr_spec['start'], lr_spec['slope'] * self.iter)
        self.lr = T.maximum(lr_spec['floor'], num/denum)
    elif lr_spec['type'] == 'linear':
        lr_start = npy_floatX(lr_spec['start'])
        lr_end = npy_floatX(lr_spec['end'])
        self.lr = lr_start + self.iter * (lr_end - lr_start) / npy_floatX(self.max_updates)
    else:
        raise ValueError('Incorrect value for lr_spec[type]')

    # configure input-space (new pylearn2 feature?)
    self.input_space = VectorSpace(n_v)
    self.output_space = VectorSpace(n_h)
    self.batches_seen = 0                # incremented on every batch
    self.examples_seen = 0               # incremented on every training example
    self.force_batch_size = batch_size   # force minibatch size
    self.error_record = []

    if compile:
        self.do_theano()

    #### load layer 1 parameters from file ####
    if init_from:
        self.load_params(init_from)
def __init__(self, name, kernel_size, num_input, num_output, init_std):
    super(Convolution, self).__init__(name, trainable=True)
    W_shape = (num_output, num_input, kernel_size, kernel_size)
    self.W = sharedX(np.random.randn(*W_shape) * init_std, name=name + '/W')
    self.b = sharedX(np.zeros(num_output), name=name + '/b')
def __init__(self, name, inputs_dim, num_output, init_std):
    super(Linear, self).__init__(name, trainable=True)
    self.W = sharedX(np.random.randn(inputs_dim, num_output) * init_std, name=name + '/W')
    self.b = sharedX(np.zeros(num_output), name=name + '/b')
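# The Linear and Convolution constructors above only allocate W and b; their
# forward passes are not shown. A minimal sketch of how those parameters would
# be applied -- the helper name `linear_forward` is an assumption, not the
# original layer API:
import theano.tensor as T

def linear_forward(x, W, b):
    # affine map for the Linear layer: (batch, inputs_dim) -> (batch, num_output)
    return T.dot(x, W) + b

# For the Convolution layer, the analogous forward pass would be
# T.nnet.conv2d(x, W) + b.dimshuffle('x', 0, 'x', 'x'), given the
# (num_output, num_input, kernel, kernel) weight layout allocated above.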