def scale(self, grads):
    """
    :param grads: dictionary of (param, gradient) pairs
    :rval scaled_grad: gradients scaled by the inverse standard deviation of the gradient
    :rval updates: updates dictionary to locally defined shared variables.
    """
    updates = OrderedDict()

    for (param, grad) in grads.iteritems():
        assert isinstance(param, T.sharedvar.TensorSharedVariable)
        pval = param.get_value()

        # moving averages of (mean gradient)**2 and of the mean squared gradient
        avg2_x = sharedX(1 * numpy.ones_like(pval), name='avg2_x_%s' % param.name)
        avg_x2 = sharedX(10 * numpy.ones_like(pval), name='avg_x2_%s' % param.name)
        new_avg2_x = self.mov_avg * avg2_x + (1. - self.mov_avg) * T.mean(grad, axis=0)**2
        new_avg_x2 = self.mov_avg * avg_x2 + (1. - self.mov_avg) * T.mean(grad**2, axis=0)
        grad_var = new_avg_x2 - new_avg2_x

        # scale by inverse of standard deviation, up to max_factor
        #grads[param] = Print('scaler')(T.sqrt(1./(grad_var + self.eps))) * grad
        grads[param] = T.sqrt(1. / (grad_var + self.eps)) * grad

        # store new shared variables
        self.avg2_x[param] = avg2_x
        self.avg_x2[param] = avg_x2

        # register updates
        updates[avg2_x] = new_avg2_x
        updates[avg_x2] = new_avg_x2

    return grads, updates
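# Illustrative usage of the scaler above (a sketch: the names `scaler`, `grads`
# and `updates` are assumptions, not part of the original code). Whatever
# consumes the scaled gradients, the second return value must be merged into the
# updates passed to theano.function, otherwise the moving averages
# avg2_x / avg_x2 are never refreshed between minibatches.
def example_apply_scaler(scaler, grads, updates):
    scaled_grads, scaler_updates = scaler.scale(grads)
    updates.update(scaler_updates)   # register the moving-average updates
    return scaled_grads, updates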
def init_dparameters(self):
    # Create shared variables for the parameter increments (same shapes as W and bias).
    self.dW = []
    self.dbias = []
    for i, nui in enumerate(self.n_u):
        self.dbias += [sharedX(numpy.zeros(nui), name='dbias%i' % i)]
        self.dW += [None]
        if i > 0:
            wv_val = numpy.zeros((self.n_u[i - 1], nui))
            self.dW[i] = sharedX(wv_val, name='dW%i' % i)
    self.dparams = [dWi for dWi in self.dW[1:]]
    self.dparams += [dbi for dbi in self.dbias]
def init_samples(self):
    # Positive- and negative-phase samples: one (batch_size, n_u[i]) matrix per
    # layer, initialized with uniform noise.
    self.psamples = []
    self.nsamples = []
    for i, nui in enumerate(self.n_u):
        self.psamples += [sharedX(self.rng.rand(self.batch_size, nui),
                                  name='psamples%i' % i)]
        self.nsamples += [sharedX(self.rng.rand(self.batch_size, nui),
                                  name='nsamples%i' % i)]
def init_parameters(self):
    # Create shared variables for model parameters.
    self.W = []
    self.bias = []
    for i, nui in enumerate(self.n_u):
        self.bias += [sharedX(self.iscales['bias%i' % i] * numpy.ones(nui),
                              name='bias%i' % i)]
        self.W += [None]
        if i > 0:
            wv_val = self.rng.randn(self.n_u[i - 1], nui) * self.iscales.get('W%i' % i, 1.0)
            self.W[i] = sharedX(wv_val, name='W%i' % i)
    # Establish list of learnt model parameters.
    self.params = [Wi for Wi in self.W[1:]]
    self.params += [bi for bi in self.bias]
def get_updates(grads, lr, fast_lr=None, multipliers=None, momentum_lambda=None):
    """
    Returns an updates dictionary corresponding to a single step of SGD. The
    learning rate for each parameter is computed as lr * multipliers[param].

    :param lr: base learning rate (common to all parameters)
    :param multipliers: dictionary of learning rate multipliers, each being a shared var
                        e.g. {'hbias': sharedX(0.1), 'Wf': sharedX(0.01)}
    """
    updates = OrderedDict()
    momentum = OrderedDict()
    multipliers = OrderedDict() if multipliers is None else multipliers

    for (param, gparam) in grads.iteritems():
        # each parameter can have its own multiplier on the learning rate
        multiplier = multipliers.get(param.name, 1.0)
        if param.name.find('fast') == 0 and fast_lr:
            print 'Using fast-learning rate of %f for %s' % (fast_lr, param.name)
            lr_param = fast_lr
        else:
            lr_param = lr * multiplier

        # create storage for momentum term
        momentum[param] = sharedX(numpy.zeros_like(param.get_value()),
                                  name=param.name + '_old')

        if momentum_lambda and param.name != 'fWv':
            # perform SGD, with momentum (optional)
            new_grad = (1. - momentum_lambda) * gparam + momentum_lambda * momentum[param]
            updates[param] = param - lr_param * new_grad
            updates[momentum[param]] = new_grad
        else:
            updates[param] = param - lr_param * gparam

    return updates
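# Illustrative wiring of get_updates into a compiled training step (a sketch, not
# part of the original code: `cost`, `params`, `train_x`, `input_var` and the
# minibatch indexing are assumptions). `grads` is the {param: gradient} dictionary
# that get_updates expects.
def example_compile_sgd_step(cost, params, lr, batch_size, train_x, input_var):
    grads = OrderedDict(zip(params, T.grad(cost, params)))
    updates = get_updates(grads, lr, momentum_lambda=0.9)
    index = T.lscalar('index')
    return theano.function(
        [index], cost, updates=updates,
        givens={input_var: train_x[index * batch_size:(index + 1) * batch_size]})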
def compute_gradients(self, momentum_lambda=None):
    updates = OrderedDict()
    momentum = OrderedDict()

    grads = T.grad(self.cost, self.params.keys(),
                   consider_constant=self.constants.keys(),
                   disconnected_inputs='ignore')

    for param, gparam in zip(self.params.keys(), grads):
        if momentum_lambda:
            # exponential moving average of the gradient
            momentum[param] = sharedX(numpy.zeros_like(param.get_value()),
                                      name=param.name + '_mom')
            new_grad = momentum_lambda * momentum[param] + (1. - momentum_lambda) * gparam
            updates[momentum[param]] = new_grad
        else:
            new_grad = gparam
        self.grads[param] = new_grad

    self.computed_cost = True
    return updates
def init_centering(self):
    # Per-layer offsets used to center the units, initialized to zero.
    self.offset = []
    for i, nui in enumerate(self.n_u):
        self.offset += [sharedX(numpy.zeros(nui), name='offset%i' % i)]
(opts, args) = parser.parse_args()

# load and recompile model
model = serial.load(opts.path)
model.set_batch_size(opts.batch_size, redo_monitor=False)

###
# Function which computes the energy of the current negative-phase configuration.
##
energy = model.energy(model.nsamples)
compute_energy = theano.function([], energy)

###
# Rebuild sampling function to have mean-field values for layer 0
##
e_nsamples0 = sharedX(model.nsamples[0].get_value(), name='e_nsamples0')
new_nsamples, new_e_nsample0 = neg_sampling(model)
neg_updates = {e_nsamples0: new_e_nsample0}
for (nsample, new_nsample) in zip(model.nsamples, new_nsamples):
    neg_updates[nsample] = new_nsample
sample_neg_func = theano.function([], [], updates=neg_updates)

if opts.random:
    for nsample in model.nsamples:
        temp = numpy.random.randint(0, 2, size=nsample.get_value().shape)
        nsample.set_value(temp.astype(floatX))

# Burn-in of the Markov chain.
for i in xrange(opts.burnin):
    model.sample_neg_func()
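# Possible continuation of the script above (a sketch, not part of the original
# snippet): after burn-in, alternate the rebuilt sampler with compute_energy to
# monitor the energy of the negative particles; each call to sample_neg_func also
# refreshes e_nsamples0 with the mean-field values of layer 0.
energies = []
for i in xrange(100):
    sample_neg_func()
    energies.append(compute_energy())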
def __init__(self, input=None, n_u=[100, 100], enable={}, load_from=None,
             iscales=None, clip_min={}, clip_max={},
             pos_mf_steps=1, pos_sample_steps=0, neg_sample_steps=1,
             lr_spec={}, lr_mults={}, l1={}, l2={}, l1_inf={}, flags={},
             momentum_lambda=0, cg_params={}, batch_size=13,
             computational_bs=0, compile=True, seed=1241234,
             sp_targ_h=None, sp_weight_h=None, sp_pos_k=5,
             my_save_path=None, save_at=None, save_every=None,
             max_updates=1e6):
    """
    :param n_u: list, containing number of units per layer. n_u[0] contains the number
           of visible units, while n_u[i] (with i > 0) contains the number of hidden
           units at layer i.
    :param enable: dictionary of flags with on/off behavior
    :param iscales: optional dictionary containing initialization scale for each
           parameter. Key of dictionary should match the name of the associated
           shared variable.
    :param pos_mf_steps: number of mean-field iterations to perform in positive phase
    :param neg_sample_steps: number of sampling updates to perform in negative phase.
    :param lr_spec: dictionary specifying the base learning rate schedule (common to
           all parameters); 'type' is either 'anneal' or 'linear', see below.
    :param lr_timestamp: list containing update indices at which to change the lr multiplier
    :param lr_mults: dictionary, optionally containing a list of learning rate multipliers
           for parameters of the model. Length of this list should match the length of
           lr_timestamp (the lr mult will transition whenever we reach the associated
           timestamp). Keys should match the name of the shared variable whose learning
           rate is to be adjusted.
    :param l1: dictionary, whose keys are model parameter names, and values are
           hyper-parameters controlling the degree of L1-regularization.
    :param l2: same as l1, but for L2 regularization.
    :param l1_inf: same as l1, but the L1 penalty is centered at -\infty instead of 0.
    :param cg_params: dictionary with keys ['rtol', 'damp', 'maxiter']
    :param batch_size: size of positive and negative phase minibatch
    :param computational_bs: batch size used internally by natural gradient to reduce
           memory consumption
    :param seed: seed used to initialize numpy and theano RNGs.
    :param my_save_path: if None, do not save model. Otherwise, contains stem of filename
           to which we will save the model (everything but the extension).
    :param save_at: list containing iteration counts at which to save model
    :param save_every: scalar value. Save model every `save_every` iterations.
    """
    Model.__init__(self)
    Block.__init__(self)

    ### VALIDATE PARAMETERS AND SET DEFAULT VALUES ###
    assert lr_spec is not None
    for (k, v) in clip_min.iteritems(): clip_min[k] = npy_floatX(v)
    for (k, v) in clip_max.iteritems(): clip_max[k] = npy_floatX(v)
    # iscales is documented as optional: fall back to an empty dict so the
    # defaults below apply.
    iscales = {} if iscales is None else iscales
    [iscales.setdefault('bias%i' % i, 0.) for i in xrange(len(n_u))]
    [iscales.setdefault('W%i' % i, 0.1) for i in xrange(len(n_u))]
    flags.setdefault('enable_centering', False)
    flags.setdefault('enable_natural', False)
    flags.setdefault('enable_warm_start', False)
    flags.setdefault('mlbiases', False)
    flags.setdefault('precondition', None)
    flags.setdefault('minres', False)
    flags.setdefault('minresQLP', False)
    if flags['precondition'] == 'None':
        flags['precondition'] = None

    self.jobman_channel = None
    self.jobman_state = {}
    self.register_names_to_del(['jobman_channel'])

    ### DUMP INITIALIZATION PARAMETERS TO OBJECT ###
    for (k, v) in locals().iteritems():
        if k != 'self':
            setattr(self, k, v)

    assert len(n_u) > 1
    self.n_v = n_u[0]
    self.depth = len(n_u)

    # allocate random number generators
    self.rng = numpy.random.RandomState(seed)
    self.theano_rng = RandomStreams(self.rng.randint(2**30))

    # allocate bilinear-weight matrices
    self.input = T.matrix()
    self.init_parameters()
    self.init_dparameters()
    self.init_centering()
    self.init_samples()

    # learning rate, with deferred 1./t annealing
    self.iter = sharedX(0.0, name='iter')
    if lr_spec['type'] == 'anneal':
        num = lr_spec['init'] * lr_spec['start']
        denum = T.maximum(lr_spec['start'], lr_spec['slope'] * self.iter)
        self.lr = T.maximum(lr_spec['floor'], num / denum)
    elif lr_spec['type'] == 'linear':
        lr_start = npy_floatX(lr_spec['start'])
        lr_end = npy_floatX(lr_spec['end'])
        self.lr = lr_start + self.iter * (lr_end - lr_start) / npy_floatX(self.max_updates)
    else:
        raise ValueError('Incorrect value for lr_spec[type]')

    # counter for CPU-time
    self.cpu_time = 0.

    if load_from:
        self.load_parameters(fname=load_from)

    # configure input-space (new pylearn2 feature?)
    self.input_space = VectorSpace(n_u[0])
    self.output_space = VectorSpace(n_u[-1])
    self.batches_seen = 0                 # incremented on every batch
    self.examples_seen = 0                # incremented on every training example
    self.force_batch_size = batch_size    # force minibatch size
    self.error_record = []

    if compile:
        self.do_theano()
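# The two learning-rate schedules set up in __init__ above, written out in plain
# Python for clarity (a sketch, not part of the model: `t` plays the role of
# self.iter, and lr_spec uses the same keys as in the constructor).
def example_lr_schedule(t, lr_spec, max_updates):
    if lr_spec['type'] == 'anneal':
        # constant at lr_spec['init'] while slope * t < start, then decays as 1/t,
        # never dropping below lr_spec['floor']
        num = lr_spec['init'] * lr_spec['start']
        denum = max(lr_spec['start'], lr_spec['slope'] * t)
        return max(lr_spec['floor'], num / denum)
    elif lr_spec['type'] == 'linear':
        # linear interpolation from lr_spec['start'] to lr_spec['end'] over max_updates
        return lr_spec['start'] + t * (lr_spec['end'] - lr_spec['start']) / max_updates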