Example #1
    def scale(self, grads):
        """
        :param grads: dictionary of (param,gradient) pairs
        :rval scaled_grad: gradient scaled by inverse standard deviation
        :rval updates: updates dictionary to locally defined shared variables.
        """
        updates = {}
        for (param,grad) in grads.iteritems():
            assert isinstance(param, T.sharedvar.TensorSharedVariable)
            pval = param.get_value()
            avg2_x = sharedX(1*numpy.ones_like(pval), name='avg2_x_%s' % param.name)
            avg_x2 = sharedX(10*numpy.ones_like(pval), name='avg_x2_%s' % param.name)
            new_avg2_x = self.mov_avg * avg2_x + (1.-self.mov_avg) * T.mean(grad, axis=0)**2
            new_avg_x2 = self.mov_avg * avg_x2 + (1.-self.mov_avg) * T.mean(grad**2, axis=0)

            grad_var = new_avg_x2 - new_avg2_x

            # scale by inverse of standard deviation, up to max_factor
            #grads[param] = Print('scaler')(T.sqrt(1./(grad_var + self.eps))) * grad
            grads[param] = T.sqrt(1./(grad_var + self.eps)) * grad
           
            # store new shared variables
            self.avg2_x[param] = avg2_x
            self.avg_x2[param] = avg_x2

            # register updates
            updates[avg2_x] = new_avg2_x
            updates[avg_x2] = new_avg_x2

        return grads, updates
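Every example on this page relies on a sharedX helper being in scope. A minimal sketch of such a helper in the pylearn2 style (the exact signature used by the original projects may differ):

import numpy
import theano


def sharedX(value, name=None, borrow=False):
    # Wrap a numpy value as a Theano shared variable cast to theano.config.floatX,
    # so the same code runs in float32 on GPU or float64 on CPU.
    return theano.shared(numpy.asarray(value, dtype=theano.config.floatX),
                         name=name, borrow=borrow)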
Example #2
    def scale(self, grads):
        """
        :param grads: dictionary of (param,gradient) pairs
        :rval scaled_grad: gradient scaled by inverse standard deviation
        :rval updates: updates dictionary to locally defined shared variables.
        """
        updates = OrderedDict()
        for (param, grad) in grads.iteritems():
            assert isinstance(param, T.sharedvar.TensorSharedVariable)
            pval = param.get_value()
            avg2_x = sharedX(1 * numpy.ones_like(pval),
                             name='avg2_x_%s' % param.name)
            avg_x2 = sharedX(10 * numpy.ones_like(pval),
                             name='avg_x2_%s' % param.name)
            new_avg2_x = self.mov_avg * avg2_x + (1. - self.mov_avg) * T.mean(
                grad, axis=0)**2
            new_avg_x2 = self.mov_avg * avg_x2 + (1. - self.mov_avg) * T.mean(
                grad**2, axis=0)

            grad_var = new_avg_x2 - new_avg2_x

            # scale by inverse of standard deviation, up to max_factor
            #grads[param] = Print('scaler')(T.sqrt(1./(grad_var + self.eps))) * grad
            grads[param] = T.sqrt(1. / (grad_var + self.eps)) * grad

            # store new shared variables
            self.avg2_x[param] = avg2_x
            self.avg_x2[param] = avg_x2

            # register updates
            updates[avg2_x] = new_avg2_x
            updates[avg_x2] = new_avg_x2

        return grads, updates
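A hedged sketch of how the (grads, updates) pair returned by scale() might be compiled into a training step. Because the method averages each gradient over axis 0, the sketch assumes the gradients carry a leading batch axis (per-example gradients); scaler, per_example_grads, lr, cost and x are placeholder names, not part of the excerpt above.

import theano
import theano.tensor as T
from collections import OrderedDict

# Assumed context: `scaler` is an instance of the class above, `per_example_grads`
# maps each shared parameter to a per-example gradient, `cost` is a scalar
# expression in the input matrix `x`, and `lr` is a Python float.
scaled_grads, scale_updates = scaler.scale(per_example_grads)

updates = OrderedDict(scale_updates)
for param, grad in scaled_grads.items():
    # average the scaled per-example gradients before taking the SGD step
    updates[param] = param - lr * T.mean(grad, axis=0)

train_fn = theano.function([x], cost, updates=updates)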
Example #3
 def init_dparameters(self):
     # Create shared variables for model parameters.
     self.dW = []
     self.dbias = []
     for i, nui in enumerate(self.n_u):
         self.dbias += [sharedX(numpy.zeros(nui), name='dbias%i' % i)]
         self.dW += [None]
         if i > 0:
             wv_val = numpy.zeros((self.n_u[i - 1], nui))
             self.dW[i] = sharedX(wv_val, name='dW%i' % i)
     self.dparams = [dWi for dWi in self.dW[1:]]
     self.dparams += [dbi for dbi in self.dbias]
Example #4
 def init_samples(self):
     self.psamples = []
     self.nsamples = []
     for i, nui in enumerate(self.n_u):
         self.psamples += [
             sharedX(self.rng.rand(self.batch_size, nui),
                     name='psamples%i' % i)
         ]
         self.nsamples += [
             sharedX(self.rng.rand(self.batch_size, nui),
                     name='nsamples%i' % i)
         ]
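The chains allocated here can later be reset in place through set_value; a small hedged sketch, mirroring the opts.random branch of the sampling script in Example #10 (the seed is arbitrary):

import numpy
from theano import config

# `model` stands for an instance of the class these methods belong to.
rng = numpy.random.RandomState(123)
for nsample in model.nsamples:
    fresh = rng.randint(0, 2, size=nsample.get_value().shape)
    nsample.set_value(fresh.astype(config.floatX))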
Example #5
 def init_parameters(self):
     # Create shared variables for model parameters.
     self.W = []
     self.bias = []
     for i, nui in enumerate(self.n_u):
         self.bias += [sharedX(self.iscales['bias%i' %i] * numpy.ones(nui), name='bias%i'%i)]
         self.W += [None]
         if i > 0: 
             wv_val = self.rng.randn(self.n_u[i-1], nui) * self.iscales.get('W%i'%i,1.0)
             self.W[i] = sharedX(wv_val, name='W%i' % i)
     # Establish list of learnt model parameters.
     self.params  = [Wi for Wi in self.W[1:]]
     self.params += [bi for bi in self.bias]
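To make the shapes concrete, here is a standalone sketch of the same loop with n_u = [784, 500, 1000] (values chosen only for illustration): W[i] connects layer i-1 to layer i, and each bias has length n_u[i].

import numpy

n_u = [784, 500, 1000]
iscales = {'bias0': 0., 'bias1': 0., 'bias2': 0., 'W1': 0.01, 'W2': 0.01}
rng = numpy.random.RandomState(0)

W, bias = [], []
for i, nui in enumerate(n_u):
    bias += [iscales['bias%i' % i] * numpy.ones(nui)]
    W += [None]
    if i > 0:
        W[i] = rng.randn(n_u[i - 1], nui) * iscales.get('W%i' % i, 1.0)

assert W[1].shape == (784, 500) and W[2].shape == (500, 1000)
assert [b.shape[0] for b in bias] == n_u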
Example #6
def get_updates(grads,
                lr,
                fast_lr=None,
                multipliers=None,
                momentum_lambda=None):
    """
    Returns an updates dictionary corresponding to a single step of SGD. The learning rate
    for each parameter is computed as lr * multipliers[param]
    :param lr: base learning rate (common to all parameters)
    :param multipliers: dictionary of learning rate multipliers, each being a shared var
                        e.g. {'hbias': sharedX(0.1), 'Wf': sharedX(0.01)}
    """

    updates = OrderedDict()
    momentum = OrderedDict()
    multipliers = OrderedDict() if multipliers is None else multipliers

    for (param, gparam) in grads.iteritems():

        # each parameter can have its own multiplier on the learning rate
        multiplier = multipliers.get(param.name, 1.0)

        if param.name.find('fast') == 0 and fast_lr:
            print 'Using fast-learning rate of %f for %s' % (fast_lr,
                                                             param.name)
            lr_param = fast_lr
        else:
            lr_param = lr * multiplier

        # create storage for momentum term
        momentum[param] = sharedX(numpy.zeros_like(param.get_value()),
                                  name=param.name + '_old')

        if momentum_lambda and param.name != 'fWv':
            # perform SGD, with momentum (optional)
            new_grad = (1. - momentum_lambda
                        ) * gparam + momentum_lambda * momentum[param]
            updates[param] = param - lr_param * new_grad
            updates[momentum[param]] = new_grad
        else:
            updates[param] = param - lr_param * gparam

    return updates
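A hedged usage sketch for get_updates; params, cost and x are assumed to come from whatever model the caller has built, and the multiplier key follows the example given in the docstring.

import theano
import theano.tensor as T
from collections import OrderedDict

# Assumed context: `params` is a list of named shared variables and `cost` is a
# scalar Theano expression in the input matrix `x`.
grads = OrderedDict(zip(params, T.grad(cost, params)))

updates = get_updates(grads,
                      lr=sharedX(0.01, name='lr'),
                      multipliers={'hbias': sharedX(0.1, name='hbias_mult')},
                      momentum_lambda=0.9)

train_fn = theano.function([x], cost, updates=updates)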
Example #7
    def compute_gradients(self, momentum_lambda=None):
        updates = OrderedDict()
        momentum = OrderedDict()

        grads =  T.grad(self.cost, self.params.keys(), 
                        consider_constant=self.constants.keys(),
                        disconnected_inputs='ignore')
        for param, gparam in zip(self.params.keys(), grads):

            if momentum_lambda:
                momentum[param] = sharedX(numpy.zeros_like(param.get_value()), name=param.name + '_mom')
                new_grad = momentum_lambda * momentum[param] + (1.-momentum_lambda) * gparam
                updates[momentum[param]] = new_grad
            else:
                new_grad = gparam

            self.grads[param] = new_grad

        self.computed_cost = True
        return updates
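A hedged sketch of consuming the smoothed gradients left in self.grads; model.lr, model.input and model.cost are assumed attributes of the surrounding class, not shown in the excerpt.

import theano
from collections import OrderedDict

momentum_updates = model.compute_gradients(momentum_lambda=0.9)

sgd_updates = OrderedDict(momentum_updates)      # keep the momentum-buffer updates
for param, grad in model.grads.items():
    sgd_updates[param] = param - model.lr * grad

train_fn = theano.function([model.input], model.cost, updates=sgd_updates)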
Example #8
def get_updates(grads, lr, fast_lr=None, multipliers=None, momentum_lambda=None):
    """
    Returns an updates dictionary corresponding to a single step of SGD. The learning rate
    for each parameter is computed as lr * multipliers[param]
    :param lr: base learning rate (common to all parameters)
    :param multipliers: dictionary of learning rate multipliers, each being a shared var
                        e.g. {'hbias': sharedX(0.1), 'Wf': sharedX(0.01)}
    """

    updates = {}
    momentum = {}
    multipliers = {} if multipliers is None else multipliers

    for (param, gparam) in grads.iteritems():

        # each parameter can have its own multiplier on the learning rate
        multiplier = multipliers.get(param.name, 1.0)

        if param.name.find('fast')==0 and fast_lr:
            print 'Using fast-learning rate of %f for %s' % (fast_lr, param.name)
            lr_param = fast_lr
        else:
            lr_param = lr * multiplier
   
        # create storage for momentum term
        momentum[param] = sharedX(numpy.zeros_like(param.get_value()), name=param.name + '_old')

        if momentum_lambda and param.name!='fWv':
            # perform SGD, with momentum (optional)
            new_grad = (1.-momentum_lambda) * gparam + momentum_lambda * momentum[param]
            updates[param] = param - lr_param * new_grad
            updates[momentum[param]] = new_grad
        else:
            updates[param] = param - lr_param * gparam

    return updates
Example #9
 def init_centering(self):
     self.offset = []
     for i, nui in enumerate(self.n_u):
         self.offset += [sharedX(numpy.zeros(nui), name='offset%i' % i)]
Example #10
(opts, args) = parser.parse_args()

# load and recompile model
model = serial.load(opts.path)
model.set_batch_size(opts.batch_size, redo_monitor=False)

###
# Function which computes probability of configuration.
##
energy = model.energy(model.nsamples)
compute_energy = theano.function([], energy)

###
# Rebuild sampling function to have mean-field values for layer 0
##
e_nsamples0 = sharedX(model.nsamples[0].get_value(), name='e_nsamples0')
new_nsamples, new_e_nsample0 = neg_sampling(model)
neg_updates = {e_nsamples0: new_e_nsample0}
for (nsample, new_nsample) in zip(model.nsamples, new_nsamples):
    neg_updates[nsample] = new_nsample
sample_neg_func = theano.function([], [], updates=neg_updates)

if opts.random:
    for nsample in model.nsamples:
        temp = numpy.random.randint(0,2, size=nsample.get_value().shape)
        nsample.set_value(temp.astype(floatX))

# Burn-in of the Markov chain, using the sampling function compiled above.
for i in xrange(opts.burnin):
    sample_neg_func()
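Once the chain has been burnt in, the compiled pieces above can be used to trace its energy; a short hedged sketch (the number of monitoring steps is arbitrary):

# Hypothetical monitoring loop, reusing compute_energy and sample_neg_func from above.
energies = []
for i in xrange(1000):
    sample_neg_func()
    energies.append(compute_energy().mean())

print 'mean energy: %f (std %f)' % (numpy.mean(energies), numpy.std(energies))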
Example #11
    def __init__(self, input = None, n_u=[100,100], enable={}, load_from=None,
            iscales=None, clip_min={}, clip_max={},
            pos_mf_steps=1, pos_sample_steps=0, neg_sample_steps=1, 
            lr_spec={}, lr_mults = {},
            l1 = {}, l2 = {}, l1_inf={}, flags={}, momentum_lambda=0,
            cg_params = {},
            batch_size = 13,
            computational_bs = 0,
            compile=True,
            seed=1241234,
            sp_targ_h = None, sp_weight_h=None, sp_pos_k = 5,
            my_save_path=None, save_at=None, save_every=None,
            max_updates=1e6):
        """
        :param n_u: list, containing number of units per layer. n_u[0] contains number
         of visible units, while n_u[i] (with i > 0) contains number of hid. units at layer i.
        :param enable: dictionary of flags with on/off behavior
        :param iscales: optional dictionary containing initialization scale for each parameter.
               Key of dictionary should match the name of the associated shared variable.
        :param pos_mf_steps: number of mean-field iterations to perform in positive phase
        :param neg_sample_steps: number of sampling updates to perform in negative phase.
        :param lr: base learning rate
        :param lr_timestamp: list containing update indices at which to change the lr multiplier
        :param lr_mults: dictionary, optionally containing a list of learning rate multipliers
               for parameters of the model. Length of this list should match length of
               lr_timestamp (the lr mult will transition whenever we reach the associated
               timestamp). Keys should match the name of the shared variable, whose learning
               rate is to be adjusted.
        :param l1: dictionary, whose keys are model parameter names, and values are
               hyper-parameters controlling degree of L1-regularization.
        :param l2: same as l1, but for L2 regularization.
        :param l1_inf: same as l1, but the L1 penalty is centered at -\infty instead of 0.
        :param cg_params: dictionary with keys ['rtol','damp','maxiter']
        :param batch_size: size of positive and negative phase minibatch
        :param computational_bs: batch size used internally by natural
               gradient to reduce memory consumption
        :param seed: seed used to initialize numpy and theano RNGs.
        :param my_save_path: if None, do not save model. Otherwise, contains stem of filename
               to which we will save the model (everything but the extension).
        :param save_at: list containing iteration counts at which to save model
        :param save_every: scalar value. Save model every `save_every` iterations.
        """
        Model.__init__(self)
        Block.__init__(self)
        ### VALIDATE PARAMETERS AND SET DEFAULT VALUES ###
        assert lr_spec is not None
        for (k,v) in clip_min.iteritems(): clip_min[k] = npy_floatX(v)
        for (k,v) in clip_max.iteritems(): clip_max[k] = npy_floatX(v)
        [iscales.setdefault('bias%i' % i, 0.) for i in xrange(len(n_u))]
        [iscales.setdefault('W%i' % i, 0.1) for i in xrange(len(n_u))]
        flags.setdefault('enable_centering', False)
        flags.setdefault('enable_natural', False)
        flags.setdefault('enable_warm_start', False)
        flags.setdefault('mlbiases', False)
        flags.setdefault('precondition', None)
        flags.setdefault('minres', False)
        flags.setdefault('minresQLP', False)
        if flags['precondition'] == 'None': flags['precondition'] = None
       
        self.jobman_channel = None
        self.jobman_state = {}
        self.register_names_to_del(['jobman_channel'])

        ### DUMP INITIALIZATION PARAMETERS TO OBJECT ###
        for (k,v) in locals().iteritems():
            if k!='self': setattr(self,k,v)

        assert len(n_u) > 1
        self.n_v = n_u[0]
        self.depth = len(n_u)

        # allocate random number generators
        self.rng = numpy.random.RandomState(seed)
        self.theano_rng = RandomStreams(self.rng.randint(2**30))

        # allocate bilinear-weight matrices
        self.input = T.matrix()
        self.init_parameters()
        self.init_dparameters()
        self.init_centering()
        self.init_samples()

        # learning rate, with deferred 1./t annealing
        self.iter = sharedX(0.0, name='iter')

        if lr_spec['type'] == 'anneal':
            num = lr_spec['init'] * lr_spec['start'] 
            denum = T.maximum(lr_spec['start'], lr_spec['slope'] * self.iter)
            self.lr = T.maximum(lr_spec['floor'], num/denum) 
        elif lr_spec['type'] == 'linear':
            lr_start = npy_floatX(lr_spec['start'])
            lr_end   = npy_floatX(lr_spec['end'])
            self.lr = lr_start + self.iter * (lr_end - lr_start) / npy_floatX(self.max_updates)
        else:
            raise ValueError('Incorrect value for lr_spec[type]')

        # counter for CPU-time
        self.cpu_time = 0.

        if load_from:
            self.load_parameters(fname=load_from)

        # configure input-space (?new pylearn2 feature?)
        self.input_space = VectorSpace(n_u[0])
        self.output_space = VectorSpace(n_u[-1])
        self.batches_seen = 0                    # incremented on every batch
        self.examples_seen = 0                   # incremented on every training example
        self.force_batch_size = batch_size  # force minibatch size
        self.error_record = []
 
        if compile: self.do_theano()
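A hedged construction sketch; the class name DBM and every hyperparameter value below are assumptions chosen only to show the expected argument types.

# Hypothetical instantiation -- the class name and all values are illustrative.
model = DBM(n_u=[784, 500, 1000],
            lr_spec={'type': 'linear', 'start': 1e-3, 'end': 1e-4},
            iscales={'W1': 0.01, 'W2': 0.01},
            flags={'enable_centering': True},
            batch_size=128,
            pos_mf_steps=5,
            neg_sample_steps=5,
            max_updates=int(1e6),
            seed=1234,
            compile=True)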