Example 1
    def get_updates(self, learning_rate, grads, lr_scalers=None):

        updates = OrderedDict()
        for param in grads.keys():

            avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
            momentum = sharedX(np.zeros_like(param.get_value()))

            if param.name is not None:
                avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

            new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr \
                + (1 - self.averaging_coeff) \
                * T.sqr(grads[param])

            rms_grad_t = T.sqrt(new_avg_grad_sqr)
            rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
            normalized_grad = grads[param] / (rms_grad_t)
            new_momentum = self.momentum * momentum \
                - learning_rate * normalized_grad

            updates[avg_grad_sqr] = new_avg_grad_sqr
            updates[momentum] = new_momentum
            updates[param] = param + new_momentum

        return updates
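
Read directly from the loop above (writing rho for averaging_coeff, mu for momentum, eps for stabilizer and eta for learning_rate), this rule combines an RMSProp-style rescaling of the gradient with a momentum term:

    r_t     = rho * r_{t-1} + (1 - rho) * g_t**2
    v_t     = mu * v_{t-1} - eta * g_t / max(sqrt(r_t), eps)
    theta_t = theta_{t-1} + v_t
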
Example 2
    def __init__(self, nvis, nclasses):
        """Initialize the parameters of the logistic regression instance.

        Parameters
        ----------
        nvis : int
            number of input units, the dimension of the space in which
            the datapoints lie.

        nclasses : int
            number of output units, the dimension of the space in which
            the labels lie.
        """

        super(LogisticRegressionLayer, self).__init__()

        assert nvis >= 0, "Number of visible units must be non-negative"
        assert nclasses >= 0, "Number of classes must be non-negative"

        self.nvis = nvis
        self.nclasses = nclasses

        # initialize the weights W with zeros, as a matrix of shape (nvis, nclasses)
        self.W = sharedX(numpy.zeros((nvis, nclasses)), name='W', borrow=True)
        # initialize the biases b as a vector of nclasses 0s
        self.b = sharedX(numpy.zeros((nclasses,)), name='b', borrow=True)

        # parameters of the model
        self._params = [self.W, self.b]
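
For scale, a hypothetical instantiation (the 784-input / 10-class sizes are illustrative, not from the source):

    layer = LogisticRegressionLayer(nvis=784, nclasses=10)
    # layer.W has shape (784, 10) and layer.b has shape (10,), both zero-initialized
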
Example 3
def bench(f, m, n):
    #print f

    rng = np.random.RandomState([2012,9,11])

    X = sharedX(rng.randn(m,n))
    Y = sharedX(X.get_value())

    func = theano.function([], updates = { Y : f(X) })

    nodes = func.maker.fgraph.toposort()

    # Make sure the optimizations haven't made us benchmark something different from what we intend
    if f is my_softmax:
        assert True not in [ isinstance(node.op, theano.tensor.nnet.Softmax) for node in nodes ]
    if f is softmax_op:
        assert True in [ isinstance(node.op, theano.tensor.nnet.Softmax) for node in nodes ]
    if f is softmax_with_bias:
        assert True in [ isinstance(node.op, theano.tensor.nnet.SoftmaxWithBias) for node in nodes ]

    # warm up
    for i in xrange(5):
        func()

    # actual time
    times = []
    for i in xrange(5):
        t1 = time.time()
        func()
        t2 = time.time()
        times.append(t2-t1)

    rval = np.asarray(times).mean()
    #print rval
    return rval
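
A plausible driver for this benchmark (the candidate functions and problem sizes are assumptions inferred from the asserts above, not shown in the source):

    for m, n in [(128, 1000), (128, 10000)]:
        for f in [softmax_op, softmax_with_bias, my_softmax]:
            print f, m, n, bench(f, m, n)
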
Example 4
 def __init__(self, dict_size, dim, context_length, k, irange = 0.1, seed = 22):
     super(vLBLSoft, self).__init__()
     rng = np.random.RandomState(seed)
     self.rng = rng
     self.context_length = context_length
     self.dim = dim
     self.dict_size = dict_size
     C_values = np.asarray(rng.normal(0, math.sqrt(irange),
                                      size=(dim,context_length)),
                           dtype=theano.config.floatX)
     self.C = theano.shared(value=C_values, name='C', borrow=True)
     W_context = rng.uniform(-irange, irange, (dict_size, dim))
     W_context = sharedX(W_context,name='W_context')
     W_target = rng.uniform(-irange, irange, (dict_size, dim))
     W_target = sharedX(W_target,name='W_target')
     self.projector_context = MatrixMul(W_context)
     self.projector_target = MatrixMul(W_target)
     self.W_context = W_context
     self.W_target = W_target
     self.W_target = W_context
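      # Note: this second assignment rebinds self.W_target to the context
      # embeddings; self.projector_target above still wraps W_target.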
     b_values = np.asarray(rng.normal(0, math.sqrt(irange), size=(dict_size,)),
                           dtype=theano.config.floatX)
     self.b = theano.shared(value=b_values, name='b', borrow=True)
     self.input_space = IndexSpace(dim = context_length, max_labels = dict_size)
     self.output_space = IndexSpace(dim = 1, max_labels = dict_size)
     self.allY = T.as_tensor_variable(np.arange(dict_size,dtype=np.int64).reshape(dict_size,1))
Example 5
    def createGradientFunctions(self):
        #create symbolic inputs and shared parameters
        X = T.dmatrices("X")
        mu, logSigma, u, v, f, R = T.dcols("mu", "logSigma", "u", "v", "f", "R")
        mu = sharedX( np.random.normal(10, 10, (self.dimTheta, 1)), name='mu') 
        logSigma = sharedX(np.random.uniform(0, 4, (self.dimTheta, 1)), name='logSigma')
        logLambd = sharedX(np.matrix(np.random.uniform(0, 10)),name='logLambd')
        logLambd = T.patternbroadcast(T.dmatrix("logLambd"),[1,1])
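        # Note: mu, logSigma and logLambd are re-bound above, so the symbolic
        # columns / initial shared value they shadow are never used below.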
        negKL = 0.5 * T.sum(1 + 2*logSigma - mu ** 2 - T.exp(logSigma) ** 2)
        theta = mu+T.exp(logSigma)*v
        W=theta
        y=X[:,0]
        X_sim=X[:,1:]
        f = (T.dot(X_sim,W)+u).flatten()
        
        gradvariables = [mu, logSigma, logLambd]
        
        
        logLike = T.sum(-(0.5 * np.log(2 * np.pi) + logLambd) - 0.5 * ((y-f)/(T.exp(logLambd)))**2)

        logp = (negKL + logLike)/self.m

        optimizer = -logp
        
        self.negKL = th.function([mu, logSigma], negKL, on_unused_input='ignore')
        self.f = th.function(gradvariables + [X,u,v], f, on_unused_input='ignore')
        self.logLike = th.function(gradvariables + [X, u, v], logLike,on_unused_input='ignore')
        derivatives = T.grad(logp,gradvariables)
        derivatives.append(logp)

        self.gradientfunction = th.function(gradvariables + [X, u, v], derivatives, on_unused_input='ignore')
        self.lowerboundfunction = th.function(gradvariables + [X, u, v], logp, on_unused_input='ignore')

        self.optimizer = BatchGradientDescent(objective=optimizer, params=gradvariables,inputs = [X,u,v],conjugate=True,max_iter=1)
Example 6
    def redo_everything(self):
        """ compiles learn_func if necessary
            makes new negative chains
            does not reset weights or biases
            TODO: figure out how to make the semantics of this cleaner / more in line with other models
        """

        #compile learn_func if necessary
        if self.autonomous:
            self.redo_theano()

        #make the negative chains
        if not self.use_cd:
            self.V_chains = self.make_chains(self.bias_vis)
            self.V_chains.name = 'dbm_V_chains'

            self.H_chains = [ self.make_chains(bias_hid) for bias_hid in self.bias_hid ]
            for i, H_chain in enumerate(self.H_chains):
                H_chain.name = 'dbm_H[%d]_chain' % i

            if self.num_classes > 0:
                P = np.zeros((self.negative_chains, self.num_classes)) \
                        + T.nnet.softmax( self.bias_class )
                temp_theano_rng = RandomStreams(87)
                sample_from = Sampler(temp_theano_rng, 'multinomial')
                values = function([],sample_from(P))()
                self.Y_chains = sharedX(values, 'Y_chains')
            else:
                self.Y_chains = None

        if hasattr(self, 'init_beta') and self.init_beta is not None:
            self.beta = sharedX( np.zeros( self.bias_vis.get_value().shape) + self.init_beta, name = 'beta')
Example 7
    def __init__(self, dataset, model, algorithm=None, save_path=None,
                 save_freq=0, extensions=None, allow_overwrite=True):
        """
        Construct a Train instance.

        Parameters
        ----------
        dataset : `pylearn2.datasets.dataset.Dataset`
        model : `pylearn2.models.model.Model`
        algorithm : <Optional>
            `pylearn2.training_algorithms.training_algorithm.TrainingAlgorithm`
        save_path : <Optional> str
            Path to save (with pickle / joblib) the model.
        save_freq : <Optional> int
            Frequency of saves, in epochs. A frequency of zero disables
            automatic saving altogether. A frequency of 1 saves every
            epoch. A frequency of 2 saves every other epoch, etc.
            (default=0, i.e. never save). Note: when automatic saving is
            enabled (i.e. save_freq > 0), the model is always saved after
            learning, even when the final epoch is not a multiple of
            `save_freq`.
        extensions : <Optional> iterable
            A collection of `TrainExtension` objects whose callbacks are
            triggered at various points in learning.
        allow_overwrite : <Optional> bool
            If `True`, will save the model to save_path even if there is already
            something there. Otherwise, will raise an error if the `save_path`
            is already occupied.
        """
        self.allow_overwrite = allow_overwrite
        self.first_save = True
        self.dataset = dataset
        self.model = model
        self.algorithm = algorithm
        if save_path is not None:
            if save_freq == 0:
                warnings.warn('save_path specified but save_freq is 0 '
                              '(never save). Is this intentional?')
            self.save_path = save_path
        else:
            if save_freq > 0:
                phase_variable = 'PYLEARN2_TRAIN_PHASE'
                if phase_variable in os.environ:
                    phase = 'phase%d' % os.environ[phase_variable]
                    tokens = [os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'],
                              phase, 'pkl']
                else:
                    tokens = os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'], 'pkl'
                self.save_path = '.'.join(tokens)
        self.save_freq = save_freq

        if hasattr(self.dataset, 'yaml_src'):
            self.model.dataset_yaml_src = self.dataset.yaml_src
        else:
            warnings.warn("dataset has no yaml src, model won't know what " +
                          "data it was trained on")

        self.extensions = extensions if extensions is not None else []
        self.training_seconds = sharedX(value=0, name='training_seconds_this_epoch')
        self.total_seconds = sharedX(value=0, name='total_seconds_last_epoch')
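
A minimal sketch of how a Train object like this is typically driven (the dataset, model and algorithm placeholders, and the main_loop() entry point, are assumptions, not shown in the source):

    train = Train(dataset=train_set, model=model, algorithm=sgd,
                  save_path='model.pkl', save_freq=1)
    train.main_loop()  # saves after every epoch and once more when learning ends
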
Example 8
    def set_input_space(self, space):
        """ Note: this function will reset the parameters! """

        self.input_space = space

        if not isinstance(space, Conv2DSpace):
            raise BadInputSpaceError(self.__class__.__name__ +
                                     ".set_input_space "
                                     "expected a Conv2DSpace, got " +
                                     str(space) + " of type " +
                                     str(type(space)))

        rng = self.get_mlp().rng


        if self.pad != (0,0):
            output_shape = \
                [int(np.ceil((i_sh + 2. * k_pad - k_sh) / float(k_st))) + 1
                 for i_sh, k_sh, k_st, k_pad in izip(self.input_space.shape,
                                                     self.kernel_shape,
                                                     self.kernel_stride,
                                                     self.pad)]

        elif self.border_mode == 'valid':
            output_shape = [(self.input_space.shape[0] - self.kernel_shape[0])
                            / self.kernel_stride[0] + 1,
                            (self.input_space.shape[1] - self.kernel_shape[1])
                            / self.kernel_stride[1] + 1]
        elif self.border_mode == 'full':
            output_shape = [(self.input_space.shape[0] + self.kernel_shape[0])
                            / self.kernel_stride[0] - 1,
                            (self.input_space.shape[1] + self.kernel_shape[1])
                            / self.kernel_stride[1] - 1]

        print "In:", self.layer_name, self.input_space.shape, self.kernel_shape, self.kernel_stride, self.pad
        print "Out:", self.layer_name, output_shape


        self.detector_space = Conv2DSpace(shape=output_shape,
                                          num_channels=self.output_channels,
                                          axes=('b', 'c', 0, 1))

        self.initialize_transformer(rng)

        W, = self.transformer.get_params()
        W.name = self.layer_name + '_W'

        assert self.tied_b
        if self.tied_b:
            self.b = sharedX(np.zeros((self.detector_space.num_channels)) +
                             self.init_bias)
        else:
            self.b = sharedX(self.detector_space.get_origin() + self.init_bias)

        self.b.name = self.layer_name + '_b'

        logger.info('Input shape: {0}'.format(self.input_space.shape))
        logger.info('Detector space: {0}'.format(self.detector_space.shape))

        self.initialize_output_space()
Example 9
    def set_input_space(self, space):
        self.input_space = space

        if not isinstance(space, Space):
            raise TypeError("Expected Space, got "+
                    str(space)+" of type "+str(type(space)))

        self.input_dim = space.get_total_dimension()
        self.needs_reformat = not isinstance(space, VectorSpace)

        desired_dim = self.input_dim
        self.desired_space = VectorSpace(desired_dim)

        if not self.needs_reformat:
            assert self.desired_space == self.input_space

        rng = self.mlp.rng

        self._params = []
        V = np.zeros((self.n_classes, self.input_dim),dtype=np.float32)
        self.V = sharedX(V,   self.layer_name + "_V" )

        U = np.identity( self.input_dim)
        self.U = sharedX(U, self.layer_name + "_U")

        Q =  np.zeros((self.input_dim, self.input_dim),dtype=np.float32)
        self.Q = sharedX(Q, self.layer_name + "_Q")

        Ui =  np.identity(self.input_dim,dtype=np.float32)
        self.Ui = sharedX(Ui, self.layer_name + "_Ui")

        self._params = [ self.U, self.Ui, self.V, self.Q]
Example 10
    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        .. todo::

            WRITEME
        """
        updates = OrderedDict()

        for param in grads.keys():

            inc = sharedX(param.get_value() * 0.)
            avg_grad = sharedX(np.zeros_like(param.get_value()))
            avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))

            if param.name is not None:
                avg_grad.name = 'avg_grad_' + param.name
                avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

            new_avg_grad = self.averaging_coeff * avg_grad \
                + (1 - self.averaging_coeff) * grads[param]
            new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr \
                + (1 - self.averaging_coeff) * grads[param]**2

            normalized_grad = grads[param] / T.sqrt(new_avg_grad_sqr -
                                                    new_avg_grad**2 +
                                                    self.stabilizer)
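            # new_avg_grad_sqr - new_avg_grad**2 is a running estimate of the
            # gradient variance, so the step is scaled by 1/std of the gradient.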
            updated_inc = self.momentum * inc - learning_rate * normalized_grad

            updates[avg_grad] = new_avg_grad
            updates[avg_grad_sqr] = new_avg_grad_sqr
            updates[inc] = updated_inc
            updates[param] = param + updated_inc

        return updates
Example 11
    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        .. todo::

            WRITEME
        """
        updates = OrderedDict()
        for param in grads.keys():

            #avg_grad = sharedX(np.zeros_like(param.get_value()))
            avg_grad_sqr = sharedX(np.zeros_like(param.get_value()))
            momentum = sharedX(np.zeros_like(param.get_value()))

            if param.name is not None:
                #avg_grad.name = 'avg_grad_' + param.name
                avg_grad_sqr.name = 'avg_grad_sqr_' + param.name

            #new_avg_grad = self.averaging_coeff * avg_grad \
            #            + (1- self.averaging_coeff) * grads[param]
            new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr \
                + (1 - self.averaging_coeff) * grads[param]**2

            #normalized_grad = grads[param] / T.sqrt(new_avg_grad_sqr \
            #                - new_avg_grad**2 + self.stabilizer)
            normalized_grad = grads[param] / T.sqrt(new_avg_grad_sqr
                                                    + self.stabilizer)
            new_momentum = self.momentum * momentum \
                - learning_rate * normalized_grad

            #updates[avg_grad] = new_avg_grad
            updates[avg_grad_sqr] = new_avg_grad_sqr
            updates[momentum] = new_momentum
            updates[param] = param + new_momentum

        return updates
Example 12
    def set_input_space(self, space):
        
        self.input_space = space

        if isinstance(space, VectorSpace):
            self.requires_reformat = False
            self.input_dim = space.dim
        else:
            self.requires_reformat = True
            self.input_dim = space.get_total_dimension()
            self.desired_space = VectorSpace(self.input_dim)

        if self.fprop_code:
            self.output_space = VectorSpace(self.dim)
        else:
            self.output_space = VectorSpace(self.input_dim)

        rng = self.mlp.rng
        W = rng.randn(self.input_dim, self.dim)
        self.W = sharedX(W.T, self.layer_name + '_W')
        self.transformer = MatrixMul(self.W)
        self.W, = self.transformer.get_params()
        b = np.zeros((self.input_dim,))
        # b is sized to input_dim so reconstructions in the input space are valid
        self.b = sharedX(b, self.layer_name + '_b')
        X = .001 * rng.randn(self.batch_size, self.dim)
        self.X = sharedX(X, self.layer_name + '_X')
        self._params = [self.W, self.b, self.X]
        self.state_below = T.zeros((self.batch_size, self.input_dim))
Example 13
    def __init__(self, dim, dim_hid, dim_cond, clamp_sigmoid=False, unroll_scan=1):
        """
        Parameters
        ----------
        dim : int
            Number of observed binary variables
        dim_hid : int
            Number of latent binary variables
        dim_cond : int
            Number of conditioning variables
        clamp_sigmoid : bool, optional
            WRITEME. Defaults to `False`.
        unroll_scan : int, optional
            WRITEME. Defaults to 1.
        """
        super(CNADE, self).__init__(dim=dim, dim_hid=dim_hid,
                                    clamp_sigmoid=clamp_sigmoid,
                                    unroll_scan=unroll_scan)

        self.dim_cond = dim_cond

        # Conditioning weights matrix for visible biases
        U_b_value = self._initialize_weights(self.dim_cond, self.dim)
        self.U_b = sharedX(U_b_value, 'U_b')
        # Conditioning weights matrix for hidden biases
        U_c_value = self._initialize_weights(self.dim_cond, self.dim_hid)
        self.U_c = sharedX(U_c_value, 'U_c')
Example 14
    def __init__(self, dim, dim_hid, clamp_sigmoid=False, unroll_scan=1):
        """
        Parameters
        ----------
        dim : int
            Number of observed binary variables
        dim_hid : int
            Number of latent binary variables
        clamp_sigmoid : bool, optional
            WRITEME. Defaults to `False`.
        unroll_scan : int, optional
            WRITEME. Defaults to 1.
        """
        super(NADEBase, self).__init__()

        self.dim = dim
        self.dim_hid = dim_hid
        self.clamp_sigmoid = clamp_sigmoid
        self.unroll_scan = unroll_scan

        self.input_space = VectorSpace(dim=self.dim)

        # Visible biases
        b_value = numpy.zeros(self.dim)
        self.b = sharedX(b_value, 'b')
        # Hidden biases
        c_value = numpy.zeros(self.dim_hid)
        self.c = sharedX(c_value, 'c')
        # Encoder weights
        W_value = self._initialize_weights(self.dim, self.dim_hid)
        self.W = sharedX(W_value, 'W')
        # Decoder weights
        V_value = self._initialize_weights(self.dim_hid, self.dim)
        self.V = sharedX(V_value, 'V')
Example 15
    def __init__(self, n_vis_units, n_hidden_units):
        Model.__init__(self)

        self._W = sharedX(np.random.uniform(size=(n_vis_units, n_hidden_units)), 'W')
        self._b = sharedX(np.zeros(n_hidden_units), 'b')
        self._b_reconstruction = sharedX(np.zeros(n_vis_units), 'b_reconstruction')
        self.input_space = VectorSpace(dim=n_vis_units)
Example 16
    def get_fixed_var_descr(self, model, X, Y):
        """
        .. todo::

            WRITEME
        """

        assert Y is not None

        batch_size = model.batch_size

        drop_mask_X = sharedX(model.get_input_space().get_origin_batch(batch_size))
        drop_mask_X.name = 'drop_mask'

        X_space = model.get_input_space()

        updates = OrderedDict()
        rval = FixedVarDescr()
        inputs=[X, Y]

        if not self.supervised:
            update_X = self.mask_gen(X, X_space = X_space)
        else:
            drop_mask_Y = sharedX(np.ones(batch_size,))
            drop_mask_Y.name = 'drop_mask_Y'
            update_X, update_Y = self.mask_gen(X, Y, X_space)
            updates[drop_mask_Y] = update_Y
            rval.fixed_vars['drop_mask_Y'] =  drop_mask_Y
        if self.mask_gen.sync_channels:
            n = update_X.ndim
            assert n == drop_mask_X.ndim - 1
            update_X.name = 'raw_update_X'
            zeros_like_X = T.zeros_like(X)
            zeros_like_X.name = 'zeros_like_X'
            update_X = zeros_like_X + update_X.dimshuffle(0,1,2,'x')
            update_X.name = 'update_X'
        updates[drop_mask_X] = update_X

        rval.fixed_vars['drop_mask'] = drop_mask_X

        if hasattr(model.inference_procedure, 'V_dropout'):
            include_prob = model.inference_procedure.include_prob
            include_prob_V = model.inference_procedure.include_prob_V
            include_prob_Y = model.inference_procedure.include_prob_Y

            theano_rng = MRG_RandomStreams(2012+11+20)
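            # Each sampled mask is divided by its include probability so the
            # expected value of the dropped-out units is left unchanged.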
            for elem in flatten([model.inference_procedure.V_dropout]):
                updates[elem] = theano_rng.binomial(p=include_prob_V, size=elem.shape, dtype=elem.dtype, n=1) / include_prob_V
            if "Softmax" in str(type(model.hidden_layers[-1])):
                hid = model.inference_procedure.H_dropout[:-1]
                y = model.inference_procedure.H_dropout[-1]
                updates[y] = theano_rng.binomial(p=include_prob_Y, size=y.shape, dtype=y.dtype, n=1) / include_prob_Y
            else:
                hid = model.inference_procedure.H_dropout
            for elem in flatten(hid):
                updates[elem] =  theano_rng.binomial(p=include_prob, size=elem.shape, dtype=elem.dtype, n=1) / include_prob

        rval.on_load_batch = [utils.function(inputs, updates=updates)]

        return rval
Example 17
 def __init__(self, scale_grads=1, target_scale=.1,
         discriminator_default_input_include_prob = 1.,
         discriminator_input_include_probs=None,
         discriminator_default_input_scale=1.,
         discriminator_input_scales=None,
         generator_default_input_include_prob = 1.,
         generator_default_input_scale=1.,
         inference_default_input_include_prob=None,
         inference_input_include_probs=None,
         inference_default_input_scale=1.,
         inference_input_scales=None,
         init_now_train_generator=True,
         ever_train_discriminator=True,
         ever_train_generator=True,
         ever_train_inference=True,
         no_drop_in_d_for_g=False,
         alternate_g = False,
         infer_layer=None,
         noise_both = 0.,
         g_eps = 0.,
         d_eps =0.):
     self.__dict__.update(locals())
     del self.self
     # These allow you to dynamically switch off training parts.
     # If the corresponding ever_train_* is False, these have
     # no effect.
     self.now_train_generator = sharedX(init_now_train_generator)
     self.now_train_discriminator = sharedX(numpy.array(1., dtype='float32'))
     self.now_train_inference = sharedX(numpy.array(1., dtype='float32'))
Example 18
File: cm.py Project: blilbo/cm
    def __init__(self, nvis, nhid, num_S=0, init_W=None):
        super(CMModel, self).__init__()

        self.nvis = nvis
        self.nhid = nhid
        self.num_S = num_S
        assert num_S in {0, 1}, "Currently only num_S == 0 or num_S == 1 is supported!"

        if init_W:
            model = pickle.load(open(init_W, "rb"))
            W = model.W.get_value()
            self.W = sharedX(W)
        else:
            self.W = sharedX(np.random.uniform(-1e-3, 1e-3, (nhid, nvis)))

        self.S = sharedX(np.random.uniform(-1e-3, 1e-3, (nhid, nhid)))
        self.theta = sharedX(np.zeros(nhid))

        if self.num_S > 0:
            self._params = [self.W, self.S, self.theta]
        else:
            self._params = [self.W, self.theta]

        self.input_space = VectorSpace(dim=nvis)
        self.output_space = VectorSpace(dim=nhid)
Example 19
    def __init__(self, dataset, model, algorithm=None, save_path=None,
                 save_freq=0, extensions=None, allow_overwrite=True):
        self.allow_overwrite = allow_overwrite
        self.first_save = True
        self.dataset = dataset
        self.model = model
        self.algorithm = algorithm
        if save_path is not None:
            if save_freq == 0:
                warnings.warn('save_path specified but save_freq is 0 '
                              '(never save). Is this intentional?')
            self.save_path = preprocess(save_path)
        else:
            if save_freq > 0:
                phase_variable = 'PYLEARN2_TRAIN_PHASE'
                if phase_variable in os.environ:
                    phase = 'phase%d' % os.environ[phase_variable]
                    tokens = [os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'],
                              phase, 'pkl']
                else:
                    tokens = os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'], 'pkl'
                self.save_path = '.'.join(tokens)
        self.save_freq = save_freq

        if hasattr(self.dataset, 'yaml_src'):
            self.model.dataset_yaml_src = self.dataset.yaml_src
        else:
            warnings.warn("dataset has no yaml src, model won't know what " +
                          "data it was trained on")

        self.extensions = extensions if extensions is not None else []
        self.training_seconds = sharedX(value=0,
                                        name='training_seconds_this_epoch')
        self.total_seconds = sharedX(value=0, name='total_seconds_last_epoch')
Example 20
def run():
    disturb_mem.disturb_mem()


    b = sharedX(np.zeros((2,)))
    channels = OrderedDict()

    disturb_mem.disturb_mem()

    v_max = b.max(axis=0)
    v_min = b.min(axis=0)
    v_range = v_max - v_min

    updates = []
    for i, val in enumerate([
            v_max.max(),
            v_max.min(),
            v_range.max(),
            ]):
        disturb_mem.disturb_mem()
        s = sharedX(0., name='s_'+str(i))
        updates.append((s, val))

    for var in theano.gof.graph.ancestors(update for var, update in updates):
        if var.name is not None:
            if var.name[0] != 's' or len(var.name) != 2:
                var.name = None

    for key in channels:
        updates.append((s, channels[key]))
    file_path='nondeterminism_6.txt'
    mode = RecordMode(file_path=file_path,
                      replay=0)
    f = theano.function([], mode=mode, updates=updates, on_unused_input='ignore', name='f')

    """
    print 'type(f): ',type(f)
    print 'elements of f:'
    for elem in dir(f):
        print '\t',elem
    print 'type(f.fn): ',type(f.fn)
    print 'elements of f.fn:'
    for elem in dir(f.fn):
        print '\t',elem
    """

    trials = 1

    for i in xrange(trials):
        disturb_mem.disturb_mem()
        f()

    mode.record.f.flush()
    mode.record.f.close()

    mode.set_record(Record(file_path=file_path, replay=1))

    for i in xrange(trials):
        disturb_mem.disturb_mem()
        f()
Example 21
    def __init__(self, super_dbm, feature_niter, post_scale = 0.5, input_include_prob = .5,
            remove_y = False):
        self.__dict__.update(locals())
        del self.self
        self.input_space = super_dbm.get_input_space()
        self.output_space = super_dbm.get_output_space()

        self.theano_rng = MRG_RandomStreams(2013+1+27)

        h, g, y = super_dbm.hidden_layers
        vishid = h.get_weights()
        biashid = h.get_biases()
        hidpen = g.get_weights()
        penhid = g.get_weights().T
        biaspen = g.get_biases()
        penlab = y.get_weights()
        labpen = y.get_weights().T
        biaslab = y.get_biases()

        param_names = ['vishid', 'biashid', 'hidpen', 'penhid', 'biaspen', 'penlab', 'labpen', 'biaslab']
        self._params = []
        for name in param_names:
            val = locals()[name]
            scaled_val = val
            if val.ndim == 2:
                scaled_val = val / post_scale
            param = sharedX(scaled_val)
            setattr(self, name, param)
            self._params.append(param)
            fixed = sharedX(val)
            setattr(self, 'feature_'+name, fixed)
        self.hidden_layers = super_dbm.hidden_layers
Example 22
    def __init__(self, nvis, nhid, hidden_transition_model, irange=0.05,
                 non_linearity='sigmoid', use_ground_truth=True):
        allowed_non_linearities = {'sigmoid': T.nnet.sigmoid,
                                   'tanh': T.tanh}
        self.nvis = nvis
        self.nhid = nhid
        self.hidden_transition_model = hidden_transition_model
        self.use_ground_truth = use_ground_truth
        self.alpha = sharedX(1)
        self.alpha_decrease_rate = 0.999

        assert non_linearity in allowed_non_linearities
        self.non_linearity = allowed_non_linearities[non_linearity]

        # Space initialization
        self.input_space = VectorSpace(dim=self.nvis)
        self.hidden_space = VectorSpace(dim=self.nhid)
        self.output_space = VectorSpace(dim=1)
        self.input_source = 'features'
        self.target_source = 'targets'

        # Features-to-hidden matrix
        W_value = numpy.random.uniform(low=-irange, high=irange,
                                       size=(self.nvis, self.nhid))
        self.W = sharedX(W_value, name='W')
        # Hidden biases
        b_value = numpy.zeros(self.nhid)
        self.b = sharedX(b_value, name='b')
        # Hidden-to-out matrix
        U_value = numpy.random.uniform(low=-irange, high=irange,
                                       size=(self.nhid, 1))
        self.U = sharedX(U_value, name='U')
        # Output bias
        c_value = numpy.zeros(1)
        self.c = sharedX(c_value, name='c')
Example 23
        def set_input_space(self, space):
            """ Note: this resets parameters! """

            self.input_space = space

            if isinstance(space, VectorSpace):
                self.requires_reformat = False
                self.input_dim = space.dim
            else:
                self.requires_reformat = True
                self.input_dim = space.get_total_dimension()
                self.desired_space = VectorSpace(self.input_dim)

            self.output_space = VectorSpace(self.dim + self.copy_input \
                                                       * self.input_dim)

            rng = self.mlp.rng
            shape = (self.input_dim, self.dim)

            self.b = sharedX(self.initializer.get_biases(rng, shape),
                              name=self.layer_name + '_b')

            self.W = sharedX(self.initializer.get_weights(rng, shape),
                              name=self.layer_name + '_W')

            self.mask = sharedX(self.initializer.get_mask(),
                                 name=self.layer_name + '_mask')
Example 24
    def set_input_space(self, space):

        self.input_space = space

        if isinstance(space, VectorSpace):
            self.requires_reformat = False
            self.input_dim = space.dim
        else:
            self.requires_reformat = True
            self.input_dim = space.get_total_dimension()
            self.desired_space = VectorSpace(self.input_dim)

        self.output_space = VectorSpace(self.dim)

        self.input_dims = [self.input_dim, self.input_dim, self.hidden_dim]
        self.output_dims = [self.dim, self.hidden_dim, self.gater_dim]
        self.W = [None,None,None]
        self.b = [None,None,None]
        
        for i in range(3):
            self._init_inner_layer(i)
        
        self.stoch_grad = sharedX(0)
        self.kl_grad = sharedX(0)
        self.linear_grad = sharedX(0)
Example 25
def profile_grad(f):
    print 'profiling gradient of ',f
    rng = np.random.RandomState([2012,7,19])
    batch_size = 80
    rows = 26
    cols = 27
    channels = 30
    pool_rows = 2
    pool_cols = 3
    zv = rng.randn( batch_size, rows, cols, channels ).astype(config.floatX)

    #put the inputs + outputs in shared variables so we don't pay GPU transfer during test
    grad_shared = sharedX(zv)
    z_shared = sharedX(zv)

    p_th, h_th = f( z_shared, (pool_rows, pool_cols) )

    func = function([],updates = { grad_shared : T.grad(p_th.sum() +  h_th.sum(), z_shared)} )

    print 'warming up'
    for i in xrange(10):
        func()

    trials = 10
    results = []

    for i in xrange(trials):
        t1 = time.time()
        for j in xrange(10):
            func()
        t2 = time.time()
        print t2 - t1
        results.append(t2-t1)
    print 'final: ',sum(results)/float(trials)
Example 26
    def _init_inner_layer(self, idx):
        rng = self.mlp.rng
        if self.irange[idx] is not None:
            assert self.istdev[idx] is None
            assert self.sparse_init[idx] is None
            W = rng.uniform(-self.irange[idx], self.irange[idx],
                        (self.input_dims[idx], self.output_dims[idx]))
        elif self.istdev[idx] is not None:
            assert self.sparse_init[idx] is None
            W = rng.randn(self.input_dims[idx], self.output_dims[idx]) \
                    * self.istdev[idx]
        else:
            assert self.sparse_init[idx] is not None
            W = np.zeros((self.input_dims[idx], self.output_dims[idx]))
            for i in xrange(self.output_dims[idx]):
                assert self.sparse_init[idx] <= self.input_dims[idx]
                for j in xrange(self.sparse_init[idx]):
                    idx2 = rng.randint(0, self.input_dims[idx])
                    while W[idx2, i] != 0:
                        idx2 = rng.randint(0, self.input_dims[idx])
                    W[idx2, i] = rng.randn()
            W *= self.sparse_stdev[idx]

        W = sharedX(W)
        W.name = self.layer_name + '_W' + str(idx)
        
        b = sharedX( np.zeros((self.output_dims[idx],)) \
                + self.init_bias[idx], \
                name = self.layer_name + '_b' + str(idx))

        self.W[idx] = W
        self.b[idx] = b
Example 27
def profile(f):
    print 'profiling ',f
    rng = np.random.RandomState([2012,7,19])
    batch_size = 128
    rows = 30
    cols = 30
    channels = 16
    pool_rows = 3
    pool_cols = 3
    zv = rng.randn(channels, rows, cols, batch_size).astype(config.floatX)

    #put the inputs + outputs in shared variables so we don't pay GPU transfer during test
    p_shared = sharedX(zv[:,0:rows:pool_rows,0:cols:pool_cols,:])
    h_shared = sharedX(zv)
    z_shared = sharedX(zv)

    p_th, h_th = f( z_shared, (pool_rows, pool_cols) )

    func = function([],updates = { p_shared : p_th, h_shared : h_th} )

    print 'warming up'
    for i in xrange(10):
        func()

    trials = 10
    results = []

    for i in xrange(trials):
        t1 = time.time()
        for j in xrange(10):
            func()
        t2 = time.time()
        print t2 - t1
        results.append(t2-t1)
    print 'final: ',sum(results)/float(trials)
Example 28
 def __init__(self,W1, b1,W2,b2, mf_iter):
     self.mf_iter = mf_iter
     self.W1 = sharedX(W1)
     self.W2 = sharedX(W2)
     self.b1 = sharedX(b1)
     self.b2 = sharedX(b2)
     self.dataset_yaml_src = "!obj:pylearn2.datasets.mnist.MNIST { which_set : train }"
Example 29
    def set_input_space(self, space):
        """ Note: this resets parameters! """

        self.input_space = space

        if isinstance(space, VectorSpace):
            self.requires_reformat = False
            self.input_dim = space.dim
        else:
            self.requires_reformat = True
            self.input_dim = space.get_total_dimension()
            self.desired_space = VectorSpace(self.input_dim)


        if not (self.detector_layer_dim % self.pool_size == 0):
            raise ValueError("detector_layer_dim = %d, pool_size = %d. Should be divisible but remainder is %d" %
                             (self.detector_layer_dim, self.pool_size, self.detector_layer_dim % self.pool_size))

        self.h_space = VectorSpace(self.detector_layer_dim)
        self.pool_layer_dim = self.detector_layer_dim / self.pool_size
        self.output_space = VectorSpace(self.pool_layer_dim)

        rng = self.mlp.rng
        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange,
                            self.irange,
                            (self.input_dim, self.detector_layer_dim)) * \
                (rng.uniform(0.,1., (self.input_dim, self.detector_layer_dim))
                 < self.include_prob)
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.detector_layer_dim))
            def mask_rejects(idx, i):
                if self.mask_weights is None:
                    return False
                return self.mask_weights[idx, i] == 0.
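            # Sparse initialization: each of the detector_layer_dim columns
            # receives exactly sparse_init nonzero weights drawn from randn(),
            # skipping positions rejected by the mask.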
            for i in xrange(self.detector_layer_dim):
                assert self.sparse_init <= self.input_dim
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0 or mask_rejects(idx, i):
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()
            W *= self.sparse_stdev

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.transformer = MatrixMul(W)

        W ,= self.transformer.get_params()
        assert W.name is not None

        if self.mask_weights is not None:
            expected_shape =  (self.input_dim, self.detector_layer_dim)
            if expected_shape != self.mask_weights.shape:
                raise ValueError("Expected mask with shape "+str(expected_shape)+" but got "+str(self.mask_weights.shape))
            self.mask = sharedX(self.mask_weights)
Example 30
 def __init__(self):
     self.W1 = [sharedX(rng.randn(num_features, chunk_width)) for i
         in xrange(num_chunks)]
     disturb_mem.disturb_mem()
     self.W2 = [sharedX(rng.randn(chunk_width)) for i in xrange(num_chunks)]
     self._params = safe_union(self.W1, self.W2)
     self.input_space = VectorSpace(num_features)
     self.output_space = VectorSpace(1)
Example 31
 def initialize_parameters(self, nhid):
     self.nhid = nhid
     self.prior_mu = sharedX(numpy.zeros(self.nhid), name="prior_mu")
     self.log_prior_sigma = sharedX(numpy.zeros(self.nhid),
                                    name="prior_log_sigma")
     self._params = [self.prior_mu, self.log_prior_sigma]
Example 32
    def __init__(self,
                 learning_rate,
                 cost=None,
                 batch_size=None,
                 monitoring_batches=None,
                 monitoring_dataset=None,
                 monitor_iteration_mode='sequential',
                 termination_criterion=None,
                 update_callbacks=None,
                 learning_rule=None,
                 init_momentum=None,
                 set_batch_size=False,
                 train_iteration_mode=None,
                 batches_per_iter=None,
                 theano_function_mode=None,
                 monitoring_costs=None,
                 seed=[2012, 10, 5]):
        """
        Parameters
        ----------
        learning_rate : float
            The learning rate to use. Train object callbacks can change the \
            learning rate after each epoch. SGD update_callbacks can change \
            it after each minibatch.
        cost : pylearn2.costs.cost.Cost
            Cost object specifying the objective function to be minimized. \
            Optionally, may be None. In this case, SGD will call the model's \
            get_default_cost method to obtain the objective function.
        batch_size : optional, int
            The size of the batch to be used.
            If not specified, the model will be asked for the batch size, so
            you must have specified the batch size there.
            (Some models are rigidly defined to only work with one batch size)
        monitoring_batches : optional, int
            At the start of each epoch, we run "monitoring", to evaluate
            quantities such as the validation set error.
            monitoring_batches, if specified, determines the number of batches
            to draw from the iterator for each monitoring dataset.
            Unnecessary if not using monitoring or if `monitor_iteration_mode`
            is 'sequential' and `batch_size` is specified (number of
            batches will be calculated based on full dataset size).
            TODO: make it possible to specify different monitoring_batches
            for each monitoring dataset. The Monitor itself already supports
            this.
        monitoring_dataset : optional, a Dataset or dictionary
            If not specified, no monitoring is used.
            If specified to be a Dataset, monitor on that Dataset.
            If specified to be dictionary, the keys should be string names
            of datasets, and the values should be Datasets. All monitoring
            channels will be computed for all monitoring Datasets and will
            have the dataset name and an underscore prepended to them.
        monitor_iteration_mode : optional, str
            The iteration mode used to iterate over the examples in all
            monitoring datasets. If not specified, defaults to 'sequential'.
            TODO: make it possible to specify different modes for different
            datasets.
        termination_criterion : optional, instance of
            pylearn2.termination_criteria.TerminationCriterion
            Used to determine when the algorithm should stop running.
            If not specified, runs forever--or more realistically, until
            external factors halt the python process (Kansas 1977).
        update_callbacks : optional, list
            If specified, each member of the list should be a callable that
            accepts an SGD instance as its only argument.
            All callbacks will be called with this SGD instance after each
            SGD step.
        learning_rule : training_algorithms.learning_rule.LearningRule
            A learning rule computes the new parameter values given old \
            parameters and first-order gradients. If learning_rule is None, \
            sgd.SGD will update parameters according to the standard SGD \
            learning rule:
                param := param - learning_rate * d cost / d param
            This argument allows more sophisticated learning rules, such
            as SGD with momentum.
        init_momentum : **DEPRECATED** option, float
            Use learning_rule instead.
            If None, does not use momentum otherwise, use momentum and \
            initialize the momentum coefficient to init_momentum. Callbacks \
            can change this over time just like the learning rate. If the \
            gradient is the same on every step, then the update taken by the \
            SGD algorithm is scaled by a factor of 1/(1-momentum). See \
            section 9 of Geoffrey Hinton's "A Practical Guide to Training \
            Restricted Boltzmann Machines" for details.
        set_batch_size : optional, bool
            Defaults to False.
            If True, and batch_size conflicts with model.force_batch_size, \
            will call model.set_batch_size(batch_size) in an attempt to \
            change model.force_batch_size
        train_iteration_mode : optional, str
            Defaults to 'shuffled_sequential'.
            The iteration mode to use for iterating through training examples.
        batches_per_iter : optional, int
            The number of batches to draw from the iterator over training
            examples.
            If iteration mode is 'sequential' or 'shuffled_sequential', this
            is unnecessary; when unspecified we will iterate over all examples.
        theano_function_mode : optional, a valid argument to theano.function's
            'mode' parameter.
            The theano mode to compile the updates function with. Note that \
            pylearn2 includes some wraplinker modes that are not bundled with \
            theano. See pylearn2.devtools. These extra modes let you do \
            things like check for NaNs at every step, or record md5 digests \
            of all computations performed by the update function to help \
            isolate problems with nondeterminism.
        monitoring_costs : optional, list
            a list of Cost instances. The Monitor will also include all
            channels defined by these Costs, even though we don't train
            using them.
        seed : optional, valid argument to np.random.RandomState
            The seed used for the random number generator to be passed to the
            training dataset iterator (if any)
        """

        if isinstance(cost, (list, tuple, set)):
            raise TypeError("SGD no longer supports using collections of " +
                            "Costs to represent a sum of Costs. Use " +
                            "pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn(
                "init_momentum interface is deprecated and will "
                "become officially unsuported as of May 9, 2014. Please use the "
                "`learning_rule` parameter instead, providing an object of type "
                "`pylearn2.training_algorithms.learning_rule.Momentum` instead"
            )
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches " +
                                 "but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = make_np_rng(seed, which_method=["randn", "randint"])
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs
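
A hedged construction sketch for this algorithm (the concrete values, the validation set and the EpochCounter criterion are illustrative assumptions, not from the source):

    algorithm = SGD(learning_rate=.01,
                    batch_size=100,
                    monitoring_dataset=valid_set,
                    learning_rule=Momentum(init_momentum=.5),
                    termination_criterion=EpochCounter(100))
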
Example 33
 def _initialize_hidbias(self):
     self.hidbias = sharedX(numpy.zeros(self.nhid), name='hb', borrow=True)
Example 34
    def __init__(self,
                 learning_rate,
                 cost=None,
                 batch_size=None,
                 monitoring_batch_size=None,
                 monitoring_batches=None,
                 monitoring_dataset=None,
                 monitor_iteration_mode='sequential',
                 termination_criterion=None,
                 update_callbacks=None,
                 learning_rule=None,
                 init_momentum=None,
                 set_batch_size=False,
                 train_iteration_mode=None,
                 batches_per_iter=None,
                 theano_function_mode=None,
                 monitoring_costs=None,
                 seed=[2012, 10, 5]):

        if isinstance(cost, (list, tuple, set)):
            raise TypeError("SGD no longer supports using collections of " +
                            "Costs to represent a sum of Costs. Use " +
                            "pylearn2.costs.cost.SumOfCosts instead.")

        if init_momentum:
            warnings.warn(
                "init_momentum interface is deprecated and will "
                "become officially unsuported as of May 9, 2014. Please use the "
                "`learning_rule` parameter instead, providing an object of type "
                "`pylearn2.training_algorithms.learning_rule.Momentum` instead"
            )
            # Convert to new interface under the hood.
            self.learning_rule = Momentum(init_momentum)
        else:
            self.learning_rule = learning_rule

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batch_size = monitoring_batch_size
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batch_size is not None:
                raise ValueError("Specified a monitoring batch size " +
                                 "but not a monitoring dataset.")
            if monitoring_batches is not None:
                raise ValueError("Specified an amount of monitoring batches " +
                                 "but not a monitoring dataset.")
        self.termination_criterion = termination_criterion
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = make_np_rng(seed, which_method=["randn", "randint"])
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs
Example 35
def setup_detector_layer_c01b(layer, input_space, rng, irange="not specified"):
    """
    .. todo::

        WRITEME properly

    Takes steps to set up an object for use as being some kind of convolutional
    layer. This function sets up only the detector layer.

    Does the following:

    * raises a RuntimeError if cuda is not available
    * sets layer.input_space to input_space
    * sets up addition of dummy channels for compatibility with cuda-convnet:

      - layer.dummy_channels: # of dummy channels that need to be added
        (You might want to check this and raise an Exception if it's not 0)
      - layer.dummy_space: The Conv2DSpace representing the input with dummy
        channels added

    * sets layer.detector_space to the space for the detector layer
    * sets layer.transformer to be a Conv2D instance
    * sets layer.b to the right value

    Parameters
    ----------
    layer : object
        Any python object that allows the modifications described below and \
        has the following attributes: \
        * pad: int describing amount of zero padding to add \
        * kernel_shape: 2-element tuple or list describing spatial shape of \
          kernel \
        * fix_kernel_shape: bool, if true, will shrink the kernel shape to \
          make it feasible, as needed (useful for hyperparameter searchers) \
        * detector_channels: The number of channels in the detector layer \
        * init_bias: numeric constant added to a tensor of zeros to \
          initialize the bias \
        * tied_b: If true, biases are shared across all spatial locations
    input_space : WRITEME
        A Conv2DSpace to be used as input to the layer
    rng : WRITEME
        A numpy RandomState or equivalent
    """

    if irange != "not specified":
        raise AssertionError(
            "There was a bug in setup_detector_layer_c01b."
            "It uses layer.irange instead of the irange parameter to the "
            "function. The irange parameter is now disabled by this "
            "AssertionError, so that this error message can alert you that "
            "the bug affected your code and explain why the interface is "
            "changing. The irange parameter to the function and this "
            "error message may be removed after April 21, 2014."
        )

    # Use "self" to refer to layer from now on, so we can pretend we're
    # just running in the set_input_space method of the layer
    self = layer

    # Make sure cuda is available
    check_cuda(str(type(self)))

    # Validate input
    if not isinstance(input_space, Conv2DSpace):
        raise TypeError("The input to a convolutional layer should be a "
                        "Conv2DSpace, but layer " + self.layer_name + " got " +
                        str(type(self.input_space)))

    if not hasattr(self, 'detector_channels'):
        raise ValueError("layer argument must have a 'detector_channels' "
                         "attribute specifying how many channels to put in "
                         "the convolution kernel stack.")

    # Store the input space
    self.input_space = input_space

    # Make sure number of channels is supported by cuda-convnet
    # (multiple of 4 or <= 3)
    # If not supported, pad the input with dummy channels
    ch = self.input_space.num_channels
    rem = ch % 4
    if ch > 3 and rem != 0:
        self.dummy_channels = 4 - rem
    else:
        self.dummy_channels = 0
    self.dummy_space = Conv2DSpace(
        shape=input_space.shape,
        channels=input_space.num_channels + self.dummy_channels,
        axes=('c', 0, 1, 'b')
    )

    if hasattr(self, 'kernel_stride'):
        kernel_stride = self.kernel_stride
    else:
        kernel_stride = [1, 1]

    output_shape = \
        [int(np.ceil((i_sh + 2. * self.pad - k_sh) / float(k_st))) + 1
         for i_sh, k_sh, k_st in zip(self.input_space.shape,
                                     self.kernel_shape, kernel_stride)]

    def handle_kernel_shape(idx):
        if self.kernel_shape[idx] < 1:
            raise ValueError("kernel must have strictly positive size on all "
                             "axes but has shape: " + str(self.kernel_shape))
        if output_shape[idx] <= 0:
            if self.fix_kernel_shape:
                self.kernel_shape[idx] = \
                    self.input_space.shape[idx] + 2 * self.pad
                assert self.kernel_shape[idx] != 0
                output_shape[idx] = 1
                warnings.warn("Had to change the kernel shape to make "
                              "network feasible")
            else:
                raise ValueError("kernel too big for input "
                                 "(even with zero padding)")

    map(handle_kernel_shape, [0, 1])

    if self.detector_channels < 16:
        raise ValueError("Cuda-convnet requires the detector layer to have "
                         "at least 16 channels.")

    self.detector_space = Conv2DSpace(shape=output_shape,
                                      num_channels=self.detector_channels,
                                      axes=('c', 0, 1, 'b'))

    if hasattr(self, 'partial_sum'):
        partial_sum = self.partial_sum
    else:
        partial_sum = 1

    if hasattr(self, 'sparse_init') and self.sparse_init is not None:
        self.transformer = \
            checked_call(make_sparse_random_conv2D,
                         OrderedDict([('num_nonzero', self.sparse_init),
                                      ('input_space', self.input_space),
                                      ('output_space', self.detector_space),
                                      ('kernel_shape', self.kernel_shape),
                                      ('pad', self.pad),
                                      ('partial_sum', partial_sum),
                                      ('kernel_stride', kernel_stride),
                                      ('rng', rng)]))
    else:
        self.transformer = make_random_conv2D(
            irange=self.irange,
            input_axes=self.input_space.axes,
            output_axes=self.detector_space.axes,
            input_channels=self.dummy_space.num_channels,
            output_channels=self.detector_space.num_channels,
            kernel_shape=self.kernel_shape,
            pad=self.pad,
            partial_sum=partial_sum,
            kernel_stride=kernel_stride,
            rng=rng
        )

    W, = self.transformer.get_params()
    W.name = self.layer_name + '_W'

    if self.tied_b:
        self.b = sharedX(np.zeros(self.detector_space.num_channels) +
                         self.init_bias)
    else:
        self.b = sharedX(self.detector_space.get_origin() + self.init_bias)
    self.b.name = self.layer_name + '_b'
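    # With tied_b the layer learns a single bias per output channel; otherwise
    # it learns a separate bias for every (channel, row, column) position of
    # the detector space, i.e. the shape of detector_space.get_origin().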

    logger.info('Input shape: {0}'.format(self.input_space.shape))
    logger.info('Detector space: {0}'.format(self.detector_space.shape))
Esempio n. 36
0
    def __init__(self,
                 objective,
                 params,
                 inputs=None,
                 param_constrainers=None,
                 max_iter=-1,
                 lr_scalers=None,
                 verbose=0,
                 tol=None,
                 init_alpha=None,
                 min_init_alpha=1e-3,
                 reset_alpha=True,
                 conjugate=False,
                 reset_conjugate=True,
                 gradients=None,
                 gradient_updates=None,
                 line_search_mode=None,
                 accumulate=False,
                 theano_function_mode=None):

        self.__dict__.update(locals())
        del self.self

        if line_search_mode is None:
            if init_alpha is None:
                init_alpha = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        if inputs is None:
            inputs = []

        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        self.verbose = verbose

        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [param for param in params]

        for param in params:
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            grad_shared = sharedX(param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        if self.verbose:
            print 'batch gradient class compiling gradient function'
        t1 = time.time()
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates=updates)
        else:
            self._compute_grad = function(
                inputs,
                updates=updates,
                mode=self.theano_function_mode,
                name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            print 'done. Took ', t2 - t1

        if self.verbose:
            print 'batch gradient class compiling objective function'
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs,
                                obj,
                                mode=self.theano_function_mode,
                                name='BatchGradientDescent.obj')

        if self.verbose:
            print 'done'

        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name='alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                                 name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function(
            [],
            updates=cache_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function([alpha],
                                    updates=goto_updates,
                                    mode=self.theano_function_mode,
                                    name='BatchGradientDescent._goto_alpha')

        norm = T.sqrt(
            sum([
                T.sqr(elem).sum()
                for elem in self.param_to_grad_shared.values()
            ]))
        norm.name = 'BatchGradientDescent.norm'
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (
            1. - self.new_weight) * self.ave_grad_size

        self._normalize_grad = function(
            [],
            norm,
            updates=normalize_grad_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = sharedX(elem.get_value(),
                                                 'old_' + elem.name)

            self._store_old_grad = function(
                [norm],
                updates=OrderedDict([(grad_to_old_grad[g], g * norm)
                                     for g in grad_to_old_grad]),
                mode=self.theano_function_mode,
                name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [grad_to_old_grad[g] for g in grad_ordered]

            def dot_product(x, y):
                return sum([(x_elem * y_elem).sum()
                            for x_elem, y_elem in safe_zip(x, y)])

            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                    (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)
            """

            beta_pr is the Polak-Ribiere formula for beta.
            According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
            but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
            (ie, it is meant to revert to steepest descent when you have traveled far enough that
            the objective function is behaving non-quadratically enough that the conjugate gradient
            formulas aren't working anymore)

            http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

            """

            assert grad not in grad_to_old_grad

            make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g])
                                      for g in grad_ordered]

            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                            + var_descriptor(v) + '\n')
                    mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                            + var_descriptor(u) + '\n')

            self._make_conjugate = function(
                [],
                updates=make_conjugate_updates,
                mode=self.theano_function_mode,
                name='BatchGradientDescent._make_conjugate')

            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                            + var_descriptor(output) + '\n')

        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)
Esempio n. 37
0
 def __init__(self, rows, cols, channels):
     dim = rows * cols * channels
     self.input_space = Conv2DSpace((rows, cols), channels)
     self.dim = dim
     rng = np.random.RandomState([2012, 9, 25])
     self.P = sharedX(rng.uniform(-1., 1., (dim, )))
Esempio n. 38
0
 def set_input_space(self, space):
     self.input_space = space
     self.output_space = space
     dim = space.get_total_dimension()
     self.D = sharedX(np.zeros((dim, )), self.layer_name + '_D')
     self._params = [self.D]
Esempio n. 39
0
 def __init__(self, num_arms, mean_std = 1.0, std_std = 1.0):
     self.rng = make_np_rng(None, [2013, 11, 12], which_method="randn")
     self.means = sharedX(self.rng.randn(num_arms) * mean_std)
     self.stds = sharedX(np.abs(self.rng.randn(num_arms) * std_std))
     self.theano_rng = make_theano_rng(None, self.rng.randint(2 ** 16), which_method="normal")
Esempio n. 40
0
__email__ = "pylearn-dev@googlegroups"

import warnings
import numpy
import theano.tensor as T

from pylearn2.compat import OrderedDict
from pylearn2.expr.basic import log_sum_exp
from pylearn2.models.model import Model
from pylearn2.models.vae.kl import find_integrator_for
from pylearn2.space import VectorSpace
from pylearn2.utils import wraps, sharedX, safe_update
from pylearn2.utils.rng import make_np_rng

default_seed = 2014 + 9 + 20
pi = sharedX(numpy.pi)


class VAE(Model):
    """
    Implementation of the variational autoencoder (VAE).

    Parameters
    ----------
    nvis : int
        Number of dimensions in the input data
    prior : pylearn2.models.vae.prior.Prior
        Represents the prior distribution :math:`p_\\theta(\\mathbf{z})`
    conditional : pylearn2.models.vae.conditional.Conditional
        Represents the conditional distribution
        :math:`p_\\theta(\\mathbf{x} \\mid \\mathbf{z})`
Esempio n. 41
0
__email__ = "dinhlaur@iro"

import numpy as np
import scipy
import scipy.linalg
import theano
import theano.tensor as T
from pylearn2.models.model import Model
from pylearn2.models.mlp import Layer, Linear, MLP
from pylearn2.space import VectorSpace, CompositeSpace
from pylearn2.utils import sharedX, wraps, as_floatX
from pylearn2.utils.rng import make_theano_rng
from pylearn2.linear.matrixmul import MatrixMul
from theano.compat.python2x import OrderedDict

pi = sharedX(np.pi)
T_inv = T.nlinalg.MatrixInverse()
T_det = T.nlinalg.Det()


class TriangularMLP(MLP):
    """
    Triangular MLP, a MLP of bijective layers.
    (see pylearn2.models.mlp for arguments)

    """
    def inv_fprop(self, state, return_all=False):
        """
        Inversion of the MLP forward propagation.

        Parameters
        ----------
        state : WRITEME
        return_all : WRITEME
        """
Esempio n. 42
0
    def set_input_space(self, space):
        """ Note: this resets parameters! """

        # set up detector space and initialize transformer
        setup_detector_layer_b01tc(layer=self,
                                   input_space=space,
                                   rng=self.mlp.rng,
                                   irange=self.irange)
        rng = self.mlp.rng
        detector_shape = self.detector_space.shape

        #def handle_pool_shape(idx):
        #    if self.pool_shape[idx] < 1:
        #        raise ValueError("bad pool shape: " + str(self.pool_shape))
        #    if self.pool_shape[idx] > detector_shape[idx]:
        #        if self.fix_pool_shape:
        #            assert detector_shape[idx] > 0
        #            self.pool_shape[idx] = detector_shape[idx]
        #        else:
        #            raise ValueError("Pool shape exceeds detector layer shape on axis %d" % idx)
        #map(handle_pool_shape, [0, 1, 2])

        ### Check some preconditions
        assert self.pool_shape[0] == self.pool_shape[1]
        assert self.pool_stride[0] == self.pool_stride[1]
        assert all(
            isinstance(elem, py_integer_types) for elem in self.pool_stride)
        for i in xrange(0, 2):
            assert self.pool_stride[i] <= self.pool_shape[i]

        dummy_shape = [self.input_space.shape[0], self.input_space.shape[1]]

        # Recompute the convolution output shape so that a dummy max_pool_c01b
        # call below can be used to infer the output space shape after spatial
        # and temporal pooling.
        dummy_output_shape = [
            int(np.ceil((i_sh + 2. * self.pad - k_sh) / float(k_st))) + 1
            for i_sh, k_sh, k_st in zip(dummy_shape, self.kernel_shape,
                                        self.kernel_stride)
        ]

        dummy_output_shape = [dummy_output_shape[0], dummy_output_shape[1]]
        #print dummy_output_shape
        dummy_detector_space = Conv2DSpace(shape=dummy_output_shape,
                                           num_channels=self.detector_channels,
                                           axes=('c', 0, 1, 'b'))

        # Use only 16 channels and a tiny dummy batch for a fast dummy max
        # pooling (16 because Alex Krizhevsky's cuda-convnet code needs at
        # least 16 channels).
        dummy_detector = sharedX(
            dummy_detector_space.get_origin_batch(2)[0:16, :, :, :])

        dummy_p = max_pool_c01b(c01b=dummy_detector,
                                pool_shape=self.pool_shape,
                                pool_stride=self.pool_stride)
        dummy_p = dummy_p.eval()
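        # The dummy shared variable above exists only so the symbolic pooling
        # graph can be evaluated once; its values are all zeros, and only the
        # shape of dummy_p is used below to size the output space.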

        # set space after temporal pooling with overlap
        if self.pool_temporal_stride[1] > self.pool_temporal_shape[1]:
            if self.fix_pool_stride:
                warnings.warn("Fixing the pool stride")
                ps = self.pool_temporal_shape[1]
                assert isinstance(ps, py_integer_types)
                self.pool_temporal_stride = [1, ps]
            else:
                raise ValueError("Stride too big.")
        # (0*1,'t')
        dummy_temp_image = [(dummy_p.shape[1] * dummy_p.shape[2]),
                            self.detector_space.shape[2]]
        #overlapped temporal max pooling image_shape
        self.temp_pool_input_shape = dummy_temp_image
        dummy_temp_space = Conv2DSpace(shape=dummy_temp_image,
                                       num_channels=self.detector_channels,
                                       axes=('c', 0, 1, 'b'))
        temp_input = sharedX(
            dummy_temp_space.get_origin_batch(2)[0:16, :, :, :])
        dummy_temp_p = temporal_max_pool_c01b(
            c01b=temp_input,
            pool_shape=self.pool_temporal_shape,
            pool_stride=self.pool_temporal_stride,
            image_shape=dummy_temp_image)
        dummy_temp_p = dummy_temp_p.eval()

        self.output_space = Conv3DSpace(
            shape=[dummy_p.shape[1], dummy_p.shape[2], dummy_temp_p.shape[2]],
            num_channels=self.num_channels,
            axes=('b', 0, 1, 't', 'c'))

        # Print spaces
        print "Input shape: ", self.input_space.shape
        print "Detector space: ", self.detector_space.shape
        print "Output space: ", self.output_space.shape
Esempio n. 43
0
#stats = SufficientStatistics.from_observations( needed_stats = needed_stats, V = V, ** obs )
#em_functional = model.em_functional( stats = stats, H_hat = obs['H_hat'], S_hat = obs['S_hat'], var_s0_hat = obs['var_s0_hat'], var_s1_hat = obs['var_s1_hat'])

trunc_kl = model.inference_procedure.truncated_KL(V, obs)

if config.compute_test_value != 'off':
    assert not np.any(np.isnan(trunc_kl.tag.test_value))

assert len(trunc_kl.type.broadcastable) == 0

print 'compiling function...'
from theano import function

G = [
    sharedX(np.zeros((batch_size, rbm.nhid), dtype='float32'))
    for rbm in model.dbm.rbms
]
H = sharedX(np.zeros((batch_size, model.s3c.nhid), dtype='float32'))
S = sharedX(np.zeros((batch_size, model.s3c.nhid), dtype='float32'))

new_stats = SufficientStatistics.from_observations(
    needed_stats=needed_stats,
    V=V,
    H_hat=H,
    S_hat=S,
    var_s0_hat=obs['var_s0_hat'],
    var_s1_hat=obs['var_s1_hat'])

obj = model.inference_procedure.truncated_KL(
    V, {
Esempio n. 44
0
from pylearn2.datasets.binarizer import Binarizer
from pylearn2.datasets.mnist import MNIST

print 'Loading data...'
raw = MNIST(which_set='train', one_hot=True)
train = Binarizer(raw)

print 'Compiling cost functions...'
for model in models:
    model.niter = 10

from galatea.dbm.inpaint.super_inpaint import SuperInpaint
from galatea.dbm.inpaint.super_inpaint import MaskGen
from pylearn2.utils import sharedX

mask_gen = MaskGen(drop_prob=sharedX(0.1), balance=0, sync_channels=0)

cost = SuperInpaint(both_directions=0,
                    noise=0,
                    supervised=1,
                    mask_gen=mask_gen)

from pylearn2.utils import function


def get_obj_func(model):
    X = model.get_input_space().make_batch_theano()
    Y = model.get_output_space().make_batch_theano()
    obj = cost(model, X, Y)
    return function([X, Y], obj)
Esempio n. 45
0
 def __init__(self, num_arms, mean_std=1.0, std_std=1.0):
     self.rng = np.random.RandomState([2013, 11, 12])
     self.means = sharedX(self.rng.randn(num_arms) * mean_std)
     self.stds = sharedX(np.abs(self.rng.randn(num_arms) * std_std))
     self.theano_rng = MRG_RandomStreams(self.rng.randint(2**16))
Esempio n. 46
0
def test_softmax_mf_sample_consistent():

    # A test of the Softmax class
    # Verifies that the mean field update is consistent with
    # the sampling function

    # Since a Softmax layer contains only one random variable
    # (with n_classes possible values) the mean field assumption
    # does not impose any restriction so mf_update simply gives
    # the true expected value of h given v.
    # We can thus use mf_update to compute the expected value
    # of a sample of y conditioned on v, and check that samples
    # drawn using the layer's sample method converge to that
    # value.

    rng = np.random.RandomState([2012,11,1,1154])
    theano_rng = MRG_RandomStreams(2012+11+1+1154)
    num_samples = 1000
    tol = .042

    # Make DBM
    num_vis = rng.randint(1,11)
    n_classes = rng.randint(1, 11)

    v = BinaryVector(num_vis)
    v.set_biases(rng.uniform(-1., 1., (num_vis,)).astype(config.floatX))

    y = Softmax(
            n_classes = n_classes,
            layer_name = 'y',
            irange = 1.)
    y.set_biases(rng.uniform(-1., 1., (n_classes,)).astype(config.floatX))

    dbm = DBM(visible_layer = v,
            hidden_layers = [y],
            batch_size = 1,
            niter = 50)

    # Randomly pick a v to condition on
    # (Random numbers are generated via dbm.rng)
    layer_to_state = dbm.make_layer_to_state(1)
    v_state = layer_to_state[v]
    y_state = layer_to_state[y]

    # Infer P(y | v) using mean field
    expected_y = y.mf_update(
            state_below = v.upward_state(v_state))

    expected_y = expected_y[0, :]

    expected_y = expected_y.eval()

    # copy all the states out into a batch size of num_samples
    cause_copy = sharedX(np.zeros((num_samples,))).dimshuffle(0,'x')
    v_state = v_state[0,:] + cause_copy
    y_state = y_state[0,:] + cause_copy
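    # cause_copy is a (num_samples, 1) column of zeros; adding it broadcasts
    # the single conditioning state out to num_samples identical rows.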

    y_samples = y.sample(state_below = v.upward_state(v_state), theano_rng=theano_rng)

    y_samples = function([], y_samples)()

    check_multinomial_samples(y_samples, (num_samples, n_classes), expected_y, tol)
Esempio n. 47
0
def make_local_rfs(dataset,
                   nhid,
                   rf_shape,
                   stride,
                   irange=.05,
                   draw_patches=False,
                   rng=None):
    """
    Initializes a weight matrix with local receptive fields

    Parameters
    ----------
    dataset : pylearn2.datasets.dataset.Dataset
        Dataset defining the topology of the space (needed to convert 2D
        patches into subsets of pixels in a 1D filter vector)
    nhid : int
        Number of hidden units to make filters for
    rf_shape : list or tuple (2 elements)
        Gives topological shape of a receptive field
    stride : list or tuple (2 elements)
        Gives offset between receptive fields
    irange : float
        If draw_patches is False, weights are initialized in U(-irange,irange)
    draw_patches : bool
        If True, weights are drawn from random examples

    Returns
    -------
    rval : pylearn2.linear.matrixmul.MatrixMul
        A MatrixMul wrapping a 2D weight matrix that implements the desired
        local receptive fields.
    """
    s = dataset.view_shape()
    height, width, channels = s
    W_img = np.zeros((nhid, height, width, channels))

    last_row = s[0] - rf_shape[0]
    last_col = s[1] - rf_shape[1]

    rng = make_np_rng(rng, [2012, 07, 18], which_method='uniform')

    if stride is not None:
        # local_rf_stride specified, make local_rfs on a grid
        assert last_row % stride[0] == 0
        num_row_steps = last_row / stride[0] + 1

        assert last_col % stride[1] == 0
        num_col_steps = last_col / stride[1] + 1

        total_rfs = num_row_steps * num_col_steps

        if nhid % total_rfs != 0:
            raise ValueError('nhid modulo total_rfs should be 0, but we get '
                             '%d modulo %d = %d' %
                             (nhid, total_rfs, nhid % total_rfs))

        filters_per_rf = nhid / total_rfs

        idx = 0
        for r in xrange(num_row_steps):
            rc = r * stride[0]
            for c in xrange(num_col_steps):
                cc = c * stride[1]

                for i in xrange(filters_per_rf):

                    if draw_patches:
                        img = dataset.get_batch_topo(1)[0]
                        local_rf = img[rc:rc + rf_shape[0],
                                       cc:cc + rf_shape[1], :]
                    else:
                        local_rf = rng.uniform(
                            -irange, irange, (rf_shape[0], rf_shape[1], s[2]))

                    W_img[idx, rc:rc + rf_shape[0],
                          cc:cc + rf_shape[1], :] = local_rf
                    idx += 1
        assert idx == nhid
    else:
        raise NotImplementedError()
        #the case below is copy-pasted from s3c and not generalized yet
        #no stride specified, use random shaped patches
        """
        assert local_rf_max_shape is not None

        for idx in xrange(nhid):
            shape = [ self.rng.randint(min_shape,max_shape+1) for
                    min_shape, max_shape in zip(
                        local_rf_shape,
                        local_rf_max_shape) ]
            loc = [ self.rng.randint(0, bound - width + 1) for
                    bound, width in zip(s, shape) ]

            rc, cc = loc

            if local_rf_draw_patches:
                img = local_rf_src.get_batch_topo(1)[0]
                local_rf = img[rc:rc+shape[0],
                               cc:cc+shape[1],
                               :]
            else:
                local_rf = self.rng.uniform(-self.irange,
                            self.irange,
                            (shape[0], shape[1], s[2]) )

            W_img[idx,rc:rc+shape[0],
                      cc:cc+shape[1],:] = local_rf
        """

    W = dataset.view_converter.topo_view_to_design_mat(W_img).T

    rval = MatrixMul(W=sharedX(W))

    return rval
Esempio n. 48
0
File: mnd.py Project: wqren/pylearn
    def redo_everything(self):

        self.beta = sharedX(np.ones((self.nvis, )) * self.init_beta, 'beta')
        self.mu = sharedX(np.ones((self.nvis, )) * self.init_mu, 'mu')
        self.redo_theano()
Esempio n. 49
0
 def __init__(self, dim):
     self.dim = dim
     rng = np.random.RandomState([2012, 9, 25])
     self.P = sharedX(rng.uniform(-1., 1., (dim, )))
Esempio n. 50
0
    def __init__(self,
                 n_vis,
                 n_hid,
                 layer_name,
                 rng=None,
                 return_indices=None,
                 param_init_range=0.02,
                 forget_gate_init_bias=0.05,
                 input_gate_init_bias=0.,
                 output_gate_init_bias=0.,
                 dropout_prob=0.0):
        if rng is None:
            rng = np.random.RandomState()
        self.rng = rng
        self.n_vis = n_vis
        self.n_hid = n_hid
        self.layer_name = layer_name
        self.param_init_range = param_init_range
        self.return_indices = return_indices
        self.forget_gate_init_bias = forget_gate_init_bias
        self.input_gate_init_bias = input_gate_init_bias
        self.output_gate_init_bias = output_gate_init_bias
        self.dropout_prob = dropout_prob

        # only create random arrays once and reuse via copy()
        irange = self.param_init_range
        init_Wxh = self.rng.uniform(-irange, irange, (self.n_vis, self.n_hid))
        init_Whh = self.rng.uniform(-irange, irange, (self.n_hid, self.n_hid))

        # input-to-hidden (rows, cols) = (n_visible, n_hidden)
        self.Wxh = theano.shared(value=init_Wxh,
                                 name=self.layer_name + '_Wxh',
                                 borrow=True)
        self.bxh = theano.shared(value=np.zeros(self.n_hid),
                                 name='bxh',
                                 borrow=True)
        # hidden-to-hidden (rows, cols) = (n_hidden, n_hidden) for both encoding and decoding ('tied weights')
        self.Whh = theano.shared(value=init_Whh,
                                 name=self.layer_name + '_Whh',
                                 borrow=True)

        # lstm parameters
        # Output gate switch
        self.O_b = sharedX(np.zeros(
            (self.n_hid, )) + self.output_gate_init_bias,
                           name=(self.layer_name + '_O_b'))
        self.O_x = sharedX(init_Wxh, name=(self.layer_name + '_O_x'))
        self.O_h = sharedX(init_Whh, name=(self.layer_name + '_O_h'))
        self.O_c = sharedX(init_Whh.copy(), name=(self.layer_name + '_O_c'))
        # Input gate switch
        self.I_b = sharedX(np.zeros(
            (self.n_hid, )) + self.input_gate_init_bias,
                           name=(self.layer_name + '_I_b'))
        self.I_x = sharedX(init_Wxh.copy(), name=(self.layer_name + '_I_x'))
        self.I_h = sharedX(init_Whh.copy(), name=(self.layer_name + '_I_h'))
        self.I_c = sharedX(init_Whh.copy(), name=(self.layer_name + '_I_c'))
        # Forget gate switch
        self.F_b = sharedX(np.zeros(
            (self.n_hid, )) + self.forget_gate_init_bias,
                           name=(self.layer_name + '_F_b'))
        self.F_x = sharedX(init_Wxh.copy(), name=(self.layer_name + '_F_x'))
        self.F_h = sharedX(init_Whh.copy(), name=(self.layer_name + '_F_h'))
        self.F_c = sharedX(init_Whh.copy(), name=(self.layer_name + '_F_c'))

        self.params = [
            self.Wxh, self.bxh, self.Whh, self.O_b, self.O_x, self.O_h,
            self.O_c, self.I_b, self.I_x, self.I_h, self.I_c, self.F_b,
            self.F_x, self.F_h, self.F_c
        ]
Esempio n. 51
0
 def __init__(self):
     self._params = [sharedX(np.zeros(shape)) for shape in shapes]
     self.input_space = VectorSpace(1)
Esempio n. 52
0
def test_batch_gradient_descent():
    """ Verify that batch gradient descent works by checking that
        it minimizes a quadratic function f(x) = x^T A x + b^T x + c
        correctly for several sampled values of A, b, and c.
        The ground truth minimizer is x = np.linalg.solve(A,-b)"""
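    # For symmetric A the gradient of the objective below is A x + b, so the
    # minimizer solves A x = -b, which is what np.linalg.solve(A, -b) returns.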

    n = 3

    A = T.matrix(name='A')
    b = T.vector(name='b')
    c = T.scalar(name='c')

    x = sharedX(np.zeros((n, )), name='x')

    half = np.cast[config.floatX](0.5)

    obj = half * T.dot(T.dot(x, A), x) + T.dot(b, x) + c

    minimizer = BatchGradientDescent(objective=obj,
                                     params=[x],
                                     inputs=[A, b, c])

    num_samples = 3

    rng = np.random.RandomState([1, 2, 3])

    for i in xrange(num_samples):
        A = np.cast[config.floatX](rng.randn(int(1.5 * n), n))
        A = np.cast[config.floatX](np.dot(A.T, A))
        A += np.cast[config.floatX](np.identity(n) * .02)
        b = np.cast[config.floatX](rng.randn(n))
        c = np.cast[config.floatX](rng.randn())
        x.set_value(np.cast[config.floatX](rng.randn(n)))

        analytical_x = np.linalg.solve(A, -b)

        actual_obj = minimizer.minimize(A, b, c)
        actual_x = x.get_value()

        #Check that the value returned by the minimize method
        #is the objective function value at the parameters
        #chosen by the minimize method
        cur_obj = minimizer.obj(A, b, c)
        assert np.allclose(actual_obj, cur_obj)

        x.set_value(analytical_x)
        analytical_obj = minimizer.obj(A, b, c)

        #make sure the objective function is accurate to first 4 digits
        condition1 = not np.allclose(analytical_obj, actual_obj)
        condition2 = np.abs(analytical_obj -
                            actual_obj) >= 1e-4 * np.abs(analytical_obj)

        if (config.floatX == 'float64' and condition1) \
                or (config.floatX == 'float32' and condition2):
            print 'objective function value came out wrong on sample ', i
            print 'analytical obj', analytical_obj
            print 'actual obj', actual_obj
            """
                The following section of code was used to verify that numerical
                error can make the objective function look non-convex

                print 'Checking for numerically induced non-convex behavior'
                def f(x):
                    return 0.5 * np.dot(x,np.dot(A,x)) + np.dot(b,x) + c

                x.set_value(actual_x)
                minimizer._compute_grad(A,b,c)
                minimizer._normalize_grad()
                d = minimizer.param_to_grad_shared[x].get_value()

                x = actual_x.copy()
                prev = f(x)
                print prev
                step_size = 1e-4
                x += step_size * d
                cur = f(x)
                print cur
                cur_sgn = np.sign(cur-prev)
                flip_cnt = 0
                for i in xrange(10000):
                    x += step_size * d
                    prev = cur
                    cur = f(x)
                    print cur
                    prev_sgn = cur_sgn
                    cur_sgn = np.sign(cur-prev)
                    if cur_sgn != prev_sgn:
                        print 'flip'
                        flip_cnt += 1
                        if flip_cnt > 1:
                            print "Non-convex!"

                            from matplotlib import pyplot as plt
                            y = []

                            x = actual_x.copy()
                            for j in xrange(10000):
                                y.append(f(x))
                                x += step_size * d

                            plt.plot(y)
                            plt.show()

                            assert False

                print 'None found'
                """

            #print 'actual x',actual_x
            #print 'A:'
            #print A
            #print 'b:'
            #print b
            #print 'c:'
            #print c
            x.set_value(actual_x)
            minimizer._compute_grad(A, b, c)
            x_grad = minimizer.param_to_grad_shared[x]
            actual_grad = x_grad.get_value()
            correct_grad = 0.5 * np.dot(A, x.get_value()) + 0.5 * np.dot(
                A.T, x.get_value()) + b
            if not np.allclose(actual_grad, correct_grad):
                print 'gradient was wrong at convergence point'
                print 'actual grad: '
                print actual_grad
                print 'correct grad: '
                print correct_grad
                print 'max difference: ', np.abs(actual_grad -
                                                 correct_grad).max()
                assert False

            minimizer._normalize_grad()
            d = minimizer.param_to_grad_shared[x].get_value()
            step_len = ( np.dot(b,d) + 0.5 * np.dot(d,np.dot(A,actual_x)) \
                    + 0.5 * np.dot(actual_x,np.dot(A,d)) ) / np.dot(d, np.dot(A,d))

            g = np.dot(A, actual_x) + b
            deriv = np.dot(g, d)

            print 'directional deriv at actual', deriv
            print 'optimal step_len', step_len
            optimal_x = actual_x - d * step_len
            g = np.dot(A, optimal_x) + b
            deriv = np.dot(g, d)

            print 'directional deriv at optimal: ', deriv
            x.set_value(optimal_x)
            print 'obj at optimal: ', minimizer.obj(A, b, c)

            print 'eigenvalue range:'
            val, vec = np.linalg.eig(A)
            print(val.min(), val.max())
            print 'condition number: ', (val.max() / val.min())
            assert False
Esempio n. 53
0
 def shared_dataset(data_x):
     """Function that loads the dataset into shared variables"""
     if conf.get('normalize', True):
         return sharedX(data_x, borrow=True)
     else:
         return theano.shared(theano._asarray(data_x), borrow=True)
Esempio n. 54
0
                T.dot(X, self.W1) + T.dot(y, self.W2.T) + self.b1)
            y = T.nnet.softmax(T.dot(H, self.W2) + self.b2)
        return y

    def mfny_arg(self, X):
        H = T.nnet.sigmoid(T.dot(X, 2 * self.W1) + self.b1)
        y = T.nnet.softmax(T.dot(H, self.W2) + self.b2)

        for i in xrange(mf_iter - 1):
            H = T.nnet.sigmoid(
                T.dot(X, self.W1) + T.dot(y, self.W2.T) + self.b1)
            y = T.nnet.softmax(T.dot(H, self.W2) + self.b2)
        return T.dot(H, self.W2) + self.b2


X = sharedX(dataset.X)
y = sharedX(dataset.y)

idx = T.iscalar()
idx.tag.test_value = 0

Xb = X[idx * batch_size:(idx + 1) * batch_size, :]
yb = y[idx * batch_size:(idx + 1) * batch_size, :]

mf1mod = cRBM(W1, b1, W2, b2)
mfnmod = cRBM(W1, b1, W2, b2)

ymf1_arg = mf1mod.mf1y_arg(Xb)
ymfn_arg = mfnmod.mfny_arg(Xb)

Esempio n. 55
0
idxs = np.arange(num_beta)
pos = idxs / float(num_beta-1)
scaled_shifted = pos * (max_exp-min_exp) + min_exp
betas = 10 ** scaled_shifted
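# A worked instance of the spacing above (illustrative values): with
# num_beta = 5, min_exp = -1 and max_exp = 1, pos = [0, .25, .5, .75, 1] and
# betas ~ [0.1, 0.32, 1, 3.2, 10], i.e. log-uniformly spaced between
# 10**min_exp and 10**max_exp.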


kls = np.zeros((trials,num_beta))
ml_kls = np.zeros((trials,))

for trial in xrange(trials):
#generate the data
    data_distribution = MND( sigma = np.identity(dim) / true_beta,
                            mu = np.zeros((dim,)), seed = 17 * (trial+1) )
    true = DiagonalMND( nvis = dim, init_beta = true_beta, init_mu = 0.,
            min_beta = .1, max_beta = 10.)
    X = sharedX(function([],data_distribution.random_design_matrix(m))())

    Xv = X.get_value()
    mu = Xv.mean(axis=0)
    print 'maximum likelihood mu: ',mu
    diff = Xv - mu
    var = np.square(diff).mean(axis=0)
    mlbeta = 1./var
    print 'maximum likelihood beta: ',mlbeta
    ml_model = DiagonalMND( nvis = dim, init_mu = mu, init_beta = mlbeta,
            min_beta = 0.0,
            max_beta = 1e6)
    ml_kl = kl_divergence( true, ml_model)
    ml_kl = function([],ml_kl)()
    assert ml_kl >= 0.0
    ml_kls[trial] = ml_kl
Esempio n. 56
0
 def __init__(self, W1, b1, W2, b2):
     self.W1 = sharedX(W1)
     self.W2 = sharedX(W2)
     self.b1 = sharedX(b1)
     self.b2 = sharedX(b2)
Esempio n. 57
0
 def _initialize_visbias(self, nvis):
     self.visbias = sharedX(numpy.zeros(nvis), name='vb', borrow=True)
Esempio n. 58
0
    def train_all(self, dataset, mu=None):
        """
        Run the k-means algorithm on the input to find cluster centers.

        Parameters
        ----------
        dataset : WRITEME
        mu : WRITEME

        Returns
        -------
        rval : bool
            WRITEME
        """

        #TODO-- why does this sometimes return X and sometimes return nothing?

        X = dataset.get_design_matrix()

        n, m = X.shape
        k = self.k

        if milk is not None:
            #use the milk implementation of k-means if it's available
            cluster_ids, mu = milk.kmeans(X, k)
        else:
            #our own implementation

            # taking random inputs as initial clusters if user does not provide
            # them.
            if mu is not None:
                if not len(mu) == k:
                    raise Exception(
                        'You gave %i clusters, but k=%i were expected' %
                        (len(mu), k))
            else:
                indices = numpy.random.randint(X.shape[0], size=k)
                mu = X[indices]

            try:
                dists = numpy.zeros((n, k))
            except MemoryError:
                raise TypicalMemoryError("dying trying to allocate dists "
                                         "matrix for {0} examples and {1} "
                                         "means".format(n, k))

            old_kills = {}

            iter = 0
            mmd = prev_mmd = float('inf')
            while True:
                if self.verbose:
                    logger.info('kmeans iter {0}'.format(iter))

                #print 'iter:',iter,' conv crit:',abs(mmd-prev_mmd)
                #if numpy.sum(numpy.isnan(mu)) > 0:
                if numpy.any(numpy.isnan(mu)):
                    logger.info('nan found')
                    return X

                #computing distances
                for i in xrange(k):
                    dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)

                if iter > 0:
                    prev_mmd = mmd

                min_dists = dists.min(axis=1)

                #mean minimum distance:
                mmd = min_dists.mean()

                logger.info('cost: {0}'.format(mmd))

                if iter > 0 and (iter >= self.max_iter or \
                                        abs(mmd - prev_mmd) < self.convergence_th):
                    #converged
                    break

                #finding minimum distances
                min_dist_inds = dists.argmin(axis=1)

                #computing means
                i = 0
                blacklist = []
                new_kills = {}
                while i < k:
                    b = min_dist_inds == i
                    if not numpy.any(b):
                        killed_on_prev_iter = True
                        #initializes empty cluster to be the mean of the d data
                        #points farthest from their corresponding means
                        if i in old_kills:
                            d = old_kills[i] - 1
                            if d == 0:
                                d = 50
                            new_kills[i] = d
                        else:
                            d = 5
                        mu[i, :] = 0
                        for j in xrange(d):
                            idx = numpy.argmax(min_dists)
                            min_dists[idx] = 0
                            #chose point idx
                            mu[i, :] += X[idx, :]
                            blacklist.append(idx)
                        mu[i, :] /= float(d)
                        #cluster i was empty, reset it to d far out data points
                        #recomputing distances for this cluster
                        dists[:, i] = numpy.square((X - mu[i, :])).sum(axis=1)
                        min_dists = dists.min(axis=1)
                        for idx in blacklist:
                            min_dists[idx] = 0
                        min_dist_inds = dists.argmin(axis=1)
                        #done
                        i += 1
                    else:
                        mu[i, :] = numpy.mean(X[b, :], axis=0)
                        if numpy.any(numpy.isnan(mu)):
                            logger.info('nan found at {0}'.format(i))
                            return X
                        i += 1

                old_kills = new_kills

                iter += 1

        self.mu = sharedX(mu)
        self._params = [self.mu]
        return True
Esempio n. 59
0
    def __init__(self,
                 learning_rate,
                 cost=None,
                 batch_size=None,
                 monitoring_batches=None,
                 monitoring_dataset=None,
                 monitor_iteration_mode='sequential',
                 termination_criterion=None,
                 update_callbacks=None,
                 init_momentum=None,
                 set_batch_size=False,
                 train_iteration_mode=None,
                 batches_per_iter=None,
                 theano_function_mode=None,
                 monitoring_costs=None,
                 seed=[2012, 10, 5]):
        """
            WRITEME

            learning_rate: The learning rate to use.
                            Train object callbacks can change the learning
                            rate after each epoch. SGD update_callbacks
                            can change it after each minibatch.
            cost: a pylearn2.costs.cost.Cost object specifying the objective
                  function to be minimized.
                  Optionally, may be None. In this case, SGD will call the model's
                  get_default_cost method to obtain the objective function.
            init_momentum: if None, does not use momentum
                            otherwise, use momentum and initialize the
                            momentum coefficient to init_momentum.
                            Callbacks can change this over time just like
                            the learning rate.

                            If the gradient is the same on every step, then
                            the update taken by the SGD algorithm is scaled
                            by a factor of 1/(1-momentum).

                            See section 9 of Geoffrey Hinton's "A Practical
                            Guide to Training Restricted Boltzmann Machines"
                            for details.
            set_batch_size: if True, and batch_size conflicts with
                            model.force_batch_size, will call
                            model.set_batch_size(batch_size) in an attempt
                            to change model.force_batch_size
            theano_function_mode: The theano mode to compile the updates function with.
                            Note that pylearn2 includes some wraplinker modes that are
                            not bundled with theano. See pylearn2.devtools. These
                            extra modes let you do things like check for NaNs at every
                            step, or record md5 digests of all computations performed
                            by the update function to help isolate problems with nondeterminism.

            Parameters are updated by the formula:

            inc := momentum * inc - learning_rate * d cost / d param
            param := param + inc
        """

        if isinstance(cost, (list, tuple, set)):
            raise TypeError(
                "SGD no longer supports using collections of Costs to represent "
                " a sum of Costs. Use pylearn2.costs.cost.SumOfCosts instead.")

        self.learning_rate = sharedX(learning_rate, 'learning_rate')
        self.cost = cost
        self.batch_size = batch_size
        self.set_batch_size = set_batch_size
        self.batches_per_iter = batches_per_iter
        self._set_monitoring_dataset(monitoring_dataset)
        self.monitoring_batches = monitoring_batches
        self.monitor_iteration_mode = monitor_iteration_mode
        if monitoring_dataset is None:
            if monitoring_batches is not None:
                raise ValueError(
                    "Specified an amount of monitoring batches but not a monitoring dataset."
                )
        self.termination_criterion = termination_criterion
        self.init_momentum = init_momentum
        if init_momentum is None:
            self.momentum = None
        else:
            assert init_momentum >= 0.
            assert init_momentum < 1.
            self.momentum = sharedX(init_momentum, 'momentum')
        self._register_update_callbacks(update_callbacks)
        if train_iteration_mode is None:
            train_iteration_mode = 'shuffled_sequential'
        self.train_iteration_mode = train_iteration_mode
        self.first = True
        self.rng = np.random.RandomState(seed)
        self.theano_function_mode = theano_function_mode
        self.monitoring_costs = monitoring_costs
Esempio n. 60
0
    def setup(self, model, dataset):

        if self.cost is None:
            self.cost = model.get_default_cost()

        inf_params = [
            param for param in model.get_params()
            if np.any(np.isinf(param.get_value()))
        ]
        if len(inf_params) > 0:
            raise ValueError("These params are Inf: " + str(inf_params))
        if any([
                np.any(np.isnan(param.get_value()))
                for param in model.get_params()
        ]):
            nan_params = [
                param for param in model.get_params()
                if np.any(np.isnan(param.get_value()))
            ]
            raise ValueError("These params are NaN: " + str(nan_params))
        self.model = model

        batch_size = self.batch_size
        if hasattr(model, "force_batch_size"):
            if model.force_batch_size > 0:
                if batch_size is not None:
                    if batch_size != model.force_batch_size:
                        if self.set_batch_size:
                            model.set_batch_size(batch_size)
                        else:
                            raise ValueError(
                                "batch_size argument to SGD conflicts with model's force_batch_size attribute"
                            )
                else:
                    self.batch_size = model.force_batch_size
        model._test_batch_size = self.batch_size
        self.monitor = Monitor.get_monitor(model)
        self.monitor._sanity_check()

        X = model.get_input_space().make_theano_batch(name="%s[X]" %
                                                      self.__class__.__name__)
        self.topo = not X.ndim == 2

        if config.compute_test_value == 'raise':
            if self.topo:
                X.tag.test_value = dataset.get_batch_topo(self.batch_size)
            else:
                X.tag.test_value = dataset.get_batch_design(self.batch_size)

        Y = T.matrix(name="%s[Y]" % self.__class__.__name__)

        fixed_var_descr = self.cost.get_fixed_var_descr(model, X, Y)
        self.on_load_batch = fixed_var_descr.on_load_batch

        if self.cost.supervised:
            if config.compute_test_value == 'raise':
                _, Y.tag.test_value = dataset.get_batch_design(
                    self.batch_size, True)

            self.supervised = True
            cost_value = self.cost(model, X, Y, **fixed_var_descr.fixed_vars)

        else:
            self.supervised = False
            cost_value = self.cost(model, X, **fixed_var_descr.fixed_vars)
        if cost_value is not None and cost_value.name is None:
            if self.supervised:
                cost_value.name = 'objective(' + X.name + ', ' + Y.name + ')'
            else:
                cost_value.name = 'objective(' + X.name + ')'

        # Set up monitor to model the objective value, learning rate,
        # momentum (if applicable), and extra channels defined by
        # the cost
        learning_rate = self.learning_rate
        if self.monitoring_dataset is not None:
            self.monitor.setup(dataset=self.monitoring_dataset,
                               cost=self.cost,
                               batch_size=self.batch_size,
                               num_batches=self.monitoring_batches,
                               extra_costs=self.monitoring_costs,
                               mode=self.monitor_iteration_mode)
            if self.supervised:
                ipt = (X, Y)
            else:
                ipt = X
            dataset_name = self.monitoring_dataset.keys()[0]
            monitoring_dataset = self.monitoring_dataset[dataset_name]
            #TODO: have Monitor support non-data-dependent channels
            self.monitor.add_channel(name='learning_rate',
                                     ipt=ipt,
                                     val=learning_rate,
                                     dataset=monitoring_dataset)
            if self.momentum:
                self.monitor.add_channel(name='momentum',
                                         ipt=ipt,
                                         val=self.momentum,
                                         dataset=monitoring_dataset)

        params = list(model.get_params())
        assert len(params) > 0
        for i, param in enumerate(params):
            if param.name is None:
                param.name = 'sgd_params[%d]' % i

        if self.cost.supervised:
            grads, updates = self.cost.get_gradients(
                model, X, Y, **fixed_var_descr.fixed_vars)
        else:
            grads, updates = self.cost.get_gradients(
                model, X, **fixed_var_descr.fixed_vars)

        for param in grads:
            assert param in params
        for param in params:
            assert param in grads

        for param in grads:
            if grads[param].name is None and cost_value is not None:
                grads[param].name = ('grad(%(costname)s, %(paramname)s)' % {
                    'costname': cost_value.name,
                    'paramname': param.name
                })

        lr_scalers = model.get_lr_scalers()

        for key in lr_scalers:
            if key not in params:
                raise ValueError("Tried to scale the learning rate on " +\
                        str(key)+" which is not an optimization parameter.")

        log.info('Parameter and initial learning rate summary:')
        for param in params:
            param_name = param.name
            if param_name is None:
                param_name = 'anon_param'
            lr = learning_rate.get_value() * lr_scalers.get(param, 1.)
            log.info('\t' + param_name + ': ' + str(lr))

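        # Build the parameter updates described in the constructor docstring:
        # plain SGD (param := param - lr * scaled grad) when momentum is off,
        # otherwise inc := momentum * inc - lr * scaled grad followed by
        # param := param + inc.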
        if self.momentum is None:
            updates.update( dict(safe_zip(params, [param - learning_rate * \
                lr_scalers.get(param, 1.) * grads[param]
                                    for param in params])))
        else:
            for param in params:
                inc = sharedX(param.get_value() * 0.)
                if param.name is not None:
                    inc.name = 'inc_' + param.name
                updated_inc = self.momentum * inc - learning_rate * lr_scalers.get(
                    param, 1.) * grads[param]
                updates[inc] = updated_inc
                updates[param] = param + updated_inc

        for param in params:
            if updates[param].name is None:
                updates[param].name = 'sgd_update(' + param.name + ')'
        model.censor_updates(updates)
        for param in params:
            update = updates[param]
            if update.name is None:
                update.name = 'censor(sgd_update(' + param.name + '))'
            for update_val in get_debug_values(update):
                if np.any(np.isinf(update_val)):
                    raise ValueError("debug value of %s contains infs" %
                                     update.name)
                if np.any(np.isnan(update_val)):
                    raise ValueError("debug value of %s contains nans" %
                                     update.name)

        with log_timing(log, 'Compiling sgd_update'):
            if self.supervised:
                fn_inputs = [X, Y]
            else:
                fn_inputs = [X]

            self.sgd_update = function(fn_inputs,
                                       updates=updates,
                                       name='sgd_update',
                                       on_unused_input='ignore',
                                       mode=self.theano_function_mode)
        self.params = params