Example #1
    def set_weights(self, wts=None, bs=None, init_method=None, scale_factor=None, seed=None):
        ''' Initializes the weights and biases of the neural network 

        Parameters:
        -----------
        param: wts - weights
        type: np.ndarray, optional

        param: bs - biases
        type: np.ndarray, optional

        param: init_method - calls some pre-specified weight initialization routines
        type: string, optional

        param: scale_factor - additional hyperparameter for weight initialization
        type: float, optional

        param: seed - seeds the random number generator
        type: int, optional
        '''
        # with tied weights, the encoding and decoding matrices are simply transposes
        # of one another

        if self.tied_wts:

            if seed is not None:
                np.random.seed(seed=seed)

            # weights and biases
            if wts is None and bs is None:
                wts = [None]
                bs = [None, None]

                if init_method == 'gauss':
                    # the single (tied) weight matrix maps num_nodes[0] -> num_nodes[1]
                    wts[0] = scale_factor * \
                        np.random.randn(self.num_nodes[0], self.num_nodes[1])
                    bs[0] = np.zeros(self.num_nodes[1])
                    bs[1] = np.zeros(self.num_nodes[0])

                elif init_method == 'fan-io':
                    v = np.sqrt(
                        1. * scale_factor / (self.num_nodes[0] + self.num_nodes[1] + 1))
                    # uniform entries on [-v, v]
                    wts[0] = 2.0 * v * \
                        np.random.rand(self.num_nodes[0], self.num_nodes[1]) - v
                    bs[0] = np.zeros(self.num_nodes[1])
                    bs[1] = np.zeros(self.num_nodes[0])
            else:
                assert isinstance(wts, list)
                assert isinstance(bs, list)

            self.wts_ = [
                theano.shared(nu.floatX(wt), borrow=True) for wt in wts]
            self.bs_ = [theano.shared(nu.floatX(b), borrow=True) for b in bs]

        # if encoding and decoding matrices are distinct, just default back to the
        # normal case
        else:
            super(Autoencoder, self).set_weights(
                init_method=init_method, scale_factor=scale_factor, seed=seed)
Example #2
    def compile_multilayer_functions(self, wts=None, bs=None):
        ''' compiles prediction and scoring functions for testing 

        Parameters:
        -----------
        param: wts - weights
        type: numpy ndarray matrix

        param: bs - biases
        type: numpy ndarray matrix
        '''

        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_
        else:
            wts = [nu.floatX(w) for w in wts]
            bs = [nu.floatX(b) for b in bs]

        X = T.matrix()
        y = T.matrix()

        # output probabilities
        y_prob = self.fprop(X, wts, bs)
        self.predict_prob = theano.function(
            inputs=[X],
            outputs=[y_prob],
            mode='FAST_RUN',
            allow_input_downcast=True)

        # predicted class labels
        y_pred = T.argmax(y_prob, axis=1)
        self.predict_label = theano.function(
            inputs=[X],
            outputs=[y_pred],
            mode='FAST_RUN',
            allow_input_downcast=True)

        # evaluation loss
        eval_loss = self.compute_eval_loss(X, y, wts, bs)
        self.compute_test_loss = theano.function(
            inputs=[X, y],
            outputs=eval_loss,
            mode='FAST_RUN',
            allow_input_downcast=True)

        # scoring (classification accuracy)
        self.score = theano.function(
            inputs=[X, y],
            outputs=1.0 - T.mean(T.neq(y_pred, T.argmax(y, axis=1))),
            mode='FAST_RUN',
            allow_input_downcast=True)
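A brief usage sketch for the functions compiled above; `net` is a hypothetical instance of the class this method belongs to, and the array shapes are illustrative. Because the functions are compiled with allow_input_downcast=True, they can be called directly on plain numpy arrays.

import numpy as np

X_te = np.random.rand(100, 784)                   # dummy test inputs
y_te = np.eye(10)[np.random.randint(0, 10, 100)]  # dummy one-hot labels

net.compile_multilayer_functions()                # compiles from net.wts_, net.bs_
probs = net.predict_prob(X_te)[0]                 # (100, 10) class probabilities
labels = net.predict_label(X_te)[0]               # (100,) predicted class indices
accuracy = net.score(X_te, y_te)                  # fraction of correct predictions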
Example #3
    def set_weights(self,
                    wts=None,
                    bs=None,
                    init_method=None,
                    scale_factor=None,
                    seed=None):
        ''' Initializes the weights and biases of the neural network
        Parameters:
        -----------
        param: wts - weights
        type: np.ndarray, optional
        param: bs - biases
        type: np.ndarray, optional
        param: init_method - calls some pre-specified weight initialization routines
        type: string
        param: scale_factor - additional hyperparameter for weight initialization
        type: float, optional
        param: seed - seeds the random number generator
        type: int, optional
        '''
        if seed is not None:
            np.random.seed(seed=seed)
            self.srng.seed(seed)

        if wts is None and bs is None:
            wts = (len(self.num_nodes) - 1) * [None]
            bs = (len(self.num_nodes) - 1) * [None]

            if init_method == 'gauss':
                for i, (n1, n2) in enumerate(
                        zip(self.num_nodes[:-1], self.num_nodes[1:])):
                    wts[i] = scale_factor * 1. / \
                        np.sqrt(n2) * np.random.randn(n1, n2)
                    bs[i] = np.zeros(n2)

            elif init_method == 'fan-io':
                for i, (n1, n2) in enumerate(
                        zip(self.num_nodes[:-1], self.num_nodes[1:])):
                    v = scale_factor * np.sqrt(6. / (n1 + n2 + 1))
                    wts[i] = 2.0 * v * np.random.rand(n1, n2) - v
                    bs[i] = np.zeros(n2)
            else:
                sys.exit(ne.weight_error())

        else:
            # this scenario occurs most when doing unsupervised pre-training to initialize
            # the weights
            assert isinstance(wts, list)
            assert isinstance(bs, list)

        self.wts_ = [theano.shared(nu.floatX(wt), borrow=True) for wt in wts]
        self.bs_ = [theano.shared(nu.floatX(b), borrow=True) for b in bs]
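A plain-numpy sketch of the two initialization schemes implemented above, written out for a single layer; the layer sizes and scale factor are illustrative, not taken from the original.

import numpy as np

n1, n2 = 784, 256          # illustrative fan-in / fan-out
scale_factor = 1.0
rng = np.random.RandomState(0)

# 'gauss': zero-mean Gaussian entries scaled by scale_factor / sqrt(fan-out)
W_gauss = scale_factor * 1. / np.sqrt(n2) * rng.randn(n1, n2)

# 'fan-io': uniform entries on [-v, v], with v set from fan-in + fan-out
v = scale_factor * np.sqrt(6. / (n1 + n2 + 1))
W_fanio = 2.0 * v * rng.rand(n1, n2) - v

# biases start at zero under both schemes
b = np.zeros(n2)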
Example #4
    def set_weights(self, wts=None, bs=None, init_method=None, scale_factor=None, seed=None):
        ''' Initializes the weights and biases of the neural network
        Parameters:
        -----------
        param: wts - weights
        type: np.ndarray, optional
        param: bs - biases
        type: np.ndarray, optional
        param: init_method - calls some pre-specified weight initialization routines
        type: string, optional
        param: scale_factor - additional hyperparameter for weight initialization
        type: float, optional
        param: seed - seeds the random number generator
        type: int, optional
        '''
        # with tied weights, the encoding and decoding matrices are simply transposes
        # of one another

        if self.tied_wts:

            if seed is not None:
                np.random.seed(seed=seed)

            # weights and biases
            if wts is None and bs is None:
                wts = [None]
                bs = [None, None]

                if init_method == 'gauss':
                    # the single (tied) weight matrix maps num_nodes[0] -> num_nodes[1]
                    wts[0] = scale_factor * \
                        np.random.randn(self.num_nodes[0], self.num_nodes[1])
                    bs[0] = np.zeros(self.num_nodes[1])
                    bs[1] = np.zeros(self.num_nodes[0])

                elif init_method == 'fan-io':
                    v = np.sqrt(
                        1. * scale_factor / (self.num_nodes[0] + self.num_nodes[1] + 1))
                    # uniform entries on [-v, v]
                    wts[0] = 2.0 * v * \
                        np.random.rand(self.num_nodes[0], self.num_nodes[1]) - v
                    bs[0] = np.zeros(self.num_nodes[1])
                    bs[1] = np.zeros(self.num_nodes[0])
            else:
                assert isinstance(wts, list)
                assert isinstance(bs, list)

            self.wts_ = [
                theano.shared(nu.floatX(wt), borrow=True) for wt in wts]
            self.bs_ = [theano.shared(nu.floatX(b), borrow=True) for b in bs]

        # if encoding and decoding matrices are distinct, just default back to the
        # normal case
        else:
            super(Autoencoder, self).set_weights(
                init_method=init_method, scale_factor=scale_factor, seed=seed)
Example #5
    def set_weights(self, wts=None, bs=None, init_method=None, scale_factor=None, seed=None):
        ''' Initializes the weights and biases of the neural network

        Parameters:
        -----------
        param: wts - weights
        type: np.ndarray, optional

        param: bs - biases
        type: np.ndarray, optional

        param: init_method - calls some pre-specified weight initialization routines
        type: string

        param: scale_factor - additional hyperparameter for weight initialization
        type: float, optional

        param: seed - seeds the random number generator
        type: int, optional
        '''
        if seed is not None:
            np.random.seed(seed=seed)
            self.srng.seed(seed)

        if wts is None and bs is None:
            wts = (len(self.num_nodes) - 1) * [None]
            bs = (len(self.num_nodes) - 1) * [None]

            if init_method == 'gauss':
                for i, (n1, n2) in enumerate(zip(self.num_nodes[:-1], self.num_nodes[1:])):
                    wts[i] = scale_factor * 1. / \
                        np.sqrt(n2) * np.random.randn(n1, n2)
                    bs[i] = np.zeros(n2)

            elif init_method == 'fan-io':
                for i, (n1, n2) in enumerate(zip(self.num_nodes[:-1], self.num_nodes[1:])):
                    v = scale_factor * np.sqrt(6. / (n1 + n2 + 1))
                    wts[i] = 2.0 * v * np.random.rand(n1, n2) - v
                    bs[i] = np.zeros(n2)
            else:
                sys.exit(ne.weight_error())

        else:
            # this scenario occurs most when doing unsupervised pre-training to initialize
            # the weights
            assert isinstance(wts, list)
            assert isinstance(bs, list)

        self.wts_ = [theano.shared(nu.floatX(wt), borrow=True) for wt in wts]
        self.bs_ = [theano.shared(nu.floatX(b), borrow=True) for b in bs]
Example #6
def momentum(params, grad_params, learn_rate=1., alpha=0.9):
    ''' standard SGD with momentum
    Parameters:
    ----------
    param: params - model parameters
    type: list of theano shared variables
    param: grad_params - derivative of the loss with respect to the model parameters
    type: list of theano variables
    param: learn_rate - learning rate for the momentum update
    type: float
    param: alpha - viscosity term for momentum
    type: float
    '''
    updates = []
    for param, grad_param in zip(params, grad_params):

        # velocity term with viscosity
        velocity = theano.shared(nu.floatX(np.zeros(param.get_value().shape)))
        velocity_ = alpha * velocity - learn_rate * grad_param

        # parameter update
        param_ = param + velocity_

        # collected updates of both the parameter and velocity
        updates.append((velocity, velocity_))
        updates.append((param, param_))

    return updates
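A hedged sketch of how the returned update list is typically wired into a compiled training step. The softmax-regression model, variable names, and hyperparameters below are illustrative; the sketch assumes `momentum` (and the `nu.floatX` helper it relies on) are importable from the module shown above.

import numpy as np
import theano
import theano.tensor as T

X = T.matrix('X')
y = T.matrix('y')

# toy softmax-regression parameters (784 inputs, 10 classes)
W = theano.shared(np.zeros((784, 10), dtype=theano.config.floatX))
b = theano.shared(np.zeros(10, dtype=theano.config.floatX))

y_prob = T.nnet.softmax(T.dot(X, W) + b)
loss = T.mean(T.nnet.categorical_crossentropy(y_prob, y))

params = [W, b]
grad_params = [T.grad(loss, p) for p in params]

# one gradient step per call; the velocity terms are updated alongside the parameters
train_step = theano.function(inputs=[X, y],
                             outputs=loss,
                             updates=momentum(params, grad_params,
                                              learn_rate=0.1, alpha=0.9),
                             allow_input_downcast=True)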
Example #7
    def compile_multilayer_functions(self, wts=None, bs=None):
        ''' compiles prediction and scoring functions for testing
        Parameters:
        -----------
        param: wts - weights
        type: numpy ndarray matrix
        param: bs - biases
        type: numpy ndarray matrix
        '''

        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_
        else:
            wts = [nu.floatX(w) for w in wts]
            bs = [nu.floatX(b) for b in bs]

        X = T.matrix()
        y = T.matrix()

        # output probabilities
        y_prob = self.fprop(X, wts, bs)
        self.predict_prob = theano.function(inputs=[X],
                                            outputs=[y_prob],
                                            mode='FAST_RUN',
                                            allow_input_downcast=True)

        # predicted class labels
        y_pred = T.argmax(y_prob, axis=1)
        self.predict_label = theano.function(inputs=[X],
                                             outputs=[y_pred],
                                             mode='FAST_RUN',
                                             allow_input_downcast=True)

        # evaluation loss
        eval_loss = self.compute_eval_loss(X, y, wts, bs)
        self.compute_test_loss = theano.function(inputs=[X, y],
                                                 outputs=eval_loss,
                                                 mode='FAST_RUN',
                                                 allow_input_downcast=True)

        # scoring (classification accuracy)
        self.score = theano.function(
            inputs=[X, y],
            outputs=1.0 - T.mean(T.neq(y_pred, T.argmax(y, axis=1))),
            mode='FAST_RUN',
            allow_input_downcast=True)
Example #8
def rmsprop(params,
            grad_params,
            learn_rate=None,
            rho=None,
            eps=1e-6,
            max_norm=False,
            c=3):
    ''' Geoff Hinton's 'RMSprop' algorithm - RPROP for mini-batches
    Parameters:
    ----------
    param: params - model parameters
    type: list of theano shared variables
    param: grad_params - derivative of the loss with respect to the model parameters
    type: list of theano variables
    param: learn_rate - learning rate for rmsprop
    type: float
    param: rho - decay rate for the moving average of squared gradients
    type: float
    param: eps - fudge factor [need ref here]
    type: float
    param: max_norm - flag to indicate that weights will be L2-norm constrained
    type: boolean
    param: c - L2-norm constraint for max_norm regularization
    type: float
    '''
    updates = []

    for param, grad_param in zip(params, grad_params):

        # accumulated gradient
        acc_grad_param = theano.shared(
            nu.floatX(np.zeros(param.get_value().shape)))  # initial value
        acc_grad_param_ = rho * acc_grad_param + (1 - rho) * grad_param**2

        # parameter update
        param_ = param - learn_rate * grad_param / \
            T.sqrt(acc_grad_param_ + eps)

        # there's probably a better way to check if this is a weight matrix...
        if max_norm and param.get_value().ndim == 2:
            param_ = maxnorm(param_, c)

        # collected updates of both the parameter and the accumulated gradient
        updates.append((acc_grad_param, acc_grad_param_))
        updates.append((param, param_))

    return updates
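The updates above call a maxnorm helper that is not shown in these examples. One plausible implementation (an assumption, not the library's actual code) rescales each column of the symbolic weight matrix whose L2 norm exceeds the constraint c:

import theano.tensor as T

def maxnorm(W, c):
    # column-wise L2 norms of the (symbolic) weight matrix
    col_norms = T.sqrt(T.sum(T.sqr(W), axis=0))
    # cap the norms at c; columns already within the constraint are left unchanged
    desired = T.clip(col_norms, 0, c)
    return W * (desired / (1e-7 + col_norms))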
Example #9
def adagrad(params, grad_params, learn_rate=1., eps=1e-6, max_norm=False, c=5):
    ''' adaptive gradient method - typically works better than vanilla SGD and has some
    nice theoretical guarantees
    Parameters:
    ----------
    param: params - model parameters
    type: list of theano shared variables
    param: grad_params - derivative of the loss with respect to the model parameters
    type: list of theano variables
    param: learn_rate - 'master' learning rate for the adagrad algorithm
    type: float
    param: eps - fudge factor [need ref here]
    type: float
    param: max_norm - flag to indicate that weights will be L2-norm constrained
    type: boolean
    param: c - L2-norm constraint for max_norm regularization
    type: float
    Returns:
    --------
    updates - (parameter, update expression) pairs for both the model parameters
    and the accumulated squared gradients, to be passed to theano.function
    type: list of tuples
    '''

    updates = []
    for param, grad_param in zip(params, grad_params):

        # accumulated gradient
        acc_grad_param = theano.shared(
            nu.floatX(np.zeros(param.get_value().shape)))
        acc_grad_param_ = acc_grad_param + grad_param**2

        # parameter update
        param_ = param - learn_rate * grad_param / \
            T.sqrt(acc_grad_param_ + eps)

        # there's probably a better way to check if this is a weight matrix...
        if max_norm and param.get_value().ndim == 2:
            param_ = maxnorm(param_, c)

        # collected updates of both the parameter and the accumulated gradient
        updates.append((acc_grad_param, acc_grad_param_))
        updates.append((param, param_))

    return updates
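A tiny numeric illustration (not from the original) of the adagrad step size: because squared gradients only accumulate, the effective learning rate for each parameter shrinks over time, shown here for a single scalar with a constant gradient.

import numpy as np

learn_rate, eps = 1.0, 1e-6
acc_grad = 0.0
for grad in [0.5, 0.5, 0.5, 0.5]:
    acc_grad += grad ** 2
    step = learn_rate * grad / np.sqrt(acc_grad + eps)
    print(step)  # ~1.0, 0.707, 0.577, 0.5 -- monotonically shrinking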
Example #10
    def check_gradients(self, X_in, Y_in, wts=None, bs=None):
        ''' this seems like overkill, but I suppose it doesn't hurt to have it in here...'''

        # assume that if it's not provided, they will be shared variables - this is
        # probably dangerous, but this is a debugging tool anyway,
        # so...whatever
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_
        else:
            wts = [theano.shared(nu.floatX(w), borrow=True) for w in wts]
            bs = [theano.shared(nu.floatX(b), borrow=True) for b in bs]

        X = T.matrix()  # inputs
        Y = T.matrix()  # labels
        v = T.vector()  # vector of biases and weights
        i = T.lscalar()  # index

        # 1. compile the numerical gradient function
        def compute_numerical_gradient(v, i, X, Y, eps=1e-4):

            # perturb the input
            v_plus = T.inc_subtensor(v[i], eps)
            v_minus = T.inc_subtensor(v[i], -1.0 * eps)

            # roll it back into the weight matrices and bias vectors
            wts_plus, bs_plus = nu.t_reroll(v_plus, self.num_nodes)
            wts_minus, bs_minus = nu.t_reroll(v_minus, self.num_nodes)

            # compute the loss for both sides, and then compute the numerical
            # gradient
            loss_plus = self.compute_optim_loss(X, Y, wts=wts_plus, bs=bs_plus)
            loss_minus = self.compute_optim_loss(X, Y, wts_minus, bs_minus)

            # ( E(weights[i]+eps) - E(weights[i]-eps) )/(2*eps)
            return 1.0 * (loss_plus - loss_minus) / (2 * eps)

        compute_ngrad = theano.function(
            inputs=[v, i, X, Y], outputs=compute_numerical_gradient(v, i, X, Y))

        # 2. compile backprop (theano's autodiff)
        optim_loss = self.compute_optim_loss(X, Y, wts=wts, bs=bs)
        params = [p for param in [wts, bs]
                  for p in param]  # all model parameters in a list
        # gradient of each model param w.r.t training loss
        grad_params = [T.grad(optim_loss, param) for param in params]
        # gradient of the full weight vector
        grad_w = nu.t_unroll(grad_params[:len(wts)], grad_params[len(wts):])

        compute_bgrad = theano.function(inputs=[X, Y], outputs=grad_w)

        # compute the mean difference between the numerical and exact gradients
        v0 = nu.unroll([wt.get_value()
                        for wt in wts], [b.get_value() for b in bs])
        # get the indices of the weights/biases we want to check
        idxs = np.random.permutation(self.num_params)[:(self.num_params / 5)]

        ngrad = [None] * len(idxs)
        for j, idx in enumerate(idxs):
            ngrad[j] = compute_ngrad(v0, idx, X_in, Y_in)
        bgrad = compute_bgrad(X_in, Y_in)[idxs]

        cerr = np.mean(np.abs(ngrad - bgrad))
        assert cerr < 1e-10
Example #11
    def fullbatch_optimize(self, X_tr, y_tr, X_val=None, y_val=None, num_epochs=None, **optim_params):
        ''' Full-batch optimization using scipy's L-BFGS-B and CG

        Parameters:
        -----------
        param: X_tr - training data
        type: theano matrix

        param: y_tr - training labels
        type: theano matrix

        param: num_epochs - the number of full runs through the dataset
        type: int

        param: **optim_params
        type: dictionary of optimization parameters
        '''

        X = T.matrix('X')  # input variable
        y = T.matrix('y')  # output variable
        w = T.vector('w')  # weight vector

        # reshape the vector w into weight and bias matrices, and set up the 
        # theano graph to compute the loss and gradient
        wts, bs = nu.t_reroll(w, self.num_nodes)
        optim_loss = self.compute_optim_loss(X, y, wts=wts, bs=bs)
        params = [p for param in [wts, bs] for p in param]  
        grad_params = [T.grad(optim_loss, param) for param in params]
        grad_w = nu.t_unroll(grad_params[:len(wts)], grad_params[len(wts):])

        compute_loss_grad_from_vector = theano.function(
            inputs=[w, X, y],
            outputs=[optim_loss, grad_w],
            allow_input_downcast=True)

        compute_loss_from_vector = theano.function(
            inputs=[w, X, y],
            outputs=[optim_loss],
            allow_input_downcast=True)

        # initialize the weight vector and perform full-batch optimization
        wts0 = [wt.get_value() for wt in self.wts_]
        bs0 = [b.get_value() for b in self.bs_]
        w0 = nu.unroll(wts0, bs0)

        # print 'Checking gradients...'
        # self.check_gradients(X_tr,y_tr,wts0,bs0)
        # print 'Pre-training loss:',compute_loss_from_vector(w0,X_tr,y_tr)

        try:
            optim_method = optim_params.pop('optim_method')
        except KeyError:
            sys.exit(ne.method_err())

        # very annoying.
        if optim_method == 'L-BFGS-B' and theano.config.floatX == 'float32':
            sys.exit('Sorry, L-BFGS-B only works with float64')

        wf = sp.optimize.minimize(compute_loss_grad_from_vector, w0, args=(X_tr, y_tr), method=optim_method, jac=True,
                                  options={'maxiter': num_epochs})


        # re-roll back into weights and biases
        wts, bs = nu.reroll(wf.x, self.num_nodes)

        self.wts_ = [theano.shared(nu.floatX(wt)) for wt in wts]
        self.bs_ = [theano.shared(nu.floatX(b)) for b in bs]
Example #12
    def shared_dataset(self, X, y):
        ''' As per the deep learning tutorial, loading the data all at once (if possible)
        into the GPU will significantly speed things up '''

        return theano.shared(nu.floatX(X)), theano.shared(nu.floatX(y))
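A hedged sketch of the mini-batch pattern this method enables: keep the full dataset in shared variables on the device and let the compiled function slice batches out of them via givens, avoiding a host-to-device copy per batch. The `net` instance, the toy loss, and the shapes below are illustrative assumptions.

import numpy as np
import theano
import theano.tensor as T

# dummy dataset and a hypothetical `net` exposing the method above
X_np = np.random.rand(60000, 784)
y_np = np.eye(10)[np.random.randint(0, 10, 60000)]
X_sh, y_sh = net.shared_dataset(X_np, y_np)   # both arrays now live on the device

X = T.matrix('X')
y = T.matrix('y')
idx = T.lscalar('idx')                        # mini-batch index
batch_size = 128

# toy loss so the sketch stands on its own; in practice it would come from fprop
W = theano.shared(np.zeros((784, 10), dtype=theano.config.floatX))
loss = T.mean(T.nnet.categorical_crossentropy(T.nnet.softmax(T.dot(X, W)), y))

train_batch = theano.function(
    inputs=[idx],
    outputs=loss,
    updates=[(W, W - 0.1 * T.grad(loss, W))],
    givens={X: X_sh[idx * batch_size:(idx + 1) * batch_size],
            y: y_sh[idx * batch_size:(idx + 1) * batch_size]})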
Example #13
    def fullbatch_optimize(self, X_tr, y_tr, X_val=None, y_val=None, num_epochs=None, **optim_params):
        ''' Full-batch optimization using scipy's L-BFGS-B and CG; this function is duplicated for
        autoencoders, since the option of tied-weights makes the unrolling/rerolling a bit different
        Parameters:
        -----------
        param: X_tr - training data
        type: theano matrix
        param: y_tr - training labels
        type: theano matrix
        param: num_epochs - the number of full runs through the dataset
        type: int
        '''

        X = T.matrix('X')  # input variable
        y = T.matrix('y')  # output variable
        w = T.vector('w')  # weight vector

        # reshape w into wts/biases, taking note of whether tied weights are
        # being used or not
        wts, bs = nu.t_reroll_ae(w, self.num_nodes, self.tied_wts)

        # get the loss
        optim_loss = self.compute_optim_loss(X, y, wts=wts, bs=bs)

        # compute grad
        params = [p for param in [wts, bs]
                  for p in param]  # all model parameters in a list
        # gradient of each model param w.r.t training loss
        grad_params = [T.grad(optim_loss, param) for param in params]

        # gradient of the full weight vector
        grad_w = nu.t_unroll_ae(
            grad_params[:len(wts)], grad_params[len(wts):], self.tied_wts)

        compute_loss_grad_from_vector = theano.function(
            inputs=[w, X, y],
            outputs=[optim_loss, grad_w],
            allow_input_downcast=True)

        compute_loss_from_vector = theano.function(
            inputs=[w, X, y],
            outputs=[optim_loss],
            allow_input_downcast=True)

        # initial value for the weight vector
        wts0 = [wt.get_value() for wt in self.wts_]
        bs0 = [b.get_value() for b in self.bs_]
        w0 = nu.unroll_ae(wts0, bs0, self.tied_wts)

        # print 'Checking gradients for fun...'
        # self.check_gradients(X_tr,y_tr,wts0,bs0)
        # print 'Pre-training loss:',compute_loss_from_vector(w0,X_tr,y_tr)

        try:
            optim_method = optim_params.pop('optim_method')
        except KeyError:
            sys.exit(ne.method_err())

        # very annoying.
        if optim_method == 'L-BFGS-B' and theano.config.floatX == 'float32':
            sys.exit('Sorry, L-BFGS-B only works with float64')

        # scipy optimizer
        wf = sp.optimize.minimize(compute_loss_grad_from_vector, w0, args=(X_tr, y_tr), method=optim_method, jac=True,
                                  options={'maxiter': num_epochs})

        # print 'Post-training loss',compute_loss_from_vector(wf.x,X_tr,y_tr)

        # re-roll this back into weights and biases
        wts, bs = nu.reroll_ae(wf.x, self.num_nodes, self.tied_wts)

        self.wts_ = [theano.shared(nu.floatX(wt)) for wt in wts]
        self.bs_ = [theano.shared(nu.floatX(b)) for b in bs]
Example #14
    def fullbatch_optimize(self, X_tr, y_tr, X_val=None, y_val=None, num_epochs=None, **optim_params):
        ''' Full-batch optimization using scipy's L-BFGS-B and CG; this function is duplicated for 
        autoencoders, since the option of tied-weights makes the unrolling/rerolling a bit different

        Parameters:
        -----------
        param: X_tr - training data
        type: theano matrix

        param: y_tr - training labels
        type: theano matrix

        param: num_epochs - the number of full runs through the dataset
        type: int
        '''

        X = T.matrix('X')  # input variable
        y = T.matrix('y')  # output variable
        w = T.vector('w')  # weight vector

        # reshape w into wts/biases, taking note of whether tied weights are
        # being used or not
        wts, bs = nu.t_reroll_ae(w, self.num_nodes, self.tied_wts)

        # get the loss
        optim_loss = self.compute_optim_loss(X, y, wts=wts, bs=bs)

        # compute grad
        params = [p for param in [wts, bs]
                  for p in param]  # all model parameters in a list
        # gradient of each model param w.r.t training loss
        grad_params = [T.grad(optim_loss, param) for param in params]

        # gradient of the full weight vector
        grad_w = nu.t_unroll_ae(
            grad_params[:len(wts)], grad_params[len(wts):], self.tied_wts)

        compute_loss_grad_from_vector = theano.function(
            inputs=[w, X, y],
            outputs=[optim_loss, grad_w],
            allow_input_downcast=True)

        compute_loss_from_vector = theano.function(
            inputs=[w, X, y],
            outputs=[optim_loss],
            allow_input_downcast=True)

        # initial value for the weight vector
        wts0 = [wt.get_value() for wt in self.wts_]
        bs0 = [b.get_value() for b in self.bs_]
        w0 = nu.unroll_ae(wts0, bs0, self.tied_wts)

        # print 'Checking gradients for fun...'
        # self.check_gradients(X_tr,y_tr,wts0,bs0)
        # print 'Pre-training loss:',compute_loss_from_vector(w0,X_tr,y_tr)

        try:
            optim_method = optim_params.pop('optim_method')
        except KeyError:
            sys.exit(ne.method_err())

        # very annoying.
        if optim_method == 'L-BFGS-B' and theano.config.floatX == 'float32':
            sys.exit('Sorry, L-BFGS-B only works with float64')

        # scipy optimizer
        wf = sp.optimize.minimize(compute_loss_grad_from_vector, w0, args=(X_tr, y_tr), method=optim_method, jac=True,
                                  options={'maxiter': num_epochs})

        # print 'Post-training loss',compute_loss_from_vector(wf.x,X_tr,y_tr)

        # re-roll this back into weights and biases
        wts, bs = nu.reroll_ae(wf.x, self.num_nodes, self.tied_wts)

        self.wts_ = [theano.shared(nu.floatX(wt)) for wt in wts]
        self.bs_ = [theano.shared(nu.floatX(b)) for b in bs]
Example #15
    def check_gradients(self, X_in, Y_in, wts=None, bs=None):
        ''' this seems like overkill, but I suppose it doesn't hurt to have it in here...'''

        # assume that if it's not provided, they will be shared variables - this is
        # probably dangerous, but this is a debugging tool anyway,
        # so...whatever
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_
        else:
            wts = [theano.shared(nu.floatX(w), borrow=True) for w in wts]
            bs = [theano.shared(nu.floatX(b), borrow=True) for b in bs]

        X = T.matrix()  # inputs
        Y = T.matrix()  # labels
        v = T.vector()  # vector of biases and weights
        i = T.lscalar()  # index

        # 1. compile the numerical gradient function
        def compute_numerical_gradient(v, i, X, Y, eps=1e-4):

            # perturb the input
            v_plus = T.inc_subtensor(v[i], eps)
            v_minus = T.inc_subtensor(v[i], -1.0 * eps)

            # roll it back into the weight matrices and bias vectors
            wts_plus, bs_plus = nu.t_reroll(v_plus, self.num_nodes)
            wts_minus, bs_minus = nu.t_reroll(v_minus, self.num_nodes)

            # compute the loss for both sides, and then compute the numerical
            # gradient
            loss_plus = self.compute_optim_loss(X, Y, wts=wts_plus, bs=bs_plus)
            loss_minus = self.compute_optim_loss(X, Y, wts_minus, bs_minus)

            # ( E(weights[i]+eps) - E(weights[i]-eps) )/(2*eps)
            return 1.0 * (loss_plus - loss_minus) / (2 * eps)

        compute_ngrad = theano.function(inputs=[v, i, X, Y],
                                        outputs=compute_numerical_gradient(
                                            v, i, X, Y))

        # 2. compile backprop (theano's autodiff)
        optim_loss = self.compute_optim_loss(X, Y, wts=wts, bs=bs)
        params = [p for param in [wts, bs]
                  for p in param]  # all model parameters in a list
        # gradient of each model param w.r.t training loss
        grad_params = [T.grad(optim_loss, param) for param in params]
        # gradient of the full weight vector
        grad_w = nu.t_unroll(grad_params[:len(wts)], grad_params[len(wts):])

        compute_bgrad = theano.function(inputs=[X, Y], outputs=grad_w)

        # compute the mean difference between the numerical and exact gradients
        v0 = nu.unroll([wt.get_value() for wt in wts],
                       [b.get_value() for b in bs])
        # get the indices of the weights/biases we want to check
        idxs = np.random.permutation(self.num_params)[:(self.num_params / 5)]

        ngrad = [None] * len(idxs)
        for j, idx in enumerate(idxs):
            ngrad[j] = compute_ngrad(v0, idx, X_in, Y_in)
        bgrad = compute_bgrad(X_in, Y_in)[idxs]

        cerr = np.mean(np.abs(ngrad - bgrad))
        assert cerr < 1e-10
Example #16
    def fullbatch_optimize(self,
                           X_tr,
                           y_tr,
                           X_val=None,
                           y_val=None,
                           num_epochs=None,
                           **optim_params):
        ''' Full-batch optimization using scipy's L-BFGS-B and CG
        Parameters:
        -----------
        param: X_tr - training data
        type: theano matrix
        param: y_tr - training labels
        type: theano matrix
        param: num_epochs - the number of full runs through the dataset
        type: int
        param: **optim_params
        type: dictionary of optimization parameters
        '''

        X = T.matrix('X')  # input variable
        y = T.matrix('y')  # output variable
        w = T.vector('w')  # weight vector

        # reshape the vector w into weight and bias matrices, and set up the
        # theano graph to compute the loss and gradient
        wts, bs = nu.t_reroll(w, self.num_nodes)
        optim_loss = self.compute_optim_loss(X, y, wts=wts, bs=bs)
        params = [p for param in [wts, bs] for p in param]
        grad_params = [T.grad(optim_loss, param) for param in params]
        grad_w = nu.t_unroll(grad_params[:len(wts)], grad_params[len(wts):])

        compute_loss_grad_from_vector = theano.function(
            inputs=[w, X, y],
            outputs=[optim_loss, grad_w],
            allow_input_downcast=True)

        compute_loss_from_vector = theano.function(inputs=[w, X, y],
                                                   outputs=[optim_loss],
                                                   allow_input_downcast=True)

        # initialize the weight vector and perform full-batch optimization
        wts0 = [wt.get_value() for wt in self.wts_]
        bs0 = [b.get_value() for b in self.bs_]
        w0 = nu.unroll(wts0, bs0)

        # print 'Checking gradients...'
        # self.check_gradients(X_tr,y_tr,wts0,bs0)
        # print 'Pre-training loss:',compute_loss_from_vector(w0,X_tr,y_tr)

        try:
            optim_method = optim_params.pop('optim_method')
        except KeyError:
            sys.exit(ne.method_err())

        # very annoying.
        if optim_method == 'L-BFGS-B' and theano.config.floatX == 'float32':
            sys.exit('Sorry, L-BFGS-B only works with float64')

        wf = sp.optimize.minimize(compute_loss_grad_from_vector,
                                  w0,
                                  args=(X_tr, y_tr),
                                  method=optim_method,
                                  jac=True,
                                  options={'maxiter': num_epochs})

        # re-roll back into weights and biases
        wts, bs = nu.reroll(wf.x, self.num_nodes)

        self.wts_ = [theano.shared(nu.floatX(wt)) for wt in wts]
        self.bs_ = [theano.shared(nu.floatX(b)) for b in bs]
Example #17
    def shared_dataset(self, X, y):
        ''' As per the deep learning tutorial, loading the data all at once (if possible)
        into the GPU will significantly speed things up '''

        return theano.shared(nu.floatX(X)), theano.shared(nu.floatX(y))
X = T.matrix('X')
y = T.matrix('y')

# define the architecture
d = 784
k = 10
h1 = 625; activ1 = na.reLU
h2 = 625; activ2 = na.reLU
activ3 = na.softmax

# initialize weights and biases
wts = [None,None,None]
bs = [None,None,None]
np.random.seed(1234)
wts[0] = theano.shared(nu.floatX(0.01*np.random.randn(d,h1)),borrow=True)
wts[1] = theano.shared(nu.floatX(0.01*np.random.randn(h1,h2)),borrow=True)
wts[2] = theano.shared(nu.floatX(0.01*np.random.randn(h2,k)),borrow=True)
bs[0] = theano.shared(nu.floatX(np.zeros(h1)),borrow=True)
bs[1] = theano.shared(nu.floatX(np.zeros(h2)),borrow=True)
bs[2] = theano.shared(nu.floatX(np.zeros(k)),borrow=True)

# forward propagation
act1 = activ1(T.dot(X,wts[0]) + bs[0])
compute_act1 = theano.function(inputs=[X],outputs=act1,allow_input_downcast=True,mode='FAST_RUN')
act2 = activ2(T.dot(act1,wts[1]) + bs[1])
compute_act2 = theano.function(inputs=[X],outputs=act2,allow_input_downcast=True,mode='FAST_RUN')
y_pred = activ3(T.dot(act2,wts[2]) + bs[2])
compute_y_pred = theano.function(inputs=[X],outputs=y_pred,allow_input_downcast=True,mode='FAST_RUN')

X_tr = np.random.randn(128,784)