Esempi in Python per OutputLayer.gradient

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: layer

Classe/tipologia: OutputLayer

Metodo/funzione: gradient

Esempi su hotexamples.com: 3

OutputLayer.gradient in Python: 3 esempi trovati. Questi sono i migliori esempi reali in Python per layer.OutputLayer.gradient, estratti da progetti open source. Li puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra Nascondi

OutputLayer(16)

forward(3)

backprop(2)

gradient(2)

init_weight(2)

loss(2)

predict(2)

set_weights(2)

get_outputs(1)

update_weights(1)

Esempio n. 1

Mostra file

File: nn.py Progetto: jgera/Segmentation-Code

class NN:
    """General purpose feed-forward neural network trained with
    backpropagation.

    The type of activation functions, the number of hidden layers, the number
    of units in each layer, the output function and the other training
    options are all read from the config object given to init_net.
    """

    def __init__(self):
        pass

    def init_net(self, config):
        """Load the data sets named in config and build the network.

        config is an instance of class Config. The training set is always
        read; the validation and test sets are read only when config.is_val /
        config.is_test are set. The output directory is created when
        config.is_output is set and the directory does not exist yet.
        """
        import os

        self.config = config

        if config.is_output and not os.path.exists(config.output_dir):
            os.makedirs(config.output_dir)

        self.train_data = self.read_data(config.train_data_file)
        if config.is_val:
            self.val_data = self.read_data(config.val_data_file)
        if config.is_test:
            self.test_data = self.read_data(config.test_data_file)

        num_total_cases, input_dim = self.train_data.X.shape
        self.num_total_cases = num_total_cases
        self.input_dim = input_dim

        # Floor division keeps the count an integer (it is later handed to
        # range()); there is always at least one minibatch.
        self.num_minibatches = max(
            1, num_total_cases // config.minibatch_size)

        # initialize the network: hidden layers are chained so that each one
        # consumes the output dimension of the previous one
        self.num_layers = config.num_layers
        self.layer = []
        in_dim = input_dim
        for i in range(self.num_layers):
            spec = config.layer[i]
            self.layer.append(Layer(in_dim, spec.out_dim, spec.act_type))
            in_dim = spec.out_dim

        self.output = OutputLayer(in_dim, config.output.out_dim,
                                  config.output.output_type)

        # To use multi-class hinge output, we need to specify the loss function
        if isinstance(self.output.act_type, act.MulticlassHingeOutput):
            if config.loss_file is not None:
                loss_matrix = self.read_loss(config.loss_file)
            else:
                # default: 0 on the diagonal, 1 everywhere else
                loss_matrix = 1 - np.eye(self.train_data.K)
            self.output.act_type.set_loss(loss_matrix)

        # initialize the weights in every layer
        self._init_weights(config.init_scale, config.random_seed)

    def _init_weights(self, init_scale, random_seed=None):
        """Initialize the weights of every hidden layer and of the output
        layer with the given scale.

        When random_seed is not None, numpy's global random generator is
        seeded first. A seed of 0 is a valid seed and is honoured.
        """
        if random_seed is not None:
            np.random.seed(random_seed)

        for i in range(self.num_layers):
            self.layer[i].init_weight(init_scale)

        self.output.init_weight(init_scale)

    def train(self):
        """Run minibatch training for config.num_epochs epochs.

        Every epoch reshuffles the training set. Progress is reported every
        config.epoch_to_display epochs and the network is written to
        config.output_dir every config.epoch_to_save epochs.
        """
        config = self.config

        # convert t into a matrix in 1-of-K representation if it is a vector
        targets = self.train_data.T
        if not config.is_regression:
            targets = self.output.act_type.label_vec_to_mat(
                targets, self.train_data.K)

        layer_config = LayerConfig()
        layer_config.learn_rate = config.learn_rate
        layer_config.momentum = config.momentum
        layer_config.weight_decay = config.weight_decay

        nnstore = NNStore()
        nnstore.init_from_net(self)

        self.display_training_info(-1, 0, 0)
        t_start = time.time()

        batch_size = config.minibatch_size
        last_batch = self.num_minibatches - 1

        for epoch in range(config.num_epochs):
            # shuffle the dataset
            idx = np.random.permutation(self.num_total_cases)
            train_X = self.train_data.X[idx]
            train_T = targets[idx]

            loss = 0

            for batch in range(self.num_minibatches):
                i_start = batch * batch_size
                if batch == last_batch:
                    # the final minibatch absorbs the leftover cases
                    i_end = self.num_total_cases
                else:
                    i_end = i_start + batch_size

                # forward pass
                self._forward(train_X[i_start:i_end])

                # compute loss
                loss += self.output.loss(train_T[i_start:i_end])

                # backprop, from the output layer down to the first layer
                dLdXabove = self.output.backprop(layer_config)
                for i in reversed(range(self.num_layers)):
                    dLdXabove = self.layer[i].backprop(dLdXabove, layer_config)

            # statistics
            avg_loss = 1.0 * loss / self.num_total_cases

            if (epoch + 1) % config.epoch_to_display == 0:
                self.display_training_info(epoch, avg_loss,
                                           time.time() - t_start)
                t_start = time.time()

            if (epoch + 1) % config.epoch_to_save == 0:
                nnstore.update_from_net(self)
                nnstore.write(config.output_dir + '/m' + str(epoch + 1) +
                              '.pdata')

    def display_training_info(self, epoch, loss, time):
        """Print training information. Use the config information to determine
        what information to display.

        Validation and test data are passed on only when config.is_val /
        config.is_test are set. Here time is the elapsed time to report (the
        parameter shadows the time module inside this method).
        """
        optional = {}
        if self.config.is_val:
            optional['val_data'] = self.val_data.X
            optional['val_labels'] = self.val_data.T
        if self.config.is_test:
            optional['test_data'] = self.test_data.X
            optional['test_labels'] = self.test_data.T

        self._display_training_info(epoch, loss, time,
                                    self.train_data.X, self.train_data.T,
                                    **optional)

    def _accuracy(self, data, labels):
        """Return the fraction of cases in data whose predicted label equals
        the corresponding entry of labels."""
        return (self.predict(data) == labels.squeeze()).mean()

    def _display_training_info(self,
                               epoch,
                               loss,
                               time,
                               train_data,
                               train_labels,
                               val_data=None,
                               val_labels=None,
                               test_data=None,
                               test_labels=None):
        """Print training information during training.

        The report is assembled first and printed as a single line. A
        validation / test entry is included only when both the data and the
        labels of that set are given.
        """
        # Identity checks: comparing an array with None using != is an
        # elementwise comparison whose truth value is ambiguous.
        has_val = val_data is not None and val_labels is not None
        has_test = test_data is not None and test_labels is not None

        parts = ['epoch %d, loss %.4f,' % (epoch + 1, loss)]

        if self.config.is_regression:
            # print loss if it is a regression problem
            if has_val:
                self.predict(val_data)
                avg_loss = self.output.loss(val_labels) / val_labels.shape[0]
                parts.append('val_loss %.4f,' % avg_loss)
            if has_test:
                self.predict(test_data)
                avg_loss = self.output.loss(test_labels) / test_labels.shape[0]
                parts.append('test_loss %.4f,' % avg_loss)
        else:
            # print accuracy if it is a classification problem
            parts.append('acc %.4f,' % self._accuracy(train_data,
                                                       train_labels))
            if has_val:
                parts.append('val_acc %.4f,' % self._accuracy(val_data,
                                                               val_labels))
            if has_test:
                parts.append('test_acc %.4f,' % self._accuracy(test_data,
                                                                test_labels))

        if self.config.display_winc:
            # largest absolute entry of each layer's Winc
            for i in range(self.num_layers):
                parts.append('winc%d %.5f,' % (
                    i + 1, np.abs(self.layer[i].Winc).max()))
            parts.append('winc_out %.5f,' % np.abs(self.output.Winc).max())

        parts.append('time %.2f' % time)

        # A single parenthesised argument prints the same text whether print
        # is a statement or a function.
        print(' '.join(parts))

    def _forward(self, X):
        """Do a forward pass without computing the output and predictions.
        Used as a subroutine for function predict and check_grad."""
        Xbelow = X
        for i in range(self.num_layers):
            Xbelow = self.layer[i].forward(Xbelow)
        self.output.forward(Xbelow)

    def predict(self, X):
        """Make prediction using the current network.

        X: N*D data matrix

        Return an N-element vector of predicted labels.
        """
        self._forward(X)
        return self.output.predict()

    def read_data(self, data_file_name):
        """(data_file_name) --> data
        Read from the specified data file, return a data object, which is an
        object with three attributes, X, T and K. X and T are the data and
        target matrices respectively, and K is the dimensionality of the output.
        Each of X and T is a matrix with N rows, N is the number of data
        cases.

        The file must hold a pickled dictionary with the keys 'data',
        'labels' and 'K'.
        """
        # NOTE(review): unpickling executes arbitrary code; only load data
        # files from a trusted source.
        # Binary mode is required for pickle data; the with-block closes the
        # file even when loading fails.
        with open(data_file_name, 'rb') as f:
            data_dict = pickle.load(f)

        data = Data()
        data.X = data_dict['data']
        data.T = data_dict['labels']
        data.K = data_dict['K']

        return data

    def read_loss(self, loss_file_name):
        """(loss_file_name) --> loss
        Read from the specified file, which must hold a pickled dictionary,
        and return the loss matrix stored under its 'loss' key.
        """
        # NOTE(review): unpickling executes arbitrary code; only load loss
        # files from a trusted source.
        with open(loss_file_name, 'rb') as f:
            d = pickle.load(f)

        return d['loss']

    def display(self):
        """Print the data set sizes, the network structure (output layer
        first, input last) and the main training options."""
        print('%d training cases' % self.train_data.X.shape[0])
        if self.config.is_val:
            print('%d validation cases' % self.val_data.X.shape[0])
        if self.config.is_test:
            print('%d test cases' % self.test_data.X.shape[0])

        print('[' + str(self.output) + ']')
        for i in reversed(range(self.num_layers)):
            print('[' + str(self.layer[i]) + ']')
        print('[input ' + str(self.input_dim) + ']')

        for option in ('learn_rate', 'init_scale', 'momentum', 'weight_decay',
                       'minibatch_size', 'num_epochs', 'epoch_to_save'):
            print(option + ' : ' + str(getattr(self.config, option)))

    def check_grad(self):
        """Numerically check the gradient of the 1st layer weights (the
        output layer weights when there is no hidden layer).

        Uses the first 100 training cases and prints the largest absolute
        gradient entry followed by the error reported by
        scipy.optimize.check_grad.
        """
        import scipy.optimize as opt

        ncases = 100

        # the layer whose weights are checked
        target = self.output if self.num_layers == 0 else self.layer[0]

        def evaluate(w, compute):
            """Install w as the target weights, run a forward pass, call
            compute on the targets and return its result. The original
            weights are put back even if something raises."""
            saved = target.W
            target.W = w.reshape(saved.shape)
            try:
                self._forward(self.train_data.X[:ncases, :])

                Z = self.train_data.T[:ncases]
                if not self.config.is_regression:
                    Z = self.output.act_type.label_vec_to_mat(
                        Z, self.train_data.K)
                return compute(Z)
            finally:
                target.W = saved

        def mean_loss(Z):
            return self.output.loss(Z) / Z.shape[0]

        def mean_grad(Z):
            self.output.loss(Z)

            self.output.gradient()
            # the last column is dropped before the gradient is passed down
            dLdXabove = self.output.dLdXtop[:, :-1]
            for i in reversed(range(self.num_layers)):
                self.layer[i].gradient(dLdXabove)
                dLdXabove = self.layer[i].dLdXbelow[:, :-1]

            grad_w = target.dLdW
            return grad_w.reshape(np.prod(grad_w.shape)) / Z.shape[0]

        def f(w):
            return evaluate(w, mean_loss)

        def fgrad(w):
            return evaluate(w, mean_grad)

        W = target.W
        w_flat = W.reshape(np.prod(W.shape))

        print("wmax: %f" % np.abs(fgrad(w_flat)).max())
        print("check_grad err: %f" % opt.check_grad(f, fgrad, w_flat))

Esempio n. 2

Mostra file

File: nn.py Progetto: jgera/Segmentation-Code

class NN:
    """General purpose feed-forward neural network trained with
    backpropagation.

    The type of activation functions, the number of hidden layers, the number
    of units in each layer, the output function and the other training
    options are all read from the config object given to init_net.
    """

    def __init__(self):
        pass

    def init_net(self, config):
        """Load the data sets named in config and build the network.

        config is an instance of class Config. The training set is always
        read; the validation and test sets are read only when config.is_val /
        config.is_test are set. The output directory is created when
        config.is_output is set and the directory does not exist yet.
        """
        import os

        self.config = config

        if config.is_output and not os.path.exists(config.output_dir):
            os.makedirs(config.output_dir)

        self.train_data = self.read_data(config.train_data_file)
        if config.is_val:
            self.val_data = self.read_data(config.val_data_file)
        if config.is_test:
            self.test_data = self.read_data(config.test_data_file)

        num_total_cases, input_dim = self.train_data.X.shape
        self.num_total_cases = num_total_cases
        self.input_dim = input_dim

        # Floor division keeps the count an integer (it is later handed to
        # range()); there is always at least one minibatch.
        self.num_minibatches = max(
            1, num_total_cases // config.minibatch_size)

        # initialize the network: hidden layers are chained so that each one
        # consumes the output dimension of the previous one
        self.num_layers = config.num_layers
        self.layer = []
        in_dim = input_dim
        for i in range(self.num_layers):
            spec = config.layer[i]
            self.layer.append(Layer(in_dim, spec.out_dim, spec.act_type))
            in_dim = spec.out_dim

        self.output = OutputLayer(in_dim, config.output.out_dim,
                config.output.output_type)

        # To use multi-class hinge output, we need to specify the loss function
        if isinstance(self.output.act_type, act.MulticlassHingeOutput):
            if config.loss_file is not None:
                loss_matrix = self.read_loss(config.loss_file)
            else:
                # default: 0 on the diagonal, 1 everywhere else
                loss_matrix = 1 - np.eye(self.train_data.K)
            self.output.act_type.set_loss(loss_matrix)

        # initialize the weights in every layer
        self._init_weights(config.init_scale, config.random_seed)

    def _init_weights(self, init_scale, random_seed=None):
        """Initialize the weights of every hidden layer and of the output
        layer with the given scale.

        When random_seed is not None, numpy's global random generator is
        seeded first. A seed of 0 is a valid seed and is honoured.
        """
        if random_seed is not None:
            np.random.seed(random_seed)

        for i in range(self.num_layers):
            self.layer[i].init_weight(init_scale)

        self.output.init_weight(init_scale)

    def train(self):
        """Run minibatch training for config.num_epochs epochs.

        Every epoch reshuffles the training set. Progress is reported every
        config.epoch_to_display epochs and the network is written to
        config.output_dir every config.epoch_to_save epochs.
        """
        config = self.config

        # convert t into a matrix in 1-of-K representation if it is a vector
        targets = self.train_data.T
        if not config.is_regression:
            targets = self.output.act_type.label_vec_to_mat(
                    targets, self.train_data.K)

        layer_config = LayerConfig()
        layer_config.learn_rate = config.learn_rate
        layer_config.momentum = config.momentum
        layer_config.weight_decay = config.weight_decay

        nnstore = NNStore()
        nnstore.init_from_net(self)

        self.display_training_info(-1, 0, 0)
        t_start = time.time()

        batch_size = config.minibatch_size
        last_batch = self.num_minibatches - 1

        for epoch in range(config.num_epochs):
            # shuffle the dataset
            idx = np.random.permutation(self.num_total_cases)
            train_X = self.train_data.X[idx]
            train_T = targets[idx]

            loss = 0

            for batch in range(self.num_minibatches):
                i_start = batch * batch_size
                if batch == last_batch:
                    # the final minibatch absorbs the leftover cases
                    i_end = self.num_total_cases
                else:
                    i_end = i_start + batch_size

                # forward pass
                self._forward(train_X[i_start:i_end])

                # compute loss
                loss += self.output.loss(train_T[i_start:i_end])

                # backprop, from the output layer down to the first layer
                dLdXabove = self.output.backprop(layer_config)
                for i in reversed(range(self.num_layers)):
                    dLdXabove = self.layer[i].backprop(dLdXabove, layer_config)

            # statistics
            avg_loss = 1.0 * loss / self.num_total_cases

            if (epoch + 1) % config.epoch_to_display == 0:
                self.display_training_info(epoch, avg_loss, time.time() - t_start)
                t_start = time.time()

            if (epoch + 1) % config.epoch_to_save == 0:
                nnstore.update_from_net(self)
                nnstore.write(config.output_dir + '/m' + str(epoch + 1) + '.pdata')

    def display_training_info(self, epoch, loss, time):
        """Print training information. Use the config information to determine
        what information to display.

        Validation and test data are passed on only when config.is_val /
        config.is_test are set. Here time is the elapsed time to report (the
        parameter shadows the time module inside this method).
        """
        optional = {}
        if self.config.is_val:
            optional['val_data'] = self.val_data.X
            optional['val_labels'] = self.val_data.T
        if self.config.is_test:
            optional['test_data'] = self.test_data.X
            optional['test_labels'] = self.test_data.T

        self._display_training_info(epoch, loss, time,
                self.train_data.X, self.train_data.T, **optional)

    def _accuracy(self, data, labels):
        """Return the fraction of cases in data whose predicted label equals
        the corresponding entry of labels."""
        return (self.predict(data) == labels.squeeze()).mean()

    def _display_training_info(self, epoch, loss, time,
            train_data, train_labels, val_data=None, val_labels=None,
            test_data=None, test_labels=None):
        """Print training information during training.

        The report is assembled first and printed as a single line. A
        validation / test entry is included only when both the data and the
        labels of that set are given.
        """
        # Identity checks: comparing an array with None using != is an
        # elementwise comparison whose truth value is ambiguous.
        has_val = val_data is not None and val_labels is not None
        has_test = test_data is not None and test_labels is not None

        parts = ['epoch %d, loss %.4f,' % (epoch + 1, loss)]

        if self.config.is_regression:
            # print loss if it is a regression problem
            if has_val:
                self.predict(val_data)
                avg_loss = self.output.loss(val_labels) / val_labels.shape[0]
                parts.append('val_loss %.4f,' % avg_loss)
            if has_test:
                self.predict(test_data)
                avg_loss = self.output.loss(test_labels) / test_labels.shape[0]
                parts.append('test_loss %.4f,' % avg_loss)
        else:
            # print accuracy if it is a classification problem
            parts.append('acc %.4f,' % self._accuracy(train_data, train_labels))
            if has_val:
                parts.append('val_acc %.4f,' % self._accuracy(val_data, val_labels))
            if has_test:
                parts.append('test_acc %.4f,' % self._accuracy(test_data, test_labels))

        if self.config.display_winc:
            # largest absolute entry of each layer's Winc
            for i in range(self.num_layers):
                parts.append('winc%d %.5f,' % (i+1, np.abs(self.layer[i].Winc).max()))
            parts.append('winc_out %.5f,' % np.abs(self.output.Winc).max())

        parts.append('time %.2f' % time)

        # A single parenthesised argument prints the same text whether print
        # is a statement or a function.
        print(' '.join(parts))

    def _forward(self, X):
        """Do a forward pass without computing the output and predictions.
        Used as a subroutine for function predict and check_grad."""
        Xbelow = X
        for i in range(self.num_layers):
            Xbelow = self.layer[i].forward(Xbelow)
        self.output.forward(Xbelow)

    def predict(self, X):
        """Make prediction using the current network.

        X: N*D data matrix

        Return an N-element vector of predicted labels.
        """
        self._forward(X)
        return self.output.predict()

    def read_data(self, data_file_name):
        """(data_file_name) --> data
        Read from the specified data file, return a data object, which is an
        object with three attributes, X, T and K. X and T are the data and
        target matrices respectively, and K is the dimensionality of the output.
        Each of X and T is a matrix with N rows, N is the number of data
        cases.

        The file must hold a pickled dictionary with the keys 'data',
        'labels' and 'K'.
        """
        # NOTE(review): unpickling executes arbitrary code; only load data
        # files from a trusted source.
        # Binary mode is required for pickle data; the with-block closes the
        # file even when loading fails.
        with open(data_file_name, 'rb') as f:
            data_dict = pickle.load(f)

        data = Data()
        data.X = data_dict['data']
        data.T = data_dict['labels']
        data.K = data_dict['K']

        return data

    def read_loss(self, loss_file_name):
        """(loss_file_name) --> loss
        Read from the specified file, which must hold a pickled dictionary,
        and return the loss matrix stored under its 'loss' key.
        """
        # NOTE(review): unpickling executes arbitrary code; only load loss
        # files from a trusted source.
        with open(loss_file_name, 'rb') as f:
            d = pickle.load(f)

        return d['loss']

    def display(self):
        """Print the data set sizes, the network structure (output layer
        first, input last) and the main training options."""
        print('%d training cases' % self.train_data.X.shape[0])
        if self.config.is_val:
            print('%d validation cases' % self.val_data.X.shape[0])
        if self.config.is_test:
            print('%d test cases' % self.test_data.X.shape[0])

        print('[' + str(self.output) + ']')
        for i in reversed(range(self.num_layers)):
            print('[' + str(self.layer[i]) + ']')
        print('[input ' + str(self.input_dim) + ']')

        for option in ('learn_rate', 'init_scale', 'momentum', 'weight_decay',
                'minibatch_size', 'num_epochs', 'epoch_to_save'):
            print(option + ' : ' + str(getattr(self.config, option)))

    def check_grad(self):
        """Numerically check the gradient of the 1st layer weights (the
        output layer weights when there is no hidden layer).

        Uses the first 100 training cases and prints the largest absolute
        gradient entry followed by the error reported by
        scipy.optimize.check_grad.
        """
        import scipy.optimize as opt

        ncases = 100

        # the layer whose weights are checked
        target = self.output if self.num_layers == 0 else self.layer[0]

        def evaluate(w, compute):
            """Install w as the target weights, run a forward pass, call
            compute on the targets and return its result. The original
            weights are put back even if something raises."""
            saved = target.W
            target.W = w.reshape(saved.shape)
            try:
                self._forward(self.train_data.X[:ncases,:])

                Z = self.train_data.T[:ncases]
                if not self.config.is_regression:
                    Z = self.output.act_type.label_vec_to_mat(
                            Z, self.train_data.K)
                return compute(Z)
            finally:
                target.W = saved

        def mean_loss(Z):
            return self.output.loss(Z) / Z.shape[0]

        def mean_grad(Z):
            self.output.loss(Z)

            self.output.gradient()
            # the last column is dropped before the gradient is passed down
            dLdXabove = self.output.dLdXtop[:,:-1]
            for i in reversed(range(self.num_layers)):
                self.layer[i].gradient(dLdXabove)
                dLdXabove = self.layer[i].dLdXbelow[:,:-1]

            grad_w = target.dLdW
            return grad_w.reshape(np.prod(grad_w.shape)) / Z.shape[0]

        def f(w):
            return evaluate(w, mean_loss)

        def fgrad(w):
            return evaluate(w, mean_grad)

        W = target.W
        w_flat = W.reshape(np.prod(W.shape))

        print("wmax: %f" % np.abs(fgrad(w_flat)).max())
        print("check_grad err: %f" % opt.check_grad(f, fgrad, w_flat))

Esempio n. 3

Mostra file

File: nn.py Progetto: jgera/Segmentation-Code

class NN:
    """A class for general purpose neural networks, trained with
    backpropagation. The type of activation functions, number of hidden layers
    and number of units in each layer, the output function, and other options 
    during training can be configured."""
    def __init__(self):
        self.task_loss_fn = None

    def load_train_data(self, data):
        """Use `data` as the training set.

        The object is stored by reference and its X attribute is replaced
        in place by gnp.garray(data.X).
        """
        self.train_data = data
        data.X = gnp.garray(data.X)

    def load_val_data(self, data):
        """Use `data` as the validation set.

        The object is stored by reference and its X attribute is replaced
        in place by gnp.garray(data.X).
        """
        self.val_data = data
        data.X = gnp.garray(data.X)

    def load_test_data(self, data):
        """Use `data` as the test set.

        The object is stored by reference and its X attribute is replaced
        in place by gnp.garray(data.X).
        """
        self.test_data = data
        data.X = gnp.garray(data.X)

    def init_net_without_loading_data(self, config):
        """This should be called after loading all required data."""
        self.config = config

        if config.is_output and (not os.path.exists(config.output_dir)):
            os.makedirs(config.output_dir)

        [num_total_cases, input_dim] = self.train_data.X.shape
        self.num_total_cases = num_total_cases
        self.input_dim = input_dim

        self.num_minibatches = num_total_cases / config.minibatch_size
        if self.num_minibatches < 1:
            self.num_minibatches = 1

        # initialize the network
        self.num_layers = config.num_layers
        self.layer = []
        in_dim = input_dim
        for i in range(self.num_layers):
            layer_spec = config.layer[i]
            self.layer.append(Layer(
                in_dim, layer_spec.out_dim, layer_spec.act_type, 
                layer_spec.weight_decay, layer_spec.weight_constraint, 
                layer_spec.dropout))
            in_dim = layer_spec.out_dim

        self.output = OutputLayer(in_dim, config.output.out_dim,
                config.output.output_type, config.output.weight_decay,
                config.output.weight_constraint, config.output.dropout)

        # if not linear output (regression) load task loss function
        if not isinstance(self.output.act_type, act.LinearOutput):
            if config.task_loss_file != None:
                self.task_loss = self.read_loss(config.task_loss_file)
                print 'Loading task loss from %s' % config.task_loss_file
            else:
                self.task_loss = 1 - np.eye(self.train_data.K)
                print 'No task loss specified, using 0-1 loss.'

        # To use multi-class hinge output, a training loss function is required
        if isinstance(self.output.act_type, act.MulticlassHingeOutput):
            if config.train_loss_file != None:
                self.train_loss = self.read_loss(config.train_loss_file)
                print 'Loading surrogate loss from %s' % config.train_loss_file
            else:
                self.train_loss = 1 - np.eye(self.train_data.K)
                print 'No surrogate loss specified, using 0-1 loss.'
            self.output.act_type.set_loss(self.train_loss)

        # initialize the weights in every layer
        self._init_weights(config.init_scale, config.random_seed)

    def init_net(self, config):
        """config is an instance of class Config"""
        self.train_data = self.read_data(config.train_data_file)
        print 'Loading training data from %s' % config.train_data_file

        if config.is_val:
            self.val_data = self.read_data(config.val_data_file)
            print 'Loading validation data from %s' % config.val_data_file
        if config.is_test:
            self.test_data = self.read_data(config.test_data_file)
            print 'Loading test data from %s' % config.test_data_file

        self.init_net_without_loading_data(config)

    def load_net(self, model_file):
        """Load a saved model from `model_file` into an NNStore and rebuild
        this net from it."""
        stored = NNStore()
        stored.load(model_file)
        self.build_net_from_copy(stored)

    def make_copy(self):
        """
        Return an NNStore initialized from this net (described by the original
        author as a CPU copy). Pass it to build_net_from_copy to recover the
        net.
        """
        snapshot = NNStore()
        snapshot.init_from_net(self)
        return snapshot

    def build_net_from_copy(self, copy):
        """
        Rebuild the net from a copy made by make_copy.

        Every stored layer is re-created with dimensions taken from the shape
        of its weight matrix and then loaded with the stored W and b.
        self.input_dim is taken from the first weight matrix of the net.
        """
        self.num_layers = len(copy.layer)
        self.layer = []
        for stored in copy.layer:
            n_in, n_out = stored.W.shape
            rebuilt = Layer(n_in, n_out, stored.act_type)
            rebuilt.load_weight(stored.W, stored.b)
            self.layer.append(rebuilt)

        stored_out = copy.output
        n_in, n_out = stored_out.W.shape
        rebuilt_out = OutputLayer(n_in, n_out, stored_out.act_type)
        rebuilt_out.load_weight(stored_out.W, stored_out.b)
        self.output = rebuilt_out

        # With no hidden layers the output layer sees the input directly.
        first = self.layer[0] if self.num_layers > 0 else self.output
        self.input_dim = first.W.shape[0]

    def _init_weights(self, init_scale, random_seed=None):
        if random_seed:
            np.random.seed(random_seed)

        for i in range(0, self.num_layers):
            self.layer[i].init_weight(init_scale)

        self.output.init_weight(init_scale)

    def set_task_loss(self, task_loss_fn):
        """Set the task loss function to be user defined task loss.
        
        task_loss_fn should have a signature like this:
        task_loss_fn(OutputType, Y, Z, A)
        """
        self.task_loss_fn = task_loss_fn

    def _compute_loss(self, X, T, batch_size=1000):
        """Return the loss of the current net over (X, T), averaged per case.

        The data is processed in batches of `batch_size` rows; the last batch
        also takes the remaining rows. At least one batch is evaluated for
        non-empty data, so inputs with fewer than `batch_size` rows are not
        silently reported as zero loss.

        X: data matrix, one case per row.
        T: targets, sliced with the same row ranges as X.
        """
        n_total = X.shape[0]
        n_batches = n_total // batch_size
        if n_batches < 1 and n_total > 0:
            # fewer cases than one full batch: evaluate them as a single batch
            n_batches = 1

        loss = 0
        for i in range(n_batches):
            gnp.free_reuse_cache()
            i_start = i * batch_size
            if i < n_batches - 1:
                i_end = i_start + batch_size
            else:
                # the final batch absorbs the remainder
                i_end = n_total

            self._forward(X[i_start:i_end])
            loss += self.output.loss(T[i_start:i_end])

        return loss / n_total

    def train(self):
        config = self.config

        # convert t into a matrix in 1-of-K representation if it is a vector
        t = self.train_data.T
        T_matrix = self.output.act_type.label_vec_to_mat(t, self.train_data.K)

        layer_config = LayerConfig()
        layer_config.learn_rate = config.learn_rate
        layer_config.momentum = config.init_momentum
        layer_config.weight_decay = config.weight_decay

        nnstore = NNStore()
        nnstore.init_from_net(self)

        best_net = NNStore()
        best_net.init_from_net(self)

        train_acc, val_acc, test_acc = self.display_training_info(
                -1, 
                self._compute_loss(
                    self.train_data.X, T_matrix, config.minibatch_size),
                0)
        acc_rec = np.zeros((config.num_epochs / config.epoch_to_display + 1, 4))
        acc_rec[0, 0] = 0
        acc_rec[0, 1] = train_acc
        if config.is_val:
            acc_rec[0, 2] = val_acc
        if config.is_test:
            acc_rec[0, 3] = test_acc

        t_start = time.time()

        best_acc = val_acc
        if self.config.is_test:
            best_test_acc = test_acc
        best_epoch = -1

        for epoch in range(0, config.num_epochs):
            gnp.free_reuse_cache()

            # decrease learning rate over time
            layer_config.learn_rate = config.learn_rate / \
                    (epoch / config.lr_drop_rate + 1)

            # TODO [dirty] special for Lnsvm
            if isinstance(self.output.act_type, act.LnsvmVariantOutput):
                #self.output.act_type.n = 3.0 - (3.0 - 0.5) / 50 * epoch
                self.output.act_type.n = 0.5
                if self.output.act_type.n < 0.5:
                    self.output.act_type.n = 0.5 

                if (epoch + 1) % config.epoch_to_display == 0:
                    print 'n %.4f' % self.output.act_type.n,
            
            if epoch >= config.switch_epoch:
                layer_config.momentum = config.final_momentum

            # shuffle the dataset 
            idx = np.random.permutation(self.num_total_cases)
            #idx = np.arange(self.num_total_cases)
            train_X = self.train_data.X[idx]
            train_T = T_matrix[idx]

            if config.input_noise > 0:
                train_X = train_X * (gnp.rand(train_X.shape) > config.input_noise)
                # train_X = train_X + gnp.randn(train_X.shape) * config.input_noise

            loss = 0

            for batch in range(0, self.num_minibatches):
                i_start = batch * config.minibatch_size
                if not batch == self.num_minibatches - 1:
                    i_end = i_start + config.minibatch_size
                else:
                    i_end = self.num_total_cases

                X = train_X[i_start:i_end]
                T = train_T[i_start:i_end]

                # forward pass
                self._forward(X)

                # compute loss
                loss += self.output.loss(T)

                if self.output.Y.isnan().any():
                    import ipdb
                    ipdb.set_trace()
                    print 'batch #%d <-- nan' % batch

                # backprop
                dLdXabove = self.output.backprop(layer_config)
                for i in range(self.num_layers-1, -1, -1):
                    dLdXabove = self.layer[i].backprop(dLdXabove, layer_config)

            # statistics
            avg_loss = 1.0 * loss / self.num_total_cases

            if (epoch + 1) % config.epoch_to_display == 0:
                train_acc, val_acc, test_acc = self.display_training_info(
                        epoch, avg_loss, time.time() - t_start)

                if val_acc == None:
                    val_acc = train_acc

                if (config.show_task_loss and val_acc < best_acc) or \
                        (not config.show_task_loss and val_acc > best_acc):
                    best_acc = val_acc
                    best_net.update_from_net(self)
                    if config.is_test:
                        best_test_acc = test_acc
                    best_epoch = epoch
                t_start = time.time()
                acc_rec[(epoch + 1) / config.epoch_to_display, 0] = epoch + 1
                acc_rec[(epoch + 1) / config.epoch_to_display, 1] = train_acc
                if config.is_val:
                    acc_rec[(epoch + 1) / config.epoch_to_display, 2] = val_acc
                if config.is_test:
                    acc_rec[(epoch + 1) / config.epoch_to_display, 3] = test_acc

            if (epoch + 1) % config.epoch_to_save == 0:
                nnstore.update_from_net(self)
                nnstore.write(config.output_dir + '/m' + str(epoch + 1) + '.pdata')


        print '----------------------------------------------------------------'

        if config.show_task_loss:
            s = 'loss'
        else:
            s = 'acc'
        
        if config.is_val:
            print 'Best val_%s %.4f' % (s, best_acc),
        else:
            print 'Best train_%s %.4f' % (s, best_acc),

        if config.is_test:
            print '--> test_%s %.4f' % (s, best_test_acc),
        print 'at epoch %d' % (best_epoch + 1)

        if config.is_output:
            f = open('%s/acc_rec.pdata' % config.output_dir, 'w')
            pickle.dump(acc_rec, f, -1)
            f.close()

            self.write_config('%s/cfg.txt' % config.output_dir)

            # save the best net
            fname = config.output_dir + '/best_net.pdata'
            print 'Saving the best model to ' + fname
            best_net.write(fname)

        if config.is_test:
            return (best_acc, best_test_acc)
        else:
            return (best_acc)

    def display_training_info(self, epoch, loss, time):
        """Print training information. Use the config information to determine
        what information to display.

        Return a 3-tuple (train acc, val acc, test acc)
        val acc and test acc will be 0 if no validation/test data are given
        """
        if self.config.is_val:
            if self.config.is_test:
                return self._display_training_info(epoch, loss, time,
                        self.train_data.X, self.train_data.T,
                        val_data=self.val_data.X, val_labels=self.val_data.T,
                        test_data=self.test_data.X, test_labels=self.test_data.T)
            else:
                return self._display_training_info(epoch, loss, time,
                        self.train_data.X, self.train_data.T,
                        val_data=self.val_data.X, val_labels=self.val_data.T)
        else:
            if self.config.is_test:
                return self._display_training_info(epoch, loss, time,
                        self.train_data.X, self.train_data.T,
                        test_data=self.test_data.X, test_labels=self.test_data.T)
            else:
                return self._display_training_info(epoch, loss, time,
                        self.train_data.X, self.train_data.T)

    def _display_training_info(self, epoch, loss, time, 
            train_data, train_labels, val_data=None, val_labels=None, 
            test_data=None, test_labels=None):
        """Print training information during training."""
        print 'epoch %d, surrogate loss %.4f,' % (epoch + 1, loss),

        train_acc = 0
        val_acc = None
        test_acc = None
        acc = 0

        # print loss if it is a regression problem
        if self.config.is_regression:
            # TODO [Dirty code]
            #self.predict(train_data)
            #avg_loss = self.output.task_loss(train_labels, self.task_loss_fn)
            avg_loss = np.sqrt(self._compute_loss(train_data, train_labels) * 2)
            print 'train_loss %.4f,' % avg_loss,

            if val_data != None and val_labels != None:
                #self.predict(val_data)
                #avg_loss = self.output.task_loss(val_labels, self.task_loss_fn)
                avg_loss = np.sqrt(self._compute_loss(val_data, val_labels) * 2)
                print 'val_loss %.4f,' % (avg_loss),
                val_acc = avg_loss
            if test_data != None and test_labels != None:
                #self.predict(test_data)
                #avg_loss = self.output.task_loss(test_labels, self.task_loss_fn)
                avg_loss = np.sqrt(self._compute_loss(test_data, test_labels) * 2)
                print 'test_loss %.4f,' % (avg_loss),
                test_acc = avg_loss
        else:
            # print accuracy if it is a classification problem
            ypred = self.predict(train_data)
            if self.config.show_accuracy:
                acc = (ypred == train_labels.squeeze()).mean()
                print 'acc %.4f,' % acc,
            if self.config.show_task_loss:
                acc = self.task_loss[ypred, train_labels].mean()
                print 'loss %.4f,' % acc,

            train_acc = acc

            if val_data != None and val_labels != None:
                ypred = self.predict(val_data)
                if self.config.show_accuracy:
                    acc = (ypred == val_labels.squeeze()).mean()
                    print 'val_acc %.4f,' % acc,
                if self.config.show_task_loss:
                    acc = self.task_loss[ypred, val_labels].mean()
                    print 'val_loss %.4f,' % acc,
                val_acc = acc
            if test_data != None and test_labels != None:
                ypred = self.predict(test_data)
                if self.config.show_accuracy:
                    acc = (ypred == test_labels.squeeze()).mean()
                    print 'test_acc %.4f,' % acc,
                if self.config.show_task_loss:
                    acc = self.task_loss[ypred, test_labels].mean()
                    print 'test_loss %.4f,' % acc,
                test_acc = acc

        if self.config.display_winc:
            self.display_winc()

        print 'time %.2f' % time

        return (train_acc, val_acc, test_acc)

    def display_winc(self):
        """Display scale of weight updates. This can be used by external
        applications."""
        for i in range(0, self.num_layers):
            print 'winc%d %.5f,' % (i+1, gnp.abs(self.layer[i].Winc).max()),
        print 'winc_out %.5f,' % gnp.abs(self.output.Winc).max(),

    def _forward(self, X):
        """Do a forward pass without computing the output and predictions.
        Used as a subroutine for function predict and check_grad."""
        Xbelow = X
        for i in range(self.num_layers):
            Xbelow = self.layer[i].forward(Xbelow)
        self.output.forward(Xbelow)
       
    def predict(self, X):
        """Make prediction using the current network.
        
        X: N*D data matrix

        Return an N-element vector of predicted labels.
        """
        self._forward(X)
        return self.output.predict()

    def forward(self, X):
        """Compute the activation for each class.
        
        X: N*D data matrix

        Return a N*D activation matrix A.
        """
        self._forward(X)
        return self.output.A

    def _backprop(self, config):
        """Backpropagate through the net from the output layer. This will be
        used as an external interface for semi-supervised application, and the
        backprop starts from the `update_weights` method of the output layer,
        rather than the `backprop` method."""
        dLdXabove = self.output.update_weights(config)
        for i in range(self.num_layers-1, -1, -1):
            dLdXabove = self.layer[i].backprop(dLdXabove, config)

    def eval_task_loss(self, X, z, loss):
        """Evaluate the performance of the net using task specific loss.
        Classification problems only.

        X: N*D data matrix
        z: N-d ground truth matrix.
        loss: K*K matrix, K is the number of classes.

        Return the average loss over all datacases.
        """
        y = self.predict(X)
        return loss[z, y].mean()

    def read_data(self, data_file_name):
        """(data_file_name) --> data
        Read from the specified data file, return a data object, which is an
        object with three attributes, X, T and K. X and T are the data and
        target matrices respectively, and K is the dimensionality of the output.
        Each of X and T is a matrix with N rows, N is the number of data
        cases.

        The file must hold a pickled dict with the keys 'data', 'labels' and
        'K'. X is converted to a gnp.garray; T and K are stored as loaded.
        """
        # NOTE(security): pickle.load can execute arbitrary code -- only load
        # data files from trusted sources.
        # Binary mode is what pickle expects; `with` closes the file even if
        # unpickling fails.
        with open(data_file_name, 'rb') as f:
            data_dict = pickle.load(f)

        data = Data()
        data.X = gnp.garray(data_dict['data'])
        data.T = data_dict['labels']
        data.K = data_dict['K']

        return data

    def read_loss(self, loss_file_name):
        """(loss_file_name) --> loss
        Read from the specified loss file and return the unpickled object (a
        loss matrix).
        """
        # NOTE(security): pickle.load can execute arbitrary code -- only load
        # loss files from trusted sources.
        # Binary mode is what pickle expects; `with` closes the file even if
        # unpickling fails.
        with open(loss_file_name, 'rb') as f:
            return pickle.load(f)

    def write_config(self, filename):
        f = open(filename, 'w')
        f.write('%d training cases\n' % self.train_data.X.shape[0])
        if self.config.is_val:
            f.write('%d validation cases\n' % self.val_data.X.shape[0])
        if self.config.is_test:
            f.write('%d test cases\n' % self.test_data.X.shape[0])
        f.write('[' + str(self.output) + ']\n')
        for i in range(self.num_layers-1, -1, -1):
            f.write('[' + str(self.layer[i]) + ']\n')
        f.write('[input ' + str(self.input_dim) + ']\n')

        f.write('learn_rate : ' + str(self.config.learn_rate) + '\n')
        f.write('init_scale : ' + str(self.config.init_scale) + '\n')
        f.write('init_momentum : ' + str(self.config.init_momentum) + '\n')
        f.write('switch_epoch : ' + str(self.config.switch_epoch) + '\n')
        f.write('final_momentum : ' + str(self.config.final_momentum) + '\n')
        f.write('weight_decay : ' + str(self.config.weight_decay) + '\n')
        f.write('minibatch_size : ' + str(self.config.minibatch_size) + '\n')
        f.write('num_epochs : ' + str(self.config.num_epochs) + '\n')
        f.write('epoch_to_save : ' + str(self.config.epoch_to_save) + '\n')

        f.close()

    def display_structure(self):
        print '[' + str(self.output) + ']'
        for i in range(self.num_layers-1, -1, -1):
            print '[' + str(self.layer[i]) + ']'
        print '[input ' + str(self.input_dim) + ']'

    def display(self):
        print '%d training cases' % self.train_data.X.shape[0]
        if self.config.is_val:
            print '%d validation cases' % self.val_data.X.shape[0]
        if self.config.is_test:
            print '%d test cases' % self.test_data.X.shape[0]

        self.display_structure()

        print 'learn_rate : ' + str(self.config.learn_rate)
        print 'init_scale : ' + str(self.config.init_scale)
        print 'init_momentum : ' + str(self.config.init_momentum)
        print 'switch_epoch : ' + str(self.config.switch_epoch)
        print 'final_momentum : ' + str(self.config.final_momentum)
        print 'weight_decay : ' + str(self.config.weight_decay)
        print 'minibatch_size : ' + str(self.config.minibatch_size)
        print 'num_epochs : ' + str(self.config.num_epochs)
        print 'epoch_to_save : ' + str(self.config.epoch_to_save)

        if self.config.is_output:
            print 'output_dir : ' + self.config.output_dir

    def check_grad(self):
        # check the gradient of the 1st layer weights
        import scipy.optimize as opt

        ncases = 100

        def f(w):
            if self.num_layers == 0:
                Wtemp = self.output.W
                self.output.W = gnp.garray(w.reshape(Wtemp.shape))
            else:
                Wtemp = self.layer[0].W
                self.layer[0].W = gnp.garray(w.reshape(Wtemp.shape))

            self._forward(self.train_data.X[:ncases,:])

            Z = self.train_data.T[:ncases]
            Z = self.output.act_type.label_vec_to_mat(Z, self.train_data.K)

            L = self.output.loss(Z) / Z.shape[0]
            if self.num_layers == 0:
                self.output.W = Wtemp
            else:
                self.layer[0].W = Wtemp

            return L

        def fgrad(w):
            if self.num_layers == 0:
                Wtemp = self.output.W
                self.output.W = gnp.garray(w.reshape(Wtemp.shape))
            else:
                Wtemp = self.layer[0].W
                self.layer[0].W = gnp.garray(w.reshape(Wtemp.shape))

            self._forward(self.train_data.X[:ncases,:])

            Z = self.train_data.T[:ncases]
            Z = self.output.act_type.label_vec_to_mat(Z, self.train_data.K)
            self.output.loss(Z)

            self.output.gradient()
            dLdXabove = self.output.dLdXtop
            for i in range(self.num_layers-1, -1, -1):
                self.layer[i].gradient(dLdXabove)
                dLdXabove = self.layer[i].dLdXbelow

            if self.num_layers == 0:
                grad_w = self.output.dLdW
            else:
                grad_w = self.layer[0].dLdW

            if self.num_layers == 0:
                self.output.W = Wtemp
            else:
                self.layer[0].W = Wtemp

            return grad_w.reshape(np.prod(grad_w.shape)).asarray() / Z.shape[0]

        if self.num_layers == 0:
            W = self.output.W
        else:
            W = self.layer[0].W
        W = W.asarray()

        def finite_diff_grad(f, x0):
            eps = 1e-8
            approx = np.zeros(len(x0))
            for i in xrange(len(x0)):
                x0plus = x0.copy()
                x0minus = x0.copy()
                x0plus[i] += eps
                x0minus[i] -= eps
                approx[i] = (f(x0plus) - f(x0minus)) / (2 * eps)
            return approx

        net_grad = fgrad(W.reshape(W.size))
        fd_grad = finite_diff_grad(f, W.reshape(W.size))
        print "wmax: %f" % np.abs(net_grad).max()
        print "finite difference grad scale: %f" % np.abs(fd_grad).max()
        print "check_grad err: %f" % np.sqrt(((fd_grad - net_grad)**2).sum())