def train_classifier(X, y):
    """
    Trains a classifier using best known parameters
    on given data / labels.

    :param X: Samples, a numpy array of (N, n_vis) shape
        where N is the number of samples and n_vis the number
        of visible variables (sample dimensionality).
    :param y: Labels, a numpy array of (N, 1) shape. Each
        label should be a label index.
    """
    # split data into minibatches
    X_mnb, y_mnb = util.create_minibatches(X, y, __CLASS_COUNT * 20)

    # create a DBN and pretrain
    dbn = DBN([32 * 24, 600, 600], __CLASS_COUNT)
    pretrain_params = [[80, 0.05, True, 1, 0.085, 0.1],
                       [80, 0.05, True, 1, 0.000, 0.0]]
    dbn.pretrain(X_mnb, y_mnb, pretrain_params)

    # fine-tuning
    mlp = dbn.to_mlp()
    mlp.train(X_mnb, y_mnb, 1000, 0.1)

    return mlp
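# A minimal sketch of the util.create_minibatches contract assumed by
# train_classifier above (the real implementation lives in util; the
# shuffle behaviour and remainder handling here are assumptions): it
# chops X (and optionally y) into a list of `mnb_size`-row arrays.
def _create_minibatches_sketch(X, y, mnb_size, shuffle=True):
    import numpy as np
    N = (X.shape[0] // mnb_size) * mnb_size
    order = np.random.permutation(X.shape[0])[:N] if shuffle \
        else np.arange(N)
    X_mnb = [X[order[i:i + mnb_size]] for i in range(0, N, mnb_size)]
    if y is None:
        return X_mnb
    y_mnb = [y[order[i:i + mnb_size]] for i in range(0, N, mnb_size)]
    return X_mnb, y_mnb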
def main():
    logging.basicConfig(level=logging.INFO)
    log.info("Testing logistic regression class")

    # generate some data
    # centers, per class, per dimension
    centers = [[1, 1, 1], [1, -1, 1], [-1, 1, -1]]
    cls_count = len(centers)
    n_dim = len(centers[0])
    # standard deviations, per class, per dimension
    # (np.random.normal takes a std dev, not a variance)
    stds = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]
    assert len(stds) == cls_count

    N_per_class = 2500
    N = N_per_class * cls_count
    log.info("Generating data, %d classes, %d samples per class",
             cls_count, N_per_class)
    X = np.zeros((N, n_dim))
    y = np.zeros(N, dtype=np.int32)
    for i in range(N):
        # integer division, so the index is usable as a class label
        cls = i // N_per_class
        y[i] = cls
        for dim in range(n_dim):
            X[i, dim] = np.random.normal(centers[cls][dim], stds[cls][dim])

    log.info("Splitting into train and test sets")
    train_mask = np.random.rand(N) < 0.85
    test_mask = np.logical_not(train_mask)
    X_train = X[train_mask]
    y_train = y[train_mask]
    log.info("%d samples in train set", len(X_train))

    log.info("Creating minibatches")
    X_mnb, y_mnb = util.create_minibatches(X_train, y_train, cls_count * 10)

    log.info("Fitting")
    estimator = LogisticRegression(T.matrix("input"), n_dim, cls_count)
    log.info("Init acc: %.2f",
             util.acc(y[test_mask], estimator.predict(X[test_mask])))
    for i in range(10):
        estimator.train(X_mnb, y_mnb, 1, 0.1)
        # validate
        acc = util.acc(y[test_mask], estimator.predict(X[test_mask]))
        log.info("Current acc: %.2f", acc)
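# util.acc is used above as plain classification accuracy; a one-line
# reference sketch (an assumption about util's implementation):
def _acc_sketch(y_true, y_pred):
    import numpy as np
    return np.mean(np.asarray(y_true) == np.asarray(y_pred))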
def evaluate(self, x, mnb_size):
    """
    Evaluates model cost on given samples
    and returns the mean.

    :param x: Samples, a numpy array of shape
        (N, model_input).
    :param mnb_size: Minibatch size, necessary because
        evaluating all the samples in 'x' at once might
        be too memory demanding.

    :return: Mean cost of samples in 'x'.
    """
    # compile the Theano evaluation function only once, then cache it
    evaluate_f = getattr(self, "_evaluate", None)
    if evaluate_f is None:
        evaluate_f = theano.function([self.input], self.cost)
        self._evaluate = evaluate_f

    # take into account possibly unbalanced mnb sizes
    return np.sum([
        evaluate_f(mnb) * mnb.shape[0] for mnb
        in util.create_minibatches(x, None, mnb_size, False)
    ]) / x.shape[0]
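# Usage sketch for evaluate (the `model` instance and the 128 minibatch
# size are hypothetical): each minibatch's mean cost is weighted by its
# row count before summing, so a smaller trailing batch does not skew
# the overall mean.
#
#   mean_cost = model.evaluate(X_test, mnb_size=128)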
def test_dbn():
    log.info('Testing DBN')

    # trainset loading
    cls_count = 9
    X, y, classes = get_data(cls_count=None)
    X_mnb, y_mnb = util.create_minibatches(X, y, 20 * cls_count)

    # lin_eps = util.lin_reducer(0.05, 0.002, 20)
    dbn = DBN([32 * 24, 588, 588], cls_count)
    dbn.train(X_mnb, y_mnb, [
        {'epochs': 50, 'eps': 0.05, 'spars': 0.05, 'spars_cost': 0.3},
        {'epochs': 1, 'eps': 0.05}
    ])
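# util.lin_reducer (commented out in test_dbn) presumably builds a
# learning rate that decays linearly from `start` to `end` over `epochs`
# epochs; a plausible sketch, not necessarily util's implementation:
def _lin_reducer_sketch(start, end, epochs):
    def eps(epoch):
        if epoch >= epochs:
            return end
        return start + (end - start) * epoch / float(epochs)
    return eps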
def test_rbm():
    log.info('Testing RBM')

    rbm = RBM(32 * 24, 100)
    analysis.display_RBM(rbm, 32, 24)

    # trainset loading
    cls_count = 9
    X, y, classes = get_data(cls_count=cls_count)

    # train the RBM for a while!
    X_mnb = util.create_minibatches(X, None, 20 * cls_count)
    cost, time, hid_act = rbm.train(
        X_mnb, epochs=5, eps=0.05, spars=0.05, spars_cost=6.0)

    analysis.display_RBM(rbm, 32, 24)
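# The 'spars' / 'spars_cost' arguments suggest a sparsity penalty that
# pushes mean hidden activations toward the target `spars`; a common
# formulation (an assumption, not necessarily what RBM.train implements):
def _sparsity_penalty_sketch(hid_act_mean, spars, spars_cost):
    import numpy as np
    return spars_cost * np.sum((hid_act_mean - spars) ** 2)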
def get_data():
    """
    Returns the data for the workflow: a tuple of two
    dicts (data_train, data_test).

    data_train maps class counts (integers indicating how
    many classes are used) to a tuple of form (X_mnb, y_mnb)
    where X_mnb are data samples split into minibatches and
    y_mnb are corresponding labels.

    data_test maps class counts to a tuple of form (X, y)
    where X are data samples and y are corresponding labels,
    NOT split into minibatches.

    The data is lazily initialized into the global __data
    variable.
    """
    global __data
    if __data is None:

        X, y, classes = raw_data
        log.info('Read %d samples', len(y))

        def data_subset(cls_count):
            cls = ['A', 'B', 'C', 'D', 'E', 'F', 'X', '_BLANK', '_UNKNOWN']
            cls_subs = cls[:cls_count]
            log.info('Taking a subset of data containing classes %r',
                     cls_subs)
            bool_mask = np.array([(classes[ind] in cls_subs) for ind in y])
            X_subs = X[bool_mask]
            y_subs = y[bool_mask]
            log.info('Subset has %d elements', len(X_subs))
            return X_subs, y_subs

        # splitting the trainset into train / test
        test_size = 0.1
        test_indices = np.array(
            np.random.binomial(1, test_size, len(X)), dtype=bool)
        train_indices = np.logical_not(test_indices)

        # create dicts of data subsets. each dict has form:
        # {class_count: (X, y)}
        # note that X and y are for 'train' data split into
        # minibatches, but for 'test' they are not
        data_train = {}
        data_test = {}
        for cls_cnt in [1, 3, 7, 9]:
            # get data subset
            X_subs, y_subs = data_subset(cls_cnt)
            N = len(X_subs)
            # split data subset into train and test
            X_subs_train = X_subs[train_indices[:N]]
            y_subs_train = y_subs[train_indices[:N]]
            X_subs_test = X_subs[test_indices[:N]]
            y_subs_test = y_subs[test_indices[:N]]
            data_train[cls_cnt] = util.create_minibatches(
                X_subs_train, y_subs_train, 20 * cls_cnt)
            data_test[cls_cnt] = (X_subs_test, y_subs_test)

        __data = (data_train, data_test)

    return __data
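# Usage sketch for the lazily initialized data (the class counts match
# the [1, 3, 7, 9] subsets built above; variable names are illustrative):
#
#   data_train, data_test = get_data()
#   X_mnb, y_mnb = data_train[9]   # minibatched train data, 9 classes
#   X_test, y_test = data_test[9]  # un-batched test data, 9 classes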