def get_mini_batch(self, batch_size):
    """
    Helper function for sampling mini-batches from the training set.

    Note: random_state needs to be set to None, or the same mini-batch
    will be sampled eternally!

    Parameters
    ----------
    batch_size: int
        Number of elements to return in the mini-batch.

    Returns
    -------
    X: np.ndarray
        A feature matrix subsampled from self.train.
    y: np.ndarray
        A one-hot matrix of class labels subsampled from self.train.
    """
    random_state = check_random_state(None)  # self.random_state)
    n_training_samples = self.train.X.shape[0]
    # randint's upper bound is exclusive, so pass n_training_samples
    # itself; n_training_samples - 1 would make the last training
    # sample unreachable.
    minibatch_indices = random_state.randint(0, n_training_samples,
                                             batch_size)
    return (self.train.X[minibatch_indices, :],
            self.train.y[minibatch_indices, :])
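
# A minimal standalone usage sketch of the sampling above (the toy X,
# y, and batch size are hypothetical): indices are drawn uniformly
# with replacement, so a mini-batch may contain repeated samples.
import numpy as np
from sklearn.utils import check_random_state

rng = check_random_state(None)
X = np.arange(20).reshape(10, 2)        # 10 samples, 2 features
y = np.eye(2)[np.repeat([0, 1], 5)]     # one-hot labels
idx = rng.randint(0, X.shape[0], 4)     # 4 indices; upper bound exclusive
X_batch, y_batch = X[idx, :], y[idx, :]
print(X_batch.shape, y_batch.shape)     # (4, 2) (4, 2)
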
def __init__(self, y, n_folds=5, shuffle=False, random_state=None):
    super(KFoldStratified, self).__init__(len(y), n_folds, shuffle,
                                          random_state)
    self.y = y
    self.idxs = np.arange(len(y))
    self.sort_indx = self.y.argsort()
    if shuffle:
        rng = check_random_state(self.random_state)
        rng.shuffle(self.idxs)
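
# A sketch of the sort-then-deal idea suggested by sort_indx above
# (the fold-iteration code of KFoldStratified is not shown, so this is
# an assumed illustration, not the class's actual method): argsort
# groups equal labels together, and dealing the sorted indices
# round-robin keeps each fold's label mix roughly even.
import numpy as np

y = np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0])
sort_indx = y.argsort()
n_folds = 5
folds = [sort_indx[i::n_folds] for i in range(n_folds)]
for fold in folds:
    print(fold, y[fold])   # every fold contains one 0 and one 1
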
def __init__(self, n, labels, n_folds=3, shuffle=False, random_state=None):
    super(KFoldSubject, self).__init__(n, n_folds, shuffle, random_state)
    self.idxs = np.arange(n)
    self.labels = np.array(labels, copy=True)
    self.n_subs = len(np.unique(self.labels))
    if shuffle:
        rng = check_random_state(self.random_state)
        rng.shuffle(self.idxs)
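
# A standalone sketch of k-fold splitting by subject (KFoldSubject's
# fold iteration is not shown, so this is an assumed illustration):
# whole subjects, not individual samples, are dealt across folds, so
# all samples from one subject land in the same fold.
import numpy as np

labels = np.array(['s1', 's1', 's2', 's2', 's3', 's3', 's4', 's4'])
subjects = np.unique(labels)
n_folds = 2
for fold in range(n_folds):
    test_subjects = subjects[fold::n_folds]
    test_idx = np.where(np.isin(labels, test_subjects))[0]
    train_idx = np.where(~np.isin(labels, test_subjects))[0]
    print(fold, train_idx, test_idx)
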
def __init__(self, n, labels, shuffle=False, random_state=None):
    super(LeaveOneSubjectOut, self).__init__(n, len(np.unique(labels)),
                                             shuffle, random_state)
    self.idxs = np.arange(len(labels))
    self.labels = np.array(labels, copy=True)
    self.n_subs = len(np.unique(self.labels))
    if shuffle:
        rng = check_random_state(self.random_state)
        rng.shuffle(self.idxs)
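
# The limiting case of the above: leave-one-subject-out, where each
# unique subject label becomes the test set of exactly one fold
# (again an assumed standalone illustration, not the class's code).
import numpy as np

labels = np.array(['s1', 's1', 's2', 's2', 's3', 's3'])
for subject in np.unique(labels):
    test_idx = np.where(labels == subject)[0]
    train_idx = np.where(labels != subject)[0]
    print(subject, train_idx, test_idx)
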
def _iter_indices(self):
    rng = check_random_state(self.random_state)
    # random partition of the unique dates
    permutation = rng.permutation(self.n)
    for i in range(self.n_iter):
        ind_test = permutation[i * self.n_test:(i + 1) * self.n_test]
        # use j for the inner comprehensions so the fold index i is
        # never shadowed
        dates_test = [self.unique_dates[j] for j in ind_test]
        dates_train = [self.unique_dates[j] for j in range(self.n)
                       if j not in ind_test]
        yield ([j for j, d in enumerate(self.dates) if d in dates_train],
               [j for j, d in enumerate(self.dates) if d in dates_test])
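
# A runnable sketch of the date-grouped split above with hypothetical
# toy data: whole dates are assigned to either train or test, so no
# single date's events leak across the boundary.
import numpy as np
from sklearn.utils import check_random_state

dates = ['d1', 'd1', 'd2', 'd2', 'd3', 'd3', 'd4', 'd4']
unique_dates = sorted(set(dates))
n, n_test, n_iter = len(unique_dates), 1, 2

rng = check_random_state(0)
permutation = rng.permutation(n)
for i in range(n_iter):
    ind_test = permutation[i * n_test:(i + 1) * n_test]
    dates_test = [unique_dates[j] for j in ind_test]
    train = [j for j, d in enumerate(dates) if d not in dates_test]
    test = [j for j, d in enumerate(dates) if d in dates_test]
    print(train, test)
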
def _get_folds_column(self, length):
    """
    Return special column with indices of folds for all events.
    """
    if self._random_number is None:
        self._random_number = check_random_state(
            self.random_state).randint(0, 100000)
    folds_column = numpy.zeros(length)
    # Pre-0.18 sklearn API: KFold(n, n_folds, ...) is itself iterable
    # and yields (train_indices, test_indices) pairs.
    for fold_number, (_, folds_indices) in enumerate(
            KFold(length, self.n_folds, shuffle=True,
                  random_state=self._random_number)):
        folds_column[folds_indices] = fold_number
    return folds_column
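
# The same fold-column idea with the modern sklearn API (an
# adaptation, not the original class's code): KFold now takes
# n_splits and exposes split() instead of being directly iterable.
import numpy as np
from sklearn.model_selection import KFold

length, n_folds = 10, 5
folds_column = np.zeros(length)
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
for fold_number, (_, fold_indices) in enumerate(kf.split(np.arange(length))):
    folds_column[fold_indices] = fold_number
print(folds_column)
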
def _resample_partition(self, partition):
    rng = check_random_state(self.random_state)
    y = self.y[partition]
    unique_labels, y_inversed = np.unique(y, return_inverse=True)
    label_counts = np.bincount(y_inversed)
    class_share = max(label_counts)
    resampled_partition = np.empty(class_share * len(unique_labels),
                                   dtype=np.int_)
    for i, label in enumerate(unique_labels):
        indices = partition[y == label]
        class_size = len(indices)
        offset = class_share * i
        added = 0
        # Oversample each class (reshuffling between passes) until it
        # matches the size of the largest class.
        while added < class_share:
            rng.shuffle(indices)
            to_add = min(class_share - added, class_size)
            resampled_partition[offset + added:offset + added + to_add] = \
                indices[:to_add]
            added += to_add
    return resampled_partition
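
# A worked toy example of the oversampling scheme above (standalone,
# with hypothetical data): the minority class 1 (2 samples) is
# repeated until it matches the majority class 0 (4 samples), giving
# a balanced 8-element partition.
import numpy as np
from sklearn.utils import check_random_state

rng = check_random_state(0)
y_all = np.array([0, 0, 0, 0, 1, 1])
partition = np.arange(len(y_all))
y = y_all[partition]
labels, counts = np.unique(y, return_counts=True)
class_share = counts.max()
resampled = np.empty(class_share * len(labels), dtype=np.int_)
for i, label in enumerate(labels):
    indices = partition[y == label]
    offset, added = class_share * i, 0
    while added < class_share:
        rng.shuffle(indices)
        to_add = min(class_share - added, len(indices))
        resampled[offset + added:offset + added + to_add] = indices[:to_add]
        added += to_add
print(np.bincount(y_all[resampled]))  # [4 4]: classes now balanced
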
def apply_nmf(data, random_state):
    '''
    Applies non-negative matrix factorization (NMF) to data and
    normalizes the transformed data.

    Returns
    -------
    A tuple of (nmf, transformed_data)
    nmf: An sklearn.NMF instance
    transformed_data: A numpy.ndarray
    '''

    # Apply non-negative matrix factorization to data
    # with the specified parameters.
    nmf = NMF(n_components=60, max_iter=200,
              random_state=random_state).fit(data)
    transformed_data = nmf.transform(data)

    # Normalize the transformed data (L1 norm per row).
    transformed_data = normalize(transformed_data, norm='l1', axis=1)

    return nmf, transformed_data


# In[7]:

nmf, td_norm = apply_nmf(train_data, random_state=check_random_state(0))


# In[8]:

assert_is_instance(nmf, NMF)
assert_is_instance(td_norm, np.ndarray)
assert_equal(nmf.n_components, 60)
assert_equal(nmf.max_iter, 200)
assert_equal(td_norm.shape, (7769, 60))
assert_array_almost_equal(td_norm[0, :5],
                          [0., 0.08515023, 0.01682892, 0., 0.02451052])
assert_array_almost_equal(td_norm[-1, -5:],
                          [0., 0., 0., 0.00342309, 0.])


# ## Topic-based Classification
#
# - Train a LinearSVC classifier on the topics in the training data
#   sample of the Reuters data set (sketched below).
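
# In[ ]:

# A minimal sketch of the classification step announced above (an
# assumption about the next cell, not its actual code). `train_labels`
# and `test_data` are hypothetical names for the Reuters label array
# and the held-out documents.
from sklearn.svm import LinearSVC

clf = LinearSVC(random_state=check_random_state(0))
clf.fit(td_norm, train_labels)
predicted = clf.predict(normalize(nmf.transform(test_data),
                                  norm='l1', axis=1))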