def _fit_resample(self, X, y):
    random_state = check_random_state(self.random_state)
    idx_under = np.empty((0, ), dtype=int)

    for target_class in np.unique(y):
        if target_class in self.sampling_strategy_.keys():
            n_samples = self.sampling_strategy_[target_class]
            index_target_class = random_state.choice(
                range(np.count_nonzero(y == target_class)),
                size=n_samples,
                replace=self.replacement)
        else:
            index_target_class = slice(None)

        idx_under = np.concatenate(
            (idx_under,
             np.flatnonzero(y == target_class)[index_target_class]),
            axis=0)

    if self.return_indices:
        return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                idx_under)
    else:
        return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
def _fit_resample(self, X, y):
    if self.return_indices:
        deprecate_parameter(self, '0.4', 'return_indices',
                            'sample_indices_')
    random_state = check_random_state(self.random_state)
    target_stats = Counter(y)

    sample_indices = range(X.shape[0])
    for class_sample, num_samples in self.sampling_strategy_.items():
        target_class_indices = np.flatnonzero(y == class_sample)
        indices = random_state.randint(
            low=0, high=target_stats[class_sample], size=num_samples)
        sample_indices = np.append(sample_indices,
                                   target_class_indices[indices])

    self.sample_indices_ = np.array(sample_indices)

    if self.return_indices:
        return (safe_indexing(X, sample_indices),
                safe_indexing(y, sample_indices), sample_indices)
    return (safe_indexing(X, sample_indices),
            safe_indexing(y, sample_indices))
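# The two _fit_resample bodies above are the internals of imbalanced-learn's
# random under- and over-samplers. A hedged usage sketch of the public API
# they sit behind (imports and version assumed, not taken from the snippets):

import numpy as np
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

X = np.arange(20).reshape(10, 2)
y = np.array([0] * 8 + [1] * 2)            # imbalanced labels: 8 vs 2

rus = RandomUnderSampler(random_state=0)   # drops majority-class samples
X_under, y_under = rus.fit_resample(X, y)
ros = RandomOverSampler(random_state=0)    # duplicates minority-class samples
X_over, y_over = ros.fit_resample(X, y)

print(Counter(y_under))                    # Counter({0: 2, 1: 2})
print(Counter(y_over))                     # Counter({0: 8, 1: 8})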
def test_safe_indexing_mock_pandas():
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    X_df = MockDataFrame(X)
    inds = np.array([1, 2])
    X_df_indexed = safe_indexing(X_df, inds)
    # index the underlying array directly so the comparison is meaningful
    X_indexed = safe_indexing(X, inds)
    assert_array_equal(np.array(X_df_indexed), X_indexed)
def test_safe_indexing_axis_0(asarray):
    X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    inds = np.array([1, 2]) if asarray else [1, 2]
    X_inds = safe_indexing(X, inds)
    X_arrays = safe_indexing(np.array(X), inds)
    assert_array_equal(np.array(X_inds), X_arrays)
    assert_array_equal(np.array(X_inds), np.array(X)[inds])
def sample_data(data, train_idx, test_idx):
    sample = bunch.Bunch(train=bunch.Bunch(), test=bunch.Bunch(),
                         target_names=None)
    # sample.target_names = data.target_names
    # sample.train.data = safe_indexing(data.train.data, train_idx)
    sample.train.target = safe_indexing(data.train.target, train_idx)
    sample.train.bow = safe_indexing(data.train.bow, train_idx)
    sample.train.remaining = []
    sample.train.validation = []
    sample.train.revisit = []
    sample.train.snippets = safe_indexing(data.train.snippets, train_idx)
    sample.train.sizes = safe_indexing(data.train.sizes, train_idx)
    sample.train.snippet_cost = safe_indexing(data.train.snippet_cost,
                                              train_idx)

    if len(test_idx) > 0:  # if there are test indexes
        # sample.test.data = safe_indexing(data.train.data, test_idx)
        sample.test.target = safe_indexing(data.train.target, test_idx)
        # use the test indexes consistently for every test field (the
        # original code reused train_idx here, which was a copy-paste bug)
        sample.test.bow = safe_indexing(data.train.bow, test_idx)
        sample.test.snippets = safe_indexing(data.train.snippets, test_idx)
        sample.test.sizes = safe_indexing(data.train.sizes, test_idx)
        sample.test.snippet_cost = safe_indexing(data.train.snippet_cost,
                                                 test_idx)
    else:
        sample.test = data.test

    return sample.train, sample.test
def generate_train_set(self, train_size=None, test_size=None,
                       rand_state=None):
    """
    :param test_size:
    :param rand_state:
    :param train_size: float or int (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples.
    :return:
    """
    # self.probe.clear()
    # self.gallery.clear()
    if train_size is None and test_size is None:
        self.probe.files_train, self.probe.files_test = [], self.probe.files
        self.gallery.files_train, self.gallery.files_test = \
            [], self.gallery.files
        self.train_indexes, self.test_indexes = \
            [], list(range(0, len(self.probe.files)))
    else:
        n_samples = len(self.probe.files)
        cv = ShuffleSplit(n_samples, test_size=test_size,
                          train_size=train_size, random_state=rand_state)
        train_indexes, test_indexes = next(iter(cv))
        arrays = [self.probe.files, self.gallery.files]
        (self.probe.files_train, self.probe.files_test,
         self.gallery.files_train, self.gallery.files_test) = \
            list(chain.from_iterable(
                (safe_indexing(a, train_indexes),
                 safe_indexing(a, test_indexes)) for a in arrays))
        self.train_indexes, self.test_indexes = train_indexes, test_indexes
    self.train_size = len(self.train_indexes)
    self.test_size = len(self.test_indexes)
def _split_fit_score_trial(self, X, y, idx=0):
    """
    Splits the dataset, fits a clone of the estimator, then scores it
    according to the required metrics.

    The index of the split is added to the random_state if the
    random_state is not None; this ensures that every split is shuffled
    differently but in a deterministic fashion for testing purposes.
    """
    random_state = self.random_state
    if random_state is not None:
        random_state += idx

    splitter = self._check_cv(self.cv, random_state)
    for train_index, test_index in splitter.split(X, y):
        # Safe indexing handles multiple types of inputs including
        # DataFrames and structured arrays - required for generic splits.
        X_train = safe_indexing(X, train_index)
        y_train = safe_indexing(y, train_index)
        X_test = safe_indexing(X, test_index)
        y_test = safe_indexing(y, test_index)

        model = clone(self.estimator)
        model.fit(X_train, y_train)

        if hasattr(model, "predict_proba"):
            # Get the probabilities for the positive class
            y_scores = model.predict_proba(X_test)[:, 1]
        else:
            # Use the decision function to get the scores
            y_scores = model.decision_function(X_test)

        # Compute the curve metrics and thresholds
        curve_metrics = precision_recall_curve(y_test, y_scores)
        precision, recall, thresholds = curve_metrics

        # Compute the F-beta score from precision and recall.
        # No need to warn here; precision/recall would have warned already.
        with np.errstate(divide='ignore', invalid='ignore'):
            beta = self.fbeta ** 2  # beta squared in the F-beta formula
            f_score = ((1 + beta) * precision * recall /
                       (beta * precision + recall))

        # Ensure thresholds ends at 1
        thresholds = np.append(thresholds, 1)

        # Compute the queue rate
        queue_rate = np.array([
            (y_scores >= threshold).mean() for threshold in thresholds
        ])

        yield {
            'thresholds': thresholds,
            'precision': precision,
            'recall': recall,
            'fscore': f_score,
            'queue_rate': queue_rate,
        }
def test_safe_indexing():
    X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    inds = np.array([1, 2])
    X_inds = safe_indexing(X, inds)
    X_arrays = safe_indexing(np.array(X), inds)
    assert_array_equal(np.array(X_inds), X_arrays)
    assert_array_equal(np.array(X_inds), np.array(X)[inds])
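# For context, a simplified sketch of the dispatch that safe_indexing
# performs for row selection. This is an approximation, not the exact
# scikit-learn implementation, which also handles read-only index arrays
# and (in later versions) axis=1 column selection.

def safe_indexing_sketch(X, indices):
    if hasattr(X, "iloc"):
        # pandas objects are indexed positionally through .iloc
        return X.iloc[indices]
    elif hasattr(X, "shape"):
        # numpy arrays and sparse matrices support fancy indexing
        return X[indices]
    else:
        # plain Python sequences fall back to a list comprehension
        return [X[idx] for idx in indices]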
def __data_generation(self, indices_head, indices_tail):
    # sample per-pair mixing coefficients from a Beta distribution (mixup)
    l = np.random.beta(self.alpha, self.alpha, self.batch_size)
    X_l = l.reshape(self.batch_size, 1, 1, 1)
    y_l = l.reshape(self.batch_size, 1)

    X1_tmp = safe_indexing(self.X, indices_head)
    X2_tmp = safe_indexing(self.X, indices_tail)
    n, _, w, _ = X1_tmp.shape
    X1 = np.zeros((n, w, w, 1))
    X2 = np.zeros((n, w, w, 1))
    for i in range(self.batch_size):
        X1[i] = crop_image(X1_tmp[i])
        X2[i] = crop_image(X2_tmp[i])
    X = X1 * X_l + X2 * (1.0 - X_l)

    y1 = safe_indexing(self.y, indices_head)
    y2 = safe_indexing(self.y, indices_tail)
    y = y1 * y_l + y2 * (1.0 - y_l)

    if self.datagen is not None:
        for i in range(self.batch_size):
            X[i] = self.datagen.random_transform(X[i])
            X[i] = self.datagen.standardize(X[i])

    return X, y
def _safe_split(estimator, X, y, indices, train_indices=None):
    """Create subset of dataset and properly handle kernels."""
    from sklearn.gaussian_process.kernels import Kernel as GPKernel

    if (hasattr(estimator, 'kernel') and callable(estimator.kernel)
            and not isinstance(estimator.kernel, GPKernel)):
        # cannot compute the kernel values with custom function
        raise ValueError("Cannot use a custom kernel function. "
                         "Precompute the kernel matrix instead.")

    if not hasattr(X, "shape"):
        if getattr(estimator, "_pairwise", False):
            raise ValueError("Precomputed kernels or affinity matrices have "
                             "to be passed as arrays or sparse matrices.")
        X_subset = [X[index] for index in indices]
    else:
        if getattr(estimator, "_pairwise", False):
            # X is a precomputed square kernel matrix
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square kernel matrix")
            if train_indices is None:
                X_subset = X[np.ix_(indices, indices)]
            else:
                X_subset = X[np.ix_(indices, train_indices)]
        else:
            X_subset = safe_indexing(X, indices)

    if y is not None:
        y_subset = safe_indexing(y, indices)
    else:
        y_subset = None

    return X_subset, y_subset
def _safe_split(estimator, X, y, indices, train_indices=None):
    """Create subset of dataset and properly handle kernels."""
    if hasattr(estimator, 'kernel') and callable(estimator.kernel):
        # cannot compute the kernel values with custom function
        raise ValueError("Cannot use a custom kernel function. "
                         "Precompute the kernel matrix instead.")

    if not hasattr(X, "shape"):
        if getattr(estimator, "_pairwise", False):
            raise ValueError("Precomputed kernels or affinity matrices have "
                             "to be passed as arrays or sparse matrices.")
        X_subset = [X[idx] for idx in indices]
    else:
        if getattr(estimator, "_pairwise", False):
            # X is a precomputed square kernel matrix
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square kernel matrix")
            if train_indices is None:
                X_subset = X[np.ix_(indices, indices)]
            else:
                X_subset = X[np.ix_(indices, train_indices)]
        else:
            X_subset = safe_indexing(X, indices)

    if y is not None:
        y_subset = safe_indexing(y, indices)
    else:
        y_subset = None

    return X_subset, y_subset
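# A hedged usage sketch for the pairwise branch of _safe_split: with a
# precomputed Gram matrix, both rows and columns must be sliced, which plain
# safe_indexing would not do. The example assumes scikit-learn's SVC, whose
# _pairwise attribute is true when kernel='precomputed'.

import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X = rng.randn(6, 3)
K = X @ X.T                                   # square Gram (kernel) matrix
train, test = np.array([0, 1, 2, 3]), np.array([4, 5])

est = SVC(kernel='precomputed')               # est._pairwise is True
K_train, _ = _safe_split(est, K, None, train)         # K[train, train]
K_test, _ = _safe_split(est, K, None, test, train)    # K[test, train]
assert K_train.shape == (4, 4) and K_test.shape == (2, 4)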
def _safe_split(depthmaps, offset_points_projected, direction_vectors,
                true_joints, indices):
    depth_subset = safe_indexing(depthmaps, indices)
    offsets_subset = safe_indexing(offset_points_projected, indices)
    directions_subset = safe_indexing(direction_vectors, indices)
    truths_subset = safe_indexing(true_joints, indices)
    return depth_subset, offsets_subset, directions_subset, truths_subset
def sem_cross_validate(estimator, X, y, scoring, n_splits=10,
                       similarities=None):
    # similarities: doc-doc partial similarities, according to shuffled
    # indexes
    cv = KFold(n_splits=n_splits)
    scores = {}
    result_sims = []
    for key, scorer in scoring.items():
        scores["test_" + key] = []

    for i, (train_idxes, test_idxes) in enumerate(cv.split(X)):
        # create new estimator
        cur_estimator = clone(estimator)
        train_X = safe_indexing(X, train_idxes)
        train_y = safe_indexing(y, train_idxes)
        test_X = safe_indexing(X, test_idxes)
        test_y = safe_indexing(y, test_idxes)
        cur_estimator.fit(train_X, train_y)
        if similarities is None:
            rec_perm_lists = cur_estimator.transform(test_X)
            result_sims.append(cur_estimator.sims_)
        else:
            rec_perm_lists = cur_estimator.transform(test_X, similarities[i])
        for key, scorer in scoring.items():
            cur_score = scorer._sign * scorer._score_func(
                test_y, rec_perm_lists, **scorer._kwargs)
            scores["test_" + key].append(cur_score)

    result_sims = np.array(result_sims)
    return scores, result_sims
def _fit_resample(self, X, y):
    n_samples = X.shape[0]

    # convert y to z_score
    y_z = (y - y.mean()) / y.std()

    index0 = np.arange(n_samples)
    index_negative = index0[y_z > self.negative_thres]
    index_positive = index0[y_z <= self.positive_thres]
    index_unclassified = [
        x for x in index0
        if x not in index_negative and x not in index_positive
    ]

    y_z[index_negative] = 0
    y_z[index_positive] = 1
    y_z[index_unclassified] = -1

    ros = RandomOverSampler(sampling_strategy=self.sampling_strategy,
                            random_state=self.random_state,
                            ratio=self.ratio)
    _, _ = ros.fit_resample(X, y_z)
    sample_indices = ros.sample_indices_

    print("Before sampler: %s. Total after: %s" %
          (Counter(y_z), sample_indices.shape))

    self.sample_indices_ = np.array(sample_indices)

    if self.return_indices:
        return (safe_indexing(X, sample_indices),
                safe_indexing(y, sample_indices), sample_indices)
    return (safe_indexing(X, sample_indices),
            safe_indexing(y, sample_indices))
def _fit_resample(self, X, y):
    if self.return_indices:
        deprecate_parameter(self, '0.4', 'return_indices',
                            'sample_indices_')
    random_state = check_random_state(self.random_state)

    idx_under = np.empty((0, ), dtype=int)

    for target_class in np.unique(y):
        if target_class in self.sampling_strategy_.keys():
            n_samples = self.sampling_strategy_[target_class]
            index_target_class = random_state.choice(
                range(np.count_nonzero(y == target_class)),
                size=n_samples,
                replace=self.replacement)
        else:
            index_target_class = slice(None)

        idx_under = np.concatenate(
            (idx_under,
             np.flatnonzero(y == target_class)[index_target_class]),
            axis=0)

    self.sample_indices_ = idx_under

    if self.return_indices:
        return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                idx_under)
    return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
def _safe_split(estimator, X, y, dy, indices, train_indices=None):
    """Create subset of dataset and properly handle kernels.

    Slice X, y according to indices for cross-validation, but take care of
    precomputed kernel-matrices or pairwise affinities / distances.

    If ``estimator._pairwise is True``, X needs to be square and we slice
    rows and columns. If ``train_indices`` is not None, we slice rows using
    ``indices`` (assumed the test set) and columns using ``train_indices``,
    indicating the training set.

    Labels y will always be indexed only along the first axis.

    Parameters
    ----------
    estimator : object
        Estimator to determine whether we should slice only rows or rows and
        columns.

    X : array-like, sparse matrix or iterable
        Data to be indexed. If ``estimator._pairwise is True``,
        this needs to be a square array-like or sparse matrix.

    y : array-like, sparse matrix or iterable
        Targets to be indexed.

    dy : array-like, sparse matrix, iterable or None
        Additional per-sample values, indexed along the first axis in the
        same way as y.

    indices : array of int
        Rows to select from X and y.
        If ``estimator._pairwise is True`` and ``train_indices is None``
        then ``indices`` will also be used to slice columns.

    train_indices : array of int or None, default=None
        If ``estimator._pairwise is True`` and ``train_indices is not None``,
        then ``train_indices`` will be used to slice the columns of X.

    Returns
    -------
    X_subset : array-like, sparse matrix or list
        Indexed data.

    y_subset : array-like, sparse matrix or list
        Indexed targets.

    dy_subset : array-like, sparse matrix, list or None
        Indexed ``dy``, or None if ``dy`` is None.

    """
    if getattr(estimator, "_pairwise", False):
        if not hasattr(X, "shape"):
            raise ValueError("Precomputed kernels or affinity matrices have "
                             "to be passed as arrays or sparse matrices.")
        # X is a precomputed square kernel matrix
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square kernel matrix")
        if train_indices is None:
            X_subset = X[np.ix_(indices, indices)]
        else:
            X_subset = X[np.ix_(indices, train_indices)]
    else:
        X_subset = safe_indexing(X, indices)

    if y is not None:
        y_subset = safe_indexing(y, indices)
    else:
        y_subset = None

    if dy is not None:
        dy_subset = safe_indexing(dy, indices)
    else:
        dy_subset = None

    return X_subset, y_subset, dy_subset
def _safe_split(y, exog, train, test):
    """Performs the CV indexing given the indices"""
    y_train, y_test = y.take(train), y.take(test)
    if exog is None:
        exog_train = exog_test = None
    else:
        exog_train, exog_test = \
            safe_indexing(exog, train), safe_indexing(exog, test)
    return y_train, y_test, exog_train, exog_test
def train(excel_file, text_column, labels_column, train_test_idxs_file,
          n_jobs, model_file, n_accepted_probs, output_file):
    execution_info = pd.DataFrame()
    execution_info['Start date'] = [get_local_time_str()]

    torch.manual_seed(RANDOM_STATE)

    device = torch.device(f'cuda:{torch.cuda.current_device()}'
                          if torch.cuda.is_available() else 'cpu')
    device_str = (f'{device.type}:{device.index} '
                  f'({torch.cuda.get_device_name(device.index)})'
                  if device.type == 'cuda' else device.type)
    print(f'Device: {device_str}')

    df = pd.read_excel(excel_file)
    df = df.fillna('NaN')

    corpus = df[text_column].tolist()
    labels = df[labels_column].tolist()

    train_test_idxs = load_json(train_test_idxs_file)
    train_idxs = train_test_idxs['train_idxs']
    test_idxs = train_test_idxs['test_idxs']

    corpus_train = utils.safe_indexing(corpus, train_idxs)
    corpus_test = utils.safe_indexing(corpus, test_idxs)
    y_train = utils.safe_indexing(labels, train_idxs)
    y_test = utils.safe_indexing(labels, test_idxs)

    train_set = BERTTokenizedDataset(corpus_train, y_train)
    val_set = BERTTokenizedDataset(corpus_test, y_test)
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE,
                              num_workers=n_jobs - 1)
    val_loader = DataLoader(val_set, batch_size=BATCH_SIZE,
                            num_workers=n_jobs - 1)

    assert train_loader.dataset.classes_ == val_loader.dataset.classes_

    net = BERTNeuralNet(len(val_loader.dataset.classes_),
                        freeze_bert=FREEZE_BERT)
    net.load_state_dict(
        torch.load(model_file, map_location=device)['model_state_dict'])
    # drop the final classification layer so the network acts as a
    # feature extractor
    net.additional_layers = nn.Sequential(
        *list(net.additional_layers.children())[0:-1])

    ft = FeatureExtractor(device, net)
    X_train = ft.extract_features(train_loader, 'X_train.pkl', 'X_train.dat')
    X_test = ft.extract_features(val_loader, 'X_test.pkl', 'X_test.dat')

    clfs = [
        ensemble.RandomForestClassifier(n_estimators=100, n_jobs=n_jobs,
                                        random_state=RANDOM_STATE),
        LinearSVC(random_state=RANDOM_STATE),
        dummy.DummyClassifier(strategy='stratified',
                              random_state=RANDOM_STATE, constant=None),
        linear_model.SGDClassifier(loss='modified_huber', max_iter=1000,
                                   tol=1e-3, n_jobs=n_jobs,
                                   random_state=RANDOM_STATE)
    ]

    predictions = {'y_true': y_test}

    for clf in tqdm(iterable=clfs, desc='Fitting classifiers', unit='clf'):
        clf.fit(X_train, y_train)
        dump_pickle(clf, '%s.pkl' % (clf.__class__.__name__))

    for clf in tqdm(iterable=clfs, desc='Obtaining probabilities',
                    unit='clf'):
        y_predict_proba = clf.predict_proba(X_test)
        dicts = predict_proba_to_dicts(clf.classes_, y_predict_proba)
        predictions[clf.__class__.__name__] = dicts

    dump_json(predictions, 'predictions.json')

    execution_info['End date'] = [get_local_time_str()]
    execution_info['Excel file'] = [excel_file]
    execution_info['Text column'] = [text_column]
    execution_info['Label column'] = [labels_column]
    execution_info['Accepted probabilities'] = [n_accepted_probs]
    execution_info['Device'] = [device_str]
    execution_info['Base model'] = [model_file]
    execution_info['Batch size'] = [BATCH_SIZE]

    generate_report(execution_info, predictions, output_file)
def _sample(self, X, y):
    """Resample the dataset.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : array-like, shape (n_samples,)
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
        The array containing the resampled data.

    y_resampled : ndarray, shape (n_samples_new,)
        The corresponding label of `X_resampled`

    """
    self._validate_estimator()

    if self.voting == 'auto':
        if sparse.issparse(X):
            self.voting_ = 'hard'
        else:
            self.voting_ = 'soft'
    else:
        if self.voting in VOTING_KIND:
            self.voting_ = self.voting
        else:
            raise ValueError("'voting' needs to be one of {}. Got {}"
                             " instead.".format(VOTING_KIND, self.voting))

    X_resampled, y_resampled = [], []
    for target_class in np.unique(y):
        if target_class in self.ratio_.keys():
            n_samples = self.ratio_[target_class]
            self.estimator_.set_params(**{'n_clusters': n_samples})
            self.estimator_.fit(X[y == target_class])
            X_new, y_new = self._generate_sample(
                X, y, self.estimator_.cluster_centers_, target_class)
            X_resampled.append(X_new)
            y_resampled.append(y_new)
        else:
            target_class_indices = np.flatnonzero(y == target_class)
            X_resampled.append(safe_indexing(X, target_class_indices))
            y_resampled.append(safe_indexing(y, target_class_indices))

    if sparse.issparse(X):
        X_resampled = sparse.vstack(X_resampled)
    else:
        X_resampled = np.vstack(X_resampled)
    y_resampled = np.hstack(y_resampled)

    return X_resampled, np.array(y_resampled)
def _sample(self, X, y):
    """Resample the dataset.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : array-like, shape (n_samples,)
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
        The array containing the resampled data.

    y_resampled : ndarray, shape (n_samples_new,)
        The corresponding label of `X_resampled`

    """
    self._validate_estimator()

    if self.voting == 'auto':
        if sparse.issparse(X):
            self.voting_ = 'hard'
        else:
            self.voting_ = 'soft'
    else:
        if self.voting in VOTING_KIND:
            self.voting_ = self.voting
        else:
            raise ValueError("'voting' needs to be one of {}. Got {}"
                             " instead.".format(VOTING_KIND, self.voting))

    X_resampled, y_resampled = [], []
    for target_class in np.unique(y):
        if target_class in self.sampling_strategy_.keys():
            n_samples = self.sampling_strategy_[target_class]
            self.estimator_.set_params(**{'n_clusters': n_samples})
            self.estimator_.fit(X[y == target_class])
            X_new, y_new = self._generate_sample(
                X, y, self.estimator_.cluster_centers_, target_class)
            X_resampled.append(X_new)
            y_resampled.append(y_new)
        else:
            target_class_indices = np.flatnonzero(y == target_class)
            X_resampled.append(safe_indexing(X, target_class_indices))
            y_resampled.append(safe_indexing(y, target_class_indices))

    if sparse.issparse(X):
        X_resampled = sparse.vstack(X_resampled)
    else:
        X_resampled = np.vstack(X_resampled)
    y_resampled = np.hstack(y_resampled)

    return X_resampled, np.array(y_resampled)
def _safe_split(X, y, indices):
    """Create subset of dataset"""
    X_subset = safe_indexing(X, indices)

    if y is not None:
        y_subset = safe_indexing(y, indices)
    else:
        y_subset = None

    return X_subset, y_subset
def _fit_score(pipe, param_grid, X, y, train_idx, test_idx, cv_idx):
    """Fit a pipeline and score.

    Parameters
    ----------
    pipe : Estimator
        A scikit-learn pipeline.

    param_grid : ParameterGrid
        A ParameterGrid with all the parameters to try for the pipeline.

    X : ndarray, shape (n_samples, n_features)
        The full dataset.

    y : ndarray, shape (n_samples,)
        The associated target.

    train_idx : ndarray, shape (n_train_samples,)
        The training indexes.

    test_idx : ndarray, shape (n_test_samples,)
        The testing indexes.

    cv_idx : int
        The index of the fold.

    Returns
    -------
    cv_results : dict
        A dictionary containing the score and parameters.

    """
    cv_results = defaultdict(list)
    X_train, y_train = safe_indexing(X, train_idx), y[train_idx]
    X_test, y_test = safe_indexing(X, test_idx), y[test_idx]
    for param in param_grid:
        pipe_cv = clone(pipe)
        pipe_cv.set_params(**param)
        try:
            pipe_cv.fit(X_train, y_train)
        except ValueError:
            continue
        y_pred_proba_train = pipe_cv.predict_proba(X_train)
        y_pred_proba_test = pipe_cv.predict_proba(X_test)
        y_pred_train = pipe_cv.predict(X_train)
        y_pred_test = pipe_cv.predict(X_test)
        cv_results['auc_train_score'].append(
            roc_auc_score(y_train, y_pred_proba_train[:, 1]))
        cv_results['auc_test_score'].append(
            roc_auc_score(y_test, y_pred_proba_test[:, 1]))
        cv_results['bacc_train_score'].append(
            balanced_accuracy_score(y_train, y_pred_train))
        cv_results['bacc_test_score'].append(
            balanced_accuracy_score(y_test, y_pred_test))
        cv_results['cv_idx'].append(cv_idx)
        for k, v in param.items():
            cv_results[k].append(v)
    return cv_results
def fix_target(classes_, target_: np.ndarray, pred_: np.ndarray):
    # map arbitrary class labels onto non-positive integers, then flip the
    # sign so samples whose label matched a known class end up with an
    # index >= 0
    if not np.array_equal(classes_, np.arange(len(classes_))):
        for i_, c_ in enumerate(classes_):
            target_[target_ == c_] = -i_
        target_ *= -1
    keep = np.where(target_ >= 0)[0]
    return safe_indexing(target_, keep), safe_indexing(pred_, keep)
def stratify_split(df, y, cats, ratio):
    keys = df[cats]
    if y.dtype.name[:5] != 'float':
        keys = pd.concat([keys, y], axis=1)
    keys = keys.apply(lambda x: '~'.join([str(j) for j in x.values]), axis=1)
    sss = split_by_cats(train_size=1 - ratio, test_size=ratio)
    train, val = next(sss.split(df, keys))
    x_trn, x_val = safe_indexing(df, train), safe_indexing(df, val)
    y_trn, y_val = safe_indexing(y, train), safe_indexing(y, val)
    return x_trn, y_trn, x_val, y_val
def train_test_split3(*arrays, **options):
    """Split arrays or matrices into random train, test and eval subsets

    Quick utility that wraps input validation and
    ``next(ShuffleSplit().split(X, y))`` and application to input data into a
    single call for splitting (and optionally subsampling) data in a
    oneliner.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_fold : array-like
        The predefined fold assignment for each sample; entries equal to 2
        mark the eval subset. This parameter is required.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : boolean, optional (default=True)
        Whether or not to shuffle the data before splitting.

    Returns
    -------
    splitting : list, length=3 * len(arrays)
        List containing train-test-eval split of inputs.

    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")

    random_state = options.pop('random_state', None)
    shuffleresults = options.pop('shuffle', True)
    test_fold = options.pop('test_fold', None)
    if test_fold is None:
        raise TypeError("Parameter test_fold is required.")
    test_fold = np.array(test_fold, dtype=int)
    test_fold = column_or_1d(test_fold)

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    evalu = np.where(test_fold == 2)[0]
    if shuffleresults:
        rng = check_random_state(random_state)
        rng.shuffle(evalu)

    cv = PredefinedThreeSplit(test_fold=test_fold, shuffle=shuffleresults,
                              random_state=random_state)
    train, test = next(cv.split())
    # print evalu
    if len(evalu) == 0:
        return list(chain.from_iterable(
            (safe_indexing(a, train), safe_indexing(a, test), np.array(0))
            for a in arrays))
    return list(chain.from_iterable(
        (safe_indexing(a, train), safe_indexing(a, test),
         safe_indexing(a, evalu)) for a in arrays))
def test_safe_indexing_pandas():
    try:
        import pandas as pd
    except ImportError:
        raise SkipTest("Pandas not found")
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    X_df = pd.DataFrame(X)
    inds = np.array([1, 2])
    X_df_indexed = safe_indexing(X_df, inds)
    # index the plain array so the DataFrame result is checked against it
    X_indexed = safe_indexing(X, inds)
    assert_array_equal(np.array(X_df_indexed), X_indexed)
def test_safe_indexing_axis_1_sparse(idx, asarray):
    if isinstance(idx, Iterable) and asarray:
        idx = np.asarray(idx)
    X_true = safe_indexing(X_toy, idx, axis=1)
    # scipy matrix will always return a 2D array
    if X_true.ndim == 1:
        X_true = X_true[:, np.newaxis]
    X_sparse = sp.csc_matrix(X_toy)
    assert_array_equal(safe_indexing(X_sparse, idx, axis=1).toarray(),
                       X_true)
def test_safe_indexing_axis_1_pandas(idx_array, idx_df, asarray):
    pd = pytest.importorskip('pandas')
    if asarray and isinstance(idx_array, Iterable):
        idx_array = np.asarray(idx_array)
    if (asarray and
            (not isinstance(idx_df, str) and isinstance(idx_df, Iterable))):
        idx_df = np.asarray(idx_df)
    X_true = safe_indexing(X_toy, idx_array, axis=1)
    X_df = pd.DataFrame(X_toy,
                        columns=['col_{}'.format(i) for i in range(3)])
    assert_array_equal(safe_indexing(X_df, idx_df, axis=1).values, X_true)
def split_dataset(dataset):
    X = dataset.drop(y_col, axis=1)
    y = dataset[y_col]
    test_fold = (fold_pattern * (
        (dataset.shape[0] - 1) // len(fold_pattern) + 1))[:dataset.shape[0]]
    splitter = PredefinedSplit(test_fold)
    for train_index, test_index in splitter.split():
        X_train, X_test = (safe_indexing(X, train_index),
                           safe_indexing(X, test_index))
        y_train, y_test = (safe_indexing(y, train_index),
                           safe_indexing(y, test_index))
    return X_train, y_train, X_test, y_test
def train(excel_file, text_column, labels_column, train_test_idxs_file,
          n_jobs, n_accepted_probs, output_file):
    execution_info = pd.DataFrame()
    execution_info['Start date'] = [get_local_time_str()]

    df = pd.read_excel(excel_file)
    df = df.fillna('NaN')

    preprocessor = Preprocessor()
    corpus = preprocessor.preprocess(df[text_column])
    dump_json(corpus, 'preprocessed_corpus_ELMo.json')
    labels = df[labels_column].tolist()

    train_test_idxs = load_json(train_test_idxs_file)
    train_idxs = train_test_idxs['train_idxs']
    test_idxs = train_test_idxs['test_idxs']

    corpus_train = utils.safe_indexing(corpus, train_idxs)
    corpus_test = utils.safe_indexing(corpus, test_idxs)
    y_train = utils.safe_indexing(labels, train_idxs)
    y_test = utils.safe_indexing(labels, test_idxs)

    ft = FeatureExtractor()
    X_train = ft.extract_features(corpus_train, 'X_train_ELMo.pkl',
                                  'X_train_ELMo.dat')
    X_test = ft.extract_features(corpus_test, 'X_test_ELMo.pkl',
                                 'X_test_ELMo.dat')

    clfs = [
        ensemble.RandomForestClassifier(n_estimators=100, n_jobs=n_jobs,
                                        random_state=RANDOM_STATE),
        LinearSVC(random_state=RANDOM_STATE),
        dummy.DummyClassifier(strategy='stratified',
                              random_state=RANDOM_STATE, constant=None),
        linear_model.SGDClassifier(loss='modified_huber', max_iter=1000,
                                   tol=1e-3, n_jobs=n_jobs,
                                   random_state=RANDOM_STATE)
    ]

    predictions = {'y_true': y_test}

    for clf in tqdm(iterable=clfs, desc='Fitting classifiers', unit='clf'):
        clf.fit(X_train, y_train)
        dump_pickle(clf, '%s.pkl' % (clf.__class__.__name__))

    for clf in tqdm(iterable=clfs, desc='Obtaining probabilities',
                    unit='clf'):
        y_predict_proba = clf.predict_proba(X_test)
        dicts = predict_proba_to_dicts(clf.classes_, y_predict_proba)
        predictions[clf.__class__.__name__] = dicts

    dump_json(predictions, 'predictions.json')

    execution_info['End date'] = [get_local_time_str()]
    execution_info['Excel file'] = excel_file
    execution_info['Text column'] = text_column
    execution_info['Label column'] = labels_column
    execution_info['n_jobs'] = n_jobs
    execution_info['Accepted probabilities'] = n_accepted_probs

    generate_report(execution_info, predictions, output_file)
def generator(X, y, sample_weight, indices, batch_size):
    while True:
        for index in range(0, len(indices), batch_size):
            X_res = safe_indexing(X, indices[index:index + batch_size])
            y_res = safe_indexing(y, indices[index:index + batch_size])
            if issparse(X_res) and not keep_sparse:
                X_res = X_res.toarray()
            if sample_weight is None:
                yield X_res, y_res
            else:
                sw_res = safe_indexing(sample_weight,
                                       indices[index:index + batch_size])
                yield X_res, y_res, sw_res
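# A minimal driving sketch for the closure above. Assumptions: safe_indexing
# and issparse are imported as in the snippet, and keep_sparse is defined in
# the enclosing scope, as it is in imbalanced-learn's
# balanced_batch_generator, where this closure originates.

import numpy as np

keep_sparse = True                       # normally captured from the caller
X = np.arange(20).reshape(10, 2)
y = np.arange(10)

gen = generator(X, y, sample_weight=None, indices=np.arange(10),
                batch_size=4)
X_batch, y_batch = next(gen)             # first batch of 4 samples
assert X_batch.shape == (4, 2)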
def test_safe_indexing_1d_array_error(X_constructor):
    # check that we are raising an error if the array-like passed is 1D and
    # we try to index on the 2nd dimension
    X = list(range(5))
    if X_constructor == 'array':
        X_constructor = np.asarray(X)
    elif X_constructor == 'series':
        pd = pytest.importorskip("pandas")
        X_constructor = pd.Series(X)

    err_msg = "'X' should be a 2D NumPy array, 2D sparse matrix or pandas"
    with pytest.raises(ValueError, match=err_msg):
        safe_indexing(X_constructor, [0, 1], axis=1)
def davies_bouldin_score_eu_cos(X, deltaX, labels):
    X, labels = check_X_y(X, labels)  # check the shapes of X and labels
    deltaX, labels = check_X_y(deltaX, labels)  # check deltaX and labels
    # deltaX = np.diff(X, n=1, axis=1)  # first-order difference of X
    le = LabelEncoder()
    labels = le.fit_transform(labels)  # encode labels as integers 0..k-1
    n_samples, _ = X.shape  # number of samples
    n_labels = len(le.classes_)  # number of labels
    check_number_of_labels(n_labels, n_samples)  # validate the label count

    intra_dists = np.zeros(n_labels)  # initialise intra-cluster distances
    # initialise cluster centroids
    centroids = np.zeros((n_labels, len(X[0])), dtype=float)
    centroids_delta = np.zeros((n_labels, len(deltaX[0])), dtype=float)

    eu_X = pairwise_distances(X, metric='euclidean')
    eu_max, eu_min = np.max(eu_X), np.min(eu_X)
    cos_X = pairwise_distances(deltaX, metric='cosine')
    cos_max, cos_min = np.max(cos_X), np.min(cos_X)
    scaling_ratio = (eu_max - eu_min) / (cos_max - cos_min)

    for k in range(n_labels):
        cluster_k = safe_indexing(X, labels == k)  # samples with label k
        delta_k = safe_indexing(deltaX, labels == k)  # their differences
        centroid = cluster_k.mean(axis=0)  # centroid of class k
        centroid_delta = delta_k.mean(axis=0)  # centroid of the differences
        centroids[k] = centroid
        centroids_delta[k] = centroid_delta
        # average distance of the class-k samples to their centroid
        # (with the given metric)
        a = 0.5
        b = 1 - a
        intra_dist_eu = np.average(
            pairwise_distances(cluster_k, [centroid], metric='euclidean'))
        intra_dist_cos = np.average(
            pairwise_distances(delta_k, [centroid_delta], metric='cosine'))
        intra_dists[k] = (a * intra_dist_eu +
                          b * intra_dist_cos * scaling_ratio)

    # distances between the centroids of different classes
    centroid_distances_eu = pairwise_distances(centroids, metric='euclidean')
    centroid_distances_cos = pairwise_distances(centroids_delta,
                                                metric='cosine')
    centroid_distances = (a * centroid_distances_eu +
                          b * centroid_distances_cos * scaling_ratio)

    # if the intra- or inter-cluster distances are all close to 0, return 0
    if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0):
        return 0.0

    score = (intra_dists[:, None] + intra_dists) / centroid_distances
    score[score == np.inf] = np.nan  # replace infinite values with nan
    return np.mean(np.nanmax(score, axis=1))
def _sample(self, X, y):
    """Resample the dataset.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : array-like, shape (n_samples,)
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
        The array containing the resampled data.

    y_resampled : ndarray, shape (n_samples_new,)
        The corresponding label of `X_resampled`

    idx_under : ndarray, shape (n_samples, )
        If `return_indices` is `True`, an array containing the indices of
        the selected samples will also be returned.

    """
    random_state = check_random_state(self.random_state)
    idx_under = np.empty((0, ), dtype=int)

    for target_class in np.unique(y):
        if target_class in self.ratio_.keys():
            n_samples = self.ratio_[target_class]
            index_target_class = random_state.choice(
                range(np.count_nonzero(y == target_class)),
                size=n_samples,
                replace=self.replacement)
        else:
            index_target_class = slice(None)

        idx_under = np.concatenate(
            (idx_under,
             np.flatnonzero(y == target_class)[index_target_class]),
            axis=0)

    if self.return_indices:
        return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                idx_under)
    else:
        return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
def _sample(self, X, y):
    # FIXME: uncomment in version 0.6
    # self._validate_estimator()

    X_resampled = X.copy()
    y_resampled = y.copy()

    for class_sample, n_samples in self.sampling_strategy_.items():
        if n_samples == 0:
            continue
        target_class_indices = np.flatnonzero(y == class_sample)
        X_class = safe_indexing(X, target_class_indices)

        self.nn_k_.fit(X_class)
        nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
        X_new, y_new = self._make_samples(X_class, y.dtype, class_sample,
                                          X_class, nns, n_samples, 1.0)

        if sparse.issparse(X_new):
            X_resampled = sparse.vstack([X_resampled, X_new])
            sparse_func = 'tocsc' if X.format == 'csc' else 'tocsr'
            X_resampled = getattr(X_resampled, sparse_func)()
        else:
            X_resampled = np.vstack((X_resampled, X_new))
        y_resampled = np.hstack((y_resampled, y_new))

    return X_resampled, y_resampled
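# The _sample body above is the core SMOTE loop: fit a k-NN model per class,
# then interpolate new points between each minority sample and its
# neighbours. A hedged sketch of the public imbalanced-learn entry point
# (dataset and parameter values are illustrative only):

from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, weights=[0.9, 0.1],
                           random_state=0)
X_res, y_res = SMOTE(k_neighbors=5, random_state=0).fit_resample(X, y)
print(Counter(y), Counter(y_res))    # minority class filled up to parity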
def _index_param_value(X, v, indices):
    """Private helper function for parameter value indexing."""
    if not _is_arraylike(v) or _num_samples(v) != _num_samples(X):
        # pass through: skip indexing
        return v
    if sp.issparse(v):
        v = v.tocsr()
    return safe_indexing(v, indices)
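# Illustrative behaviour of _index_param_value, assuming the scikit-learn
# helpers _is_arraylike and _num_samples from the same module are in scope:
# per-sample fit parameters are sliced to the fold, anything else passes
# through untouched.

import numpy as np

X = np.zeros((5, 2))
train_idx = np.array([0, 2, 4])

w = np.ones(5)                                              # one per sample
assert _index_param_value(X, w, train_idx).shape == (3,)    # sliced
assert _index_param_value(X, 'auto', train_idx) == 'auto'   # passed through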
def _fit_resample(self, X, y):
    # check for deprecated random_state
    if self.random_state is not None:
        deprecate_parameter(self, '0.4', 'random_state')

    # Find the nearest neighbour of every point
    nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs)
    nn.fit(X)
    nns = nn.kneighbors(X, return_distance=False)[:, 1]

    links = self.is_tomek(y, nns, self.sampling_strategy_)
    idx_under = np.flatnonzero(np.logical_not(links))

    if self.return_indices:
        return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                idx_under)
    else:
        return (safe_indexing(X, idx_under), safe_indexing(y, idx_under))
def extract_param(self, key, x, n):
    if self.cache is not None and (n, key) in self.cache:
        return self.cache[n, key]

    out = safe_indexing(x, self.splits[n][0]) if _is_arraylike(x) else x

    if self.cache is not None:
        self.cache[n, key] = out
    return out
def _sample(self, X, y):
    """Resample the dataset.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : array-like, shape (n_samples,)
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
        The array containing the resampled data.

    y_resampled : ndarray, shape (n_samples_new,)
        The corresponding label of `X_resampled`

    idx_under : ndarray, shape (n_samples, )
        If `return_indices` is `True`, an array containing the indices of
        the selected samples will also be returned.

    """
    # check for deprecated random_state
    if self.random_state is not None:
        deprecate_parameter(self, '0.4', 'random_state')

    # Find the nearest neighbour of every point
    nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs)
    nn.fit(X)
    nns = nn.kneighbors(X, return_distance=False)[:, 1]

    links = self.is_tomek(y, nns, self.ratio_)
    idx_under = np.flatnonzero(np.logical_not(links))

    if self.return_indices:
        return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                idx_under)
    else:
        return (safe_indexing(X, idx_under), safe_indexing(y, idx_under))
def __getitem__(self, index):
    # indices of the samples belonging to the current batch
    batch_indices = self.indices_[index * self.batch_size:
                                  (index + 1) * self.batch_size]
    X_resampled = safe_indexing(self.X, batch_indices)
    y_resampled = safe_indexing(self.y, batch_indices)
    if issparse(X_resampled) and not self.keep_sparse:
        X_resampled = X_resampled.toarray()

    if self.sample_weight is None:
        return X_resampled, y_resampled
    sample_weight_resampled = safe_indexing(self.sample_weight,
                                            batch_indices)
    return X_resampled, y_resampled, sample_weight_resampled
def _extract(self, X, y, n, is_x=True, is_train=True):
    if self.cache is not None and (n, is_x, is_train) in self.cache:
        return self.cache[n, is_x, is_train]

    inds = self.splits[n][0] if is_train else self.splits[n][1]
    result = safe_indexing(X if is_x else y, inds)

    if self.cache is not None:
        self.cache[n, is_x, is_train] = result
    return result
def test_safe_indexing_pandas():
    try:
        import pandas as pd
    except ImportError:
        raise SkipTest("Pandas not found")
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    X_df = pd.DataFrame(X)
    inds = np.array([1, 2])
    X_df_indexed = safe_indexing(X_df, inds)
    # index the plain array so the DataFrame result is checked against it
    X_indexed = safe_indexing(X, inds)
    assert_array_equal(np.array(X_df_indexed), X_indexed)
    # fun with read-only data in dataframes
    # this happens in joblib memmapping
    X.setflags(write=False)
    X_df_readonly = pd.DataFrame(X)
    with warnings.catch_warnings(record=True):
        X_df_ro_indexed = safe_indexing(X_df_readonly, inds)
    assert_array_equal(np.array(X_df_ro_indexed), X_indexed)
def _shuffle(y, groups, random_state):
    """Return a shuffled copy of y, optionally shuffling only within the
    same groups."""
    if groups is None:
        indices = random_state.permutation(len(y))
    else:
        indices = np.arange(len(groups))
        for group in np.unique(groups):
            this_mask = (groups == group)
            indices[this_mask] = random_state.permutation(indices[this_mask])
    return safe_indexing(y, indices)
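# A quick check of the grouped branch, assuming safe_indexing is importable
# as in the snippet: with groups given, labels are only permuted within
# their own group.

import numpy as np

rng = np.random.RandomState(0)
y = np.array([1, 2, 3, 4, 5, 6])
groups = np.array([0, 0, 0, 1, 1, 1])

y_shuffled = _shuffle(y, groups, rng)
# values stay inside their group; only their order changes
assert set(y_shuffled[:3]) == {1, 2, 3}
assert set(y_shuffled[3:]) == {4, 5, 6}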
def _fit_resample(self, X, y):
    if self.return_indices:
        deprecate_parameter(self, '0.4', 'return_indices',
                            'sample_indices_')
    self._validate_estimator()

    random_state = check_random_state(self.random_state)
    target_stats = Counter(y)
    class_minority = min(target_stats, key=target_stats.get)

    idx_under = np.empty((0, ), dtype=int)

    for target_class in np.unique(y):
        if target_class in self.sampling_strategy_.keys():
            # select a sample from the current class
            idx_maj = np.flatnonzero(y == target_class)
            sel_idx_maj = random_state.randint(
                low=0, high=target_stats[target_class],
                size=self.n_seeds_S)
            idx_maj_sample = idx_maj[sel_idx_maj]

            minority_class_indices = np.flatnonzero(y == class_minority)
            C_indices = np.append(minority_class_indices, idx_maj_sample)

            # create the set composed of all minority samples and one
            # sample from the current class.
            C_x = safe_indexing(X, C_indices)
            C_y = safe_indexing(y, C_indices)

            # create the set S by removing the seed, since it will be
            # added back anyway
            idx_maj_extracted = np.delete(idx_maj, sel_idx_maj, axis=0)
            S_x = safe_indexing(X, idx_maj_extracted)
            S_y = safe_indexing(y, idx_maj_extracted)
            self.estimator_.fit(C_x, C_y)
            pred_S_y = self.estimator_.predict(S_x)

            S_misclassified_indices = np.flatnonzero(pred_S_y != S_y)
            idx_tmp = idx_maj_extracted[S_misclassified_indices]
            idx_under = np.concatenate(
                (idx_under, idx_maj_sample, idx_tmp), axis=0)
        else:
            idx_under = np.concatenate(
                (idx_under, np.flatnonzero(y == target_class)), axis=0)

    X_resampled = safe_indexing(X, idx_under)
    y_resampled = safe_indexing(y, idx_under)

    # apply Tomek cleaning
    tl = TomekLinks(sampling_strategy=list(self.sampling_strategy_.keys()))
    X_cleaned, y_cleaned = tl.fit_resample(X_resampled, y_resampled)

    self.sample_indices_ = safe_indexing(idx_under, tl.sample_indices_)
    if self.return_indices:
        return (X_cleaned, y_cleaned, self.sample_indices_)
    return X_cleaned, y_cleaned
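# This is the One-Sided Selection scheme: a 1-NN rule trained on the
# minority class plus a few majority seeds discards easy majority samples,
# and a Tomek-links pass then cleans the boundary. A hedged sketch of the
# usual imbalanced-learn call (dataset is illustrative):

from imblearn.under_sampling import OneSidedSelection
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, weights=[0.9, 0.1],
                           random_state=0)
oss = OneSidedSelection(n_seeds_S=1, random_state=0)
X_res, y_res = oss.fit_resample(X, y)    # majority class is pruned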
def _fit_resample(self, X, y):
    random_state = check_random_state(self.random_state)
    target_stats = Counter(y)

    sample_indices = range(X.shape[0])
    for class_sample, num_samples in self.sampling_strategy_.items():
        target_class_indices = np.flatnonzero(y == class_sample)
        indices = random_state.randint(
            low=0, high=target_stats[class_sample], size=num_samples)
        sample_indices = np.append(sample_indices,
                                   target_class_indices[indices])

    if self.return_indices:
        return (safe_indexing(X, sample_indices),
                safe_indexing(y, sample_indices), sample_indices)
    else:
        return (safe_indexing(X, sample_indices),
                safe_indexing(y, sample_indices))
def _local_parallel_build_trees(sampler, tree, forest, X, y, sample_weight,
                                tree_idx, n_trees, verbose=0,
                                class_weight=None):
    # resample before fitting the tree
    X_resampled, y_resampled = sampler.fit_sample(X, y)
    if sample_weight is not None:
        sample_weight = safe_indexing(sample_weight, sampler.sample_indices_)
    tree = _parallel_build_trees(tree, forest, X_resampled, y_resampled,
                                 sample_weight, tree_idx, n_trees,
                                 verbose=verbose, class_weight=class_weight)
    return sampler, tree
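# The helper above matches the per-tree resampling used by
# imbalanced-learn's balanced forest: each tree is grown on a
# random-under-sampled draw of the data. A hedged usage sketch of the
# public estimator (dataset is illustrative):

from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, weights=[0.9, 0.1],
                           random_state=0)
clf = BalancedRandomForestClassifier(n_estimators=50, random_state=0)
clf.fit(X, y)        # every tree sees a balanced, resampled bootstrap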
def _fit_resample(self, X, y):
    if self.return_indices:
        deprecate_parameter(self, '0.4', 'return_indices',
                            'sample_indices_')
    self._validate_estimator()

    target_stats = Counter(y)
    skf = StratifiedKFold(
        n_splits=self.cv, shuffle=False,
        random_state=self.random_state).split(X, y)
    probabilities = np.zeros(y.shape[0], dtype=float)

    for train_index, test_index in skf:
        X_train = safe_indexing(X, train_index)
        X_test = safe_indexing(X, test_index)
        y_train = safe_indexing(y, train_index)
        y_test = safe_indexing(y, test_index)

        self.estimator_.fit(X_train, y_train)

        probs = self.estimator_.predict_proba(X_test)
        classes = self.estimator_.classes_
        probabilities[test_index] = [
            probs[l, np.where(classes == c)[0][0]]
            for l, c in enumerate(y_test)
        ]

    idx_under = np.empty((0, ), dtype=int)

    for target_class in np.unique(y):
        if target_class in self.sampling_strategy_.keys():
            n_samples = self.sampling_strategy_[target_class]
            threshold = np.percentile(
                probabilities[y == target_class],
                (1. - (n_samples / target_stats[target_class])) * 100.)
            index_target_class = np.flatnonzero(
                probabilities[y == target_class] >= threshold)
        else:
            index_target_class = slice(None)

        idx_under = np.concatenate(
            (idx_under,
             np.flatnonzero(y == target_class)[index_target_class]),
            axis=0)

    self.sample_indices_ = idx_under

    if self.return_indices:
        return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                idx_under)
    return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
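# This implements Instance Hardness Threshold undersampling: out-of-fold
# predicted probabilities rank each sample, and the hardest samples of the
# targeted classes (lowest probability of their own class) are dropped.
# A hedged sketch of the public API (estimator choice is illustrative):

from imblearn.under_sampling import InstanceHardnessThreshold
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, weights=[0.9, 0.1],
                           random_state=0)
iht = InstanceHardnessThreshold(estimator=LogisticRegression(),
                                cv=5, random_state=0)
X_res, y_res = iht.fit_resample(X, y)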
def _fit_resample(self, X, y):
    self._validate_estimator()

    idx_under = np.empty((0, ), dtype=int)

    self.nn_.fit(X)

    for target_class in np.unique(y):
        if target_class in self.sampling_strategy_.keys():
            target_class_indices = np.flatnonzero(y == target_class)
            X_class = safe_indexing(X, target_class_indices)
            y_class = safe_indexing(y, target_class_indices)
            nnhood_idx = self.nn_.kneighbors(
                X_class, return_distance=False)[:, 1:]
            nnhood_label = y[nnhood_idx]
            if self.kind_sel == 'mode':
                nnhood_label, _ = mode(nnhood_label, axis=1)
                nnhood_bool = np.ravel(nnhood_label) == y_class
            elif self.kind_sel == 'all':
                nnhood_label = nnhood_label == target_class
                nnhood_bool = np.all(nnhood_label, axis=1)
            index_target_class = np.flatnonzero(nnhood_bool)
        else:
            index_target_class = slice(None)

        idx_under = np.concatenate(
            (idx_under,
             np.flatnonzero(y == target_class)[index_target_class]),
            axis=0)

    if self.return_indices:
        return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                idx_under)
    else:
        return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
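# The loop above is Edited Nearest Neighbours cleaning: a sample survives
# only if its neighbourhood agrees with its label by majority ('mode') or
# unanimously ('all'). A hedged sketch of the usual call (dataset is
# illustrative):

from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, weights=[0.9, 0.1],
                           random_state=0)
enn = EditedNearestNeighbours(n_neighbors=3, kind_sel='all')
X_res, y_res = enn.fit_resample(X, y)    # noisy majority samples removed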
def _sample_regular(self, X, y):
    """Resample the dataset using the regular SMOTE implementation.

    Use the regular SMOTE algorithm proposed in [1]_.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : array-like, shape (n_samples,)
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
        The array containing the resampled data.

    y_resampled : ndarray, shape (n_samples_new,)
        The corresponding label of `X_resampled`

    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O. Hall, W. P. Kegelmeyer, "SMOTE:
       synthetic minority over-sampling technique," Journal of Artificial
       Intelligence Research, 16, 321-357, 2002.

    """
    X_resampled = X.copy()
    y_resampled = y.copy()

    for class_sample, n_samples in self.ratio_.items():
        if n_samples == 0:
            continue
        target_class_indices = np.flatnonzero(y == class_sample)
        X_class = safe_indexing(X, target_class_indices)

        self.nn_k_.fit(X_class)
        nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
        X_new, y_new = self._make_samples(X_class, class_sample, X_class,
                                          nns, n_samples, 1.0)

        if sparse.issparse(X_new):
            X_resampled = sparse.vstack([X_resampled, X_new])
        else:
            X_resampled = np.vstack((X_resampled, X_new))
        y_resampled = np.hstack((y_resampled, y_new))

    return X_resampled, y_resampled
def _sample(self, X, y):
    """Resample the dataset.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : array-like, shape (n_samples,)
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
        The array containing the resampled data.

    y_resampled : ndarray, shape (n_samples_new,)
        The corresponding label of `X_resampled`

    idx_under : ndarray, shape (n_samples, )
        If `return_indices` is `True`, an array containing the indices of
        the selected samples will also be returned.

    """
    self._validate_estimator()

    idx_under = np.empty((0, ), dtype=int)

    self.nn_.fit(X)

    for target_class in np.unique(y):
        if target_class in self.ratio_.keys():
            target_class_indices = np.flatnonzero(y == target_class)
            X_class = safe_indexing(X, target_class_indices)
            y_class = safe_indexing(y, target_class_indices)
            nnhood_idx = self.nn_.kneighbors(
                X_class, return_distance=False)[:, 1:]
            nnhood_label = y[nnhood_idx]
            if self.kind_sel == 'mode':
                nnhood_label, _ = mode(nnhood_label, axis=1)
                nnhood_bool = np.ravel(nnhood_label) == y_class
            elif self.kind_sel == 'all':
                nnhood_label = nnhood_label == target_class
                nnhood_bool = np.all(nnhood_label, axis=1)
            index_target_class = np.flatnonzero(nnhood_bool)
        else:
            index_target_class = slice(None)

        idx_under = np.concatenate(
            (idx_under,
             np.flatnonzero(y == target_class)[index_target_class]),
            axis=0)

    if self.return_indices:
        return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                idx_under)
    else:
        return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
def _fit_resample(self, X, y):
    self._validate_estimator()

    if self.voting == 'auto':
        if sparse.issparse(X):
            self.voting_ = 'hard'
        else:
            self.voting_ = 'soft'
    else:
        if self.voting in VOTING_KIND:
            self.voting_ = self.voting
        else:
            raise ValueError("'voting' needs to be one of {}. Got {}"
                             " instead.".format(VOTING_KIND, self.voting))

    X_resampled, y_resampled = [], []
    for target_class in np.unique(y):
        if target_class in self.sampling_strategy_.keys():
            n_samples = self.sampling_strategy_[target_class]
            self.estimator_.set_params(**{'n_clusters': n_samples})
            self.estimator_.fit(X[y == target_class])
            X_new, y_new = self._generate_sample(
                X, y, self.estimator_.cluster_centers_, target_class)
            X_resampled.append(X_new)
            y_resampled.append(y_new)
        else:
            target_class_indices = np.flatnonzero(y == target_class)
            X_resampled.append(safe_indexing(X, target_class_indices))
            y_resampled.append(safe_indexing(y, target_class_indices))

    if sparse.issparse(X):
        X_resampled = sparse.vstack(X_resampled)
    else:
        X_resampled = np.vstack(X_resampled)
    y_resampled = np.hstack(y_resampled)

    return X_resampled, np.array(y_resampled, dtype=y.dtype)
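# Here the under-sampling happens through k-means: each targeted class is
# summarised by its cluster centroids (or by real samples voted through
# them). A hedged sketch of the corresponding imbalanced-learn estimator
# (dataset is illustrative):

from imblearn.under_sampling import ClusterCentroids
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, weights=[0.9, 0.1],
                           random_state=0)
cc = ClusterCentroids(voting='soft', random_state=0)
X_res, y_res = cc.fit_resample(X, y)   # majority class -> k-means centroids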