def test_hash_X_y():
    """Check that hash_X_y hashes a strided subsample of large arrays."""
    rng = check_random_state(0)
    X = rng.randn(2000, 20)
    y = np.array([0] * 500 + [1] * 1500)
    expected = (joblib.hash(X[::200, ::2]), joblib.hash(y[::200]))
    assert hash_X_y(X, y, 10, 10) == expected

    X = rng.randn(5, 2)
    y = np.array([0] * 2 + [1] * 3)
    # all data will be used in this case
    assert hash_X_y(X, y) == (joblib.hash(X), joblib.hash(y))
def test_hash_X_y():
    """hash_X_y must hash the subsample of big data, or the full small data."""
    rng = check_random_state(0)
    big_X = rng.randn(2000, 20)
    big_y = np.array([0] * 500 + [1] * 1500)
    assert hash_X_y(big_X, big_y, 10, 10) == (
        joblib.hash(big_X[::200, ::2]),
        joblib.hash(big_y[::200]),
    )

    small_X = rng.randn(5, 2)
    small_y = np.array([0] * 2 + [1] * 3)
    # fewer samples than the requested subsample: everything is hashed
    assert hash_X_y(small_X, small_y) == (joblib.hash(small_X), joblib.hash(small_y))
def test_hash_X_y_pandas():
    """Same contract as test_hash_X_y, but with pandas containers."""
    pd = pytest.importorskip("pandas")
    rng = check_random_state(0)

    frame = pd.DataFrame(rng.randn(2000, 20))
    series = pd.Series([0] * 500 + [1] * 1500)
    assert hash_X_y(frame, series, 10, 10) == (
        joblib.hash(frame.iloc[::200, ::2]),
        joblib.hash(series.iloc[::200]),
    )

    frame = pd.DataFrame(rng.randn(5, 2))
    series = pd.Series([0] * 2 + [1] * 3)
    # all data will be used in this case
    assert hash_X_y(frame, series) == (joblib.hash(frame), joblib.hash(series))
def fit(self, X, y):
    """Find the classes statistics before to perform sampling.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : array-like, shape (n_samples,)
        Corresponding label for each sample in X.

    Returns
    -------
    self : object,
        Return self.
    """
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
    y = check_target_type(y)
    self.X_hash_, self.y_hash_ = hash_X_y(X, y)
    self.ratio_ = check_ratio(self.ratio, y, self._sampling_type)

    # Partition the input space with the wrapped clusterer.
    self.clustering_labels_ = self.clusterer[0][1].fit_predict(X, y)

    # The majority class is the one for which no samples are requested.
    # NOTE(review): assumes exactly one class maps to 0 in ratio_
    # (raises IndexError otherwise) -- presumably guaranteed by
    # check_ratio; confirm.
    zero_sample_classes = [label for label, n_samples in self.ratio_.items()
                           if n_samples == 0]
    majority_label = zero_sample_classes[0]
    minority_labels = [label for label in self.ratio_.keys()
                       if label != majority_label]

    # Clusters imbalance ratios
    # NOTE(review): `minority_labels` and `weights` are currently unused;
    # this looks like work-in-progress -- confirm before removing.
    weights = pd.DataFrame()
    return self
def fit(self, X, y=None):
    """Save the initial input matrix and the number of samples to be removed.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : array-like, shape (n_samples,)
        Corresponding label for each sample in X.

    Returns
    -------
    self : object,
        Return self.
    """
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
    # A ratio of None or exactly 1.0 means "keep everything": no target
    # sample count is computed.
    # NOTE(review): n_samples_ is only defined on the active branch --
    # confirm downstream code guards on ratio_ before reading it.
    if self.ratio is None or self.ratio == 1.0:
        self.ratio_ = None
    else:
        self.ratio_ = self.ratio
        self.n_samples_ = int(self.ratio_ * len(X))
    self.X_hash_, self.y_hash_ = hash_X_y(X, y)
    return self
def fit(self, X, y):
    """Validate the inputs, record their hashes, and run the concrete fit.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : array-like, shape (n_samples,)
        Corresponding label for each sample in X.

    Returns
    -------
    self : object,
        Return self.
    """
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
    y = check_target_type(y)
    # Hashes let later calls detect whether the same data is reused.
    self.X_hash_, self.y_hash_ = hash_X_y(X, y)
    self._fit(X, y)
    return self
def fit(self, X, y):
    """Find the classes statistics to perform sampling.

    Parameters
    ----------
    X : 2d ndarray or scipy sparse matrix, shape [n_samples, n_features]
        Matrix containing the data which have to be sampled.

    y : 1d ndarray, shape [n_samples]
        Corresponding label for each sample in X.

    Returns
    -------
    self
    """
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
    y = check_target_type(y)
    # Record data fingerprints, then resolve the user-supplied ratio
    # into a concrete per-class sampling plan.
    self.X_hash_, self.y_hash_ = hash_X_y(X, y)
    self.ratio_ = check_ratio(self.ratio, y)
    return self
def fit(self, X, y):
    """Find the classes statistics before to perform sampling.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : array-like, shape (n_samples,)
        Corresponding label for each sample in X.

    Returns
    -------
    self : object,
        Return self.
    """
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
    y = check_target_type(y)
    self.ratio_ = self.ratio
    self.X_hash_, self.y_hash_ = hash_X_y(X, y)

    # Build per-class targets: rare classes are over-sampled up to
    # min_freq, frequent classes are cut down to max_freq, classes in
    # between are left untouched.
    # NOTE(review): assumes labels are non-negative ints (np.bincount);
    # check_target_type presumably enforces this -- confirm.
    labels = np.unique(y)
    counts = np.bincount(y)
    under_dict = {}
    over_dict = {}
    for label in labels:
        n_label = counts[label]
        if n_label < self.min_freq:
            n_under, n_over = n_label, self.min_freq
        elif n_label > self.max_freq:
            n_under, n_over = self.max_freq, self.max_freq
        else:
            n_under, n_over = n_label, n_label
        under_dict[label] = n_under
        over_dict[label] = n_over

    self.under_sampler = RandomUnderSampler(ratio=under_dict,
                                            random_state=self.random_state)
    self.over_sampler = RandomOverSampler(ratio=over_dict,
                                          random_state=self.random_state)
    return self
def fit(self, X, y):
    """Mark the sampler as fitted by recording a fingerprint of the data."""
    # Fixed ratio; the stored hash lets later calls detect data reuse.
    self.ratio_ = 1
    self.X_hash_ = hash_X_y(X, y)
    return self
def fit(self, X, y):
    """No-op fit: store a data hash and a unit ratio, then return self."""
    self.ratio_ = 1
    # Hash of (X, y) recorded so subsequent calls can verify the input.
    self.X_hash_ = hash_X_y(X, y)
    return self