def test_random_oversampling_limit_case(plot=False):
    """Execute k-means SMOTE with parameters equivalent to random oversampling"""
    kmeans_smote = KMeansSMOTE(
        random_state=RND_SEED,
        imbalance_ratio_threshold=float('Inf'),
        kmeans_args={
            'n_clusters': 1
        },
        smote_args={
            'k_neighbors': 0
        }
    )
    random_oversampler = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = kmeans_smote.fit_sample(X, Y)
    X_resampled_random_oversampler, y_resampled_random_oversampler = random_oversampler.fit_sample(
        X, Y)

    if plot:
        plot_resampled(X_resampled, y_resampled,
                       'random_oversampling_limit_case_test_kmeans_smote')
        plot_resampled(X_resampled_random_oversampler, y_resampled_random_oversampler,
                       'random_oversampling_limit_case_test_random_oversampling')

    assert_array_equal(X_resampled, X_resampled_random_oversampler)
    assert_array_equal(y_resampled, y_resampled_random_oversampler)
def smote(target_df, target_outcome, grouping):
    y = target_df[target_outcome]
    target_df.drop(target_outcome, axis=1, inplace=True)
    target_df[grouping] = [
        float(x.partition('_')[2]) for x in target_df[grouping]
    ]
    # target_df.drop(grouping, axis=1, inplace=True)
    X = target_df
    target_columns = target_df.columns
    # target_columns = target_columns.insert(0, grouping)
    target_columns = target_columns.insert(len(target_columns), target_outcome)

    kmeans_smote = KMeansSMOTE(kmeans_args={'n_clusters': 5},
                               smote_args={'k_neighbors': 10})
    X_resampled, y_resampled = kmeans_smote.fit_sample(X, y)

    X_resampled = pd.DataFrame(X_resampled)
    y_resampled = pd.DataFrame(y_resampled)
    frames = [X_resampled, y_resampled]
    total_df = pd.concat(frames, axis=1)
    total_df.columns = target_columns
    total_df[grouping] = ['p_' + str(x) for x in total_df[grouping]]
    return total_df
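# Minimal usage sketch for the smote() helper above, not taken from the
# original project: the DataFrame, column names ('participant', 'feature_a',
# 'outcome') and data are hypothetical. It assumes a grouping column whose
# values look like 'p_<number>'. Note that the helper mutates its input
# DataFrame (it drops the outcome column in place).
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
n = 200
example_df = pd.DataFrame({
    'participant': ['p_{}'.format(i % 10) for i in range(n)],
    'feature_a': rng.normal(size=n),
    'outcome': (rng.rand(n) < 0.2).astype(int),   # imbalanced binary outcome
})
balanced_df = smote(example_df, target_outcome='outcome', grouping='participant')
# After oversampling, the two outcome classes should have (roughly) equal counts.
print(balanced_df['outcome'].value_counts())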
def test_smote_fallback(plot=False):
    """Assert that regular SMOTE is applied if no minority clusters are found."""
    kmeans_smote = KMeansSMOTE(
        random_state=RND_SEED,
        kmeans_args={
            'n_clusters': 1
        }
    )
    smote = SMOTE(random_state=RND_SEED)

    with warnings.catch_warnings(record=True) as w:
        X_resampled, y_resampled = kmeans_smote.fit_sample(X, Y)
        assert len(w) == 1
        assert "No minority clusters found" in str(w[0].message)
        assert "Performing regular SMOTE" in str(w[0].message)
        assert issubclass(w[0].category, UserWarning)

    X_resampled_smote, y_resampled_smote = smote.fit_sample(X, Y)

    if plot:
        plot_resampled(X_resampled, y_resampled, 'smote_fallback_test_kmeans_smote')
        plot_resampled(X_resampled_smote, y_resampled_smote, 'smote_fallback_test_smote')

    assert_array_equal(X_resampled, X_resampled_smote)
    assert_array_equal(y_resampled, y_resampled_smote)
def test_smoke(plot=False):
    """Execute k-means SMOTE with default parameters"""
    kmeans_smote = KMeansSMOTE(random_state=RND_SEED)
    X_resampled, y_resampled = kmeans_smote.fit_sample(X, Y)
    assert (np.unique(y_resampled, return_counts=True)[1]
            == np.unique(Y_EXPECTED, return_counts=True)[1]).all()
    assert (X_resampled.shape == X_SHAPE_EXPECTED)
    if plot:
        plot_resampled(X, X_resampled, Y, y_resampled, 'smoke_test')
def test_backwards_compatibility(plot=False):
    """Test if deprecated parameter ratio can still be used without error"""
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DeprecationWarning)
        kmeans_smote = KMeansSMOTE(random_state=RND_SEED, ratio={0: Y.sum()})
        X_resampled, y_resampled = kmeans_smote.fit_sample(X, Y)
    assert (np.unique(y_resampled, return_counts=True)[1]
            == np.unique(Y_EXPECTED, return_counts=True)[1]).all()
    assert (X_resampled.shape == X_SHAPE_EXPECTED)
    if plot:
        plot_resampled(X, X_resampled, Y, y_resampled, 'smoke_test')
def test_smoke_multiclass(plot=False):
    """Execute k-means SMOTE with default parameters for multi-class dataset"""
    kmeans_smote = KMeansSMOTE(random_state=RND_SEED)
    X_resampled, y_resampled = kmeans_smote.fit_sample(X_MULTICLASS, Y_MULTICLASS)
    assert (np.unique(y_resampled, return_counts=True)[1]
            == np.unique(Y_MULTICLASS_EXPECTED, return_counts=True)[1]).all()
    assert (X_resampled.shape == X_MULTICLASS_SHAPE_EXPECTED)
    if plot:
        plot_resampled(X_MULTICLASS, X_resampled, Y_MULTICLASS, y_resampled,
                       'smoke_multiclass_test')
def test_multiclass(plot=False):
    """Execute k-means SMOTE for multi-class dataset with user-defined n_clusters"""
    kmeans_smote = KMeansSMOTE(random_state=RND_SEED,
                               kmeans_args={'n_clusters': 10})
    X_resampled, y_resampled = kmeans_smote.fit_sample(X_MULTICLASS, Y_MULTICLASS)
    assert (np.unique(y_resampled, return_counts=True)[1]
            == np.unique(Y_MULTICLASS_EXPECTED, return_counts=True)[1]).all()
    assert (X_resampled.shape == X_MULTICLASS_SHAPE_EXPECTED)
    if plot:
        plot_resampled(X_MULTICLASS, X_resampled, Y_MULTICLASS, y_resampled,
                       'multiclass_test')
def test_documentation_example():
    """Test basic code example shown in documentation"""
    from imblearn.datasets import fetch_datasets
    datasets = fetch_datasets(filter_data=['oil'])
    X, y = datasets['oil']['data'], datasets['oil']['target']

    labels, counts = np.unique(y, return_counts=True)
    assert counts[0] > counts[1]

    kmeans_smote = KMeansSMOTE(
        kmeans_args={'n_clusters': 100},
        smote_args={'k_neighbors': 10}
    )
    X_resampled, y_resampled = kmeans_smote.fit_sample(X, y)

    labels, counts = np.unique(y_resampled, return_counts=True)
    assert counts[0] == counts[1]
def test_smote_limit_case(plot=False):
    """Execute k-means SMOTE with parameters equivalent to SMOTE"""
    kmeans_smote = KMeansSMOTE(
        random_state=RND_SEED,
        imbalance_ratio_threshold=float('Inf'),
        kmeans_args={'n_clusters': 1}
    )
    smote = SMOTE(random_state=RND_SEED)
    X_resampled, y_resampled = kmeans_smote.fit_sample(X, Y)
    X_resampled_smote, y_resampled_smote = smote.fit_sample(X, Y)

    if plot:
        plot_resampled(X, X_resampled, Y, y_resampled,
                       'smote_limit_case_test_kmeans_smote')
        plot_resampled(X, X_resampled_smote, Y, y_resampled_smote,
                       'smote_limit_case_test_smote')

    assert_array_equal(X_resampled, X_resampled_smote)
    assert_array_equal(y_resampled, y_resampled_smote)
def test_multiclass_irt_dict(plot=False):
    """
    Execute k-means SMOTE for multi-class dataset with different
    imbalance ratio thresholds per class.
    """
    kmeans_smote = KMeansSMOTE(
        random_state=RND_SEED,
        kmeans_args={'n_clusters': 10},
        imbalance_ratio_threshold={
            1: 1,
            2: np.inf
        }
    )
    X_resampled, y_resampled = kmeans_smote.fit_sample(X_MULTICLASS, Y_MULTICLASS)
    assert (np.unique(y_resampled, return_counts=True)[1]
            == np.unique(Y_MULTICLASS_EXPECTED, return_counts=True)[1]).all()
    assert (X_resampled.shape == X_MULTICLASS_SHAPE_EXPECTED)
    if plot:
        plot_resampled(X_MULTICLASS, X_resampled, Y_MULTICLASS, y_resampled,
                       'multiclass_test')
def test_smote_limit_case_multiclass(plot=False):
    """Execute k-means SMOTE with parameters equivalent to SMOTE"""
    kmeans_smote = KMeansSMOTE(
        random_state=RND_SEED,
        imbalance_ratio_threshold=float('Inf'),
        kmeans_args={'n_clusters': 1},
        smote_args={'k_neighbors': 3}
    )
    smote = SMOTE(random_state=RND_SEED, k_neighbors=3)
    X_resampled, y_resampled = kmeans_smote.fit_sample(X_MULTICLASS, Y_MULTICLASS)
    X_resampled_smote, y_resampled_smote = smote.fit_sample(
        X_MULTICLASS, Y_MULTICLASS)

    if plot:
        plot_resampled(X_MULTICLASS, X_resampled, Y_MULTICLASS, y_resampled,
                       'smote_limit_case_multiclass_test_kmeans_smote')
        plot_resampled(X_MULTICLASS, X_resampled_smote, Y_MULTICLASS, y_resampled_smote,
                       'smote_limit_case_multiclass_test_smote')

    assert_array_equal(X_resampled, X_resampled_smote)
    assert_array_equal(y_resampled, y_resampled_smote)
def split_dataset(X_1, y_1, X_0, y_0, test_size=0.2, random_seed=0, smote=True):
    if smote:
        dataX_1, testX_1, datay_1, testy_1 = train_test_split(
            X_1, y_1, test_size=0.16, random_state=random_seed)
        dataX_0, testX_0, datay_0, testy_0 = train_test_split(
            X_0, y_0, test_size=0.082, random_state=random_seed)
        test_X, test_y = shuffled(array_joint(testX_1, testX_0),
                                  array_joint(testy_1, testy_0),
                                  random_seed=random_seed)
        # print(len(testy_1), len(testy_0), len(testy_1) + len(testy_0))
        addX, addy = shuffled(array_joint(dataX_1, dataX_0),
                              array_joint(datay_1, datay_0),
                              random_seed=random_seed)
        kmeans_smote = KMeansSMOTE(kmeans_args={'n_clusters': 2},
                                   smote_args={'k_neighbors': 2},
                                   random_state=random_seed)
        X_resampled, y_resampled = kmeans_smote.fit_sample(addX, addy)
        # datay_1 = np.ones(len(smote_dataX_1), dtype=np.int16)
        train_X, train_y = shuffled(X_resampled, y_resampled,
                                    random_seed=random_seed)
    else:
        dataX_1, testX_1, datay_1, testy_1 = train_test_split(
            X_1, y_1, test_size=0.2, random_state=random_seed)
        dataX_0, testX_0, datay_0, testy_0 = train_test_split(
            X_0, y_0, test_size=0.067, random_state=random_seed)
        test_X, test_y = shuffled(array_joint(testX_1, testX_0),
                                  array_joint(testy_1, testy_0),
                                  random_seed=random_seed)
        train_X, train_y = shuffled(array_joint(dataX_1, dataX_0),
                                    array_joint(datay_1, datay_0),
                                    random_seed=random_seed)
    return train_X, test_X, train_y, test_y
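# Hypothetical usage sketch for split_dataset() above. The original project's
# array_joint() and shuffled() helpers are not shown here, so the versions
# below are assumed stand-ins (concatenate two arrays; jointly shuffle
# features and labels). The imports also cover what split_dataset itself
# relies on; the synthetic two-class data is illustrative only.
import numpy as np
from sklearn.model_selection import train_test_split
from kmeans_smote import KMeansSMOTE


def array_joint(a, b):
    # Assumed behaviour: stack the two inputs along the first axis.
    return np.concatenate([a, b], axis=0)


def shuffled(X, y, random_seed=0):
    # Assumed behaviour: apply the same random permutation to X and y.
    order = np.random.RandomState(random_seed).permutation(len(y))
    return X[order], y[order]


rng = np.random.RandomState(0)
X_1, y_1 = rng.normal(1, 1, size=(50, 4)), np.ones(50, dtype=int)      # minority class
X_0, y_0 = rng.normal(-1, 1, size=(450, 4)), np.zeros(450, dtype=int)  # majority class
train_X, test_X, train_y, test_y = split_dataset(X_1, y_1, X_0, y_0,
                                                 random_seed=0, smote=True)
# The training labels should be (roughly) balanced after k-means SMOTE.
print(np.unique(train_y, return_counts=True))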
from kmeans_smote import KMeansSMOTE
from TestDataGeneration.data_generation import DataGenerator
from experiment import Experiment

if __name__ == "__main__":
    total_number = 10000
    ratio = 0.9
    dg1 = DataGenerator(total_number=total_number, ratio=ratio)
    data_train, label_train = dg1.generate()
    dg2 = DataGenerator(total_number=total_number, ratio=ratio)
    data_test, label_test = dg2.generate()

    Ratio = "minority"
    kmeans_args = {"n_clusters": 20}
    imbalance_ratio_threshold = 20
    smote_args = {"k_neighbors": 10}
    kmeans_smote = KMeansSMOTE(
        ratio=Ratio,
        kmeans_args=kmeans_args,
        smote_args=smote_args,
        imbalance_ratio_threshold=imbalance_ratio_threshold)
    X_resampled, Y_resampled = kmeans_smote.fit_sample(data_train, label_train)

    exp = Experiment(data=X_resampled, label=Y_resampled)
    n_neighbors = [1, 3, 5, 7, 9, 11, 13, 15, 17]
    for neighbor in n_neighbors:
        true_posi, false_posi, true_neg, false_neg = exp.get_confusion_matrix(
            data=data_test, label=label_test, n_neighbors=neighbor)
        print("true_posi:", true_posi, "false_posi:", false_posi,
              "true_neg:", true_neg, "false_neg:", false_neg)
from sklearn.preprocessing import LabelEncoder
import numpy as np
# from imblearn.datasets import fetch_datasets
from kmeans_smote import KMeansSMOTE

ly = LabelEncoder()
y = ly.fit_transform(y)

[
    print('Class {} has {} instances'.format(label, count))
    for label, count in zip(*np.unique(y, return_counts=True))
]

kmeans_smote = KMeansSMOTE(sampling_strategy='minority',
                           kmeans_args={'n_clusters': 100},
                           smote_args={'k_neighbors': 10})
X_resampled, y_resampled = kmeans_smote.fit_sample(X, y)

[
    print('Class {} has {} instances after oversampling'.format(label, count))
    for label, count in zip(*np.unique(y_resampled, return_counts=True))
]

# Splitting Training and Test Set
# Since we have a very small dataset, we will train our model with all available data.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                     test_size=0.2)

from sklearn.svm import SVC
svc1 = SVC(C=50, kernel='rbf', gamma=1)
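# A plausible continuation of the snippet above (assumed, not taken from the
# original source): fit the SVC on the oversampled training split and report
# accuracy on the held-out portion.
from sklearn.metrics import accuracy_score

svc1.fit(x_train, y_train)
y_pred = svc1.predict(x_test)
print('Test accuracy: {:.3f}'.format(accuracy_score(y_test, y_pred)))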