def test_cnn_fit_sample_with_indices():
    """Check ``fit_sample`` output when the sampler also returns indices."""
    sampler = CondensedNearestNeighbour(return_indices=True,
                                        random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Reference values the resampled data, labels and indices must match.
    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [0.05230552, 0.09043907],
        [-1.25020462, -0.40402054],
        [0.70524765, 0.39816382],
        [0.35831463, 1.33483198],
        [-0.284881, -0.62730973],
        [0.03394306, 0.03986753],
        [-0.01252787, 0.34102657],
        [0.15198585, 0.12512646],
    ])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    expected_idx = np.array([4, 11, 17, 12, 19, 9, 5, 7, 14, 18])

    comparisons = (
        (X_resampled, expected_X),
        (y_resampled, expected_y),
        (idx_under, expected_idx),
    )
    for actual, expected in comparisons:
        assert_array_equal(actual, expected)
def under_sampling_algs():
    """Return the list of ``(sampler, short_label)`` pairs to benchmark.

    The first entry is a placeholder pair of strings standing for the
    "no re-sampling" baseline; every other entry pairs a freshly built
    under-sampler with its abbreviation.
    """
    return [
        ("No Rs Undersampling case", "No Re-sampling"),
        (RandomUnderSampler(random_state=1), 'RU'),
        (ClusterCentroids(random_state=1), 'CC'),
        (TomekLinks(), 'TL'),
        (NearMiss(version=1), 'NM1'),
        (NearMiss(version=2), 'NM2'),
        (NearMiss(version=3), 'NM3'),
        (CondensedNearestNeighbour(random_state=1), 'CNN'),
        (OneSidedSelection(random_state=1), 'OSS'),
        (EditedNearestNeighbours(), 'ENN'),
        (NeighbourhoodCleaningRule(), 'NCL'),
        (InstanceHardnessThreshold(random_state=1), 'IHT'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (AllKNN(), 'AllKNN'),
    ]
def equalize_training_dataset_with_CondensedNN(x_train, y_train):
    """Under-sample a training set with ``CondensedNearestNeighbour``.

    ``x_train`` is flattened to 2-D (one row per sample) for the sampler and
    the result is reshaped back so that only the first (sample) dimension
    differs from the input. The per-class counts after resampling are
    printed.

    Returns the pair ``(x_resampled, y_resampled)``.

    NOTE(review): the sampler is asked for 180 samples for each of the
    classes 0..42 through a dict ``sampling_strategy``. Recent
    imbalanced-learn releases appear to reject dicts for cleaning samplers
    such as this one -- confirm against the installed version.
    """
    from imblearn.under_sampling import CondensedNearestNeighbour

    original_shape = list(x_train.shape)

    # The sampler works on 2-D input: collapse everything but the sample axis.
    flat = np.reshape(x_train, (x_train.shape[0], -1))

    per_class_target = dict.fromkeys(range(0, 43), 180)
    sampler = CondensedNearestNeighbour(sampling_strategy=per_class_target,
                                        n_neighbors=5,
                                        n_jobs=8)
    x_resampled, y_resampled = sampler.fit_resample(flat, y_train)
    print(sorted(Counter(y_resampled).items()))

    # Restore the per-sample shape; only the number of samples has changed.
    original_shape[0] = x_resampled.shape[0]
    restored = np.reshape(x_resampled, tuple(original_shape))
    return restored, y_resampled
def load_data(mode: str, normalize: bool = True):
    """Load the dataset, optionally re-sample it, and optionally standardise it.

    Args:
        mode: Re-sampling recipe. ``'smote'``, ``'adasyn'`` and
            ``'bordersmote'`` re-sample the features and labels only;
            ``'randomover'``, ``'randomunder'``, ``'tomek'`` and ``'knn'``
            also return the selected indices, which are used to re-index the
            hidden array. ``'vanilla'`` and any other value leave the data
            as loaded.
        normalize: When true, subtract the per-column mean and divide by the
            per-column standard deviation (in place).

    Returns:
        ``(x, y, hidden)``: float feature matrix, integer labels taken from
        the ``'earnings'`` column, and the hidden frame as an array.
    """
    frame, hidden_frame = __load_data_first_time()

    # Labels come from the 'earnings' column; the rest are the features.
    labels = np.array(frame['earnings'].to_numpy(), dtype=int)
    del frame['earnings']
    features = np.array(frame.to_numpy(), dtype=float)

    hidden = hidden_frame.to_numpy()

    # Factories are wrapped in lambdas so only the requested sampler is built.
    plain_samplers = {
        'smote': lambda: SMOTE(),
        'adasyn': lambda: ADASYN(),
        'bordersmote': lambda: BorderlineSMOTE(),
    }
    index_samplers = {
        'randomover': lambda: RandomOverSampler(return_indices=True),
        'randomunder': lambda: RandomUnderSampler(return_indices=True),
        'tomek': lambda: TomekLinks(return_indices=True),
        'knn': lambda: CondensedNearestNeighbour(return_indices=True, n_neighbors=3),
    }

    if mode in plain_samplers:
        features, labels = plain_samplers[mode]().fit_sample(features, labels)
    elif mode in index_samplers:
        features, labels, kept = index_samplers[mode]().fit_sample(features, labels)
        # Keep the hidden rows aligned with the selected samples.
        hidden = hidden[kept]

    if normalize:
        features -= np.mean(features, axis=0)
        features /= np.std(features, axis=0)

    return features, labels, hidden
def test_cnn_fit_resample():
    """Check the data and labels returned by ``fit_resample``."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Reference values the resampled data and labels must match.
    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [0.05230552, 0.09043907],
        [-1.25020462, -0.40402054],
        [0.70524765, 0.39816382],
        [0.35831463, 1.33483198],
        [-0.284881, -0.62730973],
        [0.03394306, 0.03986753],
        [-0.01252787, 0.34102657],
        [0.15198585, 0.12512646],
    ])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])

    for actual, expected in ((X_resampled, expected_X),
                             (y_resampled, expected_y)):
        assert_array_equal(actual, expected)
def get_data(force_reload=False, strategy='oversampling', test_size=0.15):
    """Return train/validation arrays, re-sampled according to ``strategy``.

    The four arrays are cached as ``.npy`` files under ``DATA_DIR`` with the
    strategy name embedded in the file name. When all four files exist and
    ``force_reload`` is false they are loaded and returned directly.
    Otherwise ``train.csv`` is read, split, the training part is re-sampled
    and the four arrays are written to the cache.

    Recognised strategies are ``'oversampling'``, ``'combine'``,
    ``'undersampling'`` and ``'condensed-undersampling'``; any other value
    leaves the training split untouched.

    Returns:
        ``(X_train, X_val, y_train, y_val)``
    """
    def cache_file(stem):
        return os.path.join(DATA_DIR, '{}.{}.npy'.format(stem, strategy))

    train_data_file = cache_file('train_data')
    train_labels_file = cache_file('train_labels')
    val_data_file = cache_file('val_data')
    val_labels_file = cache_file('val_labels')
    cache_files = (train_data_file, train_labels_file,
                   val_data_file, val_labels_file)

    # Cache hit: every file is present and no rebuild was requested.
    if not force_reload and all(os.path.exists(path) for path in cache_files):
        X_train = np.load(train_data_file)
        y_train = np.load(train_labels_file)
        X_val = np.load(val_data_file)
        y_val = np.load(val_labels_file)
        return X_train, X_val, y_train, y_val

    train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
    X, y = to_data_format(train_df)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size)
    print('Shapes before: {}, {}'.format(X_train.shape, y_train.shape))

    # Only the training split is re-sampled; validation data stays as split.
    if strategy == 'oversampling':
        sampler = SMOTE(n_jobs=n_jobs)
    elif strategy == 'combine':
        smote = SMOTE(n_jobs=n_jobs)
        enn = EditedNearestNeighbours(n_jobs=n_jobs)
        sampler = SMOTEENN(smote=smote, enn=enn)
    elif strategy == 'undersampling':
        sampler = EditedNearestNeighbours(n_jobs=n_jobs)
    elif strategy == 'condensed-undersampling':
        sampler = CondensedNearestNeighbour(n_jobs=n_jobs, n_neighbors=3)
    else:
        sampler = None

    if sampler is not None:
        X_train, y_train = sampler.fit_resample(X_train, y_train)
    print('Shapes after: {}, {}'.format(X_train.shape, y_train.shape))

    for path, array in ((train_data_file, X_train),
                        (train_labels_file, y_train),
                        (val_data_file, X_val),
                        (val_labels_file, y_val)):
        np.save(path, array)

    return X_train, X_val, y_train, y_val
def train_stage(df_path, cb_path):
    """Cross-validate a CatBoost model on CNN-under-sampled training folds.

    Reads the CSV at ``df_path``, uses its ``'target'`` column as labels and
    drops ``'ID_code'`` and ``'target'`` from the features. The rows are
    split into 15 stratified folds (no shuffling). For every fold the
    training part is under-sampled with ``CondensedNearestNeighbour`` and
    handed, together with the untouched validation part, to ``fit_cb``;
    the value ``fit_cb`` returns is accumulated at the validation rows
    (presumably the validation predictions -- confirm against ``fit_cb``).
    Finally the ROC AUC of the accumulated values is printed.

    Args:
        df_path: Path of the training CSV.
        cb_path: Forwarded unchanged to ``fit_cb``.

    Returns:
        Always ``0``.
    """
    print('Load Train Data.')
    df = pd.read_csv(df_path)
    print('\nShape of Train Data: {}'.format(df.shape))

    y_df = np.array(df['target'])
    df_ids = np.array(df.index)
    df.drop(['ID_code', 'target'], axis=1, inplace=True)

    # Materialise the feature matrix once instead of on every fold access.
    features = df.values

    cb_cv_result = np.zeros(df.shape[0])

    # With shuffle=False the split is already deterministic. Passing a
    # random_state alongside it has no effect on older scikit-learn and is
    # rejected with ValueError by scikit-learn >= 0.24, so it is omitted.
    skf = StratifiedKFold(n_splits=15, shuffle=False)

    sm = CondensedNearestNeighbour(random_state=42, n_jobs=3)

    print('\nModel Fitting...')
    for counter, (fit_idx, val_idx) in enumerate(skf.split(df_ids, y_df)):
        print('\nFold {}'.format(counter + 1))
        X_fit, y_fit = features[fit_idx], y_df[fit_idx]
        X_val, y_val = features[val_idx], y_df[val_idx]

        # Only the training fold is under-sampled; validation stays intact.
        # NOTE(review): newer imbalanced-learn releases spell this method
        # ``fit_resample`` -- confirm against the pinned version.
        X_fit, y_fit = sm.fit_sample(X_fit, y_fit)

        print('CatBoost')
        cb_cv_result[val_idx] += fit_cb(X_fit, y_fit, X_val, y_val, counter, cb_path, name='cb')

        # Free the fold copies before the next iteration allocates new ones.
        del X_fit, X_val, y_fit, y_val
        gc.collect()

    auc_cb = round(roc_auc_score(y_df, cb_cv_result), 4)
    print('Catboost VAL AUC: {}'.format(auc_cb))
    return 0
def resample_data(predictors, target, df_data, method):
    """Resample a training dataset prior to training models.

    Args:
        predictors: Column labels of ``df_data`` used as features; also used
            as the column names of the returned feature frame.
        target: Column label of ``df_data`` holding the class labels.
        df_data: Data container indexed by ``predictors`` and ``target``.
        method: Name of the resampling technique, one of the keys of the
            dispatch table below (e.g. ``'smote'``, ``'near-miss'``).

    Returns:
        ``(x_resampled, y_resampled)`` as two ``pandas.DataFrame`` objects.

    Raises:
        ValueError: If ``method`` is not a known technique. (Previously an
            unknown name fell through every branch and crashed later with
            an unbound-variable error.)
    """
    # The factories are wrapped in lambdas on purpose: only the requested
    # sampler is constructed, and only its class name has to be importable.
    factories = {
        'adasyn': lambda: ADASYN(),
        'random-over-sampler': lambda: RandomOverSampler(),
        'smote': lambda: SMOTE(kind='borderline2'),
        'smote-tomek': lambda: SMOTETomek(),
        'smote-enn': lambda: SMOTEENN(),
        'edited-nn': lambda: EditedNearestNeighbours(),
        'repeated-edited-nn': lambda: RepeatedEditedNearestNeighbours(),
        'all-knn': lambda: AllKNN(),
        'one-sided-selection': lambda: OneSidedSelection(),
        'cluster-centroids': lambda: ClusterCentroids(),
        'random-under-sampler': lambda: RandomUnderSampler(),
        'neighbourhood-cleaning-rule': lambda: NeighbourhoodCleaningRule(),
        'condensed-nearest-neighbour': lambda: CondensedNearestNeighbour(),
        'near-miss': lambda: NearMiss(version=1),
        'instance-hardness-threshold': lambda: InstanceHardnessThreshold(),
    }
    if method not in factories:
        raise ValueError(
            'Unknown resampling method {!r}; expected one of: {}'.format(
                method, ', '.join(sorted(factories))))

    util = factories[method]()
    x_resampled, y_resampled = util.fit_sample(df_data[predictors], df_data[target])

    # Re-attach the column labels to the resampled data.
    x_resampled = pd.DataFrame(x_resampled, columns=predictors)
    y_resampled = pd.DataFrame(y_resampled, columns=[target])
    return x_resampled, y_resampled
def readFile(path, y_label,method, encode_features=[], skew_exempted=[], training_ratio=0.7, shuffle=True, needSkew=False,fea_eng=True):
    """Read a CSV, preprocess it, optionally re-balance it, and split it.

    Steps, in order: optional row shuffle; optional ``log1p`` transform of
    non-object columns (except ``skew_exempted``) whose skew exceeds 0.75;
    one-hot encoding of ``encode_features``; filling missing values with the
    column mean; re-balancing (``method == 'OverSample'`` uses ADASYN,
    ``method == 'UnderSample'`` uses CondensedNearestNeighbour, anything
    else leaves the data as is); and finally ``split`` with
    ``training_ratio``.

    ``fea_eng`` is accepted but not used by this function.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by ``split``.
    """
    frame = pd.read_csv(path)

    if shuffle:
        frame = frame.sample(frac=1).reset_index(drop=True)

    if needSkew:
        numeric_cols = frame.dtypes[frame.dtypes != "object"].index.drop(skew_exempted)
        skewness = frame[numeric_cols].apply(lambda col: skew(col.dropna()))
        skewed_cols = skewness[skewness > 0.75].index
        # Compress the long tail of strongly skewed columns.
        frame[skewed_cols] = np.log1p(frame[skewed_cols])

    frame = pd.get_dummies(frame, columns=encode_features)
    frame = frame.fillna(frame.mean())

    X = frame.drop(y_label, axis=1)
    y = frame[y_label]

    if method == 'OverSample':
        X, y = ADASYN(random_state=42).fit_resample(X, y)
    if method == 'UnderSample':
        X, y = CondensedNearestNeighbour(random_state=42).fit_resample(X, y)

    X_train, X_test, y_train, y_test = split(X, y, training_ratio)
    return X_train, X_test, y_train, y_test
def __init__(self):
    """Build the registries of imbalanced-learn estimators, keyed by class name.

    Three dictionaries are created on the instance, each mapping a class
    name to a default-constructed instance of that class:

    * ``self.oversamplers``
    * ``self.undersamplers``
    * ``self.ensemblesamplers``

    The imports are local so that imbalanced-learn is only required once an
    instance is created.
    """
    from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE, BorderlineSMOTE, RandomOverSampler
    from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, InstanceHardnessThreshold, NearMiss, \
        TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, OneSidedSelection, \
        CondensedNearestNeighbour, NeighbourhoodCleaningRule
    from imblearn.ensemble import EasyEnsemble, EasyEnsembleClassifier, BalancedBaggingClassifier, \
        BalancedRandomForestClassifier, BalanceCascade, RUSBoostClassifier

    self.oversamplers = {
        'ADASYN': ADASYN(),
        'RandomOverSampler': RandomOverSampler(),
        'SMOTE': SMOTE(),
        'BorderlineSMOTE': BorderlineSMOTE(),
        'SVMSMOTE': SVMSMOTE()
    }
    self.undersamplers = {
        'ClusterCentroids': ClusterCentroids(),
        'RandomUnderSampler': RandomUnderSampler(),
        'InstanceHardnessThreshold': InstanceHardnessThreshold(),
        'NearMiss': NearMiss(),
        'TomekLinks': TomekLinks(),
        'EditedNearestNeighbours': EditedNearestNeighbours(),
        'RepeatedEditedNearestNeighbours': RepeatedEditedNearestNeighbours(),
        'AllKNN': AllKNN(),
        'OneSidedSelection': OneSidedSelection(),
        'CondensedNearestNeighbour': CondensedNearestNeighbour(),
        'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule()
    }
    self.ensemblesamplers = {
        'EasyEnsemble': EasyEnsemble(),
        'EasyEnsembleClassifier': EasyEnsembleClassifier(),
        'BalancedBaggingClassifier': BalancedBaggingClassifier(),
        'BalanceCascade': BalanceCascade(),
        # Instantiated like every other entry; the bare class was stored
        # here before, so consumers got a type instead of an estimator.
        'BalancedRandomForestClassifier': BalancedRandomForestClassifier(),
        'RUSBoostClassifier': RUSBoostClassifier()
    }
from matplotlib import pyplot
import matplotlib.pyplot as plt
from pylab import subplot, title
from matplotlib.colors import ListedColormap
from imblearn.under_sampling import CondensedNearestNeighbour

# Build a synthetic 2-D dataset by stacking three generated sample sets of
# 150 points each.
X1, y1 = make_blobs(n_samples=150, centers=4, n_features=2,random_state=21)
X2, y2 = make_gaussian_quantiles(mean=(2,2),cov=3., n_samples=150, n_features=2, n_classes=3, random_state=9)
X3, y3 = make_gaussian_quantiles(mean=(5,5),cov=5., n_samples=150, n_features=2, n_classes=2, random_state=15)
X = concatenate([X1,X2,X3])
y = concatenate([y1,y2,y3])

# Baseline split on the full dataset.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=35)

cnn = CondensedNearestNeighbour(random_state=0)  # random_state is used to get the same result for every run
X_res1, y_res1 = cnn.fit_sample(X, y)

# Second split, on the condensed data, used for the CNN-related accuracy.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_res1, y_res1, test_size=0.25, random_state=35)

h = .02
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF', '#8B008B'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF', '#8B008B'])

# Two identical 1-NN classifiers: clf1 is trained on the original training
# split, clf2 on the training split of the condensed data.
clf1 = KNeighborsClassifier(n_neighbors=1, weights='uniform')
clf2 = KNeighborsClassifier(n_neighbors=1, weights='uniform')
clf1.fit(X_train, y_train)
clf2.fit(X_train1,y_train1)
pred1 = clf1.predict(X_test)
def test_cnn_fit_resample_with_wrong_object():
    """A non-estimator, non-integer ``n_neighbors`` must be rejected."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED,
                                        n_neighbors='rnd')
    with raises(ValueError, match="has to be a int or an "):
        sampler.fit_resample(X, Y)
from feature_creation import X_train, y_train from feature_creation import selector, idx, df_reduced_train from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN from imblearn.under_sampling import TomekLinks, ClusterCentroids, NearMiss, CondensedNearestNeighbour, RandomUnderSampler from imblearn.under_sampling import OneSidedSelection, InstanceHardnessThreshold from imblearn.combine import SMOTEENN, SMOTETomek from sklearn.linear_model import SGDClassifier imbalances = [ RandomUnderSampler(), TomekLinks(), ClusterCentroids(), NearMiss(version=1, size_ngh=5), NearMiss(version=2, size_ngh=7), NearMiss(version=3, size_ngh=3), CondensedNearestNeighbour(size_ngh=3, n_seeds_S=51), OneSidedSelection(size_ngh=5, n_seeds_S=51), OneSidedSelection(size_ngh=5, n_seeds_S=35), InstanceHardnessThreshold(), RandomOverSampler(ratio='auto'), ADASYN(ratio='auto', k=3), ADASYN(ratio=0.1, k=5), ADASYN(ratio=0.2, k=7), ADASYN(ratio=0.4, k=7), SMOTE(ratio='auto', kind='regular', k=5), SMOTE(ratio=0.1, kind='regular', k=5), SMOTE(ratio='auto', kind='regular', k=7), SMOTE(ratio='auto', kind='regular', k=9, out_step=0.6), SMOTE(ratio=0.4, kind='regular', k=5, out_step=0.5), SMOTE(ratio='auto', kind='borderline1'), SMOTE(ratio='auto', kind='borderline2'),
def sample_data_by_cnn(X, y):
    """Under-sample ``(X, y)`` with a seeded ``CondensedNearestNeighbour``.

    Returns whatever the sampler's ``fit_resample`` returns.
    """
    sampler = CondensedNearestNeighbour(random_state=42)
    resampled = sampler.fit_resample(X, y)
    return resampled
def test_cnn_fit_sample_with_wrong_object():
    """A non-estimator, non-integer ``n_neighbors`` must be rejected."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED,
                                        n_neighbors='rnd')
    assert_raises_regex(ValueError, "has to be a int or an ",
                        sampler.fit_sample, X, Y)
# 10-fold cross-validation of the ENN + random-forest pipeline, scored with
# ROC AUC and average precision.
scores = cross_validate(enn_pipe_rf, X_train, y_train, cv=10, scoring=('roc_auc', 'average_precision'))
scores['test_roc_auc'].mean(), scores['test_average_precision'].mean()
# (0.9248526844001812, 0.6883592815252976)

######### Condensed Nearest Neighbor #########
from imblearn.under_sampling import CondensedNearestNeighbour
# opposite of ENN; iteratively adds points to the data that are misclassified by KNN
cnn = CondensedNearestNeighbour()
X_train_cnn, y_train_cnn = cnn.fit_sample(X_train, y_train)
# Inspect the size of the condensed training set and its class counts.
print(X_train_cnn.shape)
print(np.bincount(y_train_cnn))

### Pipeline method
# Putting the sampler inside the pipeline lets cross_validate apply it
# per fold together with the classifier.
cnn_pipe = make_imb_pipeline(CondensedNearestNeighbour(), LogisticRegression())
scores = cross_validate(cnn_pipe, X_train, y_train, cv=10, scoring=('roc_auc', 'average_precision'))
pd.DataFrame(scores)[['test_roc_auc', 'test_average_precision']].mean()
def Sampling(X, y, method):
    """
    function to sample imbalanced dataset:

    Arguments:
    X -- trainset features
    y -- trainset labels
    method -- sampling method; the name of an imbalanced-learn class from
              ``imblearn.under_sampling`` or ``imblearn.combine`` (see the
              two tuples below for the accepted names)

    Return:
    X_res -- sampled trainset features
    y_res -- sampled trainset labels

    Raises:
    ValueError -- if ``method`` is not one of the accepted names
                  (previously this crashed with an unbound-variable error
                  at the ``return`` statement)
    """
    # Under-sampling:
    under_sampling_methods = (
        'RandomUnderSampler',
        'TomekLinks',
        'OneSidedSelection',
        'NeighbourhoodCleaningRule',
        'NearMiss',
        'InstanceHardnessThreshold',
        'AllKNN',
        'RepeatedEditedNearestNeighbours',
        'EditedNearestNeighbours',
        'CondensedNearestNeighbour',
    )
    # Combination of over- and under-sampling:
    combine_methods = ('SMOTEENN', 'SMOTETomek')

    # Validate first so that an unknown name fails fast, then import only
    # the sub-package that actually hosts the requested sampler.
    if method in under_sampling_methods:
        import imblearn.under_sampling as sampler_module
    elif method in combine_methods:
        import imblearn.combine as sampler_module
    else:
        raise ValueError(
            'Unknown sampling method {!r}; expected one of: {}'.format(
                method, ', '.join(under_sampling_methods + combine_methods)))

    # Every accepted name is also the class name, built with defaults.
    us = getattr(sampler_module, method)()
    X_res, y_res = us.fit_resample(X, y)
    return X_res, y_res
def balance_cnn(input):
    """Under-sample an ``(X, y)`` pair with a seeded ``CondensedNearestNeighbour``.

    ``input`` must unpack into exactly two items: features and labels.
    Returns the resampled ``(X, y)`` pair.
    """
    features, labels = input
    sampler = CondensedNearestNeighbour(random_state=42)
    resampled_features, resampled_labels = sampler.fit_sample(features, labels)
    return resampled_features, resampled_labels
def condensedNearestNeighbour(self, n_neighbors=1):
    """Append a CondensedNearestNeighbour step and return a new ``resampling``.

    The step is registered under the key ``'u'`` in ``self.steps``; the
    returned object is built from this instance's data, target and the
    updated step list.
    """
    step = ('u', CondensedNearestNeighbour(n_neighbors=n_neighbors))
    self.steps.append(step)
    return resampling(data=self.data, target=self.target, steps=self.steps)
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    """Resample the five one-vs-all training sets with one technique.

    A separate, default-constructed sampler is created for each of the five
    datasets (AA, AI, AW, CC, QA) and ``fit_resample`` is called on each
    ``(X, y)`` pair in that order. ``imb_technique`` is printed first.

    For ``"NCR"`` the labels are binarised before resampling: ``0`` where
    the label equals the dataset's own status name (e.g.
    ``"Accepted/Assigned"`` for AA) and ``1`` otherwise.

    Args:
        imb_technique: One of ``"ADASYN"``, ``"ALLKNN"``, ``"CNN"``,
            ``"ENN"``, ``"IHT"``, ``"NCR"``, ``"NM"``, ``"OSS"``,
            ``"RENN"``, ``"SMOTE"``, ``"BSMOTE"``, ``"SMOTEENN"``,
            ``"SMOTETOMEK"``, ``"TOMEK"``, ``"ROS"``, ``"RUS"``.
        *_ova_X_train, *_ova_y_train: Features and labels of each
            one-vs-all training set.

    Returns:
        A 10-tuple ``(AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res,
        AW_y_res, CC_X_res, CC_y_res, QA_X_res, QA_y_res)``.

    Raises:
        ValueError: If ``imb_technique`` is not a known name. (Previously
            an unknown name crashed with an unbound-variable error at the
            ``return`` statement.)
    """
    print(imb_technique)

    # Lambdas defer the lookup of the sampler classes, so only the class of
    # the requested technique has to be available.
    factories = {
        "ADASYN": lambda: ADASYN(),
        "ALLKNN": lambda: AllKNN(),
        "CNN": lambda: CondensedNearestNeighbour(),
        "ENN": lambda: EditedNearestNeighbours(),
        "IHT": lambda: InstanceHardnessThreshold(),
        "NCR": lambda: NeighbourhoodCleaningRule(),
        "NM": lambda: NearMiss(),
        "OSS": lambda: OneSidedSelection(),
        "RENN": lambda: RepeatedEditedNearestNeighbours(),
        "SMOTE": lambda: SMOTE(),
        "BSMOTE": lambda: BorderlineSMOTE(),
        "SMOTEENN": lambda: SMOTEENN(),
        "SMOTETOMEK": lambda: SMOTETomek(),
        "TOMEK": lambda: TomekLinks(),
        "ROS": lambda: RandomOverSampler(),
        "RUS": lambda: RandomUnderSampler(),
    }
    if imb_technique not in factories:
        raise ValueError(
            "Unknown imb_technique {!r}; expected one of: {}".format(
                imb_technique, ", ".join(sorted(factories))))

    # (status name, features, labels) for AA, AI, AW, CC, QA -- in the order
    # of the returned tuple. The status name is only needed by NCR.
    datasets = [
        ("Accepted/Assigned", AA_ova_X_train, AA_ova_y_train),
        ("Accepted/In Progress", AI_ova_X_train, AI_ova_y_train),
        ("Accepted/Wait", AW_ova_X_train, AW_ova_y_train),
        ("Completed/Closed", CC_ova_X_train, CC_ova_y_train),
        ("Queued/Awaiting Assignment", QA_ova_X_train, QA_ova_y_train),
    ]

    if imb_technique == "NCR":
        # NCR is fed integer labels: 0 for the dataset's own status, else 1.
        datasets = [
            (status, features, [0 if label == status else 1 for label in labels])
            for status, features, labels in datasets
        ]

    # One independent sampler per dataset, all created before any fitting.
    make_sampler = factories[imb_technique]
    samplers = [make_sampler() for _ in datasets]

    results = []
    for sampler, (_, features, labels) in zip(samplers, datasets):
        X_res, y_res = sampler.fit_resample(features, labels)
        results.append(X_res)
        results.append(y_res)
    return tuple(results)
# Plot limits: the range of the first two feature columns, padded by 0.5.
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

# Initial look at the data: scatter of the first two columns, coloured by y.
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, edgecolor='k')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

# Use the first four iris feature columns and the species target, and
# rescale the features with normalize(x, axis=0).
# NOTE(review): presumably sklearn.preprocessing.normalize, i.e. norm-based
# scaling per column rather than standard (mean/variance) scaling -- confirm.
x = iris.data[:, 0:4]
y = iris.target
X_normalized = normalize(x, axis=0)
x_train, x_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.20)

# NOTE(review): the sampler is fitted on all of X_normalized, which includes
# the rows held out in x_test, and x_train/y_train are not used below. The
# evaluation is therefore not on unseen data -- confirm this is intended.
cnn = CondensedNearestNeighbour(return_indices=True)
X_resampled, y_resampled, idx_resampled = cnn.fit_sample(X_normalized, y)

# Train a 1-NN classifier on the condensed set and score it on the test split.
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict(x_test)
print(confusion_matrix(y_test, y_pred))
target_names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
print(classification_report(y_test, y_pred, target_names=target_names))
X = np.vstack((data_major, data_minor)) y = np.hstack((y, np.ones(len(data_minor)))) if i1 not in [0, 8]: if i1 == 1: res = RandomUnderSampler(random_state=seed) if i1 == 2: res = NearMiss(version=1) if i1 == 3: res = NearMiss(version=2) if i1 == 4: res = ClusterCentroids(random_state=seed) if i1 == 5: res = EditedNearestNeighbours() if i1 == 6: res = CondensedNearestNeighbour(random_state=seed) if i1 == 7: res = TomekLinks() X_re, y_re = res.fit_resample(X, y) else: if i1 == 0: X_re, y_re = X, y if i1 == 8: X_re, y_re = PSU(X, y) temp_color = [] for i2 in range(len(y_re)): if y_re[i2] > 0: temp_color.append("chocolate") else:
def test_cnn_init():
    """Check the default ``n_seeds_S`` and ``n_jobs`` of a new sampler."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)

    for attribute in ('n_seeds_S', 'n_jobs'):
        assert_equal(getattr(sampler, attribute), 1)
os.makedirs(out_dir) # save preprocessing transforms dump(self.df_imputer, open(os.path.join(out_dir, 'df_imputer.pkl'), 'wb')) dump(self.ord_enc, open(os.path.join(out_dir, 'ord_enc.pkl'), 'wb')) # save undersampling transforms for key, transform in self.undersamples.items(): dump(transform, open(os.path.join(out_dir, f'undersampler_{key}.pkl'), 'wb')) if __name__ == '__main__': trans_data = transaction_table('../data/train_transaction.csv') print(trans_data.df.head(10)) trans_data.add_undersampling_transform( 'random', RandomUnderSampler(sampling_strategy='majority', random_state=0)) trans_data.add_undersampling_transform( 'CNN', CondensedNearestNeighbour(n_neighbors=1, random_state=0, n_jobs=-1)) trans_data.add_undersampling_transform('Tomek', TomekLinks(n_jobs=-1)) trans_data.add_undersampling_transform( 'OSS', OneSidedSelection(n_neighbors=1, n_seeds_S=200, random_state=0, n_jobs=-1)) trans_data.save_trained_transforms()
GNG_RES = fill_table(GNG_RES, inds, j, interm, final) except Exception as e: print('an issue with {}, rate: {}, variant: {}'.format( dataset, rate, variant)) print(e) if CNN_FLAG: variant = 'CNN' print('>> {}, rate: {}, variant: {}'.format( dataset, rate, variant)) try: dataset_size = X_train.shape[0] coreset_size = max(int(dataset_size * rate / 100), 1) startTime = datetime.now() cnn = CondensedNearestNeighbour(random_state=SEED) observers, total_labels = cnn.fit_sample(X_train, y_train) observers, total_labels = fix_rate(X_train, y_train, observers, total_labels, coreset_size) interm = (datetime.now() - startTime).total_seconds() startTime = datetime.now() try: neigh = KNeighborsClassifier(n_neighbors=5) except: neigh = KNeighborsClassifier(n_neighbors=1) neigh.fit(observers, total_labels) y_pred = neigh.predict(X_test) final = (datetime.now() - startTime).total_seconds() inds = get_indices_(y_test, y_pred) CNN_RES = fill_table(CNN_RES, inds, j, interm, final)
def under_sample_CondensedNearestNeighbour(train_inputs, train_targets):
    """Under-sample the training data with ``CondensedNearestNeighbour``.

    A sampler seeded with ``random_state=32`` is handed to
    ``_sampler_helper`` together with the inputs and targets; the pair it
    returns is passed back to the caller.
    """
    resampled_inputs, resampled_targets = _sampler_helper(
        CondensedNearestNeighbour(random_state=32),
        train_inputs,
        train_targets,
    )
    return resampled_inputs, resampled_targets
# ``CondensedNearestNeighbour`` is sensitive to noise by preserving the noisy # samples. ``OneSidedSelection`` also used the 1-NN and use ``TomekLinks`` to # remove the samples considered noisy. The ``NeighbourhoodCleaningRule`` use a # ``EditedNearestNeighbours`` to remove some sample. Additionally, they use a 3 # nearest-neighbors to remove samples which do not agree with this rule. fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(15, 25)) X, y = create_dataset(n_samples=500, weights=(0.2, 0.3, 0.5), class_sep=0.8) ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6)) for ax, sampler in zip( ax_arr, ( CondensedNearestNeighbour(random_state=0), OneSidedSelection(random_state=0), NeighbourhoodCleaningRule(), ), ): clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_decision_function(X, y, clf, ax[0]) ax[0].set_title(f"Decision function for {sampler.__class__.__name__}") plot_resampling(X, y, sampler, ax[1]) ax[1].set_title(f"Resampling using {sampler.__class__.__name__}") fig.tight_layout() ############################################################################### # ``InstanceHardnessThreshold`` uses the prediction of classifier to exclude # samples. All samples which are classified with a low probability will be
def main():
    """Load the ML4H symptom data and compare models with several resamplers.

    Steps, in order:

    1. Read per-patient symptom totals from ``ML4H_symptom_totals`` and build
       one feature vector per patient.
    2. Read the last symptom each patient reported (the class to predict).
    3. Add the temporal / patient-specific features returned by
       ``temporal_symptoms_ML.query_for_40_day_prev_symptoms_FROM_TABLE``.
    4. Fit a ``RandomForestRegressor`` on the time between symptom reports and
       print its explained variance plus a few sample predictions.
    5. Call ``apply_machine_learning_techniques`` on the original data and on
       each resampled variant.

    Side effects: prints progress to stdout, talks to the MySQL database, and
    binds the module-level ``variable_file`` to
    ``symptom_variable_significance.csv`` opened for writing. The file and the
    database connection are closed even when a step raises.
    """
    global variable_file

    print("Symptom prediction")
    print("Connecting to ML4H DB..")
    # NOTE(review): host, user and password are hard-coded in source. They are
    # left untouched here so behaviour does not change, but they should be
    # moved to configuration / environment and the password rotated.
    conn = pymysql.connect(host='nightmare.cs.uct.ac.za', port=3306,
                           user='******', passwd='oesaerex', db='ochomo001')
    try:
        print("Connected")
        cur = conn.cursor()
        print("Executing SQL query..")
        # ML4H_symptom_totals holds the per-patient totals of every symptom
        # reported to the clinic; they are the features used to predict the
        # next symptom a patient will report. The table is presumably the
        # materialised result of the following aggregate over
        # ``Obs_artvisit_sympt a inner join concept_name n
        #   on a.value_coded_name_id=n.concept_name_id group by person_id``
        # (verify against the database), which fixes the column order:
        #
        #   idx  column                 value_coded_name_id
        #   0    person_id
        #   1    Tot_Num_Sympt          count(*)
        #   2    Cough                  110
        #   3    Fever                  4315
        #   4    Abdominal_pain         156
        #   5    skin_rash              524
        #   6    Lactic_acisdosis       1576
        #   7    Lipodystrophy          2325
        #   8    Anemia                 3
        #   9    Anorexia               888
        #   10   Cough_any_duration     11335
        #   11   Diarrhea               17
        #   12   Leg_pain               10894
        #   13   Night_Sweats           4407
        #   14   Other                  9345
        #   15   Peripheral_neuropathy  838
        #   16   Vomiting               4355
        #   17   Weight_loss            11333
        #   18   min(obs_datetime)
        #   19   max(obs_datetime)
        cur.execute("select * from ML4H_symptom_totals")
        print("Executed")

        # Loading features
        patient_ids = []   # insertion order of patients, used for output order
        patients = {}      # person_id -> Patient; also the O(1) membership test
        sum_of_features = [0] * len(Patient.features)
        # The last three features (filled in by the temporal query below) are
        # not symptom counts and are excluded from the balance tally.
        n_symptom_features = len(sum_of_features) - 3
        print("loading in data into program...")
        for row in cur:
            person_id = row[0]
            if person_id not in patients:
                patient_ids.append(person_id)
                patients[person_id] = Patient(int(person_id))

            # "Cough of any duration" (column 10) is merged into "Cough"
            # (column 2); the other symptom columns map one-to-one.
            symptom_counts = [int(row[2]) + int(row[10])]
            symptom_counts += [int(row[col]) for col in range(3, 10)]
            symptom_counts += [int(row[col]) for col in range(11, 18)]
            patients[person_id].feature_symptom_array = symptom_counts + [0, 0, 0]

            # To check the balance of features. Tallying the merged counts
            # (instead of re-walking the raw columns) also counts the last
            # symptom column, which the raw-column walk stopped one short of.
            for i, count in zip(range(n_symptom_features), symptom_counts):
                sum_of_features[i] += count
        # Close only after the rows have been consumed.
        cur.close()

        cur1 = conn.cursor()
        print("Executing SQL query..")
        # Gets the last symptom that a patient reported - RESULT CLASS (this
        # is what we are trying to predict).
        cur1.execute(
            "select person_id, value_coded_name_id,name, MAX(obs_datetime) "
            "from Obs_artvisit_sympt a "
            "inner join concept_name n on a.value_coded_name_id = n.concept_name_id "
            "where value_coded_name_id IN "
            "(524,110,4315,156,3,888,11335,17,2325,4407,10894,9345,838,4355,11333) "
            "Group by person_id"
        )
        print("Executed")

        # Loads result set - last symptom reported by patient
        for row in cur1:
            person_id, symptom_name = row[0], row[2]
            if symptom_name is None or symptom_name == "None":
                continue
            if person_id not in patients:
                patient_ids.append(person_id)
                patients[person_id] = Patient(int(person_id))
            patient = patients[person_id]
            if symptom_name == "Cough of any duration":
                patient.last_symptom = "Cough"
            else:
                patient.last_symptom = symptom_name
            # Indexing - binarizing classes for ROC scores later on
            patient.set_sympt_class()
        cur1.close()

        print("Loaded data")
        print()
        print("Patient Ids:")
        print("Patient objects")

        # Adding temporal aspect and more patient-specific features - Sex,
        # Age, Last Drug, number of symptoms in the previous month.
        # If the last reported symptom is longer than a specified number of
        # days (e.g. 30) ago then change the result to "No symptoms".
        # Reason: too long after to be helpful, i.e. predicting that someone
        # will eventually report a skin rash is not as helpful as predicting
        # a skin rash next month.
        print("Executing temporal query...")
        # Alternative windows exist on temporal_symptoms_ML:
        # query_for_10_day_prev_symptoms / query_for_70_day_prev_symptoms.
        cur4 = temporal_symptoms_ML.query_for_40_day_prev_symptoms_FROM_TABLE(conn)

        age_idx = Patient.features.index("Age")
        last_drug_idx = Patient.features.index("Last Drug")
        prev_month_idx = Patient.features.index("Tot Prev Month Symptoms")
        for row in cur4:
            if row[0] not in patients:
                continue
            patient = patients[row[0]]

            # row[3] may be NULL; convert once and treat NULL as "no gap
            # known" instead of crashing in int(None).
            days = None if row[3] is None else int(row[3])

            # NOTE(review): the comment above says "longer than", but the
            # condition relabels patients whose value is *below* 11 - confirm
            # the intended direction and threshold.
            if days is not None and days < 11 and patient.last_symptom != "None":
                patient.last_symptom = "No symptoms"
                # Reindexing - binarizing classes for ROC scores later on
                patient.set_sympt_class()

            features = patient.feature_symptom_array
            features[age_idx] = int(row[5].year)
            features[last_drug_idx] = int(row[6])
            features[prev_month_idx] = int(row[7])

            # 1000 is the sentinel for "unknown or more than 60 days".
            if days is None or days > 60:
                patient.time_between_symptom_reports = 1000
            else:
                patient.time_between_symptom_reports = days
        print("Executed.")
    finally:
        conn.close()

    # Load all data into arrays for ML application
    ml4hX = []
    ml4hY = []
    ml4hY_multiclass = []
    ml4hY_multilabel = []
    for patient_id in patient_ids:
        patient = patients[patient_id]
        if patient.check_if_null_features():
            continue
        if patient.last_symptom == "None":
            continue
        # Age is still 0 when the temporal query returned nothing for the
        # patient.
        if patient.feature_symptom_array[age_idx] == 0:
            continue
        ml4hX.append(patient.feature_symptom_array)
        ml4hY.append(patient.last_symptom)
        ml4hY_multiclass.append(patient.last_symptom_class)
        ml4hY_multilabel.append(patient.time_between_symptom_reports)

    print("Feature set:")
    print("Result set:")
    print()
    print(len(ml4hX))
    print(len(ml4hY))

    # Opening significance file for writing. It is a module-level global, so
    # a plain ``with`` block cannot own it; try/finally guarantees the close.
    variable_file = open("symptom_variable_significance.csv", 'w')
    try:
        for symptom in Patient.features:
            variable_file.write("," + symptom)
        variable_file.write("\n")

        print("Orig")
        check_symptom_result_distribution(ml4hY)
        print()

        X_train1, X_validation1, Y_train1, Y_validation1 = model_selection.train_test_split(
            ml4hX, ml4hY_multilabel, test_size=0.2, random_state=7)
        # No KFold splitter is built here: it was never used, and
        # KFold(random_state=...) without shuffle=True raises on
        # scikit-learn >= 0.24.
        cv = RandomForestRegressor()
        cv.fit(X_train1, Y_train1)
        print(explained_variance_score(Y_validation1, cv.predict(X_validation1)))

        # Show up to 15 sample predictions (fewer on tiny validation sets).
        for h in range(min(15, len(X_validation1))):
            print(X_validation1[h])
            print("True :" + str(Y_validation1[h]) + " Pred : " + str(cv.predict([X_validation1[h]])))

        print("ORIGINAL")
        apply_machine_learning_techniques(ml4hX, ml4hY_multiclass, "Original")

        # Trying different samplers.
        # NOTE(review): several labels do not match the ratio actually passed
        # (e.g. "NearMiss_0.025" uses ratio=0.005); labels are kept as-is
        # because they are handed to apply_machine_learning_techniques.
        samplers = [
            ["RandomUnderSampler_0.6", RandomUnderSampler()],
            ["NearMiss_0.025", NearMiss(ratio=0.005)],
            # ["RandomOver_0.3", RandomOverSampler()],
            ["CondensedNearestNeighbour0.3", CondensedNearestNeighbour(ratio=0.3)],
            # ["RepeatedEditedNearestNeighbours0.2", RepeatedEditedNearestNeighbours(ratio=0.2)],
            # ["ALLKNN_0.4", AllKNN(ratio=0.005)],
            ["TomekLinks_0.3", TomekLinks(ratio=0.005)],
        ]
        for label, sampler in samplers:
            print(label)
            X_resamp, Y_resamp = sampler.fit_sample(ml4hX, ml4hY_multiclass)
            apply_machine_learning_techniques(X_resamp, Y_resamp, label)
            print("............")

        # Under-sample with NearMiss first, then over-sample the result.
        nearmiss = NearMiss(ratio=0.03)
        X_resampled_nm, y_resampled_nm = nearmiss.fit_sample(ml4hX, ml4hY_multiclass)

        print("RandomOVer")
        randomOVer = RandomOverSampler()
        X_resampled_ranO, y_resampled_ranO = randomOVer.fit_sample(X_resampled_nm, y_resampled_nm)
        check_symptom_result_distribution(y_resampled_ranO)
        print()

        print("NEAR MISS ADJUSTED OVERSAMPLE")
        apply_machine_learning_techniques(X_resampled_ranO, y_resampled_ranO, "NM_ADJ_Over")

        print("feature distribution")
        for feature_name, total in zip(Patient.features, sum_of_features):
            print(feature_name + " : " + str(total))
        print()

        print("result class distribution")
        check_symptom_result_distribution(ml4hY)
    finally:
        variable_file.close()
# Build an imbalanced two-class dataset (10% / 90%)
X, y = make_classification(
    n_classes=2,
    class_sep=2,
    weights=[0.1, 0.9],
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=20,
    n_clusters_per_class=1,
    n_samples=5000,
    random_state=10,
)

# Instantiate a PCA object so the data can be shown in two dimensions
pca = PCA(n_components=2)
# Fit on X and project it into the 2D space used for plotting
X_vis = pca.fit_transform(X)

# Under-sample with Condensed Nearest Neighbours, then project the kept
# samples with the PCA already fitted on the full data
cnn = CondensedNearestNeighbour()
X_resampled, y_resampled = cnn.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two side-by-side subplots: original data (left), resampled data (right)
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: one scatter per class of the original set
for class_value, class_label, class_colour in (
    (0, "Class #0", palette[0]),
    (1, "Class #1", palette[2]),
):
    class_mask = y == class_value
    ax1.scatter(
        X_vis[class_mask, 0],
        X_vis[class_mask, 1],
        label=class_label,
        alpha=0.5,
        edgecolor=almost_black,
        facecolor=class_colour,
        linewidth=0.15,
    )
ax1.set_title('Original set')

# Right panel: class 0 of the resampled set
kept_class0 = y_resampled == 0
ax2.scatter(
    X_res_vis[kept_class0, 0],
    X_res_vis[kept_class0, 1],
    label="Class #0",
    alpha=.5,
    edgecolor=almost_black,
    facecolor=palette[0],
    linewidth=0.15,
)
def test_cnn_init():
    """Check the defaults of a sampler built with only ``random_state`` set."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)

    # Neither option was passed to the constructor, so both must hold their
    # default value of 1.
    assert sampler.n_seeds_S == 1
    assert sampler.n_jobs == 1