def fit(self, X, y=None):
    """Under-sample the training data, then fit the forest with sample weights.

    The input is densified with ``X.toarray()``, condensed with
    ``CondensedNearestNeighbour`` and handed to the parent ``fit`` together
    with per-sample weights: samples labelled ``1`` get ``1 / mean(y)``,
    every other sample gets ``1``.

    Parameters
    ----------
    X : object exposing ``toarray()``
        Feature matrix (presumably a scipy sparse matrix -- confirm with callers).
    y : array-like, optional
        Labels. NOTE(review): the sampler is always run on ``y``, so the
        ``None`` default does not look usable in practice.

    Returns
    -------
    The value returned by the parent class ``fit``.
    """
    # Experiment notes left by the original author, in their original order.
    # Which scores belong to which configuration is not stated in the notes.
    #   Accuracy: 0.939693267481  Precision: 0.238095238095  Recall: 0.897435897436
    #   Accuracy: 0.962568234988  Precision: 0.324468085106  Recall: 0.782051282051
    #   SMOTE(ratio=ratio, kind='borderline1')
    #   Accuracy: 0.971146347803  Precision: 0.372093023256  Recall: 0.615384615385
    #   SMOTE(ratio=ratio, kind='borderline2')
    #   Accuracy: 0.965427605927  Precision: 0.333333333333  Recall: 0.705128205128
    #   svm_args = {'class_weight': 'auto'}
    #   svmsmote = SMOTE(ratio=ratio, kind='svm', **svm_args)
    #   Accuracy: 0.972186119054  Precision: 0.395683453237  Recall: 0.705128205128
    #   smote = SMOTE(ratio='auto', kind='regular')
    sampler = CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51)
    X, y = sampler.fit_sample(X.toarray(), y)

    # Compute the positive-class weight once instead of once per sample.
    labels = np.asarray(y)
    positives = labels == 1
    weights = np.ones(len(labels))
    if positives.any():
        weights[positives] = 1 / labels.mean()
    return super(RandomForestClassifier, self).fit(X, y, sample_weight=weights)
def test_cnn_fit_sample_with_object():
    """Resampling must match the reference for both a KNN object and an int ``n_neighbors``."""
    expected_X = np.array([[-0.10903849, -0.12085181],
                           [0.01936241, 0.17799828],
                           [0.05230552, 0.09043907],
                           [-1.25020462, -0.40402054],
                           [0.70524765, 0.39816382],
                           [0.35831463, 1.33483198],
                           [-0.284881, -0.62730973],
                           [0.03394306, 0.03986753],
                           [-0.01252787, 0.34102657],
                           [0.15198585, 0.12512646]])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])

    # First pass: the neighbours estimator is supplied as an object.
    estimator = KNeighborsClassifier(n_neighbors=1)
    sampler = CondensedNearestNeighbour(random_state=RND_SEED,
                                        n_neighbors=estimator)
    X_out, y_out = sampler.fit_sample(X, Y)
    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)

    # Second pass: the same configuration expressed as a plain integer.
    sampler = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=1)
    X_out, y_out = sampler.fit_sample(X, Y)
    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def random_instance_selection(dfZ, x, blackbox, dataset):
    """Build a random neighbourhood of ``x`` and condense it with CNN.

    The neighbourhood returned by ``random_neighborhood`` is labelled with
    ``blackbox.predict``, reduced by ``CondensedNearestNeighbour`` and turned
    back into a dataframe via ``build_df2explain``.

    Returns
    -------
    tuple
        ``(dataframe, samples)`` for the condensed neighbourhood.
    """
    _unused_df, neighbourhood = random_neighborhood(dfZ, x, blackbox, dataset)
    predicted = blackbox.predict(neighbourhood)

    condenser = CondensedNearestNeighbour(return_indices=True)
    # Only the resampled features are needed; labels and indices are dropped.
    condensed, _labels, _indices = condenser.fit_sample(neighbourhood, predicted)

    condensed_df = build_df2explain(blackbox, condensed, dataset)
    return condensed_df, condensed
def test_cnn_sample_wrong_X():
    """``sample`` must raise ``RuntimeError`` when given data other than the fitted data."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Fresh random data that was never seen during fitting.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def test_cnn_sample_wrong_X():
    """``sample`` must raise ``RuntimeError`` when given data other than the fitted data."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Fresh random data that was never seen during fitting.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def cnn_test(data_set: pd.DataFrame, metric: str, k: int, weights='uniform'):
    """Condense a training split with CNN, fit a KNN classifier and report accuracy.

    Parameters
    ----------
    data_set : pd.DataFrame
        The first two columns are used as features, the remaining ones as target.
    metric : str
        Distance metric passed to ``KNeighborsClassifier``.
    k : int
        Neighbour count used by both the sampler and the classifier.
    weights : str
        Weighting scheme passed to ``KNeighborsClassifier``.
    """
    features = np.array(data_set.iloc[:, 0:2])
    targets = np.array(data_set.iloc[:, 2:])
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.3, random_state=10)

    condenser = CondensedNearestNeighbour(n_neighbors=k, sampling_strategy="all")
    X_small, y_small = condenser.fit_resample(X_train, y_train)

    knn = neighbors.KNeighborsClassifier(k, metric=metric, weights=weights)
    knn.fit(X_small, y_small.ravel())

    print(accuracy_score(knn.predict(X_test), y_test))
    # The boundary plot is drawn over the full (non-condensed) training split.
    plot_decisions_boundaries(X_train, y_train, clf=knn)
def test_cnn_fit_sample():
    """``fit_sample`` must reproduce the hard-coded reference resampling."""
    expected_X = np.array([[-0.10903849, -0.12085181],
                           [0.01936241, 0.17799828],
                           [0.05230552, 0.09043907],
                           [-1.25020462, -0.40402054],
                           [0.70524765, 0.39816382],
                           [0.35831463, 1.33483198],
                           [-0.284881, -0.62730973],
                           [0.03394306, 0.03986753],
                           [-0.01252787, 0.34102657],
                           [0.15198585, 0.12512646]])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])

    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_cnn_fit_sample():
    """``fit_sample`` must reproduce the reference arrays stored under ``data/``."""
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    # Ground truth is kept in .npy files next to this test module.
    expected_X = np.load(os.path.join(data_dir, 'cnn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'cnn_y.npy'))
    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_cnn_fit_sample():
    """``fit_sample`` must reproduce the hard-coded reference resampling."""
    expected_X = np.array([[-0.10903849, -0.12085181],
                           [0.01936241, 0.17799828],
                           [0.05230552, 0.09043907],
                           [-1.25020462, -0.40402054],
                           [0.70524765, 0.39816382],
                           [0.35831463, 1.33483198],
                           [-0.284881, -0.62730973],
                           [0.03394306, 0.03986753],
                           [-0.01252787, 0.34102657],
                           [0.15198585, 0.12512646]])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])

    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_cnn_fit_sample():
    """``fit_sample`` must reproduce the reference arrays stored under ``data/``."""
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    # Ground truth is kept in .npy files next to this test module.
    expected_X = np.load(os.path.join(data_dir, 'cnn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'cnn_y.npy'))
    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def condensed_nearest_neighbour(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True, pie_evr=True):
    """Under-sample ``(X, y)`` with ``CondensedNearestNeighbour`` (seed 42).

    Parameters
    ----------
    X, y
        Features and labels passed to ``fit_resample``.
    visualize : bool
        When truthy, plot the resampled class histogram and the PCA views.
    pca2d, pca3d, pie_evr : bool
        Forwarded to ``pca_general`` as ``d2``, ``d3`` and ``pie_evr``.
    tsne : bool
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` as returned by ``fit_resample``.
    """
    cnn = CondensedNearestNeighbour(random_state=42)
    X_res, y_res = cnn.fit_resample(X, y)

    # Plain truthiness test instead of comparing against the ``True`` literal.
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def test_cnn_fit():
    """``fit`` must record the minority/majority classes and the class counts."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Class identities discovered during fitting.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    # Per-class sample counts.
    assert_equal(sampler.stats_c_[0], 500)
    assert_equal(sampler.stats_c_[1], 4500)
def test_cnn_fit():
    """``fit`` must record the minority/majority classes and the class counts."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Class identities discovered during fitting.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    # Per-class sample counts.
    assert_equal(sampler.stats_c_[0], 500)
    assert_equal(sampler.stats_c_[1], 4500)
def test_cnn_fit_sample_with_indices():
    """With ``return_indices=True`` the data and the indices must match the stored references."""
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

    sampler = CondensedNearestNeighbour(return_indices=True,
                                        random_state=RND_SEED)
    X_out, y_out, idx_out = sampler.fit_sample(X, Y)

    # Ground truth is kept in .npy files next to this test module.
    expected_X = np.load(os.path.join(data_dir, 'cnn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'cnn_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'cnn_idx.npy'))
    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
    assert_array_equal(idx_out, expected_idx)
def test_cnn_fit_sample_with_indices():
    """With ``return_indices=True`` the data and the indices must match the stored references."""
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')

    sampler = CondensedNearestNeighbour(return_indices=True,
                                        random_state=RND_SEED)
    X_out, y_out, idx_out = sampler.fit_sample(X, Y)

    # Ground truth is kept in .npy files next to this test module.
    expected_X = np.load(os.path.join(data_dir, 'cnn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'cnn_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'cnn_idx.npy'))
    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
    assert_array_equal(idx_out, expected_idx)
def test_cnn_fit_sample_with_indices():
    """With ``return_indices=True`` the data and the indices must match the hard-coded references."""
    expected_X = np.array([[-0.10903849, -0.12085181],
                           [0.01936241, 0.17799828],
                           [0.05230552, 0.09043907],
                           [-1.25020462, -0.40402054],
                           [0.70524765, 0.39816382],
                           [0.35831463, 1.33483198],
                           [-0.284881, -0.62730973],
                           [0.03394306, 0.03986753],
                           [-0.01252787, 0.34102657],
                           [0.15198585, 0.12512646]])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    expected_idx = np.array([4, 11, 17, 12, 19, 9, 5, 7, 14, 18])

    sampler = CondensedNearestNeighbour(return_indices=True,
                                        random_state=RND_SEED)
    X_out, y_out, idx_out = sampler.fit_sample(X, Y)

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
    assert_array_equal(idx_out, expected_idx)
def getsampler(self, type):
    """Return a sampler instance for the given sampler name.

    Unknown names print a message and terminate the process with exit
    status 1. For every sampler except ``'none'`` and ``'quality'`` the
    ``random_state`` parameter, when the sampler exposes one, is set to
    ``self.random_state``.
    """
    # Factories are lazy so that only the requested sampler is constructed.
    factories = {
        'none': lambda: NoSampler(),
        'randomunder': lambda: RandomUnderSampler(),
        'nearmiss': lambda: NearMiss(),
        'allknn': lambda: AllKNN(),
        'condensednn': lambda: CondensedNearestNeighbour(),
        'editednn': lambda: EditedNearestNeighbours(),
        'repeatededitednn': lambda: RepeatedEditedNearestNeighbours(),
        'tomeklinks': lambda: TomekLinks(),
        'randomover': lambda: RandomOverSampler(),
        'smote': lambda: SMOTE(),
        'adasyn': lambda: ADASYN(),
        'smotenc': lambda: SMOTENC(),
        # and self.quality_model_selection_type == 'extended':
        'quality': lambda: QualitySampler(self.n_init),
    }

    build = factories.get(type)
    if build is None:
        print("Unsupported sampler %s" % type)
        exit(1)
    sampler = build()

    accepts_seed = type not in ('none', 'quality')
    if accepts_seed and 'random_state' in sampler.get_params().keys():
        sampler.set_params(random_state=self.random_state)
    return sampler
def test_cnn_sample_wt_fit():
    """Calling ``sample`` on an unfitted sampler must raise ``RuntimeError``."""
    unfitted = CondensedNearestNeighbour(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted.sample, X, Y)
class ResamplingAlgorithms(Enum):
    """Registry of resampling algorithms as ``(display name, sampler instance)`` pairs."""

    # Over-sampling and combined methods
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())

    # Under-sampling methods
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', InstanceHardnessThreshold(random_state=1))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the member whose display name equals ``name``, or ``RO`` if none matches."""
        for algorithm in ResamplingAlgorithms:
            if algorithm.value[0] == name:
                return algorithm
        return ResamplingAlgorithms.RO
def test_cnn_fit_sample_with_wrong_object():
    """A string passed as ``n_neighbors`` must make ``fit_sample`` raise ``ValueError``."""
    invalid_neighbors = 'rnd'
    sampler = CondensedNearestNeighbour(random_state=RND_SEED,
                                        n_neighbors=invalid_neighbors)
    assert_raises(ValueError, sampler.fit_sample, X, Y)
def Balance_classes(X_train, y_train, Sampling_Function):
    """Resample a training set with the sampler selected by name.

    Parameters
    ----------
    X_train, y_train
        Features and labels passed to the sampler's ``fit_sample``.
    Sampling_Function : str
        One of 'RandomUnderSampler', 'NearMiss1', 'NearMiss2', 'NearMiss3',
        'CondensedNearestNeighbour', 'EditedNearestNeighbours',
        'RepeatedEditedNearestNeighbours', 'TomekLinks', 'RandomOverSampler',
        'SMOTE', 'SMOTETomek', 'SMOTEENN', 'EasyEnsemble',
        'BalanceCascade_rf' or 'BalanceCascade_svm'.

    Returns
    -------
    tuple
        ``(X_train_res, y_train_res)`` as returned by ``fit_sample``.

    Raises
    ------
    ValueError
        If ``Sampling_Function`` is not one of the supported names.
    """
    if Sampling_Function == 'RandomUnderSampler':
        us = RandomUnderSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'NearMiss1':
        us = NearMiss(ratio=0.5, random_state=1, version=1, size_ngh=3)
    elif Sampling_Function == 'NearMiss2':
        us = NearMiss(ratio=0.5, random_state=1, version=2, size_ngh=3)
    elif Sampling_Function == 'NearMiss3':
        us = NearMiss(ratio=0.5, random_state=1, version=3, ver3_samp_ngh=3)
    elif Sampling_Function == 'CondensedNearestNeighbour':
        us = CondensedNearestNeighbour(random_state=1)
    elif Sampling_Function == 'EditedNearestNeighbours':
        us = EditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'RepeatedEditedNearestNeighbours':
        # This branch used to build a plain EditedNearestNeighbours
        # (copy-paste slip); use the sampler the name actually asks for.
        # Imported locally because the file-level imports are not guaranteed
        # to include it.
        from imblearn.under_sampling import RepeatedEditedNearestNeighbours
        us = RepeatedEditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'TomekLinks':
        us = TomekLinks(random_state=1)
    elif Sampling_Function == 'RandomOverSampler':
        us = RandomOverSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'SMOTE':
        us = SMOTE(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTETomek':
        us = SMOTETomek(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTEENN':
        us = SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5)
    elif Sampling_Function == 'EasyEnsemble':
        us = EasyEnsemble()
    elif Sampling_Function == 'BalanceCascade_rf':
        us = BalanceCascade(classifier='random-forest', random_state=1)
    elif Sampling_Function == 'BalanceCascade_svm':
        us = BalanceCascade(classifier='linear-svm', random_state=1)
    else:
        # Fail with a clear message instead of an unbound-local crash below.
        raise ValueError(
            'Unknown Sampling_Function: %r' % (Sampling_Function,))

    X_train_res, y_train_res = us.fit_sample(X_train, y_train)
    return X_train_res, y_train_res
def under_sample(X, y, sampler="RandomUnderSampler"):
    """Under-sample ``(X, y)`` with the sampler selected by name.

    Prints the available sampler names and the class counts before and
    after resampling. All samplers are built with their default parameters.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` from the chosen sampler's ``fit_resample``.
    """
    # Names of every supported sampler, handy when iterating over all of them.
    available = [
        'RandomUnderSampler',
        'ClusterCentroids',
        'NearMiss',
        'InstanceHardnessThreshold',
        'CondensedNearestNeighbour',
        'EditedNearestNeighbours',
        'RepeatedEditedNearestNeighbours',
        'AllKNN',
        'NeighbourhoodCleaningRule',
        'OneSidedSelection',
    ]
    print(available)

    # Lookup table used to pick the sampler requested by the caller.
    registry = {
        "RandomUnderSampler": RandomUnderSampler(),
        "ClusterCentroids": ClusterCentroids(),
        "NearMiss": NearMiss(),
        "InstanceHardnessThreshold": InstanceHardnessThreshold(),
        "CondensedNearestNeighbour": CondensedNearestNeighbour(),
        "EditedNearestNeighbours": EditedNearestNeighbours(),
        "RepeatedEditedNearestNeighbours": RepeatedEditedNearestNeighbours(),
        "AllKNN": AllKNN(),
        "NeighbourhoodCleaningRule": NeighbourhoodCleaningRule(),
        "OneSidedSelection": OneSidedSelection(),
    }
    chosen = registry[sampler]

    # Report the class distribution on both sides of the resampling step.
    print("before", sorted(Counter(y).items()))
    X_resampled, y_resampled = chosen.fit_resample(X, y)
    print("after", sorted(Counter(y_resampled).items()))

    print('===' * 4, 'under_sample finished')
    return X_resampled, y_resampled
def test_continuous_error():
    """Fitting on a continuous target must emit a ``UserWarning``."""
    continuous_target = np.linspace(0, 1, 20)
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    assert_warns(UserWarning, sampler.fit, X, continuous_target)
def test_cnn_init():
    """A freshly built sampler must expose the expected default attributes."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)

    assert_equal(sampler.n_seeds_S, 1)
    assert_equal(sampler.n_jobs, 1)
def Resampling(train_x, train_y, resampling_method):
    """Label-encode the target and resample the training data in place.

    ``train_x.data`` and ``train_y.data`` are overwritten with the result of
    the selected sampler's ``fit_resample``; nothing is returned.

    Parameters
    ----------
    train_x, train_y
        Objects exposing a ``data`` attribute holding features / labels.
    resampling_method : str
        Under-sampling: "ClusterCentroids", "CondensedNearestNeighbour",
        "EditedNearestNeighbours", "RepeatedEditedNearestNeighbours",
        "AllKNN", "NearMiss", "NeighbourhoodCleaningRule",
        "RandomUnderSampler", "TomekLinks".
        Over-sampling: "BorderlineSMOTE", "KMeansSMOTE",
        "RandomOverSampler", "SMOTE".

    Raises
    ------
    ValueError
        If ``resampling_method`` is not supported. The inputs are left
        untouched in that case.
    """
    # Pick the sampler first so an unknown name fails before any mutation.
    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)
    elif resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)
    elif resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7, kind_sel='mode', n_jobs=-1)
    elif resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7, kind_sel='mode', n_jobs=-1)
    elif resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7, kind_sel='mode', allow_minority=True, n_jobs=-1)
    elif resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)
    elif resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')
    elif resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)
    elif resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)
    # ---- OVER-SAMPLING ------ #
    elif resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)
    elif resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)
    elif resampling_method == "RandomOverSampler":
        # This branch used to test for "RandomUnderSampler", which silently
        # replaced the under-sampler with an over-sampler.
        resample = RandomOverSampler(random_state=42)
    elif resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)
    else:
        raise ValueError(
            "Unsupported resampling_method: %r" % (resampling_method,))

    train_y.data = LabelEncoder().fit_transform(train_y.data)

    # summarize distribution
    # Uncomment the following line to display the pie chart of the class
    # distribution before resampling.
    # plotGraphics.piePlot(train_y, "Before Resampling")

    # transform the dataset
    train_x.data, train_y.data = resample.fit_resample(train_x.data, train_y.data)
def test_cnn_fit_single_class():
    """Fitting on a target that contains a single class must emit a ``UserWarning``."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)

    # Deliberately degenerate target: every sample belongs to class 0.
    only_zeros = np.zeros((X.shape[0], ))
    assert_warns(UserWarning, sampler.fit, X, only_zeros)
def votingClassifier():
    """Train a soft-voting ensemble (random forest + KNN) and report its test metrics.

    Reads the module-level ``train_x``, ``train_Y``, ``test_x`` and
    ``test_Y``; prints the training time, confusion matrix, error, accuracy
    and classification report, then shows a confusion-matrix plot.
    """
    print(colored("------Voting Classification-------", 'red'))

    # models
    random_forest = RandomForestClassifier(criterion='entropy', max_depth=30,
                                           n_estimators=48, random_state=0)
    clf_knn = KNeighborsClassifier(n_neighbors=7)

    # build classifier
    model = VotingClassifier(estimators=[('rf', random_forest), ('knn', clf_knn)],
                             voting='soft', n_jobs=-1, weights=[2, 1])
    print("Training the Voting classification.......")

    # Time only the fit itself; the unused sampler and logistic-regression
    # objects that used to be created here have been removed.
    starttime = timeit.default_timer()
    model.fit(train_x, train_Y)
    print("The time difference is :", timeit.default_timer() - starttime)

    print("Predicting test data.......")
    y_pred = model.predict(test_x)

    # results
    c_matrix = confusion_matrix(test_Y, y_pred)
    error = zero_one_loss(test_Y, y_pred)
    score = accuracy_score(test_Y, y_pred)

    # display results
    print('Confusion Matrix\n---------------------------\n', c_matrix)
    print('---------------------------')
    print("Error: {:.4f}%".format(error * 100))
    print("Accuracy Score: {:.4f}%".format(score * 100))
    print(classification_report(test_Y, y_pred))
    # Per-class accuracy: diagonal of the matrix over the row totals.
    print('accuracy: ', c_matrix.diagonal() / c_matrix.sum(axis=1))

    # Plot non-normalized confusion matrix
    plot_confusion_matrix(model, test_x, test_Y, cmap=plt.cm.Greens,
                          values_format='.0f', xticks_rotation='horizontal')
    plt.title("Confusion Matrix for Voting Classifier")
    plt.show()
def get_data(force_reload=False, strategy='oversampling', test_size=0.15):
    """Return train/validation arrays, resampling the training split per ``strategy``.

    Results are cached as ``.npy`` files under ``DATA_DIR``, keyed by
    ``strategy``. The cache is used when all four files exist and
    ``force_reload`` is false; otherwise ``train.csv`` is read, split,
    resampled and the cache is rewritten.

    Parameters
    ----------
    force_reload : bool
        Ignore any cached files and rebuild them.
    strategy : str
        'oversampling', 'combine', 'undersampling' or
        'condensed-undersampling'; any other value skips resampling.
    test_size : float
        Validation fraction passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)``.
    """
    def cache_file(stem):
        return os.path.join(DATA_DIR, '{}.{}.npy'.format(stem, strategy))

    cache_files = [cache_file(stem) for stem in
                   ('train_data', 'train_labels', 'val_data', 'val_labels')]

    if not force_reload and all(os.path.exists(path) for path in cache_files):
        X_train, y_train, X_val, y_val = [np.load(path) for path in cache_files]
        return X_train, X_val, y_train, y_val

    train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
    X, y = to_data_format(train_df)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size)
    print('Shapes before: {}, {}'.format(X_train.shape, y_train.shape))

    # Only the training split is resampled; validation data stays untouched.
    resampler = None
    if strategy == 'oversampling':
        resampler = SMOTE(n_jobs=n_jobs)
    elif strategy == 'combine':
        smote = SMOTE(n_jobs=n_jobs)
        enn = EditedNearestNeighbours(n_jobs=n_jobs)
        resampler = SMOTEENN(smote=smote, enn=enn)
    elif strategy == 'undersampling':
        resampler = EditedNearestNeighbours(n_jobs=n_jobs)
    elif strategy == 'condensed-undersampling':
        resampler = CondensedNearestNeighbour(n_jobs=n_jobs, n_neighbors=3)
    if resampler is not None:
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    print('Shapes after: {}, {}'.format(X_train.shape, y_train.shape))

    for path, array in zip(cache_files, (X_train, y_train, X_val, y_val)):
        np.save(path, array)
    return X_train, X_val, y_train, y_val
def resample(self, X, y, by, random_state=None, visualize=False):
    '''
    Re-sample ``(X, y)`` with the method named by ``by``.

    by: String
        The method used to perform re-sampling
        currently support: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
        'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN', 'SMOTETomek',
        'ORG']; 'ORG' returns the data unchanged.
    random_state:
        Seed forwarded to the selected sampler.
    visualize: bool
        When truthy, scatter-plot the first two columns of the result,
        coloured by label.

    Returns ``(X_train, y_train)``.
    '''
    # Lazy factories: only the requested sampler is ever constructed.
    factories = {
        'RUS': lambda: RandomUnderSampler(random_state=random_state),
        'CNN': lambda: CondensedNearestNeighbour(random_state=random_state),
        'ENN': lambda: EditedNearestNeighbours(random_state=random_state),
        'NCR': lambda: NeighbourhoodCleaningRule(random_state=random_state),
        'Tomek': lambda: TomekLinks(random_state=random_state),
        'ALLKNN': lambda: AllKNN(random_state=random_state),
        'OSS': lambda: OneSidedSelection(random_state=random_state),
        'NM': lambda: NearMiss(random_state=random_state),
        'CC': lambda: ClusterCentroids(random_state=random_state),
        'SMOTE': lambda: SMOTE(random_state=random_state),
        'ADASYN': lambda: ADASYN(random_state=random_state),
        'BorderSMOTE': lambda: BorderlineSMOTE(random_state=random_state),
        'SMOTEENN': lambda: SMOTEENN(random_state=random_state),
        'SMOTETomek': lambda: SMOTETomek(random_state=random_state),
    }

    if by == 'ORG':
        X_train, y_train = X, y
    elif by in factories:
        X_train, y_train = factories[by]().fit_resample(X, y)
    else:
        raise Error('Unexpected \'by\' type {}'.format(by))

    if visualize:
        df = pd.DataFrame(X_train)
        df['label'] = y_train
        df.plot.scatter(x=0, y=1, c='label', s=3, colormap='coolwarm',
                        title='{} training set'.format(by))
    return X_train, y_train
def train_stage(df_path, cb_path):
    """Run 15-fold cross-validated CatBoost training on CNN-condensed folds.

    Each fold's training part is under-sampled with
    ``CondensedNearestNeighbour`` before ``fit_cb`` is called; out-of-fold
    predictions are accumulated and their ROC AUC is printed.

    Parameters
    ----------
    df_path
        CSV file with ``ID_code`` and ``target`` columns plus the features.
    cb_path
        Forwarded to ``fit_cb``.

    Returns
    -------
    int
        Always ``0``.
    """
    print('Load Train Data.')
    df = pd.read_csv(df_path)
    print('\nShape of Train Data: {}'.format(df.shape))

    y_df = np.array(df['target'])
    df_ids = np.array(df.index)
    df.drop(['ID_code', 'target'], axis=1, inplace=True)

    cb_cv_result = np.zeros(df.shape[0])

    # No random_state here: it has no effect without shuffling and current
    # scikit-learn rejects the combination with a ValueError.
    skf = StratifiedKFold(n_splits=15, shuffle=False)

    # sm = TomekLinks(random_state=42)
    sm = CondensedNearestNeighbour(random_state=42, n_jobs=3)

    print('\nModel Fitting...')
    for counter, ids in enumerate(skf.split(df_ids, y_df)):
        print('\nFold {}'.format(counter + 1))
        fit_idx, val_idx = ids[0], ids[1]
        X_fit, y_fit = df.values[fit_idx], y_df[fit_idx]
        X_val, y_val = df.values[val_idx], y_df[val_idx]

        # Only the training part of the fold is resampled.
        X_fit, y_fit = sm.fit_sample(X_fit, y_fit)

        print('CatBoost')
        cb_cv_result[val_idx] += fit_cb(X_fit, y_fit, X_val, y_val, counter,
                                        cb_path, name='cb')

        # Release the fold data before the next iteration allocates more.
        del X_fit, X_val, y_fit, y_val
        gc.collect()

    auc_cb = round(roc_auc_score(y_df, cb_cv_result), 4)
    print('Catboost VAL AUC: {}'.format(auc_cb))
    return 0
def readFile(path, y_label, method, encode_features=[], skew_exempted=[], training_ratio=0.7, shuffle=True, needSkew=False, fea_eng=True):
    """Load a CSV, preprocess it, optionally rebalance it and split it.

    Steps: optional row shuffle, optional ``log1p`` on numeric columns whose
    skew exceeds 0.75, one-hot encoding of ``encode_features``, mean
    imputation, optional resampling and the final train/test split.

    Parameters
    ----------
    path
        CSV file to read.
    y_label
        Name of the target column.
    method
        'OverSample' applies ADASYN, 'UnderSample' applies
        CondensedNearestNeighbour; any other value leaves the data as is.
    encode_features
        Columns passed to ``pd.get_dummies``.
    skew_exempted
        Numeric columns excluded from the skew correction.
    training_ratio
        Forwarded to ``split``.
    shuffle, needSkew
        Toggles for the shuffle and skew-correction steps.
    fea_eng
        Accepted but not used by this function.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by ``split``.
    """
    frame = pd.read_csv(path)

    if shuffle:
        frame = frame.sample(frac=1).reset_index(drop=True)

    if needSkew:
        numeric_cols = frame.dtypes[frame.dtypes != "object"].index.drop(skew_exempted)
        skewness = frame[numeric_cols].apply(lambda col: skew(col.dropna()))
        heavy_tailed = skewness[skewness > 0.75].index
        # log1p compresses the long right tail of strongly skewed columns.
        frame[heavy_tailed] = np.log1p(frame[heavy_tailed])

    frame = pd.get_dummies(frame, columns=encode_features)
    frame = frame.fillna(frame.mean())

    X = frame.drop(y_label, axis=1)
    y = frame[y_label]

    if method == 'OverSample':
        X, y = ADASYN(random_state=42).fit_resample(X, y)
    elif method == 'UnderSample':
        X, y = CondensedNearestNeighbour(random_state=42).fit_resample(X, y)

    X_train, X_test, y_train, y_test = split(X, y, training_ratio)
    return X_train, X_test, y_train, y_test
def test_cnn_fit_sample_with_indices():
    """With ``return_indices=True`` the data and the indices must match the hard-coded references."""
    expected_X = np.array([[-0.10903849, -0.12085181],
                           [0.01936241, 0.17799828],
                           [0.05230552, 0.09043907],
                           [-1.25020462, -0.40402054],
                           [0.70524765, 0.39816382],
                           [0.35831463, 1.33483198],
                           [-0.284881, -0.62730973],
                           [0.03394306, 0.03986753],
                           [-0.01252787, 0.34102657],
                           [0.15198585, 0.12512646]])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    expected_idx = np.array([4, 11, 17, 12, 19, 9, 5, 7, 14, 18])

    sampler = CondensedNearestNeighbour(return_indices=True,
                                        random_state=RND_SEED)
    X_out, y_out, idx_out = sampler.fit_sample(X, Y)

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
    assert_array_equal(idx_out, expected_idx)
def resample(self, X, y, by, random_state=None):
    '''
    Re-sample ``(X, y)`` with the method named by ``by``.

    by: String
        The method used to perform re-sampling
        currently support: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
        'NM', 'CC', 'ROS', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN',
        'SMOTETomek', 'ORG']; 'ORG' returns the data unchanged.
    random_state:
        Seed forwarded to the selected sampler.

    Returns ``(X_train, y_train)``.
    '''
    # 'ORG' means "original data": nothing to build, nothing to resample.
    if by == 'ORG':
        return X, y

    # Lazy factories: only the requested sampler is ever constructed.
    factories = {
        'RUS': lambda: RandomUnderSampler(random_state=random_state),
        'CNN': lambda: CondensedNearestNeighbour(random_state=random_state),
        'ENN': lambda: EditedNearestNeighbours(random_state=random_state),
        'NCR': lambda: NeighbourhoodCleaningRule(random_state=random_state),
        'Tomek': lambda: TomekLinks(random_state=random_state),
        'ALLKNN': lambda: AllKNN(random_state=random_state),
        'OSS': lambda: OneSidedSelection(random_state=random_state),
        'NM': lambda: NearMiss(random_state=random_state),
        'CC': lambda: ClusterCentroids(random_state=random_state),
        'ROS': lambda: RandomOverSampler(random_state=random_state),
        'SMOTE': lambda: SMOTE(random_state=random_state),
        'ADASYN': lambda: ADASYN(random_state=random_state),
        'BorderSMOTE': lambda: BorderlineSMOTE(random_state=random_state),
        'SMOTEENN': lambda: SMOTEENN(random_state=random_state),
        'SMOTETomek': lambda: SMOTETomek(random_state=random_state),
    }
    if by not in factories:
        raise Error('Unexpected \'by\' type {}'.format(by))

    X_train, y_train = factories[by]().fit_resample(X, y)
    return X_train, y_train
def test_cnn_init():
    """A freshly built sampler must expose its constructor arguments and empty fit state."""
    verbose = True
    sampler = CondensedNearestNeighbour(random_state=RND_SEED, verbose=verbose)

    # Constructor arguments and defaults.
    assert_equal(sampler.size_ngh, 1)
    assert_equal(sampler.n_seeds_S, 1)
    assert_equal(sampler.n_jobs, -1)
    assert_equal(sampler.random_state, RND_SEED)
    assert_equal(sampler.verbose, verbose)
    # Nothing has been fitted yet.
    assert_equal(sampler.min_c_, None)
    assert_equal(sampler.maj_c_, None)
    assert_equal(sampler.stats_c_, {})
def UnderSample(X, Y, method='Random', random_state=42):
    """Under-sample ``(X, Y)`` with the method selected by name.

    Parameters
    ----------
    X : array
        Features; a one-dimensional array is reshaped to a single column.
    Y : array-like
        Labels.
    method : str
        One of 'Cluster', 'Random', 'NearMiss_1', 'NearMiss_2', 'NearMiss_3',
        'TomekLinks', 'ENN', 'RENN', 'All_KNN', 'CNN', 'One_SS', 'NCR', 'IHT'.
    random_state : int
        Seed forwarded to the sampler.

    Returns
    -------
    tuple
        ``(X_resampled, Y_resampled)`` as returned by ``fit_sample``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if X.size == len(X):
        X = X.reshape(-1, 1)

    # Names are compared by value (==). The previous identity test (`is`)
    # only worked when the caller's string happened to be interned.
    if method == 'Cluster':
        # default k-means estimator
        sampler = ClusterCentroids(ratio='auto', random_state=random_state, estimator=None)
    elif method == 'Random':
        sampler = RandomUnderSampler(ratio='auto', random_state=random_state, replacement=False)
    elif method == 'NearMiss_1':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=1)
    elif method == 'NearMiss_2':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=2)
    elif method == 'NearMiss_3':
        sampler = NearMiss(ratio='auto', random_state=random_state, version=3)
    elif method == 'TomekLinks':
        sampler = TomekLinks(ratio='auto', random_state=random_state)
    elif method == 'ENN':
        # kind_sel may be 'all' or 'mode'
        sampler = EditedNearestNeighbours(ratio='auto', random_state=random_state, kind_sel='all')
    elif method == 'RENN':
        # kind_sel may be 'all' or 'mode'
        sampler = RepeatedEditedNearestNeighbours(ratio='auto', random_state=random_state, kind_sel='all')
    elif method == 'All_KNN':
        sampler = AllKNN(ratio='auto', random_state=random_state, kind_sel='all')
    elif method == 'CNN':
        sampler = CondensedNearestNeighbour(ratio='auto', random_state=random_state)
    elif method == 'One_SS':
        sampler = OneSidedSelection(ratio='auto', random_state=random_state)
    elif method == 'NCR':
        sampler = NeighbourhoodCleaningRule(ratio='auto', random_state=random_state,
                                            kind_sel='all', threshold_cleaning=0.5)
    elif method == 'IHT':
        sampler = InstanceHardnessThreshold(estimator=None, ratio='auto', random_state=random_state)
    else:
        # Fail with a clear message instead of an unbound-local crash below.
        raise ValueError('Unknown under-sampling method: %r' % (method,))

    X_resampled, Y_resampled = sampler.fit_sample(X, Y)
    return X_resampled, Y_resampled
def equalize_training_dataset_with_CondensedNN(x_train, y_train):
    """Under-sample a multi-dimensional training set with CondensedNearestNeighbour.

    The samples are flattened to 2-D for the sampler and restored to their
    original trailing shape afterwards. The resulting class counts are
    printed.

    Parameters
    ----------
    x_train : ndarray
        Samples along the first axis; any trailing shape.
    y_train : array-like
        Labels. Classes ``0..42`` are targeted by the sampler.

    Returns
    -------
    tuple
        ``(x_resampled, y_resampled)``.
    """
    from imblearn.under_sampling import CondensedNearestNeighbour

    original_shape = list(x_train.shape)

    # The sampler works on 2-D input: one flat row per sample.
    flat = np.reshape(x_train, (x_train.shape[0], -1))

    # A dict of per-class counts is not accepted for cleaning samplers in
    # current imbalanced-learn (and the counts were not honoured before
    # that), so the targeted classes are given as a list instead.
    # NOTE(review): if a fixed 180 samples per class is really required, a
    # different sampler is needed -- confirm intent with the author.
    sampler = CondensedNearestNeighbour(sampling_strategy=list(range(0, 43)),
                                        n_neighbors=5, n_jobs=8)
    x_resampled, y_resampled = sampler.fit_resample(flat, y_train)
    print(sorted(Counter(y_resampled).items()))

    # Restore the trailing dimensions with the new sample count.
    original_shape[0] = x_resampled.shape[0]
    x_resampled = np.reshape(x_resampled, tuple(original_shape))
    return x_resampled, y_resampled
def under_sampling_algs():
    """Return the under-sampling candidates as ``(sampler, short name)`` pairs.

    The first entry is a placeholder pair of strings standing for the
    "no re-sampling" baseline.
    """
    return [
        ("No Rs Undersampling case", "No Re-sampling"),
        (RandomUnderSampler(random_state=1), 'RU'),
        (ClusterCentroids(random_state=1), 'CC'),
        (TomekLinks(), 'TL'),
        (NearMiss(version=1), 'NM1'),
        (NearMiss(version=2), 'NM2'),
        (NearMiss(version=3), 'NM3'),
        (CondensedNearestNeighbour(random_state=1), 'CNN'),
        (OneSidedSelection(random_state=1), 'OSS'),
        (EditedNearestNeighbours(), 'ENN'),
        (NeighbourhoodCleaningRule(), 'NCL'),
        (InstanceHardnessThreshold(random_state=1), 'IHT'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (AllKNN(), 'AllKNN'),
    ]
def load_data(mode: str, normalize: bool = True):
    """Load the dataset, optionally rebalance it and standardise the features.

    Parameters
    ----------
    mode : str
        'vanilla' (no resampling), 'smote', 'adasyn', 'bordersmote',
        'randomover', 'randomunder', 'tomek' or 'knn'. Any other value
        behaves like 'vanilla'.
    normalize : bool
        When true, features are centred and scaled per column.

    Returns
    -------
    tuple
        ``(x, y, hidden)``. ``hidden`` is re-indexed only for the modes whose
        sampler returns the selected indices.
    """
    df, hidden_df = __load_data_first_time()

    # Split target and features.
    y = np.array(df['earnings'].to_numpy(), dtype=int)
    del df['earnings']
    x = np.array(df.to_numpy(), dtype=float)
    hidden = hidden_df.to_numpy()

    # Samplers that generate synthetic rows (no index mapping available).
    synthetic = {
        'smote': lambda: SMOTE(),
        'adasyn': lambda: ADASYN(),
        'bordersmote': lambda: BorderlineSMOTE(),
    }
    # Samplers that pick existing rows and can report which ones.
    selecting = {
        'randomover': lambda: RandomOverSampler(return_indices=True),
        'randomunder': lambda: RandomUnderSampler(return_indices=True),
        'tomek': lambda: TomekLinks(return_indices=True),
        'knn': lambda: CondensedNearestNeighbour(return_indices=True, n_neighbors=3),
    }

    if mode in synthetic:
        x, y = synthetic[mode]().fit_sample(x, y)
    elif mode in selecting:
        x, y, kept = selecting[mode]().fit_sample(x, y)
        hidden = hidden[kept]

    if normalize:
        x -= np.mean(x, axis=0)
        x /= np.std(x, axis=0)
    return x, y, hidden
def test_cnn_fit_sample_with_wrong_object():
    """A string passed as ``n_neighbors`` must raise ``ValueError`` with the expected message."""
    invalid_neighbors = 'rnd'
    sampler = CondensedNearestNeighbour(random_state=RND_SEED,
                                        n_neighbors=invalid_neighbors)
    with raises(ValueError, match="has to be a int or an "):
        sampler.fit_sample(X, Y)
palette = sns.color_palette() # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Condensed Nearest Neighbours cnn = CondensedNearestNeighbour() X_resampled, y_resampled = cnn.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)