def test_sample_regular_half():
    """Check SMOTETomek resampling with regular SMOTE and a ratio of 0.8."""
    # Expected resampled features and labels.
    expected_X = np.array([
        [0.20622591, 0.0582794],
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [0.97407872, 0.44454207],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
        [0.45784496, -0.1053161],
    ])
    expected_y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0])

    sampler = SMOTETomek(ratio=0.8, random_state=RND_SEED)
    # The explicit fit precedes fit_sample, exactly as the test has always run.
    sampler.fit(X, Y)
    resampled_X, resampled_y = sampler.fit_sample(X, Y)

    assert_array_almost_equal(resampled_X, expected_X)
    assert_array_equal(resampled_y, expected_y)
def test_validate_estimator_deprecation():
    """Check that the legacy ``n_jobs`` and ``k`` keywords are still handled.

    Both legacy spellings must produce the same resampled data.
    """
    expected_X = np.array([
        [0.20622591, 0.0582794],
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [0.97407872, 0.44454207],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
        [0.38307743, -0.05670439],
        [0.93976473, -0.06570176],
        [0.70319159, -0.02571668],
        [0.75052536, -0.19246517],
    ])
    expected_y = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0])

    # Each sampler is built right before it is used, one legacy keyword at a
    # time.
    for legacy_kwargs in ({'n_jobs': -1}, {'k': 5}):
        sampler = SMOTETomek(random_state=RND_SEED, **legacy_kwargs)
        resampled_X, resampled_y = sampler.fit_sample(X, Y)
        assert_array_almost_equal(resampled_X, expected_X)
        assert_array_equal(resampled_y, expected_y)
def outer_cv_loop(Xdata, Ydata, clf, parameters=None, n_splits=10,
                  test_size=0.25):
    """Score ``clf`` over stratified shuffle splits with imputation.

    For every split the train and test features are imputed separately with
    ``fancyimpute.SoftImpute``; the training fold is rebalanced with
    SMOTETomek when its label mean is more than 0.2 away from 0.5.

    Args:
        Xdata: Feature array, indexed as ``Xdata[rows, :]``.
        Ydata: Label array, indexed as ``Ydata[rows]``.
        clf: Estimator exposing ``fit``, ``predict`` and
            ``feature_importances_``.
        parameters: Accepted for interface compatibility; not used here.
        n_splits: Number of shuffle splits.
        test_size: Fraction of samples held out per split.

    Returns:
        ``(rocscores, importances)``. ``rocscores`` has one entry per split
        (``numpy.nan`` when the test labels or the predictions are constant).
        ``importances`` has one entry per split that was actually fitted, so
        it can be shorter than ``rocscores``.
    """
    importances = []
    rocscores = []
    kf = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)
    for train, test in kf.split(Xdata, Ydata):
        Ytest = Ydata[test]
        # filter out bad folds: AUC is undefined with a single test class
        if numpy.var(Ytest) == 0:
            # The original message referenced an undefined name (`varname`)
            # and crashed with NameError instead of skipping the fold.
            print('zero variance')
            rocscores.append(numpy.nan)
            continue

        Ytrain = Ydata[train]
        Xtrain = fancyimpute.SoftImpute(verbose=False).complete(Xdata[train, :])
        Xtest = fancyimpute.SoftImpute(verbose=False).complete(Xdata[test, :])
        if numpy.abs(numpy.mean(Ytrain) - 0.5) > 0.2:
            smt = SMOTETomek()
            Xtrain, Ytrain = smt.fit_sample(Xtrain.copy(), Ytrain)

        clf.fit(Xtrain, Ytrain)
        pred = clf.predict(Xtest)
        if numpy.var(pred) > 0:
            rocscores.append(roc_auc_score(Ytest, pred))
        else:
            rocscores.append(numpy.nan)
        importances.append(clf.feature_importances_)
    return rocscores, importances
def test_sample_wrong_X():
    """Check that sampling data other than the fitted data raises."""
    fitted = SMOTETomek(random_state=RND_SEED)
    fitted.fit(X, Y)

    # Data with a different shape from what the sampler was fitted on.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, fitted.sample, other_X, other_y)
def test_smote_fit():
    """Check the class statistics recorded by ``fit``."""
    fitted = SMOTETomek(random_state=RND_SEED)
    fitted.fit(X, Y)

    # Minority/majority labels, then the per-class sample counts.
    assert_equal(fitted.min_c_, 0)
    assert_equal(fitted.maj_c_, 1)
    for label, count in ((0, 8), (1, 12)):
        assert_equal(fitted.stats_c_[label], count)
def test_multiclass_error():
    """Check that fitting on a non-binary target emits a UserWarning."""
    non_binary_targets = (
        np.linspace(0, 1, 20),                    # continuous case
        np.array([0] * 3 + [1] * 2 + [2] * 15),   # multiclass case
    )
    for target in non_binary_targets:
        sampler = SMOTETomek(random_state=RND_SEED)
        assert_warns(UserWarning, sampler.fit, X, target)
def prep_data(self, test_ratio, smoteenn, smotomek):
    """Split ``self.X``/``self.y`` and optionally rebalance the training part.

    Args:
        test_ratio: Passed to ``train_test_split`` as ``test_size``.
        smoteenn: When truthy, resample the training split with SMOTEENN.
        smotomek: When truthy, resample the training split with SMOTETomek.
            If both flags are set, SMOTEENN runs first and SMOTETomek is
            applied to its output.

    Returns:
        ``(X_train, X_test, y_train, y_test)``; the test split is never
        resampled.
    """
    split = train_test_split(
        self.X, self.y, test_size=test_ratio, random_state=4)
    train_X, test_X, train_y, test_y = split

    if smoteenn:
        train_X, train_y = SMOTEENN(random_state=1).fit_resample(
            train_X, train_y)

    if smotomek:
        train_X, train_y = SMOTETomek(random_state=1).fit_resample(
            train_X, train_y)

    return train_X, test_X, train_y, test_y
def smote_tomek(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True,
                pie_evr=True):
    """Resample ``X``/``y`` with SMOTETomek and optionally plot the result.

    Args:
        X: Feature matrix.
        y: Target labels.
        visualize: When truthy, plot the resampled class histogram and the
            PCA views.
        pca2d: Forwarded to ``pca_general`` as ``d2``.
        pca3d: Forwarded to ``pca_general`` as ``d3``.
        tsne: Accepted for interface compatibility; not used here.
        pie_evr: Forwarded to ``pca_general``.

    Returns:
        ``(X_res, y_res)``, the resampled features and labels.
    """
    smt = SMOTETomek(random_state=42)
    X_res, y_res = smt.fit_resample(X, y)
    # Truthiness test instead of `== True` (PEP 8).
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def load_kinematics_and_labels(data_dir, trial_name):
    """Load a trial's kinematics, attach its labels and rebalance the rows.

    Args:
        data_dir: A string.
        trial_name: A string.

    Returns:
        A 2-D NumPy array made of the SMOTETomek-resampled kinematics with
        the resampled labels appended as the last column.
    """
    print('TRIAL NAME:', trial_name)
    kinematics = load_kinematics(data_dir, trial_name)

    # The capture suffix is stripped before matching against
    # df_labels['filename'].
    lookup_name = trial_name.replace('_capture1', '').replace('_capture2', '')
    matching_rows = df_labels['filename'].str.match(lookup_name)
    labels = np.array(df_labels.loc[matching_rows, ['label']])

    # 'Suturing_G001' trials use an explicit factor of 8; every other trial
    # relies on downsample's own default.
    downsample_kwargs = {'factor': 8} if 'Suturing_G001' in lookup_name else {}
    kinematics = downsample(kinematics, **downsample_kwargs)
    labels = downsample(labels, **downsample_kwargs)
    print(kinematics.shape, labels.shape)

    balancer = SMOTETomek(sampling_strategy='auto', ratio=sample(labels))
    X_smt, y_smt = balancer.fit_sample(kinematics, labels)
    # Make the labels a column so they can be concatenated to the features.
    y_smt = np.expand_dims(y_smt, axis=1)
    print('X_smt.shape:', X_smt.shape, y_smt.shape)

    return np.concatenate([X_smt, y_smt], axis=1)
def sampling(train, target='failure', n_clusters=10):
    """Resample each failure-rich cluster of ``train`` with SMOTETomek.

    The feature columns are clustered with agglomerative clustering. A
    cluster is resampled when it holds at least 10% of all positive
    ``target`` values; the resampled clusters are stacked into one frame.

    The previous implementation could not run: it read ``clustering.labels``
    (the attribute is ``labels_``), referenced an undefined ``df``, mixed the
    column names 'failure' and 'failures', used a Series as an ``if``
    condition, assigned into an empty list by index and concatenated tuples.

    Args:
        train: DataFrame with feature columns plus the ``target`` column.
        target: Name of the label column.
        n_clusters: Number of clusters to form.

    Returns:
        DataFrame with the feature columns and ``target`` for the resampled
        clusters only. NOTE(review): as in the original, clusters below the
        10% threshold are left out of the result -- confirm this is intended.
        The input frame is not modified.
    """
    features = train.drop(columns=target)
    cluster_ids = AgglomerativeClustering(
        n_clusters=n_clusters).fit(features).labels_
    threshold = train[target].sum() * .1

    smt = SMOTETomek(ratio='auto')
    frames = []
    for i in range(n_clusters):
        in_cluster = cluster_ids == i
        hold_y = train.loc[in_cluster, target]
        if hold_y.sum() < threshold:
            continue
        X_res, y_res = smt.fit_sample(features.loc[in_cluster], hold_y)
        resampled = pd.DataFrame(X_res, columns=features.columns)
        # list() avoids index alignment if y_res is returned as a Series.
        resampled[target] = list(y_res)
        frames.append(resampled)

    if not frames:
        # pd.concat raises on an empty list; return an empty, typed frame.
        return pd.DataFrame(columns=list(features.columns) + [target])
    return pd.concat(frames, ignore_index=True)
def test_sample_regular_half():
    """Check SMOTETomek resampling with a ratio of 0.8."""
    expected_X = np.array([
        [0.68481731, 0.51935141],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
        [0.45784496, -0.1053161],
    ])
    expected_y = np.array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0])

    sampler = SMOTETomek(ratio=0.8, random_state=RND_SEED)
    resampled_X, resampled_y = sampler.fit_sample(X, Y)

    assert_allclose(resampled_X, expected_X, rtol=R_TOL)
    assert_array_equal(resampled_y, expected_y)
def rm_main(data):
    """Rebalance ``data`` with SMOTETomek and return it as a DataFrame.

    Args:
        data: DataFrame whose class column is named ``CONST_CLASS_LABEL``;
            every other column is treated as a feature.

    Returns:
        DataFrame with the same columns, in the same order, as ``data``,
        holding the resampled rows.
    """
    is_label = data.columns == CONST_CLASS_LABEL
    X = np.array(data.loc[:, ~is_label])  # all of the features
    y = np.array(data.loc[:, is_label])   # the class column (2-D here)

    # Performs SMOTE and Tomek; for SMOTE only, use SMOTE(random_state=2).
    smt = SMOTETomek(random_state=2)
    feature_train_res, class_train_res = smt.fit_sample(X, y.ravel())

    full_training_set = np.column_stack((feature_train_res, class_train_res))
    # column_stack puts the class last, so the columns must be named in that
    # order. Reusing data.columns as-is mislabelled every column whenever the
    # class column was not the last one.
    stacked_columns = list(data.columns[~is_label]) + list(
        data.columns[is_label])
    df = pd.DataFrame(data=full_training_set, columns=stacked_columns)

    original_columns = list(data.columns)
    if stacked_columns != original_columns:
        # Restore the caller's column order.
        df = df[original_columns]
    return df
def smote_tomek(X, y):
    """Balance a data set with SMOTETomek.

    Args:
        X: Training set without the class target.
        y: Training set class target.

    Returns:
        The resampled ``(X, y)`` pair.
    """
    balancer = SMOTETomek(random_state=42, sampling_strategy='all')
    balanced_X, balanced_y = balancer.fit_resample(X, y)
    print('after balancing:', balanced_X.shape)
    return balanced_X, balanced_y
def Smote_Tomek(self):
    '''
    Resample copies of the training data with SMOTETomek (seed 2020) and
    store the result on the instance.

    Reads ``self.X_train`` and ``self.y_train``; writes
    ``self.X_train_balanced`` and ``self.y_train_balanced``.

    Returns
    -------
    None.

    '''
    # Work on copies so the stored training data is not handed to the sampler.
    features = self.X_train.copy()
    target = self.y_train.copy()
    balanced = SMOTETomek(random_state=2020).fit_resample(features, target)
    self.X_train_balanced, self.y_train_balanced = balanced
def test_validate_estimator_default():
    """Check the resampled output of SMOTETomek with default estimators."""
    expected_X = np.array([
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
        [0.38307743, -0.05670439],
        [0.70319159, -0.02571667],
        [0.75052536, -0.19246518],
    ])
    expected_y = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])

    sampler = SMOTETomek(random_state=RND_SEED)
    resampled_X, resampled_y = sampler.fit_sample(X, Y)

    assert_allclose(resampled_X, expected_X, rtol=R_TOL)
    assert_array_equal(resampled_y, expected_y)
def _resample_and_save(switch, non_switch, switch_path, non_switch_path):
    """Rebalance one switch/non-switch pair and save each class to disk."""
    sampler = SMOTETomek(sampling_strategy='all', smote=SMOTE(n_jobs=4),
                         tomek=TomekLinks(n_jobs=4))
    X = np.concatenate((switch, non_switch))
    # Label 0 marks "switch" rows, label 1 marks "non switch" rows.
    y = np.concatenate(
        (np.zeros(switch.shape[0]), np.ones(non_switch.shape[0])))
    X_res, y_res = sampler.fit_resample(X, y)

    is_switch = y_res == 0
    np.save(switch_path, X_res[is_switch])
    np.save(non_switch_path, X_res[~is_switch])


def resample():
    """Rebalance the saved train and test windows with SMOTETomek.

    Loads the four ``data/*_w_64_f_20.npy`` arrays, resamples the train pair
    and the test pair independently, and writes each class back to the
    matching ``*_samp.npy`` file.

    The test pass previously crashed with NameError: the sampler was bound
    to ``resampe_test`` but used as ``resample_test``.

    Returns:
        None.
    """
    test_switch = np.load('data/test_switch_w_64_f_20.npy')
    test_non_switch = np.load('data/test_non_switch_w_64_f_20.npy')
    train_switch = np.load('data/train_switch_w_64_f_20.npy')
    train_non_switch = np.load('data/train_non_switch_w_64_f_20.npy')

    print('Beginning train resample...')
    _resample_and_save(train_switch, train_non_switch,
                       'data/train_switch_w_64_f_20_samp.npy',
                       'data/train_non_switch_w_64_f_20_samp.npy')

    # NOTE(review): the test set is resampled too -- confirm this is intended.
    print('Beginning test resample...')
    _resample_and_save(test_switch, test_non_switch,
                       'data/test_switch_w_64_f_20_samp.npy',
                       'data/test_non_switch_w_64_f_20_samp.npy')
    return
def main_cv_loop(Xdata, Ydata, clf, parameters, n_folds=4,
                 oversample_thresh=0.1, verbose=False):
    """Run the outer cross-validation loop and score the pooled predictions.

    Each training fold is rebalanced with SMOTETomek when its label mean is
    more than 0.2 away from 0.5, then handed to ``inner_cv_loop`` to pick an
    estimator. That estimator's ``predict_proba`` output is written into the
    prediction vector at the test indices.

    The previous version referenced an undefined ``y`` and failed with
    NameError on its first use; it now uses ``Ydata`` throughout.

    Args:
        Xdata: Feature array, indexed as ``Xdata[rows, :]``.
        Ydata: Label array, indexed as ``Ydata[rows]``.
        clf: Estimator forwarded to ``inner_cv_loop``.
        parameters: Parameter grid forwarded to ``inner_cv_loop``.
        n_folds: Not used; the number of splits is fixed at 4.
        oversample_thresh: Not used; the per-fold threshold is fixed at 0.2.
        verbose: Print progress messages when truthy.

    Returns:
        ``(roc_auc, Ydata, pred)``, where ``roc_auc`` is the weighted ROC AUC
        of ``pred`` against ``Ydata``.
    """
    # use stratified shuffle splits to get roughly equal folds
    kf = StratifiedShuffleSplit(n_splits=4, test_size=0.2)

    pred = numpy.zeros(len(Ydata))  # predicted values
    # Collected per fold; not part of the return value.
    kernel = []
    C = []

    for train, test in kf.split(Xdata, Ydata):
        Xtrain = Xdata[train, :]
        Xtest = Xdata[test, :]
        Ytrain = Ydata[train]
        if numpy.abs(numpy.mean(Ytrain) - 0.5) > 0.2:
            if verbose:
                print('oversampling using SMOTETomek')
            sm = SMOTETomek()
            Xtrain, Ytrain = sm.fit_sample(Xtrain, Ytrain)

        best_estimator_, bestroc, fa = inner_cv_loop(
            Xtrain, Ytrain, clf, parameters, verbose=True)
        if fa is not None:
            if verbose:
                print('transforming using fa')
                print(fa)
            Xtest = fa.transform(Xtest)

        # NOTE(review): predict_proba usually returns one column per class;
        # confirm this flat assignment stores the intended probabilities.
        pred.flat[test] = best_estimator_.predict_proba(Xtest)
        kernel.append(best_estimator_.kernel)
        C.append(best_estimator_.C)

    return roc_auc_score(Ydata, pred, average='weighted'), Ydata, pred
def test_sample_regular():
    """Check SMOTETomek resampling against the stored reference arrays."""
    sampler = SMOTETomek(random_state=RND_SEED)
    sampler.fit(X, Y)
    resampled_X, resampled_y = sampler.fit_sample(X, Y)

    # Reference arrays live in the 'data' directory next to this file.
    here = os.path.dirname(os.path.abspath(__file__))
    expected_X = np.load(os.path.join(here, 'data', 'smote_tomek_reg_x.npy'))
    expected_y = np.load(os.path.join(here, 'data', 'smote_tomek_reg_y.npy'))

    assert_array_equal(resampled_X, expected_X)
    assert_array_equal(resampled_y, expected_y)
def my_BalancedSample(df, target, choice=1):
    """Return a rebalanced copy of ``df``.

    Args:
        df: Input DataFrame containing the ``target`` column.
        target: Name of the label column.
        choice: 1 selects SMOTETomek; any other value selects SMOTEENN.

    Returns:
        DataFrame holding the resampled feature columns followed by
        ``target``. Class counts are printed before and after resampling.
    """
    from imblearn.combine import SMOTETomek
    from imblearn.combine import SMOTEENN

    feature_cols = df.columns.difference([target])
    print('\nthe data originally has a shape--------->\n',
          df[target].value_counts())

    if choice == 1:
        resampler = SMOTETomek()
    else:
        resampler = SMOTEENN()
    features, labels = resampler.fit_sample(df[feature_cols], df[target])

    balanced = pd.DataFrame(features, columns=feature_cols)
    balanced[target] = labels
    print('\nthe data now has a shape------->\n',
          balanced[target].value_counts())
    return balanced
def balance_samples(features, labels):
    """Rebalance ``features``/``labels`` with SMOTETomek and report counts.

    Args:
        features: Feature collection; ``len`` gives the instance count.
        labels: Matching labels.

    Returns:
        The resampled ``(features, labels)`` pair.
    """
    original_count = len(features)
    analize_fold_balance(labels)
    print('Performing resample with SMOTETomek...')
    print('Original train hfo count : {0}'.format(original_count))

    # Alternatives tried earlier: RepeatedEditedNearestNeighbours(n_jobs=-1)
    # and NeighbourhoodCleaningRule(sampling_strategy='majority',
    # n_neighbors=3, n_jobs=-1).
    sampler = SMOTETomek(sampling_strategy=1, random_state=42, n_jobs=4)
    resampled_features, resampled_labels = sampler.fit_resample(
        features, labels)

    print('{0} instances after SMOTE...'.format(len(resampled_features)))
    return resampled_features, resampled_labels
def learning_curve(X_train, X_test, y_train, y_test, model=None,
                   observations=(50, 75, 100, 125, 150)):
    """Fit ``model`` on SMOTETomek-resampled data and plot test metrics.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        model: Estimator with ``fit``/``predict``. Defaults to a fresh
            ``sklearn.svm.SVC()`` per call, so no fitted instance is shared
            between calls.
        observations: x-axis values; one resample/fit/score pass per entry.

    Returns:
        ``(f1s, accs, precs, recalls, smt, model)``. ``smt`` is the sampler
        from the last pass, or ``None`` when ``observations`` is empty.
    """
    if model is None:
        model = sklearn.svm.SVC()

    recalls = []
    f1s = []
    precs = []
    accs = []
    smt = None  # stays None if there is nothing to iterate over

    # NOTE(review): the values in `observations` are not used to size the
    # training data, so every pass trains on the full resampled set.
    for _ in observations:
        smt = SMOTETomek(ratio='auto')
        # fit_sample fits the sampler itself; a separate fit() is redundant.
        X_resampled, y_resampled = smt.fit_sample(X_train, y_train)
        model.fit(X_resampled, y_resampled)
        y_pred = model.predict(X_test)

        f1s.append(f1_score(y_pred=y_pred, y_true=y_test, average='macro'))
        accs.append(accuracy_score(y_pred=y_pred, y_true=y_test))
        precs.append(
            precision_score(y_pred=y_pred, y_true=y_test, average='weighted'))
        recalls.append(recall_score(y_pred=y_pred, y_true=y_test))

    plt.plot(observations, f1s, linewidth=4, color='blue', label='f1')
    plt.plot(observations, accs, linewidth=4, color='red', label='accuracy')
    plt.plot(observations, precs, linewidth=4, color='green',
             label='precision')
    plt.plot(observations, recalls, linewidth=4, color='orange',
             label='recalls')
    plt.legend()
    # The title previously named RandomUnderSampler, which is not used here.
    plt.title("SMOTETomek Learning Curve", fontsize=16)
    plt.gca().set_xlabel("# of Points per Class", fontsize=14)
    plt.gca().set_ylabel("Training Accuracy", fontsize=14)
    sns.despine()
    return f1s, accs, precs, recalls, smt, model
def getUnderAndOverSamplers():
    """Return the combined over/under samplers keyed by name.

    Both samplers are configured with ``sampling_strategy=0.5`` and
    ``n_jobs=-1``.
    """
    shared_kwargs = dict(sampling_strategy=0.5, n_jobs=-1)
    return {
        'SMOTEENN': SMOTEENN(**shared_kwargs),
        'SMOTETomek': SMOTETomek(**shared_kwargs),
    }
def Balance_classes(X_train, y_train, Sampling_Function):
    """Resample the training data with the sampler named by a string.

    Args:
        X_train: Training features.
        y_train: Training labels.
        Sampling_Function: Name of the sampler; one of the keys of the
            ``factories`` table below.

    Returns:
        ``(X_train_res, y_train_res)`` as returned by ``fit_sample``.

    Raises:
        ValueError: If ``Sampling_Function`` is not a known name. An unknown
            name used to fail with UnboundLocalError.
    """
    def _make_renn():
        # Imported here because the rest of the file may not import it; this
        # branch used to build EditedNearestNeighbours by mistake.
        from imblearn.under_sampling import RepeatedEditedNearestNeighbours
        return RepeatedEditedNearestNeighbours(random_state=1, size_ngh=5)

    # Zero-argument factories: only the requested sampler is constructed.
    factories = {
        'RandomUnderSampler':
            lambda: RandomUnderSampler(ratio=0.5, random_state=1),
        'NearMiss1':
            lambda: NearMiss(ratio=0.5, random_state=1, version=1, size_ngh=3),
        'NearMiss2':
            lambda: NearMiss(ratio=0.5, random_state=1, version=2, size_ngh=3),
        'NearMiss3':
            lambda: NearMiss(ratio=0.5, random_state=1, version=3,
                             ver3_samp_ngh=3),
        'CondensedNearestNeighbour':
            lambda: CondensedNearestNeighbour(random_state=1),
        'EditedNearestNeighbours':
            lambda: EditedNearestNeighbours(random_state=1, size_ngh=5),
        'RepeatedEditedNearestNeighbours': _make_renn,
        'TomekLinks':
            lambda: TomekLinks(random_state=1),
        'RandomOverSampler':
            lambda: RandomOverSampler(ratio=0.5, random_state=1),
        'SMOTE':
            lambda: SMOTE(ratio=0.5, k=5, random_state=1),
        'SMOTETomek':
            lambda: SMOTETomek(ratio=0.5, k=5, random_state=1),
        'SMOTEENN':
            lambda: SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5),
        'EasyEnsemble':
            lambda: EasyEnsemble(),
        'BalanceCascade_rf':
            lambda: BalanceCascade(classifier='random-forest', random_state=1),
        'BalanceCascade_svm':
            lambda: BalanceCascade(classifier='linear-svm', random_state=1),
    }
    if Sampling_Function not in factories:
        raise ValueError(
            'Unknown Sampling_Function: {!r}'.format(Sampling_Function))

    us = factories[Sampling_Function]()
    X_train_res, y_train_res = us.fit_sample(X_train, y_train)
    return X_train_res, y_train_res
def __init__(self, window_size=6, training_ratio=.7, seq="sequence",
             pos="label"):
    """Set up configuration, estimator registries and result placeholders.

    Args:
        window_size: Stored as ``self.window_size``.
        training_ratio: Float value representing % of data used for training.
        seq: Stored as ``self.seq``.
        pos: Stored as ``self.pos``.
    """
    # Configuration taken from the arguments.
    self.training_ratio = training_ratio
    self.window_size = window_size
    self.seq = seq
    self.pos = pos

    # Containers filled in later.
    self.features = []
    self.labels = []
    self.words = []
    self.features_labels = {}

    # Classifiers available by name.
    self.supervised_classifiers = {
        "forest": RandomForestClassifier(n_jobs=4),
        "mlp_adam": MLPClassifier(),
        "svc": svm.SVC(verbose=1),
        "xgb": XGBClassifier(max_delta_step=5),
        "bagging": BaggingClassifier(),
        "one_class_svm": OneClassSVM(kernel="rbf"),
    }

    # Resamplers available by name; "pass" maps to -1 rather than a sampler
    # (presumably meaning "no resampling" -- verify against the callers).
    self.imbalance_functions = {
        "easy_ensemble": EasyEnsemble(),
        "SMOTEENN": SMOTEENN(),
        "SMOTETomek": SMOTETomek(),
        "ADASYN": ADASYN(),
        "random_under_sample": RandomUnderSampler(),
        "ncl": NeighbourhoodCleaningRule(),
        "near_miss": NearMiss(),
        "pass": -1,
    }

    # Vectorizers by name; "w2v" maps to the string "w2v", not a callable.
    self.vecs = {
        "sequence": sequence_vector,
        "chemical": chemical_vector,
        "binary": binary_vector,
        "w2v": "w2v",
    }

    # Placeholders initialised to 0 until results are computed.
    self.random_data = 0
    self.test_results = 0
    self.vector = 0
    self.test_cv = 0
    self.benchmark_mcc = 0
    self.mcc_scorer = make_scorer(matthews_corrcoef)
def balanceData(self, method: str = "mixsampling") -> None:
    """Select the class-balancing object for the requested method.

    Parameters
    ---------------------------------------------------------------------------
    method => "mixsampling" (SMOTETomek), "undersampling" (NearMiss version 2
              with 3 neighbours) or "oversampling" (RandomOverSampler)

    Return
    ---------------------------------------------------------------------------
    None => stores the sampler in self.balanceObj

    Raises
    ---------------------------------------------------------------------------
    NameError => method is not one of the three names above
    """
    # Each sampler is imported only when its method is requested.
    if method == "mixsampling":
        from imblearn.combine import SMOTETomek
        self.balanceObj = SMOTETomek(sampling_strategy='auto')
        return

    if method == "undersampling":
        from imblearn.under_sampling import NearMiss
        self.balanceObj = NearMiss(sampling_strategy="auto", n_neighbors=3,
                                   version=2)
        return

    if method == "oversampling":
        from imblearn.over_sampling import RandomOverSampler
        self.balanceObj = RandomOverSampler(sampling_strategy="auto")
        return

    raise NameError(f"{method} method not defined")
def smote_classify(X, y):
    """Resample ``X``/``y`` with SMOTETomek and print the class counts.

    A leftover ``make_classification`` call generated a synthetic data set
    that was never used; it has been removed.

    Args:
        X: Feature matrix.
        y: Target labels.

    Returns:
        ``(X_res, y_res)``, the resampled features and labels.
    """
    print('Original dataset shape %s' % Counter(y))
    smt = SMOTETomek(random_state=42)
    X_res, y_res = smt.fit_resample(X, y)
    print('Resampled dataset shape %s' % Counter(y_res))
    return X_res, y_res
def test_validate_estimator_default():
    """Check the resampled output of SMOTETomek with default estimators."""
    expected_X = np.array([
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
        [0.38307743, -0.05670439],
        [0.70319159, -0.02571667],
        [0.75052536, -0.19246518],
    ])
    expected_y = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])

    sampler = SMOTETomek(random_state=RND_SEED)
    resampled_X, resampled_y = sampler.fit_resample(X, Y)

    assert_allclose(resampled_X, expected_X, rtol=R_TOL)
    assert_array_equal(resampled_y, expected_y)
def sampling_factory(X, Y, ratio, cat):
    """Resample ``X``/``Y`` with the sampler selected by ``cat``.

    Args:
        X: Feature DataFrame; its column names are reused for the result.
        Y: Target labels.
        ratio: Forwarded as ``ratio`` to every sampler except the fallback.
        cat: Short sampler code ('ros', 'rus', 'smo', 'smob1', 'smob2',
            'sme'); any other value selects SMOTETomek, which is built
            without ``ratio``.

    Returns:
        DataFrame with the resampled features only; the resampled labels
        are not returned.
    """
    # (code, descriptive label, factory); factories are lazy so only the
    # selected sampler is constructed.
    table = (
        ('ros', 'random_over_sampling',
         lambda: RandomOverSampler(ratio=ratio)),
        ('rus', 'random_under_sampling',
         lambda: RandomUnderSampler(ratio=ratio)),
        ('smo', 'SMOTE_over_sampling',
         lambda: SMOTE(ratio=ratio)),
        ('smob1', 'borderline_SMOTE1_over_sampling',
         lambda: SMOTE(ratio=ratio, kind='borderline1')),
        ('smob2', 'borderline_SMOTE2_over_sampling',
         lambda: SMOTE(ratio=ratio, kind='borderline2')),
        ('sme', 'SMOTEENN_combine_sampling',
         lambda: SMOTEENN(ratio=ratio, random_state=42)),
    )
    for code, label, build in table:
        if cat == code:
            sampling = label
            data = build()
            break
    else:
        sampling = 'SMOTETomek_combine_sampling'
        data = SMOTETomek(random_state=42)

    X_resampled, y_resampled = data.fit_sample(X, Y)
    X2 = pd.DataFrame(X_resampled)
    # columns rename
    X2.columns = X.columns.values
    return X2
class ResamplingAlgorithms(Enum):
    """Resampling algorithms, each stored as ``(display name, sampler)``."""

    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', (InstanceHardnessThreshold(random_state=1)))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the member whose display name equals ``name``.

        Members are checked in definition order; ``RO`` is returned when no
        display name matches.
        """
        for algorithm in cls:
            if algorithm.value[0] == name:
                return algorithm
        return cls.RO
def test_smote_sample_wt_fit():
    """Check that calling ``sample`` before ``fit`` raises RuntimeError."""
    unfitted = SMOTETomek(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted.sample, X, Y)
def test_sample_regular_half():
    """Check SMOTETomek resampling with an explicit per-class target count."""
    expected_X = np.array([
        [0.68481731, 0.51935141],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
        [0.45784496, -0.1053161],
    ])
    expected_y = np.array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0])

    # Requested sample count per class label.
    target_counts = {0: 9, 1: 12}
    sampler = SMOTETomek(sampling_strategy=target_counts,
                         random_state=RND_SEED)
    resampled_X, resampled_y = sampler.fit_resample(X, Y)

    assert_allclose(resampled_X, expected_X, rtol=R_TOL)
    assert_array_equal(resampled_y, expected_y)
def test_sample_regular_half():
    """Check SMOTETomek with a ratio of 0.5 against stored reference arrays."""
    sampler = SMOTETomek(ratio=0.5, random_state=RND_SEED)
    sampler.fit(X, Y)
    resampled_X, resampled_y = sampler.fit_sample(X, Y)

    # Reference arrays live in the "data" directory next to this file.
    here = os.path.dirname(os.path.abspath(__file__))
    expected_X = np.load(os.path.join(here, "data", "smote_tomek_reg_x_05.npy"))
    expected_y = np.load(os.path.join(here, "data", "smote_tomek_reg_y_05.npy"))

    assert_array_equal(resampled_X, expected_X)
    assert_array_equal(resampled_y, expected_y)
def SMOTE_methods(df_train, target, method):
    '''Resample a training frame with the chosen SMOTE variant.

    The output features have been normalized by MinMaxScaler.

    Two branches used to crash: 'svm' referenced the misspelled name
    ``SMOET``, and 'Tomek'/'ENN' called the sampler instance (``sm()``).
    The method name is now validated before any work is done.

    Args:
        df_train: DataFrame holding the features and the ``target`` column.
        target: Name of the label column in ``df_train``.
        method: One of 'regular', 'borderline1', 'borderline2', 'svm',
            'Tomek' (SMOTETomek) or 'ENN' (SMOTEENN).

    Returns:
        DataFrame with the resampled, scaled features and the resampled
        labels in a column named 'target' (the literal name, regardless of
        the ``target`` argument).

    Raises:
        ValueError: If ``method`` is not one of the names above.
    '''
    # Zero-argument factories: only the requested sampler is constructed.
    samplers = {
        'regular': lambda: SMOTE(kind='regular'),
        'borderline1': lambda: SMOTE(kind='borderline1'),
        'borderline2': lambda: SMOTE(kind='borderline2'),
        'svm': lambda: SMOTE(kind='svm'),
        'Tomek': lambda: SMOTETomek(),
        'ENN': lambda: SMOTEENN(),
    }
    if method not in samplers:
        # Runtime message left unchanged; it reads "invalid method".
        raise ValueError('输入方法有误')

    scaler = MinMaxScaler()
    X = df_train.drop([target], axis=1)
    y = df_train[target]
    X_normalized = scaler.fit_transform(X)

    X_res, y_res = samplers[method]().fit_sample(X_normalized, y)

    df_final = pd.DataFrame(X_res, columns=X.columns)
    df_final['target'] = y_res
    return df_final
def test_error_wrong_object():
    """Check that non-estimator ``smote``/``tomek`` arguments are rejected."""
    cases = (
        ({'smote': 'rnd'}, "smote needs to be a SMOTE"),
        ({'tomek': 'rnd'}, "tomek needs to be a TomekLinks"),
    )
    for bad_kwargs, message in cases:
        sampler = SMOTETomek(random_state=RND_SEED, **bad_kwargs)
        with raises(ValueError, match=message):
            sampler.fit_sample(X, Y)
class Resampling:
    """Thin wrapper that picks an imbalanced-learn sampler by short name.

    ``strategie`` stays ``None`` when ``name`` is not one of the known keys.
    """

    def __init__(self, name):
        self.strategie = None
        self.name = name

        # (key, factory) pairs; factories are lazy so only the selected
        # sampler is constructed.
        builders = (
            ("enn", lambda: EditedNearestNeighbours(
                sampling_strategy='auto', n_neighbors=3, kind_sel='all',
                n_jobs=-1)),
            ("allknn", lambda: AllKNN(
                sampling_strategy='auto', n_neighbors=3, kind_sel='all',
                allow_minority=False, n_jobs=-1)),
            ("renn", lambda: RepeatedEditedNearestNeighbours(
                sampling_strategy='auto', n_neighbors=3, max_iter=100,
                kind_sel='all', n_jobs=-1)),
            ("tomek", lambda: TomekLinks(
                sampling_strategy='auto', n_jobs=-1)),
            ("smote", lambda: SMOTE(
                sampling_strategy='auto', k_neighbors=5, n_jobs=-1,
                random_state=42)),
            ("bdsmote", lambda: BorderlineSMOTE(
                random_state=42, n_jobs=-1)),
            ("adasyn", lambda: ADASYN(
                sampling_strategy='auto', n_neighbors=5, n_jobs=-1,
                random_state=42)),
            ("smoteenn", lambda: SMOTEENN(
                sampling_strategy='auto', smote=None, enn=None, n_jobs=-1,
                random_state=42)),
            ("smotetomek", lambda: SMOTETomek(
                sampling_strategy='auto', smote=None, tomek=None, n_jobs=-1,
                random_state=42)),
        )
        for key, build in builders:
            if name == key:
                self.strategie = build()
                break

    def fit_resample(self, x, y):
        """Delegate to the selected sampler and return ``(x_res, y_res)``."""
        resampled_x, resampled_y = self.strategie.fit_resample(x, y)
        return resampled_x, resampled_y
def make_clf(usx, usy, clf, clf_name, sampling, normalize=False):
    '''
    Run the classification task: train and test ``clf`` with shuffled
    10-fold stratified cross-validation and print the pooled confusion
    counts.

    sampling names the resampler applied to each training fold ('SMOTE',
    'ADASYN', 'ENN', 'Tomek', 'SMOTETomek', 'SMOTEENN', 'NCR', 'OSS'); any
    other value leaves the fold untouched.
    If normalize is True, a StandardScaler fitted on the training fold is
    applied to both the training and the test fold.
    '''
    print('----------{} with {}----------'.format(clf_name, sampling))
    totalTP, totalFP, totalFN, totalTN = 0, 0, 0, 0
    # Fold index picked for the decision-tree plot, which is disabled below.
    plot_ind = randint(0, 9)

    # (name, factory) pairs; factories are lazy so only the selected sampler
    # is constructed, once per fold.
    resamplers = (
        ('SMOTE', lambda: SMOTE(sampling_strategy=0.3)),
        ('ADASYN', lambda: ADASYN(sampling_strategy=0.3)),
        ('ENN', lambda: EditedNearestNeighbours()),
        ('Tomek', lambda: TomekLinks()),
        ('SMOTETomek', lambda: SMOTETomek(sampling_strategy=0.3)),
        ('SMOTEENN', lambda: SMOTEENN(sampling_strategy=0.3)),
        ('NCR', lambda: NeighbourhoodCleaningRule()),
        ('OSS', lambda: OneSidedSelection()),
    )

    folds = StratifiedKFold(n_splits=10, shuffle=True)
    for j, (train_index, test_index) in enumerate(folds.split(usx, usy)):
        x_train, x_test = usx[train_index], usx[test_index]
        y_train, y_test = usy[train_index], usy[test_index]

        for key, build in resamplers:
            if sampling == key:
                x_train, y_train = build().fit_resample(x_train, y_train)
                break

        if normalize:
            scaler = StandardScaler().fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)

        clf.fit(x_train, y_train)
        # if plot_ind == j and clf_name == 'DecisionTreeClassifier':
        #     plot_decision_tree(clf)
        y_predict = clf.predict(x_test)

        # Tally the confusion matrix by truthiness of label and prediction.
        for i, predicted in enumerate(y_predict):
            actual = y_test[i]
            if actual:
                if predicted:
                    totalTP += 1
                else:
                    totalFN += 1
            elif predicted:
                totalFP += 1
            else:
                totalTN += 1

    print('TOTAL TP: ' + str(totalTP))
    print('TOTAL FP: ' + str(totalFP))
    print('TOTAL FN: ' + str(totalFN))
    print('TOTAL TN: ' + str(totalTN))
def SMOTE_Tomek(X_train, Y_train, seed, sampling_strategy,
                k_neighbors_smote=5):
    """Rebalance the training data with SMOTE followed by Tomek links.

    ``sampling_strategy`` used to be accepted but never used, so SMOTE
    always ran with its default strategy. It is now passed to the SMOTE
    step and to the combined sampler.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        seed: Random state for the samplers and for the final shuffle.
        sampling_strategy: Over-sampling strategy for the SMOTE step.
        k_neighbors_smote: Number of neighbours used by SMOTE.

    Returns:
        ``(X_train_resampled, Y_train_resampled)`` after resampling and
        shuffling with ``shuffle_dataset``.
    """
    tl = TomekLinks(random_state=seed, n_jobs=-1)
    # A custom SMOTE instance must carry the strategy itself: SMOTETomek
    # uses the instance it is given as-is.
    smote = SMOTE(sampling_strategy=sampling_strategy, random_state=seed,
                  n_jobs=-1, k_neighbors=k_neighbors_smote)
    smote_tomek = SMOTETomek(sampling_strategy=sampling_strategy,
                             random_state=seed, smote=smote, tomek=tl)

    print('Before SMOTE + Tomek : ', sorted(Counter(Y_train).items()))
    X_train_resampled, Y_train_resampled = smote_tomek.fit_resample(
        X_train, Y_train)
    print('After SMOTE + Tomek : ', sorted(Counter(Y_train_resampled).items()))

    X_train_resampled, Y_train_resampled = shuffle_dataset(
        X_train_resampled, Y_train_resampled, seed)
    return X_train_resampled, Y_train_resampled
def main_cv_loop(Xdata, Ydata, clf, parameters, n_folds=4,
                 oversample_thresh=0.1, verbose=False):
    """Run the outer cross-validation loop and score the pooled predictions.

    Each training fold is rebalanced with SMOTETomek when its label mean is
    more than 0.2 away from 0.5, then handed to ``inner_cv_loop`` to pick an
    estimator, which predicts the held-out rows.

    Args:
        Xdata: Feature array, indexed as ``Xdata[rows, :]``.
        Ydata: Label array, indexed as ``Ydata[rows]``.
        clf: Estimator forwarded to ``inner_cv_loop``.
        parameters: Parameter grid forwarded to ``inner_cv_loop``.
        n_folds: Not consulted; the number of splits is fixed at 4.
        oversample_thresh: Only feeds the unused ``oversample`` flag.
        verbose: Print progress messages when truthy.

    Returns:
        ``(roc_auc, Ydata, pred, pred_proba)``, where ``roc_auc`` is the
        weighted ROC AUC of the hard predictions ``pred``.
    """
    # stratified shuffle splits give roughly equal folds
    splitter = StratifiedShuffleSplit(n_splits=4, test_size=0.2)

    # Overall prevalence flag; computed but not used further down.
    prevalence_gap = numpy.abs(numpy.mean(Ydata) - 0.5)
    oversample = 'smote' if prevalence_gap > oversample_thresh else 'none'

    # variables to store outputs
    n_samples = len(Ydata)
    pred = numpy.zeros(n_samples)        # predicted values
    pred_proba = numpy.zeros(n_samples)  # predicted probabilities
    kernel = []
    C = []
    fa_ctr = 0

    for train_idx, test_idx in splitter.split(Xdata, Ydata):
        fold_Xtrain = Xdata[train_idx, :]
        fold_Xtest = Xdata[test_idx, :]
        fold_Ytrain = Ydata[train_idx]

        if numpy.abs(numpy.mean(fold_Ytrain) - 0.5) > 0.2:
            if verbose:
                print('oversampling using SMOTETomek')
            resampler = SMOTETomek()
            fold_Xtrain, fold_Ytrain = resampler.fit_sample(
                fold_Xtrain, fold_Ytrain)

        best_estimator_, bestroc, fa = inner_cv_loop(
            fold_Xtrain, fold_Ytrain, clf, parameters, verbose=True)

        if fa is not None:
            if verbose:
                print('transforming using fa')
                print(fa)
            fold_Xtest = fa.transform(fold_Xtest)
            fa_ctr += 1

        pred_proba.flat[test_idx] = best_estimator_.predict_proba(fold_Xtest)
        pred.flat[test_idx] = best_estimator_.predict(fold_Xtest)
        kernel.append(best_estimator_.kernel)
        C.append(best_estimator_.C)

    auc = roc_auc_score(Ydata, pred, average='weighted')
    return auc, Ydata, pred, pred_proba
def test_error_wrong_object():
    """Check that non-estimator ``smote``/``tomek`` arguments are rejected."""
    cases = (
        ({'smote': 'rnd'}, "smote needs to be a SMOTE"),
        ({'tomek': 'rnd'}, "tomek needs to be a TomekLinks"),
    )
    for bad_kwargs, message in cases:
        sampler = SMOTETomek(random_state=RND_SEED, **bad_kwargs)
        with raises(ValueError, match=message):
            sampler.fit_resample(X, Y)
def test_parallelisation():
    """Check that ``n_jobs`` reaches the inner SMOTE and TomekLinks objects."""
    # First the default job count (1), then an explicit one (8).
    for extra_kwargs, expected_jobs in (({}, 1), ({'n_jobs': 8}, 8)):
        sampler = SMOTETomek(random_state=RND_SEED, **extra_kwargs)
        sampler._validate_estimator()
        assert sampler.n_jobs == expected_jobs
        assert sampler.smote_.n_jobs == expected_jobs
        assert sampler.tomek_.n_jobs == expected_jobs
def test_error_wrong_object(smote_params, err_msg):
    """Check that invalid constructor arguments fail on ``fit_resample``.

    ``smote_params`` holds the SMOTETomek keyword arguments for the case and
    ``err_msg`` the pattern the ValueError message must match.
    """
    sampler = SMOTETomek(**smote_params)
    with pytest.raises(ValueError, match=err_msg):
        sampler.fit_resample(X, Y)
print(__doc__) # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=100, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply SMOTE + Tomek links sm = SMOTETomek() X_resampled, y_resampled = sm.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=0.5) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],