def get_models():
    """Build the SMOTEENN + logistic-regression pipelines to compare.

    Every pipeline gets its own sampler and classifier instance; the
    variants differ only in the scaling steps placed ahead of the sampler.

    Returns:
        tuple: ``(models, names)`` where ``models`` is a list of
        ``Pipeline`` objects and ``names`` the matching short labels
        ('LR', 'Norm', 'Std', 'Power').
    """
    # (label, preprocessing steps that run before resampling)
    variants = [
        ('LR', []),
        ('Norm', [('t', MinMaxScaler())]),
        ('Std', [('t', StandardScaler())]),
        ('Power', [('t1', MinMaxScaler()), ('t2', PowerTransformer())]),
    ]

    models, names = [], []
    for label, scaling_steps in variants:
        resampler = SMOTEENN(
            enn=EditedNearestNeighbours(sampling_strategy='majority'))
        classifier = LogisticRegression(solver='liblinear')
        steps = scaling_steps + [('e', resampler), ('m', classifier)]
        models.append(Pipeline(steps=steps))
        names.append(label)
    return models, names
def test_validate_estimator_deprecation():
    """Check resampling output when SMOTEENN is built with old parameters.

    The sampler is constructed once with ``n_jobs=-1`` and once with
    ``k=5``; both runs must reproduce the same reference arrays.
    """
    X_gt = np.array([
        [0.11622591, -0.0317206],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653],
        [1.70580611, -0.11219234],
        [0.29307743, -0.14670439],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571668],
        [0.66052536, -0.28246517],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    y_gt = np.array([0] * 12 + [1] * 3)

    for legacy_kwargs in ({'n_jobs': -1}, {'k': 5}):
        smt = SMOTEENN(random_state=RND_SEED, **legacy_kwargs)
        X_resampled, y_resampled = smt.fit_sample(X, Y)
        assert_array_almost_equal(X_resampled, X_gt)
        assert_array_equal(y_resampled, y_gt)
def test_error_wrong_object():
    """A non-estimator passed as ``smote`` or ``enn`` must raise ValueError."""
    bogus = 'rnd'
    cases = (
        ({'smote': bogus}, "smote needs to be a SMOTE"),
        ({'enn': bogus}, "enn needs to be an "),
    )
    for bad_kwargs, expected_message in cases:
        smt = SMOTEENN(random_state=RND_SEED, **bad_kwargs)
        with raises(ValueError, match=expected_message):
            smt.fit_sample(X, Y)
def test_error_wrong_object():
    """A non-estimator passed as ``smote`` or ``enn`` must raise ValueError."""
    bogus = 'rnd'
    cases = (
        ({'smote': bogus}, "smote needs to be a SMOTE"),
        ({'enn': bogus}, "enn needs to be an "),
    )
    for bad_kwargs, expected_message in cases:
        smt = SMOTEENN(random_state=RND_SEED, **bad_kwargs)
        assert_raises_regex(ValueError, expected_message,
                            smt.fit_sample, X, Y)
def test_error_wrong_object():
    """Check that fitting fails when ``smote`` or ``enn`` is not a sampler.

    A plain string is supplied in place of each sub-estimator in turn and
    ``fit`` is expected to raise ``ValueError``.
    """
    bogus = 'rnd'
    for bad_kwargs in ({'smote': bogus}, {'enn': bogus}):
        smt = SMOTEENN(random_state=RND_SEED, **bad_kwargs)
        assert_raises(ValueError, smt.fit, X, Y)
def statistic_set(model, df, test_set, k_fold):
    """Train ``model`` on SMOTEENN-resampled data and score it on a test set.

    Args:
        model: Estimator exposing ``fit``, ``score`` and ``predict``.
        df: Frame with a 'label' column; converted to features by
            ``make_set``.
        test_set: Frame used as the hold-out set. When it is empty a
            stratified 25% split of ``df`` is used instead; otherwise its
            ('SOURCE_ID_1', 'SOURCE_ID_2') pairs are removed from ``df``
            before training.
        k_fold: When equal to 1, a 5-fold cross-validation score is
            printed before the final fit.

    Returns:
        tuple: ``(fitted model, test score, test labels, test predictions,
        test features, elapsed seconds)``.
    """
    started_at = time.time()

    input_y = list(df['label'])
    input_x = make_set(df)

    if test_set.empty:
        input_x, test_x, input_y, test_y = train_test_split(
            input_x, input_y,
            test_size=0.25, stratify=input_y, random_state=43)
    else:
        src_df_res = test_set
        test_x = make_set(src_df_res)
        test_y = list(src_df_res['label'])
        # Drop every training row whose source-id pair also occurs in the
        # supplied test set.
        key_columns = ['SOURCE_ID_1', 'SOURCE_ID_2']
        train_keys = df[key_columns].apply(tuple, 1)
        held_out_keys = src_df_res[key_columns].apply(tuple, 1)
        df = df[~train_keys.isin(held_out_keys)]
        input_y = list(df['label'])
        input_x = make_set(df)

    if k_fold == 1:
        folds = KFold(5, shuffle=True, random_state=43)
        for fold_train, fold_test in folds.split(input_x, input_y):
            x_fit = [input_x[pos] for pos in fold_train]
            y_fit = [input_y[pos] for pos in fold_train]
            x_val = [input_x[pos] for pos in fold_test]
            y_val = [input_y[pos] for pos in fold_test]
            # Only the training part of each fold is resampled.
            x_fit, y_fit = SMOTEENN(random_state=0).fit_sample(x_fit, y_fit)
            clf = model.fit(x_fit, y_fit)
            print("score = %.8f" % (clf.score(x_val, y_val)))

    input_x, input_y = SMOTEENN(random_state=0).fit_sample(input_x, input_y)
    fin_clf = model.fit(input_x, input_y)
    fin_score = fin_clf.score(test_x, test_y)
    print('final_score')
    print(fin_score)

    res_time = time.time() - started_at
    return fin_clf, fin_score, test_y, fin_clf.predict(test_x), test_x, res_time
def test_senn_multiclass_error():
    """Check that fitting on a non-binary target emits a ``UserWarning``.

    Two targets are tried: a continuous one and a three-class one.
    """
    continuous_target = np.linspace(0, 1, 20)
    multiclass_target = np.array([0] * 3 + [1] * 2 + [2] * 15)

    for target in (continuous_target, multiclass_target):
        sm = SMOTEENN(random_state=RND_SEED)
        assert_warns(UserWarning, sm.fit, X, target)
def test_validate_estimator_deprecation():
    """Check resampling output when SMOTEENN is built with ``n_jobs``/``k``.

    Both constructions must reproduce the same reference arrays within
    ``R_TOL``.
    """
    X_gt = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    y_gt = np.array([0] * 4 + [1] * 3)

    for legacy_kwargs in ({'n_jobs': -1}, {'k': 5}):
        smt = SMOTEENN(random_state=RND_SEED, **legacy_kwargs)
        X_resampled, y_resampled = smt.fit_sample(X, Y)
        assert_allclose(X_resampled, X_gt, rtol=R_TOL)
        assert_array_equal(y_resampled, y_gt)
def test_parallelisation():
    """Check that ``n_jobs`` is propagated to both internal estimators.

    Covers the default (expected to be 1) and an explicit value of 8.
    """
    for extra_kwargs, expected_jobs in (({}, 1), ({'n_jobs': 8}, 8)):
        smt = SMOTEENN(random_state=RND_SEED, **extra_kwargs)
        smt._validate_estimator()
        assert smt.n_jobs == expected_jobs
        assert smt.smote_.n_jobs == expected_jobs
        assert smt.enn_.n_jobs == expected_jobs
def use_debug_parameters(self, reduced_selected_features):
    """Return a small parameter grid for debugging runs.

    The grid is a list of dicts so that different parameter combinations
    can be used for different optimisations.

    Args:
        reduced_selected_features: Sequence of candidate feature column
            selections; only its first one or two entries are used.

    Returns:
        list: Two parameter dicts, one for a linear kernel and one for an
        RBF kernel.
    """
    linear_kernel_grid = {
        'scaler': [StandardScaler()],
        'sampling': [modelutil.Nosampler(), SMOTE(), SMOTEENN(), ADASYN()],
        'feat__cols': reduced_selected_features[0:2],
        'model__kernel': ['linear'],
        'model__C': [0.1, 1, 10],
        'model__gamma': [0.1, 1, 10],
    }
    rbf_kernel_grid = {
        'scaler': [StandardScaler(), Normalizer()],
        'sampling': [modelutil.Nosampler()],
        'feat__cols': reduced_selected_features[0:1],
        'model__C': [1],  # default C=1
        'model__kernel': ['rbf'],
        # Only relevant for rbf; default='auto'=1/n_features
        'model__gamma': [1],
    }
    return [linear_kernel_grid, rbf_kernel_grid]
def SMOTE_ENN(X_train, Y_train, seed, sampling_strategy, k_neighbors_smote=5,
              n_neighbors_enn=3, kind_sel='all'):
    """Resample a training set with SMOTE followed by ENN, then shuffle it.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        seed: Seed given to every sampler and to ``shuffle_dataset``.
        sampling_strategy: Forwarded unchanged to SMOTE, ENN and SMOTEENN.
        k_neighbors_smote: ``k_neighbors`` for SMOTE.
        n_neighbors_enn: ``n_neighbors`` for EditedNearestNeighbours.
        kind_sel: ``kind_sel`` for EditedNearestNeighbours.

    Returns:
        tuple: The resampled and shuffled ``(X, Y)``.
    """
    over_sampler = SMOTE(random_state=seed,
                         n_jobs=-1,
                         k_neighbors=k_neighbors_smote,
                         sampling_strategy=sampling_strategy)
    cleaner = EditedNearestNeighbours(random_state=seed,
                                      n_jobs=-1,
                                      n_neighbors=n_neighbors_enn,
                                      kind_sel=kind_sel,
                                      sampling_strategy=sampling_strategy)
    combined = SMOTEENN(random_state=seed,
                        smote=over_sampler,
                        enn=cleaner,
                        sampling_strategy=sampling_strategy)

    print('Before SMOTE + ENN : ', sorted(Counter(Y_train).items()))
    X_out, Y_out = combined.fit_resample(X_train, Y_train)
    print('After SMOTE + ENN : ', sorted(Counter(Y_out).items()))

    return shuffle_dataset(X_out, Y_out, seed)
def smoter(df):
    """Rebalance the quote data with SMOTE + ENN.

    Args:
        df: Frame containing 'Quote_ID' and 'QuoteConversion_Flag'
            columns; every column except the flag is used as a feature.

    Returns:
        tuple: ``(data, target)`` as DataFrames, the features labelled with
        the module-level ``FIELDS`` and the target with
        'QuoteConversion_Flag'.
    """
    IDs = df.Quote_ID  # read but not used further in this function
    labels = df.QuoteConversion_Flag
    features = df.drop(['QuoteConversion_Flag'], axis=1).values
    print("Before SMOTE: ", sorted(Counter(labels).items()))

    # ENN cleaning step plugged into SMOTEENN.
    cleaner = ENN(sampling_strategy="not majority",
                  kind_sel="mode",
                  n_neighbors=5,
                  n_jobs=-1,
                  random_state=RANDOM_STATE)
    combined = SMOTEENN(enn=cleaner, random_state=RANDOM_STATE)
    X_resampled, y_resampled = combined.fit_resample(features, labels)
    print("SMOTE ENN: ", sorted(Counter(y_resampled).items()))

    # NOTE: a SMOTETomek alternative existed here but was disabled.

    data = pd.DataFrame(data=X_resampled, columns=FIELDS)
    target = pd.DataFrame(data=y_resampled, columns=['QuoteConversion_Flag'])
    return data, target
def OverSampling_SMOTE(df):
    """Oversample the labelled rows with SMOTE + ENN and persist the result.

    Rows with a non-null 'TARGET' are imputed and resampled; rows with a
    null 'TARGET' are appended afterwards unchanged. The combined frame is
    written to CSV and pickle files under ``data/``.

    Args:
        df: Frame with a 'TARGET' column. Infinite values are replaced by
            NaN **in place**, so the caller's frame is modified.

    Returns:
        pandas.DataFrame: Resampled labelled rows followed by the
        unlabelled rows.
    """
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    train_df_X = train_df.drop('TARGET', axis=1)
    train_df_y = train_df.TARGET

    # SMOTE
    print('Creating Smote Data...')
    smote = SMOTE(k_neighbors=5, n_jobs=-1)
    smote_enn = make_pipeline(SimpleImputer(), SMOTEENN(smote=smote))
    X_res, y_res = smote_enn.fit_resample(train_df_X, train_df_y)

    X_res_df = pd.DataFrame(X_res, columns=train_df_X.columns)
    train_df_new = X_res_df.join(y_res.to_frame())

    # DataFrame.append was removed in pandas 2.0; pd.concat with default
    # arguments performs the same row-wise stacking.
    df = pd.concat([train_df_new, test_df])

    # Save data to csv file
    df.to_csv('data/df_prepared_to_model.csv')
    # Save data to pickle file
    df.to_pickle("data/df_prepared_to_model.pkl")
    return df
def smot2(train_x, train_y, feature_columns):
    """Rebalance a training set with SMOTEENN and rebuild pandas objects.

    Args:
        train_x: Training features.
        train_y: Training labels.
        feature_columns: Column names for the rebuilt feature frame.

    Returns:
        tuple: ``(train_x, train_y)`` as a DataFrame and a Series holding
        the resampled data.
    """
    from imblearn.combine import SMOTEENN
    from imblearn.over_sampling import ADASYN
    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import CondensedNearestNeighbour
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.under_sampling import TomekLinks
    from sklearn.svm import SVC

    print('\nOriginal dataset shape {}'.format(Counter(train_y)))

    # NOTE: ADASYN, plain SMOTE and CondensedNearestNeighbour were tried
    # here as alternatives and are currently disabled.
    balancer = SMOTEENN(ratio='minority',
                        n_jobs=3,
                        random_state=42,
                        n_neighbors=50,
                        smote=SMOTE())

    log.traceLogInfo("\nFIT DE SMOT2 ...equilibrage")
    X_res, y_res = balancer.fit_sample(train_x, train_y)
    print('\nResampled dataset shape {}'.format(Counter(y_res)))

    # Rebuild the pandas containers from the resampled arrays.
    return pd.DataFrame(X_res, columns=feature_columns), pd.Series(y_res)
def __init__(self, window_size=6, training_ratio=.7, seq="sequence",
             pos="label"):
    """Set up classifier and resampler registries plus run-state holders.

    Args:
        window_size: Stored on the instance as ``window_size``.
        training_ratio: Float giving the share of data used for training.
        seq: Stored on the instance as ``seq``.
        pos: Stored on the instance as ``pos``.
    """
    # Constructor arguments.
    self.training_ratio = training_ratio
    self.window_size = window_size
    self.seq = seq
    self.pos = pos

    # Data containers, filled in later.
    self.features = []
    self.labels = []
    self.words = []
    self.features_labels = {}

    # Classifiers selectable by name.
    self.supervised_classifiers = {
        "forest": RandomForestClassifier(n_jobs=4),
        "mlp_adam": MLPClassifier(),
        "svc": svm.SVC(verbose=1),
        "xgb": XGBClassifier(max_delta_step=5),
        "bagging": BaggingClassifier(),
        "one_class_svm": OneClassSVM(kernel="rbf"),
    }

    # Resamplers selectable by name; "pass" maps to -1 rather than to a
    # sampler object.
    self.imbalance_functions = {
        "easy_ensemble": EasyEnsemble(),
        "SMOTEENN": SMOTEENN(),
        "SMOTETomek": SMOTETomek(),
        "ADASYN": ADASYN(),
        "random_under_sample": RandomUnderSampler(),
        "ncl": NeighbourhoodCleaningRule(),
        "near_miss": NearMiss(),
        "pass": -1,
    }

    # Vectoriser lookup; "w2v" maps to a plain string, not a callable.
    self.vecs = {
        "sequence": sequence_vector,
        "chemical": chemical_vector,
        "binary": binary_vector,
        "w2v": "w2v",
    }

    # Placeholders initialised to 0.
    self.random_data = 0
    self.test_results = 0
    self.vector = 0
    self.test_cv = 0
    self.benchmark_mcc = 0

    self.mcc_scorer = make_scorer(matthews_corrcoef)
def SMOTE_methods(df_train, target, method):
    """Min-max scale the features and oversample them with a SMOTE variant.

    The output data has been normalized by MinMaxScaler.

    Args:
        df_train: Training frame containing the feature columns and the
            ``target`` column.
        target: Name of the label column in ``df_train``.
        method: One of 'regular', 'borderline1', 'borderline2', 'svm'
            (SMOTE kinds), 'Tomek' (SMOTETomek) or 'ENN' (SMOTEENN).

    Returns:
        pandas.DataFrame: Resampled, scaled features with the labels stored
        in a column literally named 'target'.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    scaler = MinMaxScaler()
    X = df_train.drop([target], axis=1)
    y = df_train[target]
    X_normalized = scaler.fit_transform(X)

    if method in ('regular', 'borderline1', 'borderline2', 'svm'):
        # The method name doubles as the SMOTE ``kind``. The 'svm' branch
        # previously referenced a misspelled class name and raised
        # NameError.
        sampler = SMOTE(kind=method)
    elif method == 'Tomek':
        # The sampler instance used to be called like a function, which
        # raised TypeError before any resampling happened.
        sampler = SMOTETomek()
    elif method == 'ENN':
        sampler = SMOTEENN()
    else:
        # Message text means "the supplied method is invalid".
        raise ValueError('输入方法有误')

    X_res, y_res = sampler.fit_sample(X_normalized, y)

    df_final = pd.DataFrame(X_res, columns=X.columns)
    # NOTE(review): the label column is always named 'target', regardless
    # of the ``target`` argument - confirm this is what callers expect.
    df_final['target'] = y_res
    return df_final
def test_smote_sample_wt_fit():
    """Calling ``sample`` before fitting must raise ``RuntimeError``."""
    unfitted_sampler = SMOTEENN(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted_sampler.sample, X, Y)
def split_data_resampling(X, y, test_percentage=0.2):
    """Split the data, then rebalance only the training part with SMOTEENN.

    Args:
        X: Feature matrix.
        y: Labels.
        test_percentage: Fraction of the data held out for testing.

    Returns:
        tuple: ``(X_train_resampled, y_train_resampled, X_test, y_test)``;
        the test part is returned untouched.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_percentage, random_state=42)

    balanced_X, balanced_y = SMOTEENN(random_state=0).fit_resample(
        X_train, y_train)

    return balanced_X, balanced_y, X_test, y_test
def __init__(self,
             companiesPath,
             riskPath,
             target='is_fraud',
             oversampler=SMOTEENN(),
             testKwargs={
                 'test_size': .3,
                 'random_state': 22
             },
             auto=True):
    """Load the data, split it and optionally build and evaluate models.

    Args:
        companiesPath: Passed to ``Reader`` as the companies source.
        riskPath: Passed to ``Reader`` as the risk source.
        target: Column used to stratify the train/test split.
        oversampler: Sampler handed to ``partition_data``.
            NOTE(review): the default instance is created once when the
            function is defined, so it is shared by every object built
            without an explicit sampler.
        testKwargs: Extra keyword arguments for the train/test split.
        auto: When true, ``build`` and ``evaluate`` run immediately.
    """
    reader = Reader(companiesPath, riskPath)

    self.fitRiskData = RiskData().fit(reader.dfRisk)
    risk_features = self.fitRiskData.transform(reader.dfRisk)
    merged = reader.dfCompanies.merge(risk_features, how='left',
                                      on='company_id')

    train_part, test_part = tts(merged, stratify=merged[target],
                                **testKwargs)
    self.partition_data(train_part, test_part, target, oversampler)

    self.results = {}
    self.estimators = {}
    self.featureEval = None

    if not auto:
        return
    self.build(build_param_grid())
    self.evaluate()
def Balance_classes(X_train, y_train, Sampling_Function):
    """Resample a training set with the sampler selected by name.

    Args:
        X_train: Training features.
        y_train: Training labels.
        Sampling_Function: Name of the sampler to use; must be one of the
            keys of the ``factories`` table below.

    Returns:
        tuple: ``(X_train_res, y_train_res)`` as returned by the sampler's
        ``fit_sample``.

    Raises:
        ValueError: If ``Sampling_Function`` is not a known sampler name.
            Previously this case failed with an unbound-variable error.
    """

    def _make_renn():
        # Imported here because the rest of this function never needed the
        # class: the 'RepeatedEditedNearestNeighbours' option used to build
        # a plain EditedNearestNeighbours by mistake.
        from imblearn.under_sampling import RepeatedEditedNearestNeighbours
        return RepeatedEditedNearestNeighbours(random_state=1, size_ngh=5)

    # Factories rather than instances, so only the requested sampler is
    # ever constructed.
    factories = {
        'RandomUnderSampler':
            lambda: RandomUnderSampler(ratio=0.5, random_state=1),
        'NearMiss1':
            lambda: NearMiss(ratio=0.5, random_state=1, version=1,
                             size_ngh=3),
        'NearMiss2':
            lambda: NearMiss(ratio=0.5, random_state=1, version=2,
                             size_ngh=3),
        'NearMiss3':
            lambda: NearMiss(ratio=0.5, random_state=1, version=3,
                             ver3_samp_ngh=3),
        'CondensedNearestNeighbour':
            lambda: CondensedNearestNeighbour(random_state=1),
        'EditedNearestNeighbours':
            lambda: EditedNearestNeighbours(random_state=1, size_ngh=5),
        'RepeatedEditedNearestNeighbours': _make_renn,
        'TomekLinks':
            lambda: TomekLinks(random_state=1),
        'RandomOverSampler':
            lambda: RandomOverSampler(ratio=0.5, random_state=1),
        'SMOTE':
            lambda: SMOTE(ratio=0.5, k=5, random_state=1),
        'SMOTETomek':
            lambda: SMOTETomek(ratio=0.5, k=5, random_state=1),
        'SMOTEENN':
            lambda: SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5),
        'EasyEnsemble':
            lambda: EasyEnsemble(),
        'BalanceCascade_rf':
            lambda: BalanceCascade(classifier='random-forest',
                                   random_state=1),
        'BalanceCascade_svm':
            lambda: BalanceCascade(classifier='linear-svm', random_state=1),
    }

    if Sampling_Function not in factories:
        raise ValueError(
            'Unknown Sampling_Function {!r}; expected one of: {}'.format(
                Sampling_Function, ', '.join(sorted(factories))))

    us = factories[Sampling_Function]()
    X_train_res, y_train_res = us.fit_sample(X_train, y_train)
    return X_train_res, y_train_res
def main():
    """Fit the project's logistic-regression model on real plus augmented data.

    Data is read from './data/nbadata.mat', augmented with ``data_aug`` on
    the raw (unscaled) values, and both sets are then standardised along
    axis 1 before being stacked and fitted with the 'BFGS' method.
    """

    def _standardize_along_axis1(values):
        # Subtract the axis-1 mean and divide by the axis-1 std, keeping a
        # trailing axis so the statistics broadcast back over ``values``.
        centre = np.expand_dims(np.mean(values, axis=1), axis=1)
        spread = np.expand_dims(np.std(values, axis=1), axis=1)
        return (values - centre) / spread

    data_x, data_y = read_mat(path='./data/nbadata.mat')

    # Only needed by the disabled SMOTEENN-based augmentation path.
    sm = SMOTEENN()
    syn_data_x, syn_data_y = data_aug(data_x, data_y)

    data_x = _standardize_along_axis1(data_x)
    syn_data_x = _standardize_along_axis1(syn_data_x)

    model = LogisticRegression(x_data=np.vstack([data_x, syn_data_x]),
                               y_data=np.hstack([data_y, syn_data_y]),
                               original_x=data_x,
                               original_y=data_y)
    # NOTE: 'NewtonMethod' and 'GradientDescent' fits were tried here and
    # are currently disabled.
    model.model_fit(method='BFGS', lr=1e-1, error_bound=1e-1)
def smpote_test():
    """Plot one KPI series before and after SMOTEENN resampling.

    Reads the data set from a fixed HDF path, keeps the rows of the first
    KPI ID found, resamples them and shows two plots: the original series
    in green and the resampled, timestamp-sorted series in red.
    """

    def _show_series(frame, colour):
        plt.plot(np.array(frame['timestamp']),
                 np.array(frame['value']),
                 color=colour,
                 label='training accuracy')
        plt.legend()  # show the legend
        plt.show()

    # Read the test data set.
    truth_df = pd.read_hdf('D:\\kpi\\1.hdf')
    first_kpi = truth_df['KPI ID'].values[0]
    truth = truth_df[truth_df["KPI ID"] == first_kpi]

    y = truth['label']
    X = truth.drop(columns=['label', 'KPI ID'])

    X_resampled, y_resampled = SMOTEENN().fit_sample(X, y)
    resampled_features = pd.DataFrame(X_resampled,
                                      columns=['timestamp', 'value'])
    resampled_labels = pd.DataFrame(y_resampled, columns=['label'])

    _show_series(X, 'green')

    merged = resampled_features.join(resampled_labels)
    merged = merged.sort_values(by="timestamp", ascending=True)
    _show_series(merged, 'red')
def test_validate_estimator_init():
    """Check resampling when SMOTE and ENN objects are passed explicitly."""
    # Sub-estimators handed to the combined sampler.
    over_sampler = SMOTE(random_state=RND_SEED)
    cleaner = EditedNearestNeighbours(random_state=RND_SEED)
    smt = SMOTEENN(smote=over_sampler, enn=cleaner, random_state=RND_SEED)

    X_resampled, y_resampled = smt.fit_sample(X, Y)

    X_gt = np.array([
        [0.11622591, -0.0317206],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-0.41635887, -0.38299653],
        [1.70580611, -0.11219234],
        [0.29307743, -0.14670439],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571668],
        [0.66052536, -0.28246517],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    y_gt = np.array([0] * 12 + [1] * 3)

    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def fit(self, c_data, x_data, y_data):
    """Optionally resample the training data, then fit ``self.clf``.

    ``self.samplesize`` records the number of samples after each stage so
    the evolution of the training-set size can be inspected afterwards.

    Args:
        c_data: Calendar data passed to ``self.mask_cal``.
        x_data: Training features.
        y_data: Training labels.
    """
    # Track the evolution of the training-sample count.
    self.samplesize = [len(x_data)]

    if self.reject_by_calendar:
        # The mask is computed but intentionally not applied: not filtering
        # the rejected rows might improve second-classifier training.
        mask = self.mask_cal(c_data, y_data)
        self.samplesize.append(len(x_data))

    if self.use_resampling:
        # First under-sample with AllKNN, then run SMOTEENN on the result.
        for resampler_cls in (AllKNN, SMOTEENN):
            x_data, y_data = resampler_cls().fit_sample(x_data, y_data)
            self.samplesize.append(len(x_data))

    # Train the classifier only on the filtered and resampled data.
    if not self.use_weights:
        self.clf.fit(x_data, y_data)
        return

    try:
        self.clf.fit(x_data, y_data, self.get_weights(y_data))
    except TypeError:
        # Fall back to an unweighted fit and stay unweighted from now on.
        print("The classifier selected does not admit weights for training samples")
        print("Switching to no weights")
        self.use_weights = False
        self.clf.fit(x_data, y_data)
def use_debug_parameters(self, reduced_selected_features):
    """Return a small XGBoost parameter grid for debugging runs.

    Args:
        reduced_selected_features: Sequence of candidate feature column
            selections; only its first two entries are used.

    Returns:
        list: A single-element list holding the parameter dict.
    """
    xgboost_grid = {
        'scaler': [StandardScaler()],
        'sampling': [modelutil.Nosampler(), SMOTE(), SMOTEENN(), ADASYN()],
        'feat__cols': reduced_selected_features[0:2],
        # With hyperthreading enabled, xgboost may become slower.
        'model__nthread': [4],
        'model__objective': ['binary:logistic'],
        'model__learning_rate': [0.05, 0.5],  # the so-called `eta` value
        'model__max_depth': [6, 7, 8],
        'model__min_child_weight': [11],
        'model__silent': [1],
        'model__subsample': [0.8],
        'model__colsample_bytree': [0.7],
        # Number of trees; raise to 1000 for better results.
        'model__n_estimators': [5, 10],
        'model__missing': [-999],
        'model__seed': [1337],
    }
    return [xgboost_grid]
def sampling(X, y):
    """Produce the original data plus SMOTE and SMOTE-ENN resampled copies.

    Args:
        X: Feature matrix.
        y: Labels.

    Returns:
        tuple: ``(lists, names)`` where ``lists`` holds ``(X, y)`` pairs
        and ``names`` the matching labels ('original', 'over-SMOTE',
        'over+under-SMOTE-ENN').
    """
    debug('Started sampling')
    lists = [(X, y)]
    names = ['original']

    def _resample_timed(make_sampler, label, done_message):
        # The clock starts before the sampler is constructed.
        started = time.time()
        resampled = make_sampler().fit_sample(X, y)
        X_pp, y_pp = resampled
        lists.append((X_pp, y_pp))
        names.append(label)
        elapsed = int(time.time() - started)
        debug(done_message.format(elapsed))

    # Over-sampling.
    _resample_timed(lambda: SMOTE(kind='regular'),
                    'over-SMOTE',
                    'Finished sampling SMOTE in {} seconds')

    # NOTE: an under-sampling-only (EditedNearestNeighbours) variant
    # existed here and is currently disabled.

    # Over-sampling followed by under-sampling.
    _resample_timed(lambda: SMOTEENN(),
                    'over+under-SMOTE-ENN',
                    'Finished sampling SMOTE-ENN in {} seconds')

    return lists, names
def resampling(X_train, y_train):
    """Rebalance a training set with SMOTEENN, printing class counts.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        tuple: The resampled ``(X_train, y_train)``.
    """
    from imblearn.combine import SMOTEENN

    balancer = SMOTEENN()
    print('dataset shape {}'.format(Counter(y_train)))
    balanced_X, balanced_y = balancer.fit_sample(X_train, y_train)
    print('Resampled dataset shape {}'.format(Counter(balanced_y)))
    return balanced_X, balanced_y
class ResamplingAlgorithms(Enum):
    """Catalogue of resampling algorithms.

    Each member's value is a ``(display name, sampler instance)`` pair.
    The sampler instances are created once, when the class is defined.
    """

    # Over-sampling and combined methods.
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())

    # Under-sampling methods.
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', (InstanceHardnessThreshold(random_state=1)))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the member whose display name equals ``name``.

        Falls back to ``RO`` (random over-sampling) when nothing matches.
        """
        for algorithm in cls:
            if algorithm.value[0] == name:
                return algorithm
        return cls.RO
def over_sampling(x_train, y_train):
    """Resample the training data with SMOTEENN and report class counts.

    The number of samples labelled 0, 1 and 2 is printed before and after
    resampling.

    Args:
        x_train: Training features.
        y_train: Training labels; compared element-wise against 0, 1, 2.

    Returns:
        tuple: The resampled ``(x, y)``.
    """
    count_templates = (
        "#Sample in Class 0: {}",
        "#Sample in Class 1: {}",
        "#Sample in Class 2: {}",
    )

    def _report_counts(labels):
        # Compute every count first, then print them in class order.
        counts = [np.sum(labels == cls_id)
                  for cls_id in range(len(count_templates))]
        for template, count in zip(count_templates, counts):
            print(template.format(count))

    print()
    print("Doing over sampling...")
    print("Before over sampling:")
    _report_counts(y_train)

    # NOTE: plain SMOTE, SVMSMOTE and SMOTETomek were tried here as
    # alternatives and are currently disabled.
    balancer = SMOTEENN(random_state=0)
    x_out, y_out = balancer.fit_resample(x_train, y_train)

    print("After over sampling:")
    _report_counts(y_out)
    return x_out, y_out
def resample_dataset(df, feature_list, repo_type):
    """Resample the selected feature columns of ``df`` with SMOTEENN.

    Args:
        df: Frame holding the feature columns and a "SECU_FLAG" label
            column.
        feature_list: Names of the feature columns to resample.
        repo_type: Label used in the progress message and written to the
            "REPO_TYPE" column of the result.

    Returns:
        pandas.DataFrame: The resampled features plus "SECU_FLAG" and
        "REPO_TYPE" columns.
    """
    print("\nResampling data for the " + repo_type + " dataset...")

    # One list of feature values per row of <df>.
    feat_val_mat = [
        [row[feature] for feature in feature_list]
        for _, row in tqdm(df.iterrows(), desc="\tProgress")
    ]

    # sampling_strategy="all" asks the sampler to resample every class.
    smote_obj = SMOTEENN(sampling_strategy="all", random_state=99)
    resampled_data, resampled_targets = smote_obj.fit_resample(
        feat_val_mat, list(df["SECU_FLAG"]))

    # Recreate the reduced dataframe and restore the bookkeeping columns.
    resampled_df = pd.DataFrame(resampled_data, columns=feature_list)
    resampled_df["SECU_FLAG"] = resampled_targets
    resampled_df["REPO_TYPE"] = [repo_type] * len(resampled_df.index)
    return resampled_df