def test_validate_estimator_init():
    """Test right processing while passing objects as initialization"""
    # Create a SMOTE and ENN object
    smote = SMOTE(random_state=RND_SEED)
    enn = EditedNearestNeighbours(random_state=RND_SEED)
    smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                     [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                     [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668], [0.66052536, -0.28246517],
                     [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_validate_estimator_default():
    """Test right processing while passing no object as initialization"""
    smt = SMOTEENN(random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                     [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                     [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668], [0.66052536, -0.28246517],
                     [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_sample_regular():
    """Test sample function with regular SMOTE."""
    # Create the object
    smote = SMOTEENN(random_state=RND_SEED)
    # Fit the data
    smote.fit(X, Y)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                     [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                     [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668], [0.66052536, -0.28246517],
                     [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_sample_regular_half():
    """Test sample function with regular SMOTE and a ratio of 0.8."""
    # Create the object
    ratio = 0.8
    smote = SMOTEENN(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    smote.fit(X, Y)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                     [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                     [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                     [0.36784496, -0.1953161], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def resampling(X_train, y_train):
    from collections import Counter
    from imblearn.combine import SMOTEENN
    sm = SMOTEENN()
    print('Original dataset shape {}'.format(Counter(y_train)))
    X_train, y_train = sm.fit_resample(X_train, y_train)
    print('Resampled dataset shape {}'.format(Counter(y_train)))
    return X_train, y_train
def split_data_resampling(X, y, test_percentage=0.2):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_percentage, random_state=42)
    smote_enn = SMOTEENN(random_state=0)
    X_train_resampled, y_train_resampled = smote_enn.fit_resample(
        X_train, y_train)
    return X_train_resampled, y_train_resampled, X_test, y_test
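# A hypothetical downstream use of split_data_resampling (not from the original
# source; `X`, `y`, and the classifier choice are assumptions): train on the
# resampled split, but score on the untouched test split so the evaluation
# reflects the real class distribution.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X_tr, y_tr, X_te, y_te = split_data_resampling(X, y)
clf = RandomForestClassifier(random_state=42).fit(X_tr, y_tr)
print(classification_report(y_te, clf.predict(X_te)))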
def get_smotenn(X_trn, y_trn, seed=int(623 * 449)):
    """Resample using SMOTEENN."""
    sme = SMOTEENN(random_state=seed)
    X_trn, y_trn = sme.fit_resample(X_trn, y_trn)
    return X_trn, y_trn
def over_sampling(x_train, y_train):
    print()
    print("Doing over sampling...")
    print("Before over sampling:")
    class0_num = np.sum(y_train == 0)
    class1_num = np.sum(y_train == 1)
    class2_num = np.sum(y_train == 2)
    print("#Sample in Class 0: {}".format(class0_num))
    print("#Sample in Class 1: {}".format(class1_num))
    print("#Sample in Class 2: {}".format(class2_num))
    # Using SMOTE, an over-sampling approach, on training and validation data:
    # https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
    # Alternatives tried:
    # sm = SMOTE(sampling_strategy='auto', random_state=10)
    # sm = SVMSMOTE(random_state=0)
    # sm = SMOTETomek(ratio='auto')
    sm = SMOTEENN(random_state=0)
    x_train, y_train = sm.fit_resample(x_train, y_train)
    x_out = x_train
    y_out = y_train
    print("After over sampling:")
    class0_num = np.sum(y_out == 0)
    class1_num = np.sum(y_out == 1)
    class2_num = np.sum(y_out == 2)
    print("#Sample in Class 0: {}".format(class0_num))
    print("#Sample in Class 1: {}".format(class1_num))
    print("#Sample in Class 2: {}".format(class2_num))
    return x_out, y_out
def runtree(data, target):
    lb = preprocessing.LabelEncoder()
    lb.fit(target)
    target1 = lb.transform(target)
    sm = SMOTEENN()
    clf = tree.DecisionTreeClassifier()
    folds = [3]
    depths = [10]
    print("------------ TREE ------------")
    for fold in folds:
        # shuffle=True is required when random_state is set in recent scikit-learn
        skf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=5)
        test_target = []
        test_predict = []
        test_proba = []
        test_proba_target = []
        for train_index, test_index in skf.split(data, target1):
            clf_ = clone(clf)
            X_resampled, y_resampled = sm.fit_resample(data[train_index],
                                                       target1[train_index])
            clf_.fit(X_resampled, y_resampled)
            test_predict.append(clf_.predict(data[test_index]))
            test_target.append(target1[test_index])
            test_proba_target.extend(target1[test_index])
            test_proba.extend(clf_.predict_proba(data[test_index])[:, 1])
        print_scores(test_predict, test_target)
        print(roc_auc_score(y_true=test_proba_target, y_score=test_proba))
def get_simple_train_test_split(self):
    X_train, X_test, y_train, y_test = train_test_split(
        self.X, self.y, test_size=self.test_size,
        random_state=self.random_state)
    if self.missingvals:
        # Impute missing vals with column mean
        imp = SimpleImputer()
        imp.fit(X_train)
        X_train = imp.transform(X_train)
        X_test = imp.transform(X_test)
    if self.balance:
        # Balance out classes
        # Not needed when we use frequency binning!
        balancer = SMOTEENN(random_state=self.random_state)
        X_train, y_train = balancer.fit_resample(X_train, y_train)
    if self.standardize:
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
    return X_train, y_train, X_test, y_test
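# get_simple_train_test_split expects an enclosing class; a minimal
# hypothetical holder (class name and attribute defaults are assumptions)
# carrying the attributes the method reads:
class DataPrep:
    get_simple_train_test_split = get_simple_train_test_split  # reuse as method

    def __init__(self, X, y, test_size=0.2, random_state=0,
                 missingvals=True, balance=True, standardize=True):
        self.X, self.y = X, y
        self.test_size = test_size
        self.random_state = random_state
        self.missingvals = missingvals
        self.balance = balance
        self.standardize = standardize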
def get_models():
    models, names = list(), list()
    # SMOTEENN
    sampling = SMOTEENN(enn=EditedNearestNeighbours(
        sampling_strategy='majority'))
    model = LogisticRegression(solver='liblinear')
    steps = [('e', sampling), ('m', model)]
    models.append(Pipeline(steps=steps))
    names.append('LR')
    # SMOTEENN + Norm
    sampling = SMOTEENN(enn=EditedNearestNeighbours(
        sampling_strategy='majority'))
    model = LogisticRegression(solver='liblinear')
    steps = [('t', MinMaxScaler()), ('e', sampling), ('m', model)]
    models.append(Pipeline(steps=steps))
    names.append('Norm')
    # SMOTEENN + Std
    sampling = SMOTEENN(enn=EditedNearestNeighbours(
        sampling_strategy='majority'))
    model = LogisticRegression(solver='liblinear')
    steps = [('t', StandardScaler()), ('e', sampling), ('m', model)]
    models.append(Pipeline(steps=steps))
    names.append('Std')
    # SMOTEENN + Power
    sampling = SMOTEENN(enn=EditedNearestNeighbours(
        sampling_strategy='majority'))
    model = LogisticRegression(solver='liblinear')
    steps = [('t1', MinMaxScaler()), ('t2', PowerTransformer()),
             ('e', sampling), ('m', model)]
    models.append(Pipeline(steps=steps))
    names.append('Power')
    return models, names
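# A minimal sketch (not part of the original snippet) of how the models/names
# pairs from get_models() are typically consumed; `X`, `y`, and the ROC AUC
# metric are assumptions.
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

def evaluate_models(models, names, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    for name, model in zip(names, models):
        scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv,
                                 n_jobs=-1)
        print('%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))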
def SMOTE_ENN(X_train, Y_train, seed, sampling_strategy,
              k_neighbors_smote=5, n_neighbors_enn=3, kind_sel='all'):
    enn = EditedNearestNeighbours(random_state=seed, n_jobs=-1,
                                  n_neighbors=n_neighbors_enn,
                                  kind_sel=kind_sel,
                                  sampling_strategy=sampling_strategy)
    smote = SMOTE(random_state=seed, n_jobs=-1,
                  k_neighbors=k_neighbors_smote,
                  sampling_strategy=sampling_strategy)
    smote_enn = SMOTEENN(random_state=seed, smote=smote, enn=enn,
                         sampling_strategy=sampling_strategy)
    print('Before SMOTE + ENN : ', sorted(Counter(Y_train).items()))
    X_train_resampled, Y_train_resampled = smote_enn.fit_resample(
        X_train, Y_train)
    print('After SMOTE + ENN : ', sorted(Counter(Y_train_resampled).items()))
    X_train_resampled, Y_train_resampled = shuffle_dataset(
        X_train_resampled, Y_train_resampled, seed)
    return X_train_resampled, Y_train_resampled
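# SMOTE_ENN above calls a shuffle_dataset helper that is not defined in the
# snippet; a minimal sketch, assuming it is just a seeded unison shuffle:
from sklearn.utils import shuffle

def shuffle_dataset(X, y, seed):
    # Shuffle features and labels together so the pairs stay aligned.
    return shuffle(X, y, random_state=seed)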
def smpote_test():
    # Read the data from the test dataset
    truth_df = pd.read_hdf('D:\\kpi\\1.hdf')
    # print(truth_df["KPI ID"])
    kpi_names = truth_df['KPI ID'].values
    truth = truth_df[truth_df["KPI ID"] == kpi_names[0]]
    y = truth['label']
    X = truth.drop(columns=['label', 'KPI ID'])
    sm = SMOTEENN()
    X_resampled, y_resampled = sm.fit_resample(X, y)
    dfX = pd.DataFrame(X_resampled, columns=['timestamp', 'value'])
    DFy = pd.DataFrame(y_resampled, columns=['label'])
    plt.plot(np.array(X['timestamp']), np.array(X['value']),
             color='green', label='original values')
    plt.legend()  # Show the legend
    plt.show()
    dfX = dfX.join(DFy).sort_values(by="timestamp", ascending=True)
    plt.plot(np.array(dfX['timestamp']), np.array(dfX['value']),
             color='red', label='resampled values')
    plt.legend()  # Show the legend
    plt.show()
def test_validate_estimator_deprecation():
    """Test right processing while passing old parameters"""
    X_gt = np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976], [1.52091956, -0.49283504],
                     [0.88407872, 0.35454207], [1.31301027, -0.92648734],
                     [-0.41635887, -0.38299653], [1.70580611, -0.11219234],
                     [0.29307743, -0.14670439], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571668], [0.66052536, -0.28246517],
                     [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
    smt = SMOTEENN(random_state=RND_SEED, n_jobs=-1)
    X_resampled, y_resampled = smt.fit_sample(X, Y)
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    smt = SMOTEENN(random_state=RND_SEED, k=5)
    X_resampled, y_resampled = smt.fit_sample(X, Y)
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def resample_dataset(df, feature_list, repo_type):
    num_rows = len(df.index)  # number of rows in <df>
    num_features = len(feature_list)  # number of feature columns to resample
    cur_row = []  # list to hold the current row of <df>
    feat_val_mat = []  # the matrix (list of lists) to hold all feature values
    counter = 0  # counter for progress

    print("\nResampling data for the " + repo_type + " dataset...")
    for idx, row in tqdm(df.iterrows(), desc="\tProgress"):  # loop <num_rows> times
        counter += 1
        # print_progress(counter, num_rows)
        for j in range(num_features):  # loop <num_features> times
            cur_row.append(row[feature_list[j]])  # form list of current row values
        feat_val_mat.append(cur_row)  # append <cur_row> to <feat_val_mat>
        cur_row = []
    # <smote_obj> should over/under-sample both the "NEUTRAL" and "INSECURE" classes
    smote_obj = SMOTEENN(sampling_strategy="all", random_state=99)
    resampled_data, resampled_targets = smote_obj.fit_resample(
        feat_val_mat, list(df["SECU_FLAG"]))
    # Recreate the reduced dataframe
    resampled_df = pd.DataFrame(resampled_data, columns=feature_list)
    # Re-initialize the "SECU_FLAG" and "REPO_TYPE" columns
    resampled_df["SECU_FLAG"] = resampled_targets
    resampled_df["REPO_TYPE"] = [repo_type] * len(resampled_df.index)
    return resampled_df
def balanced_train(data, features):
    X = data[features]
    y = data['label']
    from imblearn.combine import SMOTEENN
    smote_enn = SMOTEENN(random_state=42)
    X_resampled, y_resampled = smote_enn.fit_resample(X, y)
    return X_resampled, y_resampled
def resample(X, Y, nb_class):
    print("original shape: ", X.shape)
    labels = Y.astype(int)
    counts = np.bincount(labels)
    if len(counts) != nb_class:
        print("there are no samples to interpolate! skip this fold.")
        return X, Y
    class_dist = counts / float(sum(counts))
    print("original dist: ", class_dist)
    org_shape = X.shape
    sampler = SMOTEENN(random_state=0)
    flattened_X = X.reshape(
        (X.shape[0], X.shape[1] * X.shape[2] * X.shape[3] * X.shape[4]))
    X_resampled, Y_resampled = sampler.fit_resample(flattened_X, labels)
    X_resampled = X_resampled.reshape(
        (X_resampled.shape[0], X.shape[1], X.shape[2], X.shape[3],
         X.shape[4]))
    print("sampled shape: ", X_resampled.shape)
    Y_resampled = Y_resampled.astype(int)
    counts = np.bincount(Y_resampled)
    class_dist = counts / float(sum(counts))
    print("after SMOTEENN dist: ", class_dist)
    return X_resampled, Y_resampled
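# Hypothetical usage of resample (shapes and names are assumptions): a batch
# of 100 five-dimensional samples, 10 of which belong to the minority class.
import numpy as np

X_demo = np.random.rand(100, 2, 4, 4, 3)
Y_demo = np.array([0] * 90 + [1] * 10, dtype=float)
X_bal, Y_bal = resample(X_demo, Y_demo, nb_class=2)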
def smoter(df):
    IDs = df.Quote_ID
    target = df.QuoteConversion_Flag
    data = df.drop(['QuoteConversion_Flag'], axis=1).values
    print("Before SMOTE: ", sorted(Counter(target).items()))

    ####
    # ENN
    ####
    enn = ENN(sampling_strategy="not majority", kind_sel="mode",
              n_neighbors=5, n_jobs=-1, random_state=RANDOM_STATE)
    smote_enn = SMOTEENN(enn=enn, random_state=RANDOM_STATE)
    X_resampled, y_resampled = smote_enn.fit_resample(data, target)
    print("SMOTE ENN: ", sorted(Counter(y_resampled).items()))

    ####
    # Tomeks
    ####
    # smote_tomek = SMOTETomek(random_state=0)
    # X_resampled, y_resampled = smote_tomek.fit_resample(data, target)
    # print("Using SMOTE: ", sorted(Counter(y_resampled).items()))

    data = pd.DataFrame(data=X_resampled, columns=FIELDS)
    target = pd.DataFrame(data=y_resampled, columns=['QuoteConversion_Flag'])
    return data, target
def smot2(train_x, train_y, feature_columns):
    from imblearn.combine import SMOTEENN
    from imblearn.over_sampling import SMOTE, ADASYN
    from imblearn.under_sampling import (TomekLinks, RandomUnderSampler,
                                         CondensedNearestNeighbour)
    from sklearn.svm import SVC

    print('\nOriginal dataset shape {}'.format(Counter(train_y)))
    # SMOTEENN itself has no n_neighbors parameter; the neighbour count and
    # the sampling strategy belong on the SMOTE step.
    sm = SMOTEENN(random_state=42,
                  smote=SMOTE(sampling_strategy='minority', k_neighbors=50,
                              random_state=42, n_jobs=3))
    # Alternatives tried:
    # sm = ADASYN(sampling_strategy='minority', n_jobs=3, random_state=42, n_neighbors=100)
    # sm = SMOTE(sampling_strategy='minority', n_jobs=3, random_state=42, m_neighbors=200)
    # sm = CondensedNearestNeighbour(sampling_strategy='majority', random_state=42)
    log.traceLogInfo("\nSMOT2 fit ... balancing")
    X_res, y_res = sm.fit_resample(train_x, train_y)
    print('\nResampled dataset shape {}'.format(Counter(y_res)))
    # Rebuild the DataFrame and Series
    train_x = pd.DataFrame(X_res, columns=feature_columns)
    train_y = pd.Series(y_res)
    return train_x, train_y
def unba_smoteenn(x, y):
    x1 = x.reshape(x.shape[0], -1)  # e.g. 7259 x 480
    smoteenn = SMOTEENN(random_state=0)  # build the SMOTEENN sampler
    x1, y1 = smoteenn.fit_resample(x1, y)  # augmented, still 480 columns
    x2 = np.zeros((x1.shape[0], x.shape[1], x.shape[2], 1))
    for i in tqdm(range(x1.shape[0])):
        x2[i, :, :, 0] = np.reshape(x1[i], (60, 8))
    return x2, y1
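# Hypothetical usage of unba_smoteenn (the 200x60x8 shape is an assumption
# chosen to match the hard-coded 60x8 reshape above):
import numpy as np

x_demo = np.random.rand(200, 60, 8)
y_demo = np.array([0] * 180 + [1] * 20)
x_bal, y_bal = unba_smoteenn(x_demo, y_demo)
print(x_bal.shape, np.bincount(y_bal))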
def train_decisiontree_with(configurationname, train_data, k, score_function,
                            undersam=False, oversam=False, export=False,
                            **kwargs):
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    max_depth = None if "max_depth" not in kwargs else kwargs["max_depth"]
    dtc = DecisionTreeClassifier(criterion="entropy", random_state=0,
                                 max_depth=max_depth)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectKBest(score_function, k=k)
    selector = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = [i for i in selector.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)
    if export:
        print("Exporting tree to graph...")
        export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" +
                        configurationname + ".dot", filled=True)
        transform(fitted_ids, configurationname)
    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
def imbalance_undersampling(datafile):
    df = filling_missing(datafile)
    # Combine over- and under-sampling together with SMOTEENN
    smote_enn = SMOTEENN(random_state=0)
    X_resampled, y_resampled = smote_enn.fit_resample(df[features],
                                                      df.country_destination)
    print(sorted(Counter(y_resampled).items()))
    back = pd.DataFrame(np.hstack((X_resampled, y_resampled[:, None])))
    # [516489 rows x 14 columns]
    # print(back)
    return back
def SMOTEENN_oversampling(x, y):
    print('Original dataset shape {}'.format(Counter(y)))
    smote_enn = SMOTEENN(random_state=42)
    x_sampled, y_sampled = smote_enn.fit_resample(x, y)
    print('With SMOTEENN sampled dataset shape {}'.format(Counter(y_sampled)))
    return x_sampled, y_sampled
def test_error_wrong_object():
    smote = 'rnd'
    enn = 'rnd'
    smt = SMOTEENN(smote=smote, random_state=RND_SEED)
    assert_raises_regex(ValueError, "smote needs to be a SMOTE",
                        smt.fit_sample, X, Y)
    smt = SMOTEENN(enn=enn, random_state=RND_SEED)
    assert_raises_regex(ValueError, "enn needs to be an ",
                        smt.fit_sample, X, Y)
def test_sample_wrong_X():
    """Test that an error is raised when the X passed at sampling time
    differs from the one seen at fit time."""
    # Create the object
    sm = SMOTEENN(random_state=RND_SEED)
    sm.fit(X, Y)
    assert_raises(RuntimeError, sm.sample,
                  np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
def over_sampling(data):
    data = data.drop('aid', axis=1)
    data = data.drop('uid', axis=1)
    y = data['label']
    X = data.drop('label', axis=1)
    sme = SMOTEENN()
    # With pandas input, fit_resample returns a DataFrame/Series pair,
    # so the results can be concatenated directly.
    X_res, y_res = sme.fit_resample(X, y)
    data_res = pd.concat([X_res, y_res], axis=1)
    data_res.to_csv('./data/train_all_after_overSamlping.csv', index=False)
def balance(x, y, randomstate=None, **kwargs):
    # SMOTEENN itself takes no n_neighbors parameter; pass the neighbour
    # count to the ENN step instead.
    sm = SMOTEENN(random_state=randomstate, n_jobs=3,
                  enn=EditedNearestNeighbours(
                      n_neighbors=kwargs['neighbors']))
    print('Original dataset shape {}'.format(Counter(y)))
    print('Resampling...')
    rx, ry = sm.fit_resample(x, y)
    print('Resampled dataset shape {}'.format(Counter(ry)))
    return rx, ry
def test_validate_estimator_default():
    smt = SMOTEENN(random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571667], [0.66052536, -0.28246518],
                     [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def balancingClassesSmoteenn(x_train, y_train):
    # Use SMOTEENN to balance our training data points
    smn = SMOTEENN(random_state=7)
    features_balanced, target_balanced = smn.fit_resample(x_train, y_train)
    print("Count for each class value after SMOTEENN:",
          collections.Counter(target_balanced))
    return features_balanced, target_balanced
def balance_train_data(data):
    print("Start balancing...")
    features, labels = data
    start_time = time.time()
    smote_enn = SMOTEENN(random_state=42)
    features, labels = smote_enn.fit_resample(features, labels)
    print("Balanced dataset:", sorted(Counter(labels).items()))
    print("Balancing time:", time.time() - start_time)
    return (features, labels)
def test_sample_regular_half():
    ratio = 0.8
    smote = SMOTEENN(ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.08711622, 0.93259929]])
    y_gt = np.array([0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def smote_enn_sampling(X, Y):
    nsamples, nx, ny = X.shape
    X = X.reshape((nsamples, nx * ny))
    # fit_resample returns only the resampled X and Y (no indices)
    X, Y = SMOTEENN().fit_resample(X, Y)
    nsamples, ny = X.shape
    X = X.reshape((nsamples, nx, ny // nx))  # integer division for the shape
    Y = Y.reshape((nsamples, 1))
    return X, Y
def test_sample_regular_half():
    sampling_strategy = {0: 10, 1: 12}
    smote = SMOTEENN(
        sampling_strategy=sampling_strategy, random_state=RND_SEED)
    X_resampled, y_resampled = smote.fit_resample(X, Y)
    X_gt = np.array([[1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.08711622, 0.93259929]])
    y_gt = np.array([0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_validate_estimator_init():
    smote = SMOTE(random_state=RND_SEED)
    enn = EditedNearestNeighbours(sampling_strategy='all')
    smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED)
    X_resampled, y_resampled = smt.fit_resample(X, Y)
    X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571667], [0.66052536, -0.28246518],
                     [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def test_smote_fit():
    """Test the fitting method"""
    # Create the object
    smote = SMOTEENN(random_state=RND_SEED)
    # Fit the data
    smote.fit(X, Y)
    # Check that the class statistics have been computed
    assert_equal(smote.min_c_, 0)
    assert_equal(smote.maj_c_, 1)
    assert_equal(smote.stats_c_[0], 500)
    assert_equal(smote.stats_c_[1], 4500)
def test_sample_regular():
    """Test sample function with regular SMOTE."""
    # Create the object
    smote = SMOTEENN(random_state=RND_SEED)
    # Fit the data
    smote.fit(X, Y)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'smote_enn_reg_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'smote_enn_reg_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_sample_regular_pass_smote_enn():
    smote = SMOTEENN(
        smote=SMOTE(ratio='auto', random_state=RND_SEED),
        enn=EditedNearestNeighbours(ratio='all', random_state=RND_SEED),
        random_state=RND_SEED)
    X_resampled, y_resampled = smote.fit_sample(X, Y)
    X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176],
                     [0.61319159, -0.11571667], [0.66052536, -0.28246518],
                     [-0.28162401, -2.10400981], [0.83680821, 1.72827342],
                     [0.08711622, 0.93259929]])
    y_gt = np.array([0, 0, 0, 0, 1, 1, 1])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def SMOTE(self, bug_rate, X, Y):
    """Combine over- and under-sampling using SMOTE and Edited Nearest
    Neighbours to process the original dataset.

    :param bug_rate: target ratio of the minority (buggy) class
    :param X: the dataset without the label column
    :param Y: the label information
    :return: None; the processed X and Y are stored column-stacked in
        self.new_list_SMOTE
    """
    from imblearn.combine import SMOTEENN
    import numpy as np
    sme = SMOTEENN(ratio=bug_rate)
    x_res, y_res = sme.fit_sample(X, Y)
    nx = np.column_stack((x_res, y_res))
    self.new_list_SMOTE = nx
def __init__(self, kind, data, target, verbose=False, ratio='auto'):
    assert len(data) == len(target)
    self.data = data
    self.target = target
    if kind in [Undersampling.ClusterCentroids]:
        if verbose:
            print('> CLUSTER CENTROIDS')
        # Undersampling via Cluster Centroids
        self.undersampler = ClusterCentroids(verbose=verbose, ratio=ratio)
    elif kind in [Undersampling.SMOTEENN]:
        if verbose:
            print('> SMOTEENN')
        # Resampling via SMOTEENN
        self.undersampler = SMOTEENN(verbose=verbose, ratio=ratio)
    else:
        # raise needs an exception instance, not a plain string
        raise ValueError("Nonexistent undersampling type: " + kind.name)
def train_decisiontree_with(configurationname, train_data, k, score_function,
                            undersam=False, oversam=False, export=False):
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectKBest(score_function, k=k)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = [i for i in result.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)
    if export:
        export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" +
                        configurationname + ".dot", filled=True)
        transform(fitted_ids, configurationname)
    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
def test_error_wrong_object():
    smote = 'rnd'
    enn = 'rnd'
    smt = SMOTEENN(smote=smote, random_state=RND_SEED)
    with raises(ValueError, match="smote needs to be a SMOTE"):
        smt.fit_resample(X, Y)
    smt = SMOTEENN(enn=enn, random_state=RND_SEED)
    with raises(ValueError, match="enn needs to be an "):
        smt.fit_resample(X, Y)
def test_parallelisation():
    # Check that the default job count is 1
    smt = SMOTEENN(random_state=RND_SEED)
    smt._validate_estimator()
    assert smt.n_jobs == 1
    assert smt.smote_.n_jobs == 1
    assert smt.enn_.n_jobs == 1

    # Check that the job count can be set
    smt = SMOTEENN(random_state=RND_SEED, n_jobs=8)
    smt._validate_estimator()
    assert smt.n_jobs == 8
    assert smt.smote_.n_jobs == 8
    assert smt.enn_.n_jobs == 8
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, cm[i, j], horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')

# Define X, y
X, y = (data.loc[:, data.columns != 'state'].values,
        data.loc[:, data.columns == 'state'].values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

# SMOTEENN
sme = SMOTEENN(random_state=42)
os_X, os_y = sme.fit_resample(X_train, y_train)

# QDA (store_covariances was renamed store_covariance in scikit-learn)
clf_QDA = QuadraticDiscriminantAnalysis(store_covariance=True)
clf_QDA.fit(os_X, os_y)
y_true, y_pred = y_test, clf_QDA.predict(X_test)

# F1 score, precision, recall, specificity, G score
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=100, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply SMOTE + ENN
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=0.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=0.5)
def median(data):
    # Reconstructed wrapper: the original snippet began mid-function with the
    # even-length branch below.
    data = sorted(data)
    n = len(data)
    if n % 2 == 1:
        return data[n // 2]
    i = n // 2
    return (data[i - 1] + data[i]) / 2


start = time()

n_iter = 100  # Number of evaluations (SMAC)
n_validations = 7  # Number of Monte-Carlo cross-validations per evaluated model

# Dataset 11
url11 = "https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/ticdata2000.txt"
# Python 3: urlopen lives in urllib.request
dataset11 = np.genfromtxt(urllib.request.urlopen(url11))
X = dataset11[:, 0:85]
Y = dataset11[:, 85]
sm = SMOTEENN()
X, Y = sm.fit_resample(X, Y)


# Fit the MLP with the given hyperparameters and return the model's median
# accuracy over 7 trials
def mlp(number_layers, number_neurons_1, number_neurons_2, number_neurons_3,
        number_neurons_4, dropout_rate):
    layers = []
    number_neurons = []
    number_neurons.append(number_neurons_1)
    number_neurons.append(number_neurons_2)
    number_neurons.append(number_neurons_3)
    number_neurons.append(number_neurons_4)
    for i in np.arange(number_layers):
        layers.append(Layer("Sigmoid", units=number_neurons[i],
                            dropout=dropout_rate))
@pytest.mark.parametrize(
    "smote_params, err_msg",
    # Parameter pairs inferred from the non-parametrized version of this test
    [({'smote': 'rnd'}, "smote needs to be a SMOTE"),
     ({'enn': 'rnd'}, "enn needs to be an ")])
def test_error_wrong_object(smote_params, err_msg):
    smt = SMOTEENN(**smote_params)
    with pytest.raises(ValueError, match=err_msg):
        smt.fit_resample(X, Y)