def test_borderline_smote(kind, data):
    """Default NN estimators and explicit NearestNeighbors objects must agree.

    BorderlineSMOTE built with integer neighbor counts and one built with
    pre-constructed ``NearestNeighbors`` estimators (n_neighbors = k+1 / m+1)
    must produce identical resampled outputs.
    """
    default_variant = BorderlineSMOTE(kind=kind, random_state=42)
    explicit_variant = BorderlineSMOTE(
        kind=kind,
        random_state=42,
        k_neighbors=NearestNeighbors(n_neighbors=6),
        m_neighbors=NearestNeighbors(n_neighbors=11),
    )

    X_default, y_default = default_variant.fit_resample(*data)
    X_explicit, y_explicit = explicit_variant.fit_resample(*data)

    assert_allclose(X_default, X_explicit)
    assert_array_equal(y_default, y_explicit)
def test_borderline_smote(kind):
    """Same equivalence check, using the module-level X/Y fixture arrays.

    NOTE(review): this redefines ``test_borderline_smote``; if it shares a
    module with the (kind, data) variant, only this one is collected by
    pytest — confirm that is intended.
    """
    default_variant = BorderlineSMOTE(kind=kind, random_state=42)
    explicit_variant = BorderlineSMOTE(
        kind=kind,
        random_state=42,
        k_neighbors=NearestNeighbors(n_neighbors=6),
        m_neighbors=NearestNeighbors(n_neighbors=11),
    )

    X_default, y_default = default_variant.fit_resample(X, Y)
    X_explicit, y_explicit = explicit_variant.fit_resample(X, Y)

    assert_allclose(X_default, X_explicit)
    assert_array_equal(y_default, y_explicit)
def train(self, gridsearch=False):
    """Fit the estimator on a Borderline-SMOTE-resampled training set.

    Builds the feature pipeline, fit-transforms ``self.X_train``, oversamples
    the minority class with Borderline-SMOTE, then trains either the plain
    estimator or a RandomizedSearchCV over ``self.model_params``.

    Parameters
    ----------
    gridsearch : bool, optional
        When True, run a randomized hyper-parameter search and keep the best
        refitted estimator; otherwise fit ``self.get_estimator()`` directly.

    Side effects: sets ``self.X_train_smote``, ``self.y_train_smote`` and
    ``self.model``; logs "train_time" (seconds) via MLflow.
    """
    tic = time.time()
    self.set_pipeline()
    # Oversample in the transformed feature space, so transform first.
    X_train_preproc = self.pipeline_feature.fit_transform(self.X_train)
    # k_neighbors=1 with m_neighbors=20 is unusually asymmetric —
    # NOTE(review): confirm these values were tuned deliberately.
    bm = BorderlineSMOTE(random_state=2,
                         sampling_strategy='minority',
                         k_neighbors=1,
                         m_neighbors=20)
    self.X_train_smote, self.y_train_smote = bm.fit_resample(
        X_train_preproc, self.y_train)
    if gridsearch:
        self.model = RandomizedSearchCV(
            estimator=self.get_estimator(),
            param_distributions=self.model_params,
            n_iter=10,
            cv=2,
            verbose=5,
            random_state=42,
            n_jobs=None,
        )
        self.model.fit(self.X_train_smote, self.y_train_smote)
        self.mlflow_log_metric("train_time", int(time.time() - tic))
        print(colored(f'best score: {self.model.best_score_}', "blue"))
        print(colored(f'best params: {self.model.best_params_}', "blue"))
        # Keep only the refitted best pipeline for later prediction.
        self.model = self.model.best_estimator_
    else:
        self.model = self.get_estimator()
        self.model.fit(self.X_train_smote, self.y_train_smote)
        self.mlflow_log_metric("train_time", int(time.time() - tic))
def create_metric(soft, metric, release, fold=3, boderlinesmote=False):
    """Build AutoSpearman-selected datasets and write per-fold training CSVs.

    For each release, selects metrics via ``getAutoSpearmanMetric``, optionally
    rebalances the data with Borderline-SMOTE, and writes ``fold`` shuffled
    training files under the hard-coded output tree.

    Parameters
    ----------
    soft : str
        Project name used in the data paths.
    metric : str
        Metric-family sub-directory name.
    release : int
        Number of releases to process (files ``1.. release``).
    fold : int, optional
        Number of shuffled training files written per release.
    boderlinesmote : bool, optional
        When True, apply Borderline-SMOTE before writing. (Parameter name
        kept misspelled for backward compatibility with existing callers.)

    Returns
    -------
    list
        The AutoSpearman-selected metric set of each release, in order.
    """
    selected = []  # renamed from `all`, which shadowed the builtin
    for i in range(release):
        path = 'F:\\orca-master\\exampledata\\mData\\ordinalRegressionData\\Three severity\\' + metric + '\\' + soft + '\\' + str(
            i + 1) + '_code&network_metrics&bugs.csv'
        auto_spearman_metric, auto_spearman_metric_data = getAutoSpearmanMetric(
            path)
        selected.append(auto_spearman_metric)
        for k in range(fold):
            if boderlinesmote:
                # Apply Borderline-SMOTE to rebalance the minority class.
                auto_spearman_metric_data = auto_spearman_metric_data.dropna(
                    axis=1)
                x = auto_spearman_metric_data.iloc[:, 0:-1]
                y = auto_spearman_metric_data.iloc[:, -1:]
                bord_smote = BorderlineSMOTE(random_state=16,
                                             kind="borderline-1")
                x_res, y_res = bord_smote.fit_resample(x, y)
                # Re-attach the resampled label column by row index.
                auto_spearman_metric_data = pd.merge(x_res, y_res, how='left',
                                                     left_index=True,
                                                     right_index=True)
            save_path = 'F:\\orca-master\\exampledata\\' + metric + '\\' + soft + '\\' + str(
                fold) + '-fold\\' + soft + str(
                i + 1) + '\\matlab\\' + 'train_' + soft + str(i + 1) + '.' + str(k)
            # Shuffle row order so each fold file differs.
            tmp = shuffle(auto_spearman_metric_data)
            tmp.to_csv(save_path, header=None, index=False, sep=" ")
    return selected
def over_under_sampling(x, y):
    """Balance the classes with Borderline-SMOTE.

    ``y`` arrives one-hot encoded: it is collapsed to column labels for
    resampling and re-expanded to dummy columns before returning.
    """
    print('Generating synthetic samples...')
    sampler = BorderlineSMOTE()
    # A combined over/under pipeline (RandomUnderSampler at 0.5) was tried
    # previously and left disabled; only oversampling is applied now.
    labels = y.idxmax(axis=1)
    x, resampled_labels = sampler.fit_resample(x, labels)
    return x, pd.get_dummies(resampled_labels)
def bordersmote(x, y):
    """Oversample the minority class to parity with Borderline-SMOTE.

    Both neighbor counts scale with the positive count (1% of ``sum(y)``,
    rounded up) and are clamped to at least 1 so BorderlineSMOTE never
    receives an invalid ``k_neighbors``/``m_neighbors`` of 0 when there are
    no positive labels (``ceil(0)`` is 0).

    Returns the resampled ``(x, y)`` pair from ``fit_resample``.
    """
    neighbor_count = max(1, math.ceil(sum(y) * 0.01))
    bordersmote = BorderlineSMOTE(sampling_strategy=1,
                                  k_neighbors=neighbor_count,
                                  m_neighbors=neighbor_count)
    return bordersmote.fit_resample(x, y)
def up_sampling(X_train, y_train, ratio=2):
    """Oversample class 1 to ``ratio`` times its current count.

    Returns the inputs untouched when there are no positive samples.

    NOTE(review): ``fillna`` runs in place, so the caller's DataFrame is
    mutated — confirm that side effect is intended.
    """
    positive_count = (y_train == 1).sum()
    if positive_count == 0:
        return X_train, y_train
    target_count = int(positive_count * ratio)
    X_train.fillna(0, inplace=True)
    sampler = BorderlineSMOTE(sampling_strategy={1: target_count},
                              random_state=2019,
                              n_jobs=8)
    X_train, y_train = sampler.fit_resample(X_train, y_train)
    return X_train, y_train
def borderline_smote(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True, pie_evr=True):
    """Resample (X, y) with Borderline-SMOTE; optionally plot diagnostics.

    Parameters
    ----------
    X, y : dataset to rebalance.
    visualize : bool, optional
        When truthy, plot a class histogram and PCA projections of the
        resampled data.
    pca2d, pca3d, pie_evr : bool, optional
        Forwarded to ``pca_general``.
    tsne : bool, optional
        Currently unused — kept for interface compatibility.
        NOTE(review): confirm whether a t-SNE plot was intended here.

    Returns
    -------
    (X_res, y_res) : the resampled dataset.
    """
    sm = BorderlineSMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    # Idiomatic truthiness check instead of the original `== True`.
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def smote_tomek(x_train, y_train):
    """Oversample with borderline-1 SMOTE, then drop majority Tomek links.

    Oversampling brings the minority class to a 0.5 ratio; the Tomek-link
    pass then removes majority samples sitting on the class boundary.
    """
    oversampler = BorderlineSMOTE(sampling_strategy=0.5,
                                  random_state=0,
                                  k_neighbors=5,
                                  m_neighbors=10,
                                  n_jobs=-1,
                                  kind='borderline-1')
    X_balanced, y_balanced = oversampler.fit_resample(x_train, y_train)
    boundary_cleaner = TomekLinks(sampling_strategy='majority', n_jobs=-1)
    return boundary_cleaner.fit_resample(X_balanced, y_balanced)
def oversample_borderline_SMOTE(df, variant=1, debug=True):
    """Oversample ``df`` (last column = label) with Borderline-SMOTE.

    ``variant`` 1 selects "borderline-1"; any other value selects
    "borderline-2". Returns a new DataFrame with the same columns.
    """
    features = df.values[:, :-1]
    labels = df.values[:, -1].astype(int)
    if debug:
        print('Original dataset shape %s' % Counter(labels))
    kind = "borderline-1" if variant == 1 else "borderline-2"
    sampler = BorderlineSMOTE(random_state=0, kind=kind)
    features_res, labels_res = sampler.fit_resample(features, labels)
    # Rebuild a DataFrame and append the label column under its original name.
    df_resampled = pd.DataFrame(features_res, columns=df.columns[:-1])
    df_resampled.insert(len(df_resampled.columns), df.columns[-1], labels_res)
    if debug:
        print('Resampled dataset shape %s' % Counter(labels_res))
    return df_resampled
def Borderline_DBSCAN(train_data, label, eps=20.1, min_samples=5):
    """Generate borderline synthetic samples, then density-filter them.

    Oversamples ``train_data`` (last column = class label) with borderline-1
    SMOTE, merges the synthetic samples for ``label`` with the majority rows,
    and runs a DBSCAN-based neighborhood check to pick samples in dense areas.

    NOTE(review): relies on two non-standard APIs — ``BorderlineSMOTE.sample``
    (not a public attribute of upstream imbalanced-learn) and
    ``DBSCAN(...).danger_fit`` (not in scikit-learn). Both presumably come
    from locally patched libraries; confirm before upgrading dependencies.

    Returns
    -------
    (min_sample, mergeSample, new_sample_index, num_sample):
        minority rows, majority+synthetic frame, indices of dense-region
        samples, and the synthetic sample count.
    """
    # Pick which entry of the patched `.sample` structure belongs to `label`.
    label_index = 0
    if label == 'c':
        label_index = 1
    if label == 'b':
        label_index = 0
    print(train_data['label'].value_counts())
    boSMOTE = BorderlineSMOTE(kind='borderline-1')
    x, y = boSMOTE.fit_resample(train_data.iloc[:, :-1], train_data.iloc[:, -1])
    # Extract the synthetic "danger" samples recorded by the patched sampler.
    BMG_sample = boSMOTE.sample[label_index][1]
    BMG_sample = pd.DataFrame(BMG_sample,
                              columns=train_data.columns.values.tolist()[:-1])
    BMG_sample['label'] = label
    max_sample = []
    min_sample = []
    # Split original rows into minority (== label) and majority (!= label).
    for temp in range(train_data.shape[0]):
        if train_data.iloc[temp, -1] == label:
            min_sample.append(train_data.iloc[temp, :].values)
        else:
            max_sample.append(train_data.iloc[temp, :].values)
    max_sample = pd.DataFrame(max_sample,
                              columns=train_data.columns.values.tolist())
    min_sample = pd.DataFrame(min_sample,
                              columns=train_data.columns.values.tolist())
    # Majority rows + synthetic minority rows; original indices preserved.
    mergeSample = pd.concat([max_sample, BMG_sample], ignore_index=False)
    # Patched DBSCAN variant that evaluates neighborhoods of the synthetic
    # ("danger") samples only — see NOTE above.
    dbsc = DBSCAN(eps=eps, min_samples=min_samples).danger_fit(
        X=mergeSample, danger_sample=BMG_sample)
    array_neighborhoods = dbsc.neighborhoods
    neighborhoods_index = []
    array_n_neighbors = dbsc.n_neighbors
    # Keep neighbors of synthetic samples that have at least 5 neighbors,
    # i.e. samples lying in sufficiently dense regions.
    for temp in range(len(array_n_neighbors)):
        if array_n_neighbors[temp] >= 5:
            for i in range(array_n_neighbors[temp]):
                neighborhoods_index.append(array_neighborhoods[temp][i])
    new_sample_index = list(set(neighborhoods_index))
    num_sample = BMG_sample.shape[0]
    return min_sample, mergeSample, new_sample_index, num_sample
def imbalanced_sampler(input_data, input_labels, method='SMOTE'):
    """Resample an imbalanced dataset and pickle per-class sample files.

    Parameters
    ----------
    input_data, input_labels : arrays accepted by imbalanced-learn samplers.
    method : str
        'SMOTE' (Borderline-SMOTE) or 'Near Miss'; anything else prints an
        error and exits the process.

    Returns
    -------
    (x_sampled, y_sampled) : the full resampled dataset.
    """
    if method == 'SMOTE':
        sampler = BorderlineSMOTE(n_jobs=4, random_state=RANDOM_STATE)
    elif method == 'Near Miss':
        # NOTE(review): newer imbalanced-learn releases removed
        # ``random_state`` from NearMiss — confirm the pinned version.
        sampler = NearMiss(n_jobs=4, random_state=RANDOM_STATE)
    else:
        print('Invalid sampler type. Only `SMOTE` (Borderline) and `Near Miss` are supported...')
        sys.exit(0)
    # TODO save samples by class to reduce file size
    max_class_num = np.max(input_labels)
    # NOTE(review): arange(1, max_class_num) excludes class ``max_class_num``
    # itself (and class 0, if present) from the pickled output — confirm
    # this is intentional and not an off-by-one.
    class_range = np.arange(1, max_class_num)
    x_sampled, y_sampled = sampler.fit_resample(input_data, input_labels)
    # Dump each class's rows to its own pickle pair.
    for i in class_range:
        idx = np.argwhere(y_sampled == i)
        pickle.dump(x_sampled[idx][:],
                    open(method + '_Class_' + str(i) + '_data_samples.pkl', 'wb'))
        pickle.dump(y_sampled[idx],
                    open(method + '_Class_' + str(i) + '_label_samples.pkl', 'wb'))
    return x_sampled, y_sampled
def Borderline_SMOTE_os(X_train, Y_train, seed, sampling_strategy, k_neighbors=5):
    """Oversample with Borderline-SMOTE and return a shuffled training set.

    Non-string ``sampling_strategy`` values are first converted through
    ``compute_sampling_strategy`` in 'oversampling' mode. Class counts are
    printed before and after resampling.
    """
    if not isinstance(sampling_strategy, str):
        sampling_strategy = compute_sampling_strategy(sampling_strategy,
                                                      Y_train, 'oversampling')
    sampler = BorderlineSMOTE(random_state=seed,
                              n_jobs=-1,
                              k_neighbors=k_neighbors,
                              sampling_strategy=sampling_strategy)
    print('Before Borderline SMOTE oversampling : ', sorted(Counter(Y_train).items()))
    X_resampled, Y_resampled = sampler.fit_resample(X_train, Y_train)
    print('After Borderline SMOTE oversampling : ', sorted(Counter(Y_resampled).items()))
    return shuffle_dataset(X_resampled, Y_resampled, seed)
def k_folds():
    """Yield (train TensorDataset, val TensorDataset, pos_weight) per CV fold.

    Relies on enclosing-scope names: ``kf_gen``, ``train_data``,
    ``train_exist``, ``train_label``, ``oversampling``, ``random_state``,
    ``TensorDataset`` and ``from_numpy``. When ``oversampling`` selects a
    SMOTE variant, only the training fold is resampled; labels are collapsed
    from one-hot for resampling and re-expanded afterwards.
    """
    for train_ind, val_ind in kf_gen:
        fold_train = (train_data[train_ind], train_exist[train_ind],
                      train_label[train_ind])
        fold_val = (train_data[val_ind], train_exist[val_ind],
                    train_label[val_ind])
        fold_val = TensorDataset(*from_numpy(*fold_val))
        if oversampling in ['borderline_smote', 'svm_smote', 'smotenc']:
            from imblearn.over_sampling import SMOTENC, BorderlineSMOTE, SVMSMOTE
            if oversampling == 'borderline_smote':
                smote = BorderlineSMOTE(random_state=random_state)
            elif oversampling == 'svm_smote':
                smote = SVMSMOTE(random_state=random_state)
            elif oversampling == 'smotenc':
                categorical_features = [
                    2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                    19, 20, 24, 25, 26, 31, 47, 49, 50, 51, 52, 53, 54, 56,
                    57, 58
                ] + [1, 48]
                smote = SMOTENC(
                    categorical_features=categorical_features,
                    random_state=random_state,
                    sampling_strategy='auto',
                    # BUG FIX: SMOTENC's parameter is `k_neighbors`, not
                    # `neighbors` — the original kwarg raised TypeError.
                    k_neighbors=5,
                )
            # Resample on argmax labels, then restore one-hot encoding.
            X, y = smote.fit_resample(fold_train[0],
                                      fold_train[2].argmax(axis=1))
            y = np.array([[1, 0], [0, 1]])[y]
            exist = np.ones_like(X)
            fold_train = (X, exist, y)
        # Positive-class weight: (#samples / #positives) - 1.
        pos_weight = fold_train[2].shape[0] / np.sum(fold_train[2][:, 1]) - 1
        fold_train = TensorDataset(*from_numpy(*fold_train))
        yield fold_train, fold_val, pos_weight
def resample_to_csv(X, y, random_state, path, method):
    """Re-sample (X, y) with the chosen strategy and write the result to CSV.

    :param X: Original features (DataFrame — the label is appended as a
        'BK' column before writing).
    :param y: Original labels.
    :param random_state: Random initialization seed passed to the sampler.
    :param path: Path to output location and name of CSV.
    :param method: 'SMOTE-NN', 'borderline', 'adasyn' or 'tomek'. See the
        imbalanced-learn documentation for details:
        https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.BorderlineSMOTE.html
    :return: none. Unrecognized methods are silently ignored (original
        behavior preserved).
    """
    # One sampler class per method name; the four branches of the original
    # differed only in this class, so dispatch through a dict instead.
    sampler_classes = {
        'SMOTE-NN': SMOTEENN,
        'borderline': BorderlineSMOTE,
        'adasyn': ADASYN,
        'tomek': SMOTETomek,
    }
    sampler_cls = sampler_classes.get(method)
    if sampler_cls is None:
        return
    sampler = sampler_cls(random_state=random_state)
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    X_resampled['BK'] = y_resampled
    X_resampled.to_csv(path)
# Report the SVM hyper-parameters found for the SMOTE run.
print('valor c ideal SVM SMOTE', best_params_smote['C'],
      'valor gamma ideal SVM SMOTE', best_params_smote['gamma'])
# Candidate oversamplers — only ``border_sm`` is actually applied below.
border_sm = BorderlineSMOTE(k_neighbors=27, random_state=91,
                            sampling_strategy=1)
sm = SVMSMOTE(random_state=91, k_neighbors=2, sampling_strategy=1,
              svm_estimator=SVM_smote)
ada = ADASYN(random_state=91, n_neighbors=27, sampling_strategy=1, n_jobs=6)
Kmeans = KMeansSMOTE(random_state=91, k_neighbors=2, sampling_strategy=1,
                     n_jobs=6,
                     kmeans_estimator=MiniBatchKMeans(n_clusters=20))
'''Muestreo Sintetico'''
#Xtrain, ytrain = SMOTE().fit_resample(Xtrain, ytrain)
# Synthetic oversampling applied to the training split only.
Xtrain, ytrain = border_sm.fit_resample(Xtrain, ytrain)
'''Selección de caracteristicas'''
# Disabled mutual-information feature-selection experiment, kept for reference:
# rel_MI = SelectKBest(score_func=score_func, k=num_features)
# Xtrain = rel_MI.fit_transform(Xtrain, ytrain)
# Xtest = rel_MI.transform(Xtest)
# rel_MI_support = rel_MI.get_support()
# rel_MI_feature = X_frame.loc[:, rel_MI_support].columns.tolist()
# rel_MI_scores = rel_MI.scores_[rel_MI_support].tolist()
# feature_selection_df = pd.DataFrame({'Feature': rel_MI_feature, 'Score':rel_MI_scores})
# Hard-coded feature subset, presumably from a previous selection run —
# NOTE(review): confirm these indices still match the current feature order.
Xtrain = Xtrain[:, [71, 83, 88, 70, 89, 56, 86, 53, 58, 59, 29, 28, 69, 41, 74, 23, 87]]
Xtest = Xtest[:, [71, 83, 88, 70, 89, 56, 86, 53, 58, 59, 29, 28, 69, 41, 74, 23, 87]]
# The trailing triple quote opens a block string that continues past this chunk.
'''
def do_oversampling_and_plot(self):
    """Oversample self.X/self.y three ways and plot 3D before/after views.

    Resamples with SMOTE, ADASYN and Borderline-SMOTE (all seeded 111) and
    draws a 2x2 grid of 3D scatter plots: original data plus each variant,
    split by class.

    Assumes X has exactly 7 feature columns; columns 1, 2 and 4 (age,
    elapsed time, wart type) are the plotted axes — TODO confirm against
    the data loader.
    """
    X = self.X
    y = self.y
    sm = SMOTE(random_state=111)
    ad = ADASYN(random_state=111)
    bs = BorderlineSMOTE(random_state=111)
    X_new_sm, y_new_sm = sm.fit_resample(X, y)
    X_new_ad, y_new_ad = ad.fit_resample(X, y)
    X_new_bs, y_new_bs = bs.fit_resample(X, y)

    # before oversampling: append the target as column 7, then split by class
    data_old = np.concatenate((X, y.reshape(-1, 1)), axis=1)
    data_1_old = data_old[data_old[:, 7] == 1]  # data with class '1'
    data_0_old = data_old[data_old[:, 7] == 0]  # data with class '0'

    # after oversampling (SMOTE)
    a = X_new_sm[:, 0:7]
    b = y_new_sm.reshape(-1, 1)  # class/target column
    data = np.concatenate((a, b), axis=1)
    data_1 = data[data[:, 7] == 1]  # data with class '1'
    data_0 = data[data[:, 7] == 0]  # data with class '0'

    # after oversampling (ADASYN)
    a_ad = X_new_ad[:, 0:7]
    b_ad = y_new_ad.reshape(-1, 1)  # class/target column
    data_ad = np.concatenate((a_ad, b_ad), axis=1)
    data_1_ad = data_ad[data_ad[:, 7] == 1]  # data with class '1'
    data_0_ad = data_ad[data_ad[:, 7] == 0]  # data with class '0'

    # after oversampling (Borderline_SMOTE)
    a_bs = X_new_bs[:, 0:7]
    b_bs = y_new_bs.reshape(-1, 1)  # class/target column
    data_bs = np.concatenate((a_bs, b_bs), axis=1)
    data_1_bs = data_bs[data_bs[:, 7] == 1]  # data with class '1'
    data_0_bs = data_bs[data_bs[:, 7] == 0]  # data with class '0'

    ### create 3D plot: (a) original, (b) SMOTE, (d) ADASYN, (c) Borderline
    fig = plt.figure(constrained_layout=True, figsize=(12, 7.5))
    gs = GridSpec(2, 2, figure=fig)
    ax1 = fig.add_subplot(gs[0, 0], projection='3d')
    ax2 = fig.add_subplot(gs[0, 1], projection='3d')
    ax3 = fig.add_subplot(gs[1, 1], projection='3d')
    ax4 = fig.add_subplot(gs[1, 0], projection='3d')
    size = 10.5  # label font size

    ## scatter plot before oversampling
    scatter1 = ax1.scatter(data_1_old[:, [1]], data_1_old[:, [2]],
                           data_1_old[:, [4]], c='yellow', marker='o', s=40,
                           edgecolors='k', depthshade=0)
    scatter2 = ax1.scatter(data_0_old[:, [1]], data_0_old[:, [2]],
                           data_0_old[:, [4]], c='r', marker='o', s=40,
                           edgecolors='k', depthshade=0)
    ax1.set_xlabel('Age (years)', fontsize=size)
    ax1.set_ylabel('Elapsed time (months)', fontsize=size)
    ax1.set_zlabel('Wart type', fontsize=size)
    ax1.set_zticks(range(1, 4, 1))
    ax1.set_title('(a) Before oversampling\n', fontsize=14, fontweight='bold')
    # set legend
    legend = ax1.legend([scatter1, scatter2], ['Y=1 (Yes)', 'Y=0 (No)'],
                        numpoints=1, loc='best', fontsize=size)
    legend.get_frame().set_edgecolor('k')

    ## scatter plot after oversampling (SMOTE)
    scatter1 = ax2.scatter(data_1[:, [1]], data_1[:, [2]], data_1[:, [4]],
                           c='yellow', marker='o', s=40, edgecolors='k',
                           depthshade=0)
    scatter2 = ax2.scatter(data_0[:, [1]], data_0[:, [2]], data_0[:, [4]],
                           c='r', marker='o', s=40, edgecolors='k',
                           depthshade=0)
    ax2.set_xlabel('Age (years)', fontsize=size)
    ax2.set_ylabel('Elapsed time (months)', fontsize=size)
    ax2.set_zlabel('Wart type', fontsize=size)
    ax2.set_zticks(range(1, 4, 1))
    ax2.set_title('(b) After oversampling (SMOTE)\n', fontsize=14,
                  fontweight='bold')
    # set legend
    legend = ax2.legend([scatter1, scatter2], ['Y=1 (Yes)', 'Y=0 (No)'],
                        numpoints=1, loc='best', fontsize=size)
    legend.get_frame().set_edgecolor('k')

    ## scatter plot after oversampling (ADASYN)
    scatter1 = ax3.scatter(data_1_ad[:, [1]], data_1_ad[:, [2]],
                           data_1_ad[:, [4]], c='yellow', marker='o', s=40,
                           edgecolors='k', depthshade=0)
    scatter2 = ax3.scatter(data_0_ad[:, [1]], data_0_ad[:, [2]],
                           data_0_ad[:, [4]], c='r', marker='o', s=40,
                           edgecolors='k', depthshade=0)
    ax3.set_xlabel('Age (years)', fontsize=size)
    ax3.set_ylabel('Elapsed time (months)', fontsize=size)
    ax3.set_zlabel('Wart type', fontsize=size)
    ax3.set_zticks(range(1, 4, 1))
    ax3.set_title('\n(d) After oversampling (ADASYN)\n', fontsize=14,
                  fontweight='bold')
    # set legend
    legend = ax3.legend([scatter1, scatter2], ['Y=1 (Yes)', 'Y=0 (No)'],
                        numpoints=1, loc='best', fontsize=size)
    legend.get_frame().set_edgecolor('k')

    ## scatter plot after oversampling (Borderline-SMOTE)
    scatter1 = ax4.scatter(data_1_bs[:, [1]], data_1_bs[:, [2]],
                           data_1_bs[:, [4]], c='yellow', marker='o', s=40,
                           edgecolors='k', depthshade=0)
    scatter2 = ax4.scatter(data_0_bs[:, [1]], data_0_bs[:, [2]],
                           data_0_bs[:, [4]], c='r', marker='o', s=40,
                           edgecolors='k', depthshade=0)
    ax4.set_xlabel('Age (years)', fontsize=size)
    ax4.set_ylabel('Elapsed time (months)', fontsize=size)
    ax4.set_zlabel('Wart type', fontsize=size)
    ax4.set_zticks(range(1, 4, 1))
    ax4.set_title('\n(c) After oversampling (Borderline-SMOTE)\n',
                  fontsize=14, fontweight='bold')
    # set legend
    legend = ax4.legend([scatter1, scatter2], ['Y=1 (Yes)', 'Y=0 (No)'],
                        numpoints=1, loc='best', fontsize=size)
    legend.get_frame().set_edgecolor('k')

    # #save figure
    # fig.savefig("Oversampling plot_3D.png", dpi=300, bbox_inches='tight')
    plt.show()
def fit(self, X, y):
    """Fitting.

    Trains ``ensemble_size`` cloned base estimators, each on its own
    per-class bootstrap of (X, y), optionally rebalanced by the configured
    ``self.oversampler`` ("ROS", "B2", "RUS" or "CNN").

    Assumes a binary problem with the minority class labeled 1 —
    TODO confirm with callers.
    """
    # if not hasattr(self, "base_estimator"):
    #     self.set_base_clf()
    X, y = check_X_y(X, y)
    self.classes_ = unique_labels(y)
    self.X_ = X
    self.y_ = y
    minority_X = X[y == 1]
    minority_y = y[y == 1]
    majority_X = X[y == 0]
    majority_y = y[y == 0]
    for i in range(self.ensemble_size):
        self.estimators_.append(base.clone(self.base_estimator))
    for n, estimator in enumerate(self.estimators_):
        # Deterministic per-member seed derived from random_state.
        np.random.seed(self.random_state + (n * 2))
        # Bootstrap each class separately (sampling with replacement),
        # preserving the original per-class counts.
        bagXminority = minority_X[np.random.choice(
            minority_X.shape[0], len(minority_y), replace=True), :]
        bagXmajority = majority_X[np.random.choice(
            majority_X.shape[0], len(majority_y), replace=True), :]
        bagyminority = np.ones(len(minority_y)).astype('int')
        bagymajority = np.zeros(len(majority_y)).astype('int')
        train_X = np.concatenate((bagXmajority, bagXminority))
        train_y = np.concatenate((bagymajority, bagyminority))
        unique, counts = np.unique(train_y, return_counts=True)
        # Optional per-bag resampling. Failures are deliberately swallowed so
        # a degenerate bag (e.g. too few minority samples for SMOTE) falls
        # back to the raw bootstrap sample instead of aborting the fit.
        if self.oversampler == "ROS":
            ros = RandomOverSampler(random_state=self.random_state + (n * 2))
            try:
                train_X, train_y = ros.fit_resample(train_X, train_y)
            except:
                pass
        elif self.oversampler == "B2":
            b2 = BorderlineSMOTE(random_state=self.random_state + (n * 2),
                                 kind='borderline-2')
            try:
                train_X, train_y = b2.fit_resample(train_X, train_y)
            except:
                pass
        elif self.oversampler == "RUS":
            rus = RandomUnderSampler(random_state=self.random_state + (n * 2))
            try:
                train_X, train_y = rus.fit_resample(train_X, train_y)
                # Disabled experiment: re-undersample when fewer than 9
                # samples survive, to guarantee a minimum bag size.
                # _, ys_counter = np.unique(train_ys, return_counts=True)
                # if np.sum(ys_counter) < 9:
                #     rus = RandomUnderSampler(random_state=self.random_state+(n*2), sampling_strategy={0:(9-ys_counter[1]), 1:ys_counter[1]})
                #     train_Xs, train_ys = rus.fit_resample(train_X, train_y)
                #     train_X, train_y = train_Xs, train_ys
                # else:
                #     train_X, train_y = train_Xs, train_ys
            except:
                pass
        elif self.oversampler == "CNN":
            cnn = CondensedNearestNeighbour(
                random_state=self.random_state + (n * 2))
            try:
                train_X, train_y = cnn.fit_resample(train_X, train_y)
            except:
                pass
        # if train_X.shape[0] >= 5:
        estimator.fit(train_X, train_y)
        # else: (disabled) fall back to a distance-weighted KNN when the bag
        # is too small:
        # self.estimators_[n] = KNeighborsClassifier(weights='distance', n_neighbors=train_X.shape[0]).fit(train_X, train_y)
    # Return the classifier
    return self
def test_borderline_smote_wrong_kind():
    """An unrecognized ``kind`` must raise ValueError at resample time."""
    sampler = BorderlineSMOTE(kind='rand')
    expected_message = 'The possible "kind" of algorithm'
    with pytest.raises(ValueError, match=expected_message):
        sampler.fit_resample(X, Y)
def over_sample(self, x_train, y_train):
    """Return the training set rebalanced with borderline-1 SMOTE."""
    sampler = BorderlineSMOTE(random_state=151, kind='borderline-1')
    # Fit and resample in one step to obtain the oversampled training data.
    resampled_x, resampled_y = sampler.fit_resample(x_train, y_train)
    return resampled_x, resampled_y
# Split the loaded array: column 0 is the label, the rest are expression
# features (flattened 64x64x4 images — see the reshape below).
ki67_exp = ki67[:, 1:]
ki67_exp = np.array(ki67_exp, dtype=np.float32)
ki67_exp.shape  # notebook-style inspection; no effect in a script
ki67_label = ki67[:, 0]
ki67_label = np.array(ki67_label, dtype=np.int32)
ki67_label.shape  # notebook-style inspection; no effect in a script
# use oversampling to balance the training set
# use borderlineSMOTE
oversample = BorderlineSMOTE()
# fit and apply the transform
ki67_exp, ki67_label = oversample.fit_resample(ki67_exp, ki67_label)
# Restore the image shape: one 64x64 image with 4 channels per sample.
ki67_exp = np.reshape(ki67_exp, (-1, 64, 64, 4))
# del array objects to save memory
del df
del df2
del ki67
# Training hyper-parameters.
numEpoch = 200
batchSize = 25
LR = 0.001
DECAY = 0.000
EPSILON = 0.99
DROPOUT = 0.5
def borderline_smote(X, y):
    """Return (X, y) rebalanced by Borderline-SMOTE with a fixed seed."""
    resampled_pair = BorderlineSMOTE(random_state=42).fit_resample(X, y)
    return resampled_pair
def resampling_assigner(imb_technique, AP_ova_X_train, AP_ova_y_train,
                        PM_ova_X_train, PM_ova_y_train, SC_ova_X_train,
                        SC_ova_y_train):
    """Resample each one-vs-all training split with the requested technique.

    Parameters
    ----------
    imb_technique : str
        One of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT", "NCR", "NM", "OSS",
        "RENN", "SMOTE", "BSMOTE", "SMOTEENN", "SMOTETOMEK", "TOMEK",
        "ROS", "RUS".
    AP_ova_*, PM_ova_*, SC_ova_* :
        Features/labels of the Add penalty (AP), Payment (PM) and Send for
        Credit Collection (SC) one-vs-all splits.

    Returns
    -------
    (AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res)

    Raises
    ------
    KeyError
        If ``imb_technique`` is not a known technique name (the original
        fell through every branch and raised NameError instead).
    """
    print(imb_technique)
    # One sampler class per technique; a fresh default-configured sampler is
    # instantiated per split so no fitted state leaks between splits. The
    # original repeated this construction in 16 near-identical branches.
    sampler_classes = {
        "ADASYN": ADASYN,
        "ALLKNN": AllKNN,
        "CNN": CondensedNearestNeighbour,
        "ENN": EditedNearestNeighbours,
        "IHT": InstanceHardnessThreshold,
        "NCR": NeighbourhoodCleaningRule,
        "NM": NearMiss,
        "OSS": OneSidedSelection,
        "RENN": RepeatedEditedNearestNeighbours,
        "SMOTE": SMOTE,
        "BSMOTE": BorderlineSMOTE,
        "SMOTEENN": SMOTEENN,
        "SMOTETOMEK": SMOTETomek,
        "TOMEK": TomekLinks,
        "ROS": RandomOverSampler,
        "RUS": RandomUnderSampler,
    }
    sampler_cls = sampler_classes[imb_technique]
    if imb_technique == "NCR":
        # NCR needs numeric labels here: binarize each split's target
        # (0 = the split's own class, 1 = everything else).
        # BUG FIX: the original NCR branch constructed samplers named
        # AP_iht/PM_iht/SC_iht but then called AP_ncr/PM_ncr/SC_ncr,
        # which raised NameError at runtime.
        AP_ova_y_train = [
            0 if i == "Add penalty" else 1 for i in AP_ova_y_train
        ]
        PM_ova_y_train = [0 if i == "Payment" else 1 for i in PM_ova_y_train]
        SC_ova_y_train = [
            0 if i == "Send for Credit Collection" else 1
            for i in SC_ova_y_train
        ]
    AP_X_res, AP_y_res = sampler_cls().fit_resample(AP_ova_X_train,
                                                    AP_ova_y_train)
    PM_X_res, PM_y_res = sampler_cls().fit_resample(PM_ova_X_train,
                                                    PM_ova_y_train)
    SC_X_res, SC_y_res = sampler_cls().fit_resample(SC_ova_X_train,
                                                    SC_ova_y_train)
    return AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res
x_train, x_test, y_train, y_test = train_test_split(bank_X, bank_y, stratify=bank_y, train_size=0.7, random_state=0) #classifier = svm.SVC(kernel = 'rbf',C=1000,gamma=0.001) classifier = LogisticRegression(max_iter=10000, C=0.1) #easy_ensemble = imblearn.ensemble.EasyEnsembleClassifier(n_estimators=35, base_estimator=classifier, sampling_strategy='majority', n_jobs=-1) oversample = BorderlineSMOTE(sampling_strategy=0.5, n_jobs=-1, kind='borderline-1') x_train, y_train = oversample.fit_resample(x_train, y_train) tom_lin = TomekLinks(sampling_strategy='majority', n_jobs=-1) x_train, y_train = tom_lin.fit_resample(x_train, y_train) classifier.fit(x_train, y_train) y_pred = classifier.predict(x_test) h.printResults2(y_test, y_pred) h.plotConfusionMatrix(y_test, y_pred, norm=True) h.plotConfusionMatrix(y_test, y_pred, norm=False) #White-box explanation feature_names = bank_X.columns.values interpr.plotFeaturesCoefficientGlobal(classifier, feature_names) new_x_train = x_train
def partial_fit(self, X, y, classes=None):
    """Partial fitting.

    Processes one data chunk of a stream: optionally oversamples the chunk,
    scores the current ensemble members with balanced accuracy on the raw
    chunk, prunes weak members, and appends a freshly trained candidate
    cloned from ``self._base_clf``.
    """
    if not hasattr(self, "_base_clf"):
        self.set_base_clf()
    X, y = check_X_y(X, y)
    if _check_partial_fit_first_call(self, classes):
        self.classes_ = classes
        self.ensemble_ = []
    self.X_, self.y_ = X, y
    train_X, train_y = X, y
    unique, counts = np.unique(train_y, return_counts=True)
    # Cap k_neighbors by the size of the first class in the chunk.
    # NOTE(review): counts[0] is the count of the *first* unique label, not
    # necessarily the minority class — confirm the data guarantees this.
    k_neighbors = 5
    if counts[0] - 1 < 5:
        k_neighbors = counts[0] - 1
    # Oversampling is skipped entirely when k_neighbors would be invalid.
    if self.oversampler == "SMOTE" and k_neighbors > 0:
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        train_X, train_y = smote.fit_resample(train_X, train_y)
    elif self.oversampler == "svmSMOTE" and k_neighbors > 0:
        try:
            svmSmote = SVMSMOTE(random_state=42, k_neighbors=k_neighbors)
            train_X, train_y = svmSmote.fit_resample(train_X, train_y)
        except ValueError:
            # Fall back to the raw chunk when SVM-SMOTE cannot run.
            pass
    elif self.oversampler == "borderline1" and k_neighbors > 0:
        borderlineSmote1 = BorderlineSMOTE(random_state=42,
                                           k_neighbors=k_neighbors,
                                           kind='borderline-1')
        train_X, train_y = borderlineSmote1.fit_resample(train_X, train_y)
    elif self.oversampler == "borderline2" and k_neighbors > 0:
        borderlineSmote2 = BorderlineSMOTE(random_state=42,
                                           k_neighbors=k_neighbors,
                                           kind='borderline-2')
        train_X, train_y = borderlineSmote2.fit_resample(train_X, train_y)
    elif self.oversampler == "ADASYN" and k_neighbors > 0:
        try:
            adasyn = ADASYN(random_state=42, n_neighbors=k_neighbors)
            train_X, train_y = adasyn.fit_resample(train_X, train_y)
        except RuntimeError:
            pass
    elif self.oversampler == "SLS" and k_neighbors > 0:
        sls = Safe_Level_SMOTE(n_neighbors=k_neighbors)
        train_X, train_y = sls.sample(train_X, train_y)
    # Testing all models: balanced accuracy of each member on the raw chunk.
    scores = np.array([ba(y, clf.predict(X)) for clf in self.ensemble_])
    # Pruning: drop members no better than chance + alpha, then evict the
    # single worst member when the ensemble is at capacity.
    if len(self.ensemble_) > 1:
        alpha_good = scores > (0.5 + self.alpha)
        self.ensemble_ = [
            self.ensemble_[i] for i in np.where(alpha_good)[0]
        ]
    if len(self.ensemble_) > self.ensemble_size - 1:
        worst = np.argmin(scores)
        del self.ensemble_[worst]
    # Preparing and training new candidate
    self.ensemble_.append(base.clone(self._base_clf).fit(train_X, train_y))
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    """Resample the five one-vs-all training sets with the chosen technique.

    Parameters
    ----------
    imb_technique : str
        Name of the imbalance-handling technique, e.g. ``"SMOTE"``,
        ``"BSMOTE"``, ``"RUS"``.
    AA/AI/AW/CC/QA _ova_X_train, _ova_y_train
        Features and labels of the one-vs-all split for each of the five
        activity classes.

    Returns
    -------
    tuple
        ``(AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res,
        CC_X_res, CC_y_res, QA_X_res, QA_y_res)``.

    Raises
    ------
    ValueError
        If ``imb_technique`` is not a supported name.  (The original
        if/elif chain fell through and crashed with UnboundLocalError
        at the return statement instead.)
    """
    print(imb_technique)

    # One sampler class per technique name.  A fresh instance is created
    # for each of the five datasets, matching the original behaviour of
    # instantiating five independent samplers per branch.
    sampler_classes = {
        "ADASYN": ADASYN,
        "ALLKNN": AllKNN,
        "CNN": CondensedNearestNeighbour,
        "ENN": EditedNearestNeighbours,
        "IHT": InstanceHardnessThreshold,
        "NCR": NeighbourhoodCleaningRule,
        "NM": NearMiss,
        "OSS": OneSidedSelection,
        "RENN": RepeatedEditedNearestNeighbours,
        "SMOTE": SMOTE,
        "BSMOTE": BorderlineSMOTE,
        "SMOTEENN": SMOTEENN,
        "SMOTETOMEK": SMOTETomek,
        "TOMEK": TomekLinks,
        "ROS": RandomOverSampler,
        "RUS": RandomUnderSampler,
    }
    try:
        sampler_cls = sampler_classes[imb_technique]
    except KeyError:
        raise ValueError(
            "Unknown imbalance technique: {!r}".format(imb_technique))

    # (X, y, positive-class label) for each one-vs-all dataset, in the
    # same order as the returned tuple.
    datasets = [
        (AA_ova_X_train, AA_ova_y_train, "Accepted/Assigned"),
        (AI_ova_X_train, AI_ova_y_train, "Accepted/In Progress"),
        (AW_ova_X_train, AW_ova_y_train, "Accepted/Wait"),
        (CC_ova_X_train, CC_ova_y_train, "Completed/Closed"),
        (QA_ova_X_train, QA_ova_y_train, "Queued/Awaiting Assignment"),
    ]

    results = []
    for X_train, y_train, positive_label in datasets:
        if imb_technique == "NCR":
            # NeighbourhoodCleaningRule was fed binarized labels in the
            # original code: 0 for the one-vs-all positive class, 1 for
            # everything else.  Preserve that special case.
            y_train = [0 if label == positive_label else 1
                       for label in y_train]
        X_res, y_res = sampler_cls().fit_resample(X_train, y_train)
        results.extend((X_res, y_res))
    return tuple(results)
label_autism_data.head() X = label_autism_data.iloc[:, 0:14] y = label_autism_data.iloc[:, 14] X y # Borderline smote from collections import Counter from sklearn.datasets import make_classification from imblearn.over_sampling import BorderlineSMOTE sm = BorderlineSMOTE(random_state=42) X_res_borderline, y_res_bol = sm.fit_resample(X, y) print('Resampled dataset shape %s' % Counter(y)) print('Resampled dataset shape %s' % Counter(y_res_bol)) from sklearn.model_selection import train_test_split X_train, X_test, Y_train, Y_test = train_test_split(X_res_borderline, y_res_bol, test_size=0.25, random_state=0) len(X_train) from sklearn.preprocessing import StandardScaler sc = StandardScaler()
from imblearn.over_sampling import SMOTE, BorderlineSMOTE from xgboost import XGBClassifier train = pd.read_csv('train.csv', index_col=0) test = pd.read_csv('test.csv', index_col=0) sample_submission = pd.read_csv('sample_submission.csv', index_col=0) train_x = train.drop(columns='class', axis=1) # class 열을 삭제한 새로운 객체 y = train['class'] # 결과 레이블(class) test_x = test class0 = len(y[y == 0]) class1 = len(y[y == 1]) class2 = len(y[y == 2]) total = len(y) scaler = StandardScaler() x = scaler.fit_transform(train_x) TEST = scaler.transform(test_x) BS = BorderlineSMOTE(random_state=42, n_jobs=-1, k_neighbors=3) x, y = BS.fit_resample(x, y) print(pd.value_counts(y)) train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.1, stratify=y, random_state=42) evals = [(test_x, test_y)] xgb = XGBClassifier(n_estimators=600, n_jobs=-1, learning_rate=0.05, subsample=0.65, max_depth=50, objective="multi:softmax", random_state=42) xgb.fit(train_x, train_y, early_stopping_rounds=30, eval_set=evals) # ans_pred += forest_pred # ans_pred += xgb_pred # ans_pred /= 2.0 print("acc: {}".format(xgb.score(train_x, train_y))) print("acc: {}".format(xgb.score(test_x, test_y))) y_pred = np.argmax(xgb.predict_proba(TEST), axis=1) submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index) submission.to_csv('submission.csv', index=True) # 테스트 데이터에 대해 94.6% 정도의 accuracy가 나왔지만 실제로는 92.215%가 나옴
def test_borderline_smote_wrong_kind(data): bsmote = BorderlineSMOTE(kind='rand') with pytest.raises(ValueError, match='The possible "kind" of algorithm'): bsmote.fit_resample(*data)
# borderline-SMOTE for imbalanced dataset from collections import Counter from sklearn.datasets import make_classification from imblearn.over_sampling import BorderlineSMOTE from plotDataset import plot_dataset X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1) counter = Counter(y) print(counter) plot_dataset(X, y, counter) oversample = BorderlineSMOTE() X, y = oversample.fit_resample(X, y) counter = Counter(y) print(counter) plot_dataset(X, y, counter)
test_size=0.2) model_oversample = lr.fit(x_train, y_train) y_predict = model_oversample.predict(x_test) print(classification_report(y_test, y_predict)) #SMOTE from imblearn.over_sampling import SMOTE X_resampled, y_resampled = SMOTE(random_state=2020).fit_resample(x, y) x_train, x_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2) model_resample = lr.fit(x_train, y_train) y_predict = model_resample.predict(x_test) print(classification_report(y_test, y_predict)) from imblearn.over_sampling import BorderlineSMOTE sm = BorderlineSMOTE(random_state=2020) X_res, y_res = sm.fit_resample(x, y) x_train, x_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2) model_resample = lr.fit(x_train, y_train) y_predict = model_resample.predict(x_test) print(classification_report(y_test, y_predict)) from imblearn.over_sampling import KMeansSMOTE sm = KMeansSMOTE(random_state=2020, cluster_balance_threshold=0.1) X_res, y_res = sm.fit_resample(x, y) x_train, x_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2) model_resample = lr.fit(x_train, y_train) y_predict = model_resample.predict(x_test) print(classification_report(y_test, y_predict))
nhg_exp = np.array(nhg_exp, dtype=np.float32) # nhg_exp = np.reshape(nhg_exp, (-1, 64, 64, 4)) nhg_exp.shape # get diagnosis nhg_label = nhg[:, 0] nhg_label = np.array(nhg_label, dtype=np.int32) nhg_label = to_categorical(nhg_label) # use oversampling to balance the training set # use borderlineSMOTE oversample = BorderlineSMOTE() # fit and apply the transform nhg_exp, nhg_label = oversample.fit_resample(nhg_exp, nhg_label) nhg_exp = np.reshape(nhg_exp, (-1, 64, 64, 4)) # delete array objects to save memory del nhg1 del nhg2 del nhg # fix random seed for reproducibility seed = 1234 np.random.seed(seed) # batch size and numEpochs numEpoch = 500 batchSize = 100