def fit(self, X, y):
    """Select features via repeated HSIC Lasso, oversample with ADASYN, fit LightGBM.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Training data.
    y : ndarray of shape (n_samples,)
        Class labels.

    Returns
    -------
    self
    """
    sss = StratifiedShuffleSplit(n_splits=self.hsic_splits, random_state=42)
    idxs = []
    hsics = []
    for train_index, test_index in sss.split(X, y):
        hsic_lasso2 = HSICLasso()
        hsic_lasso2.input(X[train_index], y[train_index])
        hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)
        hsics.append(hsic_lasso2)
        # Not just the best features — also pull in their neighbors
        # (similar features) above the score threshold.
        all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
        for i in range(len(all_ft_idx)):
            idx = np.array(hsic_lasso2.get_index_neighbors(
                feat_index=i, num_neighbors=10), dtype=int)
            # BUGFIX: neighbor scores are continuous; the previous
            # dtype=int cast truncated them toward zero and broke the
            # threshold comparison below.
            score = np.array(hsic_lasso2.get_index_neighbors_score(
                feat_index=i, num_neighbors=10), dtype=float)
            idx = idx[np.where(score > self.neighbor_threshold)[0]]
            all_ft_idx = np.concatenate((all_ft_idx, idx))
        all_ft_idx = np.unique(all_ft_idx)
        idxs.append(all_ft_idx)
        # Keep only features selected in every split so far.
        if len(idxs) == 1:
            self.hsic_idx_ = idxs[0]
        else:
            self.hsic_idx_ = np.intersect1d(idxs[-1], self.hsic_idx_)
    print("HSIC done.", len(self.hsic_idx_))
    print("Upsampling with ADASYN... (features: " + str(len(self.hsic_idx_)) + ")")
    sm = ADASYN(sampling_strategy="minority",
                n_neighbors=self.adasyn_neighbors, n_jobs=-1)
    sX, sy = X[:, self.hsic_idx_], y
    if self.adasyn_neighbors > 0:
        try:
            sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
            # BUGFIX: was `range(len(np.unique(y) - 1))`, which subtracts 1
            # from every label value instead of from the class count (the
            # lengths happen to match, so the loop ran one extra time).
            # Intended: one extra resample per remaining minority class.
            for i in range(len(np.unique(y)) - 1):
                sX, sy = sm.fit_resample(sX, sy)
        except (ValueError, RuntimeError):
            # ADASYN raises when a class is too small for the requested
            # neighborhood; deliberately fall back to the unsampled data.
            pass
    print("ADASYN done. Starting clf")
    self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
    print("done")
    return self
def test_ada_fit_resample_nn_obj():
    """ADASYN accepts a pre-built NearestNeighbors estimator; resampling the
    module fixture (X, Y) must reproduce the hard-coded regression arrays."""
    # n_neighbors=6 because the KNN query includes the sample itself.
    nn = NearestNeighbors(n_neighbors=6)
    ada = ADASYN(random_state=RND_SEED, n_neighbors=nn)
    X_resampled, y_resampled = ada.fit_resample(X, Y)
    # Expected output for this RND_SEED (regression values).
    X_gt = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.88161986, -0.2829741],
        [0.35681689, -0.18814597],
        [1.4148276, 0.05308106],
        [0.3136591, -0.31327875],
    ])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def oversampling():
    """Load labelled documents, rebalance the training labels with ADASYN on a
    precomputed feature matrix, and print class counts before and after."""
    categories = ['Tools', 'Hardware', 'Other', 'Script', 'Software']
    docs_to_train = sklearn.datasets.load_files(
        '/Users/rishabm/Desktop/MergeFileJTOrg/data1',
        description=None, categories=categories,
        load_content=True, encoding='utf-8')
    X_train, X_test, y_train, y_test = train_test_split(
        docs_to_train.data, docs_to_train.target, test_size=0.2)
    ogY = Counter(y_train)
    print('OLD SAMPLES: ')
    for key, value in ogY.items():
        print(key, value)
    # Feature matrix precomputed for the training documents.
    matrix = loadmatrix('matrix/2019_Jun_21_14_44_mtrx.joblib')
    adasyn = ADASYN()
    matrix_resampled, y_resampled = adasyn.fit_resample(matrix, y_train)
    # BUGFIX: astype returns a new array; the previous code discarded it,
    # leaving y_resampled at its original dtype.
    y_resampled = y_resampled.astype(int)
    newY = Counter(y_resampled)
    print('NEW SAMPLES: ')
    for key, value in newY.items():
        print(key, value)
def create_adasyn_samples(self, num_ADASYN, train_p, train_n):
    """Synthesize minority-class samples with ADASYN.

    train_p / train_n are feature-by-sample frames (positive / negative
    class). ADASYN is trained on the full original training set; only the
    newly synthesized minority rows are returned.
    """
    # Build the n_samples x n_features matrix ADASYN expects.
    X = pd.concat((train_p.transpose(), train_n.transpose()), axis=0)
    y = np.concatenate((np.ones(train_p.shape[1]), np.zeros(train_n.shape[1])))
    # Generate as many minority samples as there are majority samples.
    sampler = ADASYN(sampling_strategy=1.0, n_neighbors=10)
    X_res, y_res = sampler.fit_resample(X, y)
    # fit_resample keeps original minority rows first, then original
    # majority rows, and appends the synthetic minority rows last — keep
    # only that trailing segment.
    n_original = train_p.shape[1] + train_n.shape[1]
    n_synthetic = X_res.shape[0] - n_original
    X_adasyn = X_res.iloc[X_res.shape[0] - n_synthetic:X_res.shape[0], :]
    print("debug, X_adasyn shape")
    print(X_adasyn.shape)
    return X_adasyn
def adasyn(x, y):
    """Oversample the minority class with ADASYN to a 1:1 ratio."""
    # Neighborhood size scales with the positive count (~1% of it).
    neighbor_count = math.ceil(sum(y) * 0.01)
    sampler = ADASYN(sampling_strategy=1, n_neighbors=neighbor_count)
    return sampler.fit_resample(x, y)
def create_train_and_test_sets(training_data_json: JSONType) -> List[np.ndarray]:
    """Split JSON training data, reduce with PCA, balance with ADASYN, and
    standardize; returns the fitted transformers plus the prepared sets."""
    records = json.loads(training_data_json)
    x, y = split_data_to_x_and_y(records)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=1, stratify=y)
    print('Liczba etykiet w zbiorze y:', np.bincount(y))
    print('Liczba etykiet w zbiorze y_train:', np.bincount(y_train))
    print('Liczba etykiet w zbiorze y_test:', np.bincount(y_test))
    # Dimensionality reduction fitted on the training split only.
    pca = PCA(n_components=500, random_state=1)
    x_train = pca.fit_transform(x_train)
    x_test = pca.transform(x_test)
    # Oversample only the training data.
    x_train, y_train = ADASYN(random_state=1).fit_resample(x_train, y_train)
    sc = StandardScaler()
    sc.fit(x_train)
    x_train_std = normalize(sc.transform(x_train), norm='l2')
    x_test_std = normalize(sc.transform(x_test), norm='l2')
    return [pca, sc, x_test_std, x_train_std, y_test, y_train]
def boost_select(data, target, n_feat):
    """Select the n_feat most important features according to an XGBoost
    model trained on an ADASYN-balanced training split."""
    import xgboost as xgb
    import sklearn as sk
    X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
        data, target, shuffle=True, test_size=0.33)
    # Balance the training split before fitting the booster.
    from imblearn.over_sampling import ADASYN, SMOTE
    sampler = ADASYN(sampling_strategy='minority')
    X_bal, y_bal = sampler.fit_resample(X_train, y_train)
    print(X_bal.shape)
    from numpy import sort  # kept: imported by the original (unused here)
    from sklearn.feature_selection import SelectFromModel
    param = {
        'max_depth': 9,
        'eta': 0.7,
        'silent': 1,
        'objective': 'binary:logistic',
        'booster': 'dart',
        'nthread': 4,
        'eval_metric': 'auc',
    }
    model = xgb.XGBClassifier(params=param)
    model.fit(X_bal, y_bal)
    picker = SelectFromModel(model, max_features=n_feat, prefit=True)
    return picker.transform(data)
def boost(X_train, X_test, y_train, y_test):
    """Fit an XGBoost classifier on an ADASYN-balanced training set and
    return its predictions for X_test."""
    sampler = ADASYN(sampling_strategy='minority')
    X_bal, y_bal = sampler.fit_resample(X_train, y_train)
    print(X_bal.shape)
    from numpy import sort  # kept: imported by the original (unused here)
    from sklearn.feature_selection import SelectFromModel  # kept: as original
    param = {
        'max_depth': 9,
        'eta': 0.7,
        'silent': 1,
        'objective': 'binary:logistic',
        'booster': 'dart',
        'nthread': 4,
        'eval_metric': 'auc',
    }
    model = xgb.XGBClassifier(params=param)
    model.fit(X_bal, y_bal)
    return model.predict(X_test)
def load_dataset(method="randomover"):
    """Load the pretreatment dataset, split, standardize, and rebalance the
    training portion.

    Parameters
    ----------
    method : str
        One of "randomover", "smote", "adasyn", "randomunder".

    Returns
    -------
    X_resampled, Y_resampled, X_test, Y_test

    Raises
    ------
    ValueError for an unknown method (previously this fell through and
    crashed with UnboundLocalError at the return statement).
    """
    dataset = pd.read_csv("Pretreatment Data/Data.csv", encoding="GBK")
    Y_orig = np.array(dataset["Y"])
    X_orig = np.array(dataset.drop(["Y"], axis=1))
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_orig, Y_orig, test_size=0.25, random_state=0)
    # BUGFIX: the test set was standardized with its own statistics; it must
    # be transformed with the scaler fitted on the training data.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    # Over-sampling
    if method == "randomover":
        ros = RandomOverSampler(random_state=0)
        # fit_sample was removed from imblearn; fit_resample is the
        # supported equivalent (consistent with the other branches).
        X_resampled, Y_resampled = ros.fit_resample(X_train, Y_train)
    elif method == "smote":
        # Generate new samples with the SMOTE model
        smo = SMOTE(random_state=0)
        X_resampled, Y_resampled = smo.fit_resample(X_train, Y_train)
    elif method == "adasyn":
        # Use the ADASYN model
        ada = ADASYN(random_state=0)
        X_resampled, Y_resampled = ada.fit_resample(X_train, Y_train)
    # Under-sampling
    elif method == "randomunder":
        rus = RandomUnderSampler(random_state=0)
        X_resampled, Y_resampled = rus.fit_resample(X_train, Y_train)
    else:
        raise ValueError("unknown resampling method: %s" % method)
    return X_resampled, Y_resampled, X_test, Y_test
def unba_adasyn(x, y):
    """Balance a 3-D sample array (n, h, w) with ADASYN.

    Each sample is flattened to a feature row, resampled, then restored to
    its original 2-D layout with a trailing channel axis of 1.
    """
    x1 = x.reshape(x.shape[0], -1)  # flatten each sample to one row
    adasyn = ADASYN(sampling_strategy='minority')
    x1, y1 = adasyn.fit_resample(x1, y)
    x2 = np.zeros((x1.shape[0], x.shape[1], x.shape[2], 1))
    for i in tqdm(range(x1.shape[0])):
        # BUGFIX: the restore shape was hard-coded to (60, 8); use the
        # input's own dimensions so any (h, w) sample shape works.
        x2[i, :, :, 0] = np.reshape(x1[i], (x.shape[1], x.shape[2]))
    return x2, y1
def balance_df(x_train, y_train, neighborgs):
    """Oversample every non-majority class with ADASYN and return the
    balanced training data."""
    sampler = ADASYN(sampling_strategy='not majority',
                     n_neighbors=neighborgs, random_state=42)
    return sampler.fit_resample(x_train, y_train)
def select_ratio_ADASYN(df, ratio, features=None):
    """Resample df with ADASYN so class 1 ('toxic') is 1/ratio the size of
    class 0, keeping only the given feature columns.

    Parameters
    ----------
    df : DataFrame with a 'toxic' label column.
    ratio : divisor controlling the toxic class size.
    features : list of feature column names; defaults to
        ["stanford_polite", "perspective_score"]. A None sentinel replaces
        the previous mutable default list (never mutated, but unsafe).
    """
    if features is None:
        features = ["stanford_polite", "perspective_score"]
    sm = ADASYN(sampling_strategy={0: int(len(df)), 1: int(len(df) // ratio)})
    labels = list(df["toxic"])
    resampled, new_labels = sm.fit_resample(df[features], labels)
    resampled = pd.DataFrame(resampled, columns=features)
    resampled['toxic'] = new_labels
    return resampled
def boost_select(data, target, n_feat=60):
    """Select up to n_feat features using importances from an XGBoost model
    trained on an ADASYN-balanced training split.

    Parameters
    ----------
    data, target : feature matrix and labels.
    n_feat : int, default 60
        Maximum number of features to keep. Defaults to the previously
        hard-coded limit, so existing call sites are unchanged, and makes
        this variant consistent with the parameterized one above.
    """
    import xgboost as xgb
    import sklearn as sk
    X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
        data, target, shuffle=True, test_size=0.33)
    # Oversampling
    from imblearn.over_sampling import ADASYN, SMOTE
    ada = ADASYN(sampling_strategy='minority')
    X_train_, y_train_ = ada.fit_resample(X_train, y_train)
    print(X_train_.shape)
    from sklearn.feature_selection import SelectFromModel
    param = {
        'max_depth': 9,
        'eta': 0.7,
        'silent': 1,
        'objective': 'binary:logistic',
        'booster': 'dart',
        'nthread': 4,
        'eval_metric': 'auc',
    }
    model = xgb.XGBClassifier(params=param)
    model.fit(X_train_, y_train_)
    # NOTE: to study how accuracy/recall/precision vary with the importance
    # cutoff, iterate over sort(model.feature_importances_)[::-1] and re-fit
    # on SelectFromModel(model, threshold=thresh) selections; ~42 features
    # were needed to pass a 0.85 threshold in earlier experiments.
    selection = SelectFromModel(model, max_features=n_feat, prefit=True)
    new_data = selection.transform(data)
    return new_data
def test_ada_fit_resample():
    """Default ADASYN on the module fixture (X, Y) reproduces the
    hard-coded resampled arrays for RND_SEED (regression test)."""
    ada = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = ada.fit_resample(X, Y)
    # Expected output for this seed (regression values).
    X_gt = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.94899098, -0.30508981],
        [0.28204936, -0.13953426],
        [1.58028868, -0.04089947],
        [0.66117333, -0.28009063],
    ])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
        0, 0,
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def run(X_train, y_train):
    """Oversample all non-majority classes with ADASYN and return the
    balanced training set."""
    sampler = ADASYN(sampling_strategy='not majority')
    X_adasyn, y_adasyn = sampler.fit_resample(X_train, y_train)
    print("######################")
    print("ADASYN")
    print("######################")
    print("\n")
    return X_adasyn, y_adasyn
def over_sample(self, method="BorderLine", sampling_strategy="minority", random_state=42, k_neighbors=5,
                n_neighbors=10, kind="borderline-1"):
    """
    Over-sampling helper: rebalances self._df with the chosen imblearn sampler.

    :param method: str, option: ADASYN, BorderLine, KMeans, Random, SVM
    :param sampling_strategy: str or dict, option: 'minority','not majority','all','auto', {1:n,0:m}
    :param random_state: int
    :param k_neighbors: int
    :param n_neighbors: int
    :param kind: str, borderline-1, borderline-2
    :return: df (resampled DataFrame, or the original frame for an
        unsupported method)
    """
    # Features are every column except the id and the target.
    feature_name = self._df.columns.difference(["id", self._target]).tolist()
    X = self._df[feature_name].values
    y = self._df[self._target].values
    print("Original label shape {}".format(Counter(y)))
    if method == "ADASYN":
        # ADASYN's neighborhood size is its n_neighbors argument; this
        # class maps its own k_neighbors onto it.
        overSm = ADASYN(sampling_strategy=sampling_strategy, random_state=random_state,
                        n_neighbors=k_neighbors)
    elif method == "BorderLine":
        overSm = BorderlineSMOTE(sampling_strategy=sampling_strategy, random_state=random_state,
                                 k_neighbors=k_neighbors, m_neighbors=n_neighbors, kind=kind)
    elif method == "KMeans":
        overSm = KMeansSMOTE(sampling_strategy=sampling_strategy, random_state=random_state,
                             k_neighbors=k_neighbors)
    elif method == "Random":
        overSm = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=random_state)
    elif method == "SVM":
        overSm = SVMSMOTE(sampling_strategy=sampling_strategy, random_state=random_state,
                          k_neighbors=k_neighbors, m_neighbors=n_neighbors, out_step=0.5)
    else:
        # Unsupported method: warn (user-facing message kept as-is) and
        # return the original frame unchanged.
        print("不支持{}该抽样方法".format(method))
        return self._df
    X_res, y_res = overSm.fit_resample(X, y)
    print("overSample label shape {}".format(Counter(y_res)))
    # Reattach the resampled labels as the last column and rebuild the frame.
    _data = np.concatenate([X_res, y_res.reshape(len(X_res), 1)], axis=1)
    df_new = pd.DataFrame(data=_data, columns=feature_name + [self._target])
    return df_new
def classify(self, X, type: str, classifier: str, test_prop: float, res=None, res_method=None):
    """Train and evaluate a classifier on features X against self.df['class'].

    Parameters
    ----------
    X : feature matrix.
    type : 'binary' (class 0 collapsed into 1) or 'multi'.
    classifier : one of 'lr', 'svc', 'rf', 'xgb', 'ada'.
    test_prop : held-out test proportion.
    res : if True, resample the training split. (Was declared `res: None`,
        a bare annotation that made the argument *required*; it is now an
        optional keyword with default None — positional callers unaffected.)
    res_method : 'up' (ADASYN) or 'down' (NearMiss); required when res is True.

    Returns the fitted model; prints accuracy, a classification report and
    Matthews correlation, and plots a confusion matrix.
    """
    if type == 'binary':
        y = self.df['class'].replace(0, 1)
    elif type == 'multi':
        y = self.df['class']
    else:
        raise TypeError("Choose a proper type of classification")
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, test_size=test_prop, stratify=y)
    if res == True:
        if res_method == 'down':
            nm = NearMiss()
            X_res, Y_res = nm.fit_resample(X_train, Y_train)
        elif res_method == 'up':
            sm = ADASYN()
            X_res, Y_res = sm.fit_resample(X_train, Y_train)
        else:
            raise TypeError("Resampling method not provided. Please use 'up' for oversampling or 'down' for undersampling.")
    if classifier == 'lr':
        model = LogisticRegression(solver='liblinear', class_weight='balanced', C=0.04, penalty='l2')
    elif classifier == 'svc':
        model = LinearSVC(C=0.004, penalty='l2')
    elif classifier == 'rf':
        n_est = int(input("Type in number of trees to estimate from: ").strip())
        model = RandomForestClassifier(n_estimators=n_est, bootstrap=True, max_depth=5)
    elif classifier == 'xgb':
        n_est = int(input("Type in number of trees to estimate from: ").strip())
        # BUGFIX: 'reg_lamba' was a typo silently swallowed by XGBoost's
        # kwargs; 'reg_lambda' actually applies the intended L2 penalty.
        model = XGBClassifier(n_estimators=n_est, bootstrap=True, max_depth=5, reg_lambda=0.4)
    elif classifier == 'ada':
        n_est = int(input("Type in number of trees to estimate from: ").strip())
        model = AdaBoostClassifier(n_estimators=n_est, learning_rate=0.005)
    else:
        raise TypeError("Choose a proper classifier. Possible inputs: 'lr', 'svc', 'rf', 'xgb', 'ada' .")
    if res == True:
        model.fit(X_res, Y_res)
    else:
        model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    # Accuracy Percentage
    print(f"Accuracy is {round(accuracy_score(Y_test, Y_pred), 2)*100}%")
    # Classification Report
    print(classification_report(Y_pred, Y_test))
    # Matthew's Correlation Coefficient
    print(f"Matthew's Correlation Coefficient is {matthews_corrcoef(Y_test, Y_pred)}")
    # Plots of Confusion Matrix and ROC Curve
    plot_confusion_matrix(Y_test, Y_pred, figsize=(10, 10))
    return model
def export_csv(name_dataset, data, labels, name_method):
    """Write train data/labels plus their ADASYN-resampled counterparts to
    CSV — as a whole and split into attribute groups A (first nb_att_1
    columns) and B (the rest). Relies on module-level nb_att_1 / nb_att_2."""
    # over-sampling
    resampler = ADASYN(random_state=42)
    data_resampled, labels_resampled = resampler.fit_resample(data, labels)
    carpet = "../MO2P/data/all/"
    print("generate")

    def _path(kind, suffix):
        # Reproduces the original concatenation scheme exactly.
        return (carpet + "./" + name_dataset + "_" + kind + "_train_" +
                name_method + suffix + "_" + str(nb_att_1) + "_" +
                str(nb_att_2) + ".csv")

    full_data = _path("data", "")
    data_A = _path("data", "_A")
    data_B = _path("data", "_B")
    labels_A = _path("truth", "_A")
    labels_B = _path("truth", "_B")
    full_data_resampled = _path("data", "_ADASYN")
    data_A_resampled = _path("data", "_A_ADASYN")
    data_B_resampled = _path("data", "_B_ADASYN")
    labels_A_resampled = _path("truth", "_A_ADASYN")
    labels_B_resampled = _path("truth", "_B_ADASYN")
    for target_path in (full_data, data_A, data_B, labels_A, labels_B,
                        full_data_resampled, data_A_resampled,
                        data_B_resampled, labels_A_resampled,
                        labels_B_resampled):
        print(target_path)
    # Resampled exports: full matrix, then attribute groups A and B.
    data_resampled.to_csv(full_data_resampled, index=False)
    data_resampled.iloc[:, :nb_att_1].to_csv(data_A_resampled, index=False)
    data_resampled.iloc[:, nb_att_1:].to_csv(data_B_resampled, index=False)
    labels_resampled.to_csv(labels_A_resampled, index=False)
    labels_resampled.to_csv(labels_B_resampled, index=False)
    # Original (non-resampled) exports.
    data.to_csv(full_data, index=False)
    data.iloc[:, :nb_att_1].to_csv(data_A, index=False)
    data.iloc[:, nb_att_1:].to_csv(data_B, index=False)
    labels.to_csv(labels_A, index=False)
    labels.to_csv(labels_B, index=False)
def cross_val_score_weighted(model, X, y, weights, cv=2,
                             metrics=[
                                 sklearn.metrics.accuracy_score,
                                 sklearn.metrics.precision_score,
                                 sklearn.metrics.recall_score
                             ]):
    """Stratified k-fold CV with per-class weights and ADASYN oversampling.

    Returns a list of score lists — one per metric in `metrics` — plus a
    final list of ROC-AUC scores computed from predict_proba.

    NOTE(review): `metrics` is a mutable default argument; it is never
    mutated here, but a tuple or None sentinel would be safer.
    """
    from sklearn.model_selection import StratifiedKFold, KFold
    from imblearn.over_sampling import ADASYN, SMOTE
    from imblearn.under_sampling import NearMiss
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    # Split data
    kf = StratifiedKFold(n_splits=cv, shuffle=True)
    kf.get_n_splits(X, y)
    scores = [[] for metric in metrics]
    scores.append([])  # extra slot for the ROC-AUC scores
    for train_index, test_index in kf.split(X, y):
        # Scaler fitted on the training fold only.
        Z_train = StandardScaler().fit(X[train_index])
        model_clone = sklearn.base.clone(model)
        X_test, y_test = Z_train.transform(X[test_index]), y[test_index]
        X_train, y_train = Z_train.transform(X[train_index]), y[train_index]
        # Sampling
        # Oversample
        print("Oversampling\n")
        ada = ADASYN(sampling_strategy='minority')
        # NOTE(review): this resamples the *unscaled* fold data and
        # overwrites the standardized X_train above, so the model trains on
        # unscaled features while X_test is scaled — confirm intended.
        X_train, y_train = ada.fit_resample(X[train_index], y[train_index])
        print(X_train.shape)
        # Undersample
        # print("Undersampling\n")
        # nm = NearMiss()
        # X_train, y_train = nm.fit_resample(X[train_index],y[train_index])
        # print(X_train.shape)
        # NOTE(review): passing class_weight to fit() only works for
        # estimators whose fit accepts it (e.g. Keras wrappers) — most
        # sklearn estimators take class_weight in the constructor.
        model_clone.fit(X_train, y_train, class_weight=weights)
        y_pred = model_clone.predict(X_test)
        for i, metric in enumerate(metrics):
            score = metric(y_test, y_pred)
            scores[i].append(score)
            print(i)
        # NOTE(review): second identical fit before predict_proba looks
        # redundant — confirm whether it can be removed.
        model_clone.fit(X_train, y_train, class_weight=weights)
        y_pred_prob = model_clone.predict_proba(X_test)[:, 1]
        score = sklearn.metrics.roc_auc_score(y_test, y_pred_prob)
        # NOTE(review): hard-coded index 3 assumes exactly three metrics.
        scores[3].append(score)
    return scores
def expand(x: np.ndarray, y: np.ndarray):
    """
    Enlarge the minority class with ADASYN until classes are balanced.

    Generalized: the per-sample shape is read from `x` itself instead of the
    previously hard-coded (600, 3), so any (n, length, channels) — or in
    fact any (n, ...) — input works; behavior is unchanged for the original
    (n, 600, 3) inputs.

    :param x: sample array of shape (n, ...)
    :param y: labels of shape (n,) or (n, 1)
    :return: (resampled x, resampled y as a column vector,
              number of synthesized samples)
    """
    sample_shape = x.shape[1:]
    n_features = int(np.prod(sample_shape))
    ada = ADASYN(sampling_strategy=1)
    x_res, y_res = ada.fit_resample(x.reshape(-1, n_features), y.reshape(-1))
    x_out = x_res.reshape((-1,) + sample_shape)
    return x_out, y_res.reshape(-1, 1), len(x_out) - len(x)
def upsampleData(self, df):
    """Balance the 'Response' classes of df with ADASYN and return the
    resampled frame (features plus the restored Response column).

    BUGFIX: a RandomOverSampler pass used to run first, but its output was
    immediately overwritten by the ADASYN resample of the original df, so
    that dead step has been removed.
    """
    ada = ADASYN(random_state=42)
    x_train_sampled, y_train_sampled = ada.fit_resample(
        df.drop('Response', axis=1), df['Response'])
    x_train_sampled['Response'] = y_train_sampled
    print(len(x_train_sampled))
    return x_train_sampled
def Adasyn_sampling(self):
    """Rebalance self.training with ADASYN (seeded by self.seed) and store
    the result, rounded to 2 decimals, back on self.training."""
    ds = self.training.copy()
    self.report.append('Adasyn_sampling')
    Y = ds["Response"]
    X = ds.drop(columns=["Response"])
    ada = ADASYN(random_state=self.seed)
    X_res, Y_res = ada.fit_resample(X, Y)
    # Name the feature columns explicitly (fit_resample may return a bare
    # ndarray), then reattach the labels.
    sampled_ds = pd.DataFrame(X_res, columns=X.columns)
    sampled_ds['Response'] = Y_res
    # BUGFIX: the old `sampled_ds.columns = ds.columns` renamed columns
    # positionally, mislabeling them whenever 'Response' was not the last
    # column of the original frame. Reordering keeps names and data aligned.
    sampled_ds = sampled_ds[ds.columns]
    self.training = round(sampled_ds, 2)
class ADASYNStep():
    """Pipeline step wrapping imblearn's ADASYN oversampler."""

    def __init__(self, kwargs={}):
        """
        Adaptive Synthetic Over-Sampling Technique (ADASYN) to create
        balanced samples. Uses imblearn's ADASYN class.

        Parameters
        ----------
        kwargs (dict, default={}) : arguments to pass to imblearn ADASYN class
        """
        self.description = "ADASYN Data Augmentation"
        # Copy defensively: the default dict is shared across calls
        # (mutable-default pitfall) and callers may mutate theirs later.
        self.kwargs = dict(kwargs)
        self.fitted = None
        # Signals to the pipeline that this step resamples rows.
        self.changes_num_samples = True

    def fit(self, X, y=None):
        """
        Fits the ADASYN to given data

        Parameters
        ----------
        X (DataFrame) : the data to fit
        y (DataFrame) : the labels for X

        Returns
        -------
        (DataFrame, DataFrame) : a tuple of the transformed DataFrames,
            the first being the X data and the second being the y data

        Raises
        ------
        ValueError if y is None — this step is supervised.
        """
        if y is None:
            # Raise with the message attached instead of printing and then
            # raising a bare ValueError.
            raise ValueError(
                f"{self.description} step is supervised and needs target values")
        self.fitted = ADASYN(**self.kwargs)
        return self.transform(X, y=y)

    def transform(self, X, y=None):
        """
        Transforms the given data using previously fitted ADASYN

        Parameters
        ----------
        X (DataFrame) : the data to fit
        y (DataFrame) : the labels for X

        Returns
        -------
        (DataFrame, DataFrame) : a tuple of the transformed DataFrames,
            the first being the X data and the second being the y data

        Raises
        ------
        TransformError if fit() has not been called yet.
        """
        if self.fitted is None:
            raise TransformError
        X_rs, y_rs = self.fitted.fit_resample(X, y)
        # Restore pandas structure and metadata lost during resampling.
        X_rs = pd.DataFrame(X_rs, columns=X.columns)
        y_rs = pd.Series(y_rs, name=y.name)
        return X_rs, y_rs
def oversample_ADASYN(df, debug=True):
    """Return a copy of df whose last column (the label) has been balanced
    with ADASYN; all other columns are treated as features."""
    features = df.values[:, :-1]
    labels = df.values[:, -1].astype(int)
    if debug:
        print('Original dataset shape %s' % Counter(labels))
    X_res, y_res = ADASYN(random_state=0).fit_resample(features, labels)
    df_resampled = pd.DataFrame(X_res, columns=df.columns[:-1])
    # Reattach the label as the final column, under its original name.
    df_resampled.insert(len(df_resampled.columns), df.columns[-1], y_res)
    if debug:
        print('Resampled dataset shape %s' % Counter(y_res))
    return df_resampled
def adasyn(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True, pie_evr=True):
    """Balance X, y with ADASYN; optionally plot a class histogram and PCA
    projections of the resampled data."""
    sampler = ADASYN(random_state=42)
    X_balanced, y_balanced = sampler.fit_resample(X, y)
    if visualize == True:
        hist_over_and_undersampling(y_balanced)
        pca_general(X_balanced, y_balanced, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_balanced, y_balanced
def imbalanceProcess(self, method='None'):
    """Resample self.x_train / self.y_train in place with the chosen
    oversampler ('RandomOverSample', 'ADASYN' or 'SMOTE'); any other value
    leaves the data untouched."""
    samplers = {
        'RandomOverSample': RandomOverSampler,
        'ADASYN': ADASYN,
        'SMOTE': SMOTE,
    }
    sampler_cls = samplers.get(method)
    if sampler_cls is not None:
        sampler = sampler_cls(random_state=999)
        self.x_train, self.y_train = sampler.fit_resample(
            self.x_train, self.y_train)
def adaptive_synthetic_sampling_func(train_x, train_y, target):
    """Best-effort ADASYN oversampling; logs the class counts before and
    after. On failure the error is logged and None is returned implicitly
    (deliberate best-effort behavior, preserved)."""
    try:
        logger.info(
            f"counter before ADASYN is: {train_y[target].value_counts()}")
        # transform the dataset
        resampled_x, resampled_y = ADASYN().fit_resample(train_x, train_y)
        # summarize the new class distribution
        logger.info(
            f"counter after ADASYN is: {resampled_y[target].value_counts()}")
        return resampled_x, resampled_y
    except Exception as ex:
        logger.error(
            f"failed to run adaptive_synthetic_sampling_func due to: {ex}")
def DealwithSample(data, label, method="ADA"):
    """Rebalance (data, label) with the chosen oversampler.

    Parameters
    ----------
    method : "ADA"/"ADASYN", "RandomOverSampler", or "SMOTE".

    BUGFIX: the default method "ADA" matched no branch, so calling with the
    default silently returned None. "ADA" is now an alias for ADASYN, and
    an unknown method raises instead of falling through to None.
    """
    if method in ("ADA", "ADASYN"):
        ada = ADASYN(random_state=42)
        return ada.fit_resample(data, label)
    elif method == "RandomOverSampler":
        ros = RandomOverSampler(random_state=42)
        X_res, y_res = ros.fit_resample(data, label)
        print("has oversampled the data {}".format(len(X_res)))
        return X_res, y_res
    elif method == "SMOTE":
        smote = SMOTE(random_state=42)
        return smote.fit_resample(data, label)
    raise ValueError("unsupported sampling method: {}".format(method))
def readFile(path, y_label, method, encode_features=None, skew_exempted=None,
             training_ratio=0.7, shuffle=True, needSkew=False, fea_eng=True):
    """Read a CSV, optionally de-skew and one-hot encode it, split into
    train/test, and rebalance the training split.

    Parameters
    ----------
    path : CSV file path.
    y_label : name of the target column.
    method : 'OverSample' (ADASYN) or 'UnderSample' (EditedNearestNeighbours);
        any other value skips resampling.
    encode_features : columns to one-hot encode (default: none).
    skew_exempted : numeric columns excluded from the skew correction.
    training_ratio, shuffle, needSkew, fea_eng : as before.

    The two list parameters previously used mutable default lists; they now
    use None sentinels (they were never mutated, but the pattern is unsafe).
    """
    if encode_features is None:
        encode_features = []
    if skew_exempted is None:
        skew_exempted = []
    raw = pd.read_csv(path)
    if shuffle:
        raw = raw.sample(frac=1).reset_index(drop=True)  # shuffle rows
    if needSkew:
        # Log1p-transform numeric columns with skewness > 0.75.
        skewed = raw[raw.dtypes[raw.dtypes != "object"].index.drop(
            skew_exempted)].apply(lambda x: skew(x.dropna()))
        skewed = skewed[skewed > 0.75].index
        raw[skewed] = np.log1p(raw[skewed])
    raw = pd.get_dummies(raw, columns=encode_features)  # encode categoricals
    raw = raw.fillna(raw.mean())
    X = raw.drop(y_label, axis=1)
    y = raw[y_label]
    X_train, X_test, y_train, y_test = split(X, y, training_ratio)
    if method == 'OverSample':
        ada = ADASYN(random_state=42)
        X_train, y_train = ada.fit_resample(X_train, y_train)
    if method == 'UnderSample':
        # NOTE(review): recent imblearn versions removed random_state from
        # EditedNearestNeighbours (it is deterministic) — confirm the
        # pinned imblearn version supports this argument.
        model = EditedNearestNeighbours(random_state=42)
        X_train, y_train = model.fit_resample(X_train, y_train)
    return X_train, X_test, y_train, y_test
def adasyn(X, y):
    """Balancing data using ADASYN.

    Args:
        X: training set without the class target
        y: training set class target

    Returns:
        The balanced X and y.
    """
    sampler = ADASYN(random_state=42, sampling_strategy='minority')
    balanced_X, balanced_y = sampler.fit_resample(X, y)
    print('after balancing:', balanced_X.shape)
    return balanced_X, balanced_y
def test_ada_fit_resample():
    """Default ADASYN on the module fixture (X, Y) reproduces the
    hard-coded resampled arrays for RND_SEED.

    NOTE(review): this redefines test_ada_fit_resample with the same
    expected values as the earlier definition in this file; the later
    definition shadows the earlier one at collection time.
    """
    ada = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = ada.fit_resample(X, Y)
    # Expected output for this seed (regression values).
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [
        1.25192108, -0.22367336
    ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [
        -0.28162401, -2.10400981
    ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [
        0.70472253, -0.73309052
    ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [
        0.88407872, 0.35454207
    ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [
        -0.18410027, -0.45194484
    ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [
        -0.41635887, -0.38299653
    ], [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.94899098, -0.30508981], [0.28204936, -0.13953426],
                     [1.58028868, -0.04089947], [0.66117333, -0.28009063]])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def test_adasyn_error(adasyn_params, err_msg):
    """Parametrized: invalid ADASYN parameters raise ValueError with the
    expected message."""
    sampler = ADASYN(**adasyn_params)
    with pytest.raises(ValueError, match=err_msg):
        sampler.fit_resample(X, Y)
def test_ada_fit_sampling_strategy_error():
    """A sampling_strategy that requests no new samples raises ValueError."""
    strategy = {0: 9, 1: 12}
    sampler = ADASYN(sampling_strategy=strategy, random_state=RND_SEED)
    with raises(ValueError, match="No samples will be generated."):
        sampler.fit_resample(X, Y)
def test_ada_wrong_nn_obj():
    """Passing a non-estimator as n_neighbors raises ValueError."""
    bogus_nn = 'rnd'
    sampler = ADASYN(random_state=RND_SEED, n_neighbors=bogus_nn)
    with raises(ValueError, match="has to be one of"):
        sampler.fit_resample(X, Y)