def pca_components(X, y, X_train, y_train, X_test, y_test):
    """Print GaussianNB test accuracy for 1 to 10 PCA components.

    The training split is oversampled with ADASYN, PCA is fitted on the
    oversampled training data, and a Gaussian naive Bayes classifier is
    trained on the projected data and scored on the projected test split.

    :param X: full feature set; accepted for interface compatibility, unused.
    :param y: full label set; accepted for interface compatibility, unused.
    :param X_train: training features.
    :param y_train: training labels.
    :param X_test: test features (never oversampled).
    :param y_test: test labels.
    """
    # The oversampling does not depend on the number of components and the
    # sampler is seeded, so it is done once instead of once per iteration.
    sampler = ADASYN(random_state=2)
    X_train_sm, y_train_sm = sampler.fit_sample(X_train, y_train)

    for n in range(1, 11):
        # Fit the projection on training data only, then apply it to both splits.
        pca = PCA(n_components=n).fit(X_train_sm)
        X_train_proj = pca.transform(X_train_sm)
        X_test_proj = pca.transform(X_test)

        clf = GaussianNB()
        clf.fit(X_train_proj, y_train_sm)
        y_pred = clf.predict(X_test_proj)
        print("Accuracy score for %d components: %f" % (n, (accuracy_score(y_test, y_pred))))
def dataSampling(dados, label):
    """Oversample the minority class of (dados, label) with ADASYN.

    Returns the resampled features and the resampled labels.
    """
    sampler = ADASYN(ratio='minority')
    resampled = sampler.fit_sample(dados, label)
    dadosSample, labelSample = resampled
    return dadosSample, labelSample
def _ANASYN(self):
    """Rebalance the training set: undersample first, then apply ADASYN.

    Step 1 resamples ``self.x_train`` / ``self.y_train`` with
    ``InstanceHardnessThreshold(sampling_strategy=0.2)``.
    Step 2 oversamples the result with ADASYN (ADAptive SYNthetic sampling),
    which generates minority samples using nearest neighbours.

    After each step ``self.x_train`` and ``self.y_train`` are rebuilt as
    DataFrames; the intermediate results stay available as
    ``X_train_smote2`` / ``y_train_smote2`` and ``X_train_smote`` /
    ``y_train_smote``. Progress is printed to stdout.
    """
    label_column = "Local Relapse Y(1) /N(0)"

    print("before: ", len(self.x_train))
    resampler = uns.InstanceHardnessThreshold(
        sampling_strategy=0.2, random_state=self.seed
    )
    self.X_train_smote2, self.y_train_smote2 = resampler.fit_resample(
        self.x_train, self.y_train
    )
    self.x_train = pd.DataFrame(self.X_train_smote2, columns=self.x_train.columns)
    self.y_train = pd.DataFrame(self.y_train_smote2, columns=[label_column])
    print("after: ", len(self.x_train))

    adasyn = ADASYN(random_state=self.seed)
    # fit_resample is used for both samplers; fit_sample is only a deprecated
    # alias of it in the imblearn versions that accept sampling_strategy.
    self.X_train_smote, self.y_train_smote = adasyn.fit_resample(
        self.x_train, self.y_train
    )
    print("X_train_SMOTE:\n", self.X_train_smote[1])
    self.x_train = pd.DataFrame(self.X_train_smote, columns=self.x_train.columns)
    self.y_train = pd.DataFrame(self.y_train_smote, columns=[label_column])
    print("len smote: \n", len(self.X_train_smote))
    print("len new x_train: \n", len(self.x_train))

    number_pos_x = self.y_train.loc[self.y_train[label_column] == 1]
    print("number positive responses y_train:\n", len(number_pos_x))
def over_sampling(x_train, y_train, model='ADASYN', ratio='minority'):
    """
    Generate synthetic samples for the training set with the chosen model.
    It must only ever be applied to the training set.

    The number of neighbours is read from ``config.parameters["neighbors"]``.

    :param x_train: X training set (DataFrame).
    :param y_train: Y training set (DataFrame).
    :param model: 'ADASYN' or 'SMOTE'; an object exposing ``fit_sample``
        is used as-is.
    :param ratio: ratio setting forwarded to the sampler.
    :return: x_train and y_train oversampled, as DataFrames
    :raises ValueError: if ``model`` is a string other than 'ADASYN'/'SMOTE'.
    """
    x_train_names = x_train.columns.values.tolist()
    y_train_names = y_train.columns.values.tolist()

    if isinstance(model, str):
        if model not in ('ADASYN', 'SMOTE'):
            # Fail with a clear message instead of an AttributeError on str.
            raise ValueError("model must be 'ADASYN' or 'SMOTE', got %r" % (model,))
        neighbors = config.parameters.get("neighbors")
        if model == 'ADASYN':
            model = ADASYN(random_state=42, ratio=ratio, n_neighbors=neighbors)
        else:
            model = SMOTE(random_state=42, ratio=ratio, k_neighbors=neighbors,
                          m_neighbors='svm')

    x_train, y_train = model.fit_sample(x_train, y_train)

    # NOTE(review): wrapping the name list in another list makes pandas build
    # MultiIndex columns; kept as-is so callers see the same frame layout.
    x_train = pd.DataFrame(x_train, columns=[x_train_names])
    y_train = pd.DataFrame(y_train, columns=[y_train_names])
    return x_train, y_train
def ada_model(X, y, names):
    """Oversample with ADASYN, report CV metrics and plot feature importances.

    Steps:
      1. Oversample (X, y) with ADASYN.
      2. Print 5-fold cross-validated AUC, accuracy and recall of an
         L1-penalised LogisticRegressionCV on the oversampled data.
      3. Split the oversampled data 75/25, print the logistic coefficients
         fitted on the training part.
      4. Fit a random forest on the training part and plot the (at most) ten
         largest feature importances as a horizontal bar chart.

    :param X: feature matrix.
    :param y: labels.
    :param names: feature names, used as the index of the importance table.
    """
    # NOTE(review): oversampling happens before cross-validation and before
    # the train/test split, so synthetic points derived from held-out rows can
    # end up in the training folds; confirm this is acceptable.
    ada = ADASYN(random_state=42)
    X_syn, y_syn = ada.fit_sample(X, y)

    logistic = linear_model.LogisticRegressionCV(penalty='l1', solver='liblinear')
    auc = cross_val_score(logistic, X_syn, y_syn, cv=5, scoring='roc_auc').mean()
    acc = cross_val_score(logistic, X_syn, y_syn, cv=5, scoring='accuracy').mean()
    recall = cross_val_score(logistic, X_syn, y_syn, cv=5, scoring='recall').mean()
    print("cross validation results:")
    print("-------------------------")
    print("auc:", auc)
    print("acc:", acc)
    print("recall:", recall)

    X_train, X_test, y_train, y_test = train_test_split(
        X_syn, y_syn, test_size=0.25, random_state=1)
    print(logistic.fit(X_train, y_train).coef_)

    RFC = RandomForestClassifier(max_depth=8, random_state=0)
    RFC.fit(X_train, y_train)
    importance = pd.DataFrame(RFC.feature_importances_, columns=['Feature Importance'])
    importance.index = names
    # tail(10) keeps the ten largest values and, unlike an explicit
    # [len - 10:len] slice, still returns every row when there are fewer
    # than ten features.
    importance.sort_values('Feature Importance', ascending=True).tail(10).plot.barh(
        figsize=(8, 16))
def resampling(datadict, labldict, savepath):
    """Oversample every domain with ADASYN and write the images to disk.

    For each domain in ``datadict`` the samples ``datadict[domain]['X']`` and
    labels ``datadict[domain]['Y']`` are resampled, each resulting row is
    reshaped to 256x256x3 and saved as
    ``<savepath>/<domain>/<label name>/img_<n>.jpg``, where ``n`` is a
    zero-padded running number per label.

    :param datadict: mapping domain -> {'X': samples, 'Y': label codes}.
    :param labldict: mapping domain -> (label code -> label name).
    :param savepath: root directory for the written images.
    """
    # Requested number of samples per class (label codes 0..30) per domain.
    domncout = {'amazon': 145, 'dslr': 100, 'webcam': 100}
    ratiodic = {}
    for domnitem in datadict:
        ratiodic[domnitem] = {}
        # Only fill in domains that are actually present, so a missing
        # domain no longer raises KeyError.
        if domnitem in domncout:
            for lablcode in range(0, 31):
                ratiodic[domnitem][lablcode] = domncout[domnitem]

    for domnitem in datadict:
        sorcdata = datadict[domnitem]['X']
        sorclabl = datadict[domnitem]['Y']
        print('Resampling data in domain {}'.format(domnitem))
        adasyn = ADASYN(ratio = ratiodic[domnitem], random_state = 42)
        targdata, targlabl = adasyn.fit_sample(sorcdata, sorclabl)
        print('Saving data in domain {}'.format(domnitem))

        # True number of images per label. It fixes the zero-pad width, so it
        # must be the full count (an off-by-one here mis-pads at 10, 100, ...).
        lablcout = {}
        for lablcode in targlabl:
            lablcout[lablcode] = lablcout.get(lablcode, 0) + 1
        lablnumb = dict.fromkeys(lablcout, 0)

        for targimag, lablcode in zip(targdata, targlabl):
            lablname = labldict[domnitem][lablcode]
            lablnumb[lablcode] += 1
            strsleng = len(str(lablcout[lablcode]))
            numbstrs = str(lablnumb[lablcode]).zfill(strsleng)
            targpath = os.path.join(savepath, domnitem, lablname)
            if not os.path.exists(targpath):
                os.makedirs(targpath)
            imagpath = os.path.join(targpath, 'img_' + numbstrs)
            targimag = targimag.reshape(256, 256, 3)
            cv2.imwrite(imagpath + '.jpg', targimag)
def plot_roc_curves(X, y):
    """Plot validation ROC curves for three classifiers and save the figure.

    The data is split 80/20, only the training part is oversampled with
    ADASYN, and logistic regression, Bernoulli naive Bayes and an SVC are
    each fitted on it. Their ROC curves on the untouched validation part are
    drawn on one figure, which is saved as
    'ROC Curves for top 3 Contending Models'.
    """
    plt.figure(figsize=(10, 6))
    line_width = 2

    # train-val split, then oversample the training part only
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=.2, random_state=0)
    X_over, y_over = ADASYN(random_state=44).fit_sample(X_train, y_train)

    # (legend name, line colour, estimator), drawn in this order
    contenders = (
        ('Logistic Regression', 'b',
         LogisticRegression(max_iter=5000, n_jobs=-1, random_state=44)),
        ('Bernoulli Naive Bayes', 'r', BernoulliNB()),
        ('SVC', 'g', SVC(probability=True, random_state=1)),
    )
    for title, colour, estimator in contenders:
        # fit model and predict probabilities of validation data
        estimator.fit(X_over, y_over)
        positive_proba = estimator.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(y_val, positive_proba)
        model_auc = roc_auc_score(y_val, positive_proba)
        plt.plot(fpr, tpr, color=colour, lw=line_width,
                 label=f'{title}, AUC: {model_auc:.4f}')

    plt.plot([0, 1], [0, 1], c='violet', ls='--', label='Chance Line')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curves for top 3 Contending Models')
    plt.legend(loc='lower right', prop={'size': 10}, frameon=True)
    plt.savefig('ROC Curves for top 3 Contending Models')
def test_ada_fit_sample_half():
    """Check fit_sample against stored ground truth for a float ratio of 0.8."""
    sampler = ADASYN(ratio=0.8, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected resampled features and labels.
    X_gt = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
    ])
    y_gt = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])

    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_ada_fit_sample_nn_obj():
    """Check fit_sample when n_neighbors is a NearestNeighbors instance."""
    neighbours = NearestNeighbors(n_neighbors=6)
    sampler = ADASYN(random_state=RND_SEED, n_neighbors=neighbours)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected resampled features and labels.
    X_gt = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.29427267, 0.21740707],
        [0.68118697, -0.25220353],
        [1.37180201, 0.37279378],
        [-0.59243851, -0.80715327],
    ])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def synthetic_balance(data):
    """
    Balances samples with ADASYN algorithm:
    http://sci2s.ugr.es/keel/pdf/algorithm/congreso/2008-He-ieee.pdf

    The mean of the target column is printed before and after balancing.
    If ADASYN raises ``ValueError`` the data is left unbalanced and the
    reason is printed instead of being silently discarded.

    :param data: the dataframe; must contain the ``TARGET`` column
    :return: dataframe with the feature columns followed by ``TARGET``
    """
    target = data[TARGET]
    features = data.drop(TARGET, axis=1)
    print('unbalanced positive weight: ' + str(np.mean(target)))

    # Apply ADASYN over-sampling
    ada = ADASYN()
    try:
        features, target = ada.fit_sample(features, target)
    except ValueError as err:
        # e.g. "No samples will be generated with the provided ratio
        # settings." Continue with the original data, but say so.
        print('ADASYN balancing skipped: ' + str(err))
    print('balanced positive weight: ' + str(np.mean(target)))

    columns = list(data)
    columns.remove(TARGET)
    data = pd.DataFrame(features, columns=columns)
    data.loc[:, TARGET] = target
    return data
def over_sampling(xTrain, yTrain, model='ADASYN', neighbors=200):
    """
    Generate synthetic samples for the minority class with the chosen model.
    It must only ever be applied to the training set.

    :param xTrain: X training set (DataFrame).
    :param yTrain: Y training set.
    :param model: 'ADASYN' or 'SMOTE'; an object exposing ``fit_sample``
        is used as-is.
    :param neighbors: number of nearest neighbours used to construct
        synthetic samples.
    :return: xTrain and yTrain oversampled, as DataFrames
    :raises ValueError: if ``model`` is a string other than 'ADASYN'/'SMOTE'.
    """
    xTrainNames = xTrain.columns.values.tolist()
    yTrainNames = ['target']

    if isinstance(model, str):
        if model == 'ADASYN':
            model = ADASYN(random_state=42, ratio='minority', n_neighbors=neighbors)
        elif model == 'SMOTE':
            model = SMOTE(random_state=42, ratio='minority', k_neighbors=neighbors,
                          m_neighbors='svm')
        else:
            # Fail with a clear message instead of an AttributeError on str.
            raise ValueError("model must be 'ADASYN' or 'SMOTE', got %r" % (model,))

    xTrain, yTrain = model.fit_sample(xTrain, yTrain)

    # NOTE(review): wrapping the name list in another list makes pandas build
    # MultiIndex columns; kept as-is so callers see the same frame layout.
    xTrain = pd.DataFrame(xTrain, columns=[xTrainNames])
    yTrain = pd.DataFrame(yTrain, columns=[yTrainNames])
    return xTrain, yTrain
def ADASYN_oversampling(X, y):
    """Oversample (X, y) with a default-configured ADASYN sampler.

    X: independent variables.
    y: dependent variable.
    Returns the resampled pair as a tuple.
    """
    sampler = ADASYN()
    X_resampled, y_resampled = sampler.fit_sample(X, y)
    return (X_resampled, y_resampled)
def test_ada_fit_sample_nn_obj():
    """Check fit_sample when n_neighbors is a NearestNeighbors instance."""
    neighbours = NearestNeighbors(n_neighbors=6)
    sampler = ADASYN(random_state=RND_SEED, n_neighbors=neighbours)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected resampled features and labels.
    X_gt = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.29427267, 0.21740707],
        [0.68118697, -0.25220353],
        [1.37180201, 0.37279378],
        [-0.59243851, -0.80715327],
    ])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def oversample_ADASYN(X, y, ratio=0.15):
    """
    Oversample minority class using the ADASYN algorithm

    Arguments:
        X (2d array-like): feature set
        y (1d array-like): target values
        ratio (float): desired ratio between minority and majority (optional)

    Return:
        X_os (2d array-like): oversampled feature set
        y_os (1d array-like): oversampled target values

    Example:
        X_train_os, y_train_os = models.oversample_ADASYN(X_train, y_train, 0.3)
    """
    # construct the ADASYN object (not named `os`, to avoid shadowing the
    # standard-library module name)
    sampler = ADASYN(sampling_strategy=ratio, n_neighbors=5, random_state=42)

    # oversample X and y data; fit_resample is the supported entry point in
    # every imblearn release that accepts sampling_strategy
    X_os, y_os = sampler.fit_resample(X, y)

    # len() works for both pandas and numpy outputs, unlike .count()
    print('Oversampled minority-ratio of: {:3.1f}%'.format(100.0 * sum(y_os) / len(y_os)))
    return X_os, y_os
def test_ada_fit_sample_nn_obj():
    """Check fit_sample when n_neighbors is a NearestNeighbors instance."""
    neighbours = NearestNeighbors(n_neighbors=6)
    sampler = ADASYN(random_state=RND_SEED, n_neighbors=neighbours)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected resampled features and labels.
    X_gt = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.94899098, -0.30508981],
        [0.28204936, -0.13953426],
        [1.58028868, -0.04089947],
        [0.66117333, -0.28009063],
    ])
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)
def augment_data_adasyn(input_data, desired_samples=50):
    """
    Augment the training data by a fixed number of samples per class using
    ADASYN. For more information see the documentation:
    http://contrib.scikit-learn.org/imbalanced-learn/stable/generated/imblearn.over_sampling.ADASYN.html  # noqa

    A user warning along the lines of "The number of samples in class x will
    be larger than the number of samples in the majority class" is expected
    and can be ignored: ADASYN is used here to augment data, not to correct
    class imbalance.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        desired_samples (int): The number of samples to be added to each class.

    Returns:
        tuple: x_train, y_train, x_test, y_test, with samples added to x_train
            and y_train.
    """
    x_train, y_train = input_data[0], input_data[1]
    x_test, y_test = input_data[2], input_data[3]

    # Requested size per class = current size + desired_samples.
    labels, sizes = np.unique(y_train, return_counts=True)
    targets = {label: size + desired_samples for label, size in zip(labels, sizes)}

    sampler = ADASYN(ratio=targets)
    x_train, y_train = sampler.fit_sample(x_train, y_train)
    return (x_train, y_train, x_test, y_test)
def apply_simple_adasyn(X, y):
    """Oversample the minority class with ADASYN and print class counts.

    The class distribution is printed before and after resampling.

    :param X: feature matrix.
    :param y: labels (iterable of hashable values).
    :return: resampled features and resampled labels.
    """
    from imblearn.over_sampling import ADASYN
    from collections import Counter

    simple_adasyn = ADASYN(sampling_strategy='minority')
    print(Counter(y))
    # fit_resample is the supported entry point in every imblearn release
    # that accepts sampling_strategy; fit_sample is a deprecated alias.
    X_smt, y_smt = simple_adasyn.fit_resample(X, y)
    print(Counter(y_smt))
    return X_smt, y_smt
def makeOverSamplesADASYN(X, y):
    """Oversample (X, y) with a default-configured ADASYN sampler.

    X: independent variables.
    y: dependent variable.
    Returns the resampled pair as a tuple.
    """
    from imblearn.over_sampling import ADASYN

    sampler = ADASYN()
    X_resampled, y_resampled = sampler.fit_sample(X, y)
    return (X_resampled, y_resampled)
def ADASYN_oversampling(x, y):
    """Oversample (x, y) with seeded ADASYN, printing class counts before and after."""
    print('Original dataset shape {}'.format(Counter(y)))
    sampler = ADASYN(random_state=42)
    resampled = sampler.fit_sample(x, y)
    x_sampled, y_sampled = resampled
    print('With ADASYN sampled dataset shape {}'.format(Counter(y_sampled)))
    return x_sampled, y_sampled
def ADASYNOversampling(self, featureMatrix, Labels):
    """Return (features, labels) oversampled with a seeded ADASYN sampler."""
    sampler = ADASYN(random_state=42)
    resampled = sampler.fit_sample(featureMatrix, Labels)
    feature_Resampled, Labels_Resampled = resampled
    return feature_Resampled, Labels_Resampled
def makeOverSamplesADASYN(X, y):
    """
    Oversample the data with a default-configured ADASYN sampler.

    @param X: independent variables
    @param y: dependent variable
    @return: the oversampled (X, y) pair
    """
    sampler = ADASYN()
    X_resampled, y_resampled = sampler.fit_sample(X, y)
    return X_resampled, y_resampled
def oversample_dataset(X, y):
    """Undersample class 0.0 to 700 rows, then oversample with ADASYN.

    :param X: feature matrix.
    :param y: labels; class ``0.0`` is the one that is undersampled.
    :return: resampled features and resampled labels.
    """
    # fit_resample is the supported entry point in every imblearn release
    # that accepts sampling_strategy; fit_sample is a deprecated alias.
    # NOTE(review): the undersampler is not seeded, so results can differ
    # between runs even though ADASYN is; confirm that is intended.
    under = RandomUnderSampler(sampling_strategy={0.0: 700})
    X, y = under.fit_resample(X, y)

    sampler = ADASYN(random_state=42)
    X_rs, y_rs = sampler.fit_resample(X, y)
    return X_rs, y_rs
def resample_data(x, y, sample_choice=RUS_CONSTANT):
    """Resample (x, y) with the sampler selected by ``sample_choice``.

    SMOTE_CONSTANT selects SMOTE, ADASYN_CONSTANT selects ADASYN and
    RUS_CONSTANT selects RandomUnderSampler, all seeded with 42. Any other
    value returns the inputs unchanged.
    """
    if sample_choice == SMOTE_CONSTANT:
        sampler = SMOTE(random_state=42)
    elif sample_choice == ADASYN_CONSTANT:
        sampler = ADASYN(random_state=42)
    elif sample_choice == RUS_CONSTANT:
        sampler = RandomUnderSampler(random_state=42)
    else:
        return x, y

    resampled_x, resampled_y = sampler.fit_sample(x, y)
    return resampled_x, resampled_y
def test_ada_fit_sample():
    """Compare default fit_sample output with the stored .npy ground truth."""
    sampler = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Ground truth lives in the 'data' folder next to this test file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    X_gt = np.load(os.path.join(data_dir, 'ada_x.npy'))
    y_gt = np.load(os.path.join(data_dir, 'ada_y.npy'))

    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def runADASYN(self):
    """Oversample self.X / self.Y with ADASYN and record the result.

    The resampled arrays are stored on the instance and under
    ``self.rebalanced['ADASYN']``; class counts before and after are logged.
    """
    sampler = ADASYN()
    resampled = sampler.fit_sample(self.X, self.Y)
    self.Xadasyn, self.Yadasyn = resampled

    entry = {}
    entry['X'] = self.Xadasyn
    entry['y'] = self.Yadasyn
    entry['f'] = self.featureList
    self.rebalanced['ADASYN'] = entry

    before = Counter(self.Y)
    self.log.emit('ADASYN: Original dataset shape {}'.format(before), indents=1)
    after = Counter(self.Yadasyn)
    self.log.emit('ADASYN: Resampled dataset shape {}'.format(after), indents=1)
def balanceDataset(self, train):
    """Oversample the minority class of ``train`` with ADASYN.

    The "TARGET" column is used as the label and every other column as a
    feature. Returns one DataFrame with the feature columns followed by the
    target column.
    """
    from imblearn.over_sampling import ADASYN

    is_target = train.columns == "TARGET"
    x = train.loc[:, ~is_target]
    y = train.loc[:, is_target]

    sampler = ADASYN(random_state=10, ratio="minority")
    X, Y = sampler.fit_sample(x, y)

    feature_frame = pd.DataFrame(X, columns=x.columns)
    target_frame = pd.DataFrame(Y, columns=y.columns)
    return pd.concat([feature_frame, target_frame], axis=1)
def cross_validate(X, y, model):
    """Split, balance the training part with ADASYN and run one model.

    ``model`` selects the evaluation function: 'RF', 'GBC' or 'ABC'. Each is
    called with the balanced training data and the untouched test data, and
    its result is returned. Any other value prints a hint and returns None.
    """
    # Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Balance training data only
    balancer = ADASYN(random_state=10)
    X_train_b, y_train_b = balancer.fit_sample(X_train, y_train)
    split = (X_train_b, y_train_b, X_test, y_test)

    if model == 'RF':
        return RF(*split)
    if model == 'GBC':
        return GBC(*split)
    if model == 'ABC':
        return ABC(*split)
    print('Enter a valid model')
def oversample(X: pd.DataFrame, y: pd.DataFrame, technique: str = 'adasyn'):
    """
    Oversamples the minority class to balance the classes

    :param X: unbalanced dataset as a dataframe
    :param y: labels for the dataset
    :param technique: either 'smote' or 'adasyn' (case-insensitive)
    :return: the balanced dataset and labels
    :raises ValueError: if ``technique`` is not one of the supported names
    """
    # Compare by value: `is` on string literals tests object identity and is
    # not guaranteed to match equal strings.
    method = technique.lower()
    if method == 'adasyn':
        os_method = ADASYN()
    elif method == 'smote':
        os_method = SMOTE()
    else:
        raise ValueError(
            "technique must be 'adasyn' or 'smote', got %r" % (technique,))

    X, y = os_method.fit_sample(X, y)
    return X, y
def runAdasyn(self, ensem_folder, model_h5, save_dir):
    """Oversample every ensemble in encoder space and save the decoded data.

    For each ensemble index below ``self.Config.NUM_ENSEMBLES`` the data and
    one-hot labels are loaded from ``ensem_folder``, the data is passed
    through the encoder, the minority class is oversampled with ADASYN, the
    result is passed through the decoder, and data plus two-column one-hot
    labels are written to ``save_dir``.

    Paths are built by plain string concatenation, so ``ensem_folder`` and
    ``save_dir`` are used as prefixes exactly as given.
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # build and load models (the full autoencoder itself is not used here)
    _, encoder, decoder = self.loadAutoencoder(model_h5)

    for ensem in range(self.Config.NUM_ENSEMBLES):
        tag = str(ensem)
        dat = np.load(ensem_folder + 'ensem_dat' + tag + '.npy')
        lab = np.load(ensem_folder + 'ensem_lab' + tag + '.npy')
        encoded = encoder.predict(dat)

        # 3-D labels: keep the last entry along axis 1, then take class indices
        if len(lab.shape) == 3:
            lab = lab[:, -1, :]
        lab = np.argmax(lab, axis=1)

        # run adasyn
        print(ensem)
        print('run ADASYN')
        ada = ADASYN(ratio='minority', random_state=42)
        print('fit smote object for ensem ' + tag)
        x_res, y_res = ada.fit_sample(encoded, lab)
        x_syn = decoder.predict(x_res)

        # class index 0 -> [1, 0], anything else -> [0, 1]
        y_onehot = np.array([
            np.array([1, 0]) if label == 0 else np.array([0, 1])
            for label in y_res
        ])

        # save data
        print('save ensem ' + tag)
        np.save(save_dir + 'ensem_dat' + tag + '.npy', x_syn)
        np.save(save_dir + 'ensem_lab' + tag + '.npy', y_onehot)
    return
def balance_classes_adasyn(X, y, ratio='auto', random_state=None, k=5):
    """
    Balance the class distribution with the Adaptive Synthetic Sampling
    Approach for Imbalanced Learning (ADASYN).

    :param X: Feature data
    :param y: Class labels
    :param ratio: (str/float) If 'auto', the ratio will be defined
        automatically to balance the dataset. Otherwise, the ratio is defined
        as the number of samples in the minority class over the number of
        samples in the majority class.
    :param random_state: (None/Int) If int, seed used for random number generator
    :param k: (int) Number of nearest neighbors used to construct synthetic samples
    :return: Data set with synthetic samples added
    """
    options = dict(ratio=ratio, random_state=random_state, n_jobs=1, k=k)
    sampler = ADASYN(**options)
    X_adasyn, y_adasyn = sampler.fit_sample(X, y)
    return X_adasyn, y_adasyn
def sensor_balancing(X_train, y_train):
    """Drop very rare classes, then oversample the rest with ADASYN.

    Classes with three or fewer samples are removed from ``y_train`` together
    with the matching rows (by index) of ``X_train``, because the sampler is
    configured with ``n_neighbors=2`` and cannot work with so few samples.
    All classes except the majority are then oversampled.

    :param X_train: training features, indexed like ``y_train``.
    :param y_train: training labels as a pandas Series.
    :return: (features as DataFrame with the original columns, labels as Series)
    """
    # Identify and drop labels that occur at most three times.
    label_counts = y_train.value_counts()
    rare_labels = label_counts[label_counts <= 3].index.values
    y_train = pd.Series(y_train[~y_train.isin(rare_labels)])
    X_train = pd.DataFrame(X_train[X_train.index.isin(list(y_train.index))])
    columns = X_train.columns.values

    # Perform oversampling; fit_resample is the supported entry point in
    # every imblearn release that accepts sampling_strategy.
    adasyn = ADASYN(sampling_strategy='not majority', n_neighbors=2, n_jobs=1)
    X_train, y_train = adasyn.fit_resample(X_train, y_train)

    X_train = pd.DataFrame(X_train, columns=list(columns))
    return X_train, pd.Series(y_train)
def oversample(X, y, bal_strategy):
    """Resample (X, y) with the requested balancing strategy.

    Supported values of ``bal_strategy``:
      * "SMOTESVN" or "ALL": SMOTE with ``kind='svm'``
      * "SMOTE": SMOTE with ``kind='regular'``
      * "ADASYN": ADASYN with default settings
      * "NONE": data returned unchanged

    The shapes of the resulting arrays are printed. Any other value prints
    an error message and terminates the process with exit status 1.

    :return: tuple (X_sampled, y_sampled)
    """
    # "ALL" was listed in every branch but can only ever reach the first
    # one, so it has always meant SMOTE-SVM; the dead alternatives are gone.
    if bal_strategy in ("SMOTESVN", "ALL"):
        X_sampled, y_sampled = SMOTE(kind='svm').fit_sample(X, y)
    elif bal_strategy == "SMOTE":
        X_sampled, y_sampled = SMOTE(kind='regular').fit_sample(X, y)
    elif bal_strategy == "ADASYN":
        X_sampled, y_sampled = ADASYN().fit_sample(X, y)
    elif bal_strategy == 'NONE':
        X_sampled, y_sampled = X, y
    else:
        print('bal_stragegy not in SMOTESVN, SMOTE, ADASYN, ALL, NONE')
        sys.exit(1)

    # Single-argument print(...) behaves the same on Python 2 and 3; the two
    # spaces reproduce the output of the former `print 'label: ', value`.
    print('Shape of X_sampled:  %s' % (X_sampled.shape,))
    print('Shape of y_sampled:  %s' % (y_sampled.shape,))
    return (X_sampled, y_sampled)
from imblearn.over_sampling import ADASYN # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random over-sampling ada = ADASYN() X_resampled, y_resampled = ada.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
color="white" if cm[i, j] > thresh else "black") plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') #define X y X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) #define the size of test #sklearn.model_selection.train_test_split随机划分训练集与测试集 #train_test_split(train_data,train_target,test_size=数字, random_state=0) #ADASYN ada = ADASYN() os_X,os_y = ada.fit_sample(X_train,y_train) os_X = pd.DataFrame(os_X) os_y = pd.DataFrame(os_y) #logistic best_c = printing_Kfold_scores(os_X,os_y) clf_l = LogisticRegression(C = best_c, penalty = 'l1') clf_l.fit(os_X,os_y.values.ravel()) y_pred = clf_l.predict(X_test) #调用ravel()函数将矩阵转变成一维数组 #(ravel()函数与flatten()的区别) # 两者所要实现的功能是一致的(将多维数组降为一维), # 两者的区别在于返回拷贝(copy)还是返回视图(view), # numpy.flatten() 返回一份拷贝,对拷贝所做的修改不会影响(reflects)原始矩阵, # 而numpy.ravel()返回的是视图(view),会影响(reflects)原始矩阵。 y_true, y_pred = y_test, clf_l.predict(X_test)
def test_ada_wrong_nn_obj():
    """An invalid n_neighbors value must make fit_sample raise ValueError."""
    sampler = ADASYN(random_state=RND_SEED, n_neighbors='rnd')
    with raises(ValueError, match="has to be one of"):
        sampler.fit_sample(X, Y)
def test_ada_fit_ratio_error():
    """The ratio {0: 9, 1: 12} must make fit_sample raise ValueError."""
    sampler = ADASYN(ratio={0: 9, 1: 12}, random_state=RND_SEED)
    with raises(ValueError, match="No samples will be generated."):
        sampler.fit_sample(X, Y)