def test_pipeline_sample():
    """Check a pipeline that ends with a sampler, including ``Pipeline.sample``."""
    X, y = make_classification(
        n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3,
        n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1,
        n_samples=5000, random_state=0)
    rus = RandomUnderSampler(random_state=0)

    # Sampler-only pipeline: fit().sample(), fit_sample() and the bare
    # sampler are all compared against the first result.
    sampler_only = Pipeline([('rus', rus)])
    X_ref, y_ref = sampler_only.fit(X, y).sample(X, y)
    X_pipe, y_pipe = sampler_only.fit_sample(X, y)
    X_bare, y_bare = rus.fit_sample(X, y)
    assert_array_almost_equal(X_ref, X_pipe)
    assert_array_almost_equal(X_ref, X_bare)
    assert_array_almost_equal(y_ref, y_pipe)
    assert_array_almost_equal(y_ref, y_bare)

    # PCA followed by the sampler: the pipeline output is compared with
    # running the two steps by hand.
    pca = PCA()
    pca_then_rus = Pipeline([('pca', pca), ('rus', rus)])
    X_ref, y_ref = pca_then_rus.fit(X, y).sample(X, y)
    X_projected = pca.fit_transform(X)
    X_manual, y_manual = rus.fit_sample(X_projected, y)
    assert_array_almost_equal(X_ref, X_manual)
    assert_array_almost_equal(y_ref, y_manual)
def down_sample(X, y, random_state):
    """Return the rows of ``X`` and ``y`` selected by random under-sampling.

    ``X`` and ``y`` are indexed with ``.iloc``, so they are expected to be
    pandas objects. The sampler's own output is not used; only its
    ``sample_indices_`` attribute is read after the call, so the returned
    objects keep their pandas type, with the index reset to 0..n-1.
    """
    sampler = RandomUnderSampler(random_state=random_state)
    _X_unused, _y_unused = sampler.fit_sample(X, y)
    kept = sampler.sample_indices_
    return (
        X.iloc[kept, :].reset_index(drop=True),
        y.iloc[kept].reset_index(drop=True),
    )
def balanceInputData(self):
    """Under-sample the training set and restore its 4-D layout.

    ``self.input_train`` is unpacked as a 4-D array
    ``(samples, height, width, channels)``. It is flattened to one row per
    sample because the sampler works on 2-D data, resampled together with
    ``self.output_train`` using ``RandomUnderSampler(ratio='auto')``, and
    reshaped back. The resampled labels are one-hot encoded with
    ``to_categorical`` using ``self._NUM_CLASSES``.

    Both ``self.input_train`` and ``self.output_train`` are overwritten.
    """
    # Take the image dimensions from the data itself instead of assuming a
    # fixed 100x150x3 layout.
    n_samples, height, width, channels = self.input_train.shape
    flat_train = self.input_train.reshape(n_samples,
                                          height * width * channels)

    sampler = RandomUnderSampler(ratio='auto')
    flat_resampled, labels_resampled = sampler.fit_sample(
        flat_train, self.output_train)

    # Encode labels to one-hot vectors (ex : 2 -> [0,0,1,0,0,0,0,0,0,0])
    labels_one_hot = to_categorical(labels_resampled,
                                    num_classes=self._NUM_CLASSES)

    # A single reshape restores the layout; no per-sample loop is needed.
    images_resampled = flat_resampled.reshape(
        len(flat_resampled), height, width, channels)

    self.input_train = images_resampled
    self.output_train = labels_one_hot
def SVM_classify(X: np.array, lbl: np.array):
    """Train a polynomial-kernel SVC on an under-sampled split and print reports.

    The data is split 80/20, the training part is randomly under-sampled,
    and an ``SVC(kernel='poly', degree=10, gamma='auto')`` is fitted on it.
    The confusion matrix, classification report, true labels and predicted
    labels for the held-out part are printed, followed by the mean 5-fold
    cross-validation score computed on the full (not resampled) ``X``/``lbl``.
    Nothing is returned.
    """
    from imblearn.over_sampling import RandomOverSampler
    from imblearn.under_sampling import RandomUnderSampler
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.svm import SVC

    # Hold out 20% of the data for evaluation.
    X_fit, X_eval, lbl_fit, lbl_eval = train_test_split(X, lbl,
                                                        test_size=0.20)

    # Random under-sampling of the training part only.
    sampler = RandomUnderSampler(return_indices=True)
    X_balanced, lbl_balanced, _kept_idx = sampler.fit_sample(X_fit, lbl_fit)

    model = SVC(kernel='poly', degree=10, gamma='auto')
    model.fit(X_balanced, lbl_balanced)
    predicted = model.predict(X_eval)

    print(confusion_matrix(lbl_eval, predicted))
    print(classification_report(lbl_eval, predicted))
    print(lbl_eval)
    print(predicted)

    cv_scores = cross_val_score(model, X, lbl, cv=5)
    print(cv_scores.mean())
def resample(X_train=None, y_train=None, df=None, balance=None, nclass=None):
    """
    Perform resampling based on chosen method

    :param X_train: tensor of training data (size, dimension)
    :param y_train: tensor of training target (size, 1)
    :param df: dataframe to resample from (only used by 'Bootstrap' and
        'Handsample')
    :param balance: type of resample to perform: 'SMOTE',
        'RandomUnderSampler', 'Bootstrap' or 'Handsample'; any other value
        leaves the inputs untouched
    :param nclass: forwarded to ``bootstrap`` for 'Bootstrap' and
        'Handsample'
    :return: the resampled dataframe for 'Bootstrap'/'Handsample',
        otherwise the tuple ``(X_train, y_train)``
    """
    if balance == 'SMOTE':
        X_train, y_train = SMOTE().fit_sample(X_train, y_train)
        logger.info(f'Using {balance} oversampling')
    elif balance == 'RandomUnderSampler':
        rus = RandomUnderSampler(random_state=0)
        X_train, y_train = rus.fit_sample(X_train, y_train)
        # This branch under-samples; the message used to say "oversampling".
        logger.info(f'Using {balance} undersampling')
    elif balance == 'Bootstrap':
        logger.info(f'Using {balance} oversampling')
        return bootstrap(df, nclass)
    elif balance == 'Handsample':
        logger.info(f'Using {balance} oversampling')
        return bootstrap(df, nclass, if_new=True)
    return X_train, y_train
def downsample(self):
    """Randomly under-sample ``self.X``/``self.y`` and refresh ``self.Xview``.

    The class counts of ``self.y`` are printed before and after resampling.
    ``self.Xview`` becomes a view of the resampled ``self.X`` restricted to
    its first ``self.n_features`` columns.
    """
    print('Current outcome sampling {}'.format(Counter(self.y)))
    sampler = RandomUnderSampler()
    resampled = sampler.fit_sample(self.X, self.y)
    self.X, self.y = resampled
    leading_features = slice(None, self.n_features)
    self.Xview = self.X.view()[:, leading_features]
    print('Resampled dataset shape {}'.format(Counter(self.y)))
def downsample(self):
    """Under-sample ``self.X``/``self.y`` with a fixed seed and log the result.

    The class counts of ``self.y`` are printed before and after resampling.
    The indices returned by the sampler are appended to the file
    ``downsampled_idx`` and the resampled labels to ``downsampled_y`` (one
    line per call, in the current working directory). ``self.Xview``
    becomes a view of the resampled ``self.X`` restricted to its first
    ``self.n_features`` columns.
    """
    print('Current outcome sampling {}'.format(Counter(self.y)))

    # For a different sample on every call, drop random_state (and
    # return_indices if the indices are not needed).
    rus = RandomUnderSampler(random_state=0, return_indices=True)
    self.X, self.y, ds_idx = rus.fit_sample(self.X, self.y)

    # ``with`` closes each file even if the write raises.
    with open('downsampled_idx', 'a') as idx_file:
        idx_file.write(str(ds_idx) + '\n')
    with open('downsampled_y', 'a') as y_file:
        y_file.write(str(self.y) + '\n')

    self.Xview = self.X.view()[:, :self.n_features]
    print('Resampled dataset shape {}'.format(Counter(self.y)))
def fit_divided_hab(self, df_X, df_y):
    """Train ``self.n_estimators`` classifiers, each on its own random under-sample.

    For every round the data is under-sampled, projected with
    ``self.decomposor.transform`` and split 70/30 with a fixed seed. A clone
    of ``self.estimator`` is scored with 10-fold cross-validated ROC AUC on
    the training part, then fitted on it. The score, fitted clone, sampler,
    held-out labels and decision-function scores are appended to
    ``self.score_weights``, ``self.clfs``, ``self.imbs``, ``self.y_tests``
    and ``self.y_scores``.
    """
    for round_no in range(1, self.n_estimators + 1):
        sampler = RandomUnderSampler()
        X_balanced, y_balanced = sampler.fit_sample(df_X, df_y)
        X_projected = self.decomposor.transform(X_balanced)
        X_fit, X_held, y_fit, y_held = train_test_split(
            X_projected, y_balanced, test_size=0.3, random_state=0)

        model = clone(self.estimator)
        fold_scores = cross_val_score(
            model, X_fit, y_fit, cv=10, scoring=make_scorer(roc_auc_score))
        mean_auc = fold_scores.mean()
        if self.verbose:
            print("Estimator %d:" % round_no)
            print("AUC: %.2f" % mean_auc)
        self.score_weights.append(mean_auc)

        model.fit(X_fit, y_fit)
        held_scores = model.decision_function(X_held)

        self.clfs.append(model)
        self.imbs.append(sampler)
        self.y_tests.append(y_held)
        self.y_scores.append(held_scores)
def customSampler(self, X, Y):
    """Run a random under-sampler on ``(X, Y)`` but return the inputs unchanged.

    The sampler is still executed, so any error it raises propagates, but
    its output is discarded. The caller receives the original ``X`` and
    ``Y`` together with the list of all row positions ``0..len(X)-1``.
    """
    # NOTE(review): the resampled output is thrown away on purpose or by
    # accident; confirm before wiring it into the return value. Other
    # samplers (CondensedNearestNeighbour, EditedNearestNeighbours, SMOTE,
    # ClusterCentroids, AllKNN, ...) were tried here previously.
    sampler = RandomUnderSampler(return_indices=True, replacement=False,
                                 ratio=.9)
    all_positions = list(range(len(X)))
    _X_res, _Y_res, _kept = sampler.fit_sample(X, Y)
    return X, Y, all_positions
def downsample(self):
    """Randomly under-sample ``self.X``/``self.y`` and refresh ``self.Xview``.

    The class counts of ``self.y`` are printed before and after resampling.
    ``self.Xview`` becomes a view of the resampled ``self.X`` restricted to
    its first ``self.n_features`` columns.
    """
    before = Counter(self.y)
    print('Current outcome sampling {}'.format(before))

    X_balanced, y_balanced = RandomUnderSampler().fit_sample(self.X, self.y)
    self.X = X_balanced
    self.y = y_balanced
    self.Xview = self.X.view()[:, :self.n_features]

    after = Counter(self.y)
    print('Resampled dataset shape {}'.format(after))
def data_under_sampling(X, y, under_sample_type, ratio, random_state=None):
    '''
    Under-sample a dataset with the selected sampler.

    :param X: data
    :param y: labels
    :param under_sample_type: under-sampling type (str): 'Random',
        'NearMiss' or 'EditedNearestNeighbours'
    :param ratio: ratio = (minority-class samples) / (majority-class
        samples); forwarded to the 'Random' and 'NearMiss' samplers and not
        used by 'EditedNearestNeighbours'
    :param random_state: seed for the random sampling
    :return: the resampled data and labels, as returned by the sampler's
        ``fit_sample``
    :raises ValueError: if ``under_sample_type`` is not one of the
        supported names
    '''
    # Imports stay local so only the selected sampler has to be importable.
    if under_sample_type == 'Random':
        from imblearn.under_sampling import RandomUnderSampler
        under_sampler = RandomUnderSampler(ratio=ratio,
                                           random_state=random_state)
    elif under_sample_type == 'NearMiss':
        from imblearn.under_sampling import NearMiss
        under_sampler = NearMiss(ratio=ratio, random_state=random_state,
                                 version=1)
    elif under_sample_type == 'EditedNearestNeighbours':
        from imblearn.under_sampling import EditedNearestNeighbours
        under_sampler = EditedNearestNeighbours(random_state=random_state)
    else:
        # Previously this fell through with a None sampler and failed with
        # an unrelated AttributeError.
        raise ValueError(
            'Unknown under_sample_type {!r}; expected one of '
            "'Random', 'NearMiss', 'EditedNearestNeighbours'".format(
                under_sample_type))

    X_resampled, y_resampled = under_sampler.fit_sample(X, y)
    return X_resampled, y_resampled
def pipeline(X_train, X_test, y_train, y_test, model):
    """Return the model/score pair with the best ROC AUC over four training sets.

    The training sets are the original data and its random under-sampled,
    random over-sampled and SMOTE ('borderline2') versions, all seeded with
    9. Every estimator in ``model`` is fitted on each training set in turn
    and scored on ``(X_test, y_test)`` with ``roc_auc_score`` of its hard
    predictions. Ties go to the later candidate.

    NOTE(review): the same estimator object is refitted for every training
    set, so the returned estimator holds whatever fit happened last for it,
    not necessarily the fit that produced the returned score.
    """
    training_sets = [[X_train, X_test, y_train, y_test]]
    samplers = (
        RandomUnderSampler(random_state=9),
        RandomOverSampler(random_state=9),
        SMOTE(random_state=9, kind='borderline2'),
    )
    for sampler in samplers:
        X_res, y_res = sampler.fit_sample(X_train, y_train)
        training_sets.append([X_res, X_test, y_res, y_test])

    best = {}
    best_score = 0
    for estimator in model:
        for fit_X, eval_X, fit_y, eval_y in training_sets:
            estimator.fit(fit_X, fit_y)
            score = roc_auc_score(eval_y, estimator.predict(eval_X))
            if score >= best_score:
                best = {estimator: score}
                best_score = score

    winner = list(best.items())[0]
    return winner[0], winner[1]
def undersample(X, y, bal_strategy): print 'Shape of X: ', X.shape print 'Shape of y_Train: ', y.shape if(bal_strategy == "RANDOM" or bal_strategy == "ALL"): # apply random under-sampling rus = RandomUnderSampler() X_sampled, y_sampled = rus.fit_sample(X, y) print 'Shape of X_sampled: ', X_sampled.shape print 'Shape of y_sampled: ', y_sampled.shape elif(bal_strategy == "TOMEK" or bal_strategy == "ALL"): # Apply Tomek Links cleaning tl = TomekLinks() X_sampled, y_sampled = tl.fit_sample(X, y) print 'Shape of X_sampled: ', X_sampled.shape print 'Shape of y_sampled: ', y_sampled.shape elif(bal_strategy == 'NONE'): X_sampled = X y_sampled = y print 'Shape of X_sampled: ', X_sampled.shape print 'Shape of y_sampled: ', y_sampled.shape else: print 'bal_stragegy not in ALL, RANDOM, TOMEK, NONE' sys.exit(1) return (X_sampled, y_sampled)
def __init__(self, X, y, Number_trials, maxdepth_settings=None, scaler=None):
    """Evaluate decision trees of several depths over repeated under-sampled splits.

    For each seed in ``1..Number_trials`` the data is split 75/25
    (stratified), the training part is randomly under-sampled and, when
    ``scaler`` is given, both parts are scaled with a scaler fitted on the
    training part. A ``DecisionTreeClassifier`` is trained for every depth
    and its train/test accuracy recorded.

    Results are stored on the instance: ``score``/``sc_train`` (mean test /
    train accuracy per depth), ``std_score``/``std_train`` (their standard
    deviations), ``top_predictor`` (the column of ``X`` with the largest
    feature importance at the best depth) and a bar chart of the
    importances drawn with matplotlib.

    :param X: feature table; ``X.columns`` is read for the feature names
    :param y: target labels
    :param Number_trials: number of seeded train/test splits
    :param maxdepth_settings: depths to evaluate; when ``None`` the
        ``maxdepth_settings`` attribute already present on the object (for
        example a class-level default) is used
    :param scaler: optional object with ``fit``/``transform``
    :raises ValueError: if no depth settings are available
    """
    if maxdepth_settings is not None:
        self.maxdepth_settings = maxdepth_settings
    # Use one effective value everywhere. Previously the progress total read
    # the attribute while the loops read the raw argument, so the default
    # of None could never work.
    depths = getattr(self, 'maxdepth_settings', None)
    if depths is None:
        raise ValueError('maxdepth_settings was not given and no default '
                         'is defined on the object')
    self.var = depths

    score_train = []
    score_test = []
    with tqdm(total=Number_trials * len(depths)) as pb:
        for seed in range(1, Number_trials + 1, 1):
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, stratify=y, test_size=0.25, random_state=seed)
            under_samp = RandomUnderSampler()
            X_train, y_train = under_samp.fit_sample(X_train, y_train)
            if scaler is not None:
                scaler_inst = scaler.fit(X_train)
                X_train = scaler_inst.transform(X_train)
                X_test = scaler_inst.transform(X_test)
            pb.set_description(f'Trial: {seed}')
            training_accuracy = []
            test_accuracy = []
            for depth in depths:
                # build the model
                tree = DecisionTreeClassifier(max_depth=depth,
                                              random_state=42)
                tree.fit(X_train, y_train)
                # record training set accuracy
                training_accuracy.append(tree.score(X_train, y_train))
                # record generalization accuracy
                test_accuracy.append(tree.score(X_test, y_test))
                pb.update(1)
            score_train.append(training_accuracy)
            score_test.append(test_accuracy)

    self.score = np.mean(score_test, axis=0)
    self.sc_train = np.mean(score_train, axis=0)
    self.std_score = np.std(score_test, axis=0)
    self.std_train = np.std(score_train, axis=0)

    # Get the top predictor. The tree is refitted on the training data left
    # over from the last trial.
    best_depth = depths[np.argmax(self.score)]
    tree = DecisionTreeClassifier(max_depth=best_depth, random_state=42)
    tree.fit(X_train, y_train)
    self.top_predictor = X.columns[np.argmax(tree.feature_importances_)]

    abs_mean_coefs = np.abs(tree.feature_importances_)
    coefs_count = len(abs_mean_coefs)
    fig, ax = plt.subplots(figsize=(3, 5))
    ax.barh(np.arange(coefs_count), sorted(abs_mean_coefs))
    ax.set_yticks(np.arange(coefs_count))
    ax.set_yticklabels(X.columns[np.argsort(abs_mean_coefs)])
    return
def rando_under(df, to_drop, flag, verbose=0):
    """Return a randomly under-sampled copy of ``df``, indexed by ``id_subs_id``.

    :param df: source dataframe; its index is turned into a column by
        ``reset_index`` before sampling and the result is re-indexed on the
        column ``'id_subs_id'``, so that column must exist afterwards
        (presumably it is the name of the original index; confirm)
    :param to_drop: column labels to remove from the features before
        sampling; labels that are not present are skipped
    :param flag: name of the target column in ``df``
    :param verbose: when > 0, print the labels that could not be dropped
    :return: dataframe of the sampled features plus the ``flag`` column
    """
    y = df[flag].copy()
    X = df[:].copy()
    X = X.reset_index(drop=False)

    not_dropped = []
    for column in to_drop:
        try:
            X.drop(column, axis=1, inplace=True)
        except KeyError:
            # Only "label not found" is expected here. A bare except would
            # also hide real errors and KeyboardInterrupt.
            not_dropped.append(column)
    if verbose > 0 and len(not_dropped) > 0:
        print("\nUnable to drop:\n", list(set(not_dropped)))

    rus = RandomUnderSampler(ratio='majority', random_state=42,
                             replacement=False)
    X_rus, y_rus = rus.fit_sample(X, y)

    df_rus = pd.DataFrame(X_rus, columns=X.columns)
    df_rus[flag] = y_rus
    df_rus = df_rus.set_index('id_subs_id', drop=True)
    return df_rus
def under_sampling(x, y):
    """Return the row positions of ``x`` kept by a 3:1 random under-sampling.

    The sampler is run on the row positions ``0..len(x)-1`` rather than on
    ``x`` itself, with the requested counts ``{0: 3 * n_positive,
    1: n_positive}`` where ``n_positive = sum(y)`` (so ``y`` is expected to
    hold 0/1 labels).

    :return: 1-D array of the selected row positions
    """
    n_positive = sum(y)
    positions = np.arange(len(x)).reshape((-1, 1))
    rus = RandomUnderSampler(ratio={0: n_positive * 3, 1: n_positive})
    positions_resampled, _y_resampled = rus.fit_sample(positions, y)
    # Only the positions are returned; the fancy-indexed copy of ``x`` that
    # used to be built here was never used.
    return positions_resampled.reshape((-1, ))
def transform(self, X, y=None):
    """Return a randomly under-sampled copy of the dataframe ``X``.

    The column named by ``self.predicted_column`` is used as the target;
    the ``y`` argument is ignored. The sampler is seeded with
    ``self.random_seed``. The result is a new dataframe with the original
    feature column names and the target column appended last.
    """
    # TODO how do we validate this happens before train/test split? Or do we need to? Can we implement it in the
    # TODO simple trainer in the correct order and leave this to advanced users?
    target_name = self.predicted_column

    # Separate the target from the features.
    target = np.squeeze(X[[target_name]])
    features = X.drop([target_name], axis=1)

    sampler = RandomUnderSampler(random_state=self.random_seed)
    sampled_features, sampled_target = sampler.fit_sample(features, target)

    # Rebuild a dataframe with the original column names and the target.
    result = pd.DataFrame(sampled_features)
    result.columns = features.columns
    result[target_name] = pd.Series(sampled_target)
    return result
def train_linear_SVC():
    """Train a ``LinearSVC`` on under-sampled data, pickle it and print accuracy.

    The features come from ``encode_features`` and are split with
    ``create_train_test_split``. The training part is randomly
    under-sampled, the classifier is fitted on it and written to
    ``linear_SVC_relations.pkl`` in the current working directory.
    Predictions and accuracy on the test part are printed.
    """
    df = encode_features()
    print(df)
    classifier = LinearSVC()
    X_train, X_test, y_train, y_test = create_train_test_split(df)

    # TODO other sampling techniques, e.g. ros = RandomOverSampler(); X_ros, y_ros = ros.fit_sample(X, y)
    rus = RandomUnderSampler(return_indices=True)
    X_rus, y_rus, id_rus = rus.fit_sample(X_train, y_train)
    print('Removed indexes:', len(X_train), len(id_rus), id_rus)

    classifier.fit(X_rus, y_rus)
    # ``with`` guarantees the pickle file is flushed and closed.
    with open("linear_SVC_relations.pkl", 'wb') as model_file:
        pickle.dump(classifier, model_file)

    # Predict class
    y_pred = classifier.predict(X_test)
    print(y_pred)

    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print("acc: ", accuracy)
def main():
    """Grid-search an AdaBoost classifier on randomly under-sampled training data.

    The training frame from ``b.train_generator()`` is indexed by
    ``('vipno', 'pluno')``; its last column is the target and the others
    are features. The best parameters, score and estimator (ROC AUC, 5-fold
    CV) are printed.
    """
    train_datas = b.train_generator()
    # train_datas.to_csv('X.csv')
    train_datas.set_index(['vipno', 'pluno'], inplace=True, drop=True)
    # ``.values`` replaces ``as_matrix()``, which was removed from pandas.
    train = train_datas.values
    X = np.delete(train, train.shape[1] - 1, axis=1)
    y = train[:, train.shape[1] - 1]

    # Under-sample so that positive and negative sample counts are close.
    rus = RandomUnderSampler(return_indices=True)
    X, y, _idx_resampled = rus.fit_sample(X, y)

    # NOTE(review): np.arange(0.01, 0.1, 10) has step 10 and therefore
    # yields the single value 0.01; confirm whether a real range of
    # learning rates was intended.
    param_test1 = {
        'n_estimators': range(30, 101, 10),
        'learning_rate': np.arange(0.01, 0.1, 10),
    }
    gsearch1 = GridSearchCV(
        estimator=AdaBoostClassifier(
            base_estimator=DecisionTreeClassifier(max_depth=20)),
        param_grid=param_test1, scoring='roc_auc', cv=5)
    gsearch1.fit(X, y)

    print("Best param: {}".format(gsearch1.best_params_))
    print("Best score: {}".format(gsearch1.best_score_))
    print("Best estimator: {}".format(gsearch1.best_estimator_))
    print("****************************")
def main():
    """Grid-search a gradient-boosting classifier on under-sampled training data.

    The training frame from ``ci.train_generator()`` is first written to
    ``X.csv``, then indexed by ``'vipno'``; its last column is the target
    and the others are features. The best parameters, score and estimator
    (precision, 5-fold CV) are printed.
    """
    train_datas = ci.train_generator()
    train_datas.to_csv('X.csv')
    train_datas.set_index(['vipno'], inplace=True, drop=True)
    # ``.values`` replaces ``as_matrix()``, which was removed from pandas.
    train = train_datas.values
    X = np.delete(train, train.shape[1] - 1, axis=1)
    y = train[:, train.shape[1] - 1]

    # Under-sample so that positive and negative sample counts are close.
    rus = RandomUnderSampler(return_indices=True)
    X, y, _idx_resampled = rus.fit_sample(X, y)

    # NOTE(review): np.arange(0.01, 0.1, 10) has step 10 and therefore
    # yields the single value 0.01; confirm whether a real range of
    # learning rates was intended.
    param_test1 = {
        'n_estimators': range(10, 61, 10),
        'learning_rate': np.arange(0.01, 0.1, 10),
        'max_depth': range(1, 14, 2),
        'max_features': range(7, 20, 2),
        'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
    }
    gsearch1 = GridSearchCV(
        estimator=GradientBoostingClassifier(learning_rate=0.01,
                                             n_estimators=50, max_depth=7),
        param_grid=param_test1, scoring='precision', cv=5)
    gsearch1.fit(X, y)

    print("Best param: {}".format(gsearch1.best_params_))
    print("Best score: {}".format(gsearch1.best_score_))
    print("Best estimator: {}".format(gsearch1.best_estimator_))
    print("****************************")
def transform(self, X, y=None):
    """Under-sample the dataframe ``X`` on its predicted column.

    ``self.predicted_column`` names the target inside ``X``; the ``y``
    argument is overwritten and therefore ignored. Returns a new dataframe
    holding the sampled features (original column names) followed by the
    sampled target column.
    """
    # TODO how do we validate this happens before train/test split? Or do we need to? Can we implement it in the
    # TODO simple trainer in the correct order and leave this to advanced users?
    y = np.squeeze(X[[self.predicted_column]])
    without_target = X.drop([self.predicted_column], axis=1)

    sampler = RandomUnderSampler(random_state=self.random_seed)
    sampled = sampler.fit_sample(without_target, y)
    x_sampled, y_sampled = sampled

    rebuilt = pd.DataFrame(x_sampled)
    rebuilt.columns = without_target.columns
    y_sampled = pd.Series(y_sampled)
    rebuilt[self.predicted_column] = y_sampled
    return rebuilt
def RUS(X_train, y_train):
    """Randomly under-sample the training data and return it as dataframes.

    The feature frame keeps the column labels obtained from iterating
    ``X_train``; the target frame has a single column named ``'Target'``.
    """
    feature_names = list(X_train)
    sampler = RandomUnderSampler()
    features, target = sampler.fit_sample(X_train, y_train)
    features_df = pd.DataFrame(features, columns=feature_names)
    target_df = pd.DataFrame(target, columns=['Target'])
    return features_df, target_df
def train_and_evaluate_classifier(clf, model, X_train, y_train, X_test, y_test, operation='hadamard', undersample=True, ratio=0.8, print_progress=True):
    """Fit ``clf`` on edge features of the training pairs and report test metrics.

    Each element of ``X_train``/``X_test`` is unpacked as a pair ``(u, v)``
    and turned into a feature vector with
    ``model.get_edge_features(u, v, operation)``. When ``undersample`` is
    true the training pairs are first randomly under-sampled; the test
    pairs are never resampled.

    :return: dict with keys ``micro_f1``, ``macro_f1``,
        ``average_percision_score``, ``auc``, ``confusion_matrix``,
        ``classification_report``, ``kappa`` and ``mathew``
    """
    def edge_features(pairs):
        # One feature vector per (u, v) pair.
        return np.array(
            [model.get_edge_features(u, v, operation) for u, v in pairs])

    # NOTE(review): the target class counts are derived from y_test but the
    # sampler is applied to the training data -- confirm this is intended.
    num_negative = len(y_test[y_test == 0])
    num_positive = len(y_test[y_test == 1])
    positive_to_sample = min(int(num_negative * ratio), num_positive)
    sampler = RandomUnderSampler(
        return_indices=False,
        ratio={0: num_negative, 1: positive_to_sample})

    if undersample:
        train_pairs, train_labels = sampler.fit_sample(X_train, y_train)
    else:
        train_pairs, train_labels = X_train, y_train

    if print_progress:
        print('Assembling training set features....')
    train_features = edge_features(train_pairs)

    if print_progress:
        print('Fitting classifier model')
    clf.fit(train_features, train_labels)

    if print_progress:
        print('Assembling testing set features')
    test_features = edge_features(X_test)
    truth = y_test

    predicted = clf.predict(test_features)
    positive_prob = clf.predict_proba(test_features)[:, 1]

    macro_f1 = metrics.f1_score(truth, predicted, average='macro')
    micro_f1 = metrics.f1_score(truth, predicted, average='micro')
    avg_precision = metrics.average_precision_score(truth, positive_prob)
    auc = metrics.roc_auc_score(truth, positive_prob)
    kappa = metrics.cohen_kappa_score(truth, predicted)
    mathew = metrics.matthews_corrcoef(truth, predicted)
    confusion = metrics.confusion_matrix(truth, predicted)
    report = metrics.classification_report(truth, predicted)

    return {
        'micro_f1': micro_f1,
        'macro_f1': macro_f1,
        'average_percision_score': avg_precision,
        'auc': auc,
        'confusion_matrix': confusion,
        'classification_report': report,
        'kappa': kappa,
        'mathew': mathew,
    }
def subsample_data(X, y, ratio=0.5, seed=None, size=1100):
    """Randomly under-sample ``X``/``y`` to roughly ``size`` rows in total.

    :param X: pandas object holding the features (its ``.values`` is used)
    :param y: pandas object holding the labels (its ``.values`` is
        flattened)
    :param ratio: fraction of ``size`` requested for class 0; class 1 gets
        the remaining ``1 - ratio``
    :param seed: random state for the sampler
    :param size: total number of rows requested (default 1100, the
        previously hard-coded value)
    :return: whatever the sampler's ``fit_sample`` returns for the
        resampled features and labels
    """
    rus = RandomUnderSampler(ratio={
        0: int(size * ratio),
        1: int(size * (1 - ratio)),
    }, random_state=seed)
    # ``.values`` replaces ``as_matrix()``, which was removed from pandas.
    return rus.fit_sample(X.values, y.values.ravel())
def get_training_data(X, y):
    """Return ``(X, y)`` after random under-sampling, for use as training data."""
    sampler = RandomUnderSampler()
    features, labels = sampler.fit_sample(X, y)
    return features, labels
def _get_trained_model(X, y):
    """Fit a calibrated ``LinearSVC`` on randomly under-sampled data.

    The sampler is seeded with 0. The ``LinearSVC`` is wrapped in
    ``CalibratedClassifierCV`` with ``cv=5`` and the fitted wrapper is
    returned.
    """
    sampler = RandomUnderSampler(random_state=0)
    X_balanced, y_balanced = sampler.fit_sample(X, y)
    calibrated = CalibratedClassifierCV(LinearSVC(), cv=5)
    calibrated.fit(X_balanced, y_balanced)
    return calibrated
def _random_sampler(self, X, y, strategy, sample_ratio):
    """Randomly over- or under-sample ``(X, y)``.

    ``strategy == 'oversample'`` selects ``RandomOverSampler``; any other
    value selects ``RandomUnderSampler``. ``sample_ratio`` is passed on as
    the sampler's ``ratio``.
    """
    if strategy == 'oversample':
        sampler = RandomOverSampler(ratio=sample_ratio)
    else:
        sampler = RandomUnderSampler(ratio=sample_ratio)
    features, labels = sampler.fit_sample(X, y)
    return features, labels
def create_train_test_model(df, cnt_col, feature_col):
    """Scale, under-sample and split ``df`` into train and test parts.

    The data is scaled with ``scaled_fit_transform``, separated into
    features and target with ``create_x_y``, randomly under-sampled
    (seed 42) and split 70/30 (seed 42).

    :return: ``X_train, y_train, X_test, y_test``. Note the order differs
        from the one ``train_test_split`` itself returns.
    """
    scaled = scaled_fit_transform(df, cnt_col)
    features, target = create_x_y(scaled, feature_col)

    sampler = RandomUnderSampler(random_state=42)
    features, target = sampler.fit_sample(features, target)

    parts = train_test_split(features, target, test_size=0.3,
                             random_state=42)
    X_train, X_test, y_train, y_test = parts
    return X_train, y_train, X_test, y_test
def under_sampling(self, _X, _y):
    """
    Randomly under-sample an unbalanced training set.

    :return: X_resampled, y_resampled
    """
    sampler = RandomUnderSampler()
    resampled = sampler.fit_sample(_X, _y)
    return resampled
def under_sampler(df):
    """Split off the ``'y'`` column of ``df`` and randomly under-sample the rest.

    ``df`` itself is not modified (a copy is used). Shapes are printed
    before and after sampling. The sampler is seeded with 42.

    :return: the under-sampled features and target
    """
    features = df.copy()
    target = features.pop('y')
    print(features.shape, target.shape)

    sampler = RandomUnderSampler(random_state=42)
    features_under, target_under = sampler.fit_sample(features, target)
    print(features_under.shape, target_under.shape)
    return features_under, target_under
class kp_classifier_dataset(object):
    """
    This class is dedicated to building and rearranging classifier
    datasource
    """

    def load_data(self):
        """Load the yearly HDF5 files (1995-2018) into ``self.O``.

        File names are built from ``BASE_LOCATION``, ``self.file_ext`` and
        the year. The ``Kp`` and ``STORM`` columns are then copied from the
        frame returned by ``utils.to_linear_Kp(delay_unit=1)``.
        """
        file_store = BASE_LOCATION + "{ext}_%d.h5".replace(
            "{ext}", str(self.file_ext))
        # Read every year first and concatenate once; concatenating inside
        # the loop re-copies the accumulated frame on every iteration.
        yearly_frames = [
            pd.read_hdf(file_store % year, mode="r", key="df",
                        parse_dates=True)
            for year in range(1995, 2019)
        ]
        self.O = pd.concat(yearly_frames)
        _x = utils.to_linear_Kp(delay_unit=1)
        self.O["Kp"] = _x["Kp"]
        self.O["STORM"] = _x["STORM"]
        return

    def __init__(self, look_back=3, file_ext=180):
        """Load the data and set up the feature/target names and scalers.

        :param look_back: number of past rows placed in each look-back
            window
        :param file_ext: value substituted into the HDF5 file names
        """
        self.look_back = look_back
        self.file_ext = file_ext
        self.load_data()
        self.xparams = [
            "Bx", "B_T", "THETA", "SIN_THETA", "V", "n", "T", "P_DYN",
            "BETA", "MACH_A", "STORM"
        ]
        self.yparam = ["STORM"]
        self.O = utils.transform_variables(self.O)
        self.sclX = MinMaxScaler(feature_range=(0, 1))
        self.sclY = MinMaxScaler(feature_range=(0, 1))
        return

    def form_look_back_array(self, X, y):
        """Build look-back windows from ``X`` with their matching targets.

        For every row index ``i`` from ``look_back + 1`` to ``len(X) - 1``
        the window is the transpose of rows ``i - look_back .. i - 1`` of
        ``X`` and the target is ``y[i]``.

        :return: ``(windows, targets)`` as numpy arrays
        """
        dataX, dataY = [], []
        for i in range(self.look_back + 1, len(X)):
            window = X[i - self.look_back:i, :].T
            dataX.append(window)
            dataY.append(y[i].tolist())
        return np.array(dataX), np.array(dataY)

    def create_master_model_data(self):
        """Return train/test look-back arrays built from the loaded data.

        Features are min-max scaled and randomly under-sampled, the targets
        are one-hot encoded, look-back windows are formed and the result is
        split 2/3 - 1/3 with a fixed seed.

        :return: ``X_train, X_test, y_train, y_test``
        """
        # NOTE(review): the windows are formed after the under-sampling
        # step; confirm the resampled rows are still in the order the
        # look-back windows assume.
        X, y = self.O[self.xparams].values, self.O[
            self.yparam].values.ravel()
        X = self.sclX.fit_transform(X)
        self.rus = RandomUnderSampler(return_indices=True)
        X_resampled, y_resampled, _idx_resampled = self.rus.fit_sample(X, y)
        y_bin = to_categorical(y_resampled)
        X_resampled, y_resampled = self.form_look_back_array(
            X_resampled, y_bin)
        X_train, X_test, y_train, y_test = train_test_split(
            X_resampled, y_resampled, test_size=1.0 / 3.0, random_state=42)
        return X_train, X_test, y_train, y_test
def random_undersampling(x, y):
    """Randomly under-sample ``(x, y)`` with seed 42, printing class counts.

    :return: the sampled features and labels
    """
    print('Original dataset shape {}'.format(Counter(y)))
    sampler = RandomUnderSampler(random_state=42)
    sampled = sampler.fit_sample(x, y)
    x_sampled, y_sampled = sampled
    counts_after = Counter(y_sampled)
    print('With RandomUnderSampler sampled dataset shape {}'.format(counts_after))
    return x_sampled, y_sampled
def model_(X, y, rs):
    """Fit a ``CatBoostClassifier`` on ``(X, y)`` after random under-sampling.

    ``rs`` seeds the sampler. The categorical feature indices are read from
    the module-level ``categorical_features_indices``.

    :return: the fitted classifier
    """
    sampler = RandomUnderSampler(random_state=rs)
    X_balanced, y_balanced = sampler.fit_sample(X, y)

    # Base model
    base_model = CatBoostClassifier(
        loss_function='Logloss',
        logging_level='Silent',
        cat_features=categorical_features_indices)
    base_model.fit(X_balanced, y_balanced)
    return base_model
def DownsampleMajority(self):
    """Replace ``self.x``/``self.y`` with a randomly under-sampled version.

    Class counts are printed before and after. The indices returned by the
    sampler are not kept.
    """
    print('Original dataset shape {}'.format(Counter(self.y)))
    sampler = RandomUnderSampler(return_indices=True)
    x_balanced, y_balanced, _kept = sampler.fit_sample(self.x, self.y)
    print('Resampled dataset shape {}'.format(Counter(y_balanced)))
    self.x, self.y = x_balanced, y_balanced
    return
def cross_validation(classifier, data, labels, n_of_fold, threshold=0.5, method="SMOTE", ratio=0.5):
    """Run shuffled K-fold cross-validation with resampling of each training fold.

    ``data`` and ``labels`` are indexed with ``.iloc``. For every fold the
    training part is resampled (``method == 'under'`` uses
    RandomUnderSampler, ``'smotetomek'`` uses SMOTETomek, anything else
    uses SMOTE, each with ``ratio=float(ratio)``), the classifier is fitted
    on it and evaluated on the untouched test part. ``threshold`` is
    accepted but not used.

    :return: ``(true_pos, false_pos, true_neg, false_neg, real_labels,
        probability_labels)``. The first four are arrays with one entry per
        fold; the last two are lists collecting the test labels and the
        predicted probabilities of the second class over all folds.
    """
    fold_counts = []          # one (tp, fp, tn, fn) tuple per fold
    real_labels = []
    probability_labels = []

    # KFold builds the folds from the requested number of splits.
    splitter = KFold(n_splits=n_of_fold, shuffle=True, random_state=42)
    for train_ind, test_ind in splitter.split(data):
        data_train, data_test = data.iloc[train_ind], data.iloc[test_ind]
        labels_train, labels_test = (labels.iloc[train_ind],
                                     labels.iloc[test_ind])

        # Pick the resampling method for this fold.
        if method == 'under':
            sampler = RandomUnderSampler(ratio=float(ratio))
        elif method == 'smotetomek':
            sampler = SMOTETomek(ratio=float(ratio))
        else:
            sampler = SMOTE(ratio=float(ratio))
        data_resampling, labels_resampling = sampler.fit_sample(
            data_train, labels_train)

        # Train on the resampled fold, evaluate on the untouched test fold.
        classifier.fit(data_resampling, labels_resampling)
        predicted = classifier.predict(data_test)
        tn, fp, fn, tp = confusion_matrix(labels_test, predicted).ravel()

        # Probabilities of the second class, usable for a ROC curve.
        predicted_probability = classifier.predict_proba(data_test)[:, 1]

        fold_counts.append((tp, fp, tn, fn))
        real_labels.extend(labels_test)
        probability_labels.extend(predicted_probability)

    true_pos = np.array([counts[0] for counts in fold_counts])
    false_pos = np.array([counts[1] for counts in fold_counts])
    true_neg = np.array([counts[2] for counts in fold_counts])
    false_neg = np.array([counts[3] for counts in fold_counts])
    return true_pos, false_pos, true_neg, false_neg, real_labels, probability_labels
def test_multiclass_fit_sample():
    """Each of three classes must end up with two samples after resampling."""
    # Turn the module-level binary target into a three-class one.
    y = Y.copy()
    for position in (5, 6):
        y[position] = 2

    sampler = RandomUnderSampler(random_state=RND_SEED)
    _X_resampled, y_resampled = sampler.fit_sample(X, y)

    class_counts = Counter(y_resampled)
    for label in (0, 1, 2):
        assert class_counts[label] == 2
def test_rus_fit_sample():
    """Compare ``fit_sample`` output with the stored ground-truth arrays."""
    sampler = RandomUnderSampler(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Ground truth lives in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    X_expected = np.load(os.path.join(data_dir, 'rus_x.npy'))
    y_expected = np.load(os.path.join(data_dir, 'rus_y.npy'))

    assert_array_equal(X_resampled, X_expected)
    assert_array_equal(y_resampled, y_expected)
def test_rus_fit_sample():
    """Check ``fit_sample`` with ``replacement=True`` against fixed expected values."""
    sampler = RandomUnderSampler(random_state=RND_SEED, replacement=True)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.04352327, -0.20515826],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_pipeline_sample():
    """Check a pipeline that ends with a sampler, including ``Pipeline.sample``."""
    X, y = make_classification(
        n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3,
        n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1,
        n_samples=5000, random_state=0)
    rus = RandomUnderSampler(random_state=0)

    # Sampler-only pipeline: fit().sample(), fit_sample() and the bare
    # sampler are all compared against the first result.
    sampler_only = Pipeline([('rus', rus)])
    X_ref, y_ref = sampler_only.fit(X, y).sample(X, y)
    X_pipe, y_pipe = sampler_only.fit_sample(X, y)
    X_bare, y_bare = rus.fit_sample(X, y)
    assert_allclose(X_ref, X_pipe, rtol=R_TOL)
    assert_allclose(X_ref, X_bare, rtol=R_TOL)
    assert_allclose(y_ref, y_pipe, rtol=R_TOL)
    assert_allclose(y_ref, y_bare, rtol=R_TOL)

    # PCA + sampler: the pipeline gets its own PCA instance, the manual
    # path uses a separate one.
    pca = PCA()
    pca_then_rus = Pipeline([('pca', PCA()), ('rus', rus)])
    X_ref, y_ref = pca_then_rus.fit(X, y).sample(X, y)
    X_projected = pca.fit_transform(X)
    X_manual, y_manual = rus.fit_sample(X_projected, y)

    # Values within R_TOL of zero are set to zero before comparing, as a
    # workaround for differences in the PCA output near zero.
    for projected in (X_ref, X_manual):
        projected[np.abs(projected) < R_TOL] = 0
    assert_allclose(X_ref, X_manual, rtol=R_TOL)
    assert_allclose(y_ref, y_manual, rtol=R_TOL)
def test_rus_fit_sample_with_indices():
    """Compare ``fit_sample`` output, including indices, with stored arrays."""
    sampler = RandomUnderSampler(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Ground truth lives in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    X_expected = np.load(os.path.join(data_dir, 'rus_x.npy'))
    y_expected = np.load(os.path.join(data_dir, 'rus_y.npy'))
    idx_expected = np.load(os.path.join(data_dir, 'rus_idx.npy'))

    assert_array_equal(X_resampled, X_expected)
    assert_array_equal(y_resampled, y_expected)
    assert_array_equal(idx_under, idx_expected)
def test_rus_fit_sample_with_indices():
    """Check ``fit_sample`` with ``return_indices=True`` against fixed values."""
    sampler = RandomUnderSampler(return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.04352327, -0.20515826],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1])
    expected_idx = np.array([1, 3, 8, 6, 7, 0])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def CrossVal(estimator, X, y,procsessor=None,cv=3,times=10,random_state=0,imb=False):
    """
    Repeated stratified cross-validation.

    estimator: the model
    X: feature part of the dataset
    y: labels of the dataset
    procsessor: optional preprocessor (used for feature selection); it is
        fitted on each training fold and applied to both folds
    cv: number of folds per repetition
    times: number of repetitions; repetition t uses seed random_state + t
    random_state: base random seed
    imb: when equal to True, the training fold is balanced with random
        under-sampling before fitting

    Returns an array with one ``Metrics.Score`` result per fold, over all
    repetitions.
    """
    scores = []
    for repeat in range(times):
        seed = random_state + repeat
        splitter = StratifiedKFold(n_splits=cv, shuffle=True,
                                   random_state=seed)
        folds = list(splitter.split(X=X, y=y))
        for train_idx, test_idx in folds:
            x_train, y_train = X[train_idx], y[train_idx]
            x_test, y_test = X[test_idx], y[test_idx]
            if imb == True:
                # The label counts are computed but not used afterwards.
                n, p = __lableCount(y_train)
                sampler = RandomUnderSampler(random_state=seed)
                x_train, y_train = sampler.fit_sample(x_train, y_train)
            if procsessor is not None:
                procsessor.fit(x_train, y_train)
                x_train, y_train = procsessor.transform(x_train, y_train)
                x_test, y_test = procsessor.transform(x_test, y_test)
            estimator.fit(x_train, y_train)
            scores.append(Metrics.Score(estimator, x_test, y_test))
    return np.array(scores)
def test_rus_fit_sample_half():
    """Check ``fit_sample`` with ``ratio=0.5`` against fixed expected values."""
    sampler = RandomUnderSampler(ratio=0.5, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
        [0.09125309, -0.85409574],
        [0.12372842, 0.6536186],
        [0.04352327, -0.20515826],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_multiclass_fit_sample():
    """Each of three classes must end up with two samples after resampling."""
    # Turn the module-level binary target into a three-class one.
    y = Y.copy()
    for position in (5, 6):
        y[position] = 2

    sampler = RandomUnderSampler(random_state=RND_SEED)
    _X_resampled, y_resampled = sampler.fit_sample(X, y)

    # Check the per-class sizes of the resampled target.
    class_counts = Counter(y_resampled)
    for label in (0, 1, 2):
        assert_equal(class_counts[label], 2)
def test_rus_fit_sample_half():
    """Check ``fit_sample`` with a dict ratio and replacement against fixed values."""
    requested_counts = {0: 3, 1: 6}
    sampler = RandomUnderSampler(ratio=requested_counts,
                                 random_state=RND_SEED, replacement=True)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.92923648, 0.76103773],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
        [0.15490546, 0.3130677],
        [0.20792588, 1.49407907],
        [0.15490546, 0.3130677],
        [0.12372842, 0.6536186],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])

    print(X_resampled)
    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
print(__doc__)

# Generate an imbalanced two-class dataset (10% / 90%).
X, y = make_classification(
    n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3,
    n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1,
    n_samples=200, random_state=10)

# Project onto two principal components so the data can be plotted.
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Apply the random under-sampling and project the kept samples.
rus = RandomUnderSampler(return_indices=True)
X_resampled, y_resampled, idx_resampled = rus.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# Samples of the original set that the sampler did not keep.
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled)
idx_class_0 = y_resampled == 0

# Plot the kept samples per class, then the removed ones.
for points, label in (
        (X_res_vis[idx_class_0], 'Class #0'),
        (X_res_vis[~idx_class_0], 'Class #1'),
        (X_vis[idx_samples_removed], 'Removed samples')):
    plt.scatter(points[:, 0], points[:, 1], alpha=.8, label=label)
def func(X, y, ratio, random_state):
    """Randomly under-sample ``(X, y)`` with the given ``ratio`` and seed.

    :return: whatever the sampler's ``fit_sample`` returns
    """
    sampler = RandomUnderSampler(ratio=ratio, random_state=random_state)
    resampled = sampler.fit_sample(X, y)
    return resampled