def test_error_wrong_object():
    """Check that SMOTEENN rejects invalid ``smote`` and ``enn`` arguments.

    A plain string is passed in place of each estimator in turn, and
    ``fit_resample`` is expected to raise a ``ValueError`` whose message
    matches the given pattern.
    """
    not_an_estimator = 'rnd'

    # Invalid object in the ``smote`` slot.
    sampler = SMOTEENN(smote=not_an_estimator, random_state=RND_SEED)
    with raises(ValueError, match="smote needs to be a SMOTE"):
        sampler.fit_resample(X, Y)

    # Invalid object in the ``enn`` slot.
    sampler = SMOTEENN(enn=not_an_estimator, random_state=RND_SEED)
    with raises(ValueError, match="enn needs to be an "):
        sampler.fit_resample(X, Y)
def get_smotenn(X_trn, y_trn, seed=int(623 * 449)):
    """Resample a training set with SMOTEENN.

    Parameters
    ----------
    X_trn : feature matrix passed to ``SMOTEENN.fit_resample``.
    y_trn : target labels passed to ``SMOTEENN.fit_resample``.
    seed : value used as the sampler's ``random_state``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as produced by the sampler.
    """
    resampler = SMOTEENN(random_state=seed)
    X_resampled, y_resampled = resampler.fit_resample(X_trn, y_trn)
    return X_resampled, y_resampled
def smoter(df):
    """Rebalance the quote data with SMOTE followed by ENN cleaning.

    The ``QuoteConversion_Flag`` column is used as the target and every
    other column as a feature.  Class counts are printed before and after
    resampling.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``Quote_ID`` and ``QuoteConversion_Flag`` columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data, target)``: the resampled features (columns taken from the
        module-level ``FIELDS``) and a single-column target frame.
    """
    # The ID column is read but not used afterwards (kept as in the original).
    IDs = df.Quote_ID
    labels = df.QuoteConversion_Flag
    feature_matrix = df.drop(['QuoteConversion_Flag'], axis=1).values
    print("Before SMOTE: ", sorted(Counter(labels).items()))

    # Custom ENN cleaner handed to the combined SMOTE + ENN sampler.
    cleaner = ENN(
        sampling_strategy="not majority",
        kind_sel="mode",
        n_neighbors=5,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )
    combined = SMOTEENN(enn=cleaner, random_state=RANDOM_STATE)
    X_resampled, y_resampled = combined.fit_resample(feature_matrix, labels)
    print("SMOTE ENN: ", sorted(Counter(y_resampled).items()))

    resampled_features = pd.DataFrame(data=X_resampled, columns=FIELDS)
    resampled_target = pd.DataFrame(data=y_resampled,
                                    columns=['QuoteConversion_Flag'])
    return resampled_features, resampled_target
def split_data_resampling(X, y, test_percentage=0.2):
    """Split into train/test and rebalance only the training part.

    Parameters
    ----------
    X, y : features and labels forwarded to ``train_test_split``.
    test_percentage : value passed as ``test_size`` (default 0.2).

    Returns
    -------
    tuple
        ``(X_train_resampled, y_train_resampled, X_test, y_test)``; the test
        split is returned exactly as produced by ``train_test_split``.
    """
    parts = train_test_split(X, y, test_size=test_percentage, random_state=42)
    X_train, X_test, y_train, y_test = parts

    # SMOTEENN is fitted on the training split only.
    X_balanced, y_balanced = SMOTEENN(random_state=0).fit_resample(
        X_train, y_train)
    return X_balanced, y_balanced, X_test, y_test
def resample_dataset(df, feature_list, repo_type):
    """Resample the feature columns of ``df`` with SMOTEENN.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain every column in ``feature_list`` plus a
        ``"SECU_FLAG"`` column used as the target.
    feature_list : list of str
        Names of the feature columns to resample.
    repo_type : str
        Label used in the progress message and written into the
        ``"REPO_TYPE"`` column of the result.

    Returns
    -------
    pandas.DataFrame
        Resampled features (columns ``feature_list``) with re-created
        ``"SECU_FLAG"`` and ``"REPO_TYPE"`` columns.
    """
    # print() with a single argument behaves the same on Python 2 and 3;
    # the former print statement was a syntax error on Python 3.
    print("\nResampling data for the " + repo_type + " dataset...")

    # One list of feature values per row; tqdm only reports progress.
    feat_val_mat = [
        [row[feature] for feature in feature_list]
        for _, row in tqdm(df.iterrows(), desc="\tProgress")
    ]

    # <smote_obj> should over/under-sample both the "NEUTRAL" and "INSECURE"
    # classes.
    smote_obj = SMOTEENN(sampling_strategy="all", random_state=99)
    resampled_data, resampled_targets = smote_obj.fit_resample(
        feat_val_mat, list(df["SECU_FLAG"]))

    # Recreate the dataframe and re-initialise the non-feature columns.
    resampled_df = pd.DataFrame(resampled_data, columns=feature_list)
    resampled_df["SECU_FLAG"] = resampled_targets
    resampled_df["REPO_TYPE"] = [repo_type] * len(resampled_df.index)
    return resampled_df
def SMOTE_ENN(X_train, Y_train, seed, sampling_strategy, k_neighbors_smote=5,
              n_neighbors_enn=3, kind_sel='all'):
    """Resample a training set with an explicitly configured SMOTE + ENN.

    Class counts are printed before and after resampling, and the result is
    passed through ``shuffle_dataset`` before being returned.

    Parameters
    ----------
    X_train, Y_train : training features and labels.
    seed : used as ``random_state`` everywhere and passed to
        ``shuffle_dataset``.
    sampling_strategy : forwarded to SMOTE, ENN and the combined sampler.
    k_neighbors_smote : ``k_neighbors`` for SMOTE.
    n_neighbors_enn : ``n_neighbors`` for ENN.
    kind_sel : ``kind_sel`` for ENN.

    Returns
    -------
    tuple
        ``(X_train_resampled, Y_train_resampled)`` after shuffling.
    """
    cleaner = EditedNearestNeighbours(
        random_state=seed,
        n_jobs=-1,
        n_neighbors=n_neighbors_enn,
        kind_sel=kind_sel,
        sampling_strategy=sampling_strategy,
    )
    oversampler = SMOTE(
        random_state=seed,
        n_jobs=-1,
        k_neighbors=k_neighbors_smote,
        sampling_strategy=sampling_strategy,
    )
    combined = SMOTEENN(
        random_state=seed,
        smote=oversampler,
        enn=cleaner,
        sampling_strategy=sampling_strategy,
    )

    print('Before SMOTE + ENN : ', sorted(Counter(Y_train).items()))
    X_res, Y_res = combined.fit_resample(X_train, Y_train)
    print('After SMOTE + ENN : ', sorted(Counter(Y_res).items()))

    X_res, Y_res = shuffle_dataset(X_res, Y_res, seed)
    return X_res, Y_res
def get_simple_train_test_split(self):
    """Build a single train/test split with optional preprocessing.

    Reads ``self.X``, ``self.y``, ``self.test_size`` and
    ``self.random_state`` for the split.  Three flags enable further steps,
    each fitted on the training split only:

    * ``self.missingvals`` - impute with a default ``SimpleImputer``;
    * ``self.balance`` - rebalance the training split with SMOTEENN;
    * ``self.standardize`` - scale with ``StandardScaler``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` - note the ordering.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        self.X,
        self.y,
        test_size=self.test_size,
        random_state=self.random_state,
    )

    if self.missingvals:
        # Impute missing values; the imputer is fitted on the training data.
        imputer = SimpleImputer()
        imputer.fit(X_train)
        X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

    if self.balance:
        # Balance out classes.  Not needed when we use frequency binning!
        resampler = SMOTEENN(random_state=self.random_state)
        X_train, y_train = resampler.fit_resample(X_train, y_train)

    if self.standardize:
        standardizer = StandardScaler()
        standardizer.fit(X_train)
        X_train, X_test = (standardizer.transform(X_train),
                           standardizer.transform(X_test))

    return X_train, y_train, X_test, y_test
def over_sampling(x_train, y_train):
    """Resample the training data with SMOTEENN and report class counts.

    The number of samples labelled 0, 1 and 2 is printed before and after
    resampling.

    Parameters
    ----------
    x_train, y_train : training features and labels; ``y_train`` is compared
        element-wise against 0, 1 and 2.

    Returns
    -------
    tuple
        ``(x_out, y_out)`` - the resampled features and labels.
    """

    def _print_class_counts(labels):
        # Count all three classes first, then print them in order.
        n0 = np.sum(labels == 0)
        n1 = np.sum(labels == 1)
        n2 = np.sum(labels == 2)
        print("#Sample in Class 0: {}".format(n0))
        print("#Sample in Class 1: {}".format(n1))
        print("#Sample in Class 2: {}".format(n2))

    print()
    print("Doing over sampling...")
    print("Before over sampling:")
    _print_class_counts(y_train)

    # Combined over- and under-sampling, applied to the training data.
    resampler = SMOTEENN(random_state=0)
    x_out, y_out = resampler.fit_resample(x_train, y_train)

    print("After over sampling:")
    _print_class_counts(y_out)
    return x_out, y_out
def unba_smoteenn(x, y):
    """Rebalance a stack of 2-D samples with SMOTEENN.

    Each sample is flattened to a vector, the set is resampled, and the
    result is reshaped back to the per-sample shape of ``x`` with a trailing
    channel axis of size 1.

    Parameters
    ----------
    x : numpy.ndarray
        Array whose first axis indexes samples; ``x.shape[1]`` and
        ``x.shape[2]`` give the per-sample shape restored in the output.
    y : labels aligned with the first axis of ``x``.

    Returns
    -------
    tuple
        ``(x2, y1)`` where ``x2`` is a float64 array of shape
        ``(n_resampled, x.shape[1], x.shape[2], 1)`` and ``y1`` holds the
        resampled labels.
    """
    # Flatten every sample into one row (e.g. 60 x 8 -> 480 columns).
    flat = x.reshape(x.shape[0], -1)

    # Build the SMOTEENN sampler and resample the flattened data.
    sampler = SMOTEENN(random_state=0)
    flat_resampled, y1 = sampler.fit_resample(flat, y)

    # Restore the per-sample shape from ``x`` itself, not a fixed (60, 8),
    # and do it in one vectorised reshape.  The explicit float64 keeps the
    # output dtype of the previous ``np.zeros`` buffer.
    flat_resampled = np.asarray(flat_resampled, dtype=np.float64)
    x2 = flat_resampled.reshape(
        flat_resampled.shape[0], x.shape[1], x.shape[2], 1)
    return x2, y1
def train_decisiontree_with(configurationname, train_data, k, score_function,
                            undersam=False, oversam=False, export=False,
                            **kwargs):
    """Select ``k`` features, optionally resample, and fit a decision tree.

    Parameters
    ----------
    configurationname : str
        Used in log output and in the exported ``.dot`` file name.
    train_data : tuple
        ``(X_train, y_train, id_to_a_train)``; the third element is unpacked
        but not used here.
    k : int
        Number of features kept by ``SelectKBest``; must be positive.
    score_function : callable
        Scoring function handed to ``SelectKBest``.
    undersam, oversam : bool
        Resampling switches: only ``undersam`` ->
        ``RepeatedEditedNearestNeighbours``; only ``oversam`` -> ``SMOTE``;
        both -> ``SMOTEENN``; neither -> no resampling.
    export : bool
        When true, write the tree with ``export_graphviz`` and call
        ``transform(fitted_ids, configurationname)``.
    **kwargs
        Only ``max_depth`` is read (defaults to ``None``).

    Returns
    -------
    tuple
        ``(selector, dtc)`` - the fitted selector and classifier.
    """
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data

    max_depth = kwargs.get("max_depth")
    dtc = DecisionTreeClassifier(criterion="entropy", random_state=0,
                                 max_depth=max_depth)

    print("Feature Selection")
    # The selector is built once; a redundant duplicate was removed.
    selector = SelectKBest(score_function, k=k)
    selector = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = list(selector.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        print("Exporting tree to graph...")
        export_graphviz(dtc,
                        out_file=DATAP + "/temp/trees/sltree_" +
                        configurationname + ".dot",
                        filled=True)
        transform(fitted_ids, configurationname)

    # Accuracy on the (possibly resampled) training data itself.
    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
def imbalance_undersampling(datafile):
    """Rebalance the destination classes with SMOTEENN.

    The data is loaded through ``filling_missing(datafile)``; the columns
    named by the module-level ``features`` are the predictors and
    ``country_destination`` is the target.  Sorted class counts after
    resampling are printed.

    Returns
    -------
    pandas.DataFrame
        Resampled predictors with the resampled target appended as the last
        column (default integer column labels).
    """
    df = filling_missing(datafile)

    # Combine over-sampling and under-sampling with SMOTEENN.
    smote_enn = SMOTEENN(random_state=0)
    X_resampled, y_resampled = smote_enn.fit_resample(
        df[features], df.country_destination)
    print(sorted(Counter(y_resampled).items()))

    # Convert through NumPy before adding the column axis: ``y[:, None]``
    # does not work when the sampler hands back a pandas Series.
    target_column = np.asarray(y_resampled).reshape(-1, 1)
    back = pd.DataFrame(np.hstack((X_resampled, target_column)))
    return back
def balancingClassesSmoteenn(x_train, y_train):
    """Balance the training data with SMOTEENN.

    Prints the per-class counts of the resampled target and returns
    ``(features_balanced, target_balanced)``.
    """
    resampler = SMOTEENN(random_state=7)
    balanced = resampler.fit_resample(x_train, y_train)
    features_balanced, target_balanced = balanced
    print("Count for each class value after SMOTEEN:",
          collections.Counter(target_balanced))
    return features_balanced, target_balanced
def test_validate_estimator_default():
    """SMOTEENN with default sub-estimators reproduces the reference output."""
    expected_X = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    sampler = SMOTEENN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def test_validate_estimator_default():
    """Default-configured SMOTEENN must match the stored ground truth.

    Features are compared with relative tolerance ``R_TOL``; labels must be
    exactly equal.
    """
    X_resampled, y_resampled = SMOTEENN(
        random_state=RND_SEED).fit_resample(X, Y)

    reference_rows = [
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ]
    reference_labels = [0, 0, 0, 0, 1, 1, 1]

    assert_allclose(X_resampled, np.array(reference_rows), rtol=R_TOL)
    assert_array_equal(y_resampled, np.array(reference_labels))
def test_sample_regular_half():
    """SMOTEENN with a dict ``sampling_strategy`` yields the reference set."""
    expected_X = np.array([
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 1, 1, 1])

    # Requested number of samples per class.
    targets_per_class = {0: 10, 1: 12}
    sampler = SMOTEENN(sampling_strategy=targets_per_class,
                       random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    assert_allclose(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def smoteenn_sffs_reduction_classify_full():
    """Resample, tune an SVM, run floating feature selection, save results.

    Steps, in order:

    1. Load ``(X, Y)`` and feature names via ``read_dataset(screening='')``.
    2. Resample with a default ``SMOTEENN`` and print class counts.
    3. Grid-search an RBF ``SVC`` over ``C`` with 5-fold CV, refitting on
       ``balanced_accuracy``; print the best parameters.
    4. Run ``SequentialFeatureSelector`` with ``forward=False`` and
       ``floating=True`` (i.e. backward floating selection, despite the
       function name) using the tuned SVC.
    5. Pickle the selector to ``smoteenn_sbfs.pkl`` and write its
       ``subsets_`` to ``smoteenn_sbfs.csv``.

    Returns nothing; the outputs are the two files and the printed log.
    """
    # No screening results, only risk factors.
    (X, Y), feature_names = read_dataset(screening='')

    # Dataset resampling to compensate for class imbalance.
    Xres, Yres = SMOTEENN().fit_resample(X, Y)
    print('Resampling')
    print('Original dataset size:', Counter(Y))
    print('Resampled dataset size:', Counter(Yres))

    # Hyper-parameter search for the SVM used during feature selection.
    metrics = [
        'accuracy',
        'precision',
        'recall',
        'balanced_accuracy',
        'average_precision',
        'brier_score_loss',
        'neg_log_loss',
    ]
    search = GridSearchCV(
        estimator=SVC(probability=True, gamma='scale'),
        param_grid={'C': np.logspace(-3, 3, 7), 'kernel': ['rbf']},
        n_jobs=-1,
        verbose=10,
        cv=5,
        scoring=metrics,
        refit='balanced_accuracy',
        iid=False,
        error_score=0,
    )
    search.fit(Xres, Yres)
    print(search.best_params_)

    # Feature selection with the tuned SVM.
    tuned_svc = SVC(
        probability=True,
        gamma='scale',
        kernel=search.best_params_['kernel'],
        C=search.best_params_['C'],
    )
    selector = SequentialFeatureSelector(
        forward=False,
        floating=True,
        k_features='best',
        verbose=2,
        n_jobs=-1,
        scoring='balanced_accuracy',
        cv=5,
        estimator=tuned_svc,
    )
    selector.fit(Xres, Yres, custom_feature_names=feature_names)

    # Persist the fitted selector and a table of the evaluated subsets.
    with open('smoteenn_sbfs.pkl', 'wb') as out_file:
        pickle.dump(selector, out_file, -1)
    subsets_frame = pd.DataFrame(selector.subsets_)
    subsets_frame.to_csv('smoteenn_sbfs.csv')
def main():
    """Command-line entry point: train on the merged data and predict a CSV.

    The training set is built with ``DatasetBuilder``, preprocessed,
    rebalanced with SMOTEENN, and used to fit a ``RandomForestClassifier``.
    The file named by ``--filename_to_predict`` / ``-fp`` is then loaded,
    transformed with the already-fitted preprocessor, and the predictions
    are written to ``predictions.csv`` under ``stg.PROCESSED_DATA_DIR``.
    Progress is reported through ``update_progress`` and ``print``.
    """
    arg_parser = argparse.ArgumentParser(
        description="Prediction of subscription")
    arg_parser.add_argument(
        "--filename_to_predict",
        "-fp",
        required=True,
        help="File to predict ( only csv supported for now )",
    )
    cli_args = arg_parser.parse_args()
    file_to_predict = f"{cli_args.filename_to_predict}"

    update_progress(0)
    print("progress : Predict subscription")
    builder = DatasetBuilder(filename_bank=stg.FILENAME_BANK,
                             filename_socio=stg.FILENAME_SOCIO_ECO)
    dataset_merged = builder.create_dataset()
    X_train = dataset_merged.drop(columns=stg.COL_RAW_SUBSCRIPTION)
    y_train = dataset_merged[stg.COL_RAW_SUBSCRIPTION].values

    update_progress(0.2)
    print("progress : Build Dataset")
    preprocessor_pipeline = PipelineCreator().preprocessor
    X_train_processed = preprocessor_pipeline.fit_transform(X_train)

    update_progress(0.3)
    print("progress : Deal with imbalenced classes")
    resampler = SMOTEENN(sampling_strategy=0.8,
                         random_state=stg.RANDOM_STATE,
                         n_jobs=-1)
    X_train, y_train = resampler.fit_resample(X_train_processed, y_train)

    update_progress(0.6)
    print("progress : Fit model")
    forest = RandomForestClassifier(**stg.RFC_PARAMS)
    forest.fit(X_train, y_train)

    update_progress(0.9)
    print("progress : Predict")
    X_test = DatasetBuilder(
        filename_bank=file_to_predict,
        filename_socio=stg.FILENAME_SOCIO_ECO_TEST,
        is_test=True,
    ).create_dataset()
    # Reuse the preprocessor fitted on the training data.
    X_test_transformed = preprocessor_pipeline.transform(X_test)
    X_test["PREDICTED_SUBSCRIPTION"] = forest.predict(X_test_transformed)
    X_test.to_csv(join(stg.PROCESSED_DATA_DIR, "predictions.csv"))

    print("Completed!")
    print(
        "You can find the csv file with the predictions inside in data/processed/predictions.csv ! "
    )
def test_sample_regular_half():
    """Resampling with explicit per-class targets matches the ground truth."""
    sampler = SMOTEENN(sampling_strategy={0: 10, 1: 12},
                       random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    reference_rows = [
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ]
    reference_labels = [0, 1, 1, 1]

    assert_allclose(X_resampled, np.array(reference_rows))
    assert_array_equal(y_resampled, np.array(reference_labels))
def test_validate_estimator_init():
    """SMOTEENN built from explicit SMOTE and ENN objects matches reference."""
    expected_X = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    oversampler = SMOTE(random_state=RND_SEED)
    cleaner = EditedNearestNeighbours(sampling_strategy='all')
    sampler = SMOTEENN(smote=oversampler, enn=cleaner, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def test_validate_estimator_init():
    """Passing pre-built ``smote`` and ``enn`` estimators gives known output.

    Features are compared with relative tolerance ``R_TOL``; labels must be
    exactly equal.
    """
    sampler = SMOTEENN(
        smote=SMOTE(random_state=RND_SEED),
        enn=EditedNearestNeighbours(sampling_strategy='all'),
        random_state=RND_SEED,
    )
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    reference_rows = [
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ]
    reference_labels = [0, 0, 0, 0, 1, 1, 1]

    assert_allclose(X_resampled, np.array(reference_rows), rtol=R_TOL)
    assert_array_equal(y_resampled, np.array(reference_labels))
def prep_data(self, test_ratio, smoteenn, smotomek):
    """Split ``self.X`` / ``self.y`` and optionally resample the train part.

    Parameters
    ----------
    test_ratio : value passed as ``test_size`` to ``train_test_split``.
    smoteenn : when truthy, resample the training split with ``SMOTEENN``.
    smotomek : when truthy, resample the training split with ``SMOTETomek``.
        If both flags are set, SMOTETomek runs on the SMOTEENN output.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        self.X, self.y, test_size=test_ratio, random_state=4)

    if smoteenn:
        X_train, y_train = SMOTEENN(random_state=1).fit_resample(
            X_train, y_train)

    if smotomek:
        X_train, y_train = SMOTETomek(random_state=1).fit_resample(
            X_train, y_train)

    return X_train, X_test, y_train, y_test
def smote_enn(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True,
              pie_evr=True):
    """Resample ``(X, y)`` with SMOTEENN and optionally plot the result.

    Parameters
    ----------
    X, y : features and labels to resample.
    visualize : when truthy, call ``hist_over_and_undersampling`` on the
        resampled labels and ``pca_general`` on the resampled data.
    pca2d, pca3d, pie_evr : forwarded to ``pca_general`` as ``d2``, ``d3``
        and ``pie_evr``.
    tsne : accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` - the resampled features and labels.
    """
    sme = SMOTEENN(random_state=42)
    X_res, y_res = sme.fit_resample(X, y)

    # Plain truthiness test instead of ``== True``.
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)

    return X_res, y_res
class PreProcessor:
    """
    Perform pre-processing

    Import dataframe and apply sklearn transformations
    Vectorize (non full text) strings with TF/IDF
    Normalise int values, remove mean and scale to unit variance
    Instantiate an embedding transformer for message text feature
    Returns a sparse matrix of features
    """

    # Columns standardised with their own StandardScaler.
    _NUMERIC_COLUMNS = (
        'created_at',
        'user_created_at',
        'favorite_count',
        'retweet_count',
        'user_followers_count',
        'user_following_count',
    )

    # (column, max_features) for every TF/IDF-vectorised text column.
    _TEXT_COLUMNS = (
        ('hashtags', 1_000),
        ('urls', 1_000),
        ('user_description', 10_000),
        ('user_location', 1_000),
        ('user_name', 1_000),
        ('user_screen_name', 1_000),
        ('user_profile_urls', 1_000),
    )

    def __init__(self):
        """Build the column mapper, the SVD reducer and the class balancer."""
        # scikit-learn documents ``stop_words`` as 'english', a list or
        # None, and newer releases reject a set.  Pass a de-duplicated list.
        stopwords = sorted(set(corpus.stopwords.words('english')))

        # Each column gets its own transformer instance.
        steps = [([column], StandardScaler())
                 for column in self._NUMERIC_COLUMNS]
        steps.extend(
            (column, TfidfVectorizer(stop_words=stopwords,
                                     max_features=limit))
            for column, limit in self._TEXT_COLUMNS)
        steps.append(('full_text', EmbedTransformer()))

        self.mapper = DataFrameMapper(steps, sparse=True)
        self.svd = TruncatedSVD(algorithm='randomized')
        self.balancer = SMOTEENN(n_jobs=12)

    def transform(self, df):
        """
        Fit the mapper on ``df`` and return ``(features, labels)``

        The ``label`` column is popped from ``df`` (the caller's frame is
        modified) and binarised against the classes ``none`` / ``astroturf``.
        Note that the mapper is re-fitted on every call.
        """
        labels = label_binarize(df.pop('label'),
                                classes=['none', 'astroturf'])
        return self.mapper.fit_transform(df), labels

    def truncate(self, data_array, components=1000):
        """
        Run feature dimensionality reduction process

        In this case LSA using a randomised sampling methodology
        (https://arxiv.org/abs/0909.4061)
        Returns a dense array
        """
        self.svd.n_components = components
        return self.svd.fit_transform(data_array)

    def balance(self, data_array, labels):
        """
        Balance classes for training

        Re-sample with SMOTE oversampling (https://arxiv.org/abs/1106.1813)
        & edited nearest-neighbours cleaning of the synthetic data points
        (http://www.inf.ufrgs.br/maslab/pergamus/pubs/balancing-training-data-for.pdf)

        ``labels`` is flattened with ``ravel()`` before resampling.
        """
        return self.balancer.fit_resample(data_array, labels.ravel())
def preprocess_data(PARAMS, train_data, train_label, test_data):
    """Optionally rebalance and scale the data according to ``PARAMS``.

    Parameters
    ----------
    PARAMS : mapping
        ``PARAMS['data_balancing']`` enables SMOTEENN resampling of the
        training data; ``PARAMS['scale_data']`` enables ``scale_data`` on
        the train and test data.
    train_data, train_label : training features and labels.
    test_data : test features (only touched by scaling).

    Returns
    -------
    tuple
        ``(train_data, train_label, test_data)`` after the enabled steps.
    """
    if PARAMS['data_balancing']:
        # Imported lazily so the dependency is only needed when balancing.
        from imblearn.combine import SMOTEENN

        print('Unbalanced data: ', np.shape(train_data))
        # Over and under sampling
        resampler = SMOTEENN(sampling_strategy=1.0)
        resampled = resampler.fit_resample(train_data, train_label)
        train_data, train_label = resampled
        print('Balanced data: ', np.shape(train_data))

    if PARAMS['scale_data']:
        scaled = scale_data(train_data, test_data)
        train_data, test_data = scaled

    return train_data, train_label, test_data
def rebalance():
    """Rebalance the module-level ``train_data`` with SMOTEENN.

    NaN and +/-infinity values in ``train_data`` are replaced by 0 in place
    before resampling.  The columns named by the module-level ``features``
    are the predictors and those named by ``target`` are the labels.

    Returns
    -------
    pandas.DataFrame
        Resampled predictors with the resampled ``is_trade`` column added.
    """
    sm = SMOTEENN()

    # Same replacement order as before: NaN, -inf, +inf.
    for invalid_value in (np.nan, -np.inf, np.inf):
        train_data.replace(to_replace=invalid_value, value=0, inplace=True)

    print("rebalance data:", now())
    X_raw, y_raw = sm.fit_resample(train_data[features], train_data[target])

    X_resampled = pd.DataFrame(X_raw, columns=features)
    label_frame = pd.DataFrame(y_raw, columns=target)
    X_resampled['is_trade'] = label_frame['is_trade']

    # Drop the temporary label data before forcing a collection.
    del label_frame, y_raw
    gc.collect()
    return X_resampled
def Smote_ENN(self):
    '''
    Balance the training data with SMOTE over-sampling followed by ENN
    cleaning.

    Works on copies of ``self.X_train`` and ``self.y_train`` and stores the
    result in ``self.X_train_balanced`` and ``self.y_train_balanced``.

    Returns
    -------
    None.

    '''
    features, labels = self.X_train.copy(), self.y_train.copy()
    resampler = SMOTEENN(random_state=2020)
    self.X_train_balanced, self.y_train_balanced = resampler.fit_resample(
        features, labels)
def train_decisiontree_FPR(configurationname, train_data, score_function,
                           undersam=False, oversam=False, export=False):
    """Select features with ``SelectFpr``, optionally resample, fit a tree.

    Parameters
    ----------
    configurationname : str
        Used in log output and in the exported ``.dot`` file name.
    train_data : tuple
        ``(X_train, y_train, id_to_a_train)``; the third element is unused.
    score_function : callable
        Scoring function handed to ``SelectFpr``.
    undersam, oversam : bool
        Only ``undersam`` -> ``RepeatedEditedNearestNeighbours``; only
        ``oversam`` -> ``SMOTE``; both -> ``SMOTEENN``; neither -> none.
    export : bool
        NOTE(review): currently ignored - the tree is always exported
        because the guarding ``if export:`` is commented out.

    Returns
    -------
    tuple
        ``(selector, dtc)`` - the fitted selector and classifier.
    """
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    selector = SelectFpr(score_function)
    fitted_selector = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = list(fitted_selector.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        X_train, y_train = RepeatedEditedNearestNeighbours().fit_resample(
            X_train, y_train)
    if oversam and not undersam:
        X_train, y_train = SMOTE(random_state=42).fit_resample(
            X_train, y_train)
    if oversam and undersam:
        X_train, y_train = SMOTEENN(random_state=0).fit_resample(
            X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    # Unconditional export (see the note on ``export`` above).
    print("Exporting decision tree image...")
    dot_path = DATAP + "/temp/trees/sltree_" + configurationname + ".dot"
    export_graphviz(dtc, out_file=dot_path, filled=True)
    transform(fitted_ids)

    # Accuracy on the (possibly resampled) training data itself.
    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
def applying_classifier_to_all_accounts():
    """Train a bot/human classifier and label every stored account.

    Steps:

    1. Load the pickled accounts frame from ``m4r_data``.
    2. Build the training set (``get_full_dataset()``) and the user set
       (``get_full_dataset(df)``), using the module-level ``features``.
    3. Rebalance the training data with SMOTEENN, standardise both sets
       with a scaler fitted on the training data, and fit AdaBoost.
    4. Predict a class for every user, print the bot share, merge the
       prediction into ``df`` on ``user.id`` and append one hand-written row.

    NOTE(review): nothing is returned in the code visible here, so the
    merged frame is discarded - confirm whether a ``return df`` is missing.
    """
    # The file handle is closed deterministically.
    # NOTE(review): pickle can execute arbitrary code on load - only use
    # this with trusted files.
    with open(m4r_data + "us_and_georgia_accounts.p", "rb") as handle:
        df = pickle.load(handle)

    trainset = get_full_dataset()
    users = get_full_dataset(df)
    X_users = users[features]
    X_trn = trainset[features]
    y_trn = trainset["class"].replace({"bot": 1, "human": 0})

    SME = SMOTEENN(random_state=2727841)
    X_trn, y_trn = SME.fit_resample(X_trn, y_trn)

    # The scaler is fitted on the training data and reused for the users.
    scaling = StandardScaler()
    X_trn = scaling.fit_transform(X_trn)
    X_users = scaling.transform(X_users)

    clf = AdaBoostClassifier(n_estimators=50, random_state=9926737)
    clf.fit(X_trn, y_trn)

    p_users = np.round(clf.predict(X_users))
    users["predicted_class"] = p_users
    users["predicted_class"] = users["predicted_class"].replace({
        0: "human",
        1: "bot"
    })
    print("% Bots (ALL): ", sum(p_users) / len(p_users))

    # Adding predicted class column to df.
    df = df.merge(users[["user.id", "predicted_class"]],
                  how="left",
                  on="user.id")

    # Adding donald trump to users.
    d_row = {
        "user.id": 25073877,
        "user.name": "realDonaldTrump",
        "user.screen_name": "realDonaldTrump",
        "user.verified": True,
        "predicted_class": "human"
    }
    # ``DataFrame.append`` was removed in pandas 2.0; ``concat`` is the
    # supported equivalent for appending one row.
    df = pd.concat([df, pd.DataFrame([d_row])], ignore_index=True)
def SMOTEENN(self, configFile, data):
    """Resample ``data`` with imblearn's SMOTEENN, configured from a file.

    Parameters
    ----------
    configFile : path read with ``configparser``; ``[resample]
        sampling_strategy`` and ``[data] random`` are used.
    data : mapping with ``'x'`` (features) and ``'y'`` (target); both get
        their ``index`` reset in place to ``0..n-1`` before resampling.

    Returns
    -------
    dict
        ``{'x': X_resampled, 'y': y_resampled}``.

    The resampled class counts are logged through ``self.logging_config``.
    """
    # Local import: inside this method the name refers to the imblearn class.
    from imblearn.combine import SMOTEENN

    config = configparser.ConfigParser()
    config.read(configFile)
    sampling_strategy = str(config.get("resample", "sampling_strategy"))
    seed = int(config.get("data", "random"))

    # A value made only of digits and dots is treated as a float ratio;
    # anything else is kept as a string.
    non_numeric = re.match(r'.*[^0-9\.].*', sampling_strategy, re.I)
    if non_numeric is None:
        sampling_strategy = float(sampling_strategy)

    model = SMOTEENN(sampling_strategy=sampling_strategy, random_state=seed)

    # Give target and features a fresh 0..n-1 index (target first).
    for key in ('y', 'x'):
        data[key].index = list(range(len(data[key])))

    X_resampled, y_resampled = model.fit_resample(data['x'], data['y'])
    self.logging_config(
        u"resample class \n {}".format(y_resampled.value_counts()), "info")
    return {'x': X_resampled, 'y': y_resampled}
def run_smoteenn(X, y, sampling_strategy='auto'):
    '''
    INPUT:
    X; a numpy array of predictors
    y; a binary target vector
    sampling_strategy; forwarded to SMOTEENN (default 'auto')

    OUTPUT:
    smx; predictor array after SMOTE over-sampling and ENN cleaning
    smy; target vector matching smx

    NOTES:
    The sampler is created with n_jobs=-1 and random_state=1.
    '''
    resampler = SMOTEENN(
        sampling_strategy=sampling_strategy,
        n_jobs=-1,
        random_state=1,
    )
    resampled = resampler.fit_resample(X, y)
    smx, smy = resampled
    return smx, smy
def train(data, batch_size=10000, test_every=10, max_steps=int(1e6), n_epochs=1, log_file=None, model_path=None):
    """
    Perform training

    :param data: iterator over the input; its first item is taken as the
        column names and the rest is handed to ``feeder``
        (described as an opened zipfile by the original author)
    :param batch_size: size of the batch requested from ``feeder``
    :param test_every: a report is written and the model is saved whenever
        ``global_step`` is a multiple of this value
    :param max_steps: number of iterations of the outer training loop
    :param n_epochs: number of resample-and-fit passes over a single minibatch
    :param log_file: file object passed to ``tqdm.write`` for the report
    :param model_path: optional path of a pickled classifier to start from;
        a fresh ``SGDClassifier`` is used when it is ``None``
    :return: column name
    """
    columns = next(data)
    feed = feeder(data, batch_size)
    sampler = SMOTEENN()
    if model_path is not None:
        # NOTE(review): unpickling runs arbitrary code - trusted files only.
        with open(model_path,'rb') as f:
            clf = pkl.load(f)
    else:
        clf = SGDClassifier()
    for global_step in tqdm(range(max_steps)):
        try:
            x_tr, y_tr = next(feed)
            x_tr = minmax_scale(x_tr)
        except StopIteration:
            # Feed exhausted: build a new one and skip this step.
            feed = feeder(data, batch_size)
            continue
        for _ in range(n_epochs):
            try:
                # The resampled output replaces the batch, so later epochs
                # resample already-resampled data.
                x_tr, y_tr = sampler.fit_resample(x_tr, y_tr)
            except ValueError as e:
                # Log the resampling failure and skip the fit for this epoch.
                tqdm.write(str(e))
                continue
            # NOTE(review): ``fit`` (not ``partial_fit``) is called per
            # batch - confirm that incremental training is not intended.
            clf.fit(x_tr, y_tr)
        # NOTE(review): nesting of the report/save section is reconstructed
        # from collapsed source - confirm against the original layout.
        if global_step % test_every == 0:
            # Evaluated on the current (resampled) training batch.
            y_hat = clf.predict(x_tr)
            tqdm.write(classification_report_imbalanced(y_tr, y_hat), file=log_file)
            fname = f"model/clf_{global_step}.pkl"
            with open(fname, 'wb') as f:
                pkl.dump(clf, f)
            tqdm.write(f"File saved as {fname}")
    return columns
def test_sample_regular_pass_smote_enn():
    """SMOTEENN given explicit SMOTE and ENN instances matches reference."""
    expected_X = np.array([
        [1.52091956, -0.49283504],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571667],
        [0.66052536, -0.28246518],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.08711622, 0.93259929],
    ])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    oversampler = SMOTE(sampling_strategy="auto", random_state=RND_SEED)
    cleaner = EditedNearestNeighbours(sampling_strategy="all")
    sampler = SMOTEENN(smote=oversampler, enn=cleaner, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def train_decisiontree_with(configurationname, train_data, k, score_function,
                            undersam=False, oversam=False, export=False):
    """Select ``k`` features, optionally resample, and fit a decision tree.

    Parameters
    ----------
    configurationname : str
        Used in log output and in the exported ``.dot`` file name.
    train_data : tuple
        ``(X_train, y_train, id_to_a_train)``; the third element is unused.
    k : int
        Number of features kept by ``SelectKBest``; must be positive.
    score_function : callable
        Scoring function handed to ``SelectKBest``.
    undersam, oversam : bool
        Only ``undersam`` -> ``RepeatedEditedNearestNeighbours``; only
        ``oversam`` -> ``SMOTE``; both -> ``SMOTEENN``; neither -> none.
    export : bool
        When true, write the tree with ``export_graphviz`` and call
        ``transform(fitted_ids, configurationname)``.

    Returns
    -------
    tuple
        ``(selector, dtc)`` - the fitted selector and classifier.
    """
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    selector = SelectKBest(score_function, k=k)
    fitted_selector = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = list(fitted_selector.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        X_train, y_train = RepeatedEditedNearestNeighbours().fit_resample(
            X_train, y_train)
    if oversam and not undersam:
        X_train, y_train = SMOTE(random_state=42).fit_resample(
            X_train, y_train)
    if oversam and undersam:
        X_train, y_train = SMOTEENN(random_state=0).fit_resample(
            X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        dot_path = DATAP + "/temp/trees/sltree_" + configurationname + ".dot"
        export_graphviz(dtc, out_file=dot_path, filled=True)
        transform(fitted_ids, configurationname)

    # Accuracy on the (possibly resampled) training data itself.
    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
def fit_resample(self, X, y):
    """
    Resample the dataset in standardized space.

    X is standardized, SMOTEENN is applied, and the resampled features are
    mapped back with the scaler's inverse transform so the output is in the
    same "units" as the input.

    Parameters
    ----------
    X : ndarray
        Dense, feature matrix where rows are observations.
    y : ndarray
        1-D array of responses.

    Returns
    -------
    X_resample, y_resampled : ndarray, ndarray
        Resampled X and y in the original "units" of X.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Sub-estimators configured from the instance attributes.
    oversampler = SMOTE(
        random_state=self.random_state,
        k_neighbors=self.k_smote,
        sampling_strategy=self.sampling_strategy_smote,
    )
    cleaner = ENN(
        sampling_strategy=self.sampling_strategy_enn,
        n_neighbors=self.k_enn,
        kind_sel=self.kind_sel_enn,
    )
    combined = SMOTEENN(
        sampling_strategy=self.sampling_strategy_smoteenn,
        random_state=self.random_state,
        smote=oversampler,
        enn=cleaner,
    )

    X_res, y_res = combined.fit_resample(X_scaled, y)
    return scaler.inverse_transform(X_res), y_res
print(__doc__) # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=100, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply SMOTE + ENN sm = SMOTEENN() X_resampled, y_resampled = sm.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=0.5) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1], label="Class #1", alpha=0.5)
def test_error_wrong_object(smote_params, err_msg):
    """SMOTEENN built from ``smote_params`` must fail in ``fit_resample``.

    ``smote_params`` is expanded into keyword arguments and the raised
    ``ValueError`` message has to match ``err_msg``.
    """
    sampler = SMOTEENN(**smote_params)
    with pytest.raises(ValueError, match=err_msg):
        sampler.fit_resample(X, Y)