def data_oversample(self, max_attempts=10):
    """Split the data and oversample the train and validation sets with SVMSMOTE.

    The split is redrawn until every lesion class 0-4 occurs in ``y_train``.
    The test split is returned as produced by ``data_sample_split`` (it is
    not resampled).

    Args:
        max_attempts: maximum number of calls to ``self.data_sample_split()``
            before giving up.

    Returns:
        ``(x_train, x_val, x_test, y_train, y_val, y_test)``.

    Raises:
        RuntimeError: if no split containing all five classes was obtained
            within ``max_attempts`` tries.
    """
    expected = (0, 1, 2, 3, 4)
    for _ in range(max_attempts):
        split = self.data_sample_split()
        x_train, x_val, x_test, y_train, y_val, y_test = split
        # Compare against the label *values*; `label in y_train` would test
        # the index instead if y_train happened to be a pandas Series.
        present = set(y_train)
        missing = [label for label in expected if label not in present]
        if not missing:
            break
        for label in missing:
            print("lesion " + str(label) +
                  " not in y_train. Redoing sample split...")
    else:
        raise RuntimeError(
            "could not draw a train split containing all lesion classes "
            "after %d attempts" % max_attempts)

    print("Presampled train dataset: %s" % Counter(y_train))
    resample = SVMSMOTE(random_state=42)  # alternatives tried: SMOTENC
    x_train, y_train = resample.fit_resample(x_train, y_train)
    # NOTE(review): the validation split is oversampled too, so validation
    # metrics are computed partly on synthetic samples - confirm intended.
    x_val, y_val = resample.fit_resample(x_val, y_val)
    print("Resampled train dataset: %s" % Counter(y_train))
    return x_train, x_val, x_test, y_train, y_val, y_test
def test_svm_smote(data):
    """Check that two differently-configured SVMSMOTE samplers agree.

    One sampler is built with only ``random_state``; the other additionally
    receives explicit ``NearestNeighbors`` objects (6 and 11 neighbours) and
    an explicit ``SVC``. Both are run on ``data`` and must produce close
    ``X`` and equal ``y``.
    """
    explicit_estimators = dict(
        k_neighbors=NearestNeighbors(n_neighbors=6),
        m_neighbors=NearestNeighbors(n_neighbors=11),
        svm_estimator=SVC(gamma='scale', random_state=42),
    )
    plain_sampler = SVMSMOTE(random_state=42)
    explicit_sampler = SVMSMOTE(random_state=42, **explicit_estimators)

    X_plain, y_plain = plain_sampler.fit_resample(*data)
    X_explicit, y_explicit = explicit_sampler.fit_resample(*data)

    assert_allclose(X_plain, X_explicit)
    assert_array_equal(y_plain, y_explicit)
def svm_smote(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True, pie_evr=True):
    """Oversample ``(X, y)`` with SVMSMOTE and optionally plot the result.

    Args:
        X: feature matrix passed to ``SVMSMOTE.fit_resample``.
        y: target passed to ``SVMSMOTE.fit_resample``.
        visualize: when equal to ``True``, the resampled target is passed to
            ``hist_over_and_undersampling`` and the resampled data to
            ``pca_general``.
        pca2d: forwarded to ``pca_general`` as ``d2``.
        pca3d: forwarded to ``pca_general`` as ``d3``.
        tsne: accepted but not read by this function.
        pie_evr: forwarded to ``pca_general`` as ``pie_evr``.

    Returns:
        The resampled ``(X_res, y_res)`` pair.
    """
    sampler = SVMSMOTE(random_state=42)
    X_res, y_res = sampler.fit_resample(X, y)

    # Equality with True (not plain truthiness) is the original contract.
    plots_requested = visualize == True
    if plots_requested:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)

    return X_res, y_res
def svm_smote(X, y):
    """Balance a training set with SVMSMOTE.

    Args:
        X: training features, without the class target.
        y: training class target.

    Returns:
        The resampled ``(X, y)`` pair. The shape of the resampled ``X`` is
        printed as a side effect.
    """
    balancer = SVMSMOTE(random_state=42)
    balanced_X, balanced_y = balancer.fit_resample(X, y)
    print('after balancing:', balanced_X.shape)
    return balanced_X, balanced_y
def borderline_smoth_func(train_x, train_y, target):
    """Oversample the training data with SVMSMOTE, logging class counts.

    The value counts of ``train_y[target]`` are logged before and after
    resampling. Any exception raised along the way is logged with
    ``logger.error`` and swallowed.

    Args:
        train_x: features passed to ``fit_resample``.
        train_y: target container; indexed with ``target`` for the counts.
        target: key used to select the label column of ``train_y``.

    Returns:
        ``(train_x, train_y)`` after resampling, or ``None`` when an
        exception was caught.
    """
    try:
        counts_before = train_y[target].value_counts()
        logger.info(f"counter before border line SMOTH is: {counts_before}")

        # SVMSMOTE is the active sampler; BorderlineSMOTE was the
        # commented-out alternative in the original.
        sampler = SVMSMOTE()
        resampled_x, resampled_y = sampler.fit_resample(train_x, train_y)

        counts_after = resampled_y[target].value_counts()
        logger.info(f"counter after borderline SMOTH is: {counts_after}")
    except Exception as ex:
        logger.error(f"failed to run borderline_smoth_func due to: {ex}")
        return None
    return resampled_x, resampled_y
def run_upsample(json_file_path, fmt_file_path):
    """Oversample the hot-encoded CSV with SVMSMOTE and save it as a new CSV.

    Nothing is done unless ``JsonManager.get_upsample_status()`` equals
    ``True``. Otherwise the previous upsampled folder is removed, the
    hot-encoded CSV is loaded, every column but the last is treated as
    features and the last column as the label, and the resampled data is
    written with ``np.savetxt``.

    Args:
        json_file_path: path handed to ``JsonManager``.
        fmt_file_path: text file whose first line is used as the ``fmt``
            argument of ``np.savetxt``.
    """
    json_manager = JsonManager(json_file_path)
    # Equality (not truthiness) is kept on purpose: the return type of
    # get_upsample_status() is not visible from this function.
    if not (json_manager.get_upsample_status() == True):
        return

    print(f"Upsampling started using {json_file_path} and {fmt_file_path}")
    upsampled_path = json_manager.get_upsampled_path()
    constants.remove_folder_if_exists(
        constants.UPSAMPLED_CSV_FOLDER_NAME, upsampled_path)

    hot_encoded_folder = os.fsdecode(os.path.join(
        json_manager.get_hot_encoded_path(),
        constants.HOT_ENCODED_CSV_FOLDER_NAME))
    hot_encoded_file = os.fsdecode(os.path.join(
        hot_encoded_folder, constants.HOT_ENCODED_CSV_FILENAME))

    # Parse the file once and slice it, instead of re-reading it per subset.
    hotEncoded_data = pd.read_csv(hot_encoded_file)
    features_data = hotEncoded_data.iloc[:, :-1]   # everything except label
    labels_data = hotEncoded_data.iloc[:, [-1]]    # label, kept 2-D

    sm = SVMSMOTE(random_state=json_manager.get_random_state())
    X_res, y_res = sm.fit_resample(features_data, labels_data)
    csv_ready = np.append(X_res, y_res, axis=constants.COLUMN_AXIS)

    upsampled_folder = constants.add_folder_to_directory(
        constants.UPSAMPLED_CSV_FOLDER_NAME, upsampled_path)
    upsampled_file_path = os.fsdecode(os.path.join(
        upsampled_folder, constants.UPSAMPLED_CSV_FILENAME))
    if os.path.exists(upsampled_file_path):
        os.remove(upsampled_file_path)

    # Context manager guarantees the handle is closed even if readline fails.
    with open(fmt_file_path, "r") as fmt_file:
        fmt = fmt_file.readline()

    header = ','.join(str(i) for i in hotEncoded_data.columns)
    np.savetxt(upsampled_file_path, csv_ready,
               fmt=fmt,
               delimiter=constants.CSV_DELIMITER,
               header=header,
               comments='')
    print(f"Upsampling finished, results in {upsampled_file_path}")
# Fit a one-hot encoder on the ('FF_Spaceman', 'Conf') column of wr_data,
# reshaped to a single-feature 2-D array.
encoder = OneHotEncoder()
encoder.fit(wr_data[('FF_Spaceman', 'Conf')].values.reshape(-1, 1))
# Build one feature matrix per player group with the shared encoder.
# (process_df is defined elsewhere; its exact output is not visible here.)
X = process_df(vet_feats, encoder)
X_rk = process_df(rook_feats, encoder)
X_soph = process_df(soph_feats, encoder)
#%% Define XGB model and data split
# Six-class softmax classifier wrapped in a 10-fold grid search; the grid is
# empty, so only the estimator's own settings are cross-validated.
est_tiers = xgb.XGBClassifier(objective='multi:softmax', num_class=6)
param_grid = {}
model_tiers = GridSearchCV(est_tiers, param_grid, cv=10)
# Oversample every class except the majority one.
sm = SVMSMOTE(sampling_strategy='not majority')
X_res, y_res_t = sm.fit_resample(
    X, tiers_vets
)  #This is wrong, don't resample before split, I'm doing it for the memes
# Stratified 70/30 split of the (already resampled) data.
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_res, y_res_t, test_size=0.3, stratify=y_res_t, random_state=123)
model_tiers.fit(X_train_t, y_train_t)
preds = model_tiers.predict(X_test_t)
# Confusion matrix normalised over the predicted labels.
cf = confusion_matrix(y_test_t, preds, normalize='pred')
plotConfMatrix(cf, labels=range(6))
# %%
# Binary classifier set-up; param_grid is reset to an empty grid.
est_hit = xgb.XGBClassifier(objective='binary:logistic')
param_grid = {}
def getAllCleanedDataExperiment(binning=0):
    """Load the churn data and build the experimental binned feature set.

    Reads ``train.csv`` and ``testing.csv`` from the working directory,
    splits the training file 75/25 into train/validation
    (``random_state=1``), one-hot encodes ``Geography``/``Gender`` with an
    encoder fitted on the training split, oversamples the training split
    with ``SVMSMOTE(random_state=101)``, and replaces the raw numeric
    columns with hand-picked indicator columns.

    Args:
        binning: accepted but not read anywhere in this function.

    Returns:
        ``(X_train, y_train, X_val, y_val, test_train, test_val)``.
    """
    # Feature-engineering inspiration:
    # https://www.kaggle.com/nasirislamsujan/bank-customer-churn-prediction
    # https://www.kaggle.com/agustinpugliese/eda-86-ann-explained
    # https://www.kaggle.com/kmalit/bank-customer-churn-prediction
    categorical = ['Geography', 'Gender']
    onehot_names = ['France', 'Germany', 'Spain', 'Female', 'Male']
    raw_numeric = ['Age', 'CreditScore', 'Balance', 'EstimatedSalary', 'Tenure']

    # (new column, source column, membership test), in output column order.
    indicators = [
        ('1 Product', 'NumOfProducts', lambda x: x == 1),
        ('2 Product', 'NumOfProducts', lambda x: x == 2),
        ('1/2 Product', 'NumOfProducts', lambda x: x <= 2),
        ('3/4 Product', 'NumOfProducts', lambda x: x >= 3),
        ('Age40-70', 'Age', lambda x: 40 <= x <= 70),
        ('Balance-mid', 'Balance', lambda x: 75000 <= x <= 160000),
        ('Balance-low', 'Balance', lambda x: x <= 25000),
        ('LowCredit', 'CreditScore', lambda x: x <= 400),
        ('HighCredit', 'CreditScore', lambda x: x >= 800),
        ('Tenure6-8', 'Tenure', lambda x: 6 <= x <= 8),
    ]

    train_frame = pd.read_csv('train.csv', header=0)
    train_frame.drop(['CustomerId', 'Surname', 'RowNumber'], axis=1, inplace=True)
    X = train_frame.drop(['Exited'], axis=1)
    y = train_frame['Exited']

    holdout_frame = pd.read_csv('testing.csv', header=0)
    test_train = holdout_frame.drop(['Exited'], axis=1)
    test_val = holdout_frame['Exited']

    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=0.25,
                                                      random_state=1)
    print(len(y_train), len(y_val))

    ##### ENCODING #####
    # Force the two binary flags to floats 1.0 / 0.0.
    for frame in (X_train, X_val, test_train):
        for flag in ('HasCrCard', 'IsActiveMember'):
            frame[flag] = frame[flag].apply(lambda x: 1. if x == 1 else 0.)

    # The encoder only ever sees the training split.
    enc = OneHotEncoder().fit(X_train[categorical].to_numpy())

    def one_hot(frame):
        """Swap the categorical columns of ``frame`` for one-hot columns."""
        category_values = frame[categorical].to_numpy()
        remainder = frame.drop(categorical, axis=1)
        remainder.reset_index(drop=True, inplace=True)
        encoded = pd.DataFrame(data=enc.transform(category_values).toarray(),
                               columns=onehot_names)
        combined = pd.concat([remainder, encoded], axis=1)
        # drop the extra (reference-level) columns
        combined.drop(['France', 'Female'], axis=1, inplace=True)
        return combined

    X_train = one_hot(X_train)
    X_val = one_hot(X_val)
    test_train = one_hot(test_train)

    ###### Oversample training data #####
    X_train, y_train = SVMSMOTE(random_state=101).fit_resample(X_train, y_train)

    # Indicator features; the raw source columns are removed afterwards.
    # (The original also carried disabled experiments: 4-way Balance bins,
    # Tenure1-2, and Balance/Age and CreditScore/Age ratios.)
    for frame in (X_train, X_val, test_train):
        for new_name, source, belongs in indicators:
            frame[new_name] = frame[source].apply(
                lambda x, belongs=belongs: 1 if belongs(x) else 0)
        frame.drop(['NumOfProducts'], axis=1, inplace=True)
        frame.drop(raw_numeric, axis=1, inplace=True)

    return X_train, y_train, X_val, y_val, test_train, test_val
def Retrain_Model_10_Iterates_SVMSMOTE(target, title, max_depth=3, n_esti=160,
                                       withexperience=False, color='YlGnBu'):
    """Train and score XGBoost on SVMSMOTE-oversampled data, three times.

    Each iteration shuffles ``df_model_draft``, assigns the first 80% of
    the unique ``HospID`` values to train and the rest to test, oversamples
    the training part with ``SVMSMOTE()``, fits an ``XGBClassifier`` and
    collects the metrics returned by ``Make_Confusion_Matrix`` plus AUROC.

    Args:
        target: name of the label column in ``df_model_draft``.
        title: title forwarded to ``Make_Confusion_Matrix``.
        max_depth: ``max_depth`` of the XGBoost model.
        n_esti: ``n_estimators`` of the XGBoost model.
        withexperience: when ``False`` (identity check) only the id/outcome
            columns are dropped; otherwise the experience columns are
            dropped as well.
        color: colour map forwarded to ``Make_Confusion_Matrix``.

    Returns:
        List with one metrics mapping per iteration.
    """
    id_and_outcome_cols = ['HospID', 'SiteID', 'surgid', 'Complics',
                           'STSRCHOSPD', 'STSRCOM']
    experience_cols = ['HospID_Reop_CABG', 'HospID_total_CABG', 'surgyear',
                       'HospID_total_cardiac_surgery',
                       'surgid_total_cardiac_surgery', 'surgid_total_CABG',
                       'surgid_Reop_CABG']
    if withexperience is False:
        excluded = id_and_outcome_cols
    else:
        excluded = id_and_outcome_cols + experience_cols

    train_fraction = 0.8
    matrics = []
    seed(2145)
    for iteration in range(1, 4):
        # The drawn value is never used; the call is kept so the random
        # stream is consumed exactly as before.
        randint(1, 2021)
        print("Start Iterate Number {}:".format(iteration))

        hospital_ids = df_model_draft["HospID"].unique()
        cut = int(len(hospital_ids) * train_fraction)
        shuffled = df_model_draft.sample(frac=1).reset_index(drop=True)  # For shuffling your data
        train_df = shuffled[shuffled.HospID.isin(hospital_ids[:cut])]
        test_df = shuffled[shuffled.HospID.isin(hospital_ids[cut:])]
        print("\nTRAIN DATAFRAME\n", train_df.shape)
        print("\nTEST DATAFRAME\n", test_df.shape)

        X_train, y_train = train_df.drop(excluded, axis=1), train_df[target]
        X_test, y_test = test_df.drop(excluded, axis=1), test_df[target]

        # fit and apply the transform
        X_over, y_over = SVMSMOTE().fit_resample(X_train, y_train)

        # summarize class distribution
        print("after under sampling")
        counter = Counter(y_over)
        print(counter)
        print('Estimate: %.3f' % (counter[0] / counter[1]))

        model = XGBClassifier(objective='binary:logistic',
                              eval_metric='logloss',
                              max_depth=max_depth,
                              learning_rate=0.1,
                              n_estimators=n_esti)
        model.fit(X_over, y_over)
        y_pred = model.predict(X_test)

        mats = Make_Confusion_Matrix(confusion_matrix(y_test, y_pred),
                                     categories=categories,
                                     cmap=color,
                                     title=title,
                                     group_names=labels,
                                     y_pred=y_pred,
                                     y_test=y_test)
        mats['AUROC'] = roc_auc_score(
            y_test, model.predict_proba(X_test.values)[:, 1])
        matrics.append(mats)
    return matrics


# Disabled driver code kept from the original file:
# matrics_xgb = Retrain_Model_10_Iterates_SVMSMOTE('STSRCOM', 'STSRCOM SVMSMOTE with experience',3,180)
# matrics_xgb_df = pd.DataFrame(matrics_xgb)
# matrics_xgb_df.loc['Mean'] = matrics_xgb_df.mean()
# matrics_xgb_df.loc['Std'] = matrics_xgb_df.std()
# print(matrics_xgb_df)
# matrics_xgb_df.to_csv("/model_outputs/STSRCOM SVMSMOTE with experience.csv")
#
# matrics_xgb = Retrain_Model_10_Iterates_SVMSMOTE('STSRCOM', 'STSRCOM SVMSMOTE without experience',3,180,True)
# matrics_xgb_df = pd.DataFrame(matrics_xgb)
# matrics_xgb_df.loc['Mean'] = matrics_xgb_df.mean()
# matrics_xgb_df.loc['Std'] = matrics_xgb_df.std()
# print(matrics_xgb_df)
# matrics_xgb_df.to_csv("/model_outputs/STSRCOM SVMSMOTE without experience.csv")
#
# matrics_xgb = Retrain_Model_10_Iterates_SVMSMOTE('STSRCHOSPD', 'STSRCHOSPD SVMSMOTE with experience',3,180,color='RdPu')
# matrics_xgb_df = pd.DataFrame(matrics_xgb)
# matrics_xgb_df.loc['Mean'] = matrics_xgb_df.mean()
# matrics_xgb_df.loc['Std'] = matrics_xgb_df.std()
# print(matrics_xgb_df)
# matrics_xgb_df.to_csv("/model_outputs/STSRCHOSPD SVMSMOTE with experience.csv")
#
# matrics_xgb = Retrain_Model_10_Iterates_SVMSMOTE('STSRCHOSPD', 'STSRCHOSPD SVMSMOTE without experience',3,180,True,color='RdPu')
# matrics_xgb_df = pd.DataFrame(matrics_xgb)
# matrics_xgb_df.loc['Mean'] = matrics_xgb_df.mean()
# matrics_xgb_df.loc['Std'] = matrics_xgb_df.std()
# print(matrics_xgb_df)
# matrics_xgb_df.to_csv("/model_outputs/STSRCHOSPD SVMSMOTE without experience.csv")
bt_enc_sc = np.concatenate([bt_enc, bt_sc], axis=1) # """#Separação das variaveis""" X = bt_enc_sc y = battle.iloc[:, -1].values # """Tratando os dados faltantes""" le = LabelEncoder() y = le.fit_transform(y) # """Como há poucos dados é possivel aplicar a técnica de dados sintéticos, aumentando o volume de dados""" sm = SVMSMOTE(random_state=42, k_neighbors=3, out_step=0.4) X, y = sm.fit_resample(X, y) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) # """Treinamento do modelo""" classifier = SVC() classifier.fit(X_train, y_train) param_grid = { 'C': [0.01, 0.1, 1, 10, 100, 1000], 'gamma': [10, 1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
def svm_smote(X, y):
    """Oversample ``(X, y)`` with ``SVMSMOTE(random_state=42)``.

    Returns:
        The resampled ``(X_res, y_res)`` pair.
    """
    sampler = SVMSMOTE(random_state=42)
    features, labels = sampler.fit_resample(X, y)
    return features, labels
def Retrain_Model_10_Iterates_SVMSMOTE(target,
                                       title,
                                       max_depth=3,
                                       n_esti=160,
                                       lr=0.1,
                                       withexperience=False,
                                       color='YlGnBu'):
    """Train and score XGBoost on SVMSMOTE-oversampled, hospital-grouped splits.

    ``df_model_draft`` is split ten times with ``GroupShuffleSplit`` (80%
    train, grouped by ``HospID``). For each split the training part is
    oversampled with ``SVMSMOTE()``, an ``XGBClassifier`` is fitted, and the
    metrics returned by ``Make_Confusion_Matrix`` plus AUROC are collected.

    Args:
        target: name of the label column in ``df_model_draft``.
        title: title forwarded to ``Make_Confusion_Matrix``.
        max_depth: ``max_depth`` of the XGBoost model.
        n_esti: ``n_estimators`` of the XGBoost model.
        lr: ``learning_rate`` of the XGBoost model.
        withexperience: when ``False`` (identity check) only the id/outcome
            columns are dropped; otherwise the experience columns are
            dropped as well.
        color: colour map forwarded to ``Make_Confusion_Matrix``.

    Returns:
        List with one metrics mapping per split.
    """
    matrics = []
    seed(2145)
    groups = df_model_draft['HospID']

    excluded = ['SiteID', 'surgid', 'Complics', 'STSRCHOSPD', 'STSRCOM']
    if withexperience is not False:
        excluded = excluded + [
            'HospID_Reop_CABG', 'HospID_total_CABG', 'surgyear',
            'HospID_total_cardiac_surgery', 'surgid_total_cardiac_surgery',
            'surgid_total_CABG', 'surgid_Reop_CABG'
        ]
    X = df_model_draft.drop(excluded, axis=1)
    y = df_model_draft[target]

    print(groups.shape)
    print(groups.unique())
    gss = GroupShuffleSplit(n_splits=10, train_size=.8, random_state=42)

    # HospID is only the grouping key; it is removed from the model features.
    features = X.drop(['HospID'], axis=1)

    for fold, (train_idx, test_idx) in enumerate(gss.split(X, y, groups),
                                                 start=1):
        print("TRAIN:", train_idx, "TEST:", test_idx)
        if fold == 1:
            print(features.columns.tolist())

        # split() yields *positional* indices, so rows are selected with
        # iloc; loc would only be right for a default 0..n-1 index.
        X_train, y_train = features.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = features.iloc[test_idx], y.iloc[test_idx]
        print("\nTRAIN DATAFRAME\n", X_train.shape)
        print("\nTEST DATAFRAME\n", X_test.shape)

        # fit and apply the transform
        sm = SVMSMOTE()
        X_over, y_over = sm.fit_resample(X_train, y_train)

        # summarize class distribution (SVMSMOTE oversamples)
        print("after over sampling")
        counter = Counter(y_over)
        print(counter)
        estimate = counter[0] / counter[1]
        print('Estimate: %.3f' % estimate)

        model = XGBClassifier(objective='binary:logistic',
                              eval_metric='logloss',
                              max_depth=max_depth,
                              learning_rate=lr,
                              n_estimators=n_esti)
        model.fit(X_over, y_over)
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        mats = Make_Confusion_Matrix(cm,
                                     categories=categories,
                                     cmap=color,
                                     title=title,
                                     group_names=labels,
                                     y_pred=y_pred,
                                     y_test=y_test)
        auc = roc_auc_score(y_test, model.predict_proba(X_test.values)[:, 1])
        mats['AUROC'] = auc
        matrics.append(mats)
    return matrics
def process2(self, Xtr, ytr, Xts):
    """Optionally PCA-project the features and oversample the training set.

    Steps, each controlled by an attribute of ``self``:

    * ``self.pca == True``: fit a PCA on ``Xtr`` only and apply it to both
      ``Xtr`` and ``Xts``. The number of components is ``self.dim_red``
      when that is truthy; otherwise it is derived from the cumulative
      explained-variance ratio and ``self.retain_info``.
    * ``self.smote_fct == True``: oversample ``(Xtr, ytr)`` with SVMSMOTE.
    * ``self.adasyn == True``: oversample ``(Xtr, ytr)`` with ADASYN.

    Args:
        Xtr: training features.
        ytr: training labels.
        Xts: test features; transformed by the PCA but never resampled.

    Returns:
        ``(Xtr, ytr, Xts)`` after the enabled steps.
    """
    # PCA is fitted on the training data only; an earlier variant that
    # fitted on train+test concatenated was disabled in the original.
    if self.pca == True:
        if self.dim_red:
            # Fixed: the original referenced a bare, undefined `dim_red`.
            pca = PCA(self.dim_red).fit(Xtr)
        else:
            pca = PCA().fit(Xtr)
            cumulative = pca.explained_variance_ratio_.cumsum()
            if self.retain_info == 1:
                info_retain = np.argmax(cumulative)
                pca = PCA(info_retain).fit(Xtr)
            else:
                info_retain = np.where(cumulative >= self.retain_info)
                if (info_retain[0][0] > 0):
                    pca = PCA(info_retain[0][0]).fit(Xtr)
                elif (info_retain[0][0] == 0):
                    # Edge case so that we do not reduce to 0 dimensions.
                    pca = PCA(1).fit(Xtr)
        Xtr = pca.transform(Xtr)
        Xts = pca.transform(Xts)

    ### Optional oversampling: SVMSMOTE or ADASYN
    if self.smote_fct == True:
        sm = SVMSMOTE(sampling_strategy='not majority',
                      random_state=41,
                      k_neighbors=self.k_neighbors)
        Xtr, ytr = sm.fit_resample(Xtr, ytr)
        print('Resampled dataset shape %s' % Counter(ytr))
    if self.adasyn == True:
        print('Original dataset shape %s' % Counter(ytr))
        ada = ADASYN(random_state=42, n_neighbors=self.k_neighbors)
        Xtr, ytr = ada.fit_resample(Xtr, ytr)
        print('Resampled dataset shape %s' % Counter(ytr))
    return Xtr, ytr, Xts
def predict(self, X):
    """Predict labels for ``X`` with dynamic ensemble selection.

    The stored selection set (``self.previous_X`` / ``self.previous_y``) is
    optionally oversampled according to ``self.oversampler``, a DES model
    chosen by ``self.desMethod`` is built over ``self.ensemble_``, and that
    model predicts ``X``. With fewer than two ensemble members the first
    member predicts directly.

    Raises:
        ValueError: if ``X`` has a different number of features than
            ``self.X_``.
    """
    # The estimator must have been fitted, and X must be a valid array.
    check_is_fitted(self, "classes_")
    X = check_array(X)
    if X.shape[1] != self.X_.shape[1]:
        raise ValueError("number of features does not match")

    dsel_X, dsel_y = self.previous_X, self.previous_y

    # Neighbour count: 5, capped by (count of the first class label) - 1.
    _, class_counts = np.unique(dsel_y, return_counts=True)
    k_neighbors = min(5, class_counts[0] - 1)

    method = self.oversampler
    if k_neighbors > 0:
        if method == "SMOTE":
            sampler = SMOTE(random_state=42, k_neighbors=k_neighbors)
            dsel_X, dsel_y = sampler.fit_resample(dsel_X, dsel_y)
        elif method == "svmSMOTE":
            # A failing SVMSMOTE leaves the selection set untouched.
            try:
                sampler = SVMSMOTE(random_state=42, k_neighbors=k_neighbors)
                dsel_X, dsel_y = sampler.fit_resample(dsel_X, dsel_y)
            except ValueError:
                pass
        elif method in ("borderline1", "borderline2"):
            kind = 'borderline-1' if method == "borderline1" else 'borderline-2'
            sampler = BorderlineSMOTE(random_state=42,
                                      k_neighbors=k_neighbors,
                                      kind=kind)
            dsel_X, dsel_y = sampler.fit_resample(dsel_X, dsel_y)
        elif method == "ADASYN":
            # A failing ADASYN leaves the selection set untouched.
            try:
                sampler = ADASYN(random_state=42, n_neighbors=k_neighbors)
                dsel_X, dsel_y = sampler.fit_resample(dsel_X, dsel_y)
            except (RuntimeError, ValueError):
                pass
        elif method == "SLS":
            sampler = Safe_Level_SMOTE(n_neighbors=k_neighbors)
            dsel_X, dsel_y = sampler.sample(dsel_X, dsel_y)

    # KNORAE is both an explicit option and the fallback.
    selector_name = self.desMethod
    if selector_name == "KNORAU":
        des = KNORAU(self.ensemble_, random_state=42)
    elif selector_name == "KNN":
        des = DESKNN(self.ensemble_, random_state=42)
    elif selector_name == "Clustering":
        des = DESClustering(self.ensemble_, random_state=42)
    else:
        des = KNORAE(self.ensemble_, random_state=42)

    if len(self.ensemble_) < 2:
        return self.ensemble_[0].predict(X)

    des.fit(dsel_X, dsel_y)
    return des.predict(X)
# borderline-SMOTE with SVM for imbalanced dataset from collections import Counter from sklearn.datasets import make_classification from imblearn.over_sampling import SVMSMOTE from plotDataset import plot_dataset X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1) counter = Counter(y) print(counter) plot_dataset(X, y, counter) oversample = SVMSMOTE() X, y = oversample.fit_resample(X, y) counter = Counter(y) print(counter) plot_dataset(X, y, counter)
columns=X_train.columns[rfe.support_])
# Test features reduced by the fitted RFE, keeping the selected column names.
X_test_RFE = pd.DataFrame(rfe.transform(X_test), columns=X_test.columns[rfe.support_])
print(X_train.shape, X_train_RFE.shape)
#%%
from imblearn.over_sampling import ADASYN, SMOTE, SVMSMOTE
#OVERSAMPLING
# Three oversamplers, each given n_jobs=-1.
sampler_smote = SMOTE(n_jobs=-1)
sampler_svm = SVMSMOTE(n_jobs=-1)
sampler_adasyn = ADASYN(n_jobs=-1)
# Each sampler is applied to the full-feature X_train (not X_train_RFE),
# with the target flattened via ravel().
X_smote, y_smote = sampler_smote.fit_resample(X=X_train, y=y_train.ravel())
X_svm, y_svm = sampler_svm.fit_resample(X=X_train, y=y_train.ravel())
X_adasyn, y_adasyn = sampler_adasyn.fit_resample(X=X_train, y=y_train.ravel())
#%%
X_smote.shape
#%%
#baseline
# Random forest trained on the original (non-resampled) training data.
rf = ensemble.RandomForestClassifier(n_estimators=100, max_depth=8, criterion="entropy", n_jobs=-1)
rf.fit(X=X_train, y=y_train.ravel())
#FROM NOW ON, USE THE TUNED VERSION ALTHOUGH WE SHOULD RE-TUNE
train_5 = []
test_5 = []
# Containers for five train/test folds (train_1..4 and test_1..4 are
# defined before this excerpt).
train_arr = [train_1, train_2, train_3, train_4, train_5]
test_arr = [test_1, test_2, test_3, test_4, test_5]
x_data = []
y_data = []
# Convert every row of `data` to floats in place; element 0 is taken as the
# label and the remaining elements as the features.
for i in data:
    for l in range(len(i)):
        i[l] = float(i[l])
    x_data.append(np.array(i[1:]))
    y_data.append(np.array(np.array(i[0])))
# Oversample the whole dataset before any train/test separation.
sm = SVMSMOTE(random_state=1)
x_data, y_data = sm.fit_resample(x_data, y_data)
# Turn the labels into a column vector.
y_data = y_data.reshape([len(y_data), 1])
# 20% of the resampled rows.
test_num = int(len(x_data) * 0.2)
for i in range(1):
    result = {}
    # NOTE(review): `id` shadows the Python builtin of the same name.
    id = 1
    x_data_train = x_data
    y_data_train = y_data
    # Single zero rows, both as wide as one feature row.
    x_data_test = np.zeros([1, len(x_data_train[0])])
    y_data_test = np.zeros([1, len(x_data_train[0])])
    delPoint = test_num * i
    print(x_data_train[0])
    print(x_data_test.shape)
    print(x_data_train.shape)
def getAllCleanedDataStable(standardize=0, binning=0):
    """Load, encode, oversample and optionally bin/standardise the churn data.

    Reads ``train.csv`` and ``testing.csv`` from the working directory,
    splits the training file 75/25 into train/validation
    (``random_state=42``), one-hot encodes ``Geography``/``Gender`` with an
    encoder fitted on the training split and oversamples the training split
    with ``SVMSMOTE(random_state=5)``.

    Args:
        standardize: when equal to 1, scale ``CreditScore``,
            ``NumOfProducts`` and ``Balance`` with scalers fitted on the
            training split, and z-score ``EstimatedSalary`` and ``Tenure``
            using each frame's own mean and standard deviation.
        binning: when equal to 1, replace ``NumOfProducts`` by four
            indicator columns.

    Returns:
        ``(X_train, y_train, X_val, y_val, test_train, test_val)``.
    """
    categorical = ['Geography', 'Gender']
    onehot_names = ['France', 'Germany', 'Spain', 'Female', 'Male']

    train_frame = pd.read_csv('train.csv', header=0)
    train_frame.drop(['CustomerId', 'Surname', 'RowNumber'], axis=1, inplace=True)
    X = train_frame.drop(['Exited'], axis=1)
    y = train_frame['Exited']

    holdout_frame = pd.read_csv('testing.csv', header=0)
    test_train = holdout_frame.drop(['Exited'], axis=1)
    test_val = holdout_frame['Exited']

    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=0.25,
                                                      random_state=42)
    print(len(y_train), len(y_val))

    ##### ENCODING #####
    # Force the two binary flags to floats 1.0 / 0.0.
    for frame in (X_train, X_val, test_train):
        for flag in ('HasCrCard', 'IsActiveMember'):
            frame[flag] = frame[flag].apply(lambda x: 1. if x == 1 else 0.)

    # The encoder only ever sees the training split.
    enc = OneHotEncoder().fit(X_train[categorical].to_numpy())

    def one_hot(frame):
        """Swap the categorical columns of ``frame`` for one-hot columns."""
        category_values = frame[categorical].to_numpy()
        remainder = frame.drop(categorical, axis=1)
        remainder.reset_index(drop=True, inplace=True)
        encoded = pd.DataFrame(data=enc.transform(category_values).toarray(),
                               columns=onehot_names)
        combined = pd.concat([remainder, encoded], axis=1)
        # drop the extra (reference-level) columns
        combined.drop(['France', 'Female'], axis=1, inplace=True)
        return combined

    X_train = one_hot(X_train)
    X_val = one_hot(X_val)
    test_train = one_hot(test_train)

    ###### Oversample training data #####
    # Original author's notes: random_state=101 gives 0.618,
    # random_state=5 gives 0.61846. SMOTE, ADASYN and random over/under
    # sampling were tried and disabled.
    X_train, y_train = SVMSMOTE(random_state=5).fit_resample(X_train, y_train)

    if binning == 1:
        # One indicator per product count; Balance and Age binning were
        # disabled experiments in the original.
        product_bins = ((1, '1 Product'), (2, '2 Product'),
                        (3, '3 Product'), (4, '4 Product'))
        for frame in (X_train, X_val, test_train):
            for count, name in product_bins:
                frame[name] = frame['NumOfProducts'].apply(
                    lambda x, count=count: 1 if x == count else 0)
            frame.drop(['NumOfProducts'], axis=1, inplace=True)

    ##### Control
    if standardize != 1:
        return X_train, y_train, X_val, y_val, test_train, test_val

    ###### Standarize and Normalize #####
    # NOTE(review): with binning == 1 the 'NumOfProducts' column has been
    # dropped above, so this selection looks like it would raise - confirm.
    frames = (X_train, X_val, test_train)
    scaled_cols = ['CreditScore', 'NumOfProducts']
    scale = StandardScaler().fit(X_train[scaled_cols])
    for frame in frames:
        frame[scaled_cols] = scale.transform(frame[scaled_cols])

    robust_scale = RobustScaler().fit(X_train[['Balance']])
    for frame in frames:
        frame[['Balance']] = robust_scale.transform(frame[['Balance']])

    # Salary and tenure are z-scored with the statistics of the frame being
    # transformed (not the training statistics).
    for column in ('EstimatedSalary', 'Tenure'):
        for frame in frames:
            centre = frame[column].mean()
            spread = frame[column].std()
            frame[column] = frame[column].apply(
                lambda x, centre=centre, spread=spread: (x - centre) / spread)

    return X_train, y_train, X_val, y_val, test_train, test_val
def partial_fit(self, X, y, classes=None):
    """Update the ensemble with one chunk of data.

    Steps, in order:

    1. Optionally oversample the chunk with the strategy named by
       ``self.oversampler``.
    2. Score every current member on the *raw* chunk using ``ba``.
    3. Prune members whose score is not above ``0.5 + self.alpha`` (only
       when there is more than one member), then drop the worst remaining
       member if the ensemble would exceed ``self.ensemble_size`` after the
       new candidate is added.
    4. Train a clone of ``self._base_clf`` on the (possibly resampled)
       chunk and append it.

    Args:
        X: Feature matrix of the chunk (validated with ``check_X_y``).
        y: Labels of the chunk.
        classes: Stored as ``self.classes_`` when
            ``_check_partial_fit_first_call`` reports a first call.

    Returns:
        None. The estimator is updated in place.
    """
    if not hasattr(self, "_base_clf"):
        self.set_base_clf()
    X, y = check_X_y(X, y)
    if _check_partial_fit_first_call(self, classes):
        self.classes_ = classes
        self.ensemble_ = []

    self.X_, self.y_ = X, y
    train_X, train_y = X, y

    # The neighbourhood used by the SMOTE family cannot be larger than the
    # number of *other* samples in the class being oversampled, so bound it
    # by the smallest class in the chunk (not by whichever label sorts
    # first, which may well be the majority).
    _, counts = np.unique(train_y, return_counts=True)
    k_neighbors = min(5, int(counts.min()) - 1)

    # With fewer than two samples in the smallest class there is nothing to
    # interpolate between; train on the raw chunk instead.
    if k_neighbors > 0:
        if self.oversampler == "SMOTE":
            smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
            train_X, train_y = smote.fit_resample(train_X, train_y)
        elif self.oversampler == "svmSMOTE":
            try:
                svm_smote = SVMSMOTE(random_state=42,
                                     k_neighbors=k_neighbors)
                train_X, train_y = svm_smote.fit_resample(train_X, train_y)
            except ValueError:
                # Best effort: if the sampler rejects this chunk, keep the
                # raw chunk rather than aborting the update.
                pass
        elif self.oversampler == "borderline1":
            borderline = BorderlineSMOTE(random_state=42,
                                         k_neighbors=k_neighbors,
                                         kind='borderline-1')
            train_X, train_y = borderline.fit_resample(train_X, train_y)
        elif self.oversampler == "borderline2":
            borderline = BorderlineSMOTE(random_state=42,
                                         k_neighbors=k_neighbors,
                                         kind='borderline-2')
            train_X, train_y = borderline.fit_resample(train_X, train_y)
        elif self.oversampler == "ADASYN":
            try:
                adasyn = ADASYN(random_state=42, n_neighbors=k_neighbors)
                train_X, train_y = adasyn.fit_resample(train_X, train_y)
            except RuntimeError:
                # Best effort: if the sampler cannot process this chunk,
                # keep the raw chunk rather than aborting the update.
                pass
        elif self.oversampler == "SLS":
            sls = Safe_Level_SMOTE(n_neighbors=k_neighbors)
            train_X, train_y = sls.sample(train_X, train_y)

    # Score all current members on the raw (not resampled) chunk.
    scores = np.array([ba(y, clf.predict(X)) for clf in self.ensemble_])

    # Prune members that are not better than 0.5 + alpha. The scores are
    # filtered together with the members so that both stay index-aligned
    # for the size check below.
    if len(self.ensemble_) > 1:
        keep = scores > (0.5 + self.alpha)
        self.ensemble_ = [
            clf for clf, good in zip(self.ensemble_, keep) if good
        ]
        scores = scores[keep]

    # Make room for the new candidate by removing the weakest member.
    if len(self.ensemble_) > self.ensemble_size - 1:
        worst = int(np.argmin(scores))
        del self.ensemble_[worst]

    # Train a new candidate and add it to the ensemble.
    self.ensemble_.append(base.clone(self._base_clf).fit(train_X, train_y))
predict_adasyn, average='binary') fpr_adasyn[idx, :], tpr_adasyn[idx, :], thresholds_adasyn[ idx, :] = roc_curve(predict_adasyn, adasyn_label_test, pos_label=1) auc_adasyn[idx, :] = auc(fpr_adasyn[idx, :], tpr_adasyn[idx, :]) ### benchmark set ### predict_adasyn_benchmark[idx, :] = model_adasyn.predict(benchmark_data) ############################################################################################################################################################# svmsmote = SVMSMOTE(sampling_strategy="minority") X_svmsmote, y_svmsmote = svmsmote.fit_resample( data.reshape( (num_channels * (size_freq_parameters + size_time_parameters), -1)).T, labels) num_svmsmote_samples_preictal[idx, :] = np.count_nonzero( y_svmsmote) # need this only to report it in a paper num_svmsmote_samples_interictal[ idx, :] = np.shape(y_svmsmote)[0] - np.count_nonzero(y_svmsmote) svmsmote_data_train, svmsmote_data_test, svmsmote_label_train, svmsmote_label_test = train_test_split( X_svmsmote, y_svmsmote, test_size=0.3, shuffle=True) model_svmsmote = LinearSVC(penalty="l1", dual=False, max_iter=5000) model_svmsmote.fit(svmsmote_data_train, svmsmote_label_train) parameters_svmsmote = model_svmsmote.get_params() coefficients_svmsmote.append(model_svmsmote.coef_.tolist())
print(cleaned_train_data)

# Separate the features from the label column. `columns=` is used because
# passing the axis positionally (`drop('target', 1)`) is rejected by
# pandas 2.x.
X = cleaned_train_data.drop(columns='target')
y = cleaned_train_data.target

# Baseline split on the original data, scaled with statistics fitted on the
# training part only.
X_train, X_test, y_train, y_test = tts(X,
                                       y,
                                       test_size=0.25,
                                       random_state=42)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Oversampled variant of the same pipeline.
# NOTE(review): the resampling is fitted on the full data set *before* the
# split, so synthetic samples derived from what becomes test data can end up
# in the training part (and vice versa). Scores on X_test_svm are therefore
# likely optimistic -- consider resampling X_train only.
svm_smote = SVMSMOTE(sampling_strategy='minority',
                     random_state=42,
                     k_neighbors=5)
X_svm_smote, y_svm_smote = svm_smote.fit_resample(X, y)
X_train_svm, X_test_svm, y_train_svm, y_test_svm = tts(X_svm_smote,
                                                       y_svm_smote,
                                                       test_size=0.25,
                                                       random_state=42)
# A fresh scaler is fitted for the resampled split; `sc` is rebound here.
sc = StandardScaler()
X_train_svm = sc.fit_transform(X_train_svm)
X_test_svm = sc.transform(X_test_svm)


def evaluate(model, X_test, y_test):
    """Print the mean absolute difference between predictions and labels.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: Reference values compared element-wise with the predictions.
    """
    y_pred = model.predict(X_test)
    errors = abs(y_pred - y_test)
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))