class SMOTER:
    """Adapter that exposes a ``SMOTE`` sampler through fit/transform.

    The wrapped sampler is built from the constructor arguments and kept on
    ``self.smote``. The keyword arguments are also stored on ``self.params``
    so they can be reported by :meth:`get_params`.
    """

    def __init__(self, *args, **kwargs):
        """Build the wrapped ``SMOTE`` instance.

        Args:
            *args: Positional arguments forwarded to ``SMOTE``. These are
                not recorded in ``self.params``.
            **kwargs: Keyword arguments forwarded to ``SMOTE`` and recorded
                in ``self.params``.
        """
        self.smote = SMOTE(*args, **kwargs)
        # Shallow copy so later changes to the caller's dict do not leak in.
        self.params = dict(kwargs)

    def fit(self, X, y):
        """Fit the wrapped sampler on ``X`` and ``y``.

        Returns:
            This instance, so that ``fit(X, y).transform(X, y)`` can be
            chained (the previous ``None`` return made that chain fail).
        """
        self.smote.fit(X, y)
        return self

    def transform(self, X, y=None):
        """Return whatever ``self.smote.sample(X, y)`` returns."""
        return self.smote.sample(X, y)

    def get_params(self, deep=True):
        """Return the keyword arguments given to the constructor.

        Args:
            deep: Accepted for signature compatibility and otherwise
                ignored. It now defaults to ``True`` so the method can be
                called without arguments.

        Returns:
            The ``self.params`` dictionary itself (not a copy).
        """
        return self.params
# Append every entry of lcol_num to colname, formatted as a string.
for i in lcol_num:
    colname.append("{}".format(i))

# Apply the preprocessing pipeline to both splits.
X_train = pipeline_preprocess.transform(X_train)
X_test = pipeline_preprocess.transform(X_test)

# Persist the column/category metadata and the label encoder + pipeline.
joblib.dump(
    [
        dtype,
        categorical_feat_classes,
        list_col_cat,
        list_idx_cat,
        categorical_onehot_idx,
        categorical_onehot_nval,
        colname,
    ],
    './model/las_kupedes_ultramikro_v3_var.sav',
)
#joblib.dump([le,pipeline_preprocess], './model/las_kupedes_ultramikro_v3_preprocess.sav')
joblib.dump(
    [le, pipeline_preprocess],
    './model/las_kupedes_ultramikro_v3_preprocess_wo_scaler.sav',
)

### Resampling unbalanced dataset
# (1) Over-sampling with SMOTE
# Class 0 keeps its current count; class 1 is requested at
# count0 * r / (1 - r), i.e. a def_ratio share of the two-class total.
def_ratio = 0.15
n_class0 = Y_train.value_counts()[0]
n_class1_target = int(n_class0 * (def_ratio / (1 - def_ratio)))
sm = SMOTE(random_state=42, ratio={0: n_class0, 1: n_class1_target})
sm.fit(X_train, Y_train)
X_train_upsampled, Y_train_upsampled = sm.sample(X_train, Y_train)

# (2) Class weight
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

# Per-sample weights: class 1 rows weigh ten times as much as class 0 rows.
sample_weight = compute_sample_weight(
    class_weight={0: 1, 1: 10},
    y=Y_train_upsampled,
)

### CV - XGBoost
from sklearn.model_selection import KFold

K = 5
kf = KFold(n_splits=K, random_state=3228, shuffle=True)
xgb_preds = []
bst = bst_models(colname=colname, score_method="max")
print("Accuracy", acc)

##Random Forest
acc = do_cross_val_RForest(np.array(X_resampled), y_resampled, 10)
print("Accuracy", acc)

##Balancing by SMOTE
from imblearn.over_sampling import SMOTE

# Message templates shared by the class-count printouts below.
original_msg = 'Original dataset shape {}'
resampled_msg = 'Resampled dataset shape {}'

# First attempt: fit and resample in a single call.
print(original_msg.format(Counter(y)))
sm = SMOTE(random_state=0)
X_res, y_res = sm.fit_sample(X, y)
print(resampled_msg.format(Counter(y_res)))

from imblearn.over_sampling import SMOTE

# Second attempt with a different seed: fit, then sample as separate steps.
print(original_msg.format(Counter(y)))
sm = SMOTE(random_state=42)
sm.fit(X, y)
X_res, y_res = sm.sample(X, y)
print(resampled_msg.format(Counter(y_res)))

# Feed each resampled output back through the sampler two more times.
X_res1, y_res1 = sm.fit_sample(X_res, y_res)
print(resampled_msg.format(Counter(y_res1)))
X_res2, y_res2 = sm.fit_sample(X_res1, y_res1)
print(resampled_msg.format(Counter(y_res2)))

# Score each model on the final resampled data.
## Decision Tree
acc = do_cross_val_Decision(X_res2, y_res2, 10)
print("Accuracy", acc)

## Logistic Regression
acc = do_cross_val_LR(X_res2, y_res2, 10)
print("Accuracy", acc)

## Random Forest
acc = do_cross_val_RForest(X_res2, y_res2, 10)
print("Accuracy", acc)
'roof_type_e0e2', 'ground_floor_type_467b', 'ground_floor_type_b1b4', 'ground_floor_type_b440', 'ground_floor_type_bb5f', 'ground_floor_type_e26c', 'other_floor_type_441a', 'other_floor_type_67f9', 'other_floor_type_9eb0', 'other_floor_type_f962', 'position_1787', 'position_3356', 'position_bcab', 'position_bfba', 'plan_configuration_0448', 'plan_configuration_1442', 'plan_configuration_3fee', 'plan_configuration_6e81', 'plan_configuration_84cf', 'plan_configuration_8e3f', 'plan_configuration_a779', 'plan_configuration_cb88', 'plan_configuration_d2d9', 'legal_ownership_status_ab03', 'legal_ownership_status_bb5f', 'legal_ownership_status_c8e1', 'legal_ownership_status_cae1'])""" #Take a random sample of the oversampled new DF with the same numberof rows in the original DF to avoid overfitting y_sample = y_resampled.sample(n=10000, random_state=4561, axis=0) x_sample = x_resampled.sample(n=10000, random_state=4561, axis=0) y_sample.plot(kind='hist') y_sample x_sample x_resampled y_resampled #Testing split methods from sklearn.model_selection import train_test_split from sklearn.metrics import f1_score '''x_train, x_test, y_train, y_test = train_test_split(x_sample, y_sample, test_size = 0.30, random_state = 12, shuffle = True)''' from sklearn.model_selection import StratifiedShuffleSplit sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=1245)
def _sample(self, X, y):
    """Cluster the data, then oversample selected clusters with SMOTE.

    The clusterer is prepared via ``_set_cluster`` / ``_fit_cluster`` and its
    ``labels_`` are cached on ``self.labels``. For every class in
    ``self.ratio_`` the clusters returned by ``_filter_clusters`` are
    oversampled one at a time. Each cluster's share of the class's requested
    sample count comes from ``_calculate_sampling_weights``.

    Args:
        X: Feature matrix; indexed with boolean masks and concatenated as a
            NumPy array.
        y: Target vector aligned with ``X``.

    Returns:
        Tuple ``(X_resampled, y_resampled)``: copies of the inputs with the
        newly generated samples appended.
    """
    # Build and fit the clusterer, then remember each sample's cluster label.
    self._set_cluster()
    self._fit_cluster(X, y)
    self.labels = self._cluster_class.labels_

    X_resampled = X.copy()
    y_resampled = y.copy()

    with catch_warnings():
        filterwarnings("ignore", category=UserWarning, module="imblearn")

        for target_class in self.ratio_:
            n_to_generate = self.ratio_[target_class]
            clusters_to_use = self._filter_clusters(
                y, self._cluster_class.labels_, target_class)

            # No usable cluster for this class although samples were
            # requested: only a warning is issued and nothing is generated.
            # NOTE(review): the original comment said regular SMOTE is
            # applied in this case, but the code does not do that.
            if not clusters_to_use and n_to_generate > 0:
                warn("Class does not have a cluster where is dominant.")
                continue

            sampling_weights = self._calculate_sampling_weights(
                X, y, clusters_to_use, self.labels, target_class)

            for cluster in sampling_weights:
                in_cluster = self.labels == cluster
                X_cluster = X[in_cluster]
                y_cluster = y[in_cluster]
                n_obs = in_cluster.sum()

                # Position of the padding row, or -1 when none was added.
                padding_row = -1

                # SMOTE needs at least two distinct target values, so a
                # single-class cluster gets one all-zero row labelled with
                # some other class.
                if np.unique(y_cluster).size < 2:
                    all_classes = np.unique(y)
                    other_class = all_classes[all_classes != target_class][0]
                    padding_row = n_obs
                    X_cluster = np.concatenate(
                        (X_cluster, np.zeros((1, X.shape[1]))), axis=0)
                    y_cluster = np.concatenate(
                        (y_cluster, np.asarray(other_class).reshape((1, ))),
                        axis=0)

                minority_obs = y_cluster[y_cluster == target_class]
                n_new = n_to_generate * sampling_weights[cluster]
                cluster_ratio = {
                    target_class: int(round(n_new) + minority_obs.size)
                }

                # k_neighbors must stay below the number of target-class
                # observations available in this cluster.
                k_neighbors = min(self.k_neighbors, minority_obs.size - 1)

                over_sampler = SMOTE(ratio=cluster_ratio,
                                     k_neighbors=k_neighbors)
                over_sampler.fit(X_cluster, y_cluster)
                X_new, y_new = over_sampler.sample(X_cluster, y_cluster)

                # Drop the padding row again if one was inserted.
                if padding_row > 0:
                    X_new = np.delete(X_new, padding_row, axis=0)
                    y_new = np.delete(y_new, padding_row)

                # The first n_obs rows are the original cluster members;
                # keep only what the sampler generated after them.
                X_new = X_new[n_obs:]
                y_new = y_new[n_obs:]

                # Append the generated samples to the returned data.
                X_resampled = np.concatenate((X_resampled, X_new))
                y_resampled = np.concatenate((y_resampled, y_new))

    return X_resampled, y_resampled
# Select features on subset
x_data = compound_x.loc[:, avail_columns]
y_data = compound_y.copy()

# Create binary variable: 1 where the target is <= 10, otherwise 0
y_class = np.squeeze([int(y_val <= 10) for y_val in y_data])

# Smote
from custom_pipe_helper import SMOTER
import auto
# Imported before first use; in this section SMOTE was otherwise only
# imported after it had already been instantiated.
from imblearn.over_sampling import SMOTE

smote = SMOTE()
check = smote.fit(x_data, y_class)
# The stray argument-less ``smote.fit_sample()`` call was removed: it always
# raised TypeError, and the explicit fit/sample pair here does the resampling.
check = smote.sample(x_data, y_class)
check[0].shape
check[1]

# Create folds
# For each fold
#   SMOTE the train data
#   Train model
#   Evaluate model
from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
import itertools as it