Example #1
class SMOTER:
    """Thin wrapper exposing SMOTE through a transformer-like interface.

    Written against the legacy imblearn (< 0.4) API, where SMOTE has the
    separate fit() and sample() methods; current releases use fit_resample().
    """

    def __init__(self, *args, **kwargs):
        self.smote = SMOTE(*args, **kwargs)
        self.params = dict(kwargs)

    def fit(self, X, y):
        self.smote.fit(X, y)
        return self  # sklearn convention: fit returns the estimator

    def transform(self, X, y=None):
        # Note: SMOTE.sample needs y, so this cannot run inside a plain
        # sklearn Pipeline, whose transform() only passes X
        return self.smote.sample(X, y)

    def get_params(self, deep=True):
        return self.params
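
A minimal usage sketch for this wrapper (not from the source), assuming the legacy imblearn (< 0.4) fit()/sample() API it targets and toy data:

# Hypothetical usage of the SMOTER wrapper above
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
smoter = SMOTER(random_state=0)
X_res, y_res = smoter.fit(X, y).transform(X, y)  # delegates to SMOTE.sample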
Example #2
# Collect the numeric column labels as strings
for i in lcol_num:
    colname.append(str(i))

X_train = pipeline_preprocess.transform(X_train)
X_test = pipeline_preprocess.transform(X_test)

joblib.dump([dtype, categorical_feat_classes, list_col_cat, list_idx_cat,
             categorical_onehot_idx, categorical_onehot_nval, colname],
            './model/las_kupedes_ultramikro_v3_var.sav')
#joblib.dump([le,pipeline_preprocess], './model/las_kupedes_ultramikro_v3_preprocess.sav')
joblib.dump([le,pipeline_preprocess], './model/las_kupedes_ultramikro_v3_preprocess_wo_scaler.sav')

### Resampling unbalanced dataset
# (1) Over-sampling with SMOTE: keep class 0 at its current count and
# oversample class 1 to majority * def_ratio / (1 - def_ratio), so that
# class 1 makes up ~def_ratio of the resampled training set
def_ratio = 0.15
sm = SMOTE(random_state=42,
           ratio={0: Y_train.value_counts()[0],
                  1: int(Y_train.value_counts()[0] * (def_ratio / (1 - def_ratio)))})
# Legacy imblearn (< 0.4) API: fit() then sample(); current releases use
# fit_resample() with 'sampling_strategy' instead of 'ratio'
sm.fit(X_train, Y_train)
X_train_upsampled, Y_train_upsampled = sm.sample(X_train, Y_train)
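
With imblearn >= 0.4 the same resampling is one call; a sketch of the equivalent (n_major is our shorthand, not from the source):

# 'ratio' was renamed 'sampling_strategy'; fit()+sample() became fit_resample()
n_major = Y_train.value_counts()[0]
sm = SMOTE(random_state=42,
           sampling_strategy={0: n_major,
                              1: int(n_major * def_ratio / (1 - def_ratio))})
X_train_upsampled, Y_train_upsampled = sm.fit_resample(X_train, Y_train)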
# (2) Class weight: make each class-1 sample count ten times as much
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
sample_weight = compute_sample_weight(
    class_weight={0: 1, 1: 10},
    y=Y_train_upsampled,
)
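
A hedged sketch of where these per-sample weights go, assuming the xgboost classifier that the "CV - XGBoost" section below sets up (the model settings are illustrative):

# Hypothetical fit call; XGBClassifier accepts sample_weight in fit()
from xgboost import XGBClassifier

model = XGBClassifier(n_estimators=200, random_state=42)
model.fit(X_train_upsampled, Y_train_upsampled, sample_weight=sample_weight)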

### CV - XGBoost

from sklearn.model_selection import KFold
K = 5
kf = KFold(n_splits = K, random_state = 3228, shuffle = True)

xgb_preds = []
# bst_models is a project-specific helper (not shown in this snippet)
bst = bst_models(colname=colname, score_method="max")
Example #3
print("Accuracy", acc)
## Random Forest
acc = do_cross_val_RForest(np.array(X_resampled), y_resampled, 10)
print("Accuracy", acc)

## Balancing by SMOTE
from collections import Counter
from imblearn.over_sampling import SMOTE

print('Original dataset shape {}'.format(Counter(y)))
sm = SMOTE(random_state=0)
# Legacy imblearn (< 0.4): fit_sample() is fit_resample() in current releases
X_res, y_res = sm.fit_sample(X, y)
print('Resampled dataset shape {}'.format(Counter(y_res)))

# Same balancing via the separate fit() / sample() pair
sm = SMOTE(random_state=42)
sm.fit(X, y)
X_res, y_res = sm.sample(X, y)
print('Resampled dataset shape {}'.format(Counter(y_res)))
# Re-applying SMOTE to an already balanced set generates nothing new:
# with the default ratio both classes are equal after the first pass
X_res1, y_res1 = sm.fit_sample(X_res, y_res)
print('Resampled dataset shape {}'.format(Counter(y_res1)))
X_res2, y_res2 = sm.fit_sample(X_res1, y_res1)
print('Resampled dataset shape {}'.format(Counter(y_res2)))

## Decision Tree
acc = do_cross_val_Decision(X_res2, y_res2, 10)
print("Accuracy", acc)
## Logistic Regression
acc = do_cross_val_LR(X_res2, y_res2, 10)
print("Accuracy", acc)
## Random Forest
acc = do_cross_val_RForest(X_res2, y_res2, 10)
print("Accuracy", acc)
Example #4
       'roof_type_e0e2', 'ground_floor_type_467b', 'ground_floor_type_b1b4',
       'ground_floor_type_b440', 'ground_floor_type_bb5f',
       'ground_floor_type_e26c', 'other_floor_type_441a',
       'other_floor_type_67f9', 'other_floor_type_9eb0',
       'other_floor_type_f962', 'position_1787', 'position_3356',
       'position_bcab', 'position_bfba', 'plan_configuration_0448',
       'plan_configuration_1442', 'plan_configuration_3fee',
       'plan_configuration_6e81', 'plan_configuration_84cf',
       'plan_configuration_8e3f', 'plan_configuration_a779',
       'plan_configuration_cb88', 'plan_configuration_d2d9',
       'legal_ownership_status_ab03', 'legal_ownership_status_bb5f',
       'legal_ownership_status_c8e1', 'legal_ownership_status_cae1'])"""

# Take a random sample of the oversampled new DF with the same number of
# rows as the original DF to avoid overfitting
y_sample = y_resampled.sample(n=10000, random_state=4561, axis=0)
x_sample = x_resampled.sample(n=10000, random_state=4561, axis=0)
y_sample.plot(kind='hist')

# Inspect the sampled and resampled frames (bare expressions; notebook cells)
y_sample
x_sample
x_resampled
y_resampled

#Testing split methods
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
'''x_train, x_test, y_train, y_test = train_test_split(x_sample, y_sample, test_size = 0.30, random_state = 12,
                                                    shuffle = True)'''

from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=1245)
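
A hedged sketch of consuming these splits; the classifier is illustrative (not from the source), and f1_score is macro-averaged to stay agnostic about the number of target classes:

# Iterate the 10 stratified splits defined above and score each one
from sklearn.linear_model import LogisticRegression

for train_idx, test_idx in sss.split(x_sample, y_sample):
    x_tr, x_te = x_sample.iloc[train_idx], x_sample.iloc[test_idx]
    y_tr, y_te = y_sample.iloc[train_idx], y_sample.iloc[test_idx]
    clf = LogisticRegression(max_iter=1000).fit(x_tr, y_tr)
    print(f1_score(y_te, clf.predict(x_te), average='macro'))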
Example #5
    def _sample(self, X, y):

        # Create the clusters and set the labels
        self._set_cluster()
        self._fit_cluster(X, y)

        self.labels = self._cluster_class.labels_

        X_resampled = X.copy()
        y_resampled = y.copy()

        with catch_warnings():
            filterwarnings("ignore", category=UserWarning, module="imblearn")

            for target_class in self.ratio_:

                n_to_generate = self.ratio_[target_class]

                clusters_to_use = self._filter_clusters(
                    y, self._cluster_class.labels_, target_class)

                # No cluster is dominated by the target class: warn and skip
                # generating samples for this class
                if not clusters_to_use and n_to_generate > 0:
                    warn("Class does not have a cluster where it is dominant.")

                else:
                    sampling_weights = self._calculate_sampling_weights(
                        X, y, clusters_to_use, self.labels, target_class)

                    for cluster in sampling_weights:
                        mask = self.labels == cluster
                        X_cluster = X[mask]
                        y_cluster = y[mask]

                        n_obs = mask.sum()

                        artificial_index = -1

                        # SMOTE needs at least two classes present in the
                        # cluster, so temporarily add one artificial
                        # observation of a different class
                        if np.unique(y_cluster).size < 2:
                            art_x = np.zeros((1, X.shape[1]))
                            artificial_index = n_obs

                            artificial_y = np.unique(y)[
                                np.unique(y) != target_class][0]

                            X_cluster = np.concatenate((X_cluster, art_x),
                                                       axis=0)
                            y_cluster = np.concatenate(
                                (y_cluster, np.asarray(artificial_y).reshape(
                                    (1, ))),
                                axis=0)

                        minority_obs = y_cluster[y_cluster == target_class]

                        n_new = n_to_generate * sampling_weights[cluster]

                        temp_dic = {
                            target_class:
                            int(round(n_new) + minority_obs.size)
                        }

                        # k_neighbors must be smaller than the number of
                        # minority observations in the cluster
                        if self.k_neighbors > minority_obs.size - 1:
                            k_neighbors = minority_obs.size - 1
                        else:
                            k_neighbors = self.k_neighbors

                        # Legacy imblearn (< 0.4): ratio= plus fit()/sample();
                        # current releases use sampling_strategy= and
                        # fit_resample()
                        over_sampler = SMOTE(ratio=temp_dic,
                                             k_neighbors=k_neighbors)
                        over_sampler.fit(X_cluster, y_cluster)

                        X_cluster_resampled, y_cluster_resampled = over_sampler.sample(
                            X_cluster, y_cluster)

                        # If an artificial observation was added, remove it
                        # now (artificial_index stays -1 when none was added)
                        if artificial_index >= 0:
                            X_cluster_resampled = np.delete(
                                X_cluster_resampled, artificial_index, axis=0)
                            y_cluster_resampled = np.delete(
                                y_cluster_resampled, artificial_index)

                        # Save the newly generated samples only
                        X_cluster_resampled = X_cluster_resampled[n_obs:, :]
                        y_cluster_resampled = y_cluster_resampled[n_obs:, ]

                        # Add the newly generated samples to the data to be returned
                        X_resampled = np.concatenate(
                            (X_resampled, X_cluster_resampled))
                        y_resampled = np.concatenate(
                            (y_resampled, y_cluster_resampled))

        return X_resampled, y_resampled
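
The k_neighbors clamp above is the detail most worth remembering; a standalone sketch of the same guard on toy data, using the current fit_resample API (all names here are illustrative):

# SMOTE requires k_neighbors < number of minority samples, so clamp it
import numpy as np
from imblearn.over_sampling import SMOTE

rng = np.random.RandomState(0)
X_cluster = rng.rand(54, 3)
y_cluster = np.array([0] * 50 + [1] * 4)   # only 4 minority samples

k = min(5, (y_cluster == 1).sum() - 1)     # the default k_neighbors=5 would fail
sm = SMOTE(k_neighbors=k, random_state=0)
X_res, y_res = sm.fit_resample(X_cluster, y_cluster)
print(X_res.shape)  # minority class oversampled to 50: 100 rows total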
Example #6
# Select features on subset
x_data = compound_x.loc[:, avail_columns]
y_data = compound_y.copy()
# Create binary variable
y_class = np.squeeze([int(y_val <= 10) for y_val in y_data])

# SMOTE
from custom_pipe_helper import SMOTER

import auto

from imblearn.over_sampling import SMOTE  # needed for the calls below

smote = SMOTE()

# Legacy imblearn (< 0.4) API: fit() then sample(); fit_sample(X, y) does
# both in one step and is renamed fit_resample() in current releases
smote.fit(x_data, y_class)
check = smote.sample(x_data, y_class)

check[0].shape  # resampled feature matrix
check[1]        # resampled labels

# Plan, sketched after the imports below:
#   Create folds
#   For each fold:
#     SMOTE the train data
#     Train model
#     Evaluate model

from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
import itertools as it
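
A minimal sketch of the plan above, consistent with these imports; SMOTE is applied to the training folds only, and the split and estimator settings are illustrative assumptions:

# Stratified folds; resample only the training part so the test fold keeps
# the original class distribution
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, test_idx in skf.split(x_data, y_class):
    X_tr, X_te = x_data.iloc[train_idx], x_data.iloc[test_idx]
    y_tr, y_te = y_class[train_idx], y_class[test_idx]

    # Oversample the minority class in the training folds (imblearn >= 0.4)
    X_tr_res, y_tr_res = SMOTE(random_state=0).fit_resample(X_tr, y_tr)

    clf = AdaBoostClassifier(random_state=0).fit(X_tr_res, y_tr_res)
    print(confusion_matrix(y_te, clf.predict(X_te)))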