Ejemplo n.º 1
0
    def data_oversample(self):
        """Split the data and oversample the train and validation sets.

        The split from ``self.data_sample_split()`` is redone, up to a fixed
        number of attempts, until every lesion class 0-4 appears in
        ``y_train``. The train and validation sets are then resampled with
        ``SVMSMOTE(random_state=42)``; the test set is returned untouched.

        Returns:
            Tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``.

        Raises:
            ValueError: if a lesion class is still missing from ``y_train``
                after the last attempt.
        """
        lesion_classes = (0, 1, 2, 3, 4)
        max_attempts = 10  # bound the retries in case the split is deterministic

        for _ in range(max_attempts):
            (x_train, x_val, x_test,
             y_train, y_val, y_test) = self.data_sample_split()

            # Compare against the label values themselves: `in` applied to a
            # pandas Series would test the index, not the values.
            present = set(y_train)
            missing = [c for c in lesion_classes if c not in present]
            if not missing:
                break

            for lesion in missing:
                print("lesion " + str(lesion) +
                      " not in y_train. Redoing sample split...")
        else:
            raise ValueError(
                "y_train is still missing lesion classes %s after %d sample "
                "splits" % (missing, max_attempts))

        print("Presampled train dataset: %s" % Counter(y_train))

        resample = SVMSMOTE(random_state=42)  # SVMSMOTE, SMOTENC

        x_train, y_train = resample.fit_resample(x_train, y_train)

        # NOTE(review): the validation set is oversampled as well, so
        # validation metrics are computed on partly synthetic samples --
        # confirm this is intended.
        x_val, y_val = resample.fit_resample(x_val, y_val)

        print("Resampled train dataset: %s" % Counter(y_train))

        return x_train, x_val, x_test, y_train, y_val, y_test
Ejemplo n.º 2
0
def test_svm_smote(data):
    """Check that two differently configured SVMSMOTE samplers agree.

    One sampler is built with only ``random_state``; the other additionally
    receives explicit ``NearestNeighbors`` and ``SVC`` estimator objects.
    Both are fitted on ``data`` (unpacked as the ``fit_resample`` arguments)
    and their resampled outputs must match.
    """
    seed_value = 42

    plain_sampler = SVMSMOTE(random_state=seed_value)
    explicit_sampler = SVMSMOTE(
        random_state=seed_value,
        k_neighbors=NearestNeighbors(n_neighbors=6),
        m_neighbors=NearestNeighbors(n_neighbors=11),
        svm_estimator=SVC(gamma='scale', random_state=seed_value),
    )

    X_plain, y_plain = plain_sampler.fit_resample(*data)
    X_explicit, y_explicit = explicit_sampler.fit_resample(*data)

    assert_allclose(X_plain, X_explicit)
    assert_array_equal(y_plain, y_explicit)
Ejemplo n.º 3
0
def svm_smote(X,
              y,
              visualize=False,
              pca2d=True,
              pca3d=True,
              tsne=True,
              pie_evr=True):
    """Oversample ``(X, y)`` with SVMSMOTE and optionally plot the result.

    Args:
        X: Feature matrix passed to ``SVMSMOTE.fit_resample``.
        y: Target labels passed to ``SVMSMOTE.fit_resample``.
        visualize: When truthy, plot the resampled class histogram and the
            PCA views of the resampled data.
        pca2d: Forwarded to ``pca_general`` as ``d2``.
        pca3d: Forwarded to ``pca_general`` as ``d3``.
        tsne: Accepted for interface compatibility; not used by this
            function.
        pie_evr: Forwarded to ``pca_general`` as ``pie_evr``.

    Returns:
        Tuple ``(X_res, y_res)`` with the resampled features and labels.
    """
    sampler = SVMSMOTE(random_state=42)
    X_res, y_res = sampler.fit_resample(X, y)

    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)

    return X_res, y_res
Ejemplo n.º 4
0
def svm_smote(X, y):
    """Balance a training set with SVMSMOTE.

    Args:
        X: Training features (without the class target).
        y: Training class target.

    Returns:
        Tuple ``(X, y)`` holding the resampled features and the resampled
        class target. The shape of the resampled features is printed.
    """
    sampler = SVMSMOTE(random_state=42)
    balanced_X, balanced_y = sampler.fit_resample(X, y)
    print('after balancing:', balanced_X.shape)
    return balanced_X, balanced_y
Ejemplo n.º 5
0
def borderline_smoth_func(train_x, train_y, target):
    """Oversample the training data with SVMSMOTE, logging class counts.

    Despite the name, the sampler in use is ``SVMSMOTE`` (the
    ``BorderlineSMOTE`` alternative is left commented out).

    Args:
        train_x: Training features passed to ``fit_resample``.
        train_y: Training targets; must support ``train_y[target]`` with a
            ``value_counts()`` method both before and after resampling.
        target: Key of the target column inside ``train_y``.

    Returns:
        Tuple ``(train_x, train_y)`` with the resampled data, or ``None``
        when any step fails (the failure is logged, not raised).
    """
    try:
        logger.info(
            f"counter before border line SMOTH is: {train_y[target].value_counts()}"
        )
        # transform the dataset
        #oversample = BorderlineSMOTE()
        oversample = SVMSMOTE()
        train_x, train_y = oversample.fit_resample(train_x, train_y)
        # summarize the new class distribution
        logger.info(
            f"counter after borderline SMOTH is: {train_y[target].value_counts()}"
        )
        return train_x, train_y
    except Exception as ex:
        # Best-effort by design: log at ERROR level, now with the traceback,
        # and signal the failure to the caller with an explicit None.
        logger.exception(f"failed to run borderline_smoth_func due to: {ex}")
        return None
def run_upsample(json_file_path, fmt_file_path):
    """Oversample the one-hot-encoded CSV with SVMSMOTE and write the result.

    Does nothing unless ``JsonManager(json_file_path).get_upsample_status()``
    compares equal to ``True``. Otherwise the hot-encoded CSV is loaded, its
    last column is treated as the label and all preceding columns as
    features, the data is resampled, and the result is written with
    ``np.savetxt`` into a freshly created upsampled folder.

    Args:
        json_file_path: Path handed to ``JsonManager``.
        fmt_file_path: Path of a text file whose first line is used as the
            ``fmt`` argument of ``np.savetxt``.
    """
    json_manager = JsonManager(json_file_path)

    # The equality test is kept as in the original because the return type
    # of get_upsample_status() is not visible from here.
    if not (json_manager.get_upsample_status() == True):
        return

    print(f"Upsampling started using {json_file_path} and {fmt_file_path}")
    upsampled_path = json_manager.get_upsampled_path()
    constants.remove_folder_if_exists(
        constants.UPSAMPLED_CSV_FOLDER_NAME, upsampled_path)

    hot_encoded_folder = os.fsdecode(os.path.join(
        json_manager.get_hot_encoded_path(),
        constants.HOT_ENCODED_CSV_FOLDER_NAME))

    hot_encoded_file = os.fsdecode(os.path.join(
        hot_encoded_folder,
        constants.HOT_ENCODED_CSV_FILENAME))

    # Parse the CSV once and slice it, instead of re-reading the file for
    # the features and again for the label.
    hotEncoded_data = pd.read_csv(hot_encoded_file)
    features_data = hotEncoded_data.iloc[:, :-1]  # everything except label
    labels_data = hotEncoded_data.iloc[:, [-1]]  # label, kept as a one-column frame

    sm = SVMSMOTE(random_state=json_manager.get_random_state())
    X_res, y_res = sm.fit_resample(features_data, labels_data)
    csv_ready = np.append(X_res, y_res, axis=constants.COLUMN_AXIS)

    upsampled_folder = constants.add_folder_to_directory(
        constants.UPSAMPLED_CSV_FOLDER_NAME, upsampled_path)

    upsampled_file_path = os.fsdecode(os.path.join(
        upsampled_folder, constants.UPSAMPLED_CSV_FILENAME))

    if os.path.exists(upsampled_file_path):
        os.remove(upsampled_file_path)

    # NOTE(review): readline() keeps a trailing newline if the file has one;
    # presumably the format file has none -- verify.
    with open(fmt_file_path, "r") as fmt_file:
        fmt = fmt_file.readline()

    header = ','.join(str(column) for column in hotEncoded_data.columns)
    np.savetxt(upsampled_file_path, csv_ready,
               fmt=fmt,
               delimiter=constants.CSV_DELIMITER,
               header=header,
               comments='')
    print(f"Upsampling finished, results in {upsampled_file_path}")
Ejemplo n.º 7
0
# Fit a one-hot encoder on the ('FF_Spaceman', 'Conf') column of wr_data and
# reuse it to build the feature matrices for the three feature sets.
encoder = OneHotEncoder()
encoder.fit(wr_data[('FF_Spaceman', 'Conf')].values.reshape(-1, 1))
X = process_df(vet_feats, encoder)
X_rk = process_df(rook_feats, encoder)
X_soph = process_df(soph_feats, encoder)

#%% Define XGB model and data split
# Six-class softmax classifier. The parameter grid is empty, so the grid
# search only cross-validates (10 folds) the estimator as configured here.
est_tiers = xgb.XGBClassifier(objective='multi:softmax', num_class=6)
param_grid = {}
model_tiers = GridSearchCV(est_tiers, param_grid, cv=10)

# Oversample every class except the majority one.
sm = SVMSMOTE(sampling_strategy='not majority')

# WARNING (acknowledged by the original author): resampling happens BEFORE
# the train/test split, so synthetic samples derived from test rows can end
# up in the training set and the scores below are optimistic. Resample only
# the training portion after splitting to get an honest estimate.
X_res, y_res_t = sm.fit_resample(
    X, tiers_vets
)  # resampled before the split -- see the warning above
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_res,
                                                            y_res_t,
                                                            test_size=0.3,
                                                            stratify=y_res_t,
                                                            random_state=123)

# Fit on the training split and plot a confusion matrix normalized over the
# predicted labels.
model_tiers.fit(X_train_t, y_train_t)
preds = model_tiers.predict(X_test_t)
cf = confusion_matrix(y_test_t, preds, normalize='pred')
plotConfMatrix(cf, labels=range(6))

# %%
# Binary classifier; its (empty) parameter grid is re-created here.
est_hit = xgb.XGBClassifier(objective='binary:logistic')
param_grid = {}
Ejemplo n.º 8
0
def _add_experiment_flags(frame):
    """Append the binned 0/1 features to *frame* in place and drop the raw
    columns they were derived from."""
    # (new column, source column, predicate), in the order the columns are
    # appended to the frame.
    rules = (
        ('1 Product', 'NumOfProducts', lambda x: x == 1),
        ('2 Product', 'NumOfProducts', lambda x: x == 2),
        ('1/2 Product', 'NumOfProducts', lambda x: x <= 2),
        ('3/4 Product', 'NumOfProducts', lambda x: x >= 3),
        ('Age40-70', 'Age', lambda x: x >= 40 and x <= 70),
        ('Balance-mid', 'Balance', lambda x: x >= 75000 and x <= 160000),
        ('Balance-low', 'Balance', lambda x: x <= 25000),
        ('LowCredit', 'CreditScore', lambda x: x <= 400),
        ('HighCredit', 'CreditScore', lambda x: x >= 800),
        ('Tenure6-8', 'Tenure', lambda x: x >= 6 and x <= 8),
    )
    for new_col, source_col, predicate in rules:
        # Bind the predicate as a default to avoid the late-binding closure
        # pitfall inside the loop.
        frame[new_col] = frame[source_col].apply(
            lambda x, p=predicate: 1 if p(x) else 0)

    frame.drop(
        ['NumOfProducts', 'Age', 'CreditScore', 'Balance', 'EstimatedSalary',
         'Tenure'],
        axis=1,
        inplace=True)


def getAllCleanedDataExperiment(binning=0):
    """Load, encode, oversample and bin the bank-churn data.

    Reads ``train.csv`` and ``testing.csv`` from the working directory,
    splits the training file 75/25 into train and validation parts
    (``random_state=1``), one-hot encodes ``Geography`` and ``Gender`` with
    an encoder fitted on the train part, oversamples the train part with
    ``SVMSMOTE(random_state=101)`` and replaces the numeric columns with
    binned 0/1 features.

    Feature-engineering inspiration:
        https://www.kaggle.com/nasirislamsujan/bank-customer-churn-prediction
        https://www.kaggle.com/agustinpugliese/eda-86-ann-explained
        https://www.kaggle.com/kmalit/bank-customer-churn-prediction

    Args:
        binning: Accepted for interface compatibility; not used by this
            function.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, test_train, test_val)``.
        Only the train part is oversampled.
    """
    categorical_cols = ['Geography', 'Gender']
    # NOTE(review): these names assume the encoder's categories come out as
    # France/Germany/Spain and Female/Male, in that order -- confirm against
    # the data.
    onehot_cols = ['France', 'Germany', 'Spain', 'Female', 'Male']

    df = pd.read_csv('train.csv', header=0)
    df.drop(['CustomerId', 'Surname', 'RowNumber'], axis=1, inplace=True)
    X = df.drop(['Exited'], axis=1)
    y = df['Exited']

    # Only 'Exited' is removed from the test file; any other columns it
    # contains are kept.
    df_test = pd.read_csv('testing.csv', header=0)
    test_train = df_test.drop(['Exited'], axis=1)
    test_val = df_test['Exited']

    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=0.25,
                                                      random_state=1)
    print(len(y_train), len(y_val))

    frames = [X_train, X_val, test_train]

    ##### ENCODING #####
    # Normalize the two binary indicators to floats (1.0 / 0.0).
    for frame in frames:
        for col in ('HasCrCard', 'IsActiveMember'):
            frame[col] = frame[col].apply(lambda x: 1. if x == 1 else 0.)

    # The encoder is fitted on the train part only and reused for the rest.
    categorical_arrays = [
        frame[categorical_cols].to_numpy() for frame in frames
    ]
    enc = OneHotEncoder().fit(categorical_arrays[0])

    encoded_frames = []
    for frame, categories in zip(frames, categorical_arrays):
        # Reset the index so the positional one-hot frame lines up on concat.
        numeric_part = frame.drop(categorical_cols,
                                  axis=1).reset_index(drop=True)
        onehot_part = pd.DataFrame(data=enc.transform(categories).toarray(),
                                   columns=onehot_cols)
        combined = pd.concat([numeric_part, onehot_part], axis=1)
        # Drop one level per categorical variable (the redundant columns).
        encoded_frames.append(combined.drop(['France', 'Female'], axis=1))
    X_train, X_val, test_train = encoded_frames

    ###### Oversample training data #####
    # NOTE(review): oversampling runs before the binning below, so synthetic
    # rows may presumably carry non-integer values (e.g. NumOfProducts) that
    # fail the equality-based flags -- verify this is acceptable.
    svmsmote = SVMSMOTE(random_state=101)
    X_train, y_train = svmsmote.fit_resample(X_train, y_train)

    # Binned features for products, age, balance, credit score and tenure.
    for frame in (X_train, X_val, test_train):
        _add_experiment_flags(frame)

    return X_train, y_train, X_val, y_val, test_train, test_val
Ejemplo n.º 9
0
def Retrain_Model_10_Iterates_SVMSMOTE(target, title, max_depth=3, n_esti=160, withexperience=False, color='YlGnBu'):
    """Repeatedly split by hospital, oversample with SVMSMOTE and fit XGBoost.

    Reads the module-level ``df_model_draft`` frame. Each iteration shuffles
    the rows, assigns the first 80% of the hospital ids (in shuffled order)
    to the training set and the rest to the test set, oversamples the
    training set, fits an ``XGBClassifier`` and collects the metrics
    returned by ``Make_Confusion_Matrix`` plus the AUROC.

    Note: despite the function name, the loop runs three iterations.

    Args:
        target: Name of the target column in ``df_model_draft``.
        title: Title forwarded to ``Make_Confusion_Matrix``.
        max_depth: ``max_depth`` of the XGBoost model.
        n_esti: ``n_estimators`` of the XGBoost model.
        withexperience: When ``False`` only the id/outcome columns are
            dropped from the features; otherwise the hospital/surgeon
            volume columns and ``surgyear`` are dropped as well.
        color: Colormap forwarded to ``Make_Confusion_Matrix`` as ``cmap``.

    Returns:
        List with one metrics mapping per iteration, each extended with an
        ``'AUROC'`` entry.
    """
    TRAIN_TEST_SPLIT_PERC = 0.8
    id_and_outcome_cols = [
        'HospID', 'SiteID', 'surgid', 'Complics', 'STSRCHOSPD', 'STSRCOM']
    experience_cols = [
        'HospID_Reop_CABG', 'HospID_total_CABG', 'surgyear',
        'HospID_total_cardiac_surgery', 'surgid_total_cardiac_surgery',
        'surgid_total_CABG', 'surgid_Reop_CABG']
    if withexperience is False:
        drop_cols = id_and_outcome_cols
    else:
        drop_cols = id_and_outcome_cols + experience_cols

    matrics = []
    seed(2145)
    for i in range(3):
        rnd = randint(1, 2021)
        print("Start Iterate Number {}:".format(i+1))

        # Shuffle first and take the hospital ids from the SHUFFLED frame.
        # Taking them from the unshuffled frame (as before) produced the
        # same train/test hospitals in every iteration. The per-iteration
        # seed makes the shuffle reproducible.
        df = df_model_draft.sample(frac=1, random_state=rnd).reset_index(drop=True)
        uniques = df["HospID"].unique()
        sep = int(len(uniques) * TRAIN_TEST_SPLIT_PERC)
        train_ids, test_ids = uniques[:sep], uniques[sep:]

        train_df, test_df = df[df.HospID.isin(train_ids)], df[df.HospID.isin(test_ids)]

        print("\nTRAIN DATAFRAME\n", train_df.shape)
        print("\nTEST DATAFRAME\n", test_df.shape)

        X_train = train_df.drop(drop_cols, axis=1)
        y_train = train_df[target]
        X_test = test_df.drop(drop_cols, axis=1)
        y_test = test_df[target]

        sm = SVMSMOTE()  # SVMSMOTE(random_state=21)
        # fit and apply the transform (training data only)
        X_over, y_over = sm.fit_resample(X_train, y_train)

        # summarize class distribution
        print("after over sampling")
        counter = Counter(y_over)
        print(counter)
        estimate = counter[0] / counter[1]
        print('Estimate: %.3f' % estimate)

        model = XGBClassifier(objective='binary:logistic', eval_metric='logloss', max_depth=max_depth, learning_rate=0.1, n_estimators=n_esti)
        model.fit(X_over, y_over)
        y_pred = model.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        mats = Make_Confusion_Matrix(cm, categories=categories, cmap=color, title=title, group_names=labels, y_pred=y_pred, y_test=y_test)
        auc = roc_auc_score(y_test, model.predict_proba(X_test.values)[:, 1])
        mats['AUROC'] = auc
        matrics.append(mats)
    return matrics

# matrics_xgb = Retrain_Model_10_Iterates_SVMSMOTE('STSRCOM', 'STSRCOM SVMSMOTE with experience',3,180)
# matrics_xgb_df = pd.DataFrame(matrics_xgb)
# matrics_xgb_df.loc['Mean'] = matrics_xgb_df.mean()
# matrics_xgb_df.loc['Std'] = matrics_xgb_df.std()
# print(matrics_xgb_df)
# matrics_xgb_df.to_csv("/model_outputs/STSRCOM SVMSMOTE with experience.csv")
#
# matrics_xgb = Retrain_Model_10_Iterates_SVMSMOTE('STSRCOM', 'STSRCOM SVMSMOTE without experience',3,180,True)
# matrics_xgb_df = pd.DataFrame(matrics_xgb)
# matrics_xgb_df.loc['Mean'] = matrics_xgb_df.mean()
# matrics_xgb_df.loc['Std'] = matrics_xgb_df.std()
# print(matrics_xgb_df)
# matrics_xgb_df.to_csv("/model_outputs/STSRCOM SVMSMOTE without experience.csv")
#
# matrics_xgb = Retrain_Model_10_Iterates_SVMSMOTE('STSRCHOSPD', 'STSRCHOSPD SVMSMOTE with experience',3,180,color='RdPu')
# matrics_xgb_df = pd.DataFrame(matrics_xgb)
# matrics_xgb_df.loc['Mean'] = matrics_xgb_df.mean()
# matrics_xgb_df.loc['Std'] = matrics_xgb_df.std()
# print(matrics_xgb_df)
# matrics_xgb_df.to_csv("/model_outputs/STSRCHOSPD SVMSMOTE with experience.csv")
#
# matrics_xgb = Retrain_Model_10_Iterates_SVMSMOTE('STSRCHOSPD', 'STSRCHOSPD SVMSMOTE without experience',3,180,True,color='RdPu')
# matrics_xgb_df = pd.DataFrame(matrics_xgb)
# matrics_xgb_df.loc['Mean'] = matrics_xgb_df.mean()
# matrics_xgb_df.loc['Std'] = matrics_xgb_df.std()
# print(matrics_xgb_df)
# matrics_xgb_df.to_csv("/model_outputs/STSRCHOSPD SVMSMOTE without experience.csv")
Ejemplo n.º 10
0
# Join the two feature arrays (bt_enc and bt_sc) column-wise.
bt_enc_sc = np.concatenate([bt_enc, bt_sc], axis=1)

# Separating the variables: features are the combined array, the target is
# the last column of `battle`.

X = bt_enc_sc
y = battle.iloc[:, -1].values

# Original heading: "handling the missing data". The code below label-encodes
# the target; no missing-value handling is visible in this section.

le = LabelEncoder()
y = le.fit_transform(y)

# Because there is little data, synthetic samples are generated to increase
# the data volume.
# NOTE(review): resampling happens before the train/test split, so synthetic
# points derived from test rows can leak into training -- confirm intent.

sm = SVMSMOTE(random_state=42, k_neighbors=3, out_step=0.4)
X, y = sm.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# Model training

classifier = SVC()
classifier.fit(X_train, y_train)

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'gamma': [10, 1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
Ejemplo n.º 11
0
def svm_smote(X, y):
    """Return ``(X, y)`` oversampled by ``SVMSMOTE(random_state=42)``."""
    sampler = SVMSMOTE(random_state=42)
    resampled_X, resampled_y = sampler.fit_resample(X, y)
    return resampled_X, resampled_y
Ejemplo n.º 12
0
def Retrain_Model_10_Iterates_SVMSMOTE(target,
                                       title,
                                       max_depth=3,
                                       n_esti=160,
                                       lr=0.1,
                                       withexperience=False,
                                       color='YlGnBu'):
    """Evaluate XGBoost with SVMSMOTE over 10 hospital-grouped splits.

    Reads the module-level ``df_model_draft`` frame. ``GroupShuffleSplit``
    (80% train, ``random_state=42``) keeps every ``HospID`` on one side of
    each split. For every split the training rows are oversampled, an
    ``XGBClassifier`` is fitted, and the metrics returned by
    ``Make_Confusion_Matrix`` plus the AUROC are collected.

    Args:
        target: Name of the target column in ``df_model_draft``.
        title: Title forwarded to ``Make_Confusion_Matrix``.
        max_depth: ``max_depth`` of the XGBoost model.
        n_esti: ``n_estimators`` of the XGBoost model.
        lr: ``learning_rate`` of the XGBoost model.
        withexperience: When ``False`` only the id/outcome columns are
            dropped from the features; otherwise the hospital/surgeon
            volume columns and ``surgyear`` are dropped as well.
        color: Colormap forwarded to ``Make_Confusion_Matrix`` as ``cmap``.

    Returns:
        List with one metrics mapping per split, each extended with an
        ``'AUROC'`` entry.
    """
    id_and_outcome_cols = [
        'SiteID', 'surgid', 'Complics', 'STSRCHOSPD', 'STSRCOM'
    ]
    experience_cols = [
        'HospID_Reop_CABG', 'HospID_total_CABG', 'surgyear',
        'HospID_total_cardiac_surgery', 'surgid_total_cardiac_surgery',
        'surgid_total_CABG', 'surgid_Reop_CABG'
    ]

    matrics = []
    seed(2145)
    groups = df_model_draft['HospID']
    if withexperience is False:
        X = df_model_draft.drop(id_and_outcome_cols, axis=1)
    else:
        X = df_model_draft.drop(id_and_outcome_cols + experience_cols, axis=1)
    y = df_model_draft[target]

    print(groups.shape)
    print(groups.unique())
    gss = GroupShuffleSplit(n_splits=10, train_size=.8, random_state=42)

    # HospID only defines the groups; it must not be a model feature.
    features = X.drop(['HospID'], axis=1)

    for train_idx, test_idx in gss.split(X, y, groups):
        print("TRAIN:", train_idx, "TEST:", test_idx)
        print(features.columns.tolist())

        # split() yields POSITIONAL indices, so select with .iloc; .loc is
        # only equivalent when the frame has a default RangeIndex.
        X_train = features.iloc[train_idx]
        y_train = y.iloc[train_idx]

        X_test = features.iloc[test_idx]
        y_test = y.iloc[test_idx]
        print("\nTRAIN DATAFRAME\n", X_train.shape)
        print("\nTEST DATAFRAME\n", X_test.shape)

        sm = SVMSMOTE()  # SVMSMOTE(random_state=21)
        # fit and apply the transform (training data only)
        X_over, y_over = sm.fit_resample(X_train, y_train)

        # summarize class distribution
        print("after over sampling")
        counter = Counter(y_over)
        print(counter)
        estimate = counter[0] / counter[1]
        print('Estimate: %.3f' % estimate)

        model = XGBClassifier(objective='binary:logistic',
                              eval_metric='logloss',
                              max_depth=max_depth,
                              learning_rate=lr,
                              n_estimators=n_esti)
        model.fit(X_over, y_over)
        y_pred = model.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        mats = Make_Confusion_Matrix(cm,
                                     categories=categories,
                                     cmap=color,
                                     title=title,
                                     group_names=labels,
                                     y_pred=y_pred,
                                     y_test=y_test)
        auc = roc_auc_score(y_test, model.predict_proba(X_test.values)[:, 1])
        mats['AUROC'] = auc
        matrics.append(mats)
    return matrics
    def process2(self, Xtr, ytr, Xts):
        """Optionally reduce dimensionality with PCA and oversample the
        training data.

        The PCA is fitted on ``Xtr`` only and then applied to both ``Xtr``
        and ``Xts``. Oversampling (SVMSMOTE and/or ADASYN) touches only the
        training data.

        Reads from ``self``: ``pca``, ``dim_red``, ``retain_info``,
        ``smote_fct``, ``adasyn`` and ``k_neighbors``.

        Args:
            Xtr: Training features.
            ytr: Training labels.
            Xts: Test features.

        Returns:
            Tuple ``(Xtr, ytr, Xts)`` after the enabled steps.
        """
        if self.pca == True:
            if self.dim_red:
                # Fixed: the bare name `dim_red` used here before is not
                # defined in this method; the attribute is what is meant.
                pca = PCA(self.dim_red).fit(Xtr)
            else:
                # Fit a full PCA first to inspect the cumulative explained
                # variance, then refit with the chosen component count.
                cumulative = PCA().fit(Xtr).explained_variance_ratio_.cumsum()
                if self.retain_info == 1:
                    n_components = np.argmax(cumulative)
                else:
                    first_hit = np.where(cumulative >= self.retain_info)[0][0]
                    # Boundary condition: never reduce to 0 dimensions.
                    n_components = max(first_hit, 1)
                # NOTE(review): both branches use a 0-based index as the
                # component count, which looks like one component fewer
                # than needed to reach the threshold -- confirm intent.
                pca = PCA(n_components).fit(Xtr)

            Xtr = pca.transform(Xtr)
            Xts = pca.transform(Xts)

        ### Optional oversampling: SVMSMOTE and/or ADASYN
        if self.smote_fct == True:
            sm = SVMSMOTE(sampling_strategy='not majority',
                          random_state=41,
                          k_neighbors=self.k_neighbors)
            Xtr, ytr = sm.fit_resample(Xtr, ytr)
            print('Resampled dataset shape %s' % Counter(ytr))

        if self.adasyn == True:
            print('Original dataset shape %s' % Counter(ytr))
            ada = ADASYN(random_state=42, n_neighbors=self.k_neighbors)
            Xtr, ytr = ada.fit_resample(Xtr, ytr)
            print('Resampled dataset shape %s' % Counter(ytr))
        return Xtr, ytr, Xts
Ejemplo n.º 14
0
    def predict(self, X):
        """Predict labels for ``X`` with dynamic ensemble selection.

        The stored chunk (``self.previous_X`` / ``self.previous_y``) is
        optionally oversampled according to ``self.oversampler`` and used
        to fit the DES method named by ``self.desMethod`` (``KNORAE`` when
        the name is not recognized). With fewer than two ensemble members
        the single member predicts directly.

        Args:
            X: Samples to classify; must have as many features as
                ``self.X_``.

        Returns:
            The predictions for ``X``.

        Raises:
            ValueError: if the number of features does not match.
        """
        # Check is fit had been called
        check_is_fitted(self, "classes_")

        # Input validation
        X = check_array(X)
        if X.shape[1] != self.X_.shape[1]:
            raise ValueError("number of features does not match")

        X_dsel = self.previous_X
        y_dsel = self.previous_y

        _, counts = np.unique(y_dsel, return_counts=True)

        # Cap the neighbor count by the SMALLEST class. counts[0] (used
        # before) is the count of the lowest label, which is not
        # necessarily the minority class.
        k_neighbors = min(5, counts.min() - 1)

        # Oversampling is skipped entirely when the minority class is too
        # small to provide at least one neighbor.
        if k_neighbors > 0:
            if self.oversampler == "SMOTE":
                smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
                X_dsel, y_dsel = smote.fit_resample(X_dsel, y_dsel)
            elif self.oversampler == "svmSMOTE":
                try:
                    svmSmote = SVMSMOTE(random_state=42,
                                        k_neighbors=k_neighbors)
                    X_dsel, y_dsel = svmSmote.fit_resample(X_dsel, y_dsel)
                except ValueError:
                    # Best effort: fall back to the un-resampled data.
                    pass
            elif self.oversampler == "borderline1":
                borderlineSmote1 = BorderlineSMOTE(random_state=42,
                                                   k_neighbors=k_neighbors,
                                                   kind='borderline-1')
                X_dsel, y_dsel = borderlineSmote1.fit_resample(X_dsel, y_dsel)
            elif self.oversampler == "borderline2":
                borderlineSmote2 = BorderlineSMOTE(random_state=42,
                                                   k_neighbors=k_neighbors,
                                                   kind='borderline-2')
                X_dsel, y_dsel = borderlineSmote2.fit_resample(X_dsel, y_dsel)
            elif self.oversampler == "ADASYN":
                try:
                    adasyn = ADASYN(random_state=42, n_neighbors=k_neighbors)
                    X_dsel, y_dsel = adasyn.fit_resample(X_dsel, y_dsel)
                except (RuntimeError, ValueError):
                    # Best effort: fall back to the un-resampled data.
                    pass
            elif self.oversampler == "SLS":
                sls = Safe_Level_SMOTE(n_neighbors=k_neighbors)
                X_dsel, y_dsel = sls.sample(X_dsel, y_dsel)

        if self.desMethod == "KNORAU":
            des = KNORAU(self.ensemble_, random_state=42)
        elif self.desMethod == "KNN":
            des = DESKNN(self.ensemble_, random_state=42)
        elif self.desMethod == "Clustering":
            des = DESClustering(self.ensemble_, random_state=42)
        else:
            # "KNORAE" and any unrecognized name
            des = KNORAE(self.ensemble_, random_state=42)

        if len(self.ensemble_) < 2:
            prediction = self.ensemble_[0].predict(X)
        else:
            des.fit(X_dsel, y_dsel)
            prediction = des.predict(X)

        return prediction
# borderline-SMOTE with SVM for imbalanced dataset
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SVMSMOTE
from plotDataset import plot_dataset

# Build a synthetic two-feature, two-class dataset of 10,000 samples in
# which the first class is given a weight of 0.99 (heavily imbalanced).
X, y = make_classification(n_samples=10000,
                           n_features=2,
                           n_redundant=0,
                           n_clusters_per_class=1,
                           weights=[0.99],
                           flip_y=0,
                           random_state=1)
# Show the class distribution before oversampling.
counter = Counter(y)
print(counter)
plot_dataset(X, y, counter)

# Oversample with SVMSMOTE. No random_state is passed, so the synthetic
# points are not pinned to a seed here.
oversample = SVMSMOTE()
X, y = oversample.fit_resample(X, y)

# Show the class distribution after oversampling.
counter = Counter(y)
print(counter)
plot_dataset(X, y, counter)
Ejemplo n.º 16
0
                           columns=X_train.columns[rfe.support_])
# Test features reduced to the columns selected by the fitted RFE.
X_test_RFE = pd.DataFrame(rfe.transform(X_test),
                          columns=X_test.columns[rfe.support_])

# Compare the feature count before and after RFE.
print(X_train.shape, X_train_RFE.shape)

#%%
from imblearn.over_sampling import ADASYN, SMOTE, SVMSMOTE

#OVERSAMPLING
# Three alternative oversamplers, each asked to use all cores.
# NOTE(review): `n_jobs` on these samplers may be deprecated in newer
# imbalanced-learn releases -- check the installed version.
sampler_smote = SMOTE(n_jobs=-1)
sampler_svm = SVMSMOTE(n_jobs=-1)
sampler_adasyn = ADASYN(n_jobs=-1)

# Each sampler is fitted on the full (not RFE-reduced) training features.
X_smote, y_smote = sampler_smote.fit_resample(X=X_train, y=y_train.ravel())
X_svm, y_svm = sampler_svm.fit_resample(X=X_train, y=y_train.ravel())
X_adasyn, y_adasyn = sampler_adasyn.fit_resample(X=X_train, y=y_train.ravel())

#%%
# Notebook-style cell: displays the shape when run interactively.
X_smote.shape

#%%
#baseline
# Baseline random forest trained on the original, non-oversampled data.
rf = ensemble.RandomForestClassifier(n_estimators=100,
                                     max_depth=8,
                                     criterion="entropy",
                                     n_jobs=-1)
rf.fit(X=X_train, y=y_train.ravel())

#FROM NOW ON, USE THE TUNED VERSION ALTHOUGH WE SHOULD RE-TUNE
Ejemplo n.º 17
0
# Result holders for the fifth fold; train_1..train_4 / test_1..test_4 are
# defined outside this excerpt.
train_5 = []
test_5 = []

train_arr = [train_1, train_2, train_3, train_4, train_5]
test_arr = [test_1, test_2, test_3, test_4, test_5]

# Split every row of `data` into a label (element 0) and a feature vector
# (remaining elements).
x_data = []
y_data = []
for i in data:
    # Convert each field to float in place; this mutates the rows of `data`.
    # (presumably the rows are mutable lists of numeric strings -- confirm)
    for l in range(len(i)):
        i[l] = float(i[l])
    x_data.append(np.array(i[1:]))
    y_data.append(np.array(np.array(i[0])))

# Oversample the whole data set with SVM-SMOTE (seeded for repeatability).
sm = SVMSMOTE(random_state=1)
x_data, y_data = sm.fit_resample(x_data, y_data)

# Turn the label vector into a column of shape (n_samples, 1).
y_data = y_data.reshape([len(y_data), 1])

# Size of one hold-out slice: 20% of the resampled data.
test_num = int(len(x_data) * 0.2)
# range(1) means only the first fold (i == 0) is processed.
for i in range(1):
    result = {}
    # NOTE(review): `id` shadows the Python builtin of the same name.
    id = 1
    # These are aliases of the resampled arrays, not copies.
    x_data_train = x_data
    y_data_train = y_data
    # One-row zero placeholders for the test split.
    # NOTE(review): the label placeholder is given the feature width rather
    # than a single column -- confirm this matches how it is filled later.
    x_data_test = np.zeros([1, len(x_data_train[0])])
    y_data_test = np.zeros([1, len(x_data_train[0])])
    # Offset of this fold's hold-out slice.
    delPoint = test_num * i
    print(x_data_train[0])
    print(x_data_test.shape)
    print(x_data_train.shape)
Ejemplo n.º 18
0
def _encode_split(frame, encoder):
    """Binarise the flag columns and one-hot encode Geography/Gender.

    ``frame`` has its two flag columns rewritten in place; the encoded frame
    is returned as a new object with a fresh positional index.
    ``encoder`` must already be fitted on the training categories.
    """
    # Flags become float 1.0 / 0.0; anything that is not exactly 1 maps to 0.
    for flag in ('HasCrCard', 'IsActiveMember'):
        frame[flag] = frame[flag].apply(lambda x: 1. if x == 1 else 0.)

    categorical = frame[['Geography', 'Gender']].to_numpy()
    frame = frame.drop(['Geography', 'Gender'], axis=1)
    # Positional index so the column-wise concat below lines up row by row.
    frame.reset_index(drop=True, inplace=True)

    one_hot = pd.DataFrame(
        data=encoder.transform(categorical).toarray(),
        columns=['France', 'Germany', 'Spain', 'Female', 'Male'])
    frame = pd.concat([frame, one_hot], axis=1)
    # Drop one level per categorical variable (the redundant indicator).
    frame.drop(['France', 'Female'], axis=1, inplace=True)
    return frame


def _bin_num_of_products(frame):
    """Replace NumOfProducts in place with four 0/1 indicator columns."""
    for n in (1, 2, 3, 4):
        frame['%d Product' % n] = frame['NumOfProducts'].apply(
            lambda x, n=n: 1 if x == n else 0)
    frame.drop(['NumOfProducts'], axis=1, inplace=True)


def getAllCleanedDataStable(standardize=0, binning=0):
    """Load, encode, oversample and optionally bin/scale the churn data.

    Reads ``train.csv`` and ``testing.csv`` from the working directory,
    splits the training file 75/25 into train/validation, encodes the
    categorical columns, and oversamples the training split with SVMSMOTE.

    Args:
        standardize: when 1, scale the numeric columns; any other value
            returns the frames unscaled.
        binning: when 1, replace ``NumOfProducts`` with the indicator
            columns ``'1 Product'`` .. ``'4 Product'``.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, test_train, test_val)``.
    """
    df = pd.read_csv('train.csv', header=0)
    df.drop(['CustomerId', 'Surname', 'RowNumber'], axis=1, inplace=True)
    X = df.drop(['Exited'], axis=1)
    y = df['Exited']

    # NOTE(review): unlike train.csv, the identifier columns are not dropped
    # from testing.csv here -- presumably the file lacks them; confirm.
    df_test = pd.read_csv('testing.csv', header=0)
    test_train = df_test.drop(['Exited'], axis=1)
    test_val = df_test['Exited']

    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=0.25,
                                                      random_state=42)
    print(len(y_train), len(y_val))

    ##### ENCODING #####
    # The encoder is fitted on the training split only and reused for the
    # validation and test splits.
    enc = OneHotEncoder().fit(X_train[['Geography', 'Gender']].to_numpy())
    X_train = _encode_split(X_train, enc)
    X_val = _encode_split(X_val, enc)
    test_train = _encode_split(test_train, enc)

    ###### Oversample training data #####
    # random_state=101 gives 0.618
    # random_state=5 gives 0.61846
    svmsmote = SVMSMOTE(random_state=5)
    X_train, y_train = svmsmote.fit_resample(X_train, y_train)

    if binning == 1:
        # Only the number of products is binned; Balance and Age binning
        # were tried previously and are currently disabled.
        for frame in (X_train, X_val, test_train):
            _bin_num_of_products(frame)

    ##### Control
    if standardize != 1:
        return X_train, y_train, X_val, y_val, test_train, test_val

    ###### Standardize and Normalize #####
    frames = (X_train, X_val, test_train)

    # NumOfProducts no longer exists when binning == 1, so scale only the
    # columns that are still present (this used to raise KeyError).
    scale_cols = [
        col for col in ('CreditScore', 'NumOfProducts')
        if col in X_train.columns
    ]
    scale = StandardScaler().fit(X_train[scale_cols])
    for frame in frames:
        frame[scale_cols] = scale.transform(frame[scale_cols])

    robust_scale = RobustScaler().fit(X_train[['Balance']])
    for frame in frames:
        frame[['Balance']] = robust_scale.transform(frame[['Balance']])

    # NOTE(review): each split is z-scored with its OWN mean/std, unlike the
    # scalers above which are fitted on the training split only. Kept as-is
    # to preserve existing results; confirm whether training statistics
    # were intended.
    for column in ('EstimatedSalary', 'Tenure'):
        for frame in frames:
            mean = frame[column].mean()
            std = frame[column].std()
            frame[column] = (frame[column] - mean) / std

    return X_train, y_train, X_val, y_val, test_train, test_val
Ejemplo n.º 19
0
    def partial_fit(self, X, y, classes=None):
        """Train one new base classifier on the chunk and update the ensemble.

        The chunk is optionally oversampled with the strategy named by
        ``self.oversampler``. Existing members are scored on the raw chunk,
        weak members are pruned, the worst member is evicted when the
        ensemble is full, and a fresh clone of the base classifier fitted on
        the (possibly oversampled) chunk is appended.

        Args:
            X: feature matrix of the incoming chunk.
            y: labels of the incoming chunk.
            classes: class labels, stored on the first call.
        """
        if not hasattr(self, "_base_clf"):
            self.set_base_clf()
        X, y = check_X_y(X, y)

        if _check_partial_fit_first_call(self, classes):
            self.classes_ = classes
            self.ensemble_ = []

        self.X_, self.y_ = X, y

        train_X, train_y = X, y

        # The neighbourhood size must fit the rarest class, not whichever
        # label happens to sort first, otherwise the samplers can be asked
        # for more neighbours than the minority class has samples.
        _, counts = np.unique(train_y, return_counts=True)
        k_neighbors = min(5, int(counts.min()) - 1)

        # With no usable neighbourhood the chunk is used as-is.
        if k_neighbors > 0:
            if self.oversampler == "SMOTE":
                smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
                train_X, train_y = smote.fit_resample(train_X, train_y)
            elif self.oversampler == "svmSMOTE":
                try:
                    svmSmote = SVMSMOTE(random_state=42,
                                        k_neighbors=k_neighbors)
                    train_X, train_y = svmSmote.fit_resample(train_X, train_y)
                except ValueError:
                    # Best effort: if SVMSMOTE cannot resample this chunk,
                    # train on the raw chunk instead.
                    pass
            elif self.oversampler == "borderline1":
                borderlineSmote1 = BorderlineSMOTE(random_state=42,
                                                   k_neighbors=k_neighbors,
                                                   kind='borderline-1')
                train_X, train_y = borderlineSmote1.fit_resample(
                    train_X, train_y)
            elif self.oversampler == "borderline2":
                borderlineSmote2 = BorderlineSMOTE(random_state=42,
                                                   k_neighbors=k_neighbors,
                                                   kind='borderline-2')
                train_X, train_y = borderlineSmote2.fit_resample(
                    train_X, train_y)
            elif self.oversampler == "ADASYN":
                try:
                    adasyn = ADASYN(random_state=42, n_neighbors=k_neighbors)
                    train_X, train_y = adasyn.fit_resample(train_X, train_y)
                except RuntimeError:
                    # Best effort: fall back to the raw chunk when ADASYN
                    # cannot generate samples.
                    pass
            elif self.oversampler == "SLS":
                sls = Safe_Level_SMOTE(n_neighbors=k_neighbors)
                train_X, train_y = sls.sample(train_X, train_y)

        # Testing all models (on the raw, non-oversampled chunk)
        scores = np.array([ba(y, clf.predict(X)) for clf in self.ensemble_])

        # Pruning: keep only members scoring above 0.5 + alpha.
        if len(self.ensemble_) > 1:
            alpha_good = scores > (0.5 + self.alpha)
            self.ensemble_ = [
                self.ensemble_[i] for i in np.where(alpha_good)[0]
            ]
            # Keep the scores aligned with the surviving members so the
            # eviction below indexes the right classifier.
            scores = scores[alpha_good]

        # Make room for the new candidate by evicting the worst survivor.
        if len(self.ensemble_) > self.ensemble_size - 1:
            worst = np.argmin(scores)
            del self.ensemble_[worst]

        # Preparing and training new candidate
        self.ensemble_.append(base.clone(self._base_clf).fit(train_X, train_y))
Ejemplo n.º 20
0
                                                     predict_adasyn,
                                                     average='binary')

    # Store the ROC curve of the ADASYN model for this iteration.
    # NOTE(review): roc_curve is documented as roc_curve(y_true, y_score);
    # here the predictions are passed first and the labels second -- confirm
    # the order is intended.
    fpr_adasyn[idx, :], tpr_adasyn[idx, :], thresholds_adasyn[
        idx, :] = roc_curve(predict_adasyn, adasyn_label_test, pos_label=1)
    auc_adasyn[idx, :] = auc(fpr_adasyn[idx, :], tpr_adasyn[idx, :])

    ### benchmark set ###
    predict_adasyn_benchmark[idx, :] = model_adasyn.predict(benchmark_data)

    #############################################################################################################################################################

    # Same pipeline again, this time oversampling with SVM-SMOTE.
    svmsmote = SVMSMOTE(sampling_strategy="minority")

    # Flatten `data` to (channels * parameters, -1) and transpose so that the
    # sampler receives one row per sample.
    X_svmsmote, y_svmsmote = svmsmote.fit_resample(
        data.reshape(
            (num_channels * (size_freq_parameters + size_time_parameters),
             -1)).T, labels)

    # Non-zero labels are counted as preictal, the remainder as interictal.
    num_svmsmote_samples_preictal[idx, :] = np.count_nonzero(
        y_svmsmote)  # need this only to report it in a paper
    num_svmsmote_samples_interictal[
        idx, :] = np.shape(y_svmsmote)[0] - np.count_nonzero(y_svmsmote)

    # 70/30 shuffled split of the already-resampled data; no random_state is
    # given here.
    # NOTE(review): resampling happens before the split, so synthetic samples
    # can land in the test portion -- confirm this is intended.
    svmsmote_data_train, svmsmote_data_test, svmsmote_label_train, svmsmote_label_test = train_test_split(
        X_svmsmote, y_svmsmote, test_size=0.3, shuffle=True)

    # L1-penalised linear SVM; its parameters and coefficients are recorded
    # for this iteration.
    model_svmsmote = LinearSVC(penalty="l1", dual=False, max_iter=5000)
    model_svmsmote.fit(svmsmote_data_train, svmsmote_label_train)
    parameters_svmsmote = model_svmsmote.get_params()
    coefficients_svmsmote.append(model_svmsmote.coef_.tolist())
Ejemplo n.º 21
0
print(cleaned_train_data)

# Separate the features from the 'target' column. The axis is passed by
# keyword: pandas 2.x no longer accepts it positionally in DataFrame.drop.
X = cleaned_train_data.drop('target', axis=1)
y = cleaned_train_data.target

# Plain 75/25 split of the original data, scaled with statistics fitted on
# the training part only.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.25, random_state=42)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Second variant: oversample the minority class with SVM-SMOTE, then split
# and scale the resampled data.
# NOTE(review): the sampler is fitted on the full, unscaled X/y before the
# split, so synthetic samples can end up in the test portion -- confirm this
# is intended.
svm_smote = SVMSMOTE(sampling_strategy='minority',
                     random_state=42,
                     k_neighbors=5)
X_svm_smote, y_svm_smote = svm_smote.fit_resample(X, y)

X_train_svm, X_test_svm, y_train_svm, y_test_svm = tts(X_svm_smote,
                                                       y_svm_smote,
                                                       test_size=0.25,
                                                       random_state=42)

# A fresh scaler for the resampled split; `sc` is rebound here.
sc = StandardScaler()
X_train_svm = sc.fit_transform(X_train_svm)
X_test_svm = sc.transform(X_test_svm)


def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    errors = abs(y_pred - y_test)
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))