    def __build_model(self):
        std_scaler = StandardScaler()
        smt = SMOTE(k_neighbors=3,
                    random_state=42,
                    sampling_strategy='minority')
        if self.model_hyperparameters:
            log_reg_sm = LogisticRegression(**self.model_hyperparameters)
        else:
            log_reg_sm = LogisticRegression()

        pipeline = imbalanced_make_pipeline(
            std_scaler,
            smt,
            log_reg_sm,
        )
        self.model = pipeline
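The method above assumes `imbalanced_make_pipeline` is imblearn's `make_pipeline` imported under an alias. Below is a minimal, self-contained sketch of the same scaler -> SMOTE -> logistic-regression pipeline; the wrapper class name `FraudModel` and the synthetic data are illustrative assumptions, not part of the original project.

# Minimal sketch: scaler -> SMOTE -> logistic regression inside one imblearn pipeline.
# The wrapper class and synthetic data below are illustrative, not the original project's.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


class FraudModel:
    def __init__(self, model_hyperparameters=None):
        self.model_hyperparameters = model_hyperparameters
        self.model = None
        self.__build_model()

    def __build_model(self):
        std_scaler = StandardScaler()
        smt = SMOTE(k_neighbors=3, random_state=42, sampling_strategy='minority')
        log_reg = LogisticRegression(**(self.model_hyperparameters or {}))
        # The sampler lives inside the pipeline, so it only resamples data passed to fit()
        self.model = imbalanced_make_pipeline(std_scaler, smt, log_reg)


X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
clf = FraudModel({'max_iter': 1000})
clf.model.fit(X, y)
print(clf.model.score(X, y))

Because SMOTE sits inside the pipeline, it is applied only during fitting and is skipped at prediction time.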
Example #2
def get_models(clf_list, X, y,
               sampling_method=RandomUnderSampler(sampling_strategy='majority')):
    classifiers = {
                    "LogisticRegression": LogisticRegression(max_iter=500, n_jobs=-1),
                    "XGBClassifier": XGBClassifier(n_jobs=-1, n_estimators=1000, max_depth=10),#, **{'gpu_id': 0, 'tree_method': 'gpu_hist'}),
                    "KNeighborsClassifier" : KNeighborsClassifier(3, n_jobs=-1),
                    "SVC" : SVC(gamma=2, C=1),
                    "GaussianProcessClassifier" : GaussianProcessClassifier(1.0 * RBF(1.0)),
                    "DecisionTreeClassifier" : DecisionTreeClassifier(max_depth=5),
                    "RandomForestClassifier" : RandomForestClassifier(max_depth=5, n_estimators=100, n_jobs=-1),
                    "MLPClassifier" : MLPClassifier(max_iter=1000),
                    "AdaBoostClassifier" : AdaBoostClassifier(),
                    "LGBMClassifier": LGBMClassifier(
                                                    boosting_type= 'gbdt',
                                                    max_depth = 10,
                                                    objective= 'binary',
                                                    nthread= 5,
                                                    num_leaves= 32,
                                                    learning_rate= 0.05,
                                                    max_bin= 512,
                                                    subsample_for_bin= 200,
                                                    subsample= 0.7,
                                                    subsample_freq= 1,
                                                    colsample_bytree= 0.8,
                                                    reg_alpha= 20,
                                                    reg_lambda= 20,
                                                    min_split_gain= 0.5,
                                                    min_child_weight= 1,
                                                    min_child_samples= 10,
                                                    scale_pos_weight= 1,
                                                    num_class = 1,
                                                    metric = 'auc')
                 }
    clf = {}
    for clf_name in clf_list:
        # Wrap each classifier with the resampling step so resampling happens inside fit()
        clf[clf_name] = imbalanced_make_pipeline(sampling_method, classifiers[clf_name])

    return clf
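A hedged usage sketch for `get_models`: it assumes the snippet's own imports (scikit-learn, xgboost, lightgbm, imbalanced-learn) are available, and the synthetic data and the chosen classifier names are illustrative only.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Illustrative data: 5% positives so the RandomUnderSampler step is meaningful
X, y = make_classification(n_samples=2000, weights=[0.95, 0.05], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

models = get_models(["LogisticRegression", "DecisionTreeClassifier"], X_train, y_train)
for name, pipe in models.items():
    pipe.fit(X_train, y_train)  # the undersampler only resamples the training data
    print(name, pipe.score(X_test, y_test))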
Example #3
def training():
    test = pd.read_csv('./csv/preprocess/missing_test.csv', index_col=0)
    train = pd.read_csv('./csv/preprocess/missing_train.csv', index_col=0)

    train_label = pd.read_csv('./csv/base/train_label.csv', index_col=0)
    if var_env.knn_pickle is None:
        clf = KNeighborsClassifier(algorithm='auto',
                                   leaf_size=30,
                                   metric='minkowski',
                                   metric_params=None,
                                   n_jobs=None,
                                   n_neighbors=5,
                                   p=2,
                                   weights='uniform')
        pipeline = imbalanced_make_pipeline(
            SMOTE(sampling_strategy='minority'), clf)
        var_env.knn_pickle = pipeline.fit(train.values, train_label.values.ravel())

    result = pd.DataFrame({'predict_label': var_env.knn_pickle.predict(test)})
    result.to_csv('./csv/result.csv', index=False)
    return True
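The function above depends on project-specific CSV files and a `var_env` cache. A minimal sketch of just the resampling-plus-KNN pipeline it builds, on synthetic data; the data and variable names here are illustrative assumptions.

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Illustrative imbalanced data in place of the project's CSV files
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority'),
                                    KNeighborsClassifier(n_neighbors=5))
pipeline.fit(X_train, y_train)
print(pipeline.predict(X_test)[:10])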
def training():
    data = pd.read_csv('./csv/dataset.csv', index_col='id', low_memory=False)
    original_Xtrain, original_Xtest, original_ytrain, original_ytest = train_test_split(
        data, data['label'], test_size=0.2, random_state=42)
    num_round = 10
    kfold = StratifiedKFold(n_splits=num_round,
                            random_state=None,
                            shuffle=False)
    id = 0
    split_data = {}
    for train_id, test_id in kfold.split(original_Xtrain, original_ytrain):
        split_data['train' + str(id)] = train_id
        split_data['valid' + str(id)] = test_id
        id += 1

    # FIELD_7 holds stringified lists; treat NaN (x != x) as an empty list, parse it,
    # then keep only the list length as the feature value
    f7_array = original_Xtrain['FIELD_7'].apply(
        lambda x: '[]' if x != x else x).apply(literal_eval)
    original_Xtrain['FIELD_7'] = f7_array.apply(len)

    f7_array = original_Xtest['FIELD_7'].apply(
        lambda x: '[]' if x != x else x).apply(literal_eval)
    original_Xtest['FIELD_7'] = f7_array.apply(len)

    original_Xtrain_, original_Xtrain_label_, original_Xtest = preprocessing(
        original_Xtrain, original_Xtest)

    num_round = 10
    process_data = {}

    for id in range(num_round):
        process_data['train' + str(id)], process_data[
            'train_label' +
            str(id)], process_data['valid' + str(id)] = preprocessing(
                original_Xtrain.iloc[split_data['train' + str(id)]],
                original_Xtrain.iloc[split_data['valid' + str(id)]])

    result_valid = {}  # save result
    result_original = {}
    classifiers = {
        "KNearest":
        KNeighborsClassifier(algorithm='ball_tree',
                             leaf_size=30,
                             metric='minkowski',
                             metric_params=None,
                             n_jobs=None,
                             n_neighbors=5,
                             p=2,
                             weights='uniform'),
        "LogisticRegression":
        LogisticRegression(max_iter=1000000, penalty='l2'),
        "DecisionTreeClassifier":
        DecisionTreeClassifier()
    }

    for key, clf in classifiers.items():
        average_original_test = 0.0
        average_valid_test = 0.0
        average_private_test = 0.0
        print(key)
        for id in range(0, num_round):

            idx = split_data['valid' + str(id)]
            pipeline = imbalanced_make_pipeline(
                SMOTE(sampling_strategy='minority'), clf)
            model = pipeline.fit(process_data['train' + str(id)],
                                 process_data['train_label' + str(id)])

            score_valid_test = model.predict(process_data['valid' + str(id)])
            score_original_test = model.predict(original_Xtest)

            average_valid_test += score_valid_test / num_round
            average_original_test += score_original_test / num_round

        valid = transform_average_result(average_valid_test)
        original = transform_average_result(average_original_test)

        result_valid[key] = metrics.confusion_matrix(valid,
                                                     original_ytrain.iloc[idx])
        result_original[key] = metrics.confusion_matrix(original, original_ytest)
    pickle.dump(classifiers['KNearest'], open('./csv/model/KNearest.pkl', 'wb'))
    pickle.dump(classifiers['LogisticRegression'],
                open('./csv/model/LogisticRegression.pkl', 'wb'))
    pickle.dump(classifiers['DecisionTreeClassifier'],
                open('./csv/model/DecisionTreeClassifier.pkl', 'wb'))
    return True
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []

# Classifier with optimal parameters
# log_reg_sm = grid_log_reg.best_estimator_
log_reg_sm = LogisticRegression()

# Parameters for the randomized search (defined before RandomizedSearchCV uses them)
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
rand_log_reg = RandomizedSearchCV(LogisticRegression(), log_reg_params, n_iter=4)

# Implementing SMOTE Technique
# Cross Validating the right way
for train, test in stratified_fold.split(original_Xtrain, original_ytrain):
    pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority'), rand_log_reg) # SMOTE happens during Cross Validation not before..
    model = pipeline.fit(original_Xtrain[train], original_ytrain[train])
    best_est = rand_log_reg.best_estimator_
    prediction = best_est.predict(original_Xtrain[test])
    
    accuracy_lst.append(pipeline.score(original_Xtrain[test], original_ytrain[test]))
    precision_lst.append(precision_score(original_ytrain[test], prediction))
    recall_lst.append(recall_score(original_ytrain[test], prediction))
    f1_lst.append(f1_score(original_ytrain[test], prediction))
    auc_lst.append(roc_auc_score(original_ytrain[test], prediction))
    
print('---' * 45)
print('')
print("accuracy: {}".format(np.mean(accuracy_lst)))
print("precision: {}".format(np.mean(precision_lst)))
print("recall: {}".format(np.mean(recall_lst)))
Example #6
    Y_undersample_train, Y_undersample_test = Y_undersample.iloc[
        train_index], Y_undersample.iloc[test_index]
X_undersample_train = X_undersample_train.values
X_undersample_test = X_undersample_test.values
Y_undersample_train = Y_undersample_train.values
Y_undersample_test = Y_undersample_test.values
# %%
undersample_accuracy = []
undersample_precision = []
undersample_recall = []
undersample_f1 = []
undersample_auc = []
# %%
for train_index, test_index in SKfold.split(X_undersample_train,
                                            Y_undersample_train):
    undersample_pipeline = imbalanced_make_pipeline(
        NearMiss(sampling_strategy="majority"), log_reg)
    undersample_model = undersample_pipeline.fit(
        X_undersample_train[train_index], Y_undersample_train[train_index])
    undersample_prediction = undersample_model.predict(
        X_undersample_train[test_index])
    undersample_accuracy.append(
        undersample_pipeline.score(X_undersample_train[test_index],
                                   Y_undersample_train[test_index]))
    undersample_precision.append(
        precision_score(Y_undersample_train[test_index],
                        undersample_prediction))
    undersample_recall.append(
        recall_score(Y_undersample_train[test_index], undersample_prediction))
    undersample_f1.append(
        f1_score(Y_undersample_train[test_index], undersample_prediction))
    undersample_auc.append(
        roc_auc_score(Y_undersample_train[test_index], undersample_prediction))
Example #7
undersample_accuracy = []
undersample_precision = []
undersample_recall = []
undersample_f1 = []
undersample_auc = []

# Implementing NearMiss Technique
# Distribution of NearMiss (just to see how it distributes the labels; we won't use these variables)
X_nearmiss, y_nearmiss = NearMiss().fit_resample(undersample_X.values, undersample_y.values)
print('NearMiss label distribution:', Counter(y_nearmiss))

for train, test in sss.split(undersample_Xtrain, undersample_ytrain):
    # NearMiss undersampling happens inside the pipeline, during cross-validation, not before
    undersample_pipeline = imbalanced_make_pipeline(NearMiss(sampling_strategy='majority'), log_reg)
    undersample_model = undersample_pipeline.fit(undersample_Xtrain[train], undersample_ytrain[train])
    undersample_prediction = undersample_model.predict(undersample_Xtrain[test])

    undersample_accuracy.append(undersample_pipeline.score(undersample_Xtrain[test], undersample_ytrain[test]))
    undersample_precision.append(precision_score(undersample_ytrain[test], undersample_prediction))
    undersample_recall.append(recall_score(undersample_ytrain[test], undersample_prediction))
    undersample_f1.append(f1_score(undersample_ytrain[test], undersample_prediction))
    undersample_auc.append(roc_auc_score(undersample_ytrain[test], undersample_prediction))

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve

f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 14), sharey=True)
Example #8
undersample_ytest = undersample_ytest.values 

undersample_accuracy = []
undersample_precision = []
undersample_recall = []
undersample_f1 = []
undersample_auc = []

# Implementing NearMiss Technique (or Undersampling)
# Distribution of NearMiss (Just to see how it distributes the labels we won't use these variables)
X_nearmiss, y_nearmiss = NearMiss().fit_resample(undersample_X.values, undersample_y.values)
print('NearMiss Label Distribution: {}'.format(Counter(y_nearmiss)))
# Cross Validating 

for train, test in sss.split(undersample_Xtrain, undersample_ytrain):
    undersample_pipeline = imbalanced_make_pipeline(NearMiss(sampling_strategy='majority'), log_reg) # NearMiss undersampling happens during cross-validation, not before
    undersample_model = undersample_pipeline.fit(undersample_Xtrain[train], undersample_ytrain[train])
    undersample_prediction = undersample_model.predict(undersample_Xtrain[test])

    undersample_accuracy.append(undersample_pipeline.score(undersample_Xtrain[test], undersample_ytrain[test]))
    undersample_precision.append(precision_score(undersample_ytrain[test], undersample_prediction))
    undersample_recall.append(recall_score(undersample_ytrain[test], undersample_prediction))
    undersample_f1.append(f1_score(undersample_ytrain[test], undersample_prediction))
    undersample_auc.append(roc_auc_score(undersample_ytrain[test], undersample_prediction))
    

# Let's Plot LogisticRegression Learning Curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator1, estimator2, estimator3, estimator4, X, y, ylim=None, cv=None,n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
Example #9
def ClassifierTesting(method_name, method, method_prop, sampling_algorithm):

    acc_lst = []
    prec_lst = []
    rec_lst = []
    f1_lst = []
    auc_lst = []

    random_search = RandomizedSearchCV(method, method_prop, n_iter=20)

    for train, test in strat_cross_val.split(original_train_data,
                                             original_train_index):

        cross_val_model = imbalanced_make_pipeline(
            sampling_algorithm, random_search
        )  # resampling happens during cross-validation, not before
        cross_val_model.fit(original_train_data[train],
                            original_train_index[train])

        best_est = random_search.best_estimator_
        prediction = best_est.predict(original_train_data[test])

        acc_lst.append(
            cross_val_model.score(original_train_data[test],
                                  original_train_index[test]))
        prec_lst.append(precision_score(original_train_index[test],
                                        prediction))
        rec_lst.append(recall_score(original_train_index[test], prediction))
        f1_lst.append(f1_score(original_train_index[test], prediction))
        auc_lst.append(roc_auc_score(original_train_index[test], prediction))

    print('_' * 50)
    print(method_name)
    print('_' * 50)
    print('Cross-validation results:')
    print("accuracy:{0:.3f}".format(np.mean(acc_lst) * 100), '%')
    print("precision:{0:.3f}".format(np.mean(prec_lst) * 100), '%')
    print("recall:{0:.3f}".format(np.mean(rec_lst) * 100), '%')
    print("f1:{0:.3f}".format(np.mean(f1_lst) * 100), '%')
    print("Roc Auc:{0:.3f}".format(np.mean(auc_lst) * 100), '%')
    print('_' * 50)
    print('_' * 50)

    acc_lst = []
    prec_lst = []
    rec_lst = []
    f1_lst = []
    auc_lst = []

    prediction = best_est.predict(original_test_data)

    acc_lst.append(accuracy_score(original_test_index, prediction))
    prec_lst.append(precision_score(original_test_index, prediction))
    rec_lst.append(recall_score(original_test_index, prediction))
    f1_lst.append(f1_score(original_test_index, prediction))
    auc_lst.append(roc_auc_score(original_test_index, prediction))

    print('Test results:')
    print("accuracy:{0:.3f}".format(np.mean(acc_lst) * 100), '%')
    print("precision:{0:.3f}".format(np.mean(prec_lst) * 100), '%')
    print("recall:{0:.3f}".format(np.mean(rec_lst) * 100), '%')
    print("f1:{0:.3f}".format(np.mean(f1_lst) * 100), '%')
    print("Roc Auc:{0:.3f}".format(np.mean(auc_lst) * 100), '%')
    print('_' * 50)

    cm = confusion_matrix(original_test_index, prediction)

    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    sns.heatmap(cm, ax=ax, annot=True, cmap=plt.cm.Purples)
    ax.set_title(method_name, fontsize=18)
    ax.set_xticklabels(['Legitimate', 'Fraudulent'], fontsize=10, rotation=0)
    ax.set_yticklabels(['Legitimate', 'Fraudulent'], fontsize=10, rotation=90)
    ax.set_xlabel('Predicted values', fontsize=12, rotation=0)
    ax.set_ylabel('True values', fontsize=12, rotation=90)
    plt.show()

    FPR, TPR, _ = roc_curve(original_test_index, prediction)
    custom_roc_curve(FPR, TPR, method_name)
Example #10
def cv_grid_search(sampling, X, y, model_indices=False):
    log_reg_params = {
        "penalty": ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    }
    knears_params = {
        "n_neighbors": list(range(2, 5, 1)),
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }
    svc_params = {
        'C': [0.5, 0.7, 0.9, 1],
        'kernel': ['rbf', 'poly', 'sigmoid', 'linear']
    }
    tree_params = {
        "criterion": ["gini", "entropy"],
        "max_depth": list(range(2, 4, 1)),
        "min_samples_leaf": list(range(5, 7, 1))
    }

    grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params)
    grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params)
    grid_svc = GridSearchCV(SVC(), svc_params)
    grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)

    models = np.array([grid_log_reg, grid_knears, grid_svc, grid_tree])
    best_ests = []
    if model_indices:
        models = models[model_indices]

    for model in models:
        print("Model: " + str(model))
        # prepare the initial train/test split (only the last StratifiedKFold split is kept)
        splitter = StratifiedKFold(n_splits=5,
                                   random_state=None,
                                   shuffle=False)
        for train_index, test_index in splitter.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Turn into an array
        X_train, X_test = X_train.values, X_test.values
        y_train, y_test = y_train.values, y_test.values

        # List to append the score and then find the average
        accuracy_lst, precision_lst = [], []
        recall_lst, f1_lst = [], []
        auc_lst = []

        for train, test in splitter.split(X_train, y_train):
            pipeline = imbalanced_make_pipeline(sampling, model)
            pipeline.fit(X_train[train], y_train[train])
            # the grid-search step is fitted in place, so its best_estimator_ is available here
            best_est = model.best_estimator_
            prediction = best_est.predict(X_train[test])

            accuracy_lst.append(pipeline.score(X_train[test], y_train[test]))
            precision_lst.append(precision_score(y_train[test], prediction))
            recall_lst.append(recall_score(y_train[test], prediction))
            f1_lst.append(f1_score(y_train[test], prediction))
            auc_lst.append(roc_auc_score(y_train[test], prediction))

        print("accuracy: {}".format(np.mean(accuracy_lst)))
        print("precision: {}".format(np.mean(precision_lst)))
        print("recall: {}".format(np.mean(recall_lst)))
        print("f1: {}".format(np.mean(f1_lst)))
        print("AUC: {}".format(np.mean(auc_lst)))

        best_ests.append(best_est)
    return best_ests
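A hedged usage sketch for `cv_grid_search`: the SMOTE sampler and the pandas inputs are assumptions (the function indexes `X` and `y` with `.iloc`, so DataFrame/Series inputs are expected), and `model_indices=[3]` selects only the decision-tree grid from the internal list.

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

# Illustrative data only
X_arr, y_arr = make_classification(n_samples=1500, weights=[0.9, 0.1], random_state=0)
X, y = pd.DataFrame(X_arr), pd.Series(y_arr)

# models are ordered [log_reg, knears, svc, tree]; pick only the decision-tree grid
best_estimators = cv_grid_search(SMOTE(sampling_strategy='minority'), X, y, model_indices=[3])
print(best_estimators)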