Example #1
import pickle

import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

from global_functions import get_balanced_data

# load the data (same pickle as in Example #3 below)
file_name = 'data/credit card fraud/data_creditcard.pkl'  # set working directory to MSc Project
data = pd.read_pickle(file_name)


def get_forest_model(data=data, balanced=False, model_name='model_forest_unbalanced_ori.pkl'):

    if balanced:
        X_train, X_test, y_train, y_test = get_balanced_data(data)
    else:
        X = data.drop('class', axis=1)
        y = data['class']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

    # Number of trees in random forest
    n_estimators = [10, 100]  # [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split
    #max_features = ['auto']  # ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [3, 5, 10]  # [int(x) for x in np.linspace(10, 110, num = 11)]
    # Minimum number of samples required to split a node
    min_samples_split = [2, 10, 20]
    #criterion = ['gini', 'entropy']
    # Minimum number of samples required at each leaf node
    # min_samples_leaf = [1, 2]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]

    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   # 'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   # 'min_samples_leaf': min_samples_leaf,
                   #'criterion': criterion,
                   'bootstrap': bootstrap}

    # Use the grid to search for the best hyperparameters
    # First create the base model to tune
    rf = RandomForestRegressor()
    # Exhaustive grid search over the parameter grid, using 3-fold
    # cross-validation and 7 parallel jobs
    rf_random = GridSearchCV(estimator=rf, param_grid=random_grid, cv=3, verbose=2, n_jobs=7)
    rf_random.fit(X_train, y_train)
    clf = rf_random.best_estimator_
    print(rf_random.best_params_)

    # best_estimator_ was already refit on the full training set by GridSearchCV;
    # this extra fit simply returns the same estimator
    model = clf.fit(X_train, y_train)

    path = '1) classification algorithms/random forest/credit card fraud/' + model_name
    with open(path, 'wb') as file:
        pickle.dump(model, file)

    return
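
# A minimal usage sketch (hedged): train and pickle both variants; the balanced
# file name mirrors the model that Example #3 later loads.
get_forest_model(data)
get_forest_model(data, balanced=True, model_name='model_forest_balanced_ori.pkl')
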
def get_SVM_model(data=data, kernel='rbf', gamma=0.0001, C=1000, probability=True, model_name='model_SVM_balanced.pkl'):

    X_train, X_test, y_train, y_test = get_balanced_data(data)

    # Create SVM classifier object
    clf = svm.SVC(kernel=kernel, gamma=gamma, C=C, probability=probability)

    # Train SVM classifier
    model = clf.fit(X_train, y_train)

    path = '1) classification algorithms/SVM/credit card fraud/'+model_name

    with open(path, 'wb') as file:
        pickle.dump(model, file)
    return
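
# Usage sketch (hedged): train the SVM on the balanced split, then reload the
# pickled model for later prediction.
get_SVM_model(data)
with open('1) classification algorithms/SVM/credit card fraud/model_SVM_balanced.pkl', 'rb') as file:
    svm_model = pickle.load(file)
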
Example #3
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from global_functions import plot_confusion_matrix, cm_analysis, get_balanced_data

np.random.seed(7)

# load the data
file_name = 'data/credit card fraud/data_creditcard.pkl'  # set working directory to MSc Project
data = pd.read_pickle(file_name)

# unbalanced data
X = data.drop('class', axis=1)
y = data['class']
X_train_unbalanced, X_test_unbalanced, y_train_unbalanced, y_test_unbalanced = train_test_split(
    X, y, test_size=0.25, random_state=1)

# balanced data
# even out the data set -> 1:1 ratio of fraud and non fraud
X_train_balanced, X_test_balanced, y_train_balanced, y_test_balanced = get_balanced_data(
    data)
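
# get_balanced_data comes from global_functions and is not shown in these
# snippets. A minimal sketch consistent with how it is called (assumption:
# undersample the majority class to a 1:1 fraud ratio, then split 75/25):
#
# def get_balanced_data(data):
#     fraud = data[data['class'] == 1]
#     non_fraud = data[data['class'] == 0].sample(n=len(fraud), random_state=1)
#     balanced = pd.concat([fraud, non_fraud]).sample(frac=1, random_state=1)
#     X, y = balanced.drop('class', axis=1), balanced['class']
#     return train_test_split(X, y, test_size=0.25, random_state=1)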

# unpack unbalanced model
path = '1) classification algorithms/random forest/credit card fraud/model_forest_unbalanced_ori.pkl'
with open(path, 'rb') as file:
    unbalanced_model = pickle.load(file)

# unpack balanced model
path = '1) classification algorithms/random forest/credit card fraud/model_forest_balanced_ori.pkl'
with open(path, 'rb') as file:
    balanced_model = pickle.load(file)

# predict labels (the unbalanced forest is a regressor, so round its continuous
# outputs to 0/1 class labels)
unbalanced_predictions = unbalanced_model.predict(X_test_unbalanced)
unbalanced_predictions = [int(round(x)) for x in unbalanced_predictions]
balanced_predictions = balanced_model.predict(X_test_balanced)
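
# Evaluation sketch (hedged: plot_confusion_matrix and cm_analysis are imported
# above but their signatures are not shown, so this falls back to sklearn):
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test_unbalanced, unbalanced_predictions))
print(confusion_matrix(y_test_balanced, balanced_predictions))
print(classification_report(y_test_balanced, balanced_predictions))
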
import pickle

import numpy as np
import pandas as pd
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import (GridSearchCV, KFold, StratifiedKFold,
                                     cross_val_score)
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.svm import SVC

from global_functions import get_balanced_data

np.random.seed(7)

# load the data
file_name = 'data/customer churn/customer churn modified.pkl'  # set working directory to MSc Project
data = pd.read_pickle(file_name)

X_train, X_test, y_train, y_test = get_balanced_data(data)


def get_NN_model(data=data,
                 lr=0.001,
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'],
                 validation_split=0.2,
                 batch_size=25,
                 epochs=20,
                 shuffle=True,
                 verbose=2,
                 model_name='model_NN_balanced_churn.pkl'):

    # the original snippet is truncated after this point; what follows is a
    # completion sketch (architecture mirrors baseline_model in get_accuracies)
    X_train, X_test, y_train, y_test = get_balanced_data(data)

    # create the neural net
    n_inputs = X_train.shape[1]
    model = Sequential()
    model.add(Dense(100, input_dim=n_inputs, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss=loss, optimizer=Adam(lr=lr), metrics=metrics)
    model.fit(X_train, y_train, validation_split=validation_split,
              batch_size=batch_size, epochs=epochs, shuffle=shuffle,
              verbose=verbose)

    # Keras models often do not pickle cleanly, so the native saver is used here
    # (the '.pkl' default name is kept from the signature)
    model.save(model_name)
    return


def get_accuracies(data):

    X_train, X_test, y_train, y_test = get_balanced_data(data)

    seed = 1
    rfc = RandomForestClassifier(bootstrap=True,
                                 max_depth=10,
                                 max_features='auto',
                                 min_samples_leaf=2,
                                 min_samples_split=10,
                                 n_estimators=500)

    rfc2 = RandomForestClassifier(bootstrap=False,
                                  max_depth=2,
                                  max_features='auto',
                                  min_samples_leaf=5,
                                  min_samples_split=20,
                                  n_estimators=100)

    gbm = GradientBoostingClassifier(min_samples_split=25,
                                     min_samples_leaf=25,
                                     loss='deviance',
                                     learning_rate=0.1,
                                     max_depth=5,
                                     max_features='auto',
                                     criterion='friedman_mse',
                                     n_estimators=100)

    def baseline_model(optimizer='adam'):
        model = Sequential()
        model.add(Dense(100, input_dim=X_train.shape[1], activation='relu'))
        # second hidden layer with 50 units
        model.add(Dense(50, activation='relu'))
        model.add(Dense(2, activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=['accuracy'])
        return model

    keras = KerasClassifier(build_fn=baseline_model,
                            batch_size=32,
                            epochs=100,
                            verbose=0,
                            optimizer='Adam')

    outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

    svm = SVC(gamma="scale", probability=True, kernel='rbf', C=0.5)

    models = [('GBM', gbm), ('RFC', rfc), ('RFC2', rfc2), ('Keras', keras),
              ('SVM', svm)]

    results = []
    names = []
    scoring = 'accuracy'

    accuracy = []
    for name, model in models:
        cv_results = cross_val_score(model,
                                     X_train,
                                     y_train,
                                     cv=outer_cv,
                                     scoring=scoring)
        results.append(cv_results)
        names.append(name)
        # msg = "Cross-validation Accuracy %s: %f (+/- %f )" % (name, cv_results.mean() * 100, cv_results.std() * 100)
        # print(msg)
        model.fit(X_train, y_train)
        # print('Test set accuracy: {:.2f}'.format(model.score(X_test, y_test) * 100), '%')
        # accuracy.append(name)
        accuracy.append(model.score(X_test, y_test))
    return accuracy
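

# Usage sketch (hedged): `data` is the churn frame loaded above; the names match
# the model order inside get_accuracies.
for name, acc in zip(['GBM', 'RFC', 'RFC2', 'Keras', 'SVM'], get_accuracies(data)):
    print('{} test set accuracy: {:.2f} %'.format(name, acc * 100))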