コード例 #1
0
ファイル: analysis_functions.py プロジェクト: kaducovas/fraud
def ada_boost(x_train, y_train, x_test, y_test, compute_threshold=False):
    '''
        Train an AdaBoost ensemble of Logistic Regression base learners on
        x_train and predict on x_test.

        x_train, x_test: DataFrames of shape num_samples x num_features.
        y_train: training labels; used for cross-validation, fitting and
                 threshold selection.
        y_test: test labels; only used to plot the ROC curve.
        compute_threshold: when True, the decision threshold returned by
                 get_best_thresh (computed from training-set probabilities)
                 is applied to column 1 of the test probabilities and the
                 ROC curve is plotted. Otherwise model.predict is used.

        Returns the tuple (predTest, metricsCV, model): test-set predictions,
        the result of cross_val_analysis on the training data, and the
        classifier fitted on the full training set.
    '''
    from sklearn.ensemble       import AdaBoostClassifier
    from sklearn.linear_model   import LogisticRegression

    estimator = LogisticRegression(penalty='l2', C=1.0, solver='liblinear', n_jobs=1, max_iter=100)
    # The base learner is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases;
    # the first positional slot is the base learner in both spellings.
    model = AdaBoostClassifier(estimator, n_estimators=60, learning_rate=1.0)

    # Cross-validation is run before the final fit on the full training set.
    metricsCV = cross_val_analysis(classifier=model, x=x_train, y=y_train, plot=False)

    model.fit(x_train, y_train)

    if compute_threshold is True:
        probTest  = model.predict_proba(x_test)
        # Training-set probabilities are required by get_best_thresh; this
        # line used to be commented out, which raised a NameError below.
        probTrain = model.predict_proba(x_train)

        bestThresh = get_best_thresh(y_train, probTrain)

        predTest    = np.where(probTest[:, 1] >= bestThresh, defs.posCode, defs.negCode)

        plot_roc_curve(y_test, probTest, modelName="AdaBoost")
    else:
        predTest    = model.predict(x_test)

    return predTest, metricsCV, model
コード例 #2
0
ファイル: analysis_functions.py プロジェクト: kaducovas/fraud
def gaussian_naive_bayes(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
        Fit a Gaussian Naive Bayes classifier on x_train and predict on x_test.

        x_train, x_test: DataFrames of shape (data x features).
        y_train: training labels.
        y_test: test labels; only passed to plot_roc_curve.
        compute_threshold: when True, threshold column 1 of the test
                 probabilities with the value returned by get_best_thresh
                 and plot the ROC curve; otherwise use the model's predict.

        Returns (predictions on x_test, cross_val_analysis result, fitted model).
    '''
    from sklearn.naive_bayes import GaussianNB

    classifier = GaussianNB(priors=None)

    # Cross-validate first, then fit on the whole training set.
    cv_metrics = cross_val_analysis(classifier=classifier, x=x_train, y=y_train, plot=False)
    classifier.fit(x_train, y_train)

    # Guard clause: plain predictions when no custom threshold is requested.
    if compute_threshold is not True:
        return classifier.predict(x_test), cv_metrics, classifier

    test_proba = classifier.predict_proba(x_test)
    train_proba = classifier.predict_proba(x_train)

    cutoff = get_best_thresh(y_train, train_proba)

    is_positive = test_proba[:, 1] >= cutoff
    test_pred = np.where(is_positive, defs.posCode, defs.negCode)

    plot_roc_curve(y_test, test_proba, modelName="Naive Bayes")

    return test_pred, cv_metrics, classifier
コード例 #3
0
def decision_tree(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
        Train a Decision Tree classifier on x_train and predict on x_test.

        x_train, x_test: DataFrames of shape num_samples x num_features.
        y_train: training labels; used for cross-validation, fitting and
                 threshold selection.
        y_test: test labels; only used to plot the ROC curve.
        compute_threshold: when True, the threshold returned by
                 get_best_thresh (computed from training-set probabilities)
                 is applied to column 1 of the test probabilities and the
                 ROC curve is plotted. Otherwise model.predict is used.

        Returns the tuple (predTest, metricsCV, model).
    '''
    from sklearn.tree import DecisionTreeClassifier

    # Class imbalance is handled by class_weight='balanced'; the previously
    # defined (and never used) manual class-weight dict has been removed.
    model = DecisionTreeClassifier(class_weight='balanced',
                                   criterion='entropy',
                                   max_depth=15,
                                   min_samples_leaf=5)

    # Cross-validation is run before the final fit on the full training set.
    metricsCV = cross_val_analysis(classifier=model,
                                   x=x_train,
                                   y=y_train,
                                   plot=False)

    model.fit(x_train, y_train)

    if compute_threshold is True:
        probTest = model.predict_proba(x_test)
        probTrain = model.predict_proba(x_train)

        bestThresh = get_best_thresh(y_train, probTrain)

        predTest = np.where(probTest[:, 1] >= bestThresh, defs.posCode,
                            defs.negCode)

        plot_roc_curve(y_test, probTest, modelName="Decision Tree")
    else:
        predTest = model.predict(x_test)

    return predTest, metricsCV, model
コード例 #4
0
ファイル: analysis_functions.py プロジェクト: kaducovas/fraud
def log_reg(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
        Train Logistic Regression classifier on x_train and predict on x_test.

        x_train, x_test: DataFrames of shape data x features.
        y_train: training labels; used for fitting and threshold selection.
        y_test: test labels; only used to plot the ROC curve.
        compute_threshold: when True, the class discrimination threshold
                 returned by get_best_thresh (computed from training-set
                 probabilities) is applied to column 1 of the test
                 probabilities and the ROC curve is plotted. Otherwise
                 model.predict is used.

        Returns the tuple (predTest, metricsCV, model). Cross-validation is
        not run in this function, so metricsCV is always the placeholder 0.
    '''
    from sklearn.linear_model   import LogisticRegression
    modelName = "Logistic Regression"

    model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear', n_jobs=1, max_iter=100)

    # NOTE: cross_val_analysis is intentionally not called here; 0 keeps the
    # return shape identical to the other classifier wrappers.
    metricsCV = 0
    model.fit(x_train, y_train)  # TODO: Add class weights

    if compute_threshold is True:
        probTest  = model.predict_proba(x_test)
        probTrain = model.predict_proba(x_train)

        bestThresh = get_best_thresh(y_train, probTrain)

        predTest    = np.where(probTest[:, 1] >= bestThresh, defs.posCode, defs.negCode)

        # Reuse the single modelName definition instead of repeating the literal.
        plot_roc_curve(y_test, probTest, modelName=modelName)
    else:
        predTest    = model.predict(x_test)

    return predTest, metricsCV, model
コード例 #5
0
def nearest_neighbours(x_train,
                       y_train,
                       x_test,
                       y_test,
                       compute_threshold=True):
    '''
        Fit a K-Nearest Neighbours classifier on x_train and predict on x_test.

        x_train, x_test: DataFrames of shape data x features.
        y_train: training labels.
        y_test: test labels; only passed to plot_roc_curve.
        compute_threshold: when True, threshold column 1 of the test
                 probabilities with the value returned by get_best_thresh
                 and plot the ROC curve; otherwise use the model's predict.

        Returns (predictions on x_test, cross_val_analysis result, fitted model).
    '''
    from sklearn.neighbors import KNeighborsClassifier

    # TODO: Experiment with 'weights' parameter
    knn_params = {
        'n_neighbors': 3,
        'algorithm': 'ball_tree',
        'weights': 'uniform',
        'p': 2,
        'metric': 'minkowski',
        'n_jobs': -1,
    }
    classifier = KNeighborsClassifier(**knn_params)

    # Cross-validate first, then fit on the whole training set.
    cv_metrics = cross_val_analysis(classifier=classifier,
                                    x=x_train,
                                    y=y_train,
                                    plot=False)
    classifier.fit(x_train, y_train)  # TODO: Add class weights

    # Guard clause: plain predictions when no custom threshold is requested.
    if compute_threshold is not True:
        return classifier.predict(x_test), cv_metrics, classifier

    test_proba = classifier.predict_proba(x_test)
    train_proba = classifier.predict_proba(x_train)

    cutoff = get_best_thresh(y_train, train_proba)

    is_positive = test_proba[:, 1] >= cutoff
    test_pred = np.where(is_positive, defs.posCode, defs.negCode)

    plot_roc_curve(y_test, test_proba, modelName="Nearest Neighbors")

    return test_pred, cv_metrics, classifier
コード例 #6
0
def svm(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
        Fit an RBF-kernel SVM classifier on x_train and predict on x_test.

        x_train, x_test: DataFrames of shape data x features.
        y_train: training labels.
        y_test: test labels; only passed to plot_roc_curve.
        compute_threshold: when True, threshold column 1 of the test
                 probabilities with the value returned by get_best_thresh
                 and plot the ROC curve; otherwise use the model's predict.

        Returns (predictions on x_test, cross_val_analysis result, fitted model).
    '''
    from sklearn.svm import SVC

    # probability=True is what makes predict_proba available on SVC.
    svc_params = {
        'C': 1.0,
        'kernel': 'rbf',
        'degree': 3,
        'gamma': 'auto',
        'coef0': 0.0,
        'shrinking': True,
        'probability': True,
        'tol': 0.001,
        'cache_size': 200,
        'class_weight': None,
        'verbose': False,
        'max_iter': -1,
        'decision_function_shape': 'ovr',
        'random_state': None,
    }
    classifier = SVC(**svc_params)

    # Cross-validate first, then fit on the whole training set.
    cv_metrics = cross_val_analysis(classifier=classifier,
                                    x=x_train,
                                    y=y_train,
                                    plot=False)
    classifier.fit(x_train, y_train)

    # Guard clause: plain predictions when no custom threshold is requested.
    if compute_threshold is not True:
        return classifier.predict(x_test), cv_metrics, classifier

    test_proba = classifier.predict_proba(x_test)
    train_proba = classifier.predict_proba(x_train)

    cutoff = get_best_thresh(y_train, train_proba)

    is_positive = test_proba[:, 1] >= cutoff
    test_pred = np.where(is_positive, defs.posCode, defs.negCode)

    plot_roc_curve(y_test, test_proba, modelName="SVM")

    return test_pred, cv_metrics, classifier
コード例 #7
0
def linear_svm(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
        Train linear SVM classifier on x_train and predict on x_test.

        x_train, x_test: DataFrames of shape data x features.
        y_train: training labels; used for cross-validation, fitting and
                 threshold selection.
        y_test: test labels; only used to plot the ROC curve.
        compute_threshold: when True, a threshold returned by get_best_thresh
                 (computed from training-set scores) is applied to the test
                 scores and the ROC curve is plotted. Otherwise model.predict
                 is used.

        LinearSVC does not implement predict_proba, so the threshold branch
        works on decision_function scores squashed through a logistic
        sigmoid. These are monotonic pseudo-probabilities, NOT calibrated
        probabilities.

        Returns the tuple (predTest, metricsCV, model).
    '''
    from sklearn.svm import LinearSVC

    def scores_to_two_columns(scores):
        # Numerically stable logistic sigmoid: 0.5 * (1 + tanh(s / 2)).
        positive = 0.5 * (1.0 + np.tanh(0.5 * np.asarray(scores, dtype=float)))
        # Mimic the (n_samples, 2) layout of predict_proba so column 1 holds
        # the positive-class score, as the other wrappers pass to the helpers.
        # NOTE(review): assumes a binary problem (1-D decision_function
        # output) and that get_best_thresh / plot_roc_curve only rely on this
        # two-column layout - confirm against their implementations.
        return np.column_stack((1.0 - positive, positive))

    model = LinearSVC(C=1.0,
                      class_weight=None,
                      dual=True,
                      fit_intercept=True,
                      intercept_scaling=1,
                      loss='squared_hinge',
                      max_iter=1000,
                      multi_class='ovr',
                      penalty='l2',
                      random_state=0,
                      tol=0.0001,
                      verbose=0)

    # Cross-validation is run before the final fit on the full training set.
    metricsCV = cross_val_analysis(classifier=model,
                                   x=x_train,
                                   y=y_train,
                                   plot=False)

    model.fit(x_train, y_train)

    if compute_threshold is True:
        # Previously this called model.predict_proba, which LinearSVC does
        # not provide, so the default code path raised AttributeError.
        probTest = scores_to_two_columns(model.decision_function(x_test))
        probTrain = scores_to_two_columns(model.decision_function(x_train))

        bestThresh = get_best_thresh(y_train, probTrain)

        predTest = np.where(probTest[:, 1] >= bestThresh, defs.posCode,
                            defs.negCode)

        # Label distinguishes this curve from the kernel SVM wrapper's "SVM".
        plot_roc_curve(y_test, probTest, modelName="Linear SVM")
    else:
        predTest = model.predict(x_test)

    return predTest, metricsCV, model
コード例 #8
0
def linear_discriminant_analysis(x_train,
                                 y_train,
                                 x_test,
                                 y_test,
                                 n_components=2,
                                 compute_threshold=True):
    '''
        Train Linear Discriminant Analysis (LDA) classifier on x_train and predict on x_test.

        x_train, x_test: DataFrames of shape data x features.
        y_train: training labels; used for cross-validation, fitting and
                 threshold selection.
        y_test: test labels; only used to plot the ROC curve.
        n_components: Number of components for dimensionality reduction.
                 scikit-learn requires it to be
                 <= min(n_classes - 1, n_features); larger values are
                 clamped to that bound here. None is passed through.
        compute_threshold: when True, the threshold returned by
                 get_best_thresh (computed from training-set probabilities)
                 is applied to column 1 of the test probabilities and the
                 ROC curve is plotted. Otherwise model.predict is used.

        Returns the tuple (predTest, metricsCV, model).
    '''
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    if n_components is not None:
        # Recent scikit-learn raises ValueError when n_components exceeds
        # min(n_classes - 1, n_features); with two classes the default of 2
        # is already too large. Clamp instead (never below 1, so that
        # degenerate inputs still reach scikit-learn's own error messages).
        n_classes = len(np.unique(y_train))
        max_components = min(n_classes - 1, np.shape(x_train)[1])
        n_components = min(n_components, max(max_components, 1))

    model = LinearDiscriminantAnalysis(priors=None, n_components=n_components)

    # Cross-validation is run before the final fit on the full training set.
    metricsCV = cross_val_analysis(classifier=model,
                                   x=x_train,
                                   y=y_train,
                                   plot=False)

    model.fit(x_train, y_train)

    if compute_threshold is True:
        probTest = model.predict_proba(x_test)
        probTrain = model.predict_proba(x_train)

        bestThresh = get_best_thresh(y_train, probTrain)

        predTest = np.where(probTest[:, 1] >= bestThresh, defs.posCode,
                            defs.negCode)

        plot_roc_curve(y_test,
                       probTest,
                       modelName="Linear Discriminant Analysis")
    else:
        predTest = model.predict(x_test)

    return predTest, metricsCV, model
コード例 #9
0
def mlp(x_train,
        y_train,
        x_test,
        y_test,
        hidden_neurons=10,
        hidden_activation='tanh',
        output_activation='tanh',
        lossFunction='mean_squared_error',
        optmizer='Adam',
        metrics=None,
        patience=30,
        train_verbose=2,
        n_epochs=500,
        compute_threshold=True):
    '''
        Neural Networks classifier: one hidden Dense layer followed by a
        single-unit Dense output layer, trained with early stopping.

        x_train, x_test: DataFrames of shape data x features.
        y_train: training targets.
        y_test: test targets; used as validation data for early stopping
                 and to plot the ROC curve.
        hidden_neurons: number of units in the hidden layer.
        hidden_activation, output_activation: Keras activation names.
        lossFunction, optmizer, metrics: forwarded to model.compile.
                 metrics=None selects
                 ['mae', 'mape', 'acc', 'categorical_accuracy'].
        patience: EarlyStopping patience (monitors 'val_loss').
        train_verbose: verbosity for training and early stopping.
        n_epochs: maximum number of training epochs.
        compute_threshold: when True, a threshold returned by get_best_thresh
                 (computed from training-set outputs) is applied to the test
                 outputs and the ROC curve is plotted. Otherwise the raw
                 output of model.predict is returned as predTest.

        Returns the tuple (predTest, metricsCV, model).
    '''
    from keras.models import Sequential
    from keras.layers import Dense
    import keras.callbacks as callbacks

    def outputs_to_two_columns(outputs):
        # The network has a single output unit, so predict() yields one
        # column; flatten it and mimic the (n_samples, 2) layout of
        # predict_proba so column 1 holds the positive-class score.
        # NOTE(review): column 0 is only a complement placeholder, and with
        # the default 'tanh' output the scores are not probabilities -
        # confirm what get_best_thresh / plot_roc_curve actually require.
        positive = np.asarray(outputs, dtype=float).ravel()
        return np.column_stack((1.0 - positive, positive))

    # Avoid a mutable default argument shared between calls.
    if metrics is None:
        metrics = ['mae', 'mape', 'acc', 'categorical_accuracy']

    model = Sequential()
    model.add(
        Dense(hidden_neurons,
              input_dim=x_train.shape[1],
              activation=hidden_activation))
    model.add(Dense(1, activation=output_activation))
    model.compile(loss=lossFunction, optimizer=optmizer, metrics=metrics)

    earlyStopping = callbacks.EarlyStopping(monitor='val_loss',
                                            patience=patience,
                                            verbose=train_verbose,
                                            mode='auto')
    model.fit(x_train,
              y_train,
              epochs=n_epochs,
              callbacks=[earlyStopping],
              verbose=train_verbose,
              validation_data=(x_test, y_test))

    # NOTE(review): cross_val_analysis is the helper used by the scikit-learn
    # wrappers - confirm it accepts a Keras model.
    metricsCV = cross_val_analysis(classifier=model,
                                   x=x_train,
                                   y=y_train,
                                   plot=False)

    # The extra `model.fit(x_train, y_train)` copied from the scikit-learn
    # wrappers was dropped: the network is already trained above, and a
    # further default (single-epoch) fit would train past the early-stopping
    # point.

    if compute_threshold is True:
        # predict() is used instead of predict_proba(), which recent Keras
        # releases no longer provide on Sequential models.
        probTest = outputs_to_two_columns(model.predict(x_test))
        probTrain = outputs_to_two_columns(model.predict(x_train))

        bestThresh = get_best_thresh(y_train, probTrain)

        predTest = np.where(probTest[:, 1] >= bestThresh, defs.posCode,
                            defs.negCode)

        plot_roc_curve(y_test, probTest, modelName="MLP")
    else:
        predTest = model.predict(x_test)

    return predTest, metricsCV, model