Python retrieve_data Examples, data_process.retrieve_data Python Examples

Example #1

0

Show file

File: test_data_process.py Project: michaesb/machine_learning_3

 def test_undersampling(self):
     """
     Check that the ``ratio`` argument of ``retrieve_data`` is honoured.

     A ratio of 1.1 must be rejected with ``ValueError``. With a ratio of
     0.5 the train and test splits must have the expected sizes, both for
     the feature arrays and for the label arrays.
     """
     with self.assertRaises(ValueError):
         retrieve_data(undersampling=True, ratio=1.1)
     X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                      ratio=0.5)
     self.assertEqual(1476, len(X_train) + len(X_test))
     self.assertEqual(1476, len(y_train) + len(y_test))
     self.assertEqual(988, len(y_train))
     self.assertEqual(988, len(X_train))
     self.assertEqual(488, len(y_test))
     # Previously len(y_test) was asserted twice and X_test was never
     # checked on its own.
     self.assertEqual(488, len(X_test))

Example #2

0

Show file

File: neural_network.py Project: michaesb/machine_learning_3

def neuralnet_tuned():
    """
    Fit an MLP classifier with a fixed set of hyperparameters and score it.

    The undersampled dataset is loaded with ratio 1.0 and random_state 3,
    the network is trained on the training split, and its predictions on
    the test split are handed to ``scores``. The hyperparameters are
    presumably the ones selected by the grid search elsewhere in this
    project -- verify against that function.
    """

    X_train, X_test, y_train, y_test = retrieve_data(
        undersampling=True, ratio=1.0, random_state=3)

    # Collected in one place so the tuned configuration is easy to read.
    mlp_settings = {
        "learning_rate": "adaptive",
        "learning_rate_init": 0.001,
        "activation": "logistic",
        "alpha": 0.1,
        "hidden_layer_sizes": (30, 30, 30, 30),
        "max_iter": 500,
        "solver": "lbfgs",
        "tol": 1e-4,
        "verbose": False,
    }
    network = sklearn.neural_network.MLPClassifier(**mlp_settings)

    network.fit(X_train, y_train)
    test_prediction = network.predict(X_test)

    scores(test_prediction, y_test, X_train, y_train)

Example #3

0

Show file

def logreg_gridsearch():
    """
    Grid-search a logistic regression classifier, refitting on recall.

    Loads the undersampled dataset (ratio 1.0, random_state 3), searches
    over the regularisation strength ``C`` and the penalty type with
    5-fold cross validation, then passes the test-set predictions of the
    refitted model, together with the search object, to ``scores``.
    """

    X_train, X_test, y_train, y_test = retrieve_data(
        undersampling=True, ratio=1.0, random_state=3)

    estimator = LogisticRegression(random_state=4, solver="liblinear")

    # Hyperparameter values to try.
    search_space = dict(C=np.logspace(-3, 3, 7), penalty=["l1", "l2"])

    # Metrics recorded for every candidate.
    metric_scorers = dict(
        precision_score=make_scorer(precision_score),
        recall_score=make_scorer(recall_score),
        accuracy_score=make_scorer(accuracy_score),
    )

    # refit="recall_score" makes the final model the one with best recall.
    grid_search = GridSearchCV(
        estimator,
        search_space,
        cv=5,
        scoring=metric_scorers,
        refit="recall_score",
        return_train_score=True,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    test_prediction = grid_search.predict(X_test)
    scores(test_prediction, y_test, X_train, y_train, grid_search)

Example #4

0

Show file

def decisiontree_tuned():
    """
    Fit a decision tree with fixed hyperparameters, score it and draw it.

    Loads the undersampled dataset (ratio 1, random_state 2), trains the
    tree, passes its test-set predictions to ``scores`` and finally renders
    the fitted tree to ``plots/tree`` in PNG format through graphviz.
    """

    X_train, X_test, y_train, y_test = retrieve_data(
        undersampling=True, ratio=1, random_state=2)

    tree_settings = {
        "criterion": "gini",
        "max_depth": 20,
        "max_features": 30,
        "min_samples_leaf": 1,
        "min_samples_split": 2,
    }
    model = tree.DecisionTreeClassifier(**tree_settings)

    model.fit(X_train, y_train)
    test_prediction = model.predict(X_test)

    scores(test_prediction, y_test, X_train, y_train)

    # out_file=None makes export_graphviz return the DOT source as a string.
    dot_source = tree.export_graphviz(
        model,
        out_file=None,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    rendering = graphviz.Source(dot_source)
    rendering.format = "png"
    rendering.render("plots/tree")

Example #5

0

Show file

File: test_data_process.py Project: michaesb/machine_learning_3

 def test_arrays(self):
     """
     Check that the default split returned by ``retrieve_data`` has the
     expected number of samples in every output array.
     """
     total_size = 284807
     # Expected sizes as the test defines them: 67% of the samples for
     # training, the remainder (33%, rounded up via the +1) for testing.
     expected_train = int(total_size * 0.67)
     expected_test = int(total_size * 0.33 + 1)

     X_train, X_test, y_train, y_test = retrieve_data()

     self.assertEqual(total_size, len(X_train) + len(X_test))
     self.assertEqual(total_size, len(y_train) + len(y_test))
     self.assertEqual(expected_train, len(y_train))
     self.assertEqual(expected_train, len(X_train))
     self.assertEqual(expected_test, len(y_test))
     self.assertEqual(expected_test, len(X_test))

Example #6

0

Show file

def decisiontree_gridsearch():
    """
    Grid-search a decision tree classifier, refitting on recall.

    Loads the undersampled dataset (ratio 1.0, random_state 2), runs a
    5-fold cross-validated grid search over the tree hyperparameters while
    recording precision, recall and accuracy, and refits the candidate with
    the best recall. The test-set predictions and the search object are
    passed to ``scores``, and the best tree is rendered to ``plots/tree``
    in PNG format through graphviz.
    """

    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=2)

    ## Our decision tree model
    clf = tree.DecisionTreeClassifier()

    # Grid search parameter grid to search through.
    # "auto" is intentionally absent from max_features: for
    # DecisionTreeClassifier it was an alias of "sqrt" (already listed),
    # it was deprecated in scikit-learn 1.1 and removed in 1.3, so keeping
    # it either duplicates fits or makes them fail.
    param_grid = {
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 3, 5, 8, 10],
        "max_depth": [3, 5, 10, 15, 20, 25],
        "max_features": [5, 20, 25, 30, "sqrt", "log2"],
        "min_samples_leaf": [1, 5, 10, 20, 50]
    }

    ## Different scorers for the grid search
    scorers = {
        "precision_score": make_scorer(precision_score),
        "recall_score": make_scorer(recall_score),
        "accuracy_score": make_scorer(accuracy_score)
    }

    ## Creating the grid search object. Using refit="recall_score" to optimize using this score
    grid_search = GridSearchCV(clf,
                               param_grid,
                               cv=5,
                               scoring=scorers,
                               refit="recall_score",
                               return_train_score=True,
                               n_jobs=-1)

    grid_search.fit(X_train, y_train)
    prediction = grid_search.predict(X_test)

    scores(prediction, y_test, X_train, y_train, grid_search)

    ## Using the graphviz package to produce a PNG image to display the decision tree
    # out_file=None makes export_graphviz return the DOT source as a string.
    dot_data = tree.export_graphviz(grid_search.best_estimator_,
                                    out_file=None,
                                    filled=True,
                                    rounded=True,
                                    special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.format = "png"
    graph.render("plots/tree")

Example #7

0

Show file

def logreg_tuned():
    """
    Fit a logistic regression model with fixed hyperparameters and score it.

    Loads the undersampled dataset (ratio 1.0, random_state 3), trains an
    L1-penalised liblinear model with C=0.01 and passes its test-set
    predictions to ``scores``.
    """

    X_train, X_test, y_train, y_test = retrieve_data(
        undersampling=True, ratio=1.0, random_state=3)

    model = LogisticRegression(
        random_state=4,
        solver="liblinear",
        C=0.01,
        penalty="l1",
    )
    model.fit(X_train, y_train)

    test_prediction = model.predict(X_test)
    scores(test_prediction, y_test, X_train, y_train)

Example #8

0

Show file

File: neural_network.py Project: michaesb/machine_learning_3

def neuralnet_learningrate():
    """
    Scan the initial learning rate of an MLP classifier and report scores.

    For 70 initial learning rates between 1e-3 and 1e-1 a network is
    trained on the undersampled dataset (ratio 0.1); accuracy, precision
    and recall on the test split are recorded, plotted against the
    learning rate on a logarithmic x axis and finally printed.
    """

    ratio_ = 0.1
    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=ratio_)

    learning_rate = 10**(-np.linspace(3, 1, 70))
    n_rates = len(learning_rate)
    acc_score = np.zeros(n_rates)
    rec_score = np.zeros(n_rates)
    prec_score = np.zeros(n_rates)

    for idx, eta in enumerate(learning_rate):
        # Progress indicator, overwritten in place thanks to "\r".
        print(int(100 * idx / n_rates), "%", end="\r")

        network = sklearn.neural_network.MLPClassifier(
            hidden_layer_sizes=(30, 30, 30, 30),
            learning_rate="adaptive",
            learning_rate_init=eta,
            max_iter=1000000,
            tol=1e-10,
            verbose=False,
        )
        network = network.fit(X_train, y_train.ravel())
        predicted = network.predict(X_test)

        acc_score[idx] = accuracy_score(y_test.ravel(), predicted)
        prec_score[idx] = precision_score(y_test.ravel(), predicted)
        rec_score[idx] = recall_score(y_test.ravel(), predicted)

    # Curves are drawn in the same order as the legend entries below.
    for curve in (acc_score, prec_score, rec_score):
        plt.semilogx(learning_rate, curve)
    plt.legend(["Accuracy", "Precision", "Recall"], prop={'size': 12})
    plt.xlabel(r"Learning rate $\eta$", size=14)
    plt.ylabel("Scores", size=14)
    plt.title("Scikit-Learn NeuralNet score for different learning rates",
              size=16)
    plt.show()

    print("Ratio: ", ratio_)
    print("Accuracy score", acc_score)
    print("Precision score", prec_score)
    print("Recall score", rec_score)

Example #9

0

Show file

File: neural_network.py Project: michaesb/machine_learning_3

def neuralnet_gridsearch():
    """
    Grid-search a multilayer perceptron classifier, refitting on recall.

    Loads the undersampled dataset (ratio 1.0, random_state 3), runs a
    5-fold cross-validated grid search over network architecture, solver,
    regularisation and iteration limit while recording precision, recall
    and accuracy, and refits the candidate with the best recall. The
    test-set predictions and the search object are passed to ``scores``.
    """

    X_train, X_test, y_train, y_test = retrieve_data(undersampling=True,
                                                     ratio=1.0,
                                                     random_state=3)

    ## We decided on using the adaptive learning rate and an initial rate of 0.001.
    clf = sklearn.neural_network.MLPClassifier(learning_rate="adaptive",
                                               learning_rate_init=0.001,
                                               tol=1e-4,
                                               verbose=False)

    ## Grid search parameter grid to search through.
    # (30,) is a one-element tuple; the former "(30)" was just the int 30
    # and only worked because scikit-learn also accepts a bare int here.
    param_grid = {
        "hidden_layer_sizes": [(30,), (40, 40), (50, 50, 50),
                               (30, 30, 30, 30)],
        "activation": ["logistic"],
        "solver": ["lbfgs", "adam"],
        "alpha": [0.1, 0.01, 0.001],
        "max_iter": [500, 1000]
    }

    ## Different scorers for the grid search
    scorers = {
        "precision_score": make_scorer(precision_score),
        "recall_score": make_scorer(recall_score),
        "accuracy_score": make_scorer(accuracy_score)
    }

    ## Creating the grid search object. Using refit="recall_score" to optimize using this score
    grid_search = GridSearchCV(clf,
                               param_grid,
                               cv=5,
                               scoring=scorers,
                               refit="recall_score",
                               return_train_score=True,
                               n_jobs=-1)

    grid_search.fit(X_train, y_train)
    prediction = grid_search.predict(X_test)

    scores(prediction, y_test, X_train, y_train, grid_search)

Example #10

0

Show file

File: random_forest.py Project: michaesb/machine_learning_3

def randomforest_gridsearch():
    """
    Grid-search a random forest classifier, refitting on recall.

    Loads the undersampled dataset (ratio 1.0, no fixed random state), runs
    a 5-fold cross-validated grid search over the forest hyperparameters
    while recording precision, recall and accuracy, and refits the
    candidate with the best recall. The test-set predictions and the search
    object are passed to ``scores``.
    """

    X_train, X_test, y_train, y_test = retrieve_data(
        undersampling=True, ratio=1.0, random_state=None)

    forest = RandomForestClassifier(random_state=4)

    # Hyperparameter values to try.
    search_space = dict(
        criterion=["gini", "entropy"],
        n_estimators=[10, 100, 200],
        min_samples_split=[3, 5, 10],
        max_depth=[5, 15, 25],
        max_features=[5, 10, 30],
        min_samples_leaf=[1, 10, 20],
    )

    # Metrics recorded for every candidate.
    metric_scorers = dict(
        precision_score=make_scorer(precision_score),
        recall_score=make_scorer(recall_score),
        accuracy_score=make_scorer(accuracy_score),
    )

    # refit="recall_score" makes the final model the one with best recall.
    grid_search = GridSearchCV(
        forest,
        search_space,
        cv=5,
        scoring=metric_scorers,
        refit="recall_score",
        return_train_score=True,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    test_prediction = grid_search.predict(X_test)
    scores(test_prediction, y_test, X_train, y_train, grid_search)

Example #11

0

Show file

File: random_forest.py Project: michaesb/machine_learning_3

def randomforest_tuned():
    """
    Fit a random forest on the undersampled dataset and score it.

    Loads the data (ratio 1, no fixed random state), prints the shape of
    each split, trains a ``RandomForestClassifier`` and passes its test-set
    predictions to ``scores``. Note that the classifier is constructed with
    its default hyperparameters here.
    """

    X_train, X_test, y_train, y_test = retrieve_data(
        undersampling=True, ratio=1, random_state=None)

    # Report the dimensions of every split before training.
    labelled_splits = (
        ("X_train", X_train),
        ("Y_train", y_train),
        ("X_test", X_test),
        ("Y_test", y_test),
    )
    for label, split in labelled_splits:
        print(f"shape of {label} {np.shape(split)}")

    forest = RandomForestClassifier()
    forest.fit(X_train, y_train)

    test_prediction = forest.predict(X_test)
    scores(test_prediction, y_test, X_train, y_train)

Example #12

0

Show file

def decisiontree_undersamplingratio():
    """
    Measure how decision tree scores depend on the undersampling ratio.

    For 61 ratios between 1e-6 and 1 the dataset is re-sampled, a default
    decision tree is trained, and accuracy, precision and recall on the
    test split are recorded. The three curves are plotted against the
    ratio on a logarithmic x axis, saved to ``plots/dectree_ratiotest.png``
    and shown.
    """

    ratio_ = 10**(-np.linspace(6.0, 0.0, 61))
    n_ratios = len(ratio_)
    acc_score = np.zeros(n_ratios)
    rec_score = np.zeros(n_ratios)
    prec_score = np.zeros(n_ratios)

    for idx, current_ratio in enumerate(ratio_):
        # Progress indicator, overwritten in place thanks to "\r".
        print(int(100 * idx / n_ratios), "%", end="\r")

        X_train, X_test, y_train, y_test = retrieve_data(
            undersampling=True, ratio=current_ratio)

        model = tree.DecisionTreeClassifier()
        model = model.fit(X_train, y_train.ravel())
        predicted = model.predict(X_test)

        acc_score[idx] = accuracy_score(y_test.ravel(), predicted)
        prec_score[idx] = precision_score(y_test.ravel(), predicted)
        rec_score[idx] = recall_score(y_test.ravel(), predicted)

    # Curves are drawn in the same order as the legend entries below.
    for curve in (acc_score, prec_score, rec_score):
        plt.semilogx(ratio_, curve)
    plt.xlabel("Ratio", size=14)
    plt.ylabel("Score", size=14)
    plt.title("Scikit-Learn Decision Tree score for different ratios", size=16)
    plt.legend(['Accuracy', "Precision", "Recall"], prop={'size': 12})
    plt.savefig("plots/dectree_ratiotest.png")
    plt.show()