Beispiel #1
0
def svm_validation_curve(df, features, params, param_name, param_range):
    """Plot the validation curve of an SVM classifier for one hyper-parameter.

    The feature matrix and target vector come from
    ``sets.build_matrices(df, features)``. An ``SVC`` built from ``params`` is
    handed to ``ml_an.plot_validation_curve`` and the figure is displayed
    with ``plt.show()``.

    Input:
            df (DataFrame) The database to draw from
            features (list) list of features in the DataFrame
            params (dict) keyword arguments forwarded to the SVC constructor
            param_name (string) name of the hyper parameter
            param_range (list) list of parameter values to use

    Output:
            None
    """

    feature_matrix, targets = sets.build_matrices(df, features)
    classifier = SVC(**params)

    ml_an.plot_validation_curve(
        classifier,
        param_name,
        param_range,
        "Validation curve / SVM classifier",
        feature_matrix,
        targets,
        ylim=(0.0, 1.1),
        cv=None,
        n_jobs=4)

    plt.show()
Beispiel #2
0
def svm_example(df, features, params):
    """Run an example SVM classification and report its performance.

    The routine shows the learning curve for the chosen hyper-parameters,
    fits the classifier on an 80% training split, prints the classification
    report for the remaining 20% and shows the precision-recall curve and
    the ROC curve for the "QSO" class.

    Input:
            df (DataFrame) The database to draw from
            features (list) list of features in the DataFrame
            params (dict) keyword arguments for the SVC constructor.
                ``probability=True`` is added unless the caller sets the
                ``probability`` key, because the precision-recall and ROC
                curves are computed from ``predict_proba``.

    Output:
            None
    """

    X, y = sets.build_matrices(df, features)

    # score curves, each time with 20% data randomly selected for validation.
    cv = cross_validation.ShuffleSplit(df.shape[0],
                                       n_iter=10,
                                       test_size=0.2,
                                       random_state=0)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    # SVC only offers predict_proba when it was built with probability=True.
    # Work on a copy so the caller's dictionary is not modified.
    svc_params = dict(params)
    svc_params.setdefault("probability", True)
    clf = SVC(**svc_params)

    # Raw string: "\g" is not a valid escape sequence in a normal literal.
    # NOTE(review): the title hard-codes gamma and C regardless of `params`.
    title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$, C=10)"
    ml_an.plot_learning_curve(clf, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)
    plt.show()

    clf.fit(X_train, y_train)
    y_true, y_pred = y_test, clf.predict(X_test)

    print("Classification Report ")
    print(classification_report(y_true, y_pred))
    print("\n")

    # No feature importances are printed: the previous code referenced an
    # undefined `feat_importances` variable, and SVC does not provide
    # feature importances the way tree ensembles do.

    # The predict_proba columns follow clf.classes_, so pick the column of
    # the positive class instead of assuming it is the first one. Column 0
    # is kept as fallback when "QSO" is not among the fitted classes.
    fitted_classes = list(clf.classes_)
    pos_column = fitted_classes.index("QSO") if "QSO" in fitted_classes else 0
    y_pred_proba = clf.predict_proba(X_test)[:, pos_column]

    ml_an.plot_precision_recall_curve(y_true, y_pred_proba, pos_label="QSO")
    plt.show()

    ml_an.plot_roc_curve(y_true, y_pred_proba, pos_label="QSO")

    plt.show()
def rf_class_predict(df_train, df_pred, features, label, params, rand_state):
    """Train a random forest classifier and classify a second data set.

    The classifier is fitted on the matrices built from ``df_train`` and then
    applied to the feature matrix built from ``df_pred``. No feature scaling
    is applied.

    Parameters:
            df_train : pandas dataframe
            The dataframe containing the features and the label used for
            training.

            df_pred : pandas dataframe
            The dataframe containing the features to predict classes for.

            features : list of strings
            List of features

            label : string
            The label for the classification

            params : dictionary
            Keyword arguments for the RandomForestClassifier constructor

            rand_state : integer
            Accepted for interface compatibility; not used by this routine.

    Return :
            clf : scikit-learn Classifier
            The Classifier trained on the training set

            y_pred : array-like
            An array with the predicted classes from df_pred

            y_prob : array-like
            An array with the predicted class probabilities
    """

    train_X, train_y = sets.build_matrices(df_train, features, label=label)
    pred_X = sets.build_matrix(df_pred, features)

    forest = RandomForestClassifier(**params)
    forest.fit(train_X, train_y)

    # Hard class predictions first, then the per-class probabilities.
    return forest, forest.predict(pred_X), forest.predict_proba(pred_X)
Beispiel #4
0
def svm_reg_predict(train_set, pred_set, features, label, params, pred_label):
    """Predict regression values for pred_set with a support vector
    regression trained on train_set.

    Parameters:
            train_set : pandas dataframe
            The dataframe containing the features and the label for the
            regression. Rows with a missing value in any feature are ignored;
            the caller's dataframe itself is not modified.

            pred_set : pandas dataframe
            The dataframe containing the features for prediction

            features : list of strings
            List of features

            label : string
            The label for the regression

            params : dictionary
            Keyword arguments for the SVR constructor

            pred_label : string
            Name of the new label in the pred_set dataframe in which the
            predicted values are written

    Output:
            pred_set : pandas dataframe
            The dataframe containing the features for prediction and the
            regression values in the pred_label named column. This is the
            same object that was passed in; the column is added in place.
    """

    # Drop training rows with a missing value in any of the features.
    # dropna returns a new frame, so the caller's train_set stays untouched.
    train_set = train_set.dropna(axis=0, how='any', subset=list(features))

    # Building training and prediction sample
    train_X, train_y = sets.build_matrices(train_set, features, label)
    pred_X = sets.build_matrix(pred_set, features)

    # Standardizing the data: the scaler is fitted on the training features
    # only and the identical transformation is applied to the prediction
    # features. Scaling both sets independently would put them on different
    # scales and the trained model would see shifted inputs.
    scaler = preprocessing.RobustScaler()
    train_X = scaler.fit_transform(train_X)
    pred_X = scaler.transform(pred_X)

    # Support Vector Regression
    reg = SVR(**params)
    reg.fit(train_X, train_y)

    pred_set[pred_label] = reg.predict(pred_X)

    return pred_set
Beispiel #5
0
def svm_grid_search(df, features, param_grid):
    """Run a grid search over SVM classification hyper-parameters and print
    the results.

    For each of the scores 'precision_weighted', 'recall_weighted' and
    'f1_weighted' a 5-fold cross-validated grid search is run on an 80%
    training split. The best parameters, the score of every grid point and a
    classification report for the held-out 20% are printed.

    Input:
            df (DataFrame) The database to draw from
            features (list) list of features in the DataFrame
            param_grid (dict or list of dicts) parameter grid handed to
                GridSearchCV

    Output:
            None
    """

    X, y = sets.build_matrices(df, features)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    # 'f1_score' is not a scorer name known to scikit-learn; the weighted F1
    # scorer matching the other two entries is 'f1_weighted'.
    scores = ['precision_weighted', 'recall_weighted', 'f1_weighted']

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print("")

        clf = GridSearchCV(SVC(C=1), param_grid, cv=5, scoring=score)
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print("")
        print(clf.best_params_)
        print("")
        print("Grid scores on training set:")
        print("")
        # Read the per-candidate results from cv_results_, like the other
        # grid-search routines do, and use loop names that do not rebind
        # the `scores` list being iterated.
        results = clf.cv_results_
        for mean, std, candidate in zip(results['mean_test_score'],
                                        results['std_test_score'],
                                        results['params']):
            print("%0.4f (+/-%0.04f) for %r" % (mean, std * 2, candidate))
        print("")

        print("Detailed classification report:")
        print("")
        print("The model is trained on the training set.")
        print("The scores are computed on the test set.")
        print("")
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print("")
Beispiel #6
0
def rf_reg_validation_curve(df, features, label, params, val_param, val_range):
    """Plot the validation curve of a random forest regressor for one
    hyper-parameter.

    A ``RandomForestRegressor`` built from ``params`` is handed, together with
    the matrices from ``sets.build_matrices``, to
    ``pz_an.plot_validation_curve``. The figure is displayed with
    ``plt.show()``.

    Parameters:
            df : pandas dataframe
            The dataframe containing the features and the label for the
            regression.

            features : list of strings
            List of features

            label : string
            The label for the regression

            params : dictionary
            Keyword arguments for the RandomForestRegressor constructor

            val_param : string
            Name of the validation parameter

            val_range : array-like
            List of parameter values for the validation curve

    """

    feature_matrix, targets = sets.build_matrices(df, features, label)
    regressor = RandomForestRegressor(**params)

    # Calculate and plot validation curve
    pz_an.plot_validation_curve(
        regressor,
        val_param,
        val_range,
        feature_matrix,
        targets,
        ylim=(0.0, 1.1),
        cv=None,
        n_jobs=2)

    plt.show()
def rf_class_validation_curve(df, features, label, params, param_name,
                              param_range):
    """Plot the validation curve of a random forest classifier for one
    hyper-parameter.

    A ``RandomForestClassifier`` built from ``params`` is handed, together
    with the unscaled matrices from ``sets.build_matrices``, to
    ``ml_an.plot_validation_curve``. The figure is displayed with
    ``plt.show()``.

    Input:
            df (DataFrame) The database to draw from
            features (list) list of features in the DataFrame
            label (string) The label for the classification
            params (dict) keyword arguments for the RandomForestClassifier
                constructor
            param_name (string) name of the hyper parameter
            param_range (list) list of parameter values to use

    Output:
            None
    """

    feature_matrix, targets = sets.build_matrices(df, features, label)
    forest = RandomForestClassifier(**params)

    ml_an.plot_validation_curve(
        forest,
        param_name,
        param_range,
        "Validation curve / Random Forest Classifier",
        feature_matrix,
        targets,
        ylim=(0.0, 1.1),
        cv=None,
        n_jobs=4)

    plt.show()
Beispiel #8
0
def svm_reg_grid_search(df, features, label, param_grid, rand_state, scores,
                        name):
    """This routine calculates the support vector machine regression on a grid
    of hyper-parameters to test the best hyper-parameters. The analysis
    results of the test will be written out and saved.

    For every entry of ``scores`` a 5-fold cross-validated grid search is run
    on an 80% training split of the robust-scaled features. The grid results
    are printed and stored in 'SVR_GS_<name>_<score>.hdf5', and the best
    model is evaluated on the remaining 20%.

    Parameters:
            df : pandas dataframe
            The dataframe containing the features and the label for the
            regression.

            features : list of strings
            List of features

            label : string
            The label for the regression

            param_grid : dictionary-like structure
            Parameter grid of input parameters for the grid search

            rand_state : integer
            Setting the random state variables to ensure reproducibility

            scores : list of strings
            Setting the score by which the grid search should be evaluated

            name : strings
            Setting the name of the output file for the grid search which
            contains all information about the grid

    """

    X, y = sets.build_matrices(df, features, label)

    # Standardizing the data
    X = preprocessing.robust_scale(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=rand_state)

    # Single-argument print calls give the same output whether or not the
    # print function is enabled. A bare print() would emit "()" when print
    # is a statement, so blank lines are written as print("").
    print("Training sample size:  %s" % (X_train.shape,))
    print("Evaluation sample size:  %s" % (X_test.shape,))

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print("")

        reg = GridSearchCV(SVR(), param_grid, scoring=score, cv=5, n_jobs=6)

        reg.fit(X_train, y_train)

        print("Best parameters set found on training set:")
        print("")
        print(reg.best_params_)
        print("")
        print("Grid scores on training set:")
        print("")
        cv_results = reg.cv_results_
        for mean, std, candidate in zip(cv_results['mean_test_score'],
                                        cv_results['std_test_score'],
                                        cv_results['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, candidate))
        print("")

        # Keep the results in their own variable instead of rebinding the
        # `df` input parameter.
        cv_frame = pd.DataFrame(cv_results)
        cv_frame.to_hdf('SVR_GS_' + name + '_' + score + '.hdf5', 'data')
        print("")
        print("The model is trained on the full development set (80%).")
        print("The scores are computed on the full evaluation set (20%).")
        print("")
        y_pred = reg.predict(X_test)
        ml_an.evaluate_regression(y_test, y_pred)
        pz_an.evaluate_photoz(y_test, y_pred)
        print("")
Beispiel #9
0
def svm_reg_example(df,
                    features,
                    label,
                    params,
                    rand_state,
                    save=False,
                    save_filename=None):
    """Run an example support vector regression and analyze the result.

    The robust-scaled features are split 80/20, an SVR is fitted on the
    training part and evaluated on the test part with
    ``ml_an.evaluate_regression``, ``pz_an.plot_redshifts`` and
    ``pz_an.plot_error_hist``.

    Parameters:
            df : pandas dataframe
            The dataframe containing the features and the label for the
            regression.

            features : list of strings
            List of features

            label : string
            The label for the regression

            params : dictionary
            Keyword arguments for the SVR constructor

            rand_state : integer
            Setting the random state variables to ensure reproducibility

            save : boolean
            If true, the predicted and test values are written to a csv file

            save_filename : string
            File name without the '.csv' extension. If save is set but no
            name is given, an error message is printed and nothing is saved.

    """

    # Building the scaled test and training sample
    feature_matrix, targets = sets.build_matrices(df, features, label)
    feature_matrix = preprocessing.robust_scale(feature_matrix)

    X_train, X_test, y_train, y_test = train_test_split(
        feature_matrix, targets, test_size=0.2, random_state=rand_state)

    # Support Vector Regression
    regressor = SVR(**params)
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    # Save predicted and test y values for later analysis
    if save and save_filename:
        value_pairs = np.array([y_pred, y_test]).T
        results = pd.DataFrame(data=value_pairs,
                               columns=['y_pred', 'y_test'])
        results.to_csv(save_filename + '.csv', index=False)
    elif save:
        print("Error: No Filename supplied!")

    # Evaluate regression method
    ml_an.evaluate_regression(y_test, y_pred)

    pz_an.plot_redshifts(y_test, y_pred)
    pz_an.plot_error_hist(y_test, y_pred)

    plt.show()
def rf_class_example(df_train, df_pred, features, label, params, rand_state):
    """This routine calculates an example of the random forest classification
     method. It is aimed at multi-class classification.
     It prints the classification report and feature importances and shows the
     confusion matrix for all classes.

    Parameters:
            df_train : pandas dataframe
            The dataframe containing the features and the label used to
            train the classifier.

            df_pred : pandas dataframe
            The dataframe containing the features and the true label of the
            objects to classify.

            features : list of strings
            List of features

            label : string
            The label for the classification

            params : dictionary
            List of input parameters for the classification

            rand_state : integer
            Setting the random state variables to ensure reproducibility

    Return :
            y_true : array-like
            The true classes of df_pred as strings

            y_pred : array-like
            The predicted classes of df_pred as strings

            df_prob : pandas dataframe
            One probability column per class, indexed like df_pred, plus
            the columns 'qso_prob', 'true_class' and 'pred_class'.
    """

    # The probabilities are recomputed further down, so the third return
    # value of rf_class_predict is not needed here.
    clf, y_pred, _ = rf_class_predict(df_train, df_pred, features, label,
                                      params, rand_state)

    X_pred, y_true = sets.build_matrices(df_pred, features, label=label)

    # `str` is used as the target dtype; the 'string' alias is not
    # accepted by every NumPy version.
    y_true = y_true.astype(str)
    y_pred = y_pred.astype(str)

    print("Classification Report ")
    print(classification_report(y_true, y_pred))
    print("\n")
    print("Feature Importance ")
    for feature, importance in zip(features, clf.feature_importances_):
        print(str(feature) + ": " + str(importance))
    print("\n")

    # Confusion matrix
    # Pass the classifier's classes as labels so that the rows/columns of
    # the matrix match the class names used for plotting, even when a class
    # does not occur in y_true or y_pred.
    class_names = clf.classes_.astype(str)
    cnf_matrix = confusion_matrix(y_true,
                                  y_pred,
                                  labels=class_names,
                                  sample_weight=None)

    ml_an.my_confusion_matrix(cnf_matrix, classes=class_names)

    plt.show()

    # Predicting the probabilities for the classes
    y_prob = clf.predict_proba(X_pred)

    df_prob = pd.DataFrame(y_prob)
    df_prob.columns = clf.classes_
    df_prob.index = df_pred.index
    # NOTE: requires the classifier to know the classes 'highz', 'midz',
    # 'lowz' and 'vlowz'; their probabilities are summed into 'qso_prob'.
    df_prob['qso_prob'] = (df_prob.highz + df_prob.midz + df_prob.lowz +
                           df_prob.vlowz)
    df_prob['true_class'] = y_true
    df_prob['pred_class'] = y_pred

    return y_true, y_pred, df_prob
def rf_class_grid_search(df_train, df_pred, features, label, param_grid,
                         rand_state, scores, name):
    """This routine calculates the random forest classification on a grid of
    hyper-parameters for the random forest method to test the best
    hyper-parameters. The analysis results of the test will be written out and
    saved.

    For every entry of ``scores`` a 5-fold cross-validated grid search is run
    on the training set. The classification report for the test set and the
    grid results are printed, and the grid results are stored in
    'RF_GS_CLASS_<name>_<score>.hdf5'.

    Parameters:
            df_train : pandas dataframe
            The dataframe containing the features and the label used for
            the grid search (training set).

            df_pred : pandas dataframe
            The dataframe containing the features and the label used as
            test set for the classification report.

            features : list of strings
            List of features

            label : string
            The label for the classification

            param_grid : dictionary-like structure
            Parameter grid of input parameters for the grid search

            rand_state : integer
            Setting the random state variables to ensure reproducibility

            scores : list of strings
            Setting the score by which the grid search should be evaluated

            name : strings
            Setting the name of the output file for the grid search which
            contains all information about the grid

    """

    X_train, y_train = sets.build_matrices(df_train, features, label=label)
    X_test, y_test = sets.build_matrices(df_pred, features, label=label)

    # Single-argument print calls give the same output whether or not the
    # print function is enabled. A bare print() would emit "()" when print
    # is a statement, so blank lines are written as print("").
    print("%s %s" % (X_train.shape, X_test.shape))

    print("%s %s" % (pd.Series(y_train).value_counts(),
                     pd.Series(y_test).value_counts()))

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print("")

        clf = GridSearchCV(RandomForestClassifier(random_state=rand_state),
                           param_grid,
                           cv=5,
                           scoring=score,
                           n_jobs=4)

        clf.fit(X_train, y_train)

        print("Detailed classification report:")
        print("")
        print("The model is trained on the training set.")
        print("The scores are computed on the test set.")
        print("")
        # `str` is used as the target dtype; the 'string' alias is not
        # accepted by every NumPy version.
        y_true = y_test.astype(str)
        y_pred = clf.predict(X_test).astype(str)

        print(classification_report(y_true, y_pred))
        print("\n")

        print("Best parameters set found on training set:\n")
        print(clf.best_params_)
        print("\n")
        print("Grid scores on training set:")
        print("\n")
        cv_results = clf.cv_results_
        for mean, std, candidate in zip(cv_results['mean_test_score'],
                                        cv_results['std_test_score'],
                                        cv_results['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, candidate))
        print("\n")
        cv_frame = pd.DataFrame(cv_results)
        cv_frame.to_hdf('RF_GS_CLASS_' + name + '_' + score + '.hdf5', 'data')