def svm_validation_curve(df, features, params, param_name, param_range):
    """Plot the validation curve of an SVM classifier for one hyper-parameter.

    The feature matrix and target vector are built from ``df``, an ``SVC``
    is created from ``params`` and both are handed to
    ``ml_an.plot_validation_curve``; the resulting figure is then shown.

    Input:
            df (DataFrame) The database to draw from
            features (list) list of features in the DataFrame
            params (dict) keyword arguments forwarded to the SVC constructor
            param_name (string) name of the hyper parameter
            param_range (list) list of parameter values to use
    Output:
            None
    """
    plot_title = "Validation curve / SVM classifier"

    # No label is passed here; presumably build_matrices falls back to a
    # default label column -- TODO confirm against sets.build_matrices.
    feature_matrix, targets = sets.build_matrices(df, features)

    classifier = SVC(**params)

    ml_an.plot_validation_curve(
        classifier,
        param_name,
        param_range,
        plot_title,
        feature_matrix,
        targets,
        ylim=(0.0, 1.1),
        cv=None,
        n_jobs=4,
    )

    plt.show()
def svm_example(df, features, params):
    """Run one example of the SVM classification method.

    The routine shows the learning curve for the chosen hyper-parameters,
    prints the classification report for a 20% hold-out sample and shows the
    precision-recall curve and the ROC curve.

    Input:
            df (DataFrame) The database to draw from
            features (list) list of features in the DataFrame
            params (dict) keyword arguments forwarded to the SVC constructor.
                ``clf.predict_proba`` is called below, which scikit-learn
                only provides for an SVC built with ``probability=True``.
    Output:
            None
    """
    X, y = sets.build_matrices(df, features)

    # Score curves, each time with 20% data randomly selected for validation.
    cv = cross_validation.ShuffleSplit(df.shape[0], n_iter=10,
                                       test_size=0.2, random_state=0)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    clf = SVC(**params)

    # Raw string: "\g" is not a valid escape sequence, the backslash is meant
    # literally for the mathtext renderer.
    # NOTE(review): the title hard-codes gamma and C regardless of `params`.
    title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$, C=10)"
    ml_an.plot_learning_curve(clf, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)
    plt.show()

    clf.fit(X_train, y_train)
    y_true, y_pred = y_test, clf.predict(X_test)

    print("Classification Report ")
    print(classification_report(y_true, y_pred))
    print("\n")

    # The former "Feature Importance" printout was removed: it read a
    # variable that was never assigned (NameError), and SVC does not expose
    # `feature_importances_` to fill it from.

    # Probability of the first class in clf.classes_.
    # NOTE(review): this assumes column 0 corresponds to pos_label "QSO" --
    # confirm against the label values in the data.
    y_pred_proba = clf.predict_proba(X_test)[:, 0]

    ml_an.plot_precision_recall_curve(y_true, y_pred_proba, pos_label="QSO")
    plt.show()

    ml_an.plot_roc_curve(y_true, y_pred_proba, pos_label="QSO")
    plt.show()
def rf_class_predict(df_train, df_pred, features, label, params, rand_state):
    """Train a random forest classifier and predict classes for a second set.

    The classifier is fitted on ``df_train`` and then applied to ``df_pred``.
    Nothing is printed or plotted here.

    Parameters:
            df_train : pandas dataframe
            The dataframe containing the features and the label used for
            training.

            df_pred : pandas dataframe
            The dataframe containing the features to predict on.

            features : list of strings
            List of features

            label : string
            The label for the classification

            params : dictionary
            List of input parameters for the RandomForestClassifier

            rand_state : integer
            Random state of the classifier, used to ensure reproducibility.
            It is only applied when ``params`` does not already contain a
            'random_state' entry.

    Return :
            clf : scikit-learn Classifier
            The Classifier trained on the training set

            y_pred : array-like
            An array with the predicted classes from df_pred

            y_prob : array-like
            An array with the predicted class probabilities
    """
    X_train, y_train = sets.build_matrices(df_train, features, label=label)
    X_pred = sets.build_matrix(df_pred, features)

    # The features are used unscaled; no standardization is applied here.

    # Previously rand_state was accepted but silently ignored. Work on a copy
    # so the caller's dictionary is not modified, and let an explicit
    # 'random_state' in params take precedence.
    rf_params = dict(params)
    rf_params.setdefault('random_state', rand_state)

    clf = RandomForestClassifier(**rf_params)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_pred)

    # Predicting the probabilities for the classes
    y_prob = clf.predict_proba(X_pred)

    return clf, y_pred, y_prob
def svm_reg_predict(train_set, pred_set, features, label, params, pred_label):
    """Predict regression values for pred_set with a support vector regressor
    trained on train_set.

    Parameters:
            train_set : pandas dataframe
            The dataframe containing the features and the label for the
            regression. Rows with a missing value in any feature are ignored;
            the caller's dataframe itself is left untouched.

            pred_set : pandas dataframe
            The dataframe containing the features for prediction

            features : list of strings
            List of features

            label : string
            The label for the regression

            params : dictionary
            List of input parameters for the regression (forwarded to SVR)

            pred_label : string
            Name of the new label in the pred_set dataframe in which the
            predicted values are written

    Output:
            pred_set : pandas dataframe
            The dataframe containing the features for prediction and the
            regression values in the pred_label named column. This is the
            same object that was passed in; the column is added in place.
    """
    # Drop incomplete training rows in one pass and without inplace=True, so
    # the caller's training dataframe is not modified as a side effect.
    if features:
        train_set = train_set.dropna(axis=0, how='any', subset=list(features))

    # Building prediction and training sample
    train_X, train_y = sets.build_matrices(train_set, features, label)
    pred_X = sets.build_matrix(pred_set, features)

    # Standardizing the data. The scaler is fitted on the training sample
    # only and the same transformation is then applied to the prediction
    # sample; scaling the two samples independently would feed the model
    # features on a different scale than the one it was trained on.
    # NOTE(review): assumes `preprocessing` is sklearn.preprocessing, as the
    # original robust_scale call suggests.
    scaler = preprocessing.RobustScaler()
    train_X = scaler.fit_transform(train_X)
    pred_X = scaler.transform(pred_X)

    # Support Vector Regression
    reg = SVR(**params)
    reg.fit(train_X, train_y)

    pred_set[pred_label] = reg.predict(pred_X)

    return pred_set
def svm_grid_search(df, features, param_grid):
    """Run a support vector classification grid search over hyper-parameters.

    The data is split 80/20 into a training and a test set. For every score
    in a fixed list a 5-fold cross-validated grid search is run on the
    training set; the best parameters, the scores of every grid point and a
    classification report on the test set are printed.

    Input:
            df (DataFrame) The database to draw from
            features (list) list of features in the DataFrame
            param_grid (dict or list of dicts) parameter grid handed to
                GridSearchCV
    Output:
            None
    """
    X, y = sets.build_matrices(df, features)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    # 'f1_score' is not a scorer name known to scikit-learn; the weighted F1
    # matches the averaging of the other two scores.
    scores = ['precision_weighted', 'recall_weighted', 'f1_weighted']

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        # print("") gives a blank line under Python 2 and 3 alike, whereas a
        # bare print() prints "()" with the Python 2 print statement.
        print("")

        clf = GridSearchCV(SVC(C=1), param_grid, cv=5, scoring=score)
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print("")
        print(clf.best_params_)
        print("")
        print("Grid scores on training set:")
        print("")
        # cv_results_ is used as in the other grid searches of this file; it
        # also avoids rebinding `scores` inside the loop over `scores`.
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.4f (+/-%0.04f) for %r" % (mean, std * 2, params))
        print("")

        print("Detailed classification report:")
        print("")
        print("The model is trained on the training set.")
        print("The scores are computed on the test set.")
        print("")
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print("")
def rf_reg_validation_curve(df, features, label, params, val_param, val_range):
    """Plot the validation curve of a random forest regressor for one
    hyper-parameter.

    Parameters:
            df : pandas dataframe
            The dataframe containing the features and the label for the
            regression.

            features : list of strings
            List of features

            label : string
            The label for the regression

            params : dictionary
            List of input parameters for the regression

            val_param : string
            Name of the validation parameter

            val_range : array-like
            List of parameter values for the validation curve
    """
    feature_matrix, targets = sets.build_matrices(df, features, label)

    # Random Forest Regression
    regressor = RandomForestRegressor(**params)

    # Calculate and plot validation curve
    # NOTE(review): the other validation-curve routines in this file call
    # ml_an.plot_validation_curve and pass a title before X -- confirm that
    # pz_an.plot_validation_curve exists with this signature.
    pz_an.plot_validation_curve(
        regressor,
        val_param,
        val_range,
        feature_matrix,
        targets,
        ylim=(0.0, 1.1),
        cv=None,
        n_jobs=2,
    )

    plt.show()
def rf_class_validation_curve(df, features, label, params, param_name,
                              param_range):
    """Plot the validation curve of a random forest classifier for one
    hyper-parameter.

    Input:
            df (DataFrame) The database to draw from
            features (list) list of features in the DataFrame
            label (string) The label for the classification
            params (dict) keyword arguments forwarded to the
                RandomForestClassifier constructor
            param_name (string) name of the hyper parameter
            param_range (list) list of parameter values to use
    Output:
            None
    """
    plot_title = "Validation curve / Random Forest Classifier"

    # The features are used as built; no standardization is applied.
    feature_matrix, targets = sets.build_matrices(df, features, label)

    classifier = RandomForestClassifier(**params)

    ml_an.plot_validation_curve(
        classifier,
        param_name,
        param_range,
        plot_title,
        feature_matrix,
        targets,
        ylim=(0.0, 1.1),
        cv=None,
        n_jobs=4,
    )

    plt.show()
def svm_reg_grid_search(df, features, label, param_grid, rand_state, scores,
                        name):
    """Run a support vector regression grid search over hyper-parameters.

    The features are robust-scaled and split 80/20 into a training and an
    evaluation sample. For every score a 5-fold cross-validated grid search
    is run on the training sample; the results are printed, the full grid is
    saved to 'SVR_GS_<name>_<score>.hdf5' and the best model is evaluated on
    the evaluation sample.

    Parameters:
            df : pandas dataframe
            The dataframe containing the features and the label for the
            regression.

            features : list of strings
            List of features

            label : string
            The label for the regression

            param_grid : dictionary-like structure
            Parameter grid of input parameters for the grid search

            rand_state : integer
            Setting the random state variables to ensure reproducibility
            (used for the train/evaluation split)

            scores : list of strings
            Setting the score by which the grid search should be evaluated

            name : strings
            Setting the name of the output file for the grid search which
            contains all information about the grid
    """
    X, y = sets.build_matrices(df, features, label)

    # Standardizing the data
    X = preprocessing.robust_scale(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=rand_state)

    # Same output as the Python 2 statement `print "...: ", shape`.
    print("Training sample size:  %s" % (X_train.shape,))
    print("Evaluation sample size:  %s" % (X_test.shape,))

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        # print("") gives a blank line under Python 2 and 3 alike, whereas a
        # bare print() prints "()" with the Python 2 print statement.
        print("")

        reg = GridSearchCV(SVR(), param_grid, scoring=score, cv=5, n_jobs=6)
        reg.fit(X_train, y_train)

        print("Best parameters set found on training set:")
        print("")
        print(reg.best_params_)
        print("")
        print("Grid scores on training set:")
        print("")
        means = reg.cv_results_['mean_test_score']
        stds = reg.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, reg.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print("")

        # Save the complete grid; a separate name keeps the `df` argument
        # from being rebound inside the loop.
        grid_results = pd.DataFrame(reg.cv_results_)
        grid_results.to_hdf('SVR_GS_' + name + '_' + score + '.hdf5', 'data')

        print("")
        print("The model is trained on the full development set (80%).")
        print("The scores are computed on the full evaluation set (20%).")
        print("")
        y_pred = reg.predict(X_test)

        ml_an.evaluate_regression(y_test, y_pred)
        pz_an.evaluate_photoz(y_test, y_pred)
        print("")
def svm_reg_example(df, features, label, params, rand_state, save=False,
                    save_filename=None):
    """Run one example of the support vector regression and analyze it.

    The features are robust-scaled, split 80/20 into a training and a test
    sample, and an SVR built from ``params`` is fitted and evaluated with the
    routines of ``ml_an`` and ``pz_an``.

    Parameters:
            df : pandas dataframe
            The dataframe containing the features and the label for the
            regression.

            features : list of strings
            List of features

            label : string
            The label for the regression

            params : dictionary
            List of input parameters for the regression

            rand_state : integer
            Setting the random state variables to ensure reproducibility
            (used for the train/test split)

            save : boolean
            If True, the predicted and true values are written to
            '<save_filename>.csv'

            save_filename : string
            File name (without extension) for the saved results; an error
            message is printed if save is requested without it
    """
    # Building test and training sample
    features_all, targets_all = sets.build_matrices(df, features, label)

    # Standardizing the data
    features_all = preprocessing.robust_scale(features_all)

    X_train, X_test, y_train, y_test = train_test_split(
        features_all, targets_all, test_size=0.2, random_state=rand_state)

    # Support Vector Regression
    regressor = SVR(**params)
    regressor.fit(X_train, y_train)

    y_pred = regressor.predict(X_test)

    # Save predicted and test y values for later analysis
    if save and save_filename:
        paired_values = np.array([y_pred, y_test]).T
        results = pd.DataFrame(data=paired_values,
                               columns=['y_pred', 'y_test'])
        results.to_csv(save_filename + '.csv', index=False)
    elif save:
        print("Error: No Filename supplied!")

    # Evaluate regression method
    ml_an.evaluate_regression(y_test, y_pred)

    pz_an.plot_redshifts(y_test, y_pred)
    pz_an.plot_error_hist(y_test, y_pred)

    plt.show()
def rf_class_example(df_train, df_pred, features, label, params, rand_state):
    """Run one example of the random forest classification method.

    It is aimed at multi-class classification. A classifier is trained on
    df_train via rf_class_predict and applied to df_pred. The routine prints
    the classification report and the feature importances, shows the
    confusion matrix for all classes and assembles the class probabilities.

    Parameters:
            df_train : pandas dataframe
            The dataframe containing the features and the label used for
            training.

            df_pred : pandas dataframe
            The dataframe containing the features and the label used for
            the evaluation.

            features : list of strings
            List of features

            label : string
            The label for the classification

            params : dictionary
            List of input parameters for the classifier

            rand_state : integer
            Setting the random state variables to ensure reproducibility

    Return :
            y_true : array-like
            The true classes of df_pred, converted to strings

            y_pred : array-like
            The predicted classes of df_pred, converted to strings

            df_prob : pandas dataframe
            Class probabilities (one column per class, indexed like df_pred)
            plus the columns 'qso_prob', 'true_class' and 'pred_class'
    """
    # The probabilities returned here are not used; they are recomputed
    # below from the matrix built together with the labels.
    clf, y_pred, _ = rf_class_predict(df_train, df_pred, features, label,
                                      params, rand_state)

    X_pred, y_true = sets.build_matrices(df_pred, features, label=label)

    y_true = y_true.astype('string')
    y_pred = y_pred.astype('string')

    importances = clf.feature_importances_

    print("Classification Report ")
    print(classification_report(y_true, y_pred))
    print("\n")
    print("Feature Importance ")
    for position, feature_name in enumerate(features):
        print(str(feature_name) + ": " + str(importances[position]))
    print("\n")

    # Confusion matrix
    confusion = confusion_matrix(y_true, y_pred, labels=None,
                                 sample_weight=None)
    ml_an.my_confusion_matrix(confusion, classes=clf.classes_)

    plt.show()

    # Predicting the probabilities for the classes
    df_prob = pd.DataFrame(clf.predict_proba(X_pred))
    df_prob.columns = clf.classes_
    df_prob.index = df_pred.index

    # Summed probability of the four redshift classes; this requires the
    # classifier to have been trained with exactly these class names.
    qso_probability = (df_prob.highz + df_prob.midz
                       + df_prob.lowz + df_prob.vlowz)
    df_prob['qso_prob'] = qso_probability
    df_prob['true_class'] = y_true
    df_prob['pred_class'] = y_pred

    return y_true, y_pred, df_prob
def rf_class_grid_search(df_train, df_pred, features, label, param_grid,
                         rand_state, scores, name):
    """Run a random forest classification grid search over hyper-parameters.

    For every score a 5-fold cross-validated grid search is run on the
    training set. A classification report on the test set, the best
    parameters and the scores of every grid point are printed, and the full
    grid is saved to 'RF_GS_CLASS_<name>_<score>.hdf5'.

    Parameters:
            df_train : pandas dataframe
            The dataframe containing the features and the label used for
            the grid search (training set).

            df_pred : pandas dataframe
            The dataframe containing the features and the label used for
            the classification report (test set).

            features : list of strings
            List of features

            label : string
            The label for the classification

            param_grid : dictionary-like structure
            Parameter grid of input parameters for the grid search

            rand_state : integer
            Setting the random state variables to ensure reproducibility
            (passed to the RandomForestClassifier)

            scores : list of strings
            Setting the score by which the grid search should be evaluated

            name : strings
            Setting the name of the output file for the grid search which
            contains all information about the grid
    """
    X_train, y_train = sets.build_matrices(df_train, features, label=label)
    X_test, y_test = sets.build_matrices(df_pred, features, label=label)

    # Same output as the Python 2 statement `print a, b`.
    print("%s %s" % (X_train.shape, X_test.shape))
    print("%s %s" % (pd.Series(y_train).value_counts(),
                     pd.Series(y_test).value_counts()))

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        # print("") gives a blank line under Python 2 and 3 alike, whereas a
        # bare print() prints "()" with the Python 2 print statement.
        print("")

        clf = GridSearchCV(RandomForestClassifier(random_state=rand_state),
                           param_grid, cv=5, scoring=score, n_jobs=4)
        clf.fit(X_train, y_train)

        print("Detailed classification report:")
        print("")
        print("The model is trained on the training set.")
        print("The scores are computed on the test set.")
        print("")
        y_true, y_pred = y_test, clf.predict(X_test)
        y_true = y_true.astype('string')
        y_pred = y_pred.astype('string')

        print(classification_report(y_true, y_pred))
        print("\n")

        print("Best parameters set found on training set:\n")
        print(clf.best_params_)
        print("\n")
        print("Grid scores on training set:")
        print("\n")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print("\n")

        # Save the complete grid for this score
        grid_results = pd.DataFrame(clf.cv_results_)
        grid_results.to_hdf('RF_GS_CLASS_' + name + '_' + score + '.hdf5',
                            'data')