    def detailed_analysis(self):
        print_to_consol(
            'Making a confusion matrix for test set classification outcomes')

        matrix_stats = confusion_matrix_and_stats(self.y_test, self.y_pred,
                                                  'before_cal', self.directory)

        logging.info(f'Detailed analysis of confusion matrix for test set. \n'
                     f'True positives: {matrix_stats["TP"]} \n'
                     f'True negatives: {matrix_stats["TN"]} \n'
                     f'False positives: {matrix_stats["FP"]} \n'
                     f'False negatives: {matrix_stats["FN"]} \n'
                     f'Classification accuracy: {matrix_stats["acc"]} \n'
                     f'Classification error: {matrix_stats["err"]} \n'
                     f'Sensitivity: {matrix_stats["sensitivity"]} \n'
                     f'Specificity: {matrix_stats["specificity"]} \n'
                     f'False positive rate: {matrix_stats["FP-rate"]} \n'
                     f'False negative rate: {matrix_stats["FN-rate"]} \n'
                     f'Precision: {matrix_stats["precision"]} \n'
                     f'F1-score: {matrix_stats["F1-score"]} \n')

        print_to_consol(
            'Plotting precision recall curve for test set class 1 probabilities'
        )

        logging.info(
            f'Plotting precision recall curve for class 1 in test set probabilities. \n'
        )

        plot_precision_recall_vs_threshold(self.y_test, self.y_pred_proba_ones,
                                           self.directory)

        print_to_consol(
            'Plotting ROC curve and calculating AUC for test set class 1 probabilities'
        )

        logging.info(
            f'Plotting ROC curve for class 1 in test set probabilities. \n')

        self.fpr, self.tpr, self.thresholds = plot_roc_curve(
            self.y_test, self.y_pred_proba_ones, self.directory)

        AUC = round(
            roc_auc_score(self.y_test, self.y_pred_proba_ones) * 100, 2)

        logging.info(
            f'Calculating AUC for ROC curve for class 1 in test set probabilities: {AUC} \n'
        )

        print_to_consol('Make a radar plot for performance metrics')

        radar_dict = {
            'Classification accuracy': matrix_stats["acc"],
            'Classification error': matrix_stats["err"],
            'Sensitivity': matrix_stats["sensitivity"],
            'Specificity': matrix_stats["specificity"],
            'False positive rate': matrix_stats["FP-rate"],
            'False negative rate': matrix_stats["FN-rate"],
            'Precision': matrix_stats["precision"],
            'F1-score': matrix_stats["F1-score"],
            'ROC AUC': AUC
        }

        plot_radar_chart(radar_dict, self.directory)

        print_to_consol(
            'Exploring probability thresholds, sensitivity, specificity for class 1'
        )

        threshold_dict = evaluate_threshold(self.tpr, self.fpr,
                                            self.thresholds)

        logging.info(
            f'Exploring different probability thresholds and sensitivity-specificity trade-offs. \n'
            f'Threshold 0.2: {threshold_dict["0.2"]} \n'
            f'Threshold 0.3: {threshold_dict["0.3"]} \n'
            f'Threshold 0.4: {threshold_dict["0.4"]} \n'
            f'Threshold 0.5: {threshold_dict["0.5"]} \n'
            f'Threshold 0.6: {threshold_dict["0.6"]} \n'
            f'Threshold 0.7: {threshold_dict["0.7"]} \n'
            f'Threshold 0.8: {threshold_dict["0.8"]} \n'
            f'Threshold 0.9: {threshold_dict["0.9"]} \n')

        print_to_consol(
            'Calibrating classifier and writing to disk; getting new accuracy')

        self.calibrated_clf, clf_acc = calibrate_classifier(
            self.model, self.X_cal_scaled, self.y_cal)

        date = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.calibrated_clf,
            os.path.join(self.directory,
                         'best_calibrated_predictor_' + date + '.pkl'))

        logging.info(
            f'Calibrated the best classifier with X_cal and y_cal and new accuracy {clf_acc}\n'
            f'Writing file to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for calibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled, self.y_test,
            self.calibrated_clf, self.directory, self.bootiter, 'calibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for calibrated classifier. \n')

        print_to_consol('Running prediction for calibrated classifier')

        print_to_consol(
            'Getting class predictions and probabilities for test set with calibrated classifier'
        )

        test_stats_cal, self.y_pred_cal, self.y_pred_proba_cal = testing_predict_stats(
            self.calibrated_clf, self.X_test_scaled, self.y_test)

        logging.info(
            f'Predicting on the test set with calibrated classifier. \n'
            f'Storing classes for calibrated classifier in y_pred and probabilities in y_pred_proba. \n'
        )

        print_to_consol(
            'Calculate prediction stats for y_pred and y_pred_proba of test set with calibrated classifier'
        )

        logging.info(
            f'Basic stats on the test set with calibrated classifier. \n'
            f'Prediction accuracy on the test set: {test_stats_cal["predict_acc"]} \n'
            f'Class distribution in the test set: {test_stats_cal["class_distribution"]} \n'
            f'Matthews Correlation Coefficient: {test_stats_cal["mcc"]} \n'
            f'Average number of class 1 samples: {test_stats_cal["class_one"]} \n'
            f'Average number of class 0 samples: {test_stats_cal["class_zero"]} \n'
            f'Null accuracy: {test_stats_cal["null_acc"]} \n')

        print_to_consol(
            'Plotting histogram for class 1 prediction probabilities for test set'
        )

        #store the predicted probabilities for class 1 of test set
        self.y_pred_proba_cal_ones = self.y_pred_proba_cal[:, 1]

        plot_hist_pred_proba(self.y_pred_proba_cal_ones, self.directory)

        logging.info(
            f'Plotting prediction probabilities for class 1 in test set in histogram for calibrated classifier. \n'
        )

        print_to_consol(
            'Making a confusion matrix for test set classification outcomes with calibrated classifier'
        )

        matrix_stats_cal = confusion_matrix_and_stats(self.y_test,
                                                      self.y_pred_cal,
                                                      'after_cal',
                                                      self.directory)

        logging.info(
            f'Detailed analysis of confusion matrix for test set with calibrated classifier. \n'
            f'True positives: {matrix_stats_cal["TP"]} \n'
            f'True negatives: {matrix_stats_cal["TN"]} \n'
            f'False positives: {matrix_stats_cal["FP"]} \n'
            f'False negatives: {matrix_stats_cal["FN"]} \n'
            f'Classification accuracy: {matrix_stats_cal["acc"]} \n'
            f'Classification error: {matrix_stats_cal["err"]} \n'
            f'Sensitivity: {matrix_stats_cal["sensitivity"]} \n'
            f'Specificity: {matrix_stats_cal["specificity"]} \n'
            f'False positive rate: {matrix_stats_cal["FP-rate"]} \n'
            f'False negative rate: {matrix_stats_cal["FN-rate"]} \n'
            f'Precision: {matrix_stats_cal["precision"]} \n'
            f'F1-score: {matrix_stats_cal["F1-score"]} \n')

        print_to_consol(
            'Plotting precision recall curve for test set class 1 probabilities with calibrated classifier'
        )

        logging.info(
            f'Plotting precision recall curve for class 1 in test set probabilities with calibrated classifier. \n'
        )

        plot_precision_recall_vs_threshold(self.y_test,
                                           self.y_pred_proba_cal_ones,
                                           self.directory)

        print_to_consol(
            'Plotting ROC curve and calculating AUC for test set class 1 probabilities with calibrated classifier'
        )

        logging.info(
            f'Plotting ROC curve for class 1 in test set probabilities with calibrated classifier. \n'
        )

        self.fpr_cal, self.tpr_cal, self.thresholds_cal = plot_roc_curve(
            self.y_test, self.y_pred_proba_cal_ones, self.directory)

        AUC_cal = round(
            roc_auc_score(self.y_test, self.y_pred_proba_cal_ones) * 100, 2)

        logging.info(
            f'Calculating AUC for ROC curve for class 1 in test set probabilities with calibrated classifier: {AUC_cal} \n'
        )

        print_to_consol(
            'Make a radar plot for performance metrics with calibrated classifier'
        )

        radar_dict_cal = {
            'Classification accuracy': matrix_stats_cal["acc"],
            'Classification error': matrix_stats_cal["err"],
            'Sensitivity': matrix_stats_cal["sensitivity"],
            'Specificity': matrix_stats_cal["specificity"],
            'False positive rate': matrix_stats_cal["FP-rate"],
            'False negative rate': matrix_stats_cal["FN-rate"],
            'Precision': matrix_stats_cal["precision"],
            'F1-score': matrix_stats_cal["F1-score"],
            'ROC AUC': AUC_cal
        }

        plot_radar_chart(radar_dict_cal, self.directory)

        print_to_consol(
            'Exploring probability thresholds, sensitivity, specificity for class 1 with calibrated classifier'
        )

        threshold_dict_cal = evaluate_threshold(self.tpr_cal, self.fpr_cal,
                                                self.thresholds_cal)

        logging.info(
            f'Exploring different probability thresholds and sensitivity-specificity trade-offs \n'
            f'for calibrated classifier. \n'
            f'Threshold 0.2: {threshold_dict_cal["0.2"]} \n'
            f'Threshold 0.3: {threshold_dict_cal["0.3"]} \n'
            f'Threshold 0.4: {threshold_dict_cal["0.4"]} \n'
            f'Threshold 0.5: {threshold_dict_cal["0.5"]} \n'
            f'Threshold 0.6: {threshold_dict_cal["0.6"]} \n'
            f'Threshold 0.7: {threshold_dict_cal["0.7"]} \n'
            f'Threshold 0.8: {threshold_dict_cal["0.8"]} \n'
            f'Threshold 0.9: {threshold_dict_cal["0.9"]} \n')

        end = datetime.now()
        duration = end - self.start

        logging.info(f'Training lasted {duration} \n')

        logging.info(f'Training completed \n')

        print_to_consol('Training completed')
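
# The helpers used above (print_to_consol, confusion_matrix_and_stats, plot_roc_curve,
# evaluate_threshold, ...) live in this project's utility modules and are not shown on
# this page.  Purely for orientation, here is a minimal sketch of what evaluate_threshold
# could look like, assuming it maps the probability cutoffs "0.2".."0.9" (the keys read in
# the logging call above) to the sensitivity/specificity found on the ROC curve; the real
# helper's return format may differ.
import numpy as np

def evaluate_threshold_sketch(tpr, fpr, thresholds):
    """Report sensitivity and specificity at a set of probability cutoffs.

    tpr, fpr and thresholds are the arrays returned by sklearn.metrics.roc_curve.
    """
    tpr, fpr, thresholds = map(np.asarray, (tpr, fpr, thresholds))
    result = {}
    for cutoff in (0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9):
        # first ROC threshold at or below the requested cutoff (thresholds are decreasing)
        idx = int(np.argmax(thresholds <= cutoff))
        result[str(cutoff)] = {'sensitivity': float(tpr[idx]),
                               'specificity': float(1.0 - fpr[idx])}
    return result
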
    def detailed_analysis(self):
        print_to_consol(
            'Making a confusion matrix for test set classification outcomes')

        matrix_stats, report = confusion_matrix_and_stats_multiclass(
            self.y_test, self.y_pred, 'before_cal', self.directory)

        logging.info(f'Detailed analysis of confusion matrix for test set. \n'
                     f'True positives: {matrix_stats["TP"]} \n'
                     f'True negatives: {matrix_stats["TN"]} \n'
                     f'False positives: {matrix_stats["FP"]} \n'
                     f'False negatives: {matrix_stats["FN"]} \n'
                     f'Classification accuracy: {matrix_stats["acc"]} \n'
                     f'Classification error: {matrix_stats["err"]} \n'
                     f'Sensitivity: {matrix_stats["sensitivity"]} \n'
                     f'Specificity: {matrix_stats["specificity"]} \n'
                     f'False positive rate: {matrix_stats["FP-rate"]} \n'
                     f'False negative rate: {matrix_stats["FN-rate"]} \n'
                     f'Precision: {matrix_stats["precision"]} \n'
                     f'F1-score: {matrix_stats["F1-score"]} \n')

        logging.info(
            f'Classification report on test set before calibration. \n'
            f'{report} \n')

        print_to_consol('Make a radar plot for performance metrics')

        radar_dict = {
            'Classification accuracy': matrix_stats["acc"],
            'Classification error': matrix_stats["err"],
            'Sensitivity': matrix_stats["sensitivity"],
            'Specificity': matrix_stats["specificity"],
            'False positive rate': matrix_stats["FP-rate"],
            'False negative rate': matrix_stats["FN-rate"],
            'Precision': matrix_stats["precision"],
            'F1-score': matrix_stats["F1-score"],
            'ROC AUC': None
        }

        plot_radar_chart(radar_dict, self.directory)

        print_to_consol(
            'Calibrating classifier and writing to disk; getting new accuracy')

        self.calibrated_clf, clf_acc = calibrate_classifier(
            self.model, self.X_cal_scaled, self.y_cal)

        date = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.calibrated_clf,
            os.path.join(self.directory,
                         'best_calibrated_predictor_' + date + '.pkl'))

        logging.info(
            f'Calibrated the best classifier with X_cal and y_cal and new accuracy {clf_acc}\n'
            f'Writing file to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for calibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled, self.y_test,
            self.calibrated_clf, self.directory, self.bootiter, 'calibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for calibrated classifier. \n')

        print_to_consol('Running prediction for calibrated classifier')

        print_to_consol(
            'Getting class predictions and probabilities for test set with calibrated classifier'
        )

        test_stats_cal, self.y_pred_cal, self.y_pred_proba_cal = testing_predict_stats_multiclass(
            self.calibrated_clf, self.X_test_scaled, self.y_test)

        y_pred_cal_out = os.path.join(self.directory,
                                      "y_pred_after_calibration.csv")
        np.savetxt(y_pred_cal_out, self.y_pred_cal, delimiter=",")

        y_pred_proba_cal_out = os.path.join(
            self.directory, "y_pred_proba_after_calibration.csv")
        np.savetxt(y_pred_proba_cal_out, self.y_pred_proba_cal, delimiter=",")

        logging.info(
            f'Writing y_pred and y_pred_proba after calibration to disk. \n'
            f'Predicting on the test set with calibrated classifier. \n'
            f'Storing classes for calibrated classifier in y_pred and probabilities in y_pred_proba. \n'
        )

        print_to_consol(
            'Calculate prediction stats for y_pred and y_pred_proba of test set with calibrated classifier'
        )

        logging.info(
            f'Basic stats on the test set with calibrated classifier. \n'
            f'Prediction accuracy on the test set: {test_stats_cal["predict_acc"]} \n'
            f'Class distribution in the test set: {test_stats_cal["class_distribution"]} \n'
            f'Matthews Correlation Coefficient: {test_stats_cal["mcc"]} \n')

        print_to_consol(
            'Making a confusion matrix for test set classification outcomes with calibrated classifier'
        )

        matrix_stats_cal, report_cal = confusion_matrix_and_stats_multiclass(
            self.y_test, self.y_pred_cal, 'after_cal', self.directory)

        logging.info(
            f'Detailed analysis of confusion matrix for test set with calibrated classifier. \n'
            f'True positives: {matrix_stats_cal["TP"]} \n'
            f'True negatives: {matrix_stats_cal["TN"]} \n'
            f'False positives: {matrix_stats_cal["FP"]} \n'
            f'False negatives: {matrix_stats_cal["FN"]} \n'
            f'Classification accuracy: {matrix_stats_cal["acc"]} \n'
            f'Classification error: {matrix_stats_cal["err"]} \n'
            f'Sensitivity: {matrix_stats_cal["sensitivity"]} \n'
            f'Specificity: {matrix_stats_cal["specificity"]} \n'
            f'False positive rate: {matrix_stats_cal["FP-rate"]} \n'
            f'False negative rate: {matrix_stats_cal["FN-rate"]} \n'
            f'Precision: {matrix_stats_cal["precision"]} \n'
            f'F1-score: {matrix_stats_cal["F1-score"]} \n')

        logging.info(
            f'Classification report on test set after calibration. \n'
            f'{report_cal} \n')

        print_to_consol(
            'Make a radar plot for performance metrics with calibrated classifier'
        )

        radar_dict_cal = {
            'Classification accuracy': matrix_stats_cal["acc"],
            'Classification error': matrix_stats_cal["err"],
            'Sensitivity': matrix_stats_cal["sensitivity"],
            'Specificity': matrix_stats_cal["specificity"],
            'False positive rate': matrix_stats_cal["FP-rate"],
            'False negative rate': matrix_stats_cal["FN-rate"],
            'Precision': matrix_stats_cal["precision"],
            'F1-score': matrix_stats_cal["F1-score"],
            'ROC AUC': None
        }

        plot_radar_chart(radar_dict_cal, self.directory)

        end = datetime.now()
        duration = end - self.start

        logging.info(f'Training lasted {duration} \n')

        logging.info(f'Training completed \n')

        print_to_consol('Training completed')
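
# calibrate_classifier is another project helper shared by both detailed_analysis
# variants above.  A minimal sketch of one plausible implementation, built on
# sklearn's CalibratedClassifierCV with a prefit model, is shown here as an
# assumption only; the real helper may use a different calibration method or
# report accuracy differently.
from sklearn.calibration import CalibratedClassifierCV

def calibrate_classifier_sketch(model, X_cal, y_cal):
    """Wrap an already fitted model in a sigmoid-calibrated classifier."""
    calibrated_clf = CalibratedClassifierCV(model, method='sigmoid', cv='prefit')
    calibrated_clf.fit(X_cal, y_cal)
    # accuracy of the calibrated classifier on the calibration split
    clf_acc = round(calibrated_clf.score(X_cal, y_cal) * 100, 2)
    return calibrated_clf, clf_acc
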
    def randomised_search(self):
        print_to_consol('Running randomized search to find best classifier')

        #create the base decision tree and bagging ensemble
        clf1 = DecisionTreeClassifier(random_state=20,
                                      class_weight='balanced',
                                      max_features=self.numf)

        bag = BaggingClassifier(base_estimator=clf1,
                                n_jobs=-1,
                                bootstrap=True,
                                random_state=55)

        logging.info(f'Initialised classifier \n')

        #set up randomized search
        param_dict = {
            'base_estimator__criterion': ['gini', 'entropy'],
            'n_estimators': randint(100,
                                    10000),  #number of base estimators to use
            'base_estimator__min_samples_split': randint(2, 20),
            'base_estimator__max_depth': randint(1, 10),
            'base_estimator__min_samples_leaf': randint(1, 20),
            'base_estimator__max_leaf_nodes': randint(10, 20)
        }

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        #building and running the randomized search
        rand_search = RandomizedSearchCV(bag,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        best_parameters = rand_search_fitted.best_params_
        best_scores = rand_search_fitted.best_score_

        logging.info(
            f'Running randomised search for best parameters of classifier \n'
            f'Best parameters found: {best_parameters} \n'
            f'Best accuracy scores found: {best_scores} \n')

        self.model = rand_search_fitted.best_estimator_

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled, self.y_test,
            self.model, self.directory, self.bootiter, 'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')

        print_to_consol('Getting feature importances for best classifier')

        all_clf_feat_import_mean = np.mean(
            [tree.feature_importances_ for tree in self.model.estimators_],
            axis=0)
        all_clf_feat_import_mean_sorted = sorted(zip(
            all_clf_feat_import_mean, self.X_train_scaled.columns),
                                                 reverse=True)

        logging.info(
            f'Feature importances across all trees {all_clf_feat_import_mean_sorted} \n'
        )

        print_to_consol('Plotting feature importances for best classifier')

        feature_importances_error_bars(self.model, self.X_train_scaled.columns,
                                       self.directory)
        logging.info(
            f'Plotting feature importances for best classifier with errorbars \n'
        )
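
# get_confidence_interval is called in every method on this page.  The sketch below
# shows one common way such a helper can be built: bootstrap the test set, score the
# trained model on each resample and read the percentile bounds.  The argument list and
# the (alpha, upper, lower) return order are copied from the calls above; the directory
# and kind arguments are accepted but unused here, and everything else is an assumption.
import numpy as np
from sklearn.utils import resample

def get_confidence_interval_sketch(X_train, y_train, X_test, y_test,
                                   model, directory, bootiter, kind, alpha=95):
    """Bootstrap an alpha% confidence interval for test-set accuracy (in percent)."""
    scores = []
    for i in range(bootiter):
        X_boot, y_boot = resample(X_test, y_test, random_state=i)
        scores.append(model.score(X_boot, y_boot) * 100)
    lower = round(np.percentile(scores, (100 - alpha) / 2), 2)
    upper = round(np.percentile(scores, 100 - (100 - alpha) / 2), 2)
    return alpha, upper, lower
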
    def randomised_search(self):

        print_to_consol('Running randomized search to find best classifier')

        #create the Gaussian Naive Bayes classifier
        clf1 = GaussianNB(priors=None)

        logging.info(f'Initialised classifier \n')

        #set up randomized search
        param_dict = {'var_smoothing': uniform(0.000000000001, 10.0)}

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        #building and running the randomized search
        rand_search = RandomizedSearchCV(clf1,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        best_parameters = rand_search_fitted.best_params_
        best_scores = rand_search_fitted.best_score_

        self.model = rand_search_fitted.best_estimator_

        logging.info(
            f'Running randomised search for best parameters of classifier \n'
            f'Best parameters found: {best_parameters} \n'
            f'Best accuracy scores found: {best_scores} \n'
            f'Prior probability for each class: {self.model.class_prior_} \n'
            f'Mean for each feature for class 0: {self.model.theta_[0]} \n'
            f'Mean for each feature for class 1: {self.model.theta_[1]} \n')

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled, self.y_test,
            self.model, self.directory, self.bootiter, 'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')

        print_to_consol('Getting feature importances for best classifier')

        class0_feature_ls = self.model.theta_[0]
        class1_feature_ls = self.model.theta_[1]
        df_0 = pd.DataFrame(class0_feature_ls.reshape(-1,
                                                      len(class0_feature_ls)),
                            columns=self.X_train_scaled.columns)
        df_1 = pd.DataFrame(class1_feature_ls.reshape(-1,
                                                      len(class1_feature_ls)),
                            columns=self.X_train_scaled.columns)

        feature_importance_class0 = df_0.to_dict(orient='records')
        feature_importance_class1 = df_1.to_dict(orient='records')

        logging.info(
            f'Feature importances for class 0 for best classifier {feature_importance_class0} \n'
            f'Feature importances for class 1 for best classifier {feature_importance_class1} \n'
        )

        print_to_consol('Plotting feature importances for best classifier')

        gnb_feature_importances(feature_importance_class0, 'class_0',
                                self.directory)
        gnb_feature_importances(feature_importance_class1, 'class_1',
                                self.directory)
        logging.info(
            f'Plotting feature importances for each class for best classifier in decreasing order \n'
        )
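
# gnb_feature_importances is a project plotting helper.  Assuming it simply ranks the
# per-class feature means taken from GaussianNB.theta_ (the one-row record dicts built
# above), a rough sketch could look like the following; file name and styling are
# invented for this illustration.
import os
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

def gnb_feature_importances_sketch(feature_records, label, directory):
    """Plot the mean feature values for one class in decreasing order."""
    # feature_records is a one-element list of {feature_name: mean_value} dicts,
    # as produced by DataFrame.to_dict(orient='records') above
    ranked = sorted(feature_records[0].items(), key=lambda item: item[1], reverse=True)
    names = [name for name, _ in ranked]
    values = [value for _, value in ranked]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(range(len(values)), values)
    ax.set_xticks(range(len(names)))
    ax.set_xticklabels(names, rotation=90)
    ax.set_ylabel(f'mean feature value for {label}')
    fig.tight_layout()
    fig.savefig(os.path.join(directory, f'gnb_feature_means_{label}.png'))
    plt.close(fig)
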
    def randomised_search(self):
        print_to_consol('Running randomized search to find best classifier')

        #create the linear support vector classifier
        clf1 = SVC(kernel='linear',
                   probability=True,
                   random_state=20,
                   class_weight='balanced')

        logging.info(f'Initialised classifier \n')

        #set up randomized search
        param_dict = {'C': expon(scale=100)}

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        #building and running the randomized search
        rand_search = RandomizedSearchCV(clf1,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        self.model = rand_search_fitted.best_estimator_

        best_parameters = rand_search_fitted.best_params_
        coef = self.model.coef_
        coef_ravel = coef.ravel()
        intercept = self.model.intercept_

        logging.info(
            f'Running randomised search for best parameters of a linear SVC classifier \n'
            f'Best parameters found: {best_parameters} \n'
            f'Coefficients: {coef_ravel} \n'
            f'Intercept: {intercept} \n')

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled, self.y_test,
            self.model, self.directory, self.bootiter, 'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n')

        print_to_consol('Getting feature importances for best classifier')

        #label the coefficients with the training DataFrame columns so names match the coefficient order
        feature_names = list(self.X_train_scaled.columns)

        print_to_consol('Plotting feature importances for best classifier')

        plot_coefficients(coef_ravel, feature_names, self.directory)
        logging.info(f'Plotting feature importances for best classifier \n')
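
# plot_coefficients is a project plotting helper.  As an assumption only, a minimal
# sketch of the usual pattern (plotting the raveled linear-SVC coefficients against
# their feature names) is given below; the output file name and styling are invented.
import os
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

def plot_coefficients_sketch(coefficients, feature_names, directory, top_n=20):
    """Bar chart of the largest-magnitude linear model coefficients."""
    coefficients = np.asarray(coefficients)
    feature_names = np.asarray(feature_names)
    # pick the top_n coefficients by absolute value
    order = np.argsort(np.abs(coefficients))[::-1][:top_n]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(range(len(order)), coefficients[order])
    ax.set_xticks(range(len(order)))
    ax.set_xticklabels(feature_names[order], rotation=90)
    ax.set_ylabel('coefficient')
    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'svm_coefficients.png'))
    plt.close(fig)
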
    def randomised_search(self):
        print_to_consol('Running randomized search to find best classifier')

        #create the k-nearest neighbours classifier
        clf1 = KNeighborsClassifier()

        logging.info(f'Initialised classifier \n')

        #set up randomized search
        param_dict = {
            'n_neighbors': randint(2, 10),
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'leaf_size': randint(2, 50)
        }

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        #building and running the randomized search
        rand_search = RandomizedSearchCV(clf1,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        best_parameters = rand_search_fitted.best_params_
        best_scores = rand_search_fitted.best_score_

        logging.info(
            f'Running randomised search for best parameters of classifier \n'
            f'Best parameters found: {best_parameters} \n'
            f'Best accuracy scores found: {best_scores} \n')

        self.model = rand_search_fitted.best_estimator_

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled, self.y_test,
            self.model, self.directory, self.bootiter, 'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')

        print_to_consol('Getting feature importances for best classifier')

        #score each feature on its own with cross-validation to estimate its importance
        feature_list = []

        for i in range(len(self.X_train_scaled.columns)):
            X = self.X_train_scaled.iloc[:, i].values.reshape(-1, 1)
            scores = cross_val_score(self.model,
                                     X,
                                     self.y_train,
                                     cv=self.cv)
            feature_list.append(scores.mean())

        feature_importance = sorted(zip(self.X_train_scaled.columns,
                                        feature_list),
                                    reverse=True)

        logging.info(
            f'Feature importances for best classifier {feature_importance} \n')

        print_to_consol('Plotting feature importances for best classifier')

        feature_importances_best_estimator(feature_importance, self.directory)
        logging.info(
            f'Plotting feature importances for best classifier in decreasing order \n'
        )
    def randomised_search(self):
        print_to_consol('Running randomized search to find best classifier')

        #create the support vector classifier with RBF kernel
        clf1 = SVC(kernel='rbf',
                   probability=True,
                   random_state=20,
                   class_weight='balanced')

        logging.info(f'Initialised classifier \n')

        #set up randomized search
        param_dict = {'C': expon(scale=100), 'gamma': expon(scale=.1)}

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        #building and running the randomized search
        rand_search = RandomizedSearchCV(clf1,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        self.model = rand_search_fitted.best_estimator_

        best_parameters = rand_search_fitted.best_params_
        sv = self.model.support_vectors_
        intercept = self.model.intercept_

        logging.info(
            f'Running randomised search for best parameters of an SVC \n'
            f'with RBF kernel; scoring is accuracy \n'
            f'Best parameters found: {best_parameters} \n'
            f'Support vectors: {sv} \n'
            f'Intercept: {intercept} \n')

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(
            f'Writing best classifier to disk in {self.directory} \n'
            f'No feature importances available for this type of predictor \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled, self.y_test,
            self.model, self.directory, self.bootiter, 'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')
    def randomised_search(self):

        print_to_consol('Running randomized search to find best classifier')

        #create the logistic regression classifier
        clf1 = LogisticRegression(penalty='l2',
                                  random_state=20,
                                  class_weight='balanced')

        logging.info(f'Initialised classifier')

        #set up randomized search
        param_dict = {'max_iter': randint(100, 10000), 'C': expon(scale=100)}

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        #building and running the randomized search
        rand_search = RandomizedSearchCV(clf1,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        self.model = rand_search_fitted.best_estimator_

        best_parameters = rand_search_fitted.best_params_
        coef = self.model.coef_
        intercept = self.model.intercept_
        n_feat = self.model.n_features_in_
        features = self.model.feature_names_in_

        logging.info(f'Running randomised search for best parameters of a \n'
                     f'Logistic Regression classifier; scoring is accuracy \n'
                     f'Best parameters found: {best_parameters} \n'
                     f'Coefficients: {coef} \n'
                     f'Intercept: {intercept} \n'
                     f'Number of features used for fit: {n_feat} \n'
                     f'Features used for fit: {features} \n')

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled, self.y_test,
            self.model, self.directory, self.bootiter, 'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')
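
# The methods collected on this page belong to trainer classes that also hold the scaled
# train/test/calibration splits and settings such as cv, numc (search iterations) and
# bootiter.  A hypothetical driver, only to illustrate the intended call order (the class
# name and constructor signature below are made up for this sketch):
#
#   trainer = LogRegTrain(X_train_scaled, y_train,
#                         X_test_scaled, y_test,
#                         X_cal_scaled, y_cal,
#                         directory='results', cv=5, numc=50, bootiter=1000)
#   trainer.randomised_search()    # tune, fit and persist the best estimator
#   trainer.detailed_analysis()    # confusion matrix, calibration and reporting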