Code Example #1
    def randomised_search(self):

        print_to_consol('Running randomized search to find best classifier')

        #create the decision tree classifier
        clf1 = DecisionTreeClassifier(random_state=20,
                                      class_weight='balanced',
                                      max_features=self.numf)

        logging.info(f'Initialised classifier')

        #set up randomized search
        param_dict = {
            'criterion': ['gini', 'entropy'],
            'min_samples_split': randint(2, 20),
            'max_depth': randint(1, 10),
            'min_samples_leaf': randint(1, 20),
            'max_leaf_nodes': randint(10, 20)
        }

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        #building and running the randomized search
        rand_search = RandomizedSearchCV(clf1,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        best_parameters = rand_search_fitted.best_params_
        best_scores = rand_search_fitted.best_score_

        logging.info(
            f'Running randomised search for best parameters of classifier \n'
            f'Best parameters found: {best_parameters} \n'
            f'Best accuracy scores found: {best_scores} \n')

        self.model = rand_search_fitted.best_estimator_

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled, self.y_test,
            self.model, self.directory, self.bootiter, 'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')

        print_to_consol('Getting feature importances for best classifier')

        best_clf_feat_import = self.model.feature_importances_
        best_clf_feat_import_sorted = sorted(zip(best_clf_feat_import,
                                                 self.X_train_scaled.columns),
                                             reverse=True)

        logging.info(
            f'Feature importances for best classifier {best_clf_feat_import_sorted} \n'
        )

        print_to_consol('Plotting feature importances for best classifier')

        feature_importances_best_estimator(best_clf_feat_import_sorted,
                                           self.directory)
        logging.info(
            f'Plotting feature importances for best classifier in decreasing order \n'
        )
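
The method above is shown without its module-level setup. Below is a minimal sketch of the imports it appears to rely on; the scipy and scikit-learn paths are the standard ones, while print_to_consol, get_confidence_interval and feature_importances_best_estimator are project-specific helpers whose module path is a guess (shown commented out).

# Imports assumed by Code Example #1 (a sketch, not taken from the original module)
import os
import logging
from datetime import datetime

import joblib
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

# Project-specific helpers used by the snippet; the module path below is hypothetical.
# from helpers import (print_to_consol, get_confidence_interval,
#                      feature_importances_best_estimator)
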
Code Example #2
    def randomised_search(self):

        print('*' * 80)
        print('*    Running randomized search to find best classifier')
        print('*' * 80)

        #create the decision tree used as the AdaBoost base estimator
        clf1 = DecisionTreeClassifier(random_state=20,
                                      class_weight='balanced',
                                      max_features=self.numf)

        ada = AdaBoostClassifier(base_estimator=clf1,
                                 algorithm="SAMME.R",
                                 random_state=55)

        logging.info(
            f'Initialised decision tree and AdaBoost using balanced class weights'
        )

        #set up randomized search
        param_dict = {
            'base_estimator__criterion': ['gini', 'entropy'],
            'n_estimators': randint(100,
                                    10000),  #number of base estimators to use
            'learning_rate': uniform(0.0001, 1.0),
            'base_estimator__min_samples_split': randint(2, 20),
            'base_estimator__max_depth': randint(1, 10),
            'base_estimator__min_samples_leaf': randint(1, 20),
            'base_estimator__max_leaf_nodes': randint(10, 20)
        }

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict}')

        #building and running the randomized search
        rand_search = RandomizedSearchCV(ada,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train, self.y_train)

        best_parameters = rand_search_fitted.best_params_
        best_scores = rand_search_fitted.best_score_

        logging.info(
            f'Running randomised search for best parameters of a decision tree \n'
            f'with AdaBoost; scoring metric is accuracy \n'
            f'Best parameters found: {best_parameters} \n'
            f'Best accuracy scores found: {best_scores} \n')

        self.model = rand_search_fitted.best_estimator_

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print('*' * 80)
        print('*    Getting 95% confidence interval for best classifier')
        print('*' * 80)

        alpha, upper, lower = get_confidence_interval(
            self.X_train, self.y_train, self.X_test, self.y_test, self.model,
            self.directory, self.bootiter, 'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n')

        print('*' * 80)
        print('*    Getting feature importances for best classifier')
        print('*' * 80)

        best_clf_feat_import = self.model.feature_importances_
        best_clf_feat_import_sorted = sorted(zip(best_clf_feat_import,
                                                 self.X_train.columns),
                                             reverse=True)

        logging.info(
            f'Feature importances for best classifier {best_clf_feat_import_sorted} \n'
        )

        all_clf_feat_import_mean = np.mean(
            [tree.feature_importances_ for tree in self.model.estimators_],
            axis=0)
        all_clf_feat_import_mean_sorted = sorted(zip(all_clf_feat_import_mean,
                                                     self.X_train.columns),
                                                 reverse=True)

        print('*' * 80)
        print('*    Plotting feature importances across all trees')
        print('*' * 80)

        feature_importances_best_estimator(best_clf_feat_import_sorted,
                                           self.directory)
        logging.info(
            f'Plotting feature importances for best classifier in decreasing order \n'
        )
        feature_importances_error_bars(self.model, self.X_train.columns,
                                       self.directory)
        logging.info(
            f'Plotting feature importances for best classifier with errorbars \n'
        )
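
As with the first example, the imports below are a sketch of what this method presumably needs; feature_importances_error_bars is another project helper and the module path is a guess. The base_estimator__* keys in param_dict are routed by RandomizedSearchCV into the nested decision tree; note that newer scikit-learn releases rename AdaBoost's base_estimator argument to estimator, in which case the prefix becomes estimator__*.

# Imports assumed by Code Example #2 (a sketch, not taken from the original module)
import os
import logging
from datetime import datetime

import joblib
import numpy as np
from scipy.stats import randint, uniform
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV

# Project-specific helpers used by the snippet; the module path below is hypothetical.
# from helpers import (feature_importances_best_estimator,
#                      feature_importances_error_bars)
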
Code Example #3
    def randomised_search(self):
        print_to_consol('Running randomized search to find best classifier')

        #create the k-nearest neighbours classifier
        clf1 = KNeighborsClassifier()

        logging.info(f'Initialised classifier \n')

        #set up randomized search
        param_dict = {
            'n_neighbors': randint(2, 10),
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'leaf_size': randint(2, 50)
        }

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        #building and running the randomized search
        rand_search = RandomizedSearchCV(clf1,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        best_parameters = rand_search_fitted.best_params_
        best_scores = rand_search_fitted.best_score_

        logging.info(
            f'Running randomised search for best parameters of classifier \n'
            f'Best parameters found: {best_parameters} \n'
            f'Best accuracy scores found: {best_scores} \n')

        self.model = rand_search_fitted.best_estimator_

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled, self.y_test,
            self.model, self.directory, self.bootiter, 'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')

        print_to_consol('Getting feature importances for best classifier')

        feature_list = []

        for i in range(len(self.X_train_scaled.columns)):
            #score the tuned model on each feature in isolation as a proxy for
            #its importance (KNeighborsClassifier has no feature_importances_)
            X = self.X_train_scaled.iloc[:, i].values.reshape(-1, 1)
            scores = cross_val_score(self.model,
                                     X,
                                     self.y_train,
                                     cv=self.cv)
            feature_list.append(scores.mean())

        feature_importance = sorted(zip(feature_list,
                                        self.X_train_scaled.columns),
                                    reverse=True)

        logging.info(
            f'Feature importances for best classifier {feature_importance} \n')

        print_to_consol('Plotting feature importances for best classifier')

        feature_importances_best_estimator(feature_importance, self.directory)
        logging.info(
            f'Plotting feature importances for best classifier in decreasing order \n'
        )
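
KNeighborsClassifier exposes no feature_importances_ attribute, so this example approximates importances by cross-validating the tuned model on one feature at a time and recording the mean accuracy. The imports below are a sketch of what the method presumably needs; print_to_consol, get_confidence_interval and feature_importances_best_estimator remain project helpers with a hypothetical module path.

# Imports assumed by Code Example #3 (a sketch, not taken from the original module)
import os
import logging
from datetime import datetime

import joblib
from scipy.stats import randint
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

# Project-specific helpers used by the snippet; the module path below is hypothetical.
# from helpers import (print_to_consol, get_confidence_interval,
#                      feature_importances_best_estimator)
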