Example #1
0
    def train(self, dataset, classifiers):
        """Find the best classifier and hyperparameters on the train dataset
        and persist the fitted model to ``models/<clf_name>_model.joblib``.

        Parameters
        ----------
        dataset : str
            Path to the train dataset; a CSV file with a 'status' target column
        classifiers : list
            Candidate classifiers to fine-tune and compare
        """
        pt = param_tuning.ParamTuning()
        f = Features()

        tot_time = time.time()
        start_time = time.time()
        data_df = pd.read_csv(dataset)
        ytrain = data_df['status']
        Xtrain = data_df.drop('status', axis=1)
        print("Loaded train dataset in {} sec.".format(time.time() -
                                                       start_time))

        # reset the phase timer so the next message reports only the
        # feature-building time, not loading + building
        start_time = time.time()
        fX = f.build(Xtrain)
        print("Build features from train data in {} sec.".format(time.time() -
                                                                 start_time))

        start_time = time.time()
        # 1st phase: find and fine tune the best classifier from a list of candidate ones
        best_clf = pt.fineTuneClassifiers(fX, ytrain, classifiers)
        estimator = best_clf['estimator']
        print("Best hyperparams for {}, {}, with score {}; {} sec.".format(
            best_clf['hyperparams'], best_clf['clf_name'], best_clf['score'],
            time.time() - start_time))

        # 2nd phase: retrain the winning estimator on the whole train set
        estimator = pt.trainClassifier(fX, ytrain, estimator)
        # make sure the destination dir exists before persisting the model
        os.makedirs(os.path.join(os.getcwd(), 'models'), exist_ok=True)
        dump(
            estimator,
            os.path.join(os.getcwd(), 'models',
                         best_clf['clf_name'] + '_model.joblib'))

        print("The whole process took {} sec.".format(time.time() - tot_time))
    def evaluate(self, dataset, classifier):
        """Evaluate the best ML algorithm with optimal hyperparameters to new unseen data.

        Parameters
        ----------
        dataset : str
            Name of train dataset
        classifier : str
            Classifier to train and evaluate
        """
        tuner = param_tuning.ParamTuning()
        feats = Features()

        total_start = time.time()

        # load the unseen test data; 'status' holds the target labels
        phase_start = time.time()
        frame = pd.read_csv(dataset)
        ytest = frame['status']
        Xtest = frame.drop('status', axis=1)
        print("Loaded test dataset in {} sec.".format(time.time() -
                                                      phase_start))

        phase_start = time.time()
        fX = feats.build(Xtest)
        print("Build features from test data in {} sec".format(time.time() -
                                                               phase_start))

        # load the previously persisted model and score it on the test set
        phase_start = time.time()
        model_path = os.path.join(os.getcwd(), 'models',
                                  classifier + '_model.joblib')
        estimator = load(model_path)
        res = tuner.testClassifier(fX, ytest, estimator, True)
        self._print_stats(classifier, res['metrics'], res['feature_imp'],
                          phase_start)

        # attach the per-class prediction probabilities to the test rows
        # and dump the selected columns as a CSV report
        Xtest.reset_index(inplace=True)
        proba_df = pd.DataFrame(
            res['proba'], columns=['none_origin_pred', 'dian_origin_pred'])
        Xtest = pd.concat([Xtest, proba_df], axis=1)
        os.makedirs('output', exist_ok=True)
        report_cols = [
            'pst_geom', 'dian_geom', 'none_origin_pred', 'dian_origin_pred'
        ]
        Xtest[report_cols].to_csv(os.path.join('output', 'predictions.csv'),
                                  index=False)

        print("The whole process took {} sec.".format(time.time() -
                                                      total_start))
    def hyperparamTuning(self, dataset, classifiers):
        """Run the full tuning pipeline: split the dataset, fine-tune the best
        of the candidate classifiers, retrain it on the whole train split and
        score it on the held-out test split.

        Parameters
        ----------
        dataset : str
            Path to the dataset to load and split into train/test
        classifiers : list
            Candidate classifiers to fine-tune and compare
        """
        pt = param_tuning.ParamTuning()
        f = Features()

        tot_time = time.time()
        start_time = time.time()
        Xtrain, Xtest, ytrain, ytest = self._load_and_split_data(dataset)
        print("Loaded train/test datasets in {} sec.".format(time.time() -
                                                             start_time))

        # reset the phase timer so the next message reports only the
        # feature-building time, not loading + building
        start_time = time.time()
        fX = f.build(Xtrain)
        print("Build features from train data in {} sec.".format(time.time() -
                                                                 start_time))

        start_time = time.time()
        # 1st phase: find and fine tune the best classifier from a list of candidate ones
        best_clf = pt.fineTuneClassifiers(fX, ytrain, classifiers)
        estimator = best_clf['estimator']
        print("Best hyperparams, {}, with score {}; {} sec.".format(
            best_clf['hyperparams'], best_clf['score'],
            time.time() - start_time))

        start_time = time.time()
        # 2nd phase: train the fine tuned best classifier on the whole train dataset (no folds)
        estimator = pt.trainClassifier(fX, ytrain, estimator)
        print(
            "Finished training model on dataset; {} sec.".format(time.time() -
                                                                 start_time))

        start_time = time.time()
        fX = f.build(Xtest)
        print("Build features from test data in {} sec".format(time.time() -
                                                               start_time))

        start_time = time.time()
        # 3rd phase: test the fine tuned best classifier on the test dataset
        res = pt.testClassifier(fX, ytest, estimator)
        self._print_stats(best_clf['clf_name'], res['metrics'],
                          res['feature_imp'], start_time)

        print("The whole process took {} sec.".format(time.time() - tot_time))
    def exec_classifiers(self, dataset):
        """Train and evaluate selected ML algorithms with custom hyper-parameters on dataset.

        Parameters
        ----------
        dataset : str
            Path to the dataset to load and split into train/test
        """
        f = Features()
        pt = param_tuning.ParamTuning()

        start_time = time.time()
        Xtrain, Xtest, ytrain, ytest = self._load_and_split_data(dataset)
        print("Loaded train/test datasets in {} sec.".format(time.time() -
                                                             start_time))

        # reset the phase timer so the next message reports only the
        # feature-building time, not loading + building
        start_time = time.time()
        fX_train = f.build(Xtrain)
        fX_test = f.build(Xtest)
        print("Build features from train/test data in {} sec".format(
            time.time() - start_time))

        # train/test each configured classifier with its custom hyper-params
        for clf in config.MLConf.clf_custom_params:
            # banner: method name underlined to its full width
            print('Method {}'.format(clf))
            print('=======', end='')
            print(len(clf) * '=')

            tot_time = time.time()
            start_time = time.time()
            # 1st phase: train each classifier on the whole train dataset (no folds)
            estimator = pt.clf_names[clf][0](random_state=config.seed_no)
            estimator.set_params(**config.MLConf.clf_custom_params[clf])
            estimator = pt.trainClassifier(fX_train, ytrain, estimator)

            print("Finished training model on dataset; {} sec.".format(
                time.time() - start_time))

            start_time = time.time()
            # 2nd phase: test each classifier on the test dataset
            res = pt.testClassifier(fX_test, ytest, estimator)
            self._print_stats(clf, res['metrics'], res['feature_imp'],
                              start_time)

            print("The whole process took {} sec.\n".format(time.time() -
                                                            tot_time))