Example #1
0
 def predict_proba_model(self,
                         X,
                         estimator=None,
                         probability_threshold=None,
                         round=4,
                         verbose=False):
     X = X.copy()
     X = self.preprocessor.transform(X)
     estimator_model = {}
     if estimator is None:
         for name_model, estimator in self.estimator.items():
             estimator_model[name_model] = estimator
     else:
         for name_model in estimator:
             if name_model in self.estimators.keys():
                 estimator_model[name_model] = self.estimators[name_model]
     preds = []
     for name_model, estimator in estimator_model.items():
         try:
             y_pred = estimator.predict_proba(X)[:, 1]
             preds.append(y_pred)
         except:
             LOGGER.warn(
                 f'{estimator.__class__.__name__} not function predict_proba'
             )
     y_pred_proba = np.mean(np.vstack(preds), axis=0)
     return y_pred_proba
Example #2
0
    def model_selection_visualizer(self, classes=None, params={}):

        for idx, (name_model, estimator) in enumerate(self.estimator.items()):
            cv = StratifiedKFold(n_splits=2, random_state=42)
            try:
                if visualizer.__class__.__name__ in params.keys():
                    LOGGER.info('Visualizer ValidationCurve')
                    visualizer = ValidationCurve(
                        model=estimator,
                        cv=cv,
                        **params[visualizer.__class__.__name__])
                    visualizer.fit(self.X, self.y)
                    visualizer.show(outpath=os.path.join(
                        os.getcwd(),
                        f"visualizer/{visualizer.__class__.__name__}.png"))
                    plt.cla()
            except:
                LOGGER.warn('ERROR ValidationCurve')
            try:
                LOGGER.info('Visualizer LearningCurve')
                visualizer = CVScores(model=estimator, cv=cv)
                if visualizer.__class__.__name__ in params.keys():
                    visualizer = LearningCurve(
                        **params[visualizer.__class__.__name__])
                visualizer.fit(self.X, self.y)
                visualizer.show(outpath=os.path.join(
                    os.getcwd(),
                    f"visualizer/{visualizer.__class__.__name__}.png"))
                plt.cla()
            except:
                LOGGER.warn('ERROR LearningCurve')
            try:
                LOGGER.info('Visualizer CVScores')
                visualizer = CVScores(model=estimator, cv=cv)
                if visualizer.__class__.__name__ in params.keys():
                    visualizer = CVScores(
                        **params[visualizer.__class__.__name__])
                visualizer.fit(self.X, self.y)
                visualizer.show(outpath=os.path.join(
                    os.getcwd(),
                    f"visualizer/{visualizer.__class__.__name__}.png"))
                plt.cla()
            except:
                LOGGER.warn('ERROR CVScores')
            try:
                LOGGER.info('Visualizer FeatureImportances')
                visualizer = FeatureImportances(estimator)
                if visualizer.__class__.__name__ in params.keys():
                    visualizer = FeatureImportances(
                        **params[visualizer.__class__.__name__])
                visualizer.fit(self.X, self.y)
                visualizer.show(outpath=os.path.join(
                    os.getcwd(),
                    f"visualizer/{visualizer.__class__.__name__}.png"))
                plt.cla()
            except:
                LOGGER.warn('ERROR FeatureImportances')
Example #3
0
 def predict_model(self, X, estimator=None, verbose=False):
     X = X.copy()
     X = self.preprocessor.transform(X)
     estimator_model = {}
     if estimator is None:
         for name_model, estimator in self.estimator.items():
             estimator_model[name_model] = estimator
     else:
         for name_model in estimator:
             if name_model in self.estimators.keys():
                 estimator_model[name_model] = self.estimator[name_model]
     preds = []
     for name_model, estimator in estimator_model.items():
         try:
             y_pred = estimator.predict(X)
             preds.append(y_pred)
         except:
             LOGGER.warn(
                 f'{estimator.__class__.__name__} not function predict')
     y_pred = scipy.stats.mode(np.vstack(preds), axis=0)[0][0].tolist()
     return y_pred
Example #4
0
 def target_visualizer(self,
                       classes=None,
                       params={'BalancedBinningReference': {
                           'bins': 5
                       }}):
     LOGGER.info('Initializing target visualizer')
     if os.path.isdir(os.path.join(os.getcwd(), 'visualizer/')) == False:
         os.makedirs(os.path.join(os.getcwd(), 'visualizer/'))
     visualizers = []
     y = self.y.squeeze()
     try:
         LOGGER.info('Visualizer BalancedBinningReference')
         visualizer = BalancedBinningReference()
         if visualizer.__class__.__name__ in params.keys():
             visualizer = BalancedBinningReference(
                 **params[visualizer.__class__.__name__])
         visualizer.fit(y)
         visualizer.show(outpath=os.path.join(
             os.getcwd(),
             f"visualizer/{visualizer.__class__.__name__}.png"))
         plt.cla()
     except:
         LOGGER.warn('ERROR BalancedBinning')
     try:
         LOGGER.info('Visualizer CLassBalance')
         visualizer = ClassBalance()
         if visualizer.__class__.__name__ in params.keys():
             visualizer = ClassBalance(
                 **params[visualizer.__class__.__name__])
         visualizer.fit(y)
         visualizer.show(outpath=os.path.join(
             os.getcwd(),
             f"visualizer/{visualizer.__class__.__name__}.png"))
         plt.cla()
     except:
         LOGGER.warn('ERROR ClassBalance')
     try:
         LOGGER.info('Visualizer Feature Correlation')
         visualizer = FeatureCorrelation(
             method='mutual_info-classification',
             feature_names=self.X.columns.tolist(),
             sort=True)
         if visualizer.__class__.__name__ in params.keys():
             visualizer = FeatureCorrelation(
                 **params[visualizer.__class__.__name__])
         visualizer.fit(self.X, y)
         visualizer.show(outpath=os.path.join(
             os.getcwd(),
             f"visualizer/{visualizer.__class__.__name__}.png"))
         plt.cla()
     except:
         LOGGER.warn('ERROR FeatureCorrelation')
Example #5
0
 def evaluate_visualizer(self, classes=None, params={}):
     LOGGER.info('Initializing plot model')
     if os.path.isdir(os.path.join(os.getcwd(), 'visualizer/')) == False:
         os.makedirs(os.path.join(os.getcwd(), 'visualizer/'))
     if classes is None:
         classes = pd.value_counts(self.y.values.flatten()).index.tolist()
     visualizers = []
     for idx, (name_model, estimator) in enumerate(self.estimator.items()):
         X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
             self.X,
             self.y,
             test_size=0.2,
             stratify=self.y,
             random_state=24)
         try:
             LOGGER.info('Visualizer ClassificationReport')
             visualizer = ClassificationReport(model=estimator,
                                               classes=classes)
             if visualizer.__class__.__name__ in params.keys():
                 visualizer = ClassificationReport(
                     **params[visualizer.__class__.__name__])
             visualizer.fit(X_train, y_train)
             visualizer.score(X_test, y_test)
             visualizer.show(outpath=os.path.join(
                 os.getcwd(),
                 f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
             ))
             plt.cla()
         except:
             LOGGER.warn('ERROR ClassificationReport')
         try:
             LOGGER.info('Visualizer ConfusionMatrix')
             visualizer = ConfusionMatrix(model=estimator, classes=classes)
             if visualizer.__class__.__name__ in params.keys():
                 visualizer = ConfusionMatrix(
                     **params[visualizer.__class__.__name__])
             visualizer.fit(X_train, y_train)
             visualizer.score(X_test, y_test)
             visualizer.show(outpath=os.path.join(
                 os.getcwd(),
                 f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
             ))
             plt.cla()
         except:
             LOGGER.warn('ERROR ConfusionMatrix')
         try:
             LOGGER.info('Visualizer ROCAUC')
             visualizer = ROCAUC(model=estimator, classes=classes)
             if visualizer.__class__.__name__ in params.keys():
                 visualizer = ROCAUC(
                     **params[visualizer.__class__.__name__])
             visualizer.fit(X_train, y_train)
             visualizer.score(X_test, y_test)
             visualizer.show(outpath=os.path.join(
                 os.getcwd(),
                 f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
             ))
             plt.cla()
         except:
             LOGGER.warn('ERROR ROCAUC')
         try:
             LOGGER.info('Visualizer PrecisionRecallCurve')
             visualizer = PrecisionRecallCurve(model=estimator,
                                               per_class=True,
                                               classes=classes)
             if visualizer.__class__.__name__ in params.keys():
                 visualizer = PrecisionRecallCurve(
                     **params[visualizer.__class__.__name__])
             visualizer.fit(X_train, y_train)
             visualizer.score(X_test, y_test)
             visualizer.show(outpath=os.path.join(
                 os.getcwd(),
                 f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
             ))
             plt.cla()
         except:
             LOGGER.warn('ERROR PrecisionRecallCurve')
         try:
             LOGGER.info('Visualizer ClassPredictionError')
             visualizer = ClassPredictionError(model=estimator,
                                               classes=classes)
             if visualizer.__class__.__name__ in params.keys():
                 visualizer = ClassPredictionError(
                     **params[visualizer.__class__.__name__])
             visualizer.fit(X_train, y_train)
             visualizer.score(X_test, y_test)
             visualizer.show(outpath=os.path.join(
                 os.getcwd(),
                 f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
             ))
             plt.cla()
         except:
             LOGGER.warn('ERROR ClassPredictionError')
         try:
             LOGGER.info('Visualizer Discrimination Threshold')
             visualizer = DiscriminationThreshold(model=estimator,
                                                  classes=classes)
             if visualizer.__class__.__name__ in params.keys():
                 visualizer = DiscriminationThreshold(
                     **params[visualizer.__class__.__name__])
             visualizer.fit(X_train, y_train)
             visualizer.score(X_test, y_test)
             visualizer.show(outpath=os.path.join(
                 os.getcwd(),
                 f'visualizer/{visualizer.__class__.__name__}_{estimator.__class__.__name__}.png'
             ))
             plt.cla()
         except:
             LOGGER.warn('ERROR Discrimination Threshold')
Example #6
0
 def feature_visualizer(self, classes=None, params={}):
     if os.path.isdir(os.path.join(os.getcwd(), 'visualizer/')) == False:
         os.makedirs(os.path.join(os.getcwd(), 'visualizer/'))
     if classes is None:
         classes = pd.value_counts(self.y.values.flatten()).index.tolist()
     try:
         LOGGER.info('Visualizer RadViz')
         visualizer = RadViz(classes=classes,
                             features=self.X.columns.tolist())
         if visualizer.__class__.__name__ in params.keys():
             visualizer = RadViz(**params[visualizer.__class__.__name__])
         visualizer.fit(self.X, self.y)
         visualizer.transform(self.X)
         visualizer.show(outpath=os.path.join(
             os.getcwd(),
             f"visualizer/{visualizer.__class__.__name__}.png"))
         plt.cla()
     except:
         LOGGER.warn('ERROR RadViz')
     try:
         LOGGER.info('Visualizer Rank1D')
         visualizer = Rank1D()
         if visualizer.__class__.__name__ in params.keys():
             visualizer = Rank1D(**params[visualizer.__class__.__name__])
         visualizer.fit(self.X, self.y)
         visualizer.transform(self.X)
         visualizer.show(outpath=os.path.join(
             os.getcwd(),
             f"visualizer/{visualizer.__class__.__name__}.png"))
         plt.cla()
     except:
         LOGGER.warn('ERROR Rank1D')
     try:
         LOGGER.info('Visualizer Rank2D')
         visualizer = Rank2D()
         if visualizer.__class__.__name__ in params.keys():
             visualizer = Rank2D(**params[visualizer.__class__.__name__])
         visualizer.fit(self.X, self.y)
         visualizer.transform(self.X)
         visualizer.show(outpath=os.path.join(
             os.getcwd(),
             f"visualizer/{visualizer.__class__.__name__}.png"))
         plt.cla()
     except:
         LOGGER.warn('ERROR Rank2D')
     try:
         LOGGER.info('Visualizer ParallelCoordinates')
         visualizer = ParallelCoordinates(classes=classes,
                                          features=self.X.columns.tolist(),
                                          shuffle=True)
         if visualizer.__class__.__name__ in params.keys():
             visualizer = ParallelCoordinates(
                 **params[visualizer.__class__.__name__])
         visualizer.fit_transform(self.X, self.y)
         visualizer.show(outpath=os.path.join(
             os.getcwd(),
             f"visualizer/{visualizer.__class__.__name__}.png"))
         plt.cla()
     except:
         LOGGER.warn('ERROR ParallelCoordinates')
     try:
         LOGGER.info('Visualizer PCA 3D')
         visualizer = PCA(classes=classes, scale=True, projection=3)
         if visualizer.__class__.__name__ in params.keys():
             visualizer = PCA(**params[visualizer.__class__.__name__])
         visualizer.fit_transform(self.X, self.y)
         visualizer.show(outpath=os.path.join(
             os.getcwd(),
             f"visualizer/{visualizer.__class__.__name__}.png"))
         plt.cla()
     except:
         LOGGER.warn('ERROR PCA 3D')
     try:
         LOGGER.info('Visualizer PCA Biplot')
         visualizer = PCA(classes=classes, scale=True, proj_features=True)
         if visualizer.__class__.__name__ in params.keys():
             visualizer = PCA(**params[visualizer.__class__.__name__])
         visualizer.fit_transform(self.X, self.y)
         visualizer.show(outpath=os.path.join(
             os.getcwd(),
             f"visualizer/{visualizer.__class__.__name__}.png"))
         plt.cla()
     except:
         LOGGER.warn('ERROR PCA Biplot')
     try:
         LOGGER.info('Visualizer Manifold')
         visualizer = Manifold(classes=classes)
         if visualizer.__class__.__name__ in params.keys():
             visualizer = Manifold(**params[visualizer.__class__.__name__])
         visualizer.fit_transform(self.X, self.y)
         visualizer.show(outpath=os.path.join(
             os.getcwd(),
             f"visualizer/{visualizer.__class__.__name__}.png"))
         plt.cla()
     except:
         LOGGER.warn('ERROR Manifold')
Example #7
0
    def voting_model(self,
                     estimator=None,
                     cv=2,
                     scoring=['roc_auc_ovr'],
                     sort=None,
                     estimator_params={},
                     fit_params={},
                     verbose=True,
                     n_jobs=-1):
        LOGGER.info('VOTING MODELs')

        if sort is None:
            sort = scoring[0]
        estimator_model = self.choose_model(estimator=estimator,
                                            estimator_params=estimator_params,
                                            fit_params=fit_params)

        model_voting = []
        for name_model, model in estimator_model.items():
            try:
                estimator = model.estimator
            except:
                estimator = model
            model_voting.append((name_model, estimator))
        name_model = 'classification-votingclassifer'
        try:
            LOGGER.info('TRY soft voting')
            estimator = VotingClassifier(estimators=model_voting,
                                         voting='soft')
            scores = sklearn.model_selection.cross_validate(
                estimator=estimator,
                X=self.X,
                y=self.y,
                scoring=scoring,
                cv=cv,
                n_jobs=n_jobs,
                verbose=verbose,
                fit_params=fit_params,
                return_train_score=True,
                return_estimator=True,
                error_score=-1)
        except:
            LOGGER.warn('TRY hard voting')
            estimator = VotingClassifier(estimators=model_voting,
                                         voting='hard')
            scores = sklearn.model_selection.cross_validate(
                estimator=estimator,
                X=self.X,
                y=self.y,
                scoring=scoring,
                cv=cv,
                n_jobs=n_jobs,
                verbose=verbose,
                fit_params=fit_params,
                return_train_score=True,
                return_estimator=True,
                error_score=-1)
        print('score voting: ', scores)
        self.estimator['classification-votingclassifer'] = scores['estimator'][
            np.argmax(scores['test_' + sort])]
        scores.pop('estimator')
        name_model = ''.join(name_model.split('-')[1:])
        for key, values in scores.items():
            for i, value in enumerate(values):
                if i not in self.metrics.keys():
                    self.metrics[i] = {}
                if name_model not in self.metrics[i].keys():
                    self.metrics[i][name_model] = dict()
                self.metrics[i][name_model][key] = value
        return self
Example #8
0
    def tune_model(
        self,
        estimator=None,
        n_iter=2,
        optimize='accuracy',
        search_library: str = 'optuna',
        search_algorithm='random',
        early_stopping='asha',
        early_stopping_max_iters=10,
        estimator_params={},
        n_jobs=-1,
        verbose=True,
    ):
        LOGGER.info('TUNNING MODEL ...')
        best_params_model = {}
        model_grid = None

        estimator_model = {}
        if estimator is None:
            if len(self.estimator.keys()) > 0:
                for name_model, estimator in self.estimator.items():
                    if name_model in estimator_params.keys():
                        estimator_model[
                            name_model] = ModelFactory.create_executor(
                                name_model, **estimator_params[name_model])
                    else:
                        estimator_model[name_model] = estimator

            else:
                for name_model in ModelFactory.name_registry:
                    if name_model in estimator_params.keys():
                        estimator_model[
                            name_model] = ModelFactory.create_executor(
                                name_model, **estimator_params[name_model])
                    else:
                        estimator_model[
                            name_model] = ModelFactory.create_executor(
                                name_model)
        else:
            for name_model in estimator:
                if name_model in estimator_params.keys():
                    estimator_model[name_model] = ModelFactory.create_executor(
                        name_model, **estimator_params[name_model])
                else:
                    estimator_model[name_model] = ModelFactory.create_executor(
                        name_model)

        # update estimator_params
        for name_model, params in estimator_params.items():
            self.estimator_params[name_model] = params

        for name_model, model in estimator_model.items():
            LOGGER.info('tunning model_name: {}'.format(name_model))
            estimator = model.estimator

            parameter_grid = model.tune_grid
            parameter_distributions = model.tune_distributions

            if (search_library == 'scikit-learn' or search_library
                    == 'tune-sklearn') and (search_algorithm == 'grid'
                                            or search_algorithm == 'random'):
                parameter_grid = model.tune_grid
            else:
                parameter_grid = model.tune_distributions
            model_grid = None
            if search_library == 'optuna':
                pruner_translator = {
                    "asha": optuna.pruners.SuccessiveHalvingPruner(),
                    "hyperband": optuna.pruners.HyperbandPruner(),
                    "median": optuna.pruners.MedianPruner(),
                    False: optuna.pruners.NopPruner(),
                    None: optuna.pruners.NopPruner(),
                }
                pruner = early_stopping
                if pruner in pruner_translator:
                    pruner = pruner_translator[early_stopping]

                sampler_translator = {
                    "tpe": optuna.samplers.TPESampler(seed=24),
                    "random": optuna.samplers.RandomSampler(seed=24),
                }
                sampler = sampler_translator[search_algorithm]

                try:
                    param_grid = get_optuna_distributions(
                        parameter_distributions)
                except:
                    LOGGER.warn(
                        "Couldn't convert param_grid to specific library distributions. Exception:"
                    )
                    LOGGER.warn(traceback.format_exc())
                study = optuna.create_study(direction='maximize',
                                            sampler=sampler,
                                            pruner=pruner)
                LOGGER.info('Initializing optuna.intergration.OptnaSearchCV')
                model_grid = optuna.integration.OptunaSearchCV(
                    estimator=estimator,
                    param_distributions=param_grid,
                    max_iter=early_stopping_max_iters,
                    n_jobs=n_jobs,
                    n_trials=n_iter,
                    random_state=24,
                    scoring=optimize,
                    study=study,
                    refit=False,
                    verbose=verbose,
                    error_score='raise')
            elif search_library == 'tune-sklearn':
                early_stopping_translator = {
                    "asha": "ASHAScheduler",
                    "hyperband": "HyperBandScheduler",
                    "median": "MedianStoppingRule",
                }
                if early_stopping in early_stopping_translator:
                    early_stopping = early_stopping_translator[early_stopping]
                do_early_stop = early_stopping and can_early_stop(
                    estimator, True, True, True, parameter_grid)

                if search_algorithm == 'grid':

                    LOGGER.info('Initializing tune_sklearn.TuneGridSearchCV')
                    model_grid = TuneGridSearchCV(
                        estimator=estimator,
                        param_grid=parameter_grid,
                        early_stopping=do_early_stop,
                        scoring=optimize,
                        cv=fold,
                        max_iters=early_stopping_max_iters,
                        refit=True,
                        n_jobs=n_jobs)

            model_grid.fit(self.X, self.y)
            best_params = model_grid.best_params_
            best_params_model[name_model] = best_params

        # update estimator_params
        for name_model, params in best_params_model.items():
            self.estimator_params[name_model] = params
        LOGGER.info('best_params_model: {}'.format(best_params_model))
        return best_params_model