Example #1
    def cross_validate_model(self, X, y, model, num_folds, score):
        '''
        Validates a model on the datasets X and y using num_folds-fold cross-validation
        and returns the results for the given score metric.

        Parameters
        ----------
        X: numpy.array
            Instance data of the dataset to train on.
        y: numpy.array
            Targets of the instances in X.
        model: scikit_model
            Model to train.
        num_folds: int
            Number of folds for the cross-validation.
        score: string
            Metric to compute; must be one of the strings in sklearn.metrics.SCORERS.keys().

        Returns
        -------
        numpy.array
            Array with the result of each CV fold.
        '''
        if score not in SCORERS.keys():
            raise AttributeError(
                "The score argument must be a valid scorer name. See sklearn.metrics.SCORERS.keys() for valid values."
            )
        print('\t' + str(model)[:20], end=' - ')
        mod_scores = cross_val_score(model, X, y, cv=num_folds, scoring=score)
        print('FM')
        return np.array(mod_scores)
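For reference, a self-contained sketch of the same pattern (validate the scorer name against SCORERS, then run cross_val_score); the toy dataset and estimator below are placeholders, not part of the original class:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import SCORERS
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
score = "accuracy"
if score not in SCORERS.keys():  # same guard as in the method above
    raise AttributeError("score must be one of sklearn.metrics.SCORERS.keys()")
# one result per fold, exactly what cross_validate_model returns
fold_scores = np.array(cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5, scoring=score))
print(fold_scores, fold_scores.mean())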
Example #2
    def _fit_by_cross_validation(self, X, y, number_of_splits: int = 5, label_name: str = None, cores_for_training: int = 1,
                                 optimization_metric: str = "balanced_accuracy"):

        model = self._get_ml_model()
        scoring = optimization_metric

        if optimization_metric not in SCORERS.keys():
            scoring = "balanced_accuracy"
            warnings.warn(
                f"{self.__class__.__name__}: specified optimization metric ({optimization_metric}) is not defined as a sklearn scoring function, using {scoring} instead... ")

        if not self.show_warnings:
            warnings.simplefilter("ignore")
            os.environ["PYTHONWARNINGS"] = "ignore"

        self.model = RandomizedSearchCV(model, param_distributions=self._parameter_grid, cv=number_of_splits, n_jobs=cores_for_training,
                                        scoring=scoring, refit=True)
        self.model.fit(X, y)

        if not self.show_warnings:
            del os.environ["PYTHONWARNINGS"]
            warnings.simplefilter("always")

        self.model = self.model.best_estimator_  # do not keep the RandomizedSearchCV object as the model; use the best estimator instead

        return self.model
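A self-contained sketch of the same guard (fall back to a default metric when the requested one is not a registered scorer), using a small random-forest search as a stand-in for the original model and parameter grid:

import warnings
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import SCORERS
from sklearn.model_selection import RandomizedSearchCV

X, y = load_iris(return_X_y=True)
requested = "not_a_real_metric"
scoring = requested if requested in SCORERS.keys() else "balanced_accuracy"
if scoring != requested:
    warnings.warn(f"{requested} is not a sklearn scoring function, using {scoring} instead")

# refit=True so best_estimator_ is available, mirroring the method above
search = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                            param_distributions={"n_estimators": [10, 50, 100]},
                            n_iter=3, cv=3, scoring=scoring, refit=True)
best_model = search.fit(X, y).best_estimator_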
Example #3
def score_options():
    '''
    Print the scorer names available in sklearn.metrics.SCORERS, useful when
    choosing a scoring string for a regression model.
    '''
    from sklearn.metrics import SCORERS
    score_types = sorted(SCORERS.keys())
    print('Possible scores to choose from: ')
    for s in score_types:
        print(s)
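Calling it takes no arguments; it prints the sorted scorer names and returns None, roughly as follows:

score_options()
# Possible scores to choose from:
# accuracy
# ...  (one scorer name per line)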
Example #4
def evaluation_p(population: pd.DataFrame,
                 list_caracteres: np.ndarray,
                 data: pd.DataFrame,
                 target: pd.Series,
                 model: Any,
                 scorer: str,
                 n_cv: int = 5,
                 sort_scores: bool = True) -> pd.DataFrame:
    """
    Evaluate the individuals of a population.
    :param population: set of individuals to evaluate
    :param list_caracteres: list of the features (characters) that can be expressed
    :param data: data to use for the evaluation
    :param target: output to predict
    :param model: model to fit
    :param scorer: performance score to maximize
    :param n_cv: number of folds for cross-validation, must be at least 2
    :param sort_scores: whether to sort the scores in descending order
    :return:
    population with the performance score obtained by cross-validation
    """
    # TODO: consider passing the scorer function (scorer=make_scorer())
    #  and its parameters directly instead of hard-coding it here
    population_eval = population.copy()
    mean_scores = []
    xval_strategy = KFold(n_cv, shuffle=True, random_state=123)
    for indiv in population_eval.values:
        lstcols = list_caracteres[indiv]
        if scorer in SCORERS.keys():
            scoring_function = scorer
        elif scorer == 'bic':
            kwargs = {'k': len(lstcols)}
            scoring_function = make_scorer(get_bic,
                                           greater_is_better=False,
                                           **kwargs)
        else:
            print(f'Scorer {scorer} unknown')
            break
        scores = cross_val_score(model,
                                 data[lstcols],
                                 target,
                                 cv=xval_strategy,
                                 scoring=scoring_function,
                                 n_jobs=-1)
        mean_scores.append(scores.mean())
    population_scores = pd.Series(mean_scores,
                                  index=population_eval.index,
                                  name='score')
    population_eval['score'] = population_scores
    if sort_scores:
        population_eval = population_eval.sort_values(by='score',
                                                      ascending=False)
    else:
        population_eval = population_eval.reset_index(drop=True)
    return population_eval
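The 'bic' branch above builds a custom scorer with make_scorer; a minimal sketch of that pattern with a stand-in metric (the real get_bic is defined elsewhere in the project, so the formula below is only an assumption for illustration):

import numpy as np
from sklearn.metrics import make_scorer, mean_squared_error

def pseudo_bic(y_true, y_pred, k=1):
    # stand-in for get_bic: log-likelihood-style error term plus a complexity penalty on k features
    n = len(y_true)
    return n * np.log(mean_squared_error(y_true, y_pred)) + k * np.log(n)

# extra keyword arguments passed to make_scorer are forwarded to the metric function
scoring_function = make_scorer(pseudo_bic, greater_is_better=False, k=3)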
Example #5
    def cv(self):
        data = input("Predictors Name: ")
        target = input("Target: ")
        problem_type = input(
            "Input type of problem: Regression = R, Classification = C: ")
        no_cv = input("k-folds, k = ")

        from sklearn.metrics import SCORERS
        list(SCORERS.keys())
        reg_scorers = [
            'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error',
            'neg_mean_squared_error', 'neg_mean_squared_log_error',
            'explained_variance'
        ]
        class_scorers = ['precision', 'recall', 'f1', 'accuracy', 'roc_auc']

        step2 = "Get scorers to cross validate on. Please separate scorers by a comma only. "
        print(step2)
        if problem_type == "R":
            metrics = input(", ".join(reg_scorers))
        elif problem_type == "C":
            metrics = input(", ".join(class_scorers))
        scorers = [i.strip() for i in metrics.split(",")]

        print(self.get_script)
        print("\n_____________Copy from Here_____________\n")
        step1 = "# Import module to cross-validate"
        print(step1)

        module_import = "from sklearn.model_selection import cross_validate"
        print(module_import + "\n")

        step3 = "# Create base model to cross-validate"
        print(step3)
        print("model = ...\n")

        step4 = "# Define the scorers to validate on"
        print(step4)
        print(f"scorers = {scorers}")

        step5 = f"# Cross-validate model on {data}"
        print(step5)
        print(
            f"scores = cross_validate(model, X = {data}, y = {target}, "
            f"scoring = scorers, cv = {no_cv})\n")

        step6 = f"# Check performance on {data}"
        print(step6)
        summary_line = r"print(f'Performance : { scores }')"
        print(summary_line)
Example #6
    def parse_cfg(self):
        transformations = self.cfg['dataset'].get('transform')
        if transformations:
            self.cfg['dataset']['transform'] = [
                eval(t) for t in transformations
            ]

        target_transform = self.cfg['dataset'].get('target_transform')
        if target_transform:
            self.cfg['dataset']['target_transform'] = eval(target_transform)

        scorers = self.cfg['training'].get('scorers')
        if isinstance(scorers, str):
            scorers = [scorers]
        scorers_dict = {}
        for s in scorers:
            scorers_dict[s] = s if s in SCORERS.keys() else make_scorer(
                eval(s))
        self.cfg['training']['scorers'] = scorers_dict

        models = self.cfg['training'].get('models')
        if isinstance(models, list):
            self.cfg['training']['models'] = [
                Pipeline([(e, _get_model(e)) for e in model])
                for model in models
            ]
        elif isinstance(models, str):
            self.cfg['training']['models'] = jb_load(models)

        self.cfg['training']['cfg_path'] = self.cfg_path

        if 'holdout' in self.cfg:
            self.cfg['holdout']['cfg_path'] = self.cfg_path
            scorers = self.cfg['holdout'].get('scorers')
            if scorers:
                valid_scorers = {
                    'balanced_accuracy': balanced_accuracy_score,
                    'accuracy': accuracy_score,
                    'roc_auc': roc_auc_score,
                    'recall': recall_score,
                    'specificity': specificity,
                }
                self.cfg['holdout']['scorers'] = [
                    valid_scorers[s] for s in scorers
                ]
Example #7
def test_scorer_memmap_input():
    # Non-regression test for #6147: some score functions would
    # return singleton memmap when computed on memmap data instead of scalar
    # float values.
    for name in SCORERS.keys():
        yield check_scorer_memmap, name
Example #9
        X_test[sc_cols] = \
            sc.transform(X_test[sc_cols])

        features = list(X_train.columns.values)
        print(X_train.shape)
        print(X_test.shape)
        ##############################################################
        param_grid = {
            'C': [0.001, 0.01, 0.1, 1,
                  10],  #[.1,.9], #[1e-5,1e-4, 1e-3, 1e-2, 0.1],# 1, 10, 100],
            'gamma': [
                0.001, 0.01, 0.1, 1
            ],  #['scale',1,10,100], #'['auto','scale', 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100],
            'kernel': ['sigmoid']
        }
        print(sorted(SCORERS.keys()))

        clf = GridSearchCV(
            svm.SVC(probability=True),
            param_grid=param_grid,
            scoring='accuracy',  #'roc_auc', #'f1_macro',
            # 'f1_weighted', #'precision_weighted',#'average_precision', #'f1_macro',
            cv=3,
            refit=True,
            verbose=10,  # 10 to see results
            return_train_score=True,
            # n_jobs=multiprocessing.cpu_count() - 5
            # may get an error if using too much memory; keep n_jobs < CPUs available - 2
            # n_jobs=30
        )  # higher verbose = more output printed
Example #10
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import SCORERS

data = pd.read_csv('IRIS.csv')
print(data.describe())
print(data.head())

y = data['species']
X = data.drop(['species'], axis=1)
print(y.head())
print(X.head())

dt_model = DecisionTreeClassifier()
print(SCORERS.keys())
scores = cross_val_score(dt_model, X, y, cv=5, n_jobs=4, scoring='accuracy')
print(scores)
print(scores.mean())
print(scores.std())
Example #11
import xgboost as xgb
from xgboost import XGBRegressor as XGBR
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error as mse, SCORERS

data = load_boston()

X = data.data
Y = data.target

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

sk_xgb_model = XGBR(n_estimators=100, random_state=0).fit(X_train, Y_train)

pre1 = sk_xgb_model.predict(X_test)
score1 = sk_xgb_model.score(X_test, Y_test)
mse_value = mse(y_true=Y_test, y_pred=pre1)  # avoid shadowing the imported mse function
important = sk_xgb_model.feature_importances_

print('pre:  ', pre1)
print('score1:  ', score1)
print('mse:  ', mse_value)
print('important:  ', important)
print('mean:   ', Y.mean())

print(SCORERS.keys())  # all available scoring metrics
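To connect the printed scorer names back to cross-validation, a small follow-on sketch (assuming the imports and X, Y defined above) that scores the same regressor with one of those strings:

cv_scores = cross_val_score(XGBR(n_estimators=100, random_state=0), X, Y,
                            cv=5, scoring='neg_mean_squared_error')
print('cv mean:  ', cv_scores.mean())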
Example #12
from sklearn.metrics import SCORERS

if __name__ == "__main__":
    print("These below are going to be lots of fun")
    for scorer in SCORERS.keys():
        print(scorer)
Example #13
    def evaluate(self, params, df):
        """Evaluates the data.


        Evaluates the data with a given scoring function and given hyper-parameters
        of the whole pipeline. If no parameters are set, default configuration for
        each step is evaluated : no feature selection is applied and no meta features are
        created.

        Parameters
        ----------
        params : dict, default = None.
            Hyper-parameters dictionary for the whole pipeline.

            - The keys must respect the following syntax : "enc__param".

                - "enc" = "ne" for na encoder
                - "enc" = "ce" for categorical encoder
                - "enc" = "fs" for feature selector [OPTIONAL]
                - "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL]
                - "enc" = "est" for the final estimator

                - "param" : a correct associated parameter for each step. Ex: "max_depth" for "enc"="est", ...

            - The values are those of the parameters. Ex: 4 for key = "est__max_depth", ...

        df : dict, default = None
            Dataset dictionary. Must contain keys and values:

            - "train": pandas DataFrame for the train set.
            - "target" : encoded pandas Series for the target on the train set (with dtype='float' for a regression or dtype='int' for a classification). Indexes should match the train set.

        Returns
        -------
        float.
            The score. The higher the better.
            Positive for a score and negative for a loss.

        Examples
        --------
        >>> from mlbox.optimisation import *
        >>> from sklearn.datasets import load_boston
        >>> #load data
        >>> dataset = load_boston()
        >>> #evaluating the pipeline
        >>> opt = Optimiser()
        >>> params = {
        ...     "ne__numerical_strategy" : 0,
        ...     "ce__strategy" : "label_encoding",
        ...     "fs__threshold" : 0.1,
        ...     "stck__base_estimators" : [Regressor(strategy="RandomForest"), Regressor(strategy="ExtraTrees")],
        ...     "est__strategy" : "Linear"
        ... }
        >>> df = {"train" : pd.DataFrame(dataset.data), "target" : pd.Series(dataset.target)}
        >>> opt.evaluate(params, df)
        """

        ne = NA_encoder()
        ce = Categorical_encoder()

        ##########################################
        #    Automatically checking the task
        ##########################################

        # TODO: a lot of code can be factorized for the different tasks

        ##########################################
        #             Classification
        ##########################################

        if (df['target'].dtype == 'int'):

            # Cross validation

            counts = df['target'].value_counts()
            classes_to_drop = counts[counts < self.n_folds].index
            mask_to_drop = df['target'].apply(lambda x: x in classes_to_drop)
            indexes_to_drop = df['target'][mask_to_drop].index
            n_classes = len(counts) - len(classes_to_drop)

            if n_classes == 1:
                raise ValueError(
                    "Your target does not have enough classes. You can't run the optimiser"
                )

            cv = StratifiedKFold(n_splits=self.n_folds,
                                 shuffle=True,
                                 random_state=self.random_state)

            # Estimator

            est = Classifier()

            # Feature selection if specified

            fs = None
            if (params is not None):
                for p in params.keys():
                    if (p.startswith("fs__")):
                        fs = Clf_feature_selector()
                    else:
                        pass

            # Stacking if specified

            STCK = {}
            if (params is not None):
                for p in params.keys():
                    if (p.startswith("stck")):
                        # TODO: Check if p.split("__")[1] instead?
                        STCK[p.split("__")[0]] = StackingClassifier(
                            verbose=False)  # noqa
                    else:
                        pass

            # Default scoring for classification

            if (self.scoring is None):
                self.scoring = 'neg_log_loss'  # also works for multiclass problems

            else:
                if (type(self.scoring) == str):
                    if (self.scoring not in list(SCORERS.keys())):

                        warnings.warn("Unknown or invalid scoring metric. "
                                      "neg_log_loss is used instead.")

                        self.scoring = 'neg_log_loss'

                    else:

                        # binary classification
                        if n_classes <= 2:
                            pass

                        # multiclass classification
                        else:
                            warnings.warn(
                                "This is a multiclass problem. Please make sure that your scoring metric is "
                                "appropriate.")

                            if self.scoring + "_weighted" in list(
                                    SCORERS.keys()):

                                warnings.warn(
                                    "Weighted strategy for the scoring metric is used."
                                )
                                self.scoring = self.scoring + "_weighted"

                            # specific scenarios
                            else:
                                if self.scoring == "roc_auc":
                                    self.scoring = make_scorer(
                                        lambda y_true, y_pred: roc_auc_score(
                                            pd.get_dummies(y_true), y_pred
                                        ),  # noqa
                                        greater_is_better=True,
                                        needs_proba=True)
                else:
                    pass

        ##########################################
        #               Regression
        ##########################################

        elif (df['target'].dtype == 'float'):

            # Cross validation

            indexes_to_drop = []
            cv = KFold(n_splits=self.n_folds,
                       shuffle=True,
                       random_state=self.random_state)

            # Estimator

            est = Regressor()

            # Feature selection if specified

            fs = None
            if (params is not None):
                for p in params.keys():
                    if (p.startswith("fs__")):
                        fs = Reg_feature_selector()
                    else:
                        pass

            # Stacking if specified

            STCK = {}
            if (params is not None):
                for p in params.keys():
                    if (p.startswith("stck")):
                        # TODO: Check if p.split("__")[1] instead?
                        STCK[p.split("__")[0]] = StackingRegressor(
                            verbose=False)
                    else:
                        pass

            # Default scoring for regression

            if (self.scoring is None):
                self.scoring = "neg_mean_squared_error"

            else:
                if (type(self.scoring) == str):
                    if (self.scoring not in list(SCORERS.keys())):

                        warnings.warn(
                            "Unknown or invalid scoring metric. "
                            "neg_mean_squared_error is used instead.")

                        self.scoring = 'neg_mean_squared_error'

                    else:
                        pass
                else:
                    pass

        else:
            raise ValueError("Impossible to determine the task. "
                             "Please check that your target is encoded.")

        ##########################################
        #          Creating the Pipeline
        ##########################################

        pipe = [("ne", ne), ("ce", ce)]

        # Do we need to cache transformers?

        cache = False

        if (params is not None):
            if ("ce__strategy" in params):
                if (params["ce__strategy"] == "entity_embedding"):
                    cache = True
                else:
                    pass
            else:
                pass

        if (fs is not None):
            if ("fs__strategy" in params):
                if (params["fs__strategy"] != "variance"):
                    cache = True
                else:
                    pass
        else:
            pass

        if (len(STCK) != 0):
            cache = True
        else:
            pass

        # Pipeline creation

        if (fs is not None):
            pipe.append(("fs", fs))
        else:
            pass

        for stck in np.sort(list(STCK)):
            pipe.append((stck, STCK[stck]))

        pipe.append(("est", est))

        if cache:
            pp = Pipeline(pipe, memory=self.to_path)
        else:
            pp = Pipeline(pipe)

        ##########################################
        #          Fitting the Pipeline
        ##########################################

        start_time = time.time()

        # No params : default configuration

        if (params is None):
            set_params = True
            print('No parameters set. Default configuration is tested')

        else:
            try:
                pp = pp.set_params(**params)
                set_params = True
            except Exception:
                set_params = False

        if (set_params):

            if (self.verbose):
                print("")
                print("#####################################################"
                      " testing hyper-parameters... "
                      "#####################################################")
                print("")
                print(">>> NA ENCODER :" + str(ne.get_params()))
                print("")
                print(">>> CA ENCODER :" + str({'strategy': ce.strategy}))

                if (fs is not None):
                    print("")
                    print(">>> FEATURE SELECTOR :" + str(fs.get_params()))

                for i, stck in enumerate(np.sort(list(STCK))):

                    stck_params = STCK[stck].get_params().copy()
                    stck_params_display = {
                        k: stck_params[k]
                        for k in stck_params.keys() if k not in
                        ["level_estimator", "verbose", "base_estimators"]
                    }

                    print("")
                    print(">>> STACKING LAYER n°" + str(i + 1) + " :" +
                          str(stck_params_display))

                    for j, model in enumerate(stck_params["base_estimators"]):
                        print("")
                        print("    > base_estimator n°" + str(j + 1) + " :" +
                              str(
                                  dict(
                                      list(model.get_params().items()) +
                                      list(model.get_estimator().get_params().
                                           items()))))

                print("")
                print(">>> ESTIMATOR :" + str(
                    dict(
                        list(est.get_params().items()) +
                        list(est.get_estimator().get_params().items()))))
                print("")

            try:

                # Computing the mean cross validation score across the folds
                scores = cross_val_score(estimator=pp,
                                         X=df['train'].drop(indexes_to_drop),
                                         y=df['target'].drop(indexes_to_drop),
                                         scoring=self.scoring,
                                         cv=cv)
                score = np.mean(scores)

            except Exception:

                scores = [-np.inf for _ in range(self.n_folds)]
                score = -np.inf

        else:
            raise ValueError("Pipeline cannot be set with these parameters."
                             " Check the name of your stages.")

        if (score == -np.inf):
            warnings.warn(
                "An error occurred while computing the cross "
                "validation mean score. Please check that the parameter values are correct "
                "and that your scoring function is valid and appropriate to the task."
            )

        ##########################################
        #             Reporting scores
        ##########################################

        out = " ("

        for i, s in enumerate(scores[:-1]):
            out = out + "fold " + str(i + 1) + " = " + str(s) + ", "

        if (self.verbose):
            print("")
            print("MEAN SCORE : " + str(self.scoring) + " = " + str(score))
            print("VARIANCE : " + str(np.std(scores)) + out + "fold " +
                  str(i + 2) + " = " + str(scores[-1]) + ")")
            print("CPU time: %s seconds" % (time.time() - start_time))
            print("")

        return score
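As an aside on the multiclass branch above: with a string metric such as 'f1', the weighted-variant lookup resolves as in this small hedged sketch, since 'f1_weighted' is a registered scorer name:

from sklearn.metrics import SCORERS

scoring = "f1"
if scoring + "_weighted" in SCORERS.keys():
    scoring = scoring + "_weighted"
print(scoring)  # -> f1_weighted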
Example #14
import random
from pprint import pprint

from numpy import ravel
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, SCORERS

if __name__ == '__main__':
    data = load_boston()

    model = RandomForestRegressor(n_estimators=10, random_state=0)
    cross = cross_val_score(model,
                            data.data,
                            data.target,
                            cv=10,
                            scoring='neg_mean_squared_error')

    # list of available model evaluation metrics
    k = sorted(SCORERS.keys())
    pprint(k)
Example #15
rmse = math.sqrt(mse)
print(rmse)
'''summary table for coefficients'''
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression as lr
X2_train = sm.add_constant(X_train)  # add a column of 1 beside x col
ols = sm.OLS(
    y_train.astype(float),
    X2_train.astype(float))  # ordinary least square = linear regression
lr = ols.fit()
print(lr.summary())
'''Cross validation'''
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression as lr
from sklearn.metrics import SCORERS
SCORERS.keys()

kf = KFold(n_splits=5, shuffle=True, random_state=1)
lr = lr()
'''Cross validation score (R2 for test data, and full data)'''
r2score = cross_val_score(lr1, X_test, y_test, cv=kf, scoring='r2')
print(r2score.mean())
r2score_b = cross_val_score(lr, X, y, cv=kf, scoring='r2')
print(r2score_b.mean())
'''Cross validation score (RMSE for test data, and full data)'''
RMSE = np.sqrt(-cross_val_score(
    lr1, X_test, y_test, cv=kf, scoring='neg_mean_squared_error'))
print(RMSE.mean())
RMSE_b = np.sqrt(
    -cross_val_score(lr, X, y, cv=kf, scoring='neg_mean_squared_error'))
print(RMSE_b.mean())
Example #16
                  ['grid_search', 'random_search', 'bayesian_search']),
              required=True)
@click.option(
    '--cv',
    help="Number of cross validation steps",
    type=int,
    required=False,
    default=5,
    show_default=True,
)
@click.option(
    '-m',
    '--metrics',
    help=
    "Metrics that should be tested during cross validation (comma separated)",
    type=click.Choice(list(SCORERS.keys())),
    required=False,
    multiple=True,
)
@click.option(
    '--randomize',
    help=
    "Randomize sample labels to test the stability and effectiveness of the machine learning algorithm",
    is_flag=True,
    required=False,
)
def classify(
    data: str,
    out: str,
    model: str,
    optimizer: str,