Example #1
def main():
    try:
        import sklearn
        import joblib

        if sklearn.__version__ < "0.20":
            gs.fatal(
                "Package python3-scikit-learn 0.20 or newer is not installed")

    except ImportError:
        gs.fatal("Package python3-scikit-learn 0.20 or newer is not installed")

    # parser options
    group = options["group"]
    output = options["output"]
    model_load = options["load_model"]
    probability = flags["p"]
    prob_only = flags["z"]
    chunksize = int(options["chunksize"])

    # remove @ from output in case overwriting result
    if "@" in output:
        output = output.split("@")[0]

    # check probabilities=True if prob_only=True
    if prob_only is True and probability is False:
        gs.fatal("Need to set probabilities=True if prob_only=True")

    # reload fitted model and training data
    estimator, y, class_labels = joblib.load(model_load)

    # define RasterStack
    stack = RasterStack(group=group)

    # perform raster prediction
    region = Region()
    row_incr = math.ceil(chunksize / region.cols)

    # do not read by increments if increment > n_rows
    if row_incr >= region.rows:
        row_incr = None

    # prediction
    if prob_only is False:
        gs.message("Predicting classification/regression raster...")
        stack.predict(
            estimator=estimator,
            output=output,
            height=row_incr,
            overwrite=gs.overwrite(),
        )

    if probability is True:
        gs.message("Predicting class probabilities...")
        stack.predict_proba(
            estimator=estimator,
            output=output,
            class_labels=np.unique(y),
            overwrite=gs.overwrite(),
            height=row_incr,
        )

    # assign categories for classification map
    if class_labels is not None and prob_only is False:
        rules = []

        for val, lab in class_labels.items():
            rules.append(",".join([str(val), lab]))

        rules = "\n".join(rules)
        rules_file = string_to_rules(rules)
        r.category(map=output, rules=rules_file, separator="comma")
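The model file loaded above is the (estimator, y, class_labels) tuple written by the companion training script (see Example #4, which ends with joblib.dump((estimator, y, class_labels), model_save)). As a rough, self-contained illustration of producing a compatible file, here is a minimal sketch; the RandomForestClassifier, the random training arrays, and the "rf_model.gz" path are placeholders for illustration, not part of the module.

# Minimal sketch: build and save an (estimator, y, class_labels) tuple that the
# prediction script above could load. Classifier, data and filename are
# hypothetical placeholders.
import joblib
import numpy as np
from sklearn.ensemble import RandomForestClassifier

X = np.random.rand(100, 6)                            # hypothetical predictors (6 bands)
y = np.random.randint(1, 4, 100)                      # hypothetical classes 1..3
class_labels = {1: "water", 2: "forest", 3: "urban"}  # hypothetical categories

estimator = RandomForestClassifier(n_estimators=100, random_state=0)
estimator.fit(X, y)

joblib.dump((estimator, y, class_labels), "rf_model.gz")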
Example #2
from copy import deepcopy

from grass.pygrass.utils import set_path
set_path('r.learn.ml')

from raster import RasterStack

# Landsat band names used to build the stack
maplist = [
    "lsat5_1987_10", "lsat5_1987_20", "lsat5_1987_30", "lsat5_1987_40",
    "lsat5_1987_50", "lsat5_1987_70"
]

stack = RasterStack(rasters=maplist)
stack.lsat5_1987_10

# strip the mapset part (if present) from each name so the bands stay
# accessible as attributes
maplist2 = deepcopy(maplist)
maplist2 = [i.split('@')[0] for i in maplist2]

stack = RasterStack(rasters=maplist2)
stack.lsat5_1987_10

# extract training data at vector point locations (as arrays or a DataFrame)
X, y, crd = stack.extract_points(vect_name='landclass96_roi',
                                 fields=['value', 'cat'])
df = stack.extract_points(vect_name='landclass96_roi',
                          field='value',
                          as_df=True)

# extract training data from the labelled pixels of a response raster
df = stack.extract_pixels(response='landclass96_roi', as_df=True)

X, y, crd = stack.extract_pixels(response='landclass96_roi')

# preview and read the full stack into memory
stack.head()
stack.tail()

data = stack.read()
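The arrays returned by extract_pixels/extract_points can be passed straight to a scikit-learn estimator, and the fitted model can be written back to a new raster with stack.predict. A minimal sketch continuing the session above, assuming stack.predict accepts the same keyword arguments as in Examples #1 and #5; the RandomForestClassifier and the "rf_classification" output name are assumptions, not part of the original snippet.

# Minimal sketch: fit a classifier on the extracted pixels and write the
# prediction back to a raster. Classifier and output name are hypothetical.
from sklearn.ensemble import RandomForestClassifier

X, y, crd = stack.extract_pixels(response='landclass96_roi')
y = y.flatten()

clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X, y)

stack.predict(estimator=clf, output='rf_classification', overwrite=True)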
Example #3
def main():
    try:
        import sklearn

        if sklearn.__version__ < '0.20':
            gs.fatal("Scikit learn 0.20 or newer is required")

    except ImportError:
        gs.fatal("Scikit learn 0.20 or newer is not installed")

    try:
        import pandas as pd

    except ImportError:
        gs.fatal("Pandas is not installed ")

    # -------------------------------------------------------------------------
    # Parser options
    # -------------------------------------------------------------------------

    # required gui section
    group = options['group']
    training_map = options['training_map']
    training_points = options['training_points']
    field = options['field']
    model_save = options['save_model']

    # estimator gui section
    model_name = options['model_name']
    grid_search = options['grid_search']
    hyperparams = {
        'C': options['c'],
        'min_samples_split': options['min_samples_split'],
        'min_samples_leaf': options['min_samples_leaf'],
        'n_estimators': options['n_estimators'],
        'learning_rate': options['learning_rate'],
        'subsample': options['subsample'],
        'max_depth': options['max_depth'],
        'max_features': options['max_features'],
        'max_degree': options['max_degree'],
        'n_neighbors': options['n_neighbors'],
        'weights': options['weights']
    }

    # cross validation
    cv = int(options['cv'])
    group_raster = options['group_raster']
    tune_only = flags['t']
    importances = flags['f']
    n_permutations = int(options['n_permutations'])
    errors_file = options['errors_file']
    preds_file = options['preds_file']
    fimp_file = options['fimp_file']
    param_file = options['param_file']

    # general options
    norm_data = flags['s']
    category_maps = option_to_list(options['category_maps'])
    random_state = int(options['random_state'])
    load_training = options['load_training']
    save_training = options['save_training']
    n_jobs = int(options['n_jobs'])
    balance = flags['b']

    # -------------------------------------------------------------------------
    # Make dicts for hyperparameters, datatypes and parameters for tuning
    # -------------------------------------------------------------------------

    hyperparams_type = dict.fromkeys(hyperparams, int)
    hyperparams_type['C'] = float
    hyperparams_type['learning_rate'] = float
    hyperparams_type['subsample'] = float
    hyperparams_type['weights'] = str
    param_grid = deepcopy(hyperparams_type)
    param_grid = dict.fromkeys(param_grid, None)

    for key, val in hyperparams.items():
        # split any comma separated strings and add them to the param_grid
        if ',' in val:

            # add all vals to param_grid
            param_grid[key] = [
                hyperparams_type[key](i) for i in val.split(',')
            ]

            # use first param for default
            hyperparams[key] = [
                hyperparams_type[key](i) for i in val.split(',')
            ][0]

        # else convert the single strings to int or float
        else:
            hyperparams[key] = hyperparams_type[key](val)

    if hyperparams['max_depth'] == 0:
        hyperparams['max_depth'] = None

    if hyperparams['max_features'] == 0:
        hyperparams['max_features'] = 'auto'

    param_grid = {k: v for k, v in param_grid.items() if v is not None}

    # retrieve sklearn estimator object and parameters
    estimator, mode = model_classifiers(model_name, random_state, n_jobs,
                                        hyperparams, balance)

    # remove dict keys that are incompatible for the selected estimator
    estimator_params = estimator.get_params()
    param_grid = {
        key: value
        for key, value in param_grid.items() if key in estimator_params
    }

    scoring, search_scorer = scoring_metrics(mode)

    # -------------------------------------------------------------------------
    # Error checking of input options
    # -------------------------------------------------------------------------

    # feature importances selected but no cross-validation scheme used
    if importances is True and cv == 1:
        gs.fatal('Feature importances require cross-validation cv > 1')

    # check for field attribute if training_points are used
    if training_points != '' and field == '':
        gs.fatal('No attribute column specified for training points')

    # check that cv > 1 if hyperparameter tuning is selected
    if any(param_grid) is True and cv == 1 and grid_search == 'cross-validation':
        gs.fatal('Hyperparameter search using cross validation requires cv > 1')

    # check that cross-validation occurs if feature importances is True
    if importances is True and tune_only is True:
        gs.fatal('Permutation feature importances require cross validation')

    if importances is True and cv == 1:
        gs.fatal('Permutation feature importances require cv > 1')

    # -------------------------------------------------------------------------
    # Define RasterStack
    # -------------------------------------------------------------------------

    # fetch individual raster names from group
    maplist = gs.read_command("i.group", group=group,
                              flags="g").split(os.linesep)[:-1]

    # create RasterStack
    stack = RasterStack(rasters=maplist)

    if category_maps is not None:
        stack.categorical = category_maps

    # -------------------------------------------------------------------------
    # Extract training data
    # -------------------------------------------------------------------------

    # Sample training data and group id
    if load_training != '':
        X, y, group_id, sample_coords = load_training_data(load_training)
    else:
        gs.message('Extracting training data')

        # append spatial clumps or group raster to the predictors
        if group_raster != '':
            stack.append(group_raster)

        # extract training data
        if training_map != '':
            X, y, sample_coords = stack.extract_pixels(training_map)
        elif training_points != '':
            X, y, sample_coords = stack.extract_points(training_points, field)

        y = y.flatten()  # reshape to 1 dimension

        # take group id from last column and remove from predictors
        if group_raster != '':
            group_id = X[:, -1]
            X = np.delete(X, -1, axis=1)
            stack.drop(group_raster)
        else:
            group_id = None

        # check for labelled pixels and training data
        if y.shape[0] == 0 or X.shape[0] == 0:
            gs.fatal('No training pixels or pixels in imagery group '
                     '...check computational region')

        # shuffle data
        from sklearn.utils import shuffle

        if group_id is None:
            X, y, sample_coords = shuffle(X,
                                          y,
                                          sample_coords,
                                          random_state=random_state)
        else:
            X, y, sample_coords, group_id = shuffle(X,
                                                    y,
                                                    sample_coords,
                                                    group_id,
                                                    random_state=random_state)

        # optionally save extracted data to .csv file
        if save_training != '':
            save_training_data(X, y, group_id, sample_coords, save_training)

    # ---------------------------------------------------------------------
    # Define the inner search resampling method
    # ---------------------------------------------------------------------

    from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                         GroupKFold, KFold, ShuffleSplit,
                                         GroupShuffleSplit)

    # define inner resampling using cross-validation method
    if any(param_grid) is True and grid_search == 'cross-validation':

        if group_id is None and mode == 'classification':
            inner = StratifiedKFold(n_splits=cv, random_state=random_state)

        elif group_id is None and mode == 'regression':
            inner = KFold(n_splits=cv, random_state=random_state)

        else:
            inner = GroupKFold(n_splits=cv)

    # define inner resampling using the holdout method
    elif any(param_grid) is True and grid_search == 'holdout':

        if group_id is None:
            inner = ShuffleSplit(n_splits=1,
                                 test_size=0.33,
                                 random_state=random_state)

        else:
            inner = GroupShuffleSplit(n_splits=1,
                                      test_size=0.33,
                                      random_state=random_state)
    else:
        inner = None

    # ---------------------------------------------------------------------
    # Define the outer search resampling method
    # ---------------------------------------------------------------------
    if cv > 1:

        if group_id is None and mode == 'classification':
            outer = StratifiedKFold(n_splits=cv, random_state=random_state)

        elif group_id is None and mode == 'regression':
            outer = KFold(n_splits=cv, random_state=random_state)

        else:
            outer = GroupKFold(n_splits=cv)

    # ---------------------------------------------------------------------
    # Define sample weights for estimators that require weights in fit method
    # ---------------------------------------------------------------------

    # estimators that take sample_weights
    if balance is True and mode == 'classification' and model_name in (
            'GradientBoostingClassifier', 'GaussianNB'):

        from sklearn.utils import compute_class_weight

        # passing classes=y yields one 'balanced' weight per sample, which is
        # later passed to the fit method as sample_weight
        class_weights = compute_class_weight(class_weight='balanced',
                                             classes=(y),
                                             y=y)

    else:
        class_weights = None

    # ---------------------------------------------------------------------
    # Define the preprocessing pipeline
    # ---------------------------------------------------------------------

    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer

    # standardization only
    if norm_data is True and category_maps is None:
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()

        trans = ColumnTransformer(
            remainder='passthrough',
            transformers=[('scaling', scaler, np.arange(0, stack.count))])

    # onehot encoding only
    elif norm_data is False and category_maps is not None:
        from sklearn.preprocessing import OneHotEncoder
        enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

        trans = ColumnTransformer(
            remainder='passthrough',
            transformers=[('onehot', enc, stack.categorical)])

    # standardization and onehot encoding
    elif norm_data is True and category_maps is not None:
        from sklearn.preprocessing import StandardScaler, OneHotEncoder
        scaler = StandardScaler()
        enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

        trans = ColumnTransformer(
            remainder='passthrough',
            transformers=[('onehot', enc, stack.categorical),
                          ('scaling', scaler,
                           np.setxor1d(range(stack.count),
                                       stack.categorical).astype('int'))])

    # combine transformers
    if norm_data is True or category_maps is not None:
        estimator = Pipeline([('preprocessing', trans),
                              ('estimator', estimator)])

    # ---------------------------------------------------------------------
    # Create the hyperparameter grid search method
    # ---------------------------------------------------------------------

    # check if dict contains any keys - perform GridSearchCV
    if any(param_grid) is True:

        # if Pipeline then change param_grid keys to named_step
        if isinstance(estimator, Pipeline):
            for key in param_grid.keys():
                newkey = 'estimator__' + key
                param_grid[newkey] = param_grid.pop(key)

        # create grid search method
        estimator = GridSearchCV(estimator=estimator,
                                 param_grid=param_grid,
                                 scoring=search_scorer,
                                 n_jobs=n_jobs,
                                 cv=inner)

    # ---------------------------------------------------------------------
    # Estimator training
    # ---------------------------------------------------------------------

    gs.message(os.linesep)
    gs.message(('Fitting model using ' + model_name))

    # fitting ensuring that all options are passed
    if model_name in ('GradientBoostingClassifier',
                      'GaussianNB') and balance is True:
        if isinstance(estimator, Pipeline):
            fit_params = {'estimator__sample_weight': class_weights}
        else:
            fit_params = {'sample_weight': class_weights}
    else:
        fit_params = {}

    if isinstance(inner, (GroupKFold, GroupShuffleSplit)):
        estimator.fit(X, y, groups=group_id, **fit_params)
    else:
        estimator.fit(X, y, **fit_params)

    # message best hyperparameter setup and optionally save using pandas
    if any(param_grid) is True:
        gs.message(os.linesep)
        gs.message('Best parameters:')
        gs.message(str(estimator.best_params_))
        if param_file != '':
            param_df = pd.DataFrame(estimator.cv_results_)
            param_df.to_csv(param_file)

    # ---------------------------------------------------------------------
    # Cross-validation
    # ---------------------------------------------------------------------


#        from sklearn.model_selection import cross_validate
#        scores = cross_validate(estimator, X, y, group_id, scoring, outer, n_jobs, fit_params=fit_params)
#        gs.message(scores)
#        test_scoring = ['test_' + i for i in scoring]
#        gs.message(os.linesep)
#        gs.message(('Metric \t Mean \t Error'))
#        for sc in test_scoring:
#            gs.message(sc + '\t' + str(scores[sc].mean()) + '\t' + str(scores[sc].std()))

    if cv > 1 and tune_only is not True:

        if mode == 'classification' and cv > np.histogram(
                y, bins=np.unique(y))[0].min():
            gs.message(os.linesep)
            gs.fatal('Number of cv folds is greater than number of ' +
                     'samples in some classes')

        gs.message(os.linesep)
        gs.message("Cross validation global performance measures......:")

        # add auc and mcc as scorer if classification is binary
        if mode == 'classification' and \
            len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
            scoring.append('roc_auc')
            scoring.append('matthews_corrcoef')

        # perform the cross-validatation
        scores, cscores, fimp, models, preds = cross_val_scores(
            estimator, X, y, group_id, class_weights, outer, scoring,
            importances, n_permutations, random_state, n_jobs)

        preds = np.hstack((preds, sample_coords))

        for method, val in scores.items():
            gs.message(method + ":\t%0.3f\t+/-SD\t%0.3f" %
                       (val.mean(), val.std()))

        # individual class scores
        if mode == 'classification' and len(np.unique(y)) != 2:

            gs.message(os.linesep)
            gs.message('Cross validation class performance measures......:')
            gs.message('Class \t' + '\t'.join(map(str, np.unique(y))))

            for method, val in cscores.items():
                mat_cscores = np.matrix(val)
                gs.message(method + ':\t' + '\t'.join(
                    map(str,
                        np.round(mat_cscores.mean(axis=0), 2)[0])))
                gs.message(
                    method + ' std:\t' +
                    '\t'.join(map(str,
                                  np.round(mat_cscores.std(axis=0), 2)[0])))

        # write cross-validation results for csv file
        if errors_file != '':
            errors = pd.DataFrame(scores)
            errors.to_csv(errors_file, mode='w')

        # write cross-validation predictions to csv file
        if preds_file != '':
            preds = pd.DataFrame(preds)
            preds.columns = ['y_true', 'y_pred', 'fold', 'x', 'y']
            preds.to_csv(preds_file, mode='w')
            text_file = open(preds_file + 't', "w")
            text_file.write('"Integer","Real","Real","integer","Real","Real"')
            text_file.close()

        # feature importances
        if importances is True:
            gs.message(os.linesep)
            gs.message("Feature importances")
            gs.message("id" + "\t" + "Raster" + "\t" + "Importance")

            # mean of cross-validation feature importances
            for i in range(len(fimp.mean(axis=0))):
                gs.message(
                    str(i) + "\t" + maplist[i] + "\t" +
                    str(round(fimp.mean(axis=0)[i], 4)))

            if fimp_file != '':
                np.savetxt(fname=fimp_file,
                           X=fimp,
                           delimiter=',',
                           header=','.join(maplist),
                           comments='')

    # Save the fitted model
    from sklearn.externals import joblib
    joblib.dump((X, y, sample_coords, group_id, estimator), model_save)
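The hyperparameter handling above turns each comma-separated option string into a list for the grid search while keeping the first value as the default. A standalone sketch of that logic, with made-up option strings:

# Standalone sketch of the comma-separated hyperparameter parsing used above,
# with made-up option values.
hyperparams = {'n_estimators': '100,200,500', 'learning_rate': '0.01'}
hyperparams_type = {'n_estimators': int, 'learning_rate': float}
param_grid = {}

for key, val in hyperparams.items():
    if ',' in val:
        # comma-separated values define the search grid
        param_grid[key] = [hyperparams_type[key](i) for i in val.split(',')]
        # the first value becomes the default setting
        hyperparams[key] = param_grid[key][0]
    else:
        hyperparams[key] = hyperparams_type[key](val)

print(hyperparams)  # {'n_estimators': 100, 'learning_rate': 0.01}
print(param_grid)   # {'n_estimators': [100, 200, 500]}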
Example #4
def main():
    try:
        import sklearn

        if sklearn.__version__ < "0.20":
            gs.fatal("Package python3-scikit-learn 0.20 or newer is not installed")

    except ImportError:
        gs.fatal("Package python3-scikit-learn 0.20 or newer is not installed")

    try:
        import pandas as pd
        
    except ImportError:
        gs.fatal("Package python3-pandas 0.25 or newer is not installed")

    # parser options ---------------------------------------------------------------------------------------------------
    group = options["group"]
    training_map = options["training_map"]
    training_points = options["training_points"]
    field = options["field"]
    model_save = options["save_model"]
    model_name = options["model_name"]
    hyperparams = {
        "penalty": options["penalty"],
        "alpha": options["alpha"],
        "l1_ratio": options["l1_ratio"],
        "C": options["c"],
        "epsilon": options["epsilon"],
        "min_samples_leaf": options["min_samples_leaf"],
        "n_estimators": options["n_estimators"],
        "learning_rate": options["learning_rate"],
        "subsample": options["subsample"],
        "max_depth": options["max_depth"],
        "max_features": options["max_features"],
        "n_neighbors": options["n_neighbors"],
        "weights": options["weights"],
        "hidden_layer_sizes": options["hidden_units"],
    }
    cv = int(options["cv"])
    group_raster = options["group_raster"]
    importances = flags["f"]
    preds_file = options["preds_file"]
    classif_file = options["classif_file"]
    fimp_file = options["fimp_file"]
    param_file = options["param_file"]
    norm_data = flags["s"]
    random_state = int(options["random_state"])
    load_training = options["load_training"]
    save_training = options["save_training"]
    n_jobs = int(options["n_jobs"])
    balance = flags["b"]
    category_maps = option_to_list(options["category_maps"])

    # define estimator -------------------------------------------------------------------------------------------------
    hyperparams, param_grid = process_param_grid(hyperparams)
    estimator, mode = predefined_estimators(
        model_name, random_state, n_jobs, hyperparams
    )

    # remove dict keys that are incompatible for the selected estimator
    estimator_params = estimator.get_params()
    param_grid = {
        key: value for key, value in param_grid.items() if key in estimator_params
    }
    scoring, search_scorer = scoring_metrics(mode)

    # checks of input options ------------------------------------------------------------------------------------------
    if (
        mode == "classification"
        and balance is True
        and model_name not in check_class_weights()
    ):

        gs.warning(model_name + " does not support class weights")
        balance = False

    if mode == "regression" and balance is True:
        gs.warning("Balancing of class weights is only possible for classification")
        balance = False

    if classif_file:
        if cv <= 1:
            gs.fatal(
                "Output of cross-validation global accuracy requires cross-validation cv > 1"
            )
        if not os.path.exists(os.path.dirname(classif_file)):
            gs.fatal("Directory for output file {} does not exist".format(classif_file))

    # feature importance file selected but no cross-validation scheme used
    if importances:
        if sklearn.__version__ < "0.22":
            gs.fatal("Feature importances calculation requires scikit-learn version >= 0.22")

    if fimp_file:
        if importances is False:
            gs.fatal('Output of feature importance requires the "f" flag to be set')
        if not os.path.exists(os.path.dirname(fimp_file)):
            gs.fatal("Directory for output file {} does not exist".format(fimp_file))

    # predictions file selected but no cross-validation scheme used
    if preds_file:
        if cv <= 1:
            gs.fatal(
                "Output of cross-validation predictions requires cross-validation cv > 1"
            )
        if not os.path.exists(os.path.dirname(preds_file)):
            gs.fatal("Directory for output file {} does not exist".format(preds_file))

    # define RasterStack -----------------------------------------------------------------------------------------------
    stack = RasterStack(group=group)

    if category_maps is not None:
        stack.categorical = category_maps

    # extract training data --------------------------------------------------------------------------------------------
    if load_training != "":
        X, y, cat, class_labels, group_id = load_training_data(load_training)

        if class_labels is not None:
            a = pd.DataFrame({"response": y, "labels": class_labels})
            a = a.drop_duplicates().values
            class_labels = {k: v for (k, v) in a}

    else:
        gs.message("Extracting training data")

        if group_raster != "":
            stack.append(group_raster)

        if training_map != "":
            X, y, cat = stack.extract_pixels(training_map)
            y = y.flatten()

            with RasterRow(training_map) as src:
                class_labels = {v: k for (k, v, m) in src.cats}

                if "" in class_labels.values():
                    class_labels = None

        elif training_points != "":
            X, y, cat = stack.extract_points(training_points, field)
            y = y.flatten()
            
            if y.dtype == np.object_:
                from sklearn.preprocessing import LabelEncoder
                le = LabelEncoder()
                y = le.fit_transform(y)
                class_labels = {k: v for (k, v) in enumerate(le.classes_)}
            else:
                class_labels = None

        # take group id from last column and remove from predictors
        if group_raster != "":
            group_id = X[:, -1]
            X = np.delete(X, -1, axis=1)
            stack.drop(group_raster)
        else:
            group_id = None

        # check for labelled pixels and training data
        if y.shape[0] == 0 or X.shape[0] == 0:
            gs.fatal(
                "No training pixels or pixels in imagery group "
                "...check computational region"
            )

        from sklearn.utils import shuffle

        if group_id is None:
            X, y, cat = shuffle(X, y, cat, random_state=random_state)
        else:
            X, y, cat, group_id = shuffle(
                X, y, cat, group_id, random_state=random_state
            )

        if save_training != "":
            save_training_data(
                save_training, X, y, cat, class_labels, group_id, stack.names
            )

    # cross validation settings ----------------------------------------------------------------------------------------
    # inner resampling method (cv=2)
    from sklearn.model_selection import GridSearchCV, StratifiedKFold, GroupKFold, KFold

    if any(param_grid) is True:
        if group_id is None and mode == "classification":
            inner = StratifiedKFold(n_splits=2, random_state=random_state)
        elif group_id is None and mode == "regression":
            inner = KFold(n_splits=2, random_state=random_state)
        else:
            inner = GroupKFold(n_splits=2)
    else:
        inner = None

    # outer resampling method (cv=cv)
    if cv > 1:
        if group_id is None and mode == "classification":
            outer = StratifiedKFold(n_splits=cv, random_state=random_state)
        elif group_id is None and mode == "regression":
            outer = KFold(n_splits=cv, random_state=random_state)
        else:
            outer = GroupKFold(n_splits=cv)

    # modify estimators that take sample_weights -----------------------------------------------------------------------
    if balance is True:
        from sklearn.utils import compute_class_weight

        # passing classes=y yields one 'balanced' weight per sample, suitable
        # for use as sample_weight in the fit method
        class_weights = compute_class_weight(class_weight="balanced", classes=(y), y=y)
        fit_params = {"sample_weight": class_weights}

    else:
        class_weights = None
        fit_params = {}

    # preprocessing ----------------------------------------------------------------------------------------------------
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder

    # standardization
    if norm_data is True and category_maps is None:
        scaler = StandardScaler()
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[("scaling", scaler, np.arange(0, stack.count))],
        )

    # one-hot encoding
    elif norm_data is False and category_maps is not None:
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
        trans = ColumnTransformer(
            remainder="passthrough", transformers=[("onehot", enc, stack.categorical)]
        )

    # standardization and one-hot encoding
    elif norm_data is True and category_maps is not None:
        scaler = StandardScaler()
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[
                ("onehot", enc, stack.categorical),
                ("scaling", scaler, np.setxor1d(
                    range(stack.count), stack.categorical).astype('int')),
            ],
        )

    # combine transformers
    if norm_data is True or category_maps is not None:
        estimator = Pipeline([("preprocessing", trans), ("estimator", estimator)])
        param_grid = wrap_named_step(param_grid)
        fit_params = wrap_named_step(fit_params)

    if any(param_grid) is True:
        estimator = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            scoring=search_scorer,
            n_jobs=n_jobs,
            cv=inner,
        )

    # estimator training -----------------------------------------------------------------------------------------------
    gs.message(os.linesep)
    gs.message(("Fitting model using " + model_name))
    if balance is True and group_id is not None:
        estimator.fit(X, y, groups=group_id, **fit_params)
    elif balance is True and group_id is None:
        estimator.fit(X, y, **fit_params)
    else:
        estimator.fit(X, y)

    # message best hyperparameter setup and optionally save using pandas
    if any(param_grid) is True:
        gs.message(os.linesep)
        gs.message("Best parameters:")

        optimal_pars = [
            (k.replace("estimator__", "").replace("selection__", "") + " = " + str(v))
            for (k, v) in estimator.best_params_.items()
        ]

        for i in optimal_pars:
            gs.message(i)

        if param_file != "":
            param_df = pd.DataFrame(estimator.cv_results_)
            param_df.to_csv(param_file)

    # cross-validation -------------------------------------------------------------------------------------------------
    if cv > 1:
        from sklearn.metrics import classification_report
        from sklearn import metrics

        if (
            mode == "classification"
            and cv > np.histogram(y, bins=np.unique(y))[0].min()
        ):
            gs.message(os.linesep)
            gs.fatal(
                "Number of cv folds is greater than number of "
                "samples in some classes"
            )

        gs.message(os.linesep)
        gs.message("Cross validation global performance measures......:")

        if (
            mode == "classification"
            and len(np.unique(y)) == 2
            and all([0, 1] == np.unique(y))
        ):
            scoring["roc_auc"] = metrics.roc_auc_score

        from sklearn.model_selection import cross_val_predict

        preds = cross_val_predict(
            estimator, X, y, group_id, cv=outer, n_jobs=n_jobs, fit_params=fit_params
        )

        test_idx = [test for train, test in outer.split(X, y, groups=group_id)]
        n_fold = np.zeros((0,))

        for fold in range(outer.get_n_splits()):
            n_fold = np.hstack((n_fold, np.repeat(fold, test_idx[fold].shape[0])))

        preds = {"y_pred": preds, "y_true": y, "cat": cat, "fold": n_fold}

        preds = pd.DataFrame(data=preds, columns=["y_pred", "y_true", "cat", "fold"])
        gs.message(os.linesep)
        gs.message("Global cross validation scores...")
        gs.message(os.linesep)
        gs.message("Metric \t Mean \t Error")

        for name, func in scoring.items():
            score_mean = (
                preds.groupby("fold")
                .apply(lambda x: func(x["y_true"], x["y_pred"]))
                .mean()
            )

            score_std = (
                preds.groupby("fold")
                .apply(lambda x: func(x["y_true"], x["y_pred"]))
                .std()
            )

            gs.message(
                name + "\t" + str(score_mean.round(3)) + "\t" + str(score_std.round(3))
            )

        if mode == "classification":
            gs.message(os.linesep)
            gs.message("Cross validation class performance measures......:")

            report_str = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=False,
            )

            report = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=True,
            )
            report = pd.DataFrame(report)

            gs.message(report_str)

            if classif_file != "":
                report.to_csv(classif_file, mode="w", index=True)

        # write cross-validation predictions to csv file
        if preds_file != "":
            preds.to_csv(preds_file, mode="w", index=False)
            text_file = open(preds_file + "t", "w")
            text_file.write('"Real", "Real", "integer", "integer"')
            text_file.close()

    # feature importances ----------------------------------------------------------------------------------------------
    if importances is True:
        from sklearn.inspection import permutation_importance

        fimp = permutation_importance(
            estimator,
            X,
            y,
            scoring=search_scorer,
            n_repeats=5,
            n_jobs=n_jobs,
            random_state=random_state,
        )

        feature_names = deepcopy(stack.names)
        feature_names = [i.split("@")[0] for i in feature_names]

        fimp = pd.DataFrame(
            {
                "feature": feature_names,
                "importance": fimp["importances_mean"],
                "std": fimp["importances_std"],
            }
        )

        gs.message(os.linesep)
        gs.message("Feature importances")
        gs.message("Feature" + "\t" + "Score")

        for index, row in fimp.iterrows():
            gs.message(
                row["feature"] + "\t" + str(row["importance"]) + "\t" + str(row["std"])
            )

        if fimp_file != "":
            fimp.to_csv(fimp_file, index=False)

    # save the fitted model
    import joblib

    joblib.dump((estimator, y, class_labels), model_save)
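The wrap_named_step helper used above is not shown in these excerpts. Based on the explicit 'estimator__' + key renaming loop in Example #3, a plausible minimal equivalent might look like the sketch below; this is an assumption about the helper, not the module's actual implementation.

# Hypothetical minimal equivalent of wrap_named_step, inferred from the
# 'estimator__' + key renaming in Example #3; the real helper may differ.
def wrap_named_step(params, step="estimator"):
    """Prefix each key with '<step>__' so GridSearchCV and fit() address the
    estimator inside the Pipeline rather than the Pipeline itself."""
    return {step + "__" + k: v for k, v in params.items()}

print(wrap_named_step({"n_estimators": [100, 200]}))
# {'estimator__n_estimators': [100, 200]}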
Example #5
def main():
    try:
        import sklearn
        from sklearn.externals import joblib

        if sklearn.__version__ < '0.20':
            gs.fatal("Scikit learn 0.20 or newer is required")

    except ImportError:
        gs.fatal("Scikit learn 0.20 or newer is not installed")

    # -------------------------------------------------------------------------
    # Parser options
    # -------------------------------------------------------------------------

    group = options['group']
    output = options['output']
    model_load = options['load_model']
    probability = flags['p']
    prob_only = flags['z']
    chunksize = int(options['chunksize'])

    # remove @ from output in case overwriting result
    if '@' in output:
        output = output.split('@')[0]

    # check that probabilities=True if prob_only=True
    if prob_only is True and probability is False:
        gs.fatal('Need to set probabilities=True if prob_only=True')

    # -------------------------------------------------------------------------
    # Reload fitted model and training data
    # -------------------------------------------------------------------------
    X, y, sample_coords, group_id, estimator = joblib.load(model_load)

    # -------------------------------------------------------------------------
    # Define RasterStack
    # -------------------------------------------------------------------------

    # fetch individual raster names from group
    maplist = gs.read_command("i.group", group=group,
                              flags="g").split(os.linesep)[:-1]

    # create RasterStack
    stack = RasterStack(rasters=maplist)

    # -------------------------------------------------------------------------
    # Perform raster prediction
    # -------------------------------------------------------------------------

    # calculate chunksize
    row = stack.read(1)
    rowsize_mg = row.nbytes * 1e-6
    row_incr = int(float(chunksize) / float(rowsize_mg))

    # prediction
    if prob_only is False:

        gs.message('Predicting classification/regression raster...')

        stack.predict(estimator=estimator,
                      output=output,
                      height=row_incr,
                      overwrite=gs.overwrite())

    if probability is True:

        gs.message('Predicting class probabilities...')

        stack.predict_proba(estimator=estimator,
                            output=output,
                            class_labels=np.unique(y),
                            overwrite=gs.overwrite(),
                            height=row_incr)
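The chunk-size arithmetic above converts a memory budget in MB into the number of raster rows to read per iteration. A worked sketch with made-up dimensions:

# Worked sketch of the chunk-size arithmetic above, with made-up dimensions:
# one row of a 6-band float64 stack with 1500 columns occupies
# 6 * 1500 * 8 bytes = 72000 bytes = 0.072 MB, so a 500 MB chunksize allows
# int(500 / 0.072) = 6944 rows per read.
import numpy as np

row = np.zeros((6, 1, 1500), dtype=np.float64)  # hypothetical single-row read
rowsize_mg = row.nbytes * 1e-6                  # 0.072 MB per row
row_incr = int(float(500) / float(rowsize_mg))  # 6944 rows per chunk
print(rowsize_mg, row_incr)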