Beispiel #1
0
 def test_append_with_copy(self):
     """Append another grass raster to a RasterStack not modifying
     in_place"""
     stack = RasterStack(self.predictors[0:5])
     other = stack.append(self.predictors[5], in_place=False)
     self.assertListEqual(other.names, self.predictors)
     self.assertListEqual(stack.names, self.predictors[0:5])
Beispiel #2
0
    def test_append_identical(self):
        """Append another grass raster containing a single layer with an
        identical name and mapset

        Expected that this will fail because the name and mapset is identical
        and a RasterStack has index keys based on fullnames
        """
        stack = RasterStack(self.predictors)
        stack.append(self.predictors[5], in_place=True)
Beispiel #3
0
    def test_append_diff_mapset(self):
        """Append another grass raster containing a single layer that has an
        identical name but is in a different mapset

        Expected that this will pass because the name is identical but the
        mapset is different and a RasterStack has index keys based on
        fullnames
        """
        stack = RasterStack(self.predictors)
        stack.append(self.other, in_place=True)
Beispiel #4
0
 def setUp(self) -> None:
     """Setup that is run before every test"""
     self.stack = RasterStack(self.predictors)
Beispiel #5
0
def main():
    try:
        import sklearn

        if sklearn.__version__ < "0.20":
            gs.fatal(
                "Package python3-scikit-learn 0.20 or newer is not installed")

    except ImportError:
        gs.fatal("Package python3-scikit-learn 0.20 or newer is not installed")

    try:
        import pandas as pd

    except ImportError:
        gs.fatal("Package python3-pandas 0.25 or newer is not installed")

    # parser options ----------------------------------------------------------
    group = options["group"]
    training_map = options["training_map"]
    training_points = options["training_points"]
    field = options["field"]
    model_save = options["save_model"]
    model_name = options["model_name"]
    hyperparams = {
        "penalty": options["penalty"],
        "alpha": options["alpha"],
        "l1_ratio": options["l1_ratio"],
        "C": options["c"],
        "epsilon": options["epsilon"],
        "min_samples_leaf": options["min_samples_leaf"],
        "n_estimators": options["n_estimators"],
        "learning_rate": options["learning_rate"],
        "subsample": options["subsample"],
        "max_depth": options["max_depth"],
        "max_features": options["max_features"],
        "n_neighbors": options["n_neighbors"],
        "weights": options["weights"],
        "hidden_layer_sizes": options["hidden_units"],
    }
    cv = int(options["cv"])
    group_raster = options["group_raster"]
    importances = flags["f"]
    preds_file = options["preds_file"]
    classif_file = options["classif_file"]
    fimp_file = options["fimp_file"]
    param_file = options["param_file"]
    norm_data = flags["s"]
    random_state = int(options["random_state"])
    load_training = options["load_training"]
    save_training = options["save_training"]
    n_jobs = int(options["n_jobs"])
    balance = flags["b"]
    category_maps = option_to_list(options["category_maps"])

    # define estimator --------------------------------------------------------
    hyperparams, param_grid = process_param_grid(hyperparams)
    estimator, mode = predefined_estimators(model_name, random_state, n_jobs,
                                            hyperparams)

    # remove dict keys that are incompatible for the selected estimator
    estimator_params = estimator.get_params()
    param_grid = {
        key: value
        for key, value in param_grid.items() if key in estimator_params
    }
    scoring, search_scorer = scoring_metrics(mode)

    # checks of input options -------------------------------------------------
    if (mode == "classification" and balance is True
            and model_name not in check_class_weights()):
        gs.warning(model_name + " does not support class weights")
        balance = False

    if mode == "regression" and balance is True:
        gs.warning(
            "Balancing of class weights is only possible for classification")
        balance = False

    if classif_file:
        if cv <= 1:
            gs.fatal("Output of cross-validation global accuracy requires "
                     "cross-validation cv > 1")

        if not os.path.exists(os.path.dirname(classif_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                classif_file))

    # feature importance file selected but no cross-validation scheme used
    if importances:
        if sklearn.__version__ < "0.22":
            gs.fatal("Feature importances calculation requires scikit-learn "
                     "version >= 0.22")

    if fimp_file:
        if importances is False:
            gs.fatal(
                'Output of feature importance requires the "f" flag to be set')

        if not os.path.exists(os.path.dirname(fimp_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                fimp_file))

    # predictions file selected but no cross-validation scheme used
    if preds_file:
        if cv <= 1:
            gs.fatal("Output of cross-validation predictions requires "
                     "cross-validation cv > 1")

        if not os.path.exists(os.path.dirname(preds_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                preds_file))

    # define RasterStack ------------------------------------------------------
    stack = RasterStack(group=group)

    if category_maps is not None:
        stack.categorical = category_maps

    # extract training data ---------------------------------------------------
    if load_training != "":
        X, y, cat, class_labels, group_id = load_training_data(load_training)

        if class_labels is not None:
            a = pd.DataFrame({"response": y, "labels": class_labels})
            a = a.drop_duplicates().values
            class_labels = {k: v for (k, v) in a}

    else:
        gs.message("Extracting training data")

        if group_raster != "":
            stack.append(group_raster)

        if training_map != "":
            X, y, cat = stack.extract_pixels(training_map)
            y = y.flatten()

            with RasterRow(training_map) as src:

                if mode == "classification":
                    src_cats = {v: k for (k, v, m) in src.cats}
                    class_labels = {k: k for k in np.unique(y)}
                    class_labels.update(src_cats)
                else:
                    class_labels = None

        elif training_points != "":
            X, y, cat = stack.extract_points(training_points, field)
            y = y.flatten()

            if y.dtype in (np.object_, np.object):
                from sklearn.preprocessing import LabelEncoder
                le = LabelEncoder()
                y = le.fit_transform(y)
                class_labels = {k: v for (k, v) in enumerate(le.classes_)}
            else:
                class_labels = None

        # take group id from last column and remove from predictors
        if group_raster != "":
            group_id = X[:, -1]
            X = np.delete(X, -1, axis=1)
            stack.drop(group_raster)
        else:
            group_id = None

        # check for labelled pixels and training data
        if y.shape[0] == 0 or X.shape[0] == 0:
            gs.fatal("No training pixels or pixels in imagery group ...check "
                     "computational region")

        from sklearn.utils import shuffle

        if group_id is None:
            X, y, cat = shuffle(X, y, cat, random_state=random_state)
        else:
            X, y, cat, group_id = shuffle(X,
                                          y,
                                          cat,
                                          group_id,
                                          random_state=random_state)

        if save_training != "":
            save_training_data(save_training, X, y, cat, class_labels,
                               group_id, stack.names)

    # cross validation settings -----------------------------------------------
    # inner resampling method (cv=2)
    from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                         GroupKFold, KFold)

    if any(param_grid) is True:
        if group_id is None and mode == "classification":
            inner = StratifiedKFold(n_splits=3)
        elif group_id is None and mode == "regression":
            inner = KFold(n_splits=3)
        else:
            inner = GroupKFold(n_splits=3)
    else:
        inner = None

    # outer resampling method (cv=cv)
    if cv > 1:
        if group_id is None and mode == "classification":
            outer = StratifiedKFold(n_splits=cv)
        elif group_id is None and mode == "regression":
            outer = KFold(n_splits=cv)
        else:
            outer = GroupKFold(n_splits=cv)

    # modify estimators that take sample_weights ------------------------------
    if balance is True:
        from sklearn.utils import compute_class_weight

        class_weights = compute_class_weight(class_weight="balanced",
                                             classes=(y),
                                             y=y)
        fit_params = {"sample_weight": class_weights}

    else:
        class_weights = None
        fit_params = {}

    # preprocessing -----------------------------------------------------------
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder

    # standardization
    if norm_data is True and category_maps is None:
        scaler = StandardScaler()
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[("scaling", scaler, np.arange(0, stack.count))],
        )

    # one-hot encoding
    elif norm_data is False and category_maps is not None:
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
        trans = ColumnTransformer(remainder="passthrough",
                                  transformers=[("onehot", enc,
                                                 stack.categorical)])

    # standardization and one-hot encoding
    elif norm_data is True and category_maps is not None:
        scaler = StandardScaler()
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[
                ("onehot", enc, stack.categorical),
                ("scaling", scaler,
                 np.setxor1d(range(stack.count),
                             stack.categorical).astype('int')),
            ],
        )

    # combine transformers
    if norm_data is True or category_maps is not None:
        estimator = Pipeline([("preprocessing", trans),
                              ("estimator", estimator)])
        param_grid = wrap_named_step(param_grid)
        fit_params = wrap_named_step(fit_params)

    if any(param_grid) is True:
        estimator = GridSearchCV(estimator=estimator,
                                 param_grid=param_grid,
                                 scoring=search_scorer,
                                 n_jobs=n_jobs,
                                 cv=inner)

    # estimator training ------------------------------------------------------
    gs.message(os.linesep)
    gs.message(("Fitting model using " + model_name))
    if balance is True and group_id is not None:
        estimator.fit(X, y, groups=group_id, **fit_params)
    elif balance is True and group_id is None:
        estimator.fit(X, y, **fit_params)
    else:
        estimator.fit(X, y)

    # message best hyperparameter setup and optionally save using pandas
    if any(param_grid) is True:
        gs.message(os.linesep)
        gs.message("Best parameters:")

        optimal_pars = [
            (k.replace("estimator__", "").replace("selection__", "") + " = " +
             str(v)) for (k, v) in estimator.best_params_.items()
        ]

        for i in optimal_pars:
            gs.message(i)

        if param_file != "":
            param_df = pd.DataFrame(estimator.cv_results_)
            param_df.to_csv(param_file)

    # cross-validation --------------------------------------------------------
    if cv > 1:
        from sklearn.metrics import classification_report
        from sklearn import metrics

        if (mode == "classification"
                and cv > np.histogram(y, bins=np.unique(y))[0].min()):
            gs.message(os.linesep)
            gs.fatal("Number of cv folds is greater than number of samples in "
                     "some classes ")

        gs.message(os.linesep)
        gs.message("Cross validation global performance measures......:")

        if (mode == "classification" and len(np.unique(y)) == 2
                and all([0, 1] == np.unique(y))):
            scoring["roc_auc"] = metrics.roc_auc_score

        from sklearn.model_selection import cross_val_predict

        preds = cross_val_predict(estimator,
                                  X,
                                  y,
                                  group_id,
                                  cv=outer,
                                  n_jobs=n_jobs,
                                  fit_params=fit_params)

        test_idx = [test for train, test in outer.split(X, y)]
        n_fold = np.zeros((0, ))

        for fold in range(outer.get_n_splits()):
            n_fold = np.hstack((n_fold, np.repeat(fold,
                                                  test_idx[fold].shape[0])))

        preds = {"y_pred": preds, "y_true": y, "cat": cat, "fold": n_fold}

        preds = pd.DataFrame(data=preds,
                             columns=["y_pred", "y_true", "cat", "fold"])
        gs.message(os.linesep)
        gs.message("Global cross validation scores...")
        gs.message(os.linesep)
        gs.message("Metric \t Mean \t Error")

        for name, func in scoring.items():
            score_mean = (preds.groupby("fold").apply(
                lambda x: func(x["y_true"], x["y_pred"])).mean())

            score_std = (preds.groupby("fold").apply(
                lambda x: func(x["y_true"], x["y_pred"])).std())

            gs.message(name + "\t" + str(score_mean.round(3)) + "\t" +
                       str(score_std.round(3)))

        if mode == "classification":
            gs.message(os.linesep)
            gs.message("Cross validation class performance measures......:")

            report_str = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=False,
            )

            report = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=True,
            )
            report = pd.DataFrame(report)

            gs.message(report_str)

            if classif_file != "":
                report.to_csv(classif_file, mode="w", index=True)

        # write cross-validation predictions to csv file
        if preds_file != "":
            preds.to_csv(preds_file, mode="w", index=False)
            text_file = open(preds_file + "t", "w")
            text_file.write('"Real", "Real", "integer", "integer"')
            text_file.close()

    # feature importances -----------------------------------------------------
    if importances is True:
        from sklearn.inspection import permutation_importance

        fimp = permutation_importance(estimator,
                                      X,
                                      y,
                                      scoring=search_scorer,
                                      n_repeats=5,
                                      n_jobs=n_jobs,
                                      random_state=random_state)

        feature_names = deepcopy(stack.names)
        feature_names = [i.split("@")[0] for i in feature_names]

        fimp = pd.DataFrame({
            "feature": feature_names,
            "importance": fimp["importances_mean"],
            "std": fimp["importances_std"],
        })

        gs.message(os.linesep)
        gs.message("Feature importances")
        gs.message("Feature" + "\t" + "Score")

        for index, row in fimp.iterrows():
            gs.message(row["feature"] + "\t" + str(row["importance"]) + "\t" +
                       str(row["std"]))

        if fimp_file != "":
            fimp.to_csv(fimp_file, index=False)

    # save the fitted model
    import joblib

    joblib.dump((estimator, y, class_labels), model_save)
Beispiel #6
0
 def test_append(self):
     """Append another grass raster to a RasterStack"""
     stack = RasterStack(self.predictors[0:5])
     stack.append(self.predictors[5])
     self.assertListEqual(stack.names, self.predictors)
Beispiel #7
0
def main():
    try:
        import sklearn
        import joblib

        if sklearn.__version__ < "0.20":
            gs.fatal(
                "Package python3-scikit-learn 0.20 or newer is not installed")

    except ImportError:
        gs.fatal("Package python3-scikit-learn 0.20 or newer is not installed")

    # parser options
    group = options["group"]
    output = options["output"]
    model_load = options["load_model"]
    probability = flags["p"]
    prob_only = flags["z"]
    chunksize = int(options["chunksize"])

    # remove @ from output in case overwriting result
    if "@" in output:
        output = output.split("@")[0]

    # check probabilities=True if prob_only=True
    if prob_only is True and probability is False:
        gs.fatal("Need to set probabilities=True if prob_only=True")

    # reload fitted model and training data
    estimator, y, class_labels = joblib.load(model_load)

    # define RasterStack
    stack = RasterStack(group=group)

    # perform raster prediction
    region = Region()
    row_incr = math.ceil(chunksize / region.cols)

    # do not read by increments if increment > n_rows
    if row_incr >= region.rows:
        row_incr = None

    # prediction
    if prob_only is False:
        gs.message("Predicting classification/regression raster...")
        stack.predict(
            estimator=estimator,
            output=output,
            height=row_incr,
            overwrite=gs.overwrite(),
        )

    if probability is True:
        gs.message("Predicting class probabilities...")
        stack.predict_proba(
            estimator=estimator,
            output=output,
            class_labels=np.unique(y),
            overwrite=gs.overwrite(),
            height=row_incr,
        )

    # assign categories for classification map
    if class_labels and prob_only is False:
        rules = []

        for val, lab in class_labels.items():
            rules.append(",".join([str(val), str(lab)]))

        rules = "\n".join(rules)
        rules_file = string_to_rules(rules)
        r.category(map=output, rules=rules_file, separator="comma")