# imports assumed from usage: `dtr` is detectree and `jl` is joblib
import logging

import detectree as dtr
import joblib as jl
import pandas as pd


def main(split_filepath, model_filepath, output_dir, dst_filepath, method,
         img_cluster, refine, refine_beta, refine_int_rescale, tree_val,
         nontree_val):
    logger = logging.getLogger(__name__)

    logger.info("classifying tiles for cluster %d with classifier from %s",
                img_cluster, model_filepath)
    split_df = pd.read_csv(split_filepath)
    clf = jl.load(model_filepath)

    pred_imgs = dtr.Classifier(
        tree_val=tree_val,
        nontree_val=nontree_val,
        refine=refine,
        refine_beta=refine_beta,
        refine_int_rescale=refine_int_rescale).classify_imgs(
            split_df,
            output_dir,
            clf=clf,
            method=method,
            img_cluster=img_cluster)

    logger.info("dumped %d classified tiles to %s", len(pred_imgs), output_dir)

    pd.Series(pred_imgs).to_csv(dst_filepath, index=False, header=False)
    logger.info("dumped list of classified tiles to %s", dst_filepath)
Example 2
# imports assumed from usage; `_get_validation_df` and `_inner_loop` are
# module-local helpers defined elsewhere in the same file
import random

import dask
import detectree as dtr
import numpy as np
import pandas as pd
from dask import diagnostics


def make_confusion_df(
    lidar_gdf,
    lidar_raw_dir,
    split_df=None,
    img_filepaths=None,
    n=None,
    frac=0.05,
    clf=None,
    clf_dict=None,
):

    c = dtr.Classifier()
    truth_pred_lazy = []
    if clf is not None:
        if split_df is None:
            num_validation_tiles = int(frac * len(img_filepaths))
            # note that `random.choices` samples *with* replacement, so the
            # validation set may contain repeated tiles
            test_filepaths = random.choices(img_filepaths,
                                            k=num_validation_tiles)
        else:
            test_filepaths = _get_validation_df(split_df, n,
                                                frac)["img_filepath"]

        for img_filepath in test_filepaths:
            truth_pred_lazy.append(
                dask.delayed(_inner_loop)(img_filepath, lidar_gdf,
                                          lidar_raw_dir, c, clf))
    else:
        validation_df = _get_validation_df(split_df, n, frac)
        for img_cluster, cluster_df in validation_df.groupby("img_cluster"):
            clf = clf_dict[img_cluster]
            for img_filepath in cluster_df["img_filepath"]:
                truth_pred_lazy.append(
                    dask.delayed(_inner_loop)(img_filepath, lidar_gdf,
                                              lidar_raw_dir, c, clf))

    with diagnostics.ProgressBar():
        truth_pred = np.hstack(dask.compute(*truth_pred_lazy))

    truth_ser = pd.Series(truth_pred[0], name="actual")
    pred_ser = pd.Series(truth_pred[1], name="predicted")
    return pd.crosstab(truth_ser, pred_ser) / len(truth_ser)
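Since the crosstab above is normalized by the total sample count, the sum of its diagonal is the overall accuracy (this is what `np.trace` computes in the later examples). A runnable toy illustration of the pattern:

import numpy as np
import pandas as pd

# toy observed/predicted labels (hypothetical values)
truth_ser = pd.Series([0, 0, 1, 1, 1, 0], name="actual")
pred_ser = pd.Series([0, 1, 1, 1, 0, 0], name="predicted")

# each cell holds the fraction of samples with that (actual, predicted) pair
confusion_df = pd.crosstab(truth_ser, pred_ser) / len(truth_ser)
# the trace (sum of the diagonal) is the overall accuracy: 4/6
print(np.trace(confusion_df))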
Example 3
# imports assumed from usage; `_dict_from_kws` is a module-local helper and
# `ctx` is a CLI context object (e.g., from click)
from os import path

import detectree as dtr
import joblib
import pandas as pd


def classify_imgs(ctx, split_filepath, clf_filepath, clf_dir, method,
                  img_cluster, tree_val, nontree_val, refine, refine_beta,
                  refine_int_rescale, pixel_features_builder_kws, output_dir):
    logger = ctx.obj['LOGGER']

    split_df = pd.read_csv(split_filepath)

    if clf_filepath is not None:
        clf_dict = None
        clf = joblib.load(clf_filepath)
        logger.info("Classifying images from %s with classifier of %s",
                    split_filepath, clf_filepath)

    if clf_dir is not None:
        clf = None
        clf_dict = {}
        # use a distinct loop variable so that the `img_cluster` argument
        # (passed to `classify_imgs` below) is not shadowed
        for cluster in split_df['img_cluster'].unique():
            clf_dict[cluster] = joblib.load(
                path.join(clf_dir, f"{cluster}.joblib"))

    pixel_features_builder_kws = _dict_from_kws(pixel_features_builder_kws)

    c = dtr.Classifier(tree_val=tree_val,
                       nontree_val=nontree_val,
                       refine=refine,
                       refine_beta=refine_beta,
                       refine_int_rescale=refine_int_rescale,
                       **pixel_features_builder_kws)

    if output_dir is None:
        output_dir = ''

    pred_imgs = c.classify_imgs(split_df,
                                output_dir,
                                clf=clf,
                                clf_dict=clf_dict,
                                method=method,
                                img_cluster=img_cluster)
    logger.info("Dumped %d predicted images to %s", len(pred_imgs), output_dir)
Example 4
# imports assumed from usage; `_dict_from_kws` is a module-local helper and
# `ctx` is a CLI context object (e.g., from click)
from os import path

import detectree as dtr
import joblib


def classify_img(ctx, img_filepath, clf_filepath, tree_val, nontree_val,
                 refine, refine_beta, refine_int_rescale,
                 pixel_features_builder_kws, output_filepath):
    logger = ctx.obj['LOGGER']

    logger.info("Classifying %s with classifier of %s", img_filepath,
                clf_filepath)

    pixel_features_builder_kws = _dict_from_kws(pixel_features_builder_kws)
    c = dtr.Classifier(tree_val=tree_val,
                       nontree_val=nontree_val,
                       refine=refine,
                       refine_beta=refine_beta,
                       refine_int_rescale=refine_int_rescale,
                       **pixel_features_builder_kws)

    if output_filepath is None:
        filename, ext = path.splitext(path.basename(img_filepath))
        output_filepath = f"{filename}-pred{ext}"

    c.classify_img(img_filepath, joblib.load(clf_filepath), output_filepath)
    logger.info("Dumped predicted image to %s", output_filepath)
Example 5
# imports assumed from usage; `settings` is a project-local module providing
# `IMG_DEFAULT_FILENAME_PATTERN`
import glob
import logging
from os import path

import detectree as dtr
import joblib as jl
import numpy as np
import pandas as pd
import rasterio as rio


def main(validation_img_dir, split_filepath, models_dir, dst_filepath):
    logger = logging.getLogger(__name__)

    validation_img_filepaths = glob.glob(
        path.join(validation_img_dir, settings.IMG_DEFAULT_FILENAME_PATTERN))

    logger.info("computing confusion data frame with the tiles in %s",
                validation_img_dir)
    split_df = pd.read_csv(split_filepath, index_col=0)
    c = dtr.Classifier()
    observations = []
    predictions = []
    for validation_img_filepath in validation_img_filepaths:
        validation_img_filename = path.basename(validation_img_filepath)
        try:
            img_filepath, img_cluster = split_df[
                split_df['img_filepath'].str.endswith(
                    validation_img_filename)][['img_filepath',
                                               'img_cluster']].iloc[0]
        except IndexError:
            raise ValueError(
                f'Could not find an image named {validation_img_filename} in '
                f'{split_filepath}')
        with rio.open(validation_img_filepath) as src:
            observations.append(src.read(1))
        predictions.append(
            c.classify_img(
                img_filepath,
                jl.load(path.join(models_dir, f'{img_cluster}.joblib'))))

    truth_ser = pd.Series(np.hstack(observations).flatten(), name='obs')
    pred_ser = pd.Series(np.hstack(predictions).flatten(), name='pred')
    df = pd.crosstab(truth_ser, pred_ser) / len(truth_ser)
    logger.info("estimated accuracy score is %f", np.trace(df))

    df.to_csv(dst_filepath)
    logger.info("dumped confusion data frame to %s", dst_filepath)
Example 6
    def test_classifier(self):
        # TODO: test init arguments of `Classifier`
        c = dtr.Classifier()

        img_filepath = self.split_i_df.iloc[0]['img_filepath']
        # test that `classify_img` returns an ndarray
        self.assertIsInstance(c.classify_img(img_filepath, self.clf),
                              np.ndarray)
        # test that `classify_img` with `output_filepath` returns an ndarray
        # and dumps it
        output_filepath = path.join(self.tmp_output_dir, 'foo.tif')
        y_pred = c.classify_img(img_filepath, self.clf, output_filepath)
        self.assertIsInstance(y_pred, np.ndarray)
        self.assertTrue(os.path.exists(output_filepath))
        # remove it so that the output dir is clean in the tests below
        os.remove(output_filepath)

        # test that `classify_imgs` with implicit `cluster-I` method returns a
        # list and that the images have been dumped
        pred_imgs = c.classify_imgs(self.split_i_df, self.tmp_output_dir,
                                    self.clf)
        self.assertIsInstance(pred_imgs, list)
        self._test_imgs_exist_and_rm(pred_imgs)

        # test that `classify_imgs` with implicit `cluster-II` method, `clf`
        # and `img_cluster` returns a list and that the images have been dumped
        pred_imgs = c.classify_imgs(self.split_ii_df,
                                    self.tmp_output_dir,
                                    self.clf,
                                    img_cluster=self.img_cluster)
        self.assertIsInstance(pred_imgs, list)
        self._test_imgs_exist_and_rm(pred_imgs)
        # test that this works equally when providing `clf_dict`
        pred_imgs = c.classify_imgs(self.split_ii_df,
                                    self.tmp_output_dir,
                                    clf_dict=self.clf_dict,
                                    img_cluster=self.img_cluster)
        self.assertIsInstance(pred_imgs, list)
        self._test_imgs_exist_and_rm(pred_imgs)

        # test that `classify_imgs` with implicit `cluster-II` method and
        # `clf_dict` returns a dict and that the images have been dumped
        pred_imgs = c.classify_imgs(self.split_ii_df,
                                    self.tmp_output_dir,
                                    clf_dict=self.clf_dict)
        self.assertIsInstance(pred_imgs, dict)
        for img_cluster in pred_imgs:
            self._test_imgs_exist_and_rm(pred_imgs[img_cluster])

        # test that `clf=None` with 'cluster-I' raises a `ValueError`
        self.assertRaises(ValueError, c.classify_imgs, self.split_i_df,
                          self.tmp_output_dir)

        # test that `clf=None` and `clf_dict=None` with 'cluster-II' raises a
        # `ValueError`
        self.assertRaises(ValueError, c.classify_imgs, self.split_ii_df,
                          self.tmp_output_dir)
        # test that `clf_dict=None` with 'cluster-II' and `img_cluster=None`
        # raises a `ValueError`, even when providing a non-None `clf`
        self.assertRaises(ValueError,
                          c.classify_imgs,
                          self.split_ii_df,
                          self.tmp_output_dir,
                          clf=c)

        # TODO: test with explicit `method` keyword argument

        # test that `Classifier` with `refine=False` also returns an ndarray
        c = dtr.Classifier(refine=False)
        img_filepath = self.split_i_df.iloc[0]['img_filepath']
        # test that `classify_img` returns an ndarray
        self.assertIsInstance(c.classify_img(img_filepath, self.clf),
                              np.ndarray)
Example 7
# imports assumed from usage; `lidar_utils` is a project-local module
# providing lidar-related defaults and helpers
import logging
from os import path

import detectree as dtr
import joblib as jl
import numpy as np
import pandas as pd
import rasterio as rio


def main(
    tile_filepath,
    split_filepath,
    models_dir,
    lidar_dir,
    validation_tiles_dir,
    dst_filepath,
    high_veg_val,
    num_opening_iterations,
    num_dilation_iterations,
    output_dtype,
    output_tree_val,
    output_nodata,
):
    logger = logging.getLogger(__name__)

    # predict the tile using the trained classifier
    split_df = pd.read_csv(split_filepath, index_col=0)
    tile_cluster = split_df[split_df["img_filepath"] ==
                            tile_filepath]["img_cluster"].iloc[0]
    pred_arr = dtr.Classifier().classify_img(
        tile_filepath, jl.load(path.join(models_dir,
                                         f"{tile_cluster}.joblib")))

    # load lidar default settings; test against `None` explicitly so that
    # falsy but valid values (e.g., a nodata of 0) are not overridden
    if high_veg_val is None:
        high_veg_val = lidar_utils.HIGH_VEG_VAL
    if num_opening_iterations is None:
        num_opening_iterations = lidar_utils.NUM_OPENING_ITERATIONS
    if num_dilation_iterations is None:
        num_dilation_iterations = lidar_utils.NUM_DILATION_ITERATIONS
    if output_dtype is None:
        output_dtype = lidar_utils.OUTPUT_DTYPE
    if output_tree_val is None:
        output_tree_val = lidar_utils.OUTPUT_TREE_VAL
    if output_nodata is None:
        output_nodata = lidar_utils.OUTPUT_NODATA

    # estimate the "ground-truth" mask with LIDAR data
    lidar_filepath = path.join(lidar_dir,
                               lidar_utils.get_lidar_filename(tile_filepath))
    validation_tile_filepath = path.join(validation_tiles_dir,
                                         path.basename(tile_filepath))
    _ = dtr.LidarToCanopy(
        tree_threshold=high_veg_val,
        output_dtype=output_dtype,
        output_tree_val=output_tree_val,
        output_nodata=output_nodata,
    ).to_canopy_mask(
        lidar_filepath,
        lidar_utils.LIDAR_TREE_VALUES,
        tile_filepath,
        output_filepath=validation_tile_filepath,
        postprocess_func=lidar_utils.postprocess_canopy_mask,
        postprocess_func_args=[
            high_veg_val,
            num_opening_iterations,
            num_dilation_iterations,
            output_dtype,
            output_tree_val,
        ],
    )
    with rio.open(validation_tile_filepath) as src:
        obs_arr = src.read(1)

    # compute the confusion matrix and dump it to a file
    obs_ser = pd.Series(obs_arr.flatten(), name="obs")
    pred_ser = pd.Series(pred_arr.flatten(), name="pred")
    df = pd.crosstab(obs_ser, pred_ser) / len(obs_ser)
    logger.info("estimated accuracy score is %f", np.trace(df))
    df.to_csv(dst_filepath)
    logger.info("dumped confusion data frame to %s", dst_filepath)