Example #1
0
def _get_data(subchunk, config):
    """Return the covariate matrix and feature names for one subchunk.

    When ``config.mask`` is set, the subchunk mask is gathered from every
    process first. If the gathered mask has as many masked entries as rows,
    the covariates are not loaded and a fully masked placeholder is returned
    instead.

    Otherwise the image subchunks are extracted and passed through the
    configured feature transforms. Unless ``config.clustering`` is set, the
    result is additionally passed through ``_fix_for_corrupt_data`` when the
    configured algorithm instantiates to a ``BaseEnsemble`` or
    ``config.multirandomforest`` is truthy.

    Parameters
    ----------
    subchunk : int
        Index of the subchunk to load (logged as ``subchunk + 1``).
    config : object
        Run configuration; the attributes read here are ``mask``,
        ``feature_sets``, ``final_transform``, ``clustering``, ``algorithm``
        and ``multirandomforest``.

    Returns
    -------
    tuple
        ``(x, features_names)``: ``x`` is either the fully masked placeholder
        or the result of ``_mask_rows`` on the transformed features;
        ``features_names`` is what ``geoio.feature_names(config)`` returned.
    """
    features_names = geoio.feature_names(config)

    # NOTE: This returns an *untransformed* x,
    # which is ok as we just need dummies here
    if config.mask:
        mask_x = _mask(subchunk, config)
        # Stack the masks of all processes so the test below looks at the
        # combined mask rather than only the local one.
        all_mask_x = np.ma.vstack(mpiops.comm.allgather(mask_x))
        if all_mask_x.shape[0] == np.sum(all_mask_x.mask):
            # Builtin ``bool`` instead of ``np.bool``: that alias was
            # deprecated in NumPy 1.20 and removed in 1.24, where it raises
            # AttributeError. ``bool`` yields the same dtype everywhere.
            x = np.ma.zeros((mask_x.shape[0], len(features_names)),
                            dtype=bool)
            x.mask = True
            log.info('Partition {} covariates are not loaded as '
                     'the partition is entirely masked.'.format(subchunk + 1))
            return x, features_names

    transform_sets = [k.transform_set for k in config.feature_sets]
    extracted_chunk_sets = geoio.image_subchunks(subchunk, config)
    log.info("Applying feature transforms")
    x = features.transform_features(extracted_chunk_sets, transform_sets,
                                    config.final_transform, config)[0]

    # only check/correct float32 conversion for Ensemble models
    if not config.clustering and (
            isinstance(modelmaps[config.algorithm](), BaseEnsemble)
            or config.multirandomforest):
        x = _fix_for_corrupt_data(x, features_names)
    return _mask_rows(x, subchunk, config), features_names
Example #2
0
def permutation_importance(model, x_all, targets_all, config):
    """Write one permutation-importance CSV per regression scorer.

    For each of ``explained_variance``, ``r2``, ``neg_mean_absolute_error``
    and ``neg_mean_squared_error``, a ``PermutationImportance`` wrapper
    (``cv='prefit'``, ``n_iter=10``, ``refit=False``) is fitted on
    ``(x_all, targets_all.observations)`` through ``apply_multiple_masked``.
    The resulting weights table is saved as
    ``<config.output_dir>/<config.name>_permutation_importance_<score>.csv``.

    Models exposing ``predict_proba`` are treated as classifiers; nothing is
    computed or written for them.

    Parameters
    ----------
    model : object
        Already-fitted estimator (it is used with ``cv='prefit'``).
    x_all : array-like
        Covariates handed to the importance fit.
    targets_all : object
        Target container; only its ``observations`` attribute is read.
    config : object
        Run configuration; ``algorithm``, ``output_dir`` and ``name`` are
        read here.

    Raises
    ------
    AttributeError
        If ``config.algorithm`` is not a key of ``transformed_modelmaps``.
    """
    _logger.info("Computing permutation importance!!")
    if config.algorithm not in transformed_modelmaps:
        raise AttributeError("Only the following can be used for permutation "
                             "importance {}".format(
                                 list(transformed_modelmaps)))

    y = targets_all.observations

    # Only the regression scorers below are supported.
    if hasattr(model, 'predict_proba'):
        return

    # Neither value depends on the scorer, so compute them once up front.
    feature_names = geoio.feature_names(config)
    output_dir = Path(config.output_dir)

    for score in ('explained_variance',
                  'r2',
                  'neg_mean_absolute_error',
                  'neg_mean_squared_error'):
        importance = PermutationImportance(model, scoring=score,
                                           cv='prefit', n_iter=10,
                                           refit=False)
        pi_cv = apply_multiple_masked(importance.fit, data=(x_all, y))
        df_picv = eli5.explain_weights_df(
            pi_cv, feature_names=feature_names, top=100)
        csv_path = output_dir.joinpath(
            config.name + "_permutation_importance_{}.csv".format(
                score)).as_posix()
        df_picv.to_csv(csv_path, index=False)
Example #3
0
def _get_data(subchunk, config):
    """Return the covariate matrix and feature names for one subchunk.

    When ``config.mask`` is set, the subchunk mask is gathered from every
    process first. If the gathered mask has as many masked entries as rows,
    the covariates are not loaded and a fully masked placeholder is returned
    instead. Otherwise the image subchunks are extracted, passed through the
    configured feature transforms and filtered by ``_mask_rows``.

    Parameters
    ----------
    subchunk : int
        Index of the subchunk to load (logged as ``subchunk + 1``).
    config : object
        Run configuration; the attributes read here are ``mask``,
        ``feature_sets`` and ``final_transform``.

    Returns
    -------
    tuple
        ``(x, features_names)``: ``x`` is either the fully masked placeholder
        or the result of ``_mask_rows`` on the transformed features;
        ``features_names`` is what ``geoio.feature_names(config)`` returned.
    """
    features_names = geoio.feature_names(config)

    if config.mask:
        mask_x = _mask(subchunk, config)
        # Stack the masks of all processes so the test below looks at the
        # combined mask rather than only the local one.
        all_mask_x = np.ma.vstack(mpiops.comm.allgather(mask_x))
        if all_mask_x.shape[0] == np.sum(all_mask_x.mask):
            # The placeholder is never transformed; it only has to carry the
            # right shape and an all-True mask.
            # Builtin ``bool`` instead of ``np.bool``: that alias was
            # deprecated in NumPy 1.20 and removed in 1.24, where it raises
            # AttributeError. ``bool`` yields the same dtype everywhere.
            x = np.ma.zeros((mask_x.shape[0], len(features_names)),
                            dtype=bool)
            x.mask = True
            log.info('Partition {} covariates are not loaded as '
                     'the partition is entirely masked.'.format(subchunk + 1))
            return x, features_names

    extracted_chunk_sets = geoio.image_subchunks(subchunk, config)
    transform_sets = [k.transform_set for k in config.feature_sets]
    log.info("Applying feature transforms")
    x = features.transform_features(extracted_chunk_sets, transform_sets,
                                    config.final_transform, config)[0]
    return _mask_rows(x, subchunk, config), features_names