Example #1
def _get_data(subchunk, config):
    features_names = geoio.feature_names(config)

    # NOTE: This returns an *untransformed* x,
    # which is ok as we just need dummies here
    if config.mask:
        mask_x = _mask(subchunk, config)
        all_mask_x = np.ma.vstack(mpiops.comm.allgather(mask_x))
        if all_mask_x.shape[0] == np.sum(all_mask_x.mask):
            x = np.ma.zeros((mask_x.shape[0], len(features_names)),
                            dtype=bool)
            x.mask = True
            log.info('Partition {} covariates are not loaded as '
                     'the partition is entirely masked.'.format(subchunk + 1))
            return x, features_names

    transform_sets = [k.transform_set for k in config.feature_sets]
    extracted_chunk_sets = geoio.image_subchunks(subchunk, config)
    log.info("Applying feature transforms")
    x = features.transform_features(extracted_chunk_sets, transform_sets,
                                    config.final_transform, config)[0]

    # only check/correct float32 conversion for Ensemble models
    if not config.clustering:
        if (isinstance(modelmaps[config.algorithm](), BaseEnsemble)
                or config.multirandomforest):
            x = _fix_for_corrupt_data(x, features_names)
    return _mask_rows(x, subchunk, config), features_names
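
The snippet above delegates the float32 sanity check to _fix_for_corrupt_data, whose body is not shown here. Purely as an illustration of what such a correction step could look like, the hypothetical helper below masks entries of a NumPy masked array that are non-finite or outside the float32 range; the function name, signature, and behaviour are assumptions, not the project's actual implementation.

import logging

import numpy as np

log = logging.getLogger(__name__)


def _fix_for_corrupt_data_sketch(x, feature_names):
    """Hypothetical sketch: mask values that cannot be safely cast to float32.

    Assumes `x` is a numpy masked array of shape (n_rows, n_features) and
    `feature_names` is a sequence used only to report affected columns.
    """
    finfo = np.finfo(np.float32)
    # Flag NaN/inf values and magnitudes that would overflow float32.
    bad = ~np.isfinite(x.data) | (np.abs(x.data) > finfo.max)
    if bad.any():
        for col in np.where(bad.any(axis=0))[0]:
            log.warning("Masking corrupt values in feature %s", feature_names[col])
        # Merge the new flags into the existing mask.
        x.mask = np.ma.getmaskarray(x) | bad
    return x
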
Example #2
def _get_data(subchunk, config):
    features_names = geoio.feature_names(config)

    if config.mask:
        mask_x = _mask(subchunk, config)
        all_mask_x = np.ma.vstack(mpiops.comm.allgather(mask_x))
        if all_mask_x.shape[0] == np.sum(all_mask_x.mask):
            x = np.ma.zeros((mask_x.shape[0], len(features_names)),
                            dtype=bool)
            x.mask = True
            log.info('Partition {} covariates are not loaded as '
                     'the partition is entirely masked.'.format(subchunk + 1))
            return x, features_names

    extracted_chunk_sets = geoio.image_subchunks(subchunk, config)
    transform_sets = [k.transform_set for k in config.feature_sets]
    log.info("Applying feature transforms")
    x = features.transform_features(extracted_chunk_sets, transform_sets,
                                    config.final_transform, config)[0]
    return _mask_rows(x, subchunk, config), features_names
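
Both variants above short-circuit when the gathered mask shows that every row of the partition is masked. The snippet below is a minimal, self-contained illustration of that check using plain NumPy, with the MPI allgather step simulated by a list of per-process mask chunks; it is not code from this project.

import numpy as np

# Simulate the per-process mask chunks that mpiops.comm.allgather(mask_x)
# would return: here every "process" happens to be fully masked.
local_masks = [np.ma.masked_all((3, 1)), np.ma.masked_all((2, 1))]

all_mask_x = np.ma.vstack(local_masks)

# The partition is skipped when the number of rows equals the number of
# masked entries, i.e. when every value in the partition is masked.
entirely_masked = all_mask_x.shape[0] == np.sum(all_mask_x.mask)
print(entirely_masked)  # True -> covariates for this partition are not loaded
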
Example #3
def local_rank_features(image_chunk_sets, transform_sets, targets, config):
    """ Ranks the importance of the features based on their performance.
    This function trains and cross-validates a model with each individual
    feature removed and then measures the performance of the model with that
    feature removed. The most important feature is the one which, when removed,
    causes the greatest degradation in the performance of the model.

    Parameters
    ----------
    image_chunk_sets: dict
        A dictionary used to get the set of images to test on.
    transform_sets: list
        A list containing the applied transformations
    targets: instance of geoio.Targets class
        The targets used in the cross validation
    config: config class instance
        The global config file
    """

    feature_scores = {}

    # Get all the images
    all_names = []
    for c in image_chunk_sets:
        all_names.extend(list(c.keys()))
    all_names = sorted(list(set(all_names)))  # make unique

    if len(all_names) <= 1:
        raise ValueError("Cannot perform feature ranking with only one "
                         "feature! Try turning off the 'feature_rank' option.")

    for name in all_names:
        transform_sets_leaveout = copy.deepcopy(transform_sets)
        final_transform_leaveout = copy.deepcopy(config.final_transform)
        image_chunks_leaveout = [copy.copy(k) for k in image_chunk_sets]

        for i, c in enumerate(image_chunks_leaveout):
            if name in c:
                c.pop(name)
            # if only one covariate of a feature type, delete
            # this feature type, and transformset
            if not c:
                image_chunks_leaveout.pop(i)
                transform_sets_leaveout.pop(i)

        # Strip the '.tif' suffix (str.rstrip would remove any trailing
        # '.', 't', 'i' or 'f' characters, not the suffix as a whole).
        fname = name[:-len(".tif")] if name.endswith(".tif") else name
        _logger.info("Computing {} feature importance of {}"
                     .format(config.algorithm, fname))
        x, keep = feat.transform_features(image_chunks_leaveout,
                                          transform_sets_leaveout,
                                          final_transform_leaveout,
                                          config)
        x_all = feat.gather_features(x[keep], node=0)
        targets_all = targ.gather_targets_main(targets, keep, node=0)


        # Feature ranking occurs before top-level shared training data
        # is created, so share the memory now so we can validate in
        # parallel.
        training_data = None
        if config.parallel_validate:
            training_data = geoio.create_shared_training_data(targets_all, x_all)
            targets_all = training_data.targets_all
            x_all = training_data.x_all

        results = local_crossval(x_all, targets_all, config)
        feature_scores[fname] = results

        # Only release the shared arrays if they were actually created above.
        if training_data is not None:
            geoio.deallocate_shared_training_data(training_data)

    # Get the different types of score from one of the outputs
    if mpiops.chunk_index == 0:
        measures = list(next(iter(feature_scores.values())).scores.keys())
        features = sorted(feature_scores.keys())
        scores = np.empty((len(measures), len(features)))
        for m, measure in enumerate(measures):
            for f, feature in enumerate(features):
                scores[m, f] = feature_scores[feature].scores[measure]
        return measures, features, scores
    else:
        return None, None, None
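
The leave-one-out ranking described in the docstring above can be illustrated without any of the project's infrastructure: cross-validate once with all features, then once per feature with that feature removed, and rank features by the resulting drop in score. The sketch below uses scikit-learn's cross_val_score and a synthetic regression dataset in place of local_crossval and the real covariates; every name in it is illustrative and not part of this codebase.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=200, n_features=4, noise=0.5, random_state=0)
feature_names = ["f0", "f1", "f2", "f3"]

# Baseline performance with every feature present.
baseline = cross_val_score(LinearRegression(), X, y, cv=5, scoring="r2").mean()

drops = {}
for i, name in enumerate(feature_names):
    # Remove one feature and re-run cross-validation.
    X_leaveout = np.delete(X, i, axis=1)
    score = cross_val_score(LinearRegression(), X_leaveout, y, cv=5,
                            scoring="r2").mean()
    # A large drop from the baseline means the removed feature was important.
    drops[name] = baseline - score

for name, drop in sorted(drops.items(), key=lambda kv: -kv[1]):
    print(f"{name}: score drop when removed = {drop:.4f}")
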