def _get_data(subchunk, config):
    """
    Load the transformed covariates for a single partition.

    Parameters
    ----------
    subchunk: int
        Index of the partition to load (reported in the log as
        ``subchunk + 1``).
    config: config class instance
        The global config. This function reads ``mask``, ``feature_sets``,
        ``final_transform``, ``clustering``, ``algorithm`` and
        ``multirandomforest`` from it.

    Returns
    -------
    x: masked array
        The result of ``_mask_rows`` applied to the transformed features,
        or an entirely masked boolean placeholder of shape
        ``(mask_x.shape[0], len(features_names))`` when the whole
        partition is masked.
    features_names: list
        The names returned by ``geoio.feature_names(config)``.
    """
    features_names = geoio.feature_names(config)

    # NOTE: This returns an *untransformed* x,
    # which is ok as we just need dummies here
    if config.mask:
        mask_x = _mask(subchunk, config)
        # Combine the mask pieces gathered through the communicator so
        # that every caller reaches the same decision.
        all_mask_x = np.ma.vstack(mpiops.comm.allgather(mask_x))
        if all_mask_x.shape[0] == np.sum(all_mask_x.mask):
            # Builtin ``bool`` rather than ``np.bool``: the alias was
            # deprecated in NumPy 1.20 and removed in 1.24.
            x = np.ma.zeros((mask_x.shape[0], len(features_names)),
                            dtype=bool)
            x.mask = True
            log.info('Partition {} covariates are not loaded as '
                     'the partition is entirely masked.'
                     .format(subchunk + 1))
            return x, features_names

    transform_sets = [k.transform_set for k in config.feature_sets]
    extracted_chunk_sets = geoio.image_subchunks(subchunk, config)
    log.info("Applying feature transforms")
    # transform_features returns a sequence; only its first item is used.
    x = features.transform_features(extracted_chunk_sets,
                                    transform_sets,
                                    config.final_transform,
                                    config)[0]

    # only check/correct float32 conversion for Ensemble models
    if not config.clustering and (
            isinstance(modelmaps[config.algorithm](), BaseEnsemble)
            or config.multirandomforest):
        x = _fix_for_corrupt_data(x, features_names)

    return _mask_rows(x, subchunk, config), features_names
def _get_data(subchunk, config):
    """
    Load the transformed covariates for a single partition.

    Parameters
    ----------
    subchunk: int
        Index of the partition to load (reported in the log as
        ``subchunk + 1``).
    config: config class instance
        The global config. This function reads ``mask``, ``feature_sets``
        and ``final_transform`` from it.

    Returns
    -------
    x: masked array
        The result of ``_mask_rows`` applied to the transformed features,
        or an entirely masked boolean placeholder of shape
        ``(mask_x.shape[0], len(features_names))`` when the whole
        partition is masked.
    features_names: list
        The names returned by ``geoio.feature_names(config)``.
    """
    features_names = geoio.feature_names(config)

    if config.mask:
        mask_x = _mask(subchunk, config)
        # Combine the mask pieces gathered through the communicator so
        # that every caller reaches the same decision.
        all_mask_x = np.ma.vstack(mpiops.comm.allgather(mask_x))
        if all_mask_x.shape[0] == np.sum(all_mask_x.mask):
            # Nothing usable in this partition: skip loading the images
            # and hand back a fully masked dummy instead.
            # Builtin ``bool`` rather than ``np.bool``: the alias was
            # deprecated in NumPy 1.20 and removed in 1.24.
            x = np.ma.zeros((mask_x.shape[0], len(features_names)),
                            dtype=bool)
            x.mask = True
            log.info('Partition {} covariates are not loaded as '
                     'the partition is entirely masked.'
                     .format(subchunk + 1))
            return x, features_names

    extracted_chunk_sets = geoio.image_subchunks(subchunk, config)
    transform_sets = [k.transform_set for k in config.feature_sets]
    log.info("Applying feature transforms")
    # transform_features returns a sequence; only its first item is used.
    x = features.transform_features(extracted_chunk_sets,
                                    transform_sets,
                                    config.final_transform,
                                    config)[0]

    return _mask_rows(x, subchunk, config), features_names
def local_rank_features(image_chunk_sets, transform_sets, targets, config):
    """
    Ranks the importance of the features based on their performance.

    This function trains and cross-validates a model with each individual
    feature removed and then measures the performance of the model with
    that feature removed. The most important feature is the one which,
    when removed, causes the greatest degradation in the performance of
    the model.

    Parameters
    ----------
    image_chunk_sets: list of dict
        One mapping per feature set, keyed by image name. The inputs are
        not modified; shallow copies are taken before a name is removed.
    transform_sets: list
        The transform sets, paired positionally with ``image_chunk_sets``.
    targets: instance of geoio.Targets class
        The targets used in the cross validation
    config: config class instance
        The global config file

    Returns
    -------
    measures: list or None
        The score names, taken from one of the cross-validation results.
    features: list or None
        The sorted names of the features that were left out.
    scores: ndarray or None
        Array of shape ``(len(measures), len(features))`` where
        ``scores[m, f]`` is the value of measure ``m`` with feature ``f``
        left out.

    All three values are ``None`` unless ``mpiops.chunk_index == 0``.

    Raises
    ------
    ValueError
        If the image sets contain at most one distinct feature name.
    """
    tif_suffix = ".tif"

    # Unique image names across all of the feature sets.
    all_names = sorted({name
                        for chunk_set in image_chunk_sets
                        for name in chunk_set.keys()})
    if len(all_names) <= 1:
        raise ValueError("Cannot perform feature ranking with only one "
                         "feature! Try turning off the 'feature_rank' option.")

    feature_scores = {}
    for name in all_names:
        transform_sets_copy = copy.deepcopy(transform_sets)
        final_transform_leaveout = copy.deepcopy(config.final_transform)

        # Build the leave-one-out inputs as fresh lists. Popping entries
        # from a list while enumerating it skips the element that follows
        # each removal, so filter instead.
        image_chunks_leaveout = []
        transform_sets_leaveout = []
        for chunk_set, transform_set in zip(image_chunk_sets,
                                            transform_sets_copy):
            remaining = copy.copy(chunk_set)
            remaining.pop(name, None)
            # If no covariate is left in a feature set, drop that set
            # together with its transform set.
            if remaining:
                image_chunks_leaveout.append(remaining)
                transform_sets_leaveout.append(transform_set)

        # Remove the extension as a suffix. ``str.rstrip(".tif")`` strips
        # any trailing '.', 't', 'i' or 'f' characters, which mangles
        # names such as "drift.tif" and can make distinct features
        # collide on the same key.
        if name.endswith(tif_suffix):
            fname = name[:-len(tif_suffix)]
        else:
            fname = name

        _logger.info("Computing {} feature importance of {}"
                     .format(config.algorithm, fname))

        x, keep = feat.transform_features(image_chunks_leaveout,
                                          transform_sets_leaveout,
                                          final_transform_leaveout,
                                          config)
        x_all = feat.gather_features(x[keep], node=0)
        targets_all = targ.gather_targets_main(targets, keep, node=0)

        # Feature ranking occurs before top-level shared training data
        # is created, so share the memory now so we can parallel
        # validate.
        training_data = None
        if config.parallel_validate:
            training_data = geoio.create_shared_training_data(targets_all,
                                                              x_all)
            targets_all = training_data.targets_all
            x_all = training_data.x_all

        feature_scores[fname] = local_crossval(x_all, targets_all, config)

        # Only release shared memory that was created above; without the
        # guard this call fails with NameError when parallel validation
        # is off.
        # NOTE(review): released once per left-out feature - confirm this
        # matches the intended placement of the deallocation call.
        if training_data is not None:
            geoio.deallocate_shared_training_data(training_data)

    if mpiops.chunk_index != 0:
        return None, None, None

    # Get the different types of score from one of the outputs
    measures = list(next(iter(feature_scores.values())).scores.keys())
    features = sorted(feature_scores.keys())
    scores = np.empty((len(measures), len(features)))
    for m, measure in enumerate(measures):
        for f, feature in enumerate(features):
            scores[m, f] = feature_scores[feature].scores[measure]
    return measures, features, scores