def save_intersected_features(feature_sets, transform_sets, config):
    """
    Save raw covariate values at the target locations, i.e. after the
    targets have been intersected.

    This will save the following two files if they are provided in the
    config file:

    a) rawcovariates.csv: the covariate values in csv
    b) rawcovariates_mask.csv: the corresponding mask in csv

    This function will also optionally output an intersected covariates
    scatter plot per covariate band.

    Parameters
    ----------
    feature_sets : list of dict
        Per feature set, a mapping of image path -> extracted values;
        band count is taken from axis 3 of each array.
    transform_sets : list of transforms.ImageTransformSet
        Matching transform sets; only ``is_categorical`` is used here,
        because the values are saved untransformed.
    config : Config
        Provides ``rawcovariates``, ``rawcovariates_mask`` and
        ``plot_covariates``.
    """
    transform_sets_mod = []
    # One column name per band of each image: "<band>_<file basename>".
    names = ['{}_{}'.format(b, basename(k))
             for ec in feature_sets
             for k in ec
             for b in range(ec[k].shape[3])]
    header = ', '.join(names)
    for t in transform_sets:
        # Dummy transforms (no image/global transforms, no imputation) so
        # the raw, untransformed covariate values are written out.
        dummy_transform = transforms.ImageTransformSet(
            image_transforms=None, imputer=None, global_transforms=None,
            is_categorical=t.is_categorical)
        transform_sets_mod.append(dummy_transform)
    transformed_vectors = [t(c) for c, t in
                           zip(feature_sets, transform_sets_mod)]
    x = np.ma.concatenate(transformed_vectors, axis=1)
    x_all = gather_features(x, node=0)
    # Only the root rank writes the output files.
    if mpiops.chunk_index == 0:
        np.savetxt(config.rawcovariates, X=x_all.data, delimiter=',',
                   fmt='%.4e', header=header)
        np.savetxt(config.rawcovariates_mask, X=x_all.mask.astype(int),
                   delimiter=',', fmt='%.4e', header=header)
        if config.plot_covariates:
            import matplotlib.pyplot as plt
            for i, name in enumerate(names):
                log.info('plotting {}'.format(name))
                plt.figure()
                vals = x_all[:, i]
                # Plot only the unmasked (valid) values.
                vals_no_mask = vals[~vals.mask].data
                # BUG FIX: vals_no_mask is already a plain ndarray; the
                # extra '.data' passed a memoryview to plt.scatter.
                plt.scatter(x=list(range(vals_no_mask.shape[0])),
                            y=vals_no_mask)
                plt.title(name)
                # BUG FIX: rstrip('.tif') strips *any* trailing '.', 't',
                # 'i' or 'f' characters (mangling e.g. 'xxx_fit.tif');
                # remove the extension as a suffix instead.
                stem = name[:-len('.tif')] if name.endswith('.tif') else name
                plt.savefig(stem + '.png')
                plt.close()
def __init__(self, config_dict: dict):
    """
    Load one feature set definition from its parsed config dictionary.

    Parameters
    ----------
    config_dict : dict
        Must contain a 'type' of 'ordinal' or 'categorical' (anything
        else falls back to 'ordinal' with a warning), plus either a
        'files' list (path/directory/list sources of geotiffs) or a
        'shapefile' mapping (tabular covariates), and
        'transforms'/'imputation' entries.

    Raises
    ------
    ValueError
        If neither 'files' nor 'shapefile' is provided.
    """
    d = config_dict
    if d['type'] not in ('ordinal', 'categorical'):
        # Fall back to ordinal rather than failing on a bad config value.
        # (Also fixes the 'Unknwon' typo in the logged message.)
        _logger.warning(
            "Feature set type must be ordinal or categorical. "
            "Unknown option: '%s'. Type has been set to 'ordinal'.",
            d['type'])
        self.type = 'ordinal'
    else:
        self.type = d['type']
    is_categorical = d['type'] == 'categorical'

    if 'files' in d:
        # Raster (geotiff) covariates: collect files from all sources.
        self.tabular = False
        files = []
        for source in d['files']:
            key = next(iter(source.keys()))
            if key == 'path':
                files.append(path.abspath(source[key]))
            elif key == 'directory':
                glob_string = path.join(path.abspath(source[key]), "*.tif")
                f_list = glob.glob(glob_string)
                files.extend(f_list)
            elif key == 'list':
                # A csv/text listing of tif paths; blank lines and
                # '#'-comment lines are skipped.
                csvfile = path.abspath(source[key])
                with open(csvfile, 'r') as f:
                    reader = csv.reader(f)
                    tifs = list(reader)
                    tifs = [f[0].strip() for f in tifs
                            if (len(f) > 0 and f[0].strip()
                                and f[0].strip()[0] != '#')]
                for f in tifs:
                    files.append(path.abspath(f))
        self.files = sorted(files, key=str.lower)
        n_feat = len(self.files)
        # BUG FIX: this debug call was a plain string missing the 'f'
        # prefix, so the literal '{self.files}' was logged.
        _logger.debug(f"Loaded feature set with files: {self.files}")
    elif 'shapefile' in d:
        # Tabular covariates sourced from shapefile fields.
        self.tabular = True
        self.fields = sorted(d['shapefile']['fields'], key=str.lower)
        n_feat = len(self.fields)
        self.file = d['shapefile']['file']
        self.ndv = d['shapefile'].get('ndv', None)
        _logger.debug(f"Loaded feature set with fields: {self.fields}")
    else:
        # Previously this fell through to a confusing NameError on
        # 'n_feat'; fail with a clear message instead.
        raise ValueError(
            "Feature set must provide either 'files' or 'shapefile'.")

    trans_i, im, trans_g = _parse_transform_set(d['transforms'],
                                                d['imputation'], n_feat)
    self.transform_set = transforms.ImageTransformSet(trans_i, im, trans_g,
                                                      is_categorical)
def cull_all_null_rows(feature_sets):
    """
    Build a boolean row filter that drops targets whose covariates are
    null in every column.

    Parameters
    ----------
    feature_sets : list
        Feature sets accepted by ``transforms.ImageTransformSet``.

    Returns
    -------
    numpy.ndarray
        One boolean per row: True where at least one covariate is
        non-null (row kept), False where all covariates are null.
    """
    # Identity transform set (no image/global transforms, no imputation):
    # we only want the masks of the raw values.
    identity = transforms.ImageTransformSet(image_transforms=None,
                                            imputer=None,
                                            global_transforms=None,
                                            is_categorical=True)
    masks = [identity(fs).mask for fs in feature_sets]
    combined_mask = np.concatenate(masks, axis=1)
    n_covariates = combined_mask.shape[1]
    # A row is all-null exactly when every mask entry in it is True,
    # i.e. when the per-row sum of the mask equals the column count.
    return np.sum(combined_mask, axis=1) != n_covariates
def __init__(self, d):
    """
    Load one feature set definition from its parsed config dictionary.

    Parameters
    ----------
    d : dict
        Requires 'name', 'type' ('ordinal' or 'categorical'; anything
        else falls back to 'ordinal' with a warning) and a 'files' list
        whose entries are path/directory/list sources of geotiffs.
        'transforms' and 'imputation' are optional and default to None.
    """
    self.name = d['name']
    if d['type'] not in {'ordinal', 'categorical'}:
        log.warning("Feature set type must be ordinal or categorical: "
                    "Unknown option "
                    "{} (assuming ordinal)".format(d['type']))
        # BUG FIX: the warning promised 'ordinal' but the unknown value
        # was previously stored verbatim in self.type.
        self.type = 'ordinal'
    else:
        self.type = d['type']
    is_categorical = d['type'] == 'categorical'

    # Collect covariate files from all configured sources.
    files = []
    for source in d['files']:
        key = next(iter(source.keys()))
        if key == 'path':
            files.append(path.abspath(source[key]))
        elif key == 'directory':
            glob_string = path.join(path.abspath(source[key]), "*.tif")
            f_list = glob.glob(glob_string)
            files.extend(f_list)
        elif key == 'list':
            # A csv/text listing of tif paths; blank lines and
            # '#'-comment lines are skipped.
            csvfile = path.abspath(source[key])
            with open(csvfile, 'r') as f:
                reader = csv.reader(f)
                tifs = list(reader)
                tifs = [f[0].strip() for f in tifs
                        if (len(f) > 0 and f[0].strip()
                            and f[0].strip()[0] != '#')]
            for f in tifs:
                files.append(path.abspath(f))
    self.files = sorted(files, key=str.lower)
    n_files = len(self.files)

    # Optional keys default to None (no transforms / no imputation).
    d.setdefault('transforms', None)
    d.setdefault('imputation', None)
    trans_i, im, trans_g = _parse_transform_set(d['transforms'],
                                                d['imputation'], n_files)
    self.transform_set = transforms.ImageTransformSet(trans_i, im, trans_g,
                                                      is_categorical)
def save_intersected_features_and_targets(feature_sets, transform_sets,
                                          targets, config, impute=True):
    """
    This function saves a table of covariate values and the target value
    intersected at each point. It also contains columns for UID 'index'
    and a predicted value.

    If the target shapefile contains an 'index' field, this will be
    used to populate the 'index' column. This is intended to be used as
    a unique ID for each point in post-processing. If no 'index' field
    exists this column will be zero filled.

    The 'prediction' column is for predicted values created during
    cross-validation. Again, this is for post-processing. It will only
    be populated if cross-validation is run later on. If not, it will
    be zero filled.

    Two files will be output:
    .../output_dir/{name_of_config}_rawcovariates.csv
    .../output_dir/{name_of_config}_rawcovariates_mask.csv

    This function will also optionally output intersected covariates
    scatter plot and covariate correlation matrix plot.
    """
    # NOTE(review): the docstring describes an 'index' column, but the code
    # below only emits covariates, optional 'write_to_csv' fields, X, Y,
    # target and a zero-filled prediction column — presumably 'index' is
    # expected to arrive via config.fields_to_write_to_csv. Confirm.

    # Validate requested extra columns against the shapefile records
    # before doing any expensive work.
    if config.fields_to_write_to_csv:
        for f in config.fields_to_write_to_csv:
            if f not in targets.fields:
                raise ValueError(
                    f"write_to_csv field '{f}' does not exist in shapefile records"
                )
    transform_sets_mod = []
    # Covariate column names come from the feature-set keys, followed by
    # the positional/target/prediction columns (extra fields first).
    cov_names = []
    for fs in feature_sets:
        cov_names.extend(fs.keys())
    other_names = ['X', 'Y', 'target', 'prediction']
    if config.fields_to_write_to_csv:
        other_names = config.fields_to_write_to_csv + other_names
    header = ','.join(cov_names + other_names)
    # The mask file only covers the covariate columns.
    mask_header = ','.join(cov_names)
    for t in transform_sets:
        # Keep (a deep copy of) the imputer when imputing, but drop all
        # image/global transforms so raw values are written out.
        imputer = copy.deepcopy(t.imputer) if impute else None
        dummy_transform = transforms.ImageTransformSet(
            image_transforms=None, imputer=imputer, global_transforms=None,
            is_categorical=t.is_categorical)
        transform_sets_mod.append(dummy_transform)
    transformed_vectors = [
        t(c) for c, t in zip(feature_sets, transform_sets_mod)
    ]
    x = np.ma.concatenate(transformed_vectors, axis=1)
    # Gather per-rank pieces onto the root rank; list order follows rank
    # order, so the concatenations below stay aligned across arrays.
    x_all = gather_features(x, node=0)
    all_xy = mpiops.comm.gather(targets.positions, root=0)
    all_targets = mpiops.comm.gather(targets.observations, root=0)
    if config.fields_to_write_to_csv:
        if config.target_search:
            raise NotImplementedError(
                "Can't write 'write_to_csv' columns with target search feature at this time."
            )
        field_values = []
        for f in config.fields_to_write_to_csv:
            field_values.append(mpiops.comm.gather(targets.fields[f]))
    # Only the root rank assembles and writes the output files.
    if mpiops.chunk_index == 0:
        data = [x_all.data]
        if config.fields_to_write_to_csv:
            for f, v in zip(config.fields_to_write_to_csv, field_values):
                data.append(np.atleast_2d(np.ma.concatenate(v, axis=0)).T)
        all_xy = np.ma.concatenate(all_xy, axis=0)
        all_targets = np.ma.concatenate(all_targets, axis=0)
        xy = np.atleast_2d(all_xy)
        t = np.atleast_2d(all_targets).T
        data += [xy, t]
        # Zeros for prediction values
        data.append(np.zeros(t.shape))
        data = np.hstack(data)
        np.savetxt(config.raw_covariates, X=data, fmt='%s', delimiter=',',
                   header=header, comments='')
        # Mask semantics: numpy masked arrays use True == missing, so the
        # inverted mask written here is True == valid value.
        np.savetxt(config.raw_covariates_mask, X=~x_all.mask.astype(bool),
                   fmt='%f', delimiter=',', header=mask_header, comments='')
        if config.plot_intersection:
            diagnostics.plot_covariates_x_targets(config.raw_covariates,
                                                  cols=2).savefig(
                config.plot_intersection)
        if config.plot_correlation:
            diagnostics.plot_covariate_correlation(
                config.raw_covariates).savefig(config.plot_correlation)
def save_intersected_features_and_targets(feature_sets, transform_sets,
                                          targets, config, impute=True):
    """
    Save raw covariate values at the target locations, i.e. after the
    targets have been intersected.

    This will save the following two files if they are provided in the
    config file:

    a) rawcovariates.csv: the covariate values (plus X, Y and target
       columns) in csv
    b) rawcovariates_mask.csv: the corresponding mask in csv

    This function will also optionally output an intersected covariates
    scatter plot and a covariate correlation matrix plot.

    Parameters
    ----------
    feature_sets : list of dict
        Per feature set, a mapping of image path -> extracted values;
        band count is taken from axis 3 of each array.
    transform_sets : list of transforms.ImageTransformSet
        Matching transform sets; only the imputer (optionally) and
        ``is_categorical`` are used, so raw values are saved.
    targets : Targets
        Provides ``positions`` and ``observations`` per MPI rank.
    config : Config
        Provides ``target_property``, ``raw_covariates``,
        ``raw_covariates_mask``, ``plot_intersection`` and
        ``plot_correlation``.
    impute : bool, optional
        If True (default), apply each transform set's imputer to the
        saved values.
    """
    transform_sets_mod = []
    # One column name per band of each image, then X, Y and the target.
    names = ['{}_{}'.format(b, basename(k))
             for ec in feature_sets
             for k in ec
             for b in range(ec[k].shape[3])]
    names += ["X", "Y", config.target_property + "(target)"]
    header = ', '.join(names)
    for t in transform_sets:
        # Keep (a deep copy of) the imputer when imputing, but drop all
        # image/global transforms so raw values are written out.
        imputer = copy.deepcopy(t.imputer) if impute else None
        dummy_transform = transforms.ImageTransformSet(
            image_transforms=None, imputer=imputer, global_transforms=None,
            is_categorical=t.is_categorical)
        transform_sets_mod.append(dummy_transform)
    transformed_vectors = [t(c) for c, t in
                           zip(feature_sets, transform_sets_mod)]
    x = np.ma.concatenate(transformed_vectors, axis=1)
    # Gather per-rank pieces onto the root rank in rank order, keeping
    # covariates, positions and observations aligned.
    x_all = gather_features(x, node=0)
    all_xy = mpiops.comm.gather(targets.positions, root=0)
    all_targets = mpiops.comm.gather(targets.observations, root=0)
    # Only the root rank assembles and writes the output files.
    if mpiops.chunk_index == 0:
        all_xy = np.ma.concatenate(all_xy, axis=0)
        all_targets = np.ma.concatenate(all_targets, axis=0)
        xy = np.atleast_2d(all_xy)
        t = np.atleast_2d(all_targets).T
        data = np.hstack((x_all.data, xy, t))
        np.savetxt(config.raw_covariates, X=data, delimiter=',',
                   header=header, comments='')
        # BUG FIX: the mask previously padded only one zero column
        # (np.zeros_like(t)), so the mask file had two fewer columns than
        # its header claimed — the X and Y names had no matching columns.
        # Pad one always-valid (zero) column for each of X, Y and target.
        mask = np.hstack((x_all.mask.astype(int),
                          np.zeros((t.shape[0], 3), dtype=int)))
        np.savetxt(config.raw_covariates_mask, X=mask, delimiter=',',
                   header=header, comments='')
        if config.plot_intersection:
            diagnostics.plot_covariates_x_targets(
                config.raw_covariates, cols=2).savefig(
                config.plot_intersection)
        if config.plot_correlation:
            diagnostics.plot_covariate_correlation(
                config.raw_covariates).savefig(config.plot_correlation)