def __init__(self, yaml_file: str, cluster=False):
    """
    Populate the configuration from a YAML file.

    Parameters
    ----------
    yaml_file : str
        Path to the YAML configuration file. Its base name, minus the
        final extension, becomes ``self.name`` and prefixes output files.
    cluster : bool
        When True the 'clustering' block is parsed; otherwise the
        'learning' block is parsed.

    Raises
    ------
    KeyError
        If a required block or parameter is missing. The problem is
        logged before the exception propagates.
    """
    def _grp(d, k, msg=None):
        """
        Get required parameter.
        """
        try:
            return d[k]
        except KeyError:
            if msg is None:
                msg = f"Required parameter {k} not present in config."
            _logger.exception(msg)
            raise

    Config._configure_pyyaml()
    # NOTE(review): Config.yaml_loader is defined outside this block -
    # confirm it is a safe loader if config files can be untrusted.
    with open(yaml_file, 'r') as f:
        s = yaml.load(f, Loader=Config.yaml_loader)
    self.name = path.basename(yaml_file).rsplit(".", 1)[0]

    # LEARNING BLOCK
    if not cluster:
        learn_block = _grp(s, 'learning')
        self.clustering = False
        self.cluster_analysis = False
        self.algorithm = _grp(
            learn_block, 'algorithm',
            "'algorithm' must be provided as part of 'learning' block.")
        self.algorithm_args = _grp(
            learn_block, 'arguments',
            "'arguments' must be provided for learning algorithm.")
    # CLUSTERING BLOCK
    else:
        cb = _grp(s, 'clustering',
                  "'clustering' block must be provided when clustering.")
        self.clustering = True
        self.algorithm = cb.get('algorithm', 'kmeans')
        self.n_classes = _grp(
            cb, 'n_classes',
            "'n_classes' must be provided when clustering.")
        self.oversample_factor = _grp(
            cb, 'oversample_factor',
            "'oversample_factor' must be provided when clustering.")
        self.cluster_analysis = cb.get('cluster_analysis', False)
        self.class_file = cb.get('file')
        if self.class_file:
            self.class_property = _grp(
                cb, 'property', "'property' must be provided when "
                "providing a file for semisupervised clustering.")
        self.semi_supervised = self.class_file is not None

    # Set flags based on algorithm being used - these control
    # some special behaviours in the code.
    self.cubist = self.algorithm == 'cubist'
    self.multicubist = self.algorithm == 'multicubist'
    self.multirandomforest = self.algorithm == 'multirandomforest'
    self.krige = self.algorithm == 'krige'

    # PICKLING BLOCK
    pk_block = s.get('pickling')
    # Always defined, including when a pickling block is given for an
    # algorithm that does not need a feature vector.
    self.pk_featurevec = None
    if pk_block:
        self.pk_covariates = pk_block.get('covariates')
        self.pk_targets = pk_block.get('targets')
        # Load from pickle files if covariates and targets exist.
        # bool() keeps pk_load a real flag rather than None or a path.
        self.pk_load = bool(
            self.pk_covariates and os.path.exists(self.pk_covariates)
            and self.pk_targets and os.path.exists(self.pk_targets))
        if self.cubist or self.multicubist:
            self.pk_featurevec = pk_block.get('featurevec')
            # If running multicubist, we also need featurevec to load
            # from pickle files.
            self.pk_load = bool(
                self.pk_load and self.pk_featurevec
                and os.path.exists(self.pk_featurevec))
    else:
        self.pk_load = False
        self.pk_covariates = None
        self.pk_targets = None

    # FEATURES BLOCK
    # Todo: fix get_image_spec so features are optional if using
    # pickled data.
    # if not self.pk_load:
    _logger.warning(
        "'features' are required even when loading from pickled data - this is "
        "a work around for getting image specifications. Needs to be fixed."
    )
    features = _grp(
        s, 'features', "'features' block must be provided when not loading "
        "from pickled data.")
    self.feature_sets = [FeatureSetConfig(f) for f in features]

    # Not yet implemented.
    if 'patchsize' in s:
        _logger.info("Patchsize currently fixed at 0 -- ignoring")
    self.patchsize = 0

    # TARGET BLOCK
    if not self.pk_load:
        tb = _grp(
            s, 'targets', "'targets' block must be provided when not loading from "
            "pickled data.")
        self.target_file = _grp(
            tb, 'file', "'file' needs to be provided when specifying "
            "targets.")
        self.target_property = _grp(
            tb, 'property', "'property needs to be provided when "
            "specifying targets.")
        self.resample = tb.get('resample')

    # FINAL TRANSFORM BLOCK
    ftb = s.get('final_transform')
    if ftb is not None:
        _, im, trans_g = _parse_transform_set(ftb.get('transforms'),
                                              ftb.get('imputation'))
        self.final_transform = transforms.TransformSet(im, trans_g)
    else:
        self.final_transform = None

    # VALIDATION BLOCK
    # Defaults first, so every flag exists even when 'validation' is
    # present but carries no 'k-fold' section.
    self.cross_validate = False
    self.rank_features = False
    self.permutation_importance = False
    self.parallel_validate = False
    vb = s.get('validation')
    if vb:
        self.rank_features = vb.get('feature_rank', False)
        if self.pk_load and self.rank_features:
            _logger.warning(
                "Feature ranking cannot be performed when loading covariates and "
                "targets from pickled data.")
            self.rank_features = False
        self.permutation_importance = \
            vb.get('permutation_importance', False)
        kfb = vb.get('k-fold')
        if kfb:
            self.cross_validate = True
            self.folds = _grp(
                kfb, 'folds', "'folds' (number of folds) must be specified "
                "if k-fold cross validation is being used.")
            self.crossval_seed = _grp(
                kfb, 'random_seed', "'random_seed' must be specified "
                "if k-fold cross validation is being used.")
            self.parallel_validate = kfb.get('parallel', False)

    # OPTIMISATION BLOCK
    # Note: optimisation options get parsed in scripts/gridsearch.py
    self.optimisation = s.get('optimisation')

    # PREDICT BLOCK
    pb = _grp(s, 'prediction', "'prediction' block must be provided.")
    self.geotif_options = pb.get('geotif', {})
    self.quantiles = _grp(
        pb, 'quantiles', "'quantiles' must be provided as part of "
        "prediction block.")
    self.outbands = _grp(
        pb, 'outbands', "'outbands' must be provided as part of prediction "
        "block.")
    self.thumbnails = pb.get('thumbnails', 10)

    mb = s.get('mask')
    if mb:
        mask_file = mb.get('file')
        # A mask whose 'file' is absent or does not exist on disk is
        # ignored. The explicit truthiness test avoids calling
        # os.path.exists(None), which raises TypeError.
        if mask_file and os.path.exists(mask_file):
            self.mask = mask_file
        else:
            self.mask = None
        if self.mask:
            self.retain = _grp(
                mb, 'retain', "'retain' must be provided if providing a "
                "prediction mask.")
    else:
        self.mask = None

    if self.krige:
        # Todo: don't know if lon/lat is compulsory or not for kriging
        self.lon_lat = s.get('lon_lat')
    else:
        self.lon_lat = None

    # OUTPUT BLOCK
    def _outpath(filename):
        # Output files live in the output directory and are prefixed
        # with the config name.
        return os.path.join(self.output_dir, self.name + f'{filename}')

    ob = _grp(s, 'output', "'output' block is required.")
    self.output_dir = _grp(ob, 'directory',
                           "'directory' for output is required.")
    self.model_file = ob.get('model', _outpath('.model'))

    if ob.get('plot_feature_ranks', False):
        self.plot_feature_ranks = _outpath('_featureranks.png')
        self.plot_feature_rank_curves = _outpath('_featurerank_curves.png')
    else:
        self.plot_feature_ranks = None
        # This is the attribute the enabled branch sets; it used to be
        # left undefined here.
        self.plot_feature_rank_curves = None
        # Misspelt name that this branch set historically; kept so any
        # existing reader of it keeps working.
        self.plot_feature_rank = None

    if ob.get('plot_intersection', False):
        self.plot_intersection = _outpath('_intersected.png')
    else:
        self.plot_intersection = None

    if ob.get('plot_real_vs_pred', False):
        self.plot_real_vs_pred = _outpath('_real_vs_pred.png')
        self.plot_residual = _outpath('_residual.png')
    else:
        self.plot_real_vs_pred = None
        self.plot_residual = None

    if ob.get('plot_correlation', False):
        self.plot_correlation = _outpath('_correlation.png')
    else:
        self.plot_correlation = None

    if ob.get('plot_target_scaling', False):
        self.plot_target_scaling = _outpath('_target_scaling.png')
    else:
        self.plot_target_scaling = None

    self.raw_covariates = _outpath('_rawcovariates.csv')
    self.raw_covariates_mask = _outpath('_rawcovariates_mask.csv')
    self.feature_ranks_file = _outpath('_featureranks.json')
    self.crossval_scores_file = _outpath('_crossval_scores.json')
    self.crossval_results_file = _outpath('_crossval_results.csv')
    self.crossval_results_plot = _outpath('_crossval_results.png')
    self.dropped_targets_file = _outpath('_dropped_targets.txt')
    self.transformed_targets_file = _outpath('_transformed_targets.csv')
    self.metadata_file = _outpath('_metadata.txt')
    self.optimisation_results_file = _outpath('_optimisation.csv')
    # '{}' is left in the name as a placeholder for a later .format().
    self.prediction_file = _outpath('_{}.tif')

    # Create the output directory and the model file's directory; an
    # empty directory component (bare file name) is skipped.
    paths = [self.output_dir, os.path.split(self.model_file)[0]]
    for p in paths:
        if p:
            makedirs(p, exist_ok=True)
def __init__(self, yaml_file):
    """
    Populate the configuration from a YAML file.

    The file is read with ``yaml.safe_load``. Most top-level blocks
    ('learning', 'prediction', 'features', 'targets', 'validation',
    'clustering', ...) are optional; the 'output' block with a
    'directory' entry is required, and that directory is created.

    :param yaml_file: path to the YAML configuration file. Its base
        name, minus the final extension, becomes ``self.name``.
    :raises KeyError: if a key that is indexed directly is missing.
    :raises ValueError: if 'resample' is given for the targets without
        'value' or 'spatial' arguments.
    :raises ConfigException: if both 'searchcv_params' and
        'hyperopt_params' are given for optimisation.
    """
    with open(yaml_file, 'r') as f:
        s = yaml.safe_load(f)
    self.name = path.basename(yaml_file).rsplit(".", 1)[0]

    # TODO expose this option when fixed
    if 'patchsize' in s:
        log.info("Patchsize currently fixed at 0 -- ignoring")
    self.patchsize = 0

    self.intersected_features = s.get('intersected_features')

    if 'learning' in s:
        self.algorithm = s['learning']['algorithm']
        self.algorithm_args = s['learning']['arguments']
    else:
        self.algorithm = None

    # Flags for algorithms that need special handling elsewhere.
    self.cubist = self.algorithm == 'cubist'
    self.multicubist = self.algorithm == 'multicubist'
    self.multirandomforest = self.algorithm == 'multirandomforest'
    self.krige = self.algorithm == 'krige'

    if 'prediction' in s:
        prediction = s['prediction']
        self.quantiles = prediction['quantiles']
        self.geotif_options = prediction.get('geotif', {})
        self.outbands = prediction.get('outbands')
        self.thumbnails = prediction.get('thumbnails', 10)

    # A feature entry of type 'pickle' points at cached data rather
    # than describing a feature set.
    if 'features' in s:
        self.pickle = any(d['type'] == 'pickle' for d in s['features'])
    else:
        self.pickle = False

    self.rawcovariates = False
    self.train_data_pk = False
    if self.pickle:
        self.pickle_load = True
        # Build the list of non-pickle entries instead of popping from
        # the list being iterated: popping mid-iteration skips the
        # element after each removed one, so a second 'pickle' entry
        # could be left behind and treated as a feature set.
        remaining = []
        for d in s['features']:
            if d['type'] != 'pickle':
                remaining.append(d)
                continue
            files = d['files']
            if 'covariates' in files:
                self.pickled_covariates = path.abspath(files['covariates'])
            if 'targets' in files:
                self.pickled_targets = files['targets']
            if 'rawcovariates' in files:
                self.rawcovariates = files['rawcovariates']
                self.rawcovariates_mask = files['rawcovariates_mask']
            if 'train_data_pk' in files:
                self.train_data_pk = files['train_data_pk']
            # Cached data is usable only if both files are named and
            # exist. A missing key counts as "not there" rather than
            # raising KeyError.
            covariates = files.get('covariates')
            targets = files.get('targets')
            if not (covariates and path.exists(covariates)
                    and targets and path.exists(targets)):
                self.pickle_load = False
            if self.cubist or self.multicubist:
                # Cubist models also need the cached feature vector.
                featurevec = files.get('featurevec')
                if featurevec:
                    self.featurevec = path.abspath(featurevec)
                if not (featurevec and path.exists(featurevec)):
                    self.pickle_load = False
            self.plot_covariates = files.get('plot_covariates', False)
        s['features'] = remaining
    else:
        self.pickle_load = False

    if 'features' in s:
        self.feature_sets = [FeatureSetConfig(k) for k in s['features']]

    if 'preprocessing' in s:
        final_transform = s['preprocessing']
        # Missing entries are stored as None in the block itself.
        final_transform.setdefault('transforms', None)
        final_transform.setdefault('imputation', None)
        _, im, trans_g = _parse_transform_set(
            final_transform['transforms'], final_transform['imputation'])
        self.final_transform = transforms.TransformSet(im, trans_g)
    else:
        self.final_transform = None

    self.output_dir = s['output']['directory']
    # create output dir if does not exist
    makedirs(self.output_dir, exist_ok=True)

    if 'oos_validation' in s:
        self.oos_validation_file = s['oos_validation']['file']
        self.oos_validation_property = s['oos_validation']['property']

    # Remembered locally so the resampled-output path below can be
    # derived without assuming a 'targets' block was given.
    target_file = None
    if 'targets' in s:
        targets_block = s['targets']
        self.target_file = target_file = targets_block['file']
        if 'property' in targets_block:
            self.target_property = targets_block['property']

        self.resample = None
        if 'resample' in targets_block:
            resample = targets_block['resample']
            self.resample = resample
            self.value_resampling_args = None
            self.spatial_resampling_args = None
            if 'value' in resample:
                self.value_resampling_args = resample['value']
            if 'spatial' in resample:
                self.spatial_resampling_args = resample['spatial']
            if self.value_resampling_args is None \
                    and self.spatial_resampling_args is None:
                raise ValueError("provide at least one of value or spatial args for resampling")

        if 'group_targets' in targets_block:
            group_targets = targets_block['group_targets']
            self.group_targets = True
            self.groups_eps = group_targets['groups_eps']
            self.group_col = group_targets.get('group_col')
        else:
            self.group_targets = False
        self.target_groups_file = path.join(self.output_dir,
                                            'target_groups.jpg')

        if 'weight_col_name' in targets_block:
            self.weighted_model = True
            self.weight_col_name = targets_block['weight_col_name']
        else:
            self.weighted_model = False

        if 'group' in targets_block:
            group = targets_block['group']
            self.output_group_col_name = group['output_group_col_name']
            self.spatial_grouping_args = group.get('spatial', {})
            self.grouping_fields_to_keep = group.get('fields_to_keep', [])
            self.grouped_output = Path(self.output_dir).joinpath(
                s['output']['grouped_shapefile'])

        if 'split' in targets_block:
            split = targets_block['split']
            self.split_group_col_name = split['group_col_name']
            self.split_oos_fraction = split['oos_fraction']
            self.train_shapefile = Path(self.output_dir).joinpath(
                s['output']['train_shapefile'])
            self.oos_shapefile = Path(self.output_dir).joinpath(
                s['output']['oos_shapefile'])

    self.mask = None
    if 'mask' in s:
        self.mask = s['mask']['file']
        self.retain = s['mask']['retain']  # mask areas that are predicted

    if 'pca' in s:
        self.pca = True
        # An empty 'pca:' entry loads as None; treat it as no options.
        pca_block = s['pca'] or {}
        # The first preprocessing transform is expected to be 'whiten'.
        preprocessing_transforms = s['preprocessing']['transforms']
        whiten = preprocessing_transforms[0]['whiten']
        if 'n_components' in whiten:
            self.n_components = whiten['n_components']
        else:
            self.n_components = None
        tif_opts = pca_block.get('geotif')
        self.geotif_options = {} if tif_opts is None else tif_opts
        self.pca_json = path.join(self.output_dir, s['output']['pca_json'])
    else:
        self.pca = False

    self.lon_lat = False
    if 'lon_lat' in s:
        self.lon_lat = True
        self.lat = s['lon_lat']['lat']
        self.lon = s['lon_lat']['lon']

    # TODO pipeline this better
    self.rank_features = False
    self.permutation_importance = False
    self.feature_importance = False
    self.cross_validate = False
    self.parallel_validate = False
    if 'validation' in s:
        # 'validation' is a list mixing plain flag names with a
        # one-key {'k-fold': {...}} mapping.
        for i in s['validation']:
            if i == 'feature_rank':
                self.rank_features = True
            if i == 'permutation_importance':
                self.permutation_importance = True
            if i == 'feature_importance':
                self.feature_importance = True
            if i == 'parallel':
                self.parallel_validate = True
            if isinstance(i, dict) and 'k-fold' in i:
                self.cross_validate = True
                self.folds = i['k-fold']['folds']
                self.crossval_seed = i['k-fold']['random_seed']
                # Parsing stops at the k-fold entry, so flags listed
                # after it are not read.
                break

    if self.rank_features and self.pickle_load:
        self.pickle_load = False
        log.info('Feature ranking does not work with '
                 'pickled files. Pickled files will not be used. '
                 'All covariates will be intersected.')

    self.optimised_model = False
    if 'learning' in s:
        self.hpopt = False
        self.skopt = False
        if 'optimisation' in s['learning']:
            optimisation = s['learning']['optimisation']
            if 'searchcv_params' in optimisation:
                self.opt_searchcv_params = optimisation['searchcv_params']
                self.opt_params_space = optimisation['params_space']
                self.skopt = True
            if 'hyperopt_params' in optimisation:
                self.hyperopt_params = optimisation['hyperopt_params']
                self.hp_params_space = optimisation['hp_params_space']
                self.hpopt = True
            if self.skopt and self.hpopt:
                raise ConfigException("Only one of searchcv_params or hyperopt_params can be specified")

    self.cluster_analysis = False
    self.clustering = False
    if 'clustering' in s:
        clustering = s['clustering']
        self.clustering = True
        self.clustering_algorithm = clustering['algorithm']
        cluster_args = clustering['arguments']
        self.n_classes = cluster_args['n_classes']
        self.oversample_factor = cluster_args['oversample_factor']
        if 'file' in clustering and clustering['file']:
            self.semi_supervised = True
            self.class_file = clustering['file']
            self.class_property = clustering['property']
        else:
            self.semi_supervised = False
        if 'cluster_analysis' in clustering:
            self.cluster_analysis = clustering['cluster_analysis']

    # Output artefacts are all placed in the output directory.
    out_dir = Path(self.output_dir)
    output_model = s['output']['model'] if 'model' in s['output'] \
        else self.name + ('.cluster' if self.clustering else '.model')
    self.model_file = out_dir.joinpath(output_model)
    # The resampled shapefile is named after the target file, so there
    # is nothing to name when no targets were configured.
    if target_file is not None:
        self.resampled_output = out_dir.joinpath(
            Path(target_file).stem + '_resampled.shp')
    else:
        self.resampled_output = None
    self.optimisation_output_skopt = out_dir.joinpath(
        self.name + '_optimisation_skopt.csv')
    self.optimisation_output_hpopt = out_dir.joinpath(
        self.name + '_optimisation_hpopt.csv')
    self.optimised_model_params = out_dir.joinpath(
        self.name + "_optimised_params.json")
    self.optimised_model_file = out_dir.joinpath(
        self.name + "_optimised.model")
    # NOTE(review): the next two attributes resolve to the same path -
    # confirm this is intentional.
    self.outfile_scores = out_dir.joinpath(
        self.name + "_optimised_scores.json")
    self.optimised_model_scores = out_dir.joinpath(
        self.name + "_optimised_scores.json")
def __init__(self, yaml_file, clustering=False, learning=False,
             resampling=False, predicting=False, shiftmap=True):
    """
    Populate the configuration from a YAML file.

    Parameters
    ----------
    yaml_file : str
        Path to the YAML configuration file. Its base name, minus the
        final extension, becomes ``self.name`` and prefixes output files.
    clustering : bool
        Parse the 'clustering' block. Takes precedence over `learning`.
    learning : bool
        Parse the 'learning' block.
    resampling : bool
        When True the 'features' block is not read.
    predicting : bool
        Parse the 'prediction' and 'mask' blocks.
    shiftmap : bool
        When True (the default) the 'targets' block is always parsed.

    Raises
    ------
    KeyError
        If a required block or parameter is missing (logged first).
    UnicodeDecodeError
        If the file cannot be decoded as YAML text (logged first).
    FileNotFoundError
        If the target file or the mask file does not exist.
    ValueError
        If tabular and image features are mixed, or a 'resample'
        block has neither 'spatial' nor 'value' parameters.
    """
    def _grp(d, k, msg=None):
        """
        Get required parameter.
        """
        try:
            return d[k]
        except KeyError:
            if msg is None:
                msg = f"Required parameter {k} not present in config."
            _logger.exception(msg)
            raise

    Config._configure_pyyaml()
    # NOTE(review): Config.yaml_loader is defined outside this block -
    # confirm it is a safe loader if config files can be untrusted.
    with open(yaml_file, 'r') as f:
        try:
            s = yaml.load(f, Loader=Config.yaml_loader)
        except UnicodeDecodeError:
            if yaml_file.endswith('.model'):
                _logger.error(
                    "You're attempting to run uncoverml but have provided the "
                    "'.model' file instead of the '.yaml' config file. The predict "
                    "now requires the configuration file and not the model. Please "
                    "try rerunning the command with the configuration file."
                )
            else:
                _logger.error(
                    "Couldn't parse the yaml file. Ensure you've provided the correct "
                    "file as config file and that the YAML is valid.")
            # Nothing below can run without a parsed config. Re-raise
            # so the caller sees the real cause rather than an
            # UnboundLocalError on `s` further down.
            raise

    self.name = path.basename(yaml_file).rsplit(".", 1)[0]

    if clustering:
        # CLUSTERING BLOCK
        cb = _grp(s, 'clustering',
                  "'clustering' block must be provided when clustering.")
        self.clustering = True
        self.algorithm = cb.get('algorithm', 'kmeans')
        self.n_classes = _grp(
            cb, 'n_classes',
            "'n_classes' must be provided when clustering.")
        self.oversample_factor = _grp(
            cb, 'oversample_factor',
            "'oversample_factor' must be provided when clustering.")
        self.cluster_analysis = cb.get('cluster_analysis', False)
        self.class_file = cb.get('file')
        if self.class_file:
            self.class_property = _grp(
                cb, 'property', "'property' must be provided when "
                "providing a file for semisupervised clustering.")
        self.semi_supervised = self.class_file is not None
    elif learning:
        # LEARNING BLOCK
        learn_block = _grp(s, 'learning')
        self.clustering = False
        self.cluster_analysis = False
        self.target_search = learn_block.get('target_search', False)
        self.targetsearch_threshold = learn_block.get(
            'target_search_threshold', 0.8)
        tsexb = learn_block.get('target_search_extents')
        self.targetsearch_extents, self.tse_are_pixel_coordinates = \
            Config.parse_extents(tsexb)
        self.algorithm = _grp(
            learn_block, 'algorithm',
            "'algorithm' must be provided as part of 'learning' block.")
        self.algorithm_args = learn_block.get('arguments', {})
    else:
        self.bootstrap = False
        self.algorithm = None
        self.clustering = False
        self.cluster_analysis = False
        self.target_search = False

    # Defined outside this block; presumably derives the per-algorithm
    # flags (self.cubist, self.krige, ...) read below - verify.
    self.set_algo_flags()

    # EXTENTS
    exb = s.get('extents')
    self.extents, self.extents_are_pixel_coordinates = \
        Config.parse_extents(exb)
    _logger.debug("loaded crop box %s", self.extents)

    # PICKLING BLOCK
    pk_block = s.get('pickling')
    # Always defined, including when a pickling block is given for an
    # algorithm that does not need a feature vector.
    self.pk_featurevec = None
    if pk_block:
        self.pk_covariates = pk_block.get('covariates')
        self.pk_targets = pk_block.get('targets')
        # Load from pickle files if covariates and targets exist.
        # bool() keeps pk_load a real flag rather than None or a path.
        self.pk_load = bool(
            self.pk_covariates and os.path.exists(self.pk_covariates)
            and self.pk_targets and os.path.exists(self.pk_targets))
        if self.cubist or self.multicubist:
            self.pk_featurevec = pk_block.get('featurevec')
            # If running multicubist, we also need featurevec to load
            # from pickle files.
            self.pk_load = bool(
                self.pk_load and self.pk_featurevec
                and os.path.exists(self.pk_featurevec))
    else:
        self.pk_load = False
        self.pk_covariates = None
        self.pk_targets = None

    # FEATURES BLOCK
    # Todo: fix get_image_spec so features are optional if using
    # pickled data.
    # if not self.pk_load:
    if not resampling:
        _logger.warning(
            "'features' are required even when loading from pickled data - this "
            "is a work around for getting image specifications. Needs to be fixed."
        )
        features = _grp(
            s, 'features', "'features' block must be provided when not loading "
            "from pickled data.")
        # Logged rather than printed, like every other message here.
        _logger.info("loading features")
        self.feature_sets = [FeatureSetConfig(f) for f in features]
        # Mixing tabular and image features not currently supported
        if any(f.tabular for f in self.feature_sets):
            self.tabular_prediction = True
            if not all(f.tabular for f in self.feature_sets):
                raise ValueError(
                    "Mixing tabular and image features not currently supported. Ensure "
                    "features are only sourced from 'files' or 'table' but not both."
                )
        else:
            self.tabular_prediction = False

    # Not yet implemented.
    if 'patchsize' in s:
        _logger.info("Patchsize currently fixed at 0 -- ignoring")
    self.patchsize = 0

    # TARGET BLOCK
    # With the default shiftmap=True this condition always holds.
    if (not predicting and not clustering) or shiftmap:
        tb = _grp(
            s, 'targets', "'targets' block must be provided when not loading from "
            "pickled data.")
        self.target_file = _grp(
            tb, 'file', "'file' needs to be provided when specifying "
            "targets.")
        if not path.exists(self.target_file):
            raise FileNotFoundError(
                "Target shapefile provided in config does not exist. Check "
                "that the 'file' property of the 'targets' block is correct."
            )
        self.target_property = _grp(
            tb, 'property', "'property needs to be provided when "
            "specifying targets.")
        self.target_drop_values = tb.get('drop', None)
        self.target_weight_property = tb.get('weight_property')
        self.fields_to_write_to_csv = tb.get('write_to_csv')
        self.shiftmap_targets = tb.get('shiftmap')
        rb = tb.get('resample')
        if rb:
            self.spatial_resampling_args = rb.get('spatial')
            self.value_resampling_args = rb.get('value')
            if not (self.spatial_resampling_args
                    or self.value_resampling_args):
                raise ValueError(
                    "At least one of 'spatial' or 'value' resampling parameters "
                    "must be provided when resampling.")

    # FINAL TRANSFORM BLOCK
    ftb = s.get('final_transform')
    if ftb is not None:
        _, im, trans_g = _parse_transform_set(ftb.get('transforms'),
                                              ftb.get('imputation'))
        self.final_transform = transforms.TransformSet(im, trans_g)
    else:
        self.final_transform = None

    # VALIDATION BLOCK
    vb = s.get('validation')
    if vb:
        oos = vb.get('out_of_sample')
        if oos:
            self.oos_percentage = oos.get('percentage', None)
            self.oos_shapefile = oos.get('shapefile', None)
            self.oos_property = oos.get('property', None)
        self.out_of_sample_validation = oos is not None
        self.rank_features = vb.get('feature_rank', False)
        if self.pk_load and self.rank_features:
            _logger.warning(
                "Feature ranking cannot be performed when loading covariates and "
                "targets from pickled data.")
            self.rank_features = False
        self.permutation_importance = vb.get('permutation_importance',
                                             False)
        kfb = vb.get('k-fold')
        if kfb:
            self.folds = _grp(
                kfb, 'folds', "'folds' (number of folds) must be specified "
                "if k-fold cross validation and/or feature ranking is being used."
            )
            self.crossval_seed = _grp(
                kfb, 'random_seed', "'random_seed' must be specified "
                "if k-fold cross validation and/or feature ranking is "
                "being used.")
            self.parallel_validate = kfb.get('parallel', False)
        elif self.rank_features:
            # Feature ranking requires crossval params. Provide
            # defaults if not available.
            self.folds = 5
            self.crossval_seed = 1
            self.parallel_validate = False
        else:
            # Keep the flag defined when neither k-fold nor feature
            # ranking is configured.
            self.parallel_validate = False
        self.cross_validate = kfb is not None
    else:
        self.rank_features = False
        self.permutation_importance = False
        self.parallel_validate = False
        self.out_of_sample_validation = False
        self.cross_validate = False

    # OPTIMISATION BLOCK
    # Note: optimisation options get parsed in scripts/gridsearch.py
    self.optimisation = s.get('optimisation')

    # PREDICT BLOCK
    # The mask belongs to prediction; default it so the attribute
    # exists whichever mode is running.
    self.mask = None
    if predicting:
        pb = _grp(s, 'prediction', "'prediction' block must be provided.")
        self.geotif_options = pb.get('geotif', {})
        self.quantiles = _grp(
            pb, 'quantiles', "'quantiles' must be provided as part of "
            "prediction block.")
        self.outbands = _grp(
            pb, 'outbands', "'outbands' must be provided as part of prediction "
            "block.")
        self.thumbnails = pb.get('thumbnails', 10)
        self.bootstrap_predictions = pb.get('bootstrap')

        mb = s.get('mask')
        if mb:
            self.mask = mb.get('file')
            # The truthiness test also covers a mask block with no
            # 'file' entry (os.path.exists(None) raises TypeError).
            if not self.mask or not os.path.exists(self.mask):
                raise FileNotFoundError(
                    "Mask file provided in config does not exist. Check that "
                    "the 'file' property of the 'mask' block is correct.")
            self.retain = _grp(
                mb, 'retain', "'retain' must be provided if providing a "
                "prediction mask.")

    if self.krige:
        # Todo: don't know if lon/lat is compulsory or not for kriging
        self.lon_lat = s.get('lon_lat')
    else:
        self.lon_lat = None

    # OUTPUT BLOCK
    def _outpath(filename):
        # Output files live in the output directory and are prefixed
        # with the config name.
        return os.path.join(self.output_dir, self.name + f'{filename}')

    ob = _grp(s, 'output', "'output' block is required.")
    self.output_dir = _grp(ob, 'directory',
                           "'directory' for output is required.")
    self.model_file = ob.get('model', _outpath('.model'))

    if ob.get('plot_feature_ranks', False):
        self.plot_feature_ranks = _outpath('_featureranks.png')
        self.plot_feature_rank_curves = _outpath('_featurerank_curves.png')
    else:
        self.plot_feature_ranks = None
        # This is the attribute the enabled branch sets; it used to be
        # left undefined here.
        self.plot_feature_rank_curves = None
        # Misspelt name that this branch set historically; kept so any
        # existing reader of it keeps working.
        self.plot_feature_rank = None

    if ob.get('plot_intersection', False):
        self.plot_intersection = _outpath('_intersected.png')
    else:
        self.plot_intersection = None

    if ob.get('plot_real_vs_pred', False):
        self.plot_real_vs_pred = _outpath('_real_vs_pred.png')
        self.plot_residual = _outpath('_residual.png')
    else:
        self.plot_real_vs_pred = None
        self.plot_residual = None

    if ob.get('plot_correlation', False):
        self.plot_correlation = _outpath('_correlation.png')
    else:
        self.plot_correlation = None

    if ob.get('plot_target_scaling', False):
        self.plot_target_scaling = _outpath('_target_scaling.png')
    else:
        self.plot_target_scaling = None

    # Names containing '{}' are templates filled in later by callers.
    self.raw_covariates = _outpath('_rawcovariates.csv')
    self.raw_covariates_mask = _outpath('_rawcovariates_mask.csv')
    self.feature_ranks_file = _outpath('_featureranks.json')
    self.crossval_scores_file = _outpath('_crossval_scores.json')
    self.crossval_results_file = _outpath('_crossval_results.csv')
    self.crossval_results_plot = _outpath('_crossval_results.png')
    self.oos_scores_file = _outpath('_oos_scores.json')
    self.oos_results_file = _outpath('_oos_results.csv')
    self.oos_targets_file = _outpath('_oos_targets.shp')
    self.dropped_targets_file = _outpath('_dropped_targets.txt')
    self.transformed_targets_file = _outpath('_transformed_targets.csv')
    self.metadata_file = _outpath('_metadata.txt')
    self.optimisation_results_file = _outpath('_optimisation.csv')
    self.prediction_file = _outpath('_{}.tif')
    self.prediction_shapefile = _outpath('_prediction')
    self.prediction_prjfile = _outpath('_prediction.prj')
    self.shiftmap_file = _outpath('_shiftmap_{}.tif')
    self.shiftmap_points = _outpath('_shiftmap_generated_points.csv')
    self.targetsearch_generated_points = _outpath(
        '_targetsearch_generated_points.csv')
    self.targetsearch_likelihood = _outpath('_targetsearch_likelihood.csv')
    self.targetsearch_result_data = _outpath('_targetsearch_result.pk')
    self.resampled_shapefile_dir = os.path.join(self.output_dir,
                                                '{}_resampled')

    # Create the output directory and the model file's directory; an
    # empty directory component (bare file name) is skipped.
    paths = [self.output_dir, os.path.split(self.model_file)[0]]
    for p in paths:
        if p:
            makedirs(p, exist_ok=True)

    self._tmpdir = None
def __init__(self, yaml_file):
    """
    Populate the configuration from a YAML file.

    The 'learning', 'prediction', 'features', 'targets' and 'output'
    blocks are indexed directly and therefore required; 'preprocessing',
    'mask', 'lon_lat', 'validation', 'optimisation' and 'clustering'
    are optional. The output directory is created if necessary.

    :param yaml_file: path to the YAML configuration file. Its base
        name, minus the final extension, becomes ``self.name``.
    :raises KeyError: if a required block or key is missing.
    """
    with open(yaml_file, 'r') as f:
        # safe_load instead of a bare yaml.load(f): the Loader argument
        # is mandatory in current PyYAML, and the default loader of
        # older releases could construct arbitrary Python objects.
        s = yaml.safe_load(f)

    self.name = path.basename(yaml_file).rsplit(".", 1)[0]

    # TODO expose this option when fixed
    if 'patchsize' in s:
        log.info("Patchsize currently fixed at 0 -- ignoring")
    self.patchsize = 0

    self.algorithm = s['learning']['algorithm']
    # Flags for algorithms that need special handling elsewhere.
    self.cubist = self.algorithm == 'cubist'
    self.multicubist = self.algorithm == 'multicubist'
    self.multirandomforest = self.algorithm == 'multirandomforest'
    self.krige = self.algorithm == 'krige'
    self.algorithm_args = s['learning']['arguments']

    prediction = s['prediction']
    self.quantiles = prediction['quantiles']
    self.outbands = prediction.get('outbands')
    self.thumbnails = prediction.get('thumbnails')

    # A feature entry of type 'pickle' points at cached data rather
    # than describing a feature set.
    self.pickle = any(d['type'] == 'pickle' for d in s['features'])

    self.rawcovariates = False
    if self.pickle:
        self.pickle_load = True
        # Build the list of non-pickle entries instead of popping from
        # the list being iterated: popping mid-iteration skips the
        # element after each removed one, so a second 'pickle' entry
        # could be left behind and treated as a feature set.
        remaining = []
        for d in s['features']:
            if d['type'] != 'pickle':
                remaining.append(d)
                continue
            files = d['files']
            self.pickled_covariates = path.abspath(files['covariates'])
            self.pickled_targets = files['targets']
            if 'rawcovariates' in files:
                self.rawcovariates = files['rawcovariates']
                self.rawcovariates_mask = files['rawcovariates_mask']
            # Cached data is usable only if both files exist.
            if not (path.exists(files['covariates'])
                    and path.exists(files['targets'])):
                self.pickle_load = False
            if self.cubist or self.multicubist:
                # Cubist models also need the cached feature vector.
                self.featurevec = path.abspath(files['featurevec'])
                if not path.exists(files['featurevec']):
                    self.pickle_load = False
            self.plot_covariates = files.get('plot_covariates', False)
        s['features'] = remaining
    else:
        self.pickle_load = False

    if not self.pickle_load:
        log.info('One or both pickled files were not '
                 'found. All targets will be intersected.')

    self.feature_sets = [FeatureSetConfig(k) for k in s['features']]

    if 'preprocessing' in s:
        final_transform = s['preprocessing']
        _, im, trans_g = _parse_transform_set(
            final_transform['transforms'], final_transform['imputation'])
        self.final_transform = transforms.TransformSet(im, trans_g)
    else:
        self.final_transform = None

    targets = s['targets']
    self.target_file = targets['file']
    self.target_property = targets['property']
    self.resample = targets.get('resample')

    self.mask = None
    if 'mask' in s:
        self.mask = s['mask']['file']
        self.retain = s['mask']['retain']  # mask areas that are predicted

    self.lon_lat = False
    if 'lon_lat' in s:
        self.lon_lat = True
        self.lat = s['lon_lat']['lat']
        self.lon = s['lon_lat']['lon']

    # TODO pipeline this better
    self.rank_features = False
    self.cross_validate = False
    self.parallel_validate = False
    # .get() so that a config with no 'validation' block simply leaves
    # every validation flag off instead of raising KeyError.
    validation = s.get('validation')
    if validation:
        # 'validation' is a list mixing plain flag names with a
        # one-key {'k-fold': {...}} mapping.
        for i in validation:
            if i == 'feature_rank':
                self.rank_features = True
            if i == 'parallel':
                self.parallel_validate = True
            if isinstance(i, dict) and 'k-fold' in i:
                self.cross_validate = True
                self.folds = i['k-fold']['folds']
                self.crossval_seed = i['k-fold']['random_seed']
                # Parsing stops at the k-fold entry, so flags listed
                # after it are not read.
                break

    if self.rank_features and self.pickle_load:
        self.pickle_load = False
        log.info('Feature ranking does not work with '
                 'pickled files. Pickled files will not be used. '
                 'All covariates will be intersected.')

    self.output_dir = s['output']['directory']
    # create output dir if does not exist
    makedirs(self.output_dir, exist_ok=True)

    if 'optimisation' in s:
        self.optimisation = s['optimisation']
        if 'optimisation_output' in self.optimisation:
            self.optimisation_output = \
                self.optimisation['optimisation_output']

    self.cluster_analysis = False
    if 'clustering' in s:
        clustering = s['clustering']
        self.clustering_algorithm = clustering['algorithm']
        cluster_args = clustering['arguments']
        self.n_classes = cluster_args['n_classes']
        self.oversample_factor = cluster_args['oversample_factor']
        if 'file' in clustering and clustering['file']:
            self.semi_supervised = True
            self.class_file = clustering['file']
            self.class_property = clustering['property']
        else:
            self.semi_supervised = False
        if 'cluster_analysis' in clustering:
            self.cluster_analysis = clustering['cluster_analysis']