Exemple #1
0
    def _validate_run(self):
        '''Validate config's "pipeline"'''

        self.run = run = self.config.get('run', []) or []
        if not run or not isinstance(run, (tuple, list)):
            raise ElmConfigError('Expected a "pipeline" list of action '
                                 'dicts in config but found '
                                 '"pipeline": {}'.format(repr(run)))
        for idx, action in enumerate(run):
            if not action or not isinstance(action, dict):
                raise ElmConfigError('Expected each item in "pipeline" to '
                                     'be a dict but found {}'.format(action))
            if not 'pipeline' in action:
                raise ElmConfigError(
                    'Expected a pipeline key in pipeline action {}'.format(
                        action))
            pipeline = action['pipeline']
            if not isinstance(pipeline, (list, tuple)):
                pipeline = self.pipelines[pipeline]
                action['pipeline'] = pipeline
            data_source = action.get('data_source') or ''
            if not data_source in self.data_sources:
                raise ElmConfigError(
                    'Expected a data_source key in pipeline action {}'.format(
                        action))
Exemple #2
0
    def _validate_train_or_transform_funcs(self, name, t):
        '''Validate functions given in "train" section of config'''
        if not isinstance(t, dict):
            raise ElmConfigError('In train:{} expected a dict '
                                 'but found {}'.format(name, t))
        training_funcs = (('model_init_class', True),
                           ('get_y_func', False),
                           ('get_weight_func', False),
                           )

        has_fit_func = False
        has_funcs = {}
        sel = t.get('model_selection')
        no_selection = not sel
        for f, required in training_funcs:
            cls_or_func = self._validate_custom_callable(t.get(f), required,
                                           'train:{} - {}'.format(name, f))
            has_funcs[f] = bool(cls_or_func)
            if f == 'model_init_class':
                model_init_class = cls_or_func
                fit_func = getattr(model_init_class, 'partial_fit', getattr(model_init_class, 'fit', None))
                if fit_func is None:
                    raise ElmConfigError('model_init_class {} '
                                         'does not have "fit" or "partial_fit" method'.format(t.get('model_init_class')))
                fargs, fkwargs,var_keyword = get_args_kwargs_defaults(cls_or_func.fit)
        requires_y = any(x.lower() == 'y' for x in fargs)
        if not fkwargs.get('sample_weight') and has_funcs['get_weight_func']:
            raise ElmConfigError('train:{} - {} does not support a '
                                 '"sample_weight" (sample_weights were implied '
                                 'giving "get_sample_weight" '
                                 'function {}'.format(name, model_init_class, t['get_sample_weight']))
        return has_fit_func, requires_y, no_selection
Exemple #3
0
    def _validate_band_specs(self, band_specs, name):
        '''Validate "band_specs"'''

        if all(isinstance(bs, BandSpec) for bs in band_specs):
            return band_specs
        if not band_specs or not isinstance(band_specs, (tuple, list)):
            raise ElmConfigError(
                'data_sources:{} gave band_specs which are not a '
                'list {}'.format(name, band_specs))
        if not all(isinstance(bs, (dict, string_types)) for bs in band_specs):
            raise ElmConfigError(
                'Expected "band_specs" to be a list of dicts or list of strings'
            )

        new_band_specs = []
        field_names = [x.name for x in attr.fields(BandSpec)]
        for band_spec in band_specs:
            if isinstance(band_spec, string_types):
                new_band_specs.append(
                    BandSpec(
                        **{
                            'search_key': 'sub_dataset_name',
                            'search_value': band_spec,
                            'name': band_spec
                        }))
            elif not all(k in field_names for k in band_spec):
                raise ElmConfigError(
                    "band_spec {} did not have keys: {}".format(
                        band_spec, field_names))
            else:
                new_band_specs.append(BandSpec(**band_spec))
        return new_band_specs
Exemple #4
0
    def _validate_band_specs(self, band_specs, name):
        '''Validate "band_specs"'''
        from elm.readers.util import BandSpec
        if all(isinstance(bs, BandSpec) for bs in band_specs):
            return band_specs
        if not band_specs or not isinstance(band_specs, (tuple, list)):
            raise ElmConfigError(
                'data_sources:{} gave band_specs which are not a '
                'list {}'.format(name, band_specs))
        if not all(isinstance(bs, (dict, str)) for bs in band_specs):
            raise ElmConfigError(
                'Expected "band_specs" to be a list of dicts or list of strings'
            )

        new_band_specs = []
        for band_spec in band_specs:
            if isinstance(band_spec, str):
                new_band_specs.append(
                    BandSpec(
                        **{
                            'search_key': 'sub_dataset_name',
                            'search_value': band_spec,
                            'name': band_spec
                        }))
            elif not all(k.name in band_spec for k in attr.fields(BandSpec)
                         if not k.default == attr.NOTHING):
                raise ElmConfigError(
                    "band_spec {} did not have keys: {}".format(
                        band_spec, attr.fields(BandSpec)))
            else:
                new_band_specs.append(BandSpec(**band_spec))
        return new_band_specs
Exemple #5
0
    def __init__(self, config_file_name=None, config=None, cmd_args=None):
        '''Parses an elm config dictionary or yaml file

        Params:
            config_file_name: file name of a yaml elm config
            config:           elm config dict

        Returns:
            ConfigParser instance
        '''

        if not config:
            if not config_file_name or not os.path.exists(config_file_name):
                raise ElmConfigError('config_file_name {} does not '
                                       'exist'.format(config_file_name))
            with open(config_file_name) as f:
                self.raw_config = yaml.safe_load(f.read())
        elif config:
            self.raw_config = copy.deepcopy(config)
        else:
            raise ElmConfigError('ConfigParser expects either '
                                   'config_file_name or config '
                                   '(dict) as keyword arguments')
        self.config = copy.deepcopy(self.raw_config)
        self._update_for_env()
        self._update_for_cmd_args(cmd_args=cmd_args)
        self.validate()
Exemple #6
0
    def _validate_one_data_source(self, name, ds):
        '''Validate one data source within "data_sources"
        section of config'''

        if not name or not isinstance(name, string_types):
            raise ElmConfigError('Expected a "name" key in {}'.format(d))
        sampler = ds.get('sampler')
        if sampler:
            self._validate_custom_callable(sampler,
                                    True,
                                    'train:{} sampler'.format(name))
        if 'layer_specs' in ds:
            ds['layer_specs'] = self._validate_layer_specs(ds.get('layer_specs'), name)
        reader = ds.get('reader')
        reader_words = ('hdf4', 'hdf5', 'tif', 'netcdf')
        if not reader:
            ds['reader'] = None
        if reader and not reader in self.readers and reader not in reader_words:
            raise ElmConfigError('Data source config dict {} '
                                 'refers to a "reader" {} that is not defined in '
                                 '"readers" or in the tuple {}'.format(reader, self.readers, reader_words))
        s = ds.get('args_list')
        if s and isinstance(s, (list, tuple)):
            def anon(*args, **kwargs):
                return s
            ds['args_list'] = anon
        elif s:
            try:
                ds['args_list'] = import_callable(s)
            except Exception as e:
                raise ElmConfigError('data_source:{} uses a args_list {} that '
                                     'is neither importable nor in '
                                     '"args_list" dict'.format(name, s))
        self._validate_selection_kwargs(ds, name)
        self.data_sources[name] = ds
Exemple #7
0
 def _validate_pipelines(self):
     '''Validate the "pipelines" section of config'''
     from elm.sample_util.change_coords import CHANGE_COORDS_ACTIONS
     self.pipelines = self.config.get('pipelines') or {}
     self._validate_type(self.pipelines, 'pipelines', dict)
     for k, v in self.pipelines.items():
         msg = 'pipelines:{}'.format(k)
         self._validate_type(v, msg, (list, tuple))
         for item in v:
             self._validate_type(item, msg + ' - {}'.format(item), dict)
             ok = False
             ok_words = SAMPLE_PIPELINE_ACTIONS + CHANGE_COORDS_ACTIONS
             for k in ok_words:
                 if k in item:
                     ok = True
                     match = ok_words.index(k)
                     match = ok_words[match]
                     if match in CHANGE_COORDS_ACTIONS:
                         val = item[match]
                         if match == 'change_coords':
                             self._validate_custom_callable(
                                 item[match], True,
                                 'pipelines:{} - {} ({})'.format(
                                     k, match, item[match]))
                         if match == 'flatten':
                             self._validate_type(val,
                                                 'flatten:{}'.format(val),
                                                 str)
                             if not val == 'C':
                                 raise ElmConfigError(
                                     'Expected flatten:{} to be "C" for flatten order (more options may be there over time)'
                                     .format(val))
                     elif not item[k] in (self.config.get(match)
                                          or {}) and k != 'pipeline':
                         raise ElmConfigError(
                             'pipeline item {0} is of type '
                             '{1} and refers to a key ({2}) that is not in '
                             '"{1}" dict of config'.format(
                                 item, match, item[k]))
             if not ok:
                 raise ElmConfigError('pipeline item {} does not '
                                      'have a key that is in the set {}'
                                      ''.format(item,
                                                SAMPLE_PIPELINE_ACTIONS))
     p = (self.config.get('pipeline') or [])
     self._validate_type(p, 'pipeline', (list, tuple))
     for step in p:
         self._validate_type(step, 'pipeline: {}'.format(step), dict)
Exemple #8
0
    def _validate_readers(self):
        '''Validate the "readers" section of config'''
        err_msg = "Expected a 'readers' dictionary in config"
        self.readers = readers = self.config.get('readers') or {}
        if not isinstance(readers, dict):
            raise ElmConfigError(err_msg)
        for k, v in readers.items():
            if not v or not isinstance(v, dict):
                raise ElmConfigError('In readers:{} expected {} to be '
                                       'a dict'.format(k, v))

            load = v.get('load_array')
            meta = v.get('load_meta')
            self._validate_custom_callable(load, True, 'readers:{} load_array'.format(k))
            self._validate_custom_callable(meta, True, 'readers:{} load_meta'.format(k))
            self.readers[k] = v
Exemple #9
0
 def _validate_transform(self):
     '''Validate the "transform" section of config'''
     self.transform = self.config.get('transform') or {}
     for name, t in self.transform.items():
         self._validate_type(t, 'transform:{}'.format(name), dict)
         if not 'model_init_class' in t:
             raise ElmConfigError('Expected {} to have at least "model_init_class" and possibly "model_init_kwargs" in each transform dict')
Exemple #10
0
 def _validate_selection_kwargs(self, data_source, name):
     '''Validate the "selection_kwargs" related to
     sample pre-processing'''
     selection_kwargs = data_source  # TODO renaming needed further
     if not selection_kwargs:
         return
     selection_kwargs['geo_filters'] = selection_kwargs.get(
         'geo_filters', {}) or {}
     for poly_field in ('include_polys', 'exclude_polys'):
         pf = selection_kwargs['geo_filters'].get(poly_field, []) or []
         for item in pf:
             if not item in self.polys:
                 raise ElmConfigError(
                     'config\'s data_source dict {} '
                     '"include_polys" or "exclude_poly" '
                     'must refer to a list of keys from config\'s '
                     '"polys"'.format(self.selection_kwargs))
     for filter_name in ('data_filter', 'metadata_filter',
                         'filename_filter'):
         f = selection_kwargs.get(filter_name, {})
         if f:
             self._validate_custom_callable(
                 f, True, 'data_source:{} - {}'.format(name, filter_name))
         else:
             selection_kwargs[filter_name] = None
     self.data_sources[name] = selection_kwargs
Exemple #11
0
    def _update_for_env(self):
        '''Update the config based on environment vars'''
        self._env = parse_env_vars()
        self._interpolate_env_vars(self._env)
        updates = {}
        for str_var in ENVIRONMENT_VARS_SPEC['str_fields_specs']:
            choices = str_var.get('choices', [])
            val = self._env.get(str_var['name'], None)
            if choices and val not in choices and str_var.get('required'):
                raise ElmConfigError('Expected config key or env '
                                     'var {} to be in '
                                     '{} but got {}'.format(
                                         str_var['name'], choices, val))

            if val:
                updates[str_var['name']] = val

        for int_var in ENVIRONMENT_VARS_SPEC['int_fields_specs']:
            val = getattr(self, int_var['name'], None)
            if val:
                updates[int_var['name']] = int(val)

        for k, v in updates.items():
            self.config[k] = v
            setattr(self, k, v)
Exemple #12
0
 def _validate_one_model_scoring(self, key, value):
     '''Validate one model_scoring key-value (one scoring configuration)'''
     # TODO Note PR 192 deleted the elm.model_selection.metrics
     # module - this section needs to be fixed for the
     # missing import and other Elm PR 192 changes
     # from elm.model_selection.metrics import METRICS
     scoring = value.get('scoring')
     if scoring in METRICS:
         context = 'model_scoring:{}'.format(key)
         # TODO more validation of custom scorers?
         scoring_agg = value.get('scoring_agg')
         if scoring_agg:
             self._validate_custom_callable(scoring_agg, True, context + '(scoring_agg)')
         greater_is_better = value.get('greater_is_better') or None
         score_weights = value.get('score_weights') or None
         err_msg = 'In {}, expected either one of "greater_is_better", "score_weights"'
         if greater_is_better is not None:
             self._validate_type(greater_is_better, context + '(greater_is_better)', bool)
             self.config['model_scoring'][key]['score_weights'] = [-1 if not greater_is_better else 1]
         if score_weights is not None:
             self._validate_type(score_weights, context + '(score_weights)', Iterable)
         if greater_is_better is None and score_weights is None:
             raise ElmConfigError(err_msg)
     else:
         # I think there is little validation
         # that can be done?
         pass
Exemple #13
0
 def _validate_data_sources(self):
     '''Validate all "data_sources" of config'''
     self.data_sources = self.config.get('data_sources', {}) or {}
     if not self.data_sources or not isinstance(self.data_sources, dict):
         raise ElmConfigError('Expected "data_sources" in config to be a '
                                'dict. Got: {}'.format(self.data_sources))
     for name, ds in self.data_sources.items():
         self._validate_one_data_source(name, ds)
Exemple #14
0
 def _validate_pipeline_transform(self, step):
     '''Validate a "train" step within config's "pipeline"'''
     transform = step.get('transform')
     if not transform in self.transform:
         raise ElmConfigError('Pipeline refers to an undefined "transform"'
                                ' key: {}'.format(repr(transform)))
     step['transform'] = transform
     return step
Exemple #15
0
 def _validate_custom_callable(self, func_or_not, required, context):
     '''Validate a callable given like "numpy:mean" can be imported'''
     if callable(func_or_not):
         return func_or_not
     if func_or_not or (not func_or_not and required):
         if not isinstance(func_or_not, string_types):
             raise ElmConfigError('In {} expected {} to be a '
                                    'string'.format(func_or_not, context))
     return import_callable(func_or_not, required=required, context=context)
Exemple #16
0
def process_str_env_var(env_var_name, expanduser=False,
                        default='', required=False, choices=None):
    '''Process a string environment variable that may be a path or have
    fixed choices'''
    val =  os.environ.get(env_var_name, default)
    if choices:
        if not val in choices:
            raise ElmConfigError('Expected env var {} to be '
                                   'in choices {} '
                                   '(go {}'
                                   ')'.format(env_var_name, choices, val))
    if required and (not val and not default):
        raise ElmConfigError('Expected env var {} to be '
                               'defined'.format(env_var_name))
    elif not val:
        val = default
    if expanduser:
        val = os.path.expanduser(val)
    return val
Exemple #17
0
 def _validate_param_grids(self):
     from elm.model_selection.evolve import get_param_grid
     self.param_grids = self.config.get('param_grids') or {}
     self._validate_type(self.param_grids, 'param_grids', dict)
     for k, v in self.param_grids.items():
         for step in self.run:
             if not step['param_grid'] in self.param_grids:
                 raise ElmConfigError('Pipeline step {} refers to '
                                      'a param_grid which is not defined '
                                      'in param_grids ({})'.format(step, step['param_grid']))
             get_param_grid(self, step)
Exemple #18
0
def process_int_env_var(env_var_name, default='0', required=False):
    '''Process an env var which must be an integer'''
    val = os.environ.get(env_var_name, default)
    try:
        val = bool(int(val))
    except Exception as e:
        if required:
            raise ElmConfigError('Expected env var {} to be parsed '
                                 'as int (got {})'.format(env_var_name, val))
        val = bool(val)
    return val
Exemple #19
0
    def _validate_one_train_entry(self, name, t):
        '''Validate one dict within "train" or "transform" section of config'''
        has_fit_func, requires_y, no_selection = self._validate_train_or_transform_funcs(name, t)
        kwargs_fields = tuple(k for k in t if k.endswith('_kwargs'))
        for k in kwargs_fields:
            self._validate_type(t[k], 'train:{}'.format(k), dict)
        mod = t.get('model_selection')

        if mod:
            t['model_selection'] = import_callable(mod)
        if t.get('sort_fitness') is not None:
            self._validate_custom_callable(t.get('sort_fitness'),
                                           True,
                                           'train:{} (sort_fitness)'.format(repr(t.get('sort_fitness'))))
        ms = t.get('model_scoring')
        if ms is not None:
            self._validate_type(ms, 'train:{} (model_scoring)'.format(name),
                                (str, numbers.Number, tuple))
            if not ms in self.model_scoring:
                raise ElmConfigError('train:{}\'s model_scoring: {} is not a '
                                     'key in config\'s model_scoring '
                                     'dict'.format(name, ms))
        self.config['train'][name] = t
Exemple #20
0
 def _validate_positive_int(self, val, context):
     '''Validate that a positive int was given'''
     if not isinstance(val, int) and val:
         raise ElmConfigError('In {} expected {} to be an int'.format(context, val))
Exemple #21
0
 def _validate_type(self, k, name, typ):
     if k and not isinstance(k, typ):
         raise ElmConfigError('In {} expected a {} but '
                                'found {}'.format(name, typ, k))