def __init__(self, config, profile=False):
    """Build the pipeline from a configuration and instantiate its stages.

    Parameters
    ----------
    config : string, PISAConfigParser, or OrderedDict
        A string or PISAConfigParser is handed to `parse_pipeline_config`;
        an OrderedDict is used as-is. Its 'pipeline' entry must provide the
        keys 'name', 'detector_name', 'output_binning' and 'output_key'.

    profile : bool
        Stored as the pipeline's profiling flag.

    Raises
    ------
    TypeError
        If `config` is none of the accepted types.

    """
    needs_parsing = isinstance(config, (str, PISAConfigParser))
    if needs_parsing:
        config = parse_pipeline_config(config=config)
    elif not isinstance(config, OrderedDict):
        type_name = type(config).__name__
        raise TypeError(
            "`config` passed is of type %s but must be string,"
            " PISAConfigParser, or OrderedDict" % type_name
        )

    self.pisa_version = None

    # Pipeline-wide settings are read from the 'pipeline' entry.
    self.name = config['pipeline']['name']
    self.data = ContainerSet(self.name)
    self.detector_name = config['pipeline']['detector_name']
    self.output_binning = config['pipeline']['output_binning']
    self.output_key = config['pipeline']['output_key']

    self._profile = profile

    # Both attributes have to be in place before the stage factory runs.
    self._stages = []
    self._config = config
    self._init_stages()

    # Computed lazily on first request.
    self._source_code_hash = None
class Pipeline(object):
    """Instantiate stages according to a parsed config object; execute stages.

    Parameters
    ----------
    config : string, OrderedDict, or PISAConfigParser
        If string, interpret as resource location; send to the
        `config_parser.parse_pipeline_config()` method to get a config
        OrderedDict. If `OrderedDict`, use directly as pipeline configuration.

    profile : bool
        Perform timings

    Raises
    ------
    TypeError
        If `config` is not a string, PISAConfigParser, or OrderedDict.

    """

    def __init__(self, config, profile=False):
        if isinstance(config, (str, PISAConfigParser)):
            config = parse_pipeline_config(config=config)
        elif isinstance(config, OrderedDict):
            pass
        else:
            raise TypeError(
                "`config` passed is of type %s but must be string,"
                " PISAConfigParser, or OrderedDict" % type(config).__name__
            )

        self.pisa_version = None

        # Pipeline-wide settings live under the 'pipeline' key of the config.
        self.name = config['pipeline']['name']
        self.data = ContainerSet(self.name)
        self.detector_name = config['pipeline']['detector_name']
        self.output_binning = config['pipeline']['output_binning']
        self.output_key = config['pipeline']['output_key']

        self._profile = profile

        # `_stages` and `_config` must exist before `_init_stages` runs: the
        # factory reads `self.config` and the container protocol methods
        # (`__iter__`, `__len__`, ...) read `self._stages`.
        self._stages = []
        self._config = config
        self._init_stages()

        self._source_code_hash = None

    def __repr__(self):
        return self.tabulate(tablefmt="presto")

    def _repr_html_(self):
        return self.tabulate(tablefmt="html")

    def tabulate(self, tablefmt="plain"):
        """Summarize the stages of the pipeline as a table.

        Parameters
        ----------
        tablefmt : string
            Passed through as the `tablefmt` argument of the `tabulate`
            function.

        Returns
        -------
        The table as rendered by the `tabulate` function, one row per stage.

        """
        headers = [
            'stage number',
            'name',
            'calc_mode',
            'apply_mode',
            'has setup',
            'has compute',
            'has apply',
            '# fixed params',
            '# free params',
        ]
        colalign = ["right"] + ["center"] * (len(headers) - 1)

        table = []
        for i, s in enumerate(self.stages):
            stage_module = s.__class__.__module__
            row = [i, s.__class__.__name__, s.calc_mode, s.apply_mode]
            # A "has ..." column is True when the corresponding function is
            # defined in the same module as the stage's own class.
            row.append(s.setup_function.__func__.__module__ == stage_module)
            row.append(s.compute_function.__func__.__module__ == stage_module)
            row.append(s.apply_function.__func__.__module__ == stage_module)
            row += [len(s.params.fixed), len(s.params.free)]
            table.append(row)

        return tabulate(table, headers, tablefmt=tablefmt, colalign=colalign)

    def report_profile(self, detailed=False):
        """Call `report_profile` on every stage, forwarding `detailed`."""
        for stage in self.stages:
            stage.report_profile(detailed=detailed)

    @property
    def profile(self):
        """bool : profiling flag; setting it also sets it on every stage"""
        return self._profile

    @profile.setter
    def profile(self, value):
        for stage in self.stages:
            stage.profile = value
        self._profile = value

    def index(self, stage_id):
        """Return the index in the pipeline of `stage_id`.

        Parameters
        ----------
        stage_id : string or int
            Name of the stage, or stage number (0-indexed)

        Returns
        -------
        idx : integer stage number (0-indexed)

        Raises
        ------
        ValueError : if `stage_id` not in pipeline.

        """
        assert isinstance(stage_id, (int, str))
        for stage_num, stage in enumerate(self):
            if stage_id in [stage_num, stage.stage_name]:
                return stage_num
        raise ValueError('No stage "%s" found in the pipeline.' % stage_id)

    def __len__(self):
        return len(self._stages)

    def __iter__(self):
        return iter(self._stages)

    def __getitem__(self, idx):
        """Return a stage by name (string) or by position (int or slice).

        Raises
        ------
        ValueError
            If `idx` is a string that names no stage, or is not a string,
            int, or slice.

        """
        if isinstance(idx, str):
            return self.stages[self.index(idx)]

        if isinstance(idx, (int, slice)):
            return self.stages[idx]

        raise ValueError('Cannot locate stage "%s" in pipeline. Stages'
                         " available are %s." % (idx, self.stage_names))

    def __getattr__(self, attr):
        """Expose each stage as an attribute named after its `stage_name`.

        Only invoked when normal attribute lookup has already failed.

        Raises
        ------
        AttributeError
            If no stage is called `attr`.

        """
        # Read `_stages` straight from the instance dict. Going through
        # `self._stages` (or iterating `self`) would re-enter `__getattr__`
        # when `_stages` has not been set yet -- e.g. on an instance created
        # via `__new__` while copying or unpickling -- and recurse until
        # RecursionError instead of raising AttributeError.
        for stage in self.__dict__.get("_stages", ()):
            if stage.stage_name == attr:
                return stage
        raise AttributeError(
            '"%s" is neither a stage in this pipeline nor an attribute/property'
            " of the `Pipeline` object." % attr
        )

    def _init_stages(self):
        """Stage factory: Instantiate stages specified by self.config.

        Conventions required for this to work:
            * Stage and service names must be lower-case
            * Service implementations must be found at Python path
              `pisa.stages.<stage_name>.<service_name>`
            * `service` cannot be an instantiation argument for a service

        The 'pipeline' entry of the config is skipped; every other key is
        unpacked as a `(stage_name, service_name)` pair. If the module cannot
        be imported from `pisa.stages`, `<stage_name>.<service_name>` is tried
        as an external module. A legacy `pi_` prefix on the service name is
        stripped with a warning.

        Raises
        ------
        TypeError
            If the instantiated service is not a `Stage`.

        """
        legacy_prefix = 'pi_'
        stages = []

        for stage_num, item in enumerate(self.config.items()):
            # Pre-bind so the error handler below can always report them,
            # even when unpacking the config key itself is what failed.
            stage_name = service_name = None
            try:
                name, settings = item

                if isinstance(name, str) and name == 'pipeline':
                    continue

                stage_name, service_name = name

                # old cfgs compatibility: strip only the leading prefix, a
                # plain `str.replace` would also eat later occurrences
                if service_name.startswith(legacy_prefix):
                    new_service_name = service_name[len(legacy_prefix):]
                    logging.warning(
                        f"Old stage name `{service_name}` is automatically renamed to `{new_service_name}`. " +
                        "Please change your config in the future!")
                    service_name = new_service_name

                logging.debug("instantiating stage %s / service %s",
                              stage_name, service_name)

                # Import service's module
                logging.trace(
                    f"Importing service module: {stage_name}.{service_name}")
                try:
                    module_path = f"pisa.stages.{stage_name}.{service_name}"
                    module = import_module(module_path)
                except ImportError:
                    # Only a failed import triggers the fallback; any other
                    # error raised while importing a PISA module is a real
                    # failure and must not be hidden behind a second import.
                    logging.debug(
                        f"Module {stage_name}.{service_name} not found in PISA, trying "
                        "to import from external definition.")
                    module_path = f"{stage_name}.{service_name}"
                    module = import_module(module_path)

                # Get service class from module
                service_cls = getattr(module, service_name)

                # Instantiate service
                logging.trace(
                    "initializing stage.service %s.%s with settings %s"
                    % (stage_name, service_name, settings))
                try:
                    service = service_cls(**settings, profile=self._profile)
                except Exception:
                    logging.error(
                        "Failed to instantiate stage.service %s.%s with settings %s",
                        stage_name,
                        service_name,
                        settings.keys(),
                    )
                    raise

                if not isinstance(service, Stage):
                    raise TypeError(
                        'Trying to create service "%s" for stage #%d (%s),'
                        " but object %s instantiated from class %s is not a"
                        " PISA Stage type but instead is of type %s."
                        % (
                            service_name,
                            stage_num,
                            stage_name,
                            service,
                            service_cls,
                            type(service),
                        ))

                stages.append(service)

            except Exception:
                logging.error(
                    "Failed to initialize stage #%d (stage=%s, service=%s).",
                    stage_num,
                    stage_name,
                    service_name,
                )
                raise

        # Publish the new stages *before* unifying parameters: `self.params`
        # and `self.update_params` iterate over `self._stages`, so running
        # them while it still holds the previous (initially empty) list
        # silently does nothing.
        self._stages = stages

        # set parameters with an identical name to the same object
        # otherwise we get inconsistent behaviour when setting repeated params
        # See Issues #566 and #648
        all_params = self.params
        self.update_params(all_params, existing_must_match=True, extend=False)

        # Put every stage on the union of all stages' param selections.
        param_selections = self.param_selections
        for stage in stages:
            stage.select_params(param_selections, error_on_missing=False)

        self.setup()

    def get_outputs(self, output_binning=None, output_key=None):
        """Run the pipeline and get the MapSet output.

        Parameters
        ----------
        output_binning : MultiDimBinning or None
            Representation to set on the pipeline's data before retrieving
            the maps; `None` selects `self.output_binning`.

        output_key : string, tuple of length 2, or None
            Key handed to `self.data.get_mapset`; a tuple is split into the
            key and the `error` argument. `None` selects `self.output_key`.

        Returns
        -------
        outputs : result of `self.data.get_mapset`

        """
        self.run()

        # The two arguments default independently; previously an explicit
        # `output_key` was discarded whenever `output_binning` was omitted,
        # and `None` was used as key when only a binning was given.
        if output_binning is None:
            output_binning = self.output_binning
        else:
            assert isinstance(output_binning, MultiDimBinning)
        if output_key is None:
            output_key = self.output_key

        assert output_binning is not None

        self.data.representation = output_binning

        if isinstance(output_key, tuple):
            assert len(output_key) == 2
            outputs = self.data.get_mapset(output_key[0], error=output_key[1])
        else:
            outputs = self.data.get_mapset(output_key)

        return outputs

    def run(self):
        """Run the pipeline to compute"""
        for stage in self.stages:
            logging.debug(
                f"Working on stage {stage.stage_name}.{stage.service_name}")
            stage.run()

    def setup(self):
        """Setup (reset) all stages"""
        # A fresh container replaces the old one and is shared by all stages.
        self.data = ContainerSet(self.name)
        for stage in self.stages:
            stage.data = self.data
            stage.setup()

    def update_params(self, params, existing_must_match=False, extend=False):
        """Update params for the pipeline.

        Note that any param in `params` in excess of those that already exist
        in the pipeline's stages will have no effect.

        Parameters
        ----------
        params : ParamSet
            Parameters to be updated

        existing_must_match : bool
        extend : bool

        """
        for stage in self:
            stage._param_selector.update(
                params, existing_must_match=existing_must_match, extend=extend)

    def select_params(self, selections, error_on_missing=False):
        """Select a set of alternate param values/specifications.

        Parameters
        -----------
        selections : string or iterable of strings
        error_on_missing : bool

        Raises
        ------
        KeyError if `error_on_missing` is `True` and any of `selections` does
            not exist in any stage in the pipeline.

        """
        successes = 0
        for stage in self:
            # Each stage is asked strictly; a stage lacking the selections is
            # skipped, and only the overall outcome is reported below.
            try:
                stage.select_params(selections, error_on_missing=True)
            except KeyError:
                pass
            else:
                successes += 1

        if error_on_missing and successes == 0:
            raise KeyError("None of the stages in this pipeline has all of the"
                           " selections %s available." % (selections, ))

    @property
    def params(self):
        """pisa.core.param.ParamSet : pipeline's parameters"""
        params = ParamSet()
        for stage in self:
            params.extend(stage.params)
        return params

    @property
    def param_selections(self):
        """list of strings : param selections collected from all stages"""
        selections = set()
        for stage in self:
            selections.update(stage.param_selections)
        return sorted(selections)

    @property
    def stages(self):
        """list of Stage : stages in the pipeline"""
        return list(self)

    @property
    def stage_names(self):
        """list of strings : names of stages in the pipeline"""
        return [s.stage_name for s in self]

    @property
    def config(self):
        """Deepcopy of the OrderedDict used to instantiate the pipeline"""
        return deepcopy(self._config)

    @property
    def source_code_hash(self):
        """Hash for the source code of this object's class.

        Not meant to be perfect, but should suffice for tracking provenance of
        an object stored to disk that were produced by a Stage.

        """
        if self._source_code_hash is None:
            self._source_code_hash = hash_obj(getsource(self.__class__))
        return self._source_code_hash

    @property
    def hash(self):
        """int : Hash of the state of the pipeline. This hashes together a hash
        of the Pipeline class's source code and a hash of the state of each
        contained stage."""
        return hash_obj([self.source_code_hash] + [stage.hash for stage in self])

    def __hash__(self):
        return self.hash
def setup(self):
    """Reset the pipeline's data and run `setup` on every stage.

    A new `ContainerSet` named after the pipeline replaces `self.data`; each
    stage in `self.stages` is then pointed at it and set up, in order.
    """
    # Start over with a new container.
    self.data = ContainerSet(self.name)

    for member in self.stages:
        # Hand the stage the pipeline's container before it sets itself up.
        member.data = self.data
        member.setup()
def _init_stages(self):
    """Stage factory: Instantiate stages specified by self.config.

    Conventions required for this to work:
        * Stage and service names must be lower-case
        * Service implementations must be found at Python path
          `pisa.stages.<stage_name>.<service_name>`
        * `service` cannot be an instantiation argument for a service

    Each key of `self.config` is unpacked as `(stage_name, service_name)` and
    its value is passed as keyword arguments to the service class. The first
    service fixes `self.pisa_version` ("cake" or "pi"). Services of a "pi"
    pipeline are given one shared `ContainerSet`. Every service has `setup`
    called on it as soon as it is created. Afterwards all stages are put on
    the union of their param selections and the binning of consecutive
    stages is checked.

    Raises
    ------
    TypeError
        If a service is neither a `Stage` nor a `PiStage`, or if cake and pi
        stages are mixed in one pipeline.

    ValueError
        If exactly one of a stage's input binning and the preceding stage's
        output binning is set, or if the two are not compatible.

    """
    built = []
    shared_data = ContainerSet("events")

    for stage_num, entry in enumerate(self.config.items()):
        (stage_name, service_name), settings = entry
        try:
            logging.debug("instantiating stage %s / service %s",
                          stage_name, service_name)

            # Locate the module that implements the service
            logging.trace("Importing: pisa.stages.%s.%s",
                          stage_name, service_name)
            module_path = "pisa.stages.%s.%s" % (stage_name, service_name)
            module = import_module(module_path)

            # The service class carries the same name as its module
            service_cls = getattr(module, service_name)

            # Create the service from its settings
            logging.trace(
                "initializing stage.service %s.%s with settings %s"
                % (stage_name, service_name, settings))
            try:
                service = service_cls(**settings)
            except Exception:
                logging.error(
                    "Failed to instantiate stage.service %s.%s with settings %s",
                    stage_name,
                    service_name,
                    settings.keys(),
                )
                raise

            is_cake = isinstance(service, Stage)
            is_pi = isinstance(service, PiStage)
            if not is_cake and not is_pi:
                details = (
                    service_name,
                    stage_num,
                    stage_name,
                    service,
                    service_cls,
                    type(service),
                )
                raise TypeError(
                    'Trying to create service "%s" for stage #%d (%s),'
                    " but object %s instantiated from class %s is not a"
                    " PISA Stage type but instead is of type %s." % details)

            # The first stage decides the pipeline flavour; later stages
            # have to agree with it.
            flavour = self.pisa_version
            if flavour is None:
                self.pisa_version = "cake" if is_cake else "pi"
            elif flavour == "cake" and is_pi:
                raise TypeError("Trying to use the PISA Pi Stage in "
                                "a PISA cake pipeline.")
            elif flavour == "pi" and is_cake:
                raise TypeError("Trying to use the PISA cake Stage in "
                                "a PISA Pi pipeline.")

            # Pi services all work on the same events container
            if self.pisa_version == "pi":
                service.data = shared_data

            service.setup()
            built.append(service)

        except BaseException:
            logging.error(
                "Failed to initialize stage #%d (stage=%s, service=%s).",
                stage_num,
                stage_name,
                service_name,
            )
            raise

    # Union of the selections known to any stage, in sorted order
    param_selections = sorted(
        {selection for svc in built for selection in svc.param_selections})

    upstream = None
    for stage in built:
        stage.select_params(param_selections, error_on_missing=False)

        if upstream is not None:
            upstream_binned = (hasattr(upstream, "output_binning")
                               and upstream.output_binning is not None)
            stage_binned = (hasattr(stage, "input_binning")
                            and stage.input_binning is not None)

            # Either both sides of the hand-over are binned or neither is.
            if stage_binned != upstream_binned:
                raise ValueError('hasattr(%s, "output_binning") is %s but'
                                 ' hasattr(%s, "input_binning") is %s.' % (
                                     upstream.stage_name,
                                     upstream_binned,
                                     stage.stage_name,
                                     stage_binned,
                                 ))

            if stage_binned and not stage.input_binning.is_compat(
                    upstream.output_binning):
                logging.error(
                    "Stage %s output binning: %s",
                    upstream.stage_name,
                    upstream.output_binning,
                )
                logging.error(
                    "Stage %s input binning: %s",
                    stage.stage_name,
                    stage.input_binning,
                )
                raise ValueError(
                    "%s stage's output binning is incompatible with"
                    " %s stage's input binning."
                    % (upstream.stage_name, stage.stage_name))

        upstream = stage

    self._stages = built