Beispiel #1
0
    def __init__(self, config, profile=False):
        """Build a pipeline from a configuration and instantiate its stages.

        Parameters
        ----------
        config : string, PISAConfigParser, or OrderedDict
            Strings and `PISAConfigParser` objects are passed through
            `parse_pipeline_config`; an `OrderedDict` is used as-is.

        profile : bool
            Stored as `self._profile`.

        Raises
        ------
        TypeError : if `config` is none of the accepted types.

        """
        needs_parsing = isinstance(config, (str, PISAConfigParser))
        if needs_parsing:
            config = parse_pipeline_config(config=config)
        elif not isinstance(config, OrderedDict):
            raise TypeError(
                "`config` passed is of type %s but must be string,"
                " PISAConfigParser, or OrderedDict" % type(config).__name__
            )

        self.pisa_version = None

        # Pipeline-level settings all live in the 'pipeline' section
        pipeline_cfg = config['pipeline']
        self.name = pipeline_cfg['name']
        self.data = ContainerSet(self.name)
        self.detector_name = pipeline_cfg['detector_name']
        self.output_binning = pipeline_cfg['output_binning']
        self.output_key = pipeline_cfg['output_key']

        # State that `_init_stages` relies on must be in place before the call
        self._profile = profile
        self._stages = []
        self._config = config
        self._init_stages()

        self._source_code_hash = None
Beispiel #2
0
class Pipeline(object):
    """Instantiate stages according to a parsed config object; execute
    stages.

    Parameters
    ----------
    config : string, OrderedDict, or PISAConfigParser
        If string, interpret as resource location; send to the
        `config_parser.parse_pipeline_config()` method to get a config
        OrderedDict (a `PISAConfigParser` is sent to the same method). If
        `OrderedDict`, use directly as pipeline configuration.

    profile : bool
        Perform timings

    """
    def __init__(self, config, profile=False):
        if isinstance(config, (str, PISAConfigParser)):
            config = parse_pipeline_config(config=config)
        elif not isinstance(config, OrderedDict):
            raise TypeError("`config` passed is of type %s but must be string,"
                            " PISAConfigParser, or OrderedDict" %
                            type(config).__name__)

        self.pisa_version = None

        self.name = config['pipeline']['name']
        self.data = ContainerSet(self.name)
        self.detector_name = config['pipeline']['detector_name']
        self.output_binning = config['pipeline']['output_binning']
        self.output_key = config['pipeline']['output_key']

        self._profile = profile

        self._stages = []
        self._config = config
        self._init_stages()
        self._source_code_hash = None

    def __repr__(self):
        return self.tabulate(tablefmt="presto")

    def _repr_html_(self):
        return self.tabulate(tablefmt="html")

    def tabulate(self, tablefmt="plain"):
        """Return a table (one row per stage) summarizing the pipeline.

        The "has setup" / "has compute" / "has apply" columns are True when
        the corresponding stage function is defined in the same module as the
        stage's class.

        Parameters
        ----------
        tablefmt : string
            Passed on to the `tabulate` function.

        """
        headers = [
            'stage number', 'name', 'calc_mode', 'apply_mode', 'has setup',
            'has compute', 'has apply', '# fixed params', '# free params'
        ]
        colalign = ["right"] + ["center"] * (len(headers) - 1)
        table = []
        for i, s in enumerate(self.stages):
            row = [i, s.__class__.__name__, s.calc_mode, s.apply_mode]
            for func in (s.setup_function, s.compute_function,
                         s.apply_function):
                row.append(
                    func.__func__.__module__ == s.__class__.__module__)
            row += [len(s.params.fixed), len(s.params.free)]
            table.append(row)
        return tabulate(table, headers, tablefmt=tablefmt, colalign=colalign)

    def report_profile(self, detailed=False):
        """Call `report_profile(detailed=detailed)` on every stage."""
        for stage in self.stages:
            stage.report_profile(detailed=detailed)

    @property
    def profile(self):
        """Profiling flag; setting it also sets `profile` on every stage."""
        return self._profile

    @profile.setter
    def profile(self, value):
        for stage in self.stages:
            stage.profile = value
        self._profile = value

    def index(self, stage_id):
        """Return the index in the pipeline of `stage_id`.

        Parameters
        ----------
        stage_id : string or int
            Name of the stage, or stage number (0-indexed)

        Returns
        -------
        idx : integer stage number (0-indexed)

        Raises
        ------
        ValueError : if `stage_id` not in pipeline.

        """
        assert isinstance(stage_id, (int, str))
        for stage_num, stage in enumerate(self):
            if stage_id in [stage_num, stage.stage_name]:
                return stage_num
        raise ValueError('No stage "%s" found in the pipeline.' % stage_id)

    def __len__(self):
        return len(self._stages)

    def __iter__(self):
        return iter(self._stages)

    def __getitem__(self, idx):
        """Return the stage(s) selected by name, integer index, or slice.

        Raises
        ------
        ValueError : if a name is not found or `idx` is of another type.

        """
        if isinstance(idx, str):
            return self.stages[self.index(idx)]

        if isinstance(idx, (int, slice)):
            return self.stages[idx]

        raise ValueError('Cannot locate stage "%s" in pipeline. Stages'
                         " available are %s." % (idx, self.stage_names))

    def __getattr__(self, attr):
        """Allow attribute-style access to a stage by its `stage_name`.

        Only invoked when normal attribute lookup fails.

        Raises
        ------
        AttributeError : if no stage is named `attr`.

        """
        # Read `_stages` straight from the instance dict. Going through
        # `self._stages` (or iterating `self`) would re-enter `__getattr__`
        # when `_stages` is not set yet, e.g. on an object created without
        # running `__init__`, and recurse without end.
        for stage in self.__dict__.get('_stages', ()):
            if stage.stage_name == attr:
                return stage
        raise AttributeError(
            '"%s" is neither a stage in this pipeline nor an attribute/property'
            " of the `Pipeline` object." % attr)

    def _init_stages(self):
        """Stage factory: Instantiate stages specified by self.config.

        Conventions required for this to work:
            * Stage and service names must be lower-case
            * Service implementations must be found at Python path
              `pisa.stages.<stage_name>.<service_name>`; if that import
              fails, `<stage_name>.<service_name>` is tried instead
            * `service` cannot be an instantiation argument for a service

        Raises
        ------
        TypeError : if an instantiated service is not a `Stage`.

        """
        legacy_prefix = 'pi_'
        stages = []
        for name, settings in self.config.items():
            # The 'pipeline' entry holds pipeline-level settings, not a stage
            if isinstance(name, str) and name == 'pipeline':
                continue

            # 0-indexed position the stage will have in the pipeline (the
            # 'pipeline' config entry is not counted)
            stage_num = len(stages)
            # Pre-bind so the error handler below can always report them,
            # even when unpacking `name` is what fails
            stage_name = service_name = None
            try:
                stage_name, service_name = name

                # old cfgs compatibility: strip only the leading prefix, and
                # only from names that actually carry it
                if service_name.startswith(legacy_prefix):
                    new_service_name = service_name[len(legacy_prefix):]
                    logging.warning(
                        f"Old stage name `{service_name}` is automatically"
                        f" renamed to `{new_service_name}`. "
                        "Please change your config in the future!")
                    service_name = new_service_name

                logging.debug("instantiating stage %s / service %s",
                              stage_name, service_name)

                # Import service's module
                logging.trace(
                    f"Importing service module: {stage_name}.{service_name}")
                try:
                    module_path = f"pisa.stages.{stage_name}.{service_name}"
                    module = import_module(module_path)
                except Exception:
                    logging.debug(
                        f"Module {stage_name}.{service_name} not found in PISA, trying "
                        "to import from external definition.")
                    module_path = f"{stage_name}.{service_name}"
                    module = import_module(module_path)

                # Get service class from module
                service_cls = getattr(module, service_name)

                # Instantiate service
                logging.trace(
                    "initializing stage.service %s.%s with settings %s" %
                    (stage_name, service_name, settings))
                try:
                    service = service_cls(**settings, profile=self._profile)
                except Exception:
                    logging.error(
                        "Failed to instantiate stage.service %s.%s with settings %s",
                        stage_name,
                        service_name,
                        settings.keys(),
                    )
                    raise

                if not isinstance(service, Stage):
                    raise TypeError(
                        'Trying to create service "%s" for stage #%d (%s),'
                        " but object %s instantiated from class %s is not a"
                        " PISA Stage type but instead is of type %s." % (
                            service_name,
                            stage_num,
                            stage_name,
                            service,
                            service_cls,
                            type(service),
                        ))

                stages.append(service)

            except Exception:
                logging.error(
                    "Failed to initialize stage #%d (stage=%s, service=%s).",
                    stage_num,
                    stage_name,
                    service_name,
                )
                raise

        # `self.params` and `self.update_params` iterate over
        # `self._stages`, so the new stages must be registered before they
        # are used; otherwise both operate on an empty pipeline.
        self._stages = stages

        # set parameters with an identical name to the same object
        # otherwise we get inconsistent behaviour when setting repeated params
        # See Issues #566 and #648
        all_params = self.params
        self.update_params(all_params, existing_must_match=True, extend=False)

        param_selections = set()
        for service in stages:
            param_selections.update(service.param_selections)
        param_selections = sorted(param_selections)

        for stage in stages:
            stage.select_params(param_selections, error_on_missing=False)

        self.setup()

    def get_outputs(self, output_binning=None, output_key=None):
        """Run the pipeline and get its MapSet output.

        Parameters
        ----------
        output_binning : MultiDimBinning or None
            Representation to set on the pipeline's data before retrieving
            the output; `self.output_binning` is used if None.

        output_key : string, 2-tuple, or None
            Key passed to `self.data.get_mapset`; `self.output_key` is used
            if None. If a 2-tuple, the second element is passed as the
            `error` argument.

        Returns
        -------
        outputs : return value of `self.data.get_mapset`

        """
        self.run()

        if output_binning is None:
            output_binning = self.output_binning
        else:
            assert (isinstance(output_binning, MultiDimBinning))

        # Fall back to the configured key independently of the binning, so
        # that an explicitly passed key is never silently dropped and a
        # missing key never reaches `get_mapset` as None
        if output_key is None:
            output_key = self.output_key

        assert output_binning is not None

        self.data.representation = output_binning

        if isinstance(output_key, tuple):
            assert len(output_key) == 2
            outputs = self.data.get_mapset(output_key[0], error=output_key[1])
        else:
            outputs = self.data.get_mapset(output_key)

        return outputs

    def run(self):
        """Run the pipeline to compute"""
        for stage in self.stages:
            logging.debug(
                f"Working on stage {stage.stage_name}.{stage.service_name}")
            stage.run()

    def setup(self):
        """Setup (reset) all stages"""
        # All stages share one freshly created container set
        self.data = ContainerSet(self.name)
        for stage in self.stages:
            stage.data = self.data
            stage.setup()

    def update_params(self, params, existing_must_match=False, extend=False):
        """Update params for the pipeline.

        Note that any param in `params` in excess of those that already exist
        in the pipeline's stages will have no effect.

        Parameters
        ----------
        params : ParamSet
            Parameters to be updated

        existing_must_match : bool
        extend : bool

        """
        for stage in self:
            stage._param_selector.update(
                params, existing_must_match=existing_must_match, extend=extend)

    def select_params(self, selections, error_on_missing=False):
        """Select a set of alternate param values/specifications.

        Parameters
        -----------
        selections : string or iterable of strings
        error_on_missing : bool

        Raises
        ------
        KeyError if `error_on_missing` is `True` and not a single stage in
            the pipeline accepts `selections`.

        """
        successes = 0
        for stage in self:
            # A stage lacking the selections is tolerated; only the number
            # of stages that accepted them matters below
            try:
                stage.select_params(selections, error_on_missing=True)
            except KeyError:
                pass
            else:
                successes += 1

        if error_on_missing and successes == 0:
            raise KeyError("None of the stages in this pipeline has all of the"
                           " selections %s available." % (selections, ))

    @property
    def params(self):
        """pisa.core.param.ParamSet : pipeline's parameters"""
        params = ParamSet()
        for stage in self:
            params.extend(stage.params)
        return params

    @property
    def param_selections(self):
        """list of strings : param selections collected from all stages"""
        selections = set()
        for stage in self:
            selections.update(stage.param_selections)
        return sorted(selections)

    @property
    def stages(self):
        """list of Stage : stages in the pipeline"""
        return list(self)

    @property
    def stage_names(self):
        """list of strings : names of stages in the pipeline"""
        return [s.stage_name for s in self]

    @property
    def config(self):
        """Deepcopy of the OrderedDict used to instantiate the pipeline"""
        return deepcopy(self._config)

    @property
    def source_code_hash(self):
        """Hash for the source code of this object's class.

        Not meant to be perfect, but should suffice for tracking provenance of
        an object stored to disk that were produced by a Stage.

        """
        if self._source_code_hash is None:
            self._source_code_hash = hash_obj(getsource(self.__class__))
        return self._source_code_hash

    @property
    def hash(self):
        """int : Hash of the state of the pipeline. This hashes together a hash
        of the Pipeline class's source code and a hash of the state of each
        contained stage."""
        return hash_obj([self.source_code_hash] +
                        [stage.hash for stage in self])

    def __hash__(self):
        return self.hash
Beispiel #3
0
 def setup(self):
     """Setup (reset) all stages.

     A new `ContainerSet` named after the pipeline replaces `self.data`;
     each stage is pointed at that container and then has its own
     `setup()` called, in pipeline order.
     """
     container = ContainerSet(self.name)
     self.data = container
     for member in self.stages:
         member.data = container
         member.setup()
Beispiel #4
0
    def _init_stages(self):
        """Stage factory: build every service listed in `self.config`.

        Each config key is a `(stage_name, service_name)` pair and each value
        holds the keyword arguments for the service's constructor.

        Conventions required for this to work:
            * Stage and service names must be lower-case
            * Service implementations must be found at Python path
              `pisa.stages.<stage_name>.<service_name>`
            * `service` cannot be an instantiation argument for a service

        The first service fixes the pipeline flavour (`self.pisa_version`,
        "cake" or "pi"). In a "pi" pipeline all services share one
        `ContainerSet`. Every service has `setup()` called right after it is
        created. Once all services exist, the union of their param selections
        is applied to each, and the binning of consecutive stages is checked.

        Raises
        ------
        TypeError : if a service is neither a `Stage` nor a `PiStage`, or if
            cake and pi services are mixed.
        ValueError : if only one of two neighbouring stages defines binning
            at their interface, or if the two binnings are incompatible.

        """
        stages = []
        data = ContainerSet("events")
        for stage_num, (key, settings) in enumerate(self.config.items()):
            stage_name, service_name = key
            try:
                logging.debug("instantiating stage %s / service %s",
                              stage_name, service_name)

                # Import service's module
                logging.trace("Importing: pisa.stages.%s.%s", stage_name,
                              service_name)
                module_path = "pisa.stages.%s.%s" % (stage_name, service_name)
                module = import_module(module_path)

                # The service class carries the same name as its module
                service_cls = getattr(module, service_name)

                # Instantiate service
                logging.trace(
                    "initializing stage.service %s.%s with settings %s" %
                    (stage_name, service_name, settings))
                try:
                    service = service_cls(**settings)
                except Exception:
                    logging.error(
                        "Failed to instantiate stage.service %s.%s with settings %s",
                        stage_name,
                        service_name,
                        settings.keys(),
                    )
                    raise

                cake_stage = isinstance(service, Stage)
                pi_stage = isinstance(service, PiStage)

                if not cake_stage and not pi_stage:
                    raise TypeError(
                        'Trying to create service "%s" for stage #%d (%s),'
                        " but object %s instantiated from class %s is not a"
                        " PISA Stage type but instead is of type %s." % (
                            service_name,
                            stage_num,
                            stage_name,
                            service,
                            service_cls,
                            type(service),
                        ))

                # first stage can determine type of pipeline; later stages
                # must be of that same type
                if self.pisa_version is None:
                    if cake_stage:
                        self.pisa_version = "cake"
                    else:
                        self.pisa_version = "pi"
                elif self.pisa_version == "cake" and pi_stage:
                    raise TypeError("Trying to use the PISA Pi Stage in "
                                    "a PISA cake pipeline.")
                elif self.pisa_version == "pi" and cake_stage:
                    raise TypeError("Trying to use the PISA cake Stage in "
                                    "a PISA Pi pipeline.")

                # pi services all work on the same events container
                if self.pisa_version == "pi":
                    service.data = data

                # run setup on service, then add it to the pipeline
                service.setup()
                stages.append(service)

            except BaseException:
                logging.error(
                    "Failed to initialize stage #%d (stage=%s, service=%s).",
                    stage_num,
                    stage_name,
                    service_name,
                )
                raise

        # Union of the selections of all services, in sorted order
        collected = set()
        for service in stages:
            collected.update(service.param_selections)
        param_selections = sorted(collected)

        # Walk the stages together with their predecessor (None for the
        # first one); selecting params and checking the interface to the
        # previous stage stay interleaved per stage
        for previous_stage, stage in zip([None] + stages[:-1], stages):
            stage.select_params(param_selections, error_on_missing=False)
            if previous_stage is None:
                continue

            prev_has_binning = (
                getattr(previous_stage, "output_binning", None) is not None)
            this_has_binning = (
                getattr(stage, "input_binning", None) is not None)

            if this_has_binning != prev_has_binning:
                raise ValueError('hasattr(%s, "output_binning") is %s but'
                                 ' hasattr(%s, "input_binning") is %s.' % (
                                     previous_stage.stage_name,
                                     prev_has_binning,
                                     stage.stage_name,
                                     this_has_binning,
                                 ))

            # Neither side is binned: nothing to compare
            if not this_has_binning:
                continue

            if stage.input_binning.is_compat(previous_stage.output_binning):
                continue

            logging.error(
                "Stage %s output binning: %s",
                previous_stage.stage_name,
                previous_stage.output_binning,
            )
            logging.error(
                "Stage %s input binning: %s",
                stage.stage_name,
                stage.input_binning,
            )
            raise ValueError(
                "%s stage's output binning is incompatible with"
                " %s stage's input binning." %
                (previous_stage.stage_name, stage.stage_name))

        self._stages = stages