Example 1
    def inline_plugins(cls, components, start_from, log):
        plugins = components.plugins.copy()

        sub_plugins = {start_from: plugins[start_from]}
        del plugins[start_from]

        # Gather all plugins that do not rechunk and which branch out as a
        # simple tree from the input plugin.
        # We'll run these all together in one process.
        while True:
            # Scan for plugins we can inline
            for p in plugins.values():
                if (p.parallel
                        and all(d in sub_plugins for d in p.depends_on)):
                    for d in p.provides:
                        sub_plugins[d] = p
                        if d in plugins:
                            del plugins[d]
                    # Rescan
                    break
            else:
                # No more plugins we can inline
                break

        if len(set(sub_plugins.values())) == 1:
            # Just one plugin to inline: no use
            log.debug("Just one plugin to inline: skipping")
            return components

        # Which data types should we output? Three cases follow.
        outputs_to_send = set()

        # Case 1. Requested as a final target
        for p in sub_plugins.values():
            outputs_to_send.update(
                set(components.targets).intersection(set(p.provides)))
        # Case 2. Requested by a plugin we did not inline
        for p in plugins.values():
            outputs_to_send.update(set(p.depends_on))
        outputs_to_send &= sub_plugins.keys()

        # Inline savers that do not require rechunking
        savers = components.savers
        sub_savers = dict()
        for p in sub_plugins.values():
            for d in p.provides:
                if d not in savers:
                    continue
                if p.can_rechunk(d):
                    # Case 3: this data type has a saver we can't inline
                    outputs_to_send.add(d)
                    continue

                remaining_savers = []
                for s_i, s in enumerate(savers[d]):
                    if not s.allow_fork:
                        # Case 3 again, cannot inline saver
                        outputs_to_send.add(d)
                        remaining_savers.append(s)
                        continue
                    if d not in sub_savers:
                        sub_savers[d] = []
                    s.is_forked = True
                    sub_savers[d].append(s)
                savers[d] = remaining_savers

                if not len(savers[d]):
                    del savers[d]

        p = cls(depends_on=sub_plugins[start_from].depends_on)
        p.sub_plugins = sub_plugins
        assert len(outputs_to_send)
        p.provides = tuple(outputs_to_send)
        p.sub_savers = sub_savers
        p.start_from = start_from
        if p.multi_output:
            p.dtype = {
                d: p.sub_plugins[d].dtype_for(d)
                for d in outputs_to_send
            }
        else:
            to_send = list(outputs_to_send)[0]
            p.dtype = p.sub_plugins[to_send].dtype_for(to_send)
        for d in p.provides:
            plugins[d] = p
        p.deps = {d: plugins[d] for d in p.depends_on}

        log.debug(f"Inlined plugins: {p.sub_plugins}."
                  f"Inlined savers: {p.sub_savers}")

        return strax.ProcessorComponents(plugins, components.loaders, savers,
                                         components.targets)
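
The gathering loop above is a small fixed-point scan: each pass inlines any plugin whose dependencies have all been inlined already, and the for-else breaks once a full pass makes no progress. Below is a minimal sketch of the same pattern, using plain dicts in place of strax plugins; all names are illustrative.

    # deps maps each node to the names it depends on.
    def gather_inlinable(deps, start_from):
        remaining = dict(deps)
        inlined = {start_from: remaining.pop(start_from)}
        while True:
            for name, needed in remaining.items():
                if all(n in inlined for n in needed):
                    inlined[name] = remaining.pop(name)
                    break  # rescan: inlining one node may unlock others
            else:
                break  # a full pass made no progress: fixed point reached
        return inlined

    # b and c branch out from a; d depends on a node outside the tree.
    print(gather_inlinable(
        {'a': [], 'b': ['a'], 'c': ['a', 'b'], 'd': ['x']}, 'a'))
    # -> {'a': [], 'b': ['a'], 'c': ['a', 'b']}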
Example 2
    def get_components(
        self, run_id: str, targets=tuple(),
        save=tuple()) -> strax.ProcessorComponents:
        """Return components for setting up a processor
        {get_docs}
        """
        save = strax.to_str_tuple(save)
        targets = strax.to_str_tuple(targets)

        plugins = self._get_plugins(targets, run_id)

        # Get savers/loaders, and meanwhile filter out plugins that do not
        # have to do computation. (Their instances will stick around
        # through the .deps attribute of plugins that do.)
        loaders = dict()
        savers = collections.defaultdict(list)
        seen = set()
        to_compute = dict()

        def check_cache(d):
            nonlocal plugins, loaders, savers, seen
            if d in seen:
                return
            seen.add(d)
            p = plugins[d]
            key = strax.CacheKey(run_id, d, p.lineage)

            for sb in self.storage:
                try:
                    loaders[d] = sb.loader(key)
                    # Found it! No need to make it or save it
                    del plugins[d]
                    return
                except strax.NotCached:
                    continue

            # Not in any cache. We will be computing it.
            to_compute[d] = p
            for dep_d in p.depends_on:
                check_cache(dep_d)

            # We're making this OR it gets fed in. Should we save it?
            if p.save_when == strax.SaveWhen.NEVER:
                if d in save:
                    raise ValueError(f"Plugin forbids saving of {d}")
                return
            elif p.save_when == strax.SaveWhen.TARGET:
                if d not in targets:
                    return
            elif p.save_when == strax.SaveWhen.EXPLICIT:
                if d not in save:
                    return
            else:
                assert p.save_when == strax.SaveWhen.ALWAYS

            for sb in self.storage:
                if not sb.provides(d, write=True):
                    continue
                s = sb.saver(key, p.metadata(run_id))
                s.meta_only = p.save_meta_only
                savers[d].append(s)

        for d in targets:
            check_cache(d)
        plugins = to_compute

        intersec = list(plugins.keys() & loaders.keys())
        if len(intersec):
            raise RuntimeError(f"{intersec} both computed and loaded?!")

        # Check all required options are available / set defaults
        for p in plugins.values():
            self._set_plugin_config(p, tolerant=False)
        return strax.ProcessorComponents(plugins=plugins,
                                         loaders=loaders,
                                         savers=dict(savers),
                                         targets=targets)
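
The save_when dispatch inside check_cache recurs, nearly verbatim, in the later versions of get_components below. Here is a sketch of that decision factored into a standalone predicate; the SaveWhen enum is redefined locally so the snippet is self-contained, and its values are assumed to mirror strax.SaveWhen.

    import enum

    class SaveWhen(enum.IntEnum):
        NEVER = 0      # saving is forbidden; requesting it is an error
        EXPLICIT = 1   # save only if explicitly requested
        TARGET = 2     # save only if it is a final target
        ALWAYS = 3     # always save

    def should_save(d, save_when, targets, save):
        if save_when == SaveWhen.NEVER:
            if d in save:
                raise ValueError(f"Plugin forbids saving of {d}")
            return False
        if save_when == SaveWhen.TARGET:
            return d in targets
        if save_when == SaveWhen.EXPLICIT:
            return d in save
        assert save_when == SaveWhen.ALWAYS
        return True

    print(should_save('peaks', SaveWhen.TARGET,
                      targets=('peaks',), save=()))  # True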
Example 3
    def get_components(
            self,
            run_id: str,
            targets=tuple(),
            save=tuple(),
            time_range=None,
    ) -> strax.ProcessorComponents:
        """Return components for setting up a processor
        {get_docs}
        """
        save = strax.to_str_tuple(save)
        targets = strax.to_str_tuple(targets)

        plugins = self._get_plugins(targets, run_id)

        n_range = None
        if time_range is not None:
            # Ensure we have one data kind
            if len({plugins[t].data_kind for t in targets}) > 1:
                raise NotImplementedError(
                    "Time range selection not implemented "
                    "for multiple data kinds.")

            # Which plugin provides time information? We need it to map to
            # row indices.
            for p in targets:
                if 'time' in plugins[p].dtype.names:
                    break
            else:
                raise RuntimeError(f"No time info in targets, should have been"
                                   f" caught earlier??")

            # Find a range of row numbers that contains the time range.
            # It may be a bit too large, since we only know the
            # n <-> time mapping at chunk granularity (from the metadata).
            if not self.is_stored(run_id, p):
                raise strax.DataNotAvailable(f"Time range selection needs time"
                                             f" info from {p}, but this data"
                                             f" is not yet available")
            meta = self.get_meta(run_id, p)
            times = np.array([c['first_time'] for c in meta['chunks']])
            # Reconstruct row numbers from row counts, which are in metadata.
            # n_end is the last row + 1 in a chunk; n_start is the first.
            n_per_chunk = np.array([c['n'] for c in meta['chunks']])
            n_end = n_per_chunk.cumsum()
            n_start = n_end - n_per_chunk
            _inds = np.searchsorted(times, time_range) - 1
            # Clip to prevent out-of-range times causing
            # negative or nonexistent indices
            _inds = np.clip(_inds, 0, len(n_end) - 1)
            n_range = n_start[_inds[0]], n_end[_inds[1]]

        # Get savers/loaders, and meanwhile filter out plugins that do not
        # have to do computation. (Their instances will stick around
        # through the .deps attribute of plugins that do.)
        loaders = dict()
        savers = collections.defaultdict(list)
        seen = set()
        to_compute = dict()

        def check_cache(d):
            nonlocal plugins, loaders, savers, seen
            if d in seen:
                return
            seen.add(d)
            p = plugins[d]
            key = strax.DataKey(run_id, d, p.lineage)

            for sf in self.storage:
                try:
                    # Bit clunky... but allows specifying executor later
                    sf.find(key, **self._find_options)
                    loaders[d] = partial(sf.loader,
                                         key,
                                         n_range=n_range,
                                         **self._find_options)
                    # Found it! No need to make it
                    del plugins[d]
                    break
                except strax.DataNotAvailable:
                    continue
            else:
                if time_range is not None:
                    # While the data type providing the time information is
                    # available (else we'd have failed earlier), one of the
                    # other requested data types is not.
                    raise strax.DataNotAvailable(
                        f"Time range selection assumes data is already "
                        f"available, but {d} for {run_id} is not.")
                if d in self.context_config['forbid_creation_of']:
                    raise strax.DataNotAvailable(
                        f"{d} for {run_id} not found in any storage, and "
                        "your context specifies it cannot be created.")
                # Not in any cache. We will be computing it.
                to_compute[d] = p
                for dep_d in p.depends_on:
                    check_cache(dep_d)

            # Should we save this data?
            if time_range is not None:
                # No, since we're not even getting the whole data.
                # Without this check, saving could be attempted if the
                # storage converter mode is enabled.
                self.log.warning(f"Not saving {d} while "
                                 f"selecting a time range in the run")
                return
            if any(len(v) > 0
                   for k, v in self._find_options.items()
                   if 'fuzzy' in k):
                # In fuzzy matching mode, we cannot (yet) derive the lineage
                # of any data we are creating. To avoid creating false
                # data entries, we currently do not save at all.
                self.log.warning(f"Not saving {d} while fuzzy matching is "
                                 f"turned on.")
                return
            if self.context_config['allow_incomplete']:
                self.log.warning(f"Not saving {d} while loading incomplete "
                                 f"data is allowed.")
                return

            if p.save_when == strax.SaveWhen.NEVER:
                if d in save:
                    raise ValueError(f"Plugin forbids saving of {d}")
                return
            elif p.save_when == strax.SaveWhen.TARGET:
                if d not in targets:
                    return
            elif p.save_when == strax.SaveWhen.EXPLICIT:
                if d not in save:
                    return
            else:
                assert p.save_when == strax.SaveWhen.ALWAYS

            for sf in self.storage:
                if sf.readonly:
                    continue
                if d not in to_compute:
                    if not self.context_config['storage_converter']:
                        continue
                    try:
                        sf.find(key, **self._find_options)
                        # Already have this data in this backend
                        continue
                    except strax.DataNotAvailable:
                        # Don't have it, so let's convert it!
                        pass
                try:
                    savers[d].append(sf.saver(key,
                                              metadata=p.metadata(run_id)))
                except strax.DataNotAvailable:
                    # This frontend cannot save. Too bad.
                    pass

        for d in targets:
            check_cache(d)
        plugins = to_compute

        intersec = list(plugins.keys() & loaders.keys())
        if len(intersec):
            raise RuntimeError(f"{intersec} both computed and loaded?!")

        # For the plugins which will run computations,
        # check all required options are available or set defaults.
        # Also run any user-defined setup
        for p in plugins.values():
            self._set_plugin_config(p, run_id, tolerant=False)
            p.setup()
        return strax.ProcessorComponents(plugins=plugins,
                                         loaders=loaders,
                                         savers=dict(savers),
                                         targets=targets)
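
The row-range computation from chunk metadata deserves a worked example. The sketch below reproduces the n_start/n_end arithmetic above on made-up chunk metadata; searchsorted(times, time_range) - 1 picks, for each end of the requested range, the chunk whose first_time lies at or before it.

    import numpy as np

    chunks = [dict(first_time=0, n=100),
              dict(first_time=50, n=200),
              dict(first_time=130, n=150)]
    times = np.array([c['first_time'] for c in chunks])  # [0, 50, 130]
    n_per_chunk = np.array([c['n'] for c in chunks])
    n_end = n_per_chunk.cumsum()                         # [100, 300, 450]
    n_start = n_end - n_per_chunk                        # [0, 100, 300]

    time_range = (60, 140)
    _inds = np.searchsorted(times, time_range) - 1       # [1, 2]
    _inds = np.clip(_inds, 0, len(n_end) - 1)
    print((n_start[_inds[0]], n_end[_inds[1]]))          # (100, 450)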
Example 4
    def get_components(self,
                       run_id: str,
                       targets=tuple(),
                       save=tuple(),
                       time_range=None,
                       chunk_number=None) -> strax.ProcessorComponents:
        """Return components for setting up a processor
        {get_docs}
        """

        save = strax.to_str_tuple(save)
        targets = strax.to_str_tuple(targets)

        # Although targets is a tuple, we only support one target at the moment
        # TODO: just make it a string!
        assert len(targets) == 1, f"Found {len(targets)} instead of 1 target"
        if len(targets[0]) == 1:
            raise ValueError(
                f"Plugin names must be more than one letter, not {targets[0]}")

        plugins = self._get_plugins(targets, run_id)
        target = targets[0]  # See above, already restricted to one target
        targetp = plugins[target]

        # Get savers/loaders, and meanwhile filter out plugins that do not
        # have to do computation. (Their instances will stick around
        # through the .deps attribute of plugins that do.)
        loaders = dict()
        savers = collections.defaultdict(list)
        seen = set()
        to_compute = dict()

        def check_cache(d):
            nonlocal plugins, loaders, savers, seen
            if d in seen:
                return
            seen.add(d)
            p = plugins[d]

            # Can we load this data?
            loading_this_data = False
            key = strax.DataKey(run_id, d, p.lineage)

            ldr = self._get_partial_loader_for(key,
                                               chunk_number=chunk_number,
                                               time_range=time_range)

            if not ldr and run_id.startswith('_'):
                if time_range is not None:
                    raise NotImplementedError("time range loading not yet "
                                              "supported for superruns")

                sub_run_spec = self.run_metadata(
                    run_id, 'sub_run_spec')['sub_run_spec']
                self.make(list(sub_run_spec.keys()), d)

                ldrs = []
                for subrun in sub_run_spec:
                    sub_key = strax.DataKey(
                        subrun, d,
                        self._get_plugins((d, ), subrun)[d].lineage)
                    if sub_run_spec[subrun] == 'all':
                        _subrun_time_range = None
                    else:
                        _subrun_time_range = sub_run_spec[subrun]
                    ldr = self._get_partial_loader_for(
                        sub_key,
                        time_range=_subrun_time_range,
                        chunk_number=chunk_number)
                    if not ldr:
                        raise RuntimeError(
                            f"Could not load {d} for subrun {subrun} "
                            f"even though we just made it?")
                    ldrs.append(ldr)

                def concat_loader(*args, **kwargs):
                    for x in ldrs:
                        yield from x(*args, **kwargs)

                ldr = concat_loader

            if ldr:
                # Found it! No need to make it or look in other frontends
                loading_this_data = True
                loaders[d] = ldr
                del plugins[d]
            else:
                # Data not found anywhere. We will be computing it.
                if (time_range is not None
                        and plugins[d].save_when != strax.SaveWhen.NEVER):
                    # While the data type providing the time information is
                    # available (else we'd have failed earlier), one of the
                    # other requested data types is not.
                    raise strax.DataNotAvailable(
                        f"Time range selection assumes data is already "
                        f"available, but {d} for {run_id} is not.")
                if '*' in self.context_config['forbid_creation_of']:
                    raise strax.DataNotAvailable(
                        f"{d} for {run_id} not found in any storage, and "
                        "your context specifies no new data can be created.")
                if d in self.context_config['forbid_creation_of']:
                    raise strax.DataNotAvailable(
                        f"{d} for {run_id} not found in any storage, and "
                        "your context specifies it cannot be created.")
                to_compute[d] = p
                for dep_d in p.depends_on:
                    check_cache(dep_d)

            # Should we save this data? If not, return.
            if (loading_this_data
                    and not self.context_config['storage_converter']):
                return
            if p.save_when == strax.SaveWhen.NEVER:
                if d in save:
                    raise ValueError(f"Plugin forbids saving of {d}")
                return
            elif p.save_when == strax.SaveWhen.TARGET:
                if d not in targets:
                    return
            elif p.save_when == strax.SaveWhen.EXPLICIT:
                if d not in save:
                    return
            else:
                assert p.save_when == strax.SaveWhen.ALWAYS

            # Warn about conditions that preclude saving, but the user
            # might not expect.
            if time_range is not None:
                # We're not even getting the whole data.
                # Without this check, saving could be attempted if the
                # storage converter mode is enabled.
                self.log.warning(f"Not saving {d} while "
                                 f"selecting a time range in the run")
                return
            if any(len(v) > 0
                   for k, v in self._find_options.items()
                   if 'fuzzy' in k):
                # In fuzzy matching mode, we cannot (yet) derive the
                # lineage of any data we are creating. To avoid creating
                # false data entries, we currently do not save at all.
                self.log.warning(f"Not saving {d} while fuzzy matching is"
                                 f" turned on.")
                return
            if self.context_config['allow_incomplete']:
                self.log.warning(f"Not saving {d} while loading incomplete"
                                 f" data is allowed.")
                return

            # Save the target and any other outputs of the plugin.
            for d_to_save in set([d] + list(p.provides)):
                if d_to_save in savers and len(savers[d_to_save]):
                    # This multi-output plugin was scanned before;
                    # let's not create duplicate savers.
                    assert p.multi_output
                    continue

                key = strax.DataKey(run_id, d_to_save, p.lineage)

                for sf in self.storage:
                    if sf.readonly:
                        continue
                    if loading_this_data:
                        # Usually, we don't save if we're loading
                        if not self.context_config['storage_converter']:
                            continue
                        # ... but in storage converter mode we do:
                        try:
                            sf.find(key, **self._find_options)
                            # Already have this data in this backend
                            continue
                        except strax.DataNotAvailable:
                            # Don't have it, so let's save it!
                            pass
                    # If we get here, we must try to save
                    try:
                        savers[d_to_save].append(
                            sf.saver(key,
                                     metadata=p.metadata(run_id, d_to_save)))
                    except strax.DataNotAvailable:
                        # This frontend cannot save. Too bad.
                        pass

        for d in targets:
            check_cache(d)
        plugins = to_compute

        intersec = list(plugins.keys() & loaders.keys())
        if len(intersec):
            raise RuntimeError(f"{intersec} both computed and loaded?!")

        # For the plugins which will run computations,
        # check all required options are available or set defaults.
        # Also run any user-defined setup
        for p in plugins.values():
            self._set_plugin_config(p, run_id, tolerant=False)
            p.setup()
        return strax.ProcessorComponents(plugins=plugins,
                                         loaders=loaders,
                                         savers=dict(savers),
                                         targets=targets)
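
The superrun branch above stitches per-subrun loaders into a single generator. Here is a minimal sketch of that chaining pattern, with toy callables standing in for the storage-frontend loaders (all names illustrative):

    def make_concat_loader(loaders):
        # Each loader is a generator function; the combined loader
        # yields every subrun's chunks in order.
        def concat_loader(*args, **kwargs):
            for ldr in loaders:
                yield from ldr(*args, **kwargs)
        return concat_loader

    parts = [lambda: iter([1, 2]), lambda: iter([3])]
    print(list(make_concat_loader(parts)()))  # [1, 2, 3]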