Example #1
    def setUp(self):
        self.id = 0
        self.factory = FormatterFactory()
        self.universe = DimensionUniverse()
        self.dataId = DataCoordinate.makeEmpty(self.universe)

        # Dummy FileDescriptor for testing getFormatter
        self.fileDescriptor = FileDescriptor(
            Location("/a/b/c", "d"),
            StorageClass("DummyStorageClass", dict, None))
Example #2
    def makeGraph(self, pipeline, collections, run, userQuery):
        """Create execution graph for a pipeline.

        Parameters
        ----------
        pipeline : `Pipeline`
            Pipeline definition, task names/classes and their configs.
        collections
            Expressions representing the collections to search for input
            datasets.  May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        run : `str`, optional
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
            output datasets, if it already exists.
        userQuery : `str`
            String with a user-defined selection expression for the registry;
            should be empty or `None` if there are no restrictions on data
            selection.

        Returns
        -------
        graph : `QuantumGraph`
            The constructed execution graph.

        Raises
        ------
        UserExpressionError
            Raised when user expression cannot be parsed.
        OutputExistsError
            Raised when output datasets already exist.
        Exception
            Other exception types may be raised by underlying registry
            classes.
        """
        scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)

        instrument = pipeline.getInstrument()
        if isinstance(instrument, str):
            instrument = doImport(instrument)
        if instrument is not None:
            dataId = DataCoordinate.standardize(
                instrument=instrument.getName(),
                universe=self.registry.dimensions)
        else:
            dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
        with scaffolding.connectDataIds(self.registry, collections, userQuery,
                                        dataId) as commonDataIds:
            scaffolding.resolveDatasetRefs(self.registry,
                                           collections,
                                           run,
                                           commonDataIds,
                                           skipExisting=self.skipExisting)
        return scaffolding.makeQuantumGraph()
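
A hedged usage sketch of this method: the `GraphBuilder` and `Pipeline.fromFile` entry points are assumed from the surrounding `lsst.pipe.base` API, and the repository path, pipeline file, collection names, run name and query string are hypothetical placeholders.

from lsst.daf.butler import Butler
from lsst.pipe.base import GraphBuilder, Pipeline

butler = Butler("/repo/example")               # hypothetical repository
pipeline = Pipeline.fromFile("pipeline.yaml")  # hypothetical pipeline definition
builder = GraphBuilder(butler.registry, skipExisting=True)
qgraph = builder.makeGraph(
    pipeline,
    collections=["some/input/collection"],     # hypothetical input collections
    run="u/someone/output-run",                # hypothetical output RUN collection
    userQuery="visit = 12345",                 # or None for no restriction
)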
Example #3
 def testStandardize(self):
     """Test constructing a DataCoordinate from many different kinds of
     input via `DataCoordinate.standardize` and `DataCoordinate.subset`.
     """
     for n in range(5):
         dimensions = self.randomDimensionSubset()
         dataIds = self.randomDataIds(n=1).subset(dimensions)
         split = self.splitByStateFlags(dataIds)
         for m, dataId in enumerate(split.chain()):
             # Passing in any kind of DataCoordinate alone just returns
             # that object.
             self.assertIs(dataId, DataCoordinate.standardize(dataId))
             # Same if we also explicitly pass the dimensions we want.
             self.assertIs(dataId, DataCoordinate.standardize(dataId, graph=dataId.graph))
             # Same if we pass the dimensions and some irrelevant
             # kwargs.
             self.assertIs(dataId, DataCoordinate.standardize(dataId, graph=dataId.graph, htm7=12))
             # Test constructing a new data ID from this one with a
             # subset of the dimensions.
             # This is not possible for some combinations of
             # dimensions if hasFull is False (see
             # `DataCoordinate.subset` docs).
             newDimensions = self.randomDimensionSubset(n=1, graph=dataId.graph)
             if dataId.hasFull() or dataId.graph.required.issuperset(newDimensions.required):
                 newDataIds = [
                     dataId.subset(newDimensions),
                     DataCoordinate.standardize(dataId, graph=newDimensions),
                     DataCoordinate.standardize(dataId, graph=newDimensions, htm7=12),
                 ]
                 for newDataId in newDataIds:
                     with self.subTest(newDataId=newDataId, type=type(dataId)):
                         commonKeys = dataId.keys() & newDataId.keys()
                         self.assertTrue(commonKeys)
                         self.assertEqual(
                             [newDataId[k] for k in commonKeys],
                             [dataId[k] for k in commonKeys],
                         )
                         # This should never "downgrade" from
                         # Complete to Minimal or Expanded to Complete.
                         if dataId.hasRecords():
                             self.assertTrue(newDataId.hasRecords())
                         if dataId.hasFull():
                             self.assertTrue(newDataId.hasFull())
         # Start from a complete data ID, and pass its values in via several
         # different ways that should be equivalent.
         for dataId in split.complete:
             # Split the keys (dimension names) into two random subsets, so
             # we can pass some as kwargs below.
             keys1 = set(self.rng.sample(list(dataId.graph.dimensions.names),
                                         len(dataId.graph.dimensions)//2))
             keys2 = dataId.graph.dimensions.names - keys1
             newCompleteDataIds = [
                 DataCoordinate.standardize(dataId.full.byName(), universe=dataId.universe),
                 DataCoordinate.standardize(dataId.full.byName(), graph=dataId.graph),
                 DataCoordinate.standardize(DataCoordinate.makeEmpty(dataId.graph.universe),
                                            **dataId.full.byName()),
                 DataCoordinate.standardize(DataCoordinate.makeEmpty(dataId.graph.universe),
                                            graph=dataId.graph, **dataId.full.byName()),
                 DataCoordinate.standardize(**dataId.full.byName(), universe=dataId.universe),
                 DataCoordinate.standardize(graph=dataId.graph, **dataId.full.byName()),
                 DataCoordinate.standardize(
                     {k: dataId[k] for k in keys1},
                     universe=dataId.universe,
                     **{k: dataId[k] for k in keys2}
                 ),
                 DataCoordinate.standardize(
                     {k: dataId[k] for k in keys1},
                     graph=dataId.graph,
                     **{k: dataId[k] for k in keys2}
                 ),
             ]
             for newDataId in newCompleteDataIds:
                 with self.subTest(dataId=dataId, newDataId=newDataId, type=type(dataId)):
                     self.assertEqual(dataId, newDataId)
                     self.assertTrue(newDataId.hasFull())
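
To summarize the equivalences exercised above, a condensed sketch using the default dimension universe (the instrument and detector values are hypothetical):

from lsst.daf.butler import DataCoordinate, DimensionUniverse

universe = DimensionUniverse()
a = DataCoordinate.standardize(instrument="DummyCam", detector=1, universe=universe)
b = DataCoordinate.standardize({"instrument": "DummyCam"}, detector=1, universe=universe)
assert a == b
assert a.hasFull()
# Passing an existing DataCoordinate back through standardize is a no-op.
assert DataCoordinate.standardize(a) is a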
Example #4
 def setUp(self):
     config = Config(
         {
             "version": 1,
             "namespace": "pipe_base_test",
             "skypix": {
                 "common": "htm7",
                 "htm": {
                     "class": "lsst.sphgeom.HtmPixelization",
                     "max_level": 24,
                 },
             },
             "elements": {
                 "A": {
                     "keys": [
                         {
                             "name": "id",
                             "type": "int",
                         }
                     ],
                     "storage": {
                         "cls": "lsst.daf.butler.registry.dimensions.table.TableDimensionRecordStorage",
                     },
                 },
                 "B": {
                     "keys": [
                         {
                             "name": "id",
                             "type": "int",
                         }
                     ],
                     "storage": {
                         "cls": "lsst.daf.butler.registry.dimensions.table.TableDimensionRecordStorage",
                     },
                 },
             },
             "packers": {},
         }
     )
     universe = DimensionUniverse(config=config)
     # Need to build a mapping of TaskDef to its set of quanta.
     quantumMap = {}
     tasks = []
     for task, label in (
         (Dummy1PipelineTask, "R"),
         (Dummy2PipelineTask, "S"),
         (Dummy3PipelineTask, "T"),
         (Dummy4PipelineTask, "U"),
     ):
         config = task.ConfigClass()
         taskDef = TaskDef(get_full_type_name(task), config, task, label)
         tasks.append(taskDef)
         quantumSet = set()
         connections = taskDef.connections
         for a, b in ((1, 2), (3, 4)):
             if connections.initInputs:
                 initInputDSType = DatasetType(
                     connections.initInput.name,
                     tuple(),
                     storageClass=connections.initInput.storageClass,
                     universe=universe,
                 )
                 initRefs = [DatasetRef(initInputDSType, DataCoordinate.makeEmpty(universe))]
             else:
                 initRefs = None
             inputDSType = DatasetType(
                 connections.input.name,
                 connections.input.dimensions,
                 storageClass=connections.input.storageClass,
                 universe=universe,
             )
             inputRefs = [
                 DatasetRef(inputDSType, DataCoordinate.standardize({"A": a, "B": b}, universe=universe))
             ]
             outputDSType = DatasetType(
                 connections.output.name,
                 connections.output.dimensions,
                 storageClass=connections.output.storageClass,
                 universe=universe,
             )
             outputRefs = [
                 DatasetRef(outputDSType, DataCoordinate.standardize({"A": a, "B": b}, universe=universe))
             ]
             quantumSet.add(
                 Quantum(
                     taskName=task.__qualname__,
                     dataId=DataCoordinate.standardize({"A": a, "B": b}, universe=universe),
                     taskClass=task,
                     initInputs=initRefs,
                     inputs={inputDSType: inputRefs},
                     outputs={outputDSType: outputRefs},
                 )
             )
         quantumMap[taskDef] = quantumSet
     self.tasks = tasks
     self.quantumMap = quantumMap
     self.qGraph = QuantumGraph(quantumMap, metadata=METADATA)
     self.universe = universe
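
A brief, hedged check of the resulting fixture, to be read as running inside a test method of this class: iterating over a `QuantumGraph` yields `QuantumNode` objects whose `.quantum` attribute holds the underlying `Quantum` (Example 5 below relies on exactly that); the use of `len()` on the graph is an assumption.

# Four tasks, each with quanta for (A=1, B=2) and (A=3, B=4).
assert len(self.qGraph) == 8
for node in self.qGraph:
    # Every node carries one of the Quantum objects built in the loop above.
    assert node.quantum.dataId.hasFull()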
Example #5
def _accumulate(
    graph: QuantumGraph,
    dataset_types: PipelineDatasetTypes,
) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # accumulate the DatasetRefs that will be transferred to the execution
    # registry

    # exports holds all the existing data that will be migrated to the
    # execution butler
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    # It is possible to end up with a graph that has different storage
    # classes attached to the same dataset type name. This is okay, but we
    # must ensure that only a single dataset type definition is accumulated
    # in the loop below.  This data structure caches every dataset type
    # encountered and stores the compatible alternative.
    datasetTypes: dict[Union[str, DatasetType], DatasetType] = {}

    # Add inserts for initOutputs (including initIntermediates); these are
    # defined fully by their DatasetType, because they have no dimensions, and
    # they are by definition not resolved.  initInputs are part of Quantum and
    # that's the only place the graph stores the dataset IDs, so we process
    # them there even though each Quantum for a task has the same ones.
    for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
        dataset_type = _validate_dataset_type(dataset_type, datasetTypes)
        inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))

    n: QuantumNode
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)

            for type, refs in attr.items():
                # This if block exists because initInputs maps each dataset
                # type to a single ref rather than a list.
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references: if a ref has an id it
                # already exists and should be exported; if not, it should
                # be inserted into the new registry.
                for ref in refs:
                    if ref.id is not None:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component will
                            # be part of some other upstream dataset, so it
                            # should be safe to skip them here
                            continue
                        type = _validate_dataset_type(type, datasetTypes)
                        inserts[type].add(ref.dataId)
    return exports, inserts
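
A hedged sketch of how the two returned collections might be consumed when populating an execution registry: `registerDatasetType` and `insertDatasets` are existing `Registry` methods, but `new_butler` and the run name are hypothetical, and the export/transfer of the already-resolved refs is elided.

exports, inserts = _accumulate(graph, dataset_types)
# `exports` holds already-resolved refs to transfer from the source repository;
# `inserts` holds the to-be-produced datasets to pre-register.
for datasetType, dataIds in inserts.items():
    new_butler.registry.registerDatasetType(datasetType)
    new_butler.registry.insertDatasets(datasetType, dataIds, run="hypothetical/run")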
Example #6
    def connectDataIds(self, registry, collections, userQuery, externalDataId):
        """Query for the data IDs that connect nodes in the `QuantumGraph`.

        This method populates `_TaskScaffolding.dataIds` and
        `_DatasetScaffolding.dataIds` (except for those in `prerequisites`).

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry for the data repository; used for all data ID queries.
        collections
            Expressions representing the collections to search for input
            datasets.  May be any of the types accepted by
            `lsst.daf.butler.CollectionSearch.fromExpression`.
        userQuery : `str` or `None`
            User-provided expression to limit the data IDs processed.
        externalDataId : `DataCoordinate`
            Externally-provided data ID that should be used to restrict the
            results, just as if these constraints had been included via ``AND``
            in ``userQuery``.  This includes (at least) any instrument named
            in the pipeline definition.

        Returns
        -------
        commonDataIds : \
                `lsst.daf.butler.registry.queries.DataCoordinateQueryResults`
            An interface to a database temporary table containing all data IDs
            that will appear in this `QuantumGraph`.  Returned inside a
            context manager, which will drop the temporary table at the end of
            the `with` block in which this method is called.
        """
        _LOG.debug("Building query for data IDs.")
        # Initialization datasets always have empty data IDs.
        emptyDataId = DataCoordinate.makeEmpty(registry.dimensions)
        for datasetType, refs in itertools.chain(
                self.initInputs.items(), self.initIntermediates.items(),
                self.initOutputs.items()):
            refs[emptyDataId] = DatasetRef(datasetType, emptyDataId)
        # Run one big query for the data IDs for task dimensions and regular
        # inputs and outputs.  We limit the query to only dimensions that are
        # associated with the input dataset types, but don't (yet) try to
        # obtain the dataset_ids for those inputs.
        _LOG.debug("Submitting data ID query and materializing results.")
        with registry.queryDataIds(
                self.dimensions,
                datasets=list(self.inputs),
                collections=collections,
                where=userQuery,
                dataId=externalDataId,
        ).materialize() as commonDataIds:
            _LOG.debug("Expanding data IDs.")
            commonDataIds = commonDataIds.expanded()
            _LOG.debug(
                "Iterating over query results to associate quanta with datasets."
            )
            # Iterate over query results, populating data IDs for datasets and
            # quanta and then connecting them to each other.
            # Start at -1 so the row count logged below is correct even if
            # the query returns no rows.
            n = -1
            for n, commonDataId in enumerate(commonDataIds):
                # Create DatasetRefs for all DatasetTypes from this result row,
                # noting that we might have created some already.
                # We remember both those that already existed and those that we
                # create now.
                refsForRow = {}
                for datasetType, refs in itertools.chain(
                        self.inputs.items(), self.intermediates.items(),
                        self.outputs.items()):
                    datasetDataId = commonDataId.subset(datasetType.dimensions)
                    ref = refs.get(datasetDataId)
                    if ref is None:
                        ref = DatasetRef(datasetType, datasetDataId)
                        refs[datasetDataId] = ref
                    refsForRow[datasetType.name] = ref
                # Create _QuantumScaffolding objects for all tasks from this
                # result row, noting that we might have created some already.
                for task in self.tasks:
                    quantumDataId = commonDataId.subset(task.dimensions)
                    quantum = task.quanta.get(quantumDataId)
                    if quantum is None:
                        quantum = _QuantumScaffolding(task=task,
                                                      dataId=quantumDataId)
                        task.quanta[quantumDataId] = quantum
                    # Whether this is a new quantum or an existing one, we can
                    # now associate the DatasetRefs for this row with it.  The
                    # fact that a Quantum data ID and a dataset data ID both
                    # came from the same result row is what tells us they
                    # should be associated.
                    # Many of these associations will be duplicates (because
                    # another query row that differed from this one only in
                    # irrelevant dimensions already added them); the keyed
                    # assignments below make re-adding them a no-op.
                    for datasetType in task.inputs:
                        ref = refsForRow[datasetType.name]
                        quantum.inputs[datasetType.name][ref.dataId] = ref
                    for datasetType in task.outputs:
                        ref = refsForRow[datasetType.name]
                        quantum.outputs[datasetType.name][ref.dataId] = ref
            _LOG.debug("Finished processing %d rows from data ID query.", n)
            yield commonDataIds
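
Because the body ends in a bare `yield`, this method is presumably wrapped as a context manager; the calling pattern (mirroring Example 2) then looks like the sketch below, with the temporary table dropped as soon as the `with` block exits.

with scaffolding.connectDataIds(registry, collections, userQuery, dataId) as commonDataIds:
    # commonDataIds is backed by a temporary table and is only valid here.
    scaffolding.resolveDatasetRefs(registry, collections, run, commonDataIds)
# On exit the temporary table is dropped; commonDataIds must not be reused.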