def testQuantum(self):
    registry = self.makeRegistry()
    if not registry.limited:
        registry.addDimensionEntry("instrument", {"instrument": "DummyCam"})
    run = registry.makeRun(collection="test")
    storageClass = StorageClass("testQuantum")
    registry.storageClasses.registerStorageClass(storageClass)
    # Make two predicted inputs
    datasetType1 = DatasetType(name="dst1",
                               dimensions=registry.dimensions.extract(("instrument",)),
                               storageClass=storageClass)
    registry.registerDatasetType(datasetType1)
    ref1 = registry.addDataset(datasetType1, dataId={"instrument": "DummyCam"}, run=run)
    datasetType2 = DatasetType(name="dst2",
                               dimensions=registry.dimensions.extract(("instrument",)),
                               storageClass=storageClass)
    registry.registerDatasetType(datasetType2)
    ref2 = registry.addDataset(datasetType2, dataId={"instrument": "DummyCam"}, run=run)
    # Create and add a Quantum
    quantum = Quantum(run=run,
                      task="some.fully.qualified.SuperTask",
                      startTime=datetime(2018, 1, 1),
                      endTime=datetime(2018, 1, 2),
                      host="localhost")
    quantum.addPredictedInput(ref1)
    quantum.addPredictedInput(ref2)
    # Quantum is not yet in Registry, so can't mark input as actual
    with self.assertRaises(KeyError):
        registry.markInputUsed(quantum, ref1)
    registry.addQuantum(quantum)
    # Now we can
    registry.markInputUsed(quantum, ref1)
    outQuantum = registry.getQuantum(quantum.id)
    self.assertEqual(outQuantum, quantum)
    # Removing a predictedInput dataset should be enough to remove the
    # Quantum; we don't want to allow Quantums with inaccurate information
    # to exist.
    registry.removeDataset(ref1)
    self.assertIsNone(registry.getQuantum(quantum.id))
def _makeQuanta(self, config):
    """Create a set of Quanta."""
    run = Run(collection=1, environment=None, pipeline=None)
    descriptor = pipeBase.DatasetTypeDescriptor.fromConfig(config.input)
    dstype0 = descriptor.datasetType
    descriptor = pipeBase.DatasetTypeDescriptor.fromConfig(config.output)
    dstype1 = descriptor.datasetType
    quanta = []
    for visit in range(100):
        quantum = Quantum(run=run, task=None)
        quantum.addPredictedInput(self._makeDSRefVisit(dstype0, visit))
        quantum.addOutput(self._makeDSRefVisit(dstype1, visit))
        quanta.append(quantum)
    return quanta
def addQuantum(self, quantum: Quantum):
    config = self.taskDef.config
    connectionClass = config.connections.ConnectionsClass
    connectionInstance = connectionClass(config=config)
    # This will raise if one of the check conditions is not met, which is
    # the intended behavior.
    result = connectionInstance.adjustQuantum(quantum.predictedInputs)
    quantum._predictedInputs = NamedKeyDict(result)
    # If execution has reached this far, the quantum passed the checks; add it.
    self.quanta.append(quantum)
def testRunQuantum(self):
    inputId = {
        "instrument": self.CAMERA_ID,
        "visit": self.VISIT_ID,
        "detector": self.CHIP_ID,
    }
    butler = self._makeButler()
    # self.task.config is not persistable because it refers to a local class.
    # We don't actually use the persisted config, so just make a new one.
    butler.put(self.task.ConfigClass(), "apdb_marker", inputId)
    quantum = Quantum(taskClass=self.taskClass)
    quantum.addPredictedInput(
        ref_from_connection(butler, self.connections.dbInfo, inputId))
    quantum.addOutput(
        ref_from_connection(butler, self.connections.measurement, {
            "instrument": self.CAMERA_ID,
        }))

    run_quantum(self.task, butler, quantum)

    # Did the output data ID get passed to DummyTask.run?
    measurement = butler.get(self.connections.measurement.name,
                             instrument=self.CAMERA_ID)
    self.assertEqual(measurement.quantity,
                     len(self.CAMERA_ID) * u.dimensionless_unscaled)
def testConstructor(self):
    """Test of constructor.
    """
    # Quantum-specific arguments
    taskName = "some.task.object"  # can't use a real PipelineTask due to inverted package dependency

    quantum = Quantum(taskName=taskName)
    self.assertEqual(quantum.taskName, taskName)
    self.assertEqual(quantum.initInputs, {})
    self.assertEqual(quantum.inputs, NamedKeyDict())
    self.assertEqual(quantum.outputs, {})
    self.assertIsNone(quantum.dataId)

    universe = DimensionUniverse()
    instrument = "DummyCam"
    datasetTypeName = "test_ds"
    storageClass = StorageClass("testref_StructuredData")
    datasetType = DatasetType(datasetTypeName,
                              universe.extract(("instrument", "visit")),
                              storageClass)
    predictedInputs = {
        datasetType: [
            DatasetRef(datasetType, dict(instrument=instrument, visit=42)),
            DatasetRef(datasetType, dict(instrument=instrument, visit=43)),
        ]
    }
    outputs = {
        datasetType: [
            DatasetRef(datasetType, dict(instrument=instrument, visit=42)),
            DatasetRef(datasetType, dict(instrument=instrument, visit=43)),
        ]
    }

    quantum = Quantum(taskName=taskName, inputs=predictedInputs, outputs=outputs)
    self.assertEqual(len(quantum.inputs[datasetType]), 2)
    self.assertEqual(len(quantum.outputs[datasetType]), 2)
def makeQuantum(task, butler, dataId, ioDataIds):
    """Create a Quantum for particular data ID(s).

    Parameters
    ----------
    task : `lsst.pipe.base.PipelineTask`
        The task whose processing the quantum represents.
    butler : `lsst.daf.butler.Butler`
        The collection the quantum refers to.
    dataId : any data ID type
        The data ID of the quantum. Must have the same dimensions as
        ``task``'s connections class.
    ioDataIds : `collections.abc.Mapping` [`str`]
        A mapping keyed by input/output names. Values must be data IDs for
        single connections and sequences of data IDs for multiple
        connections.

    Returns
    -------
    quantum : `lsst.daf.butler.Quantum`
        A quantum for ``task``, when called with ``ioDataIds``.
    """
    connections = task.config.ConnectionsClass(config=task.config)

    try:
        inputs = defaultdict(list)
        outputs = defaultdict(list)
        for name in itertools.chain(connections.inputs, connections.prerequisiteInputs):
            connection = connections.__getattribute__(name)
            _checkDataIdMultiplicity(name, ioDataIds[name], connection.multiple)
            ids = _normalizeDataIds(ioDataIds[name])
            for id in ids:
                ref = _refFromConnection(butler, connection, id)
                inputs[ref.datasetType].append(ref)
        for name in connections.outputs:
            connection = connections.__getattribute__(name)
            _checkDataIdMultiplicity(name, ioDataIds[name], connection.multiple)
            ids = _normalizeDataIds(ioDataIds[name])
            for id in ids:
                ref = _refFromConnection(butler, connection, id)
                outputs[ref.datasetType].append(ref)
        quantum = Quantum(taskClass=type(task),
                          dataId=dataId,
                          inputs=inputs,
                          outputs=outputs)
        return quantum
    except KeyError as e:
        raise ValueError("Mismatch in input data.") from e
def from_simple(
    cls,
    simple: SerializedQuantumNode,
    taskDefMap: Dict[str, TaskDef],
    universe: DimensionUniverse,
    reconstitutedDimensions: Optional[Dict[int, Tuple[str, DimensionRecord]]] = None,
) -> QuantumNode:
    return QuantumNode(
        quantum=Quantum.from_simple(
            simple.quantum, universe, reconstitutedDimensions=reconstitutedDimensions),
        taskDef=taskDefMap[simple.taskLabel],
        nodeId=simple.nodeId,
    )
def _makeQuanta(self, config):
    """Create a set of Quanta."""
    universe = DimensionUniverse()
    connections = config.connections.ConnectionsClass(config=config)
    dstype0 = connections.input.makeDatasetType(universe)
    dstype1 = connections.output.makeDatasetType(universe)
    quanta = []
    for visit in range(100):
        inputRef = self._makeDSRefVisit(dstype0, visit, universe)
        outputRef = self._makeDSRefVisit(dstype1, visit, universe)
        quantum = Quantum(
            inputs={inputRef.datasetType: [inputRef]},
            outputs={outputRef.datasetType: [outputRef]},
        )
        quanta.append(quantum)
    return quanta
def updatedQuantumInputs(self, quantum, butler):
    """Update quantum with extra information, returning a new updated
    Quantum.

    Some methods may require input DatasetRefs to have a non-None
    ``dataset_id``, but in the case of an intermediate dataset it may not
    be filled in during QuantumGraph construction. This method retrieves
    the missing info from the registry.

    Parameters
    ----------
    quantum : `~lsst.daf.butler.Quantum`
        Single Quantum instance.
    butler : `~lsst.daf.butler.Butler`
        Data butler.

    Returns
    -------
    update : `~lsst.daf.butler.Quantum`
        Updated Quantum instance.
    """
    updatedInputs = defaultdict(list)
    for key, refsForDatasetType in quantum.inputs.items():
        newRefsForDatasetType = updatedInputs[key]
        for ref in refsForDatasetType:
            if ref.id is None:
                resolvedRef = butler.registry.findDataset(ref.datasetType, ref.dataId,
                                                          collections=butler.collections)
                if resolvedRef is None:
                    raise ValueError(
                        f"Cannot find {ref.datasetType.name} with id {ref.dataId} "
                        f"in collections {butler.collections}."
                    )
                newRefsForDatasetType.append(resolvedRef)
                _LOG.debug("Updating dataset ID for %s", ref)
            else:
                newRefsForDatasetType.append(ref)
    return Quantum(taskName=quantum.taskName,
                   taskClass=quantum.taskClass,
                   dataId=quantum.dataId,
                   initInputs=quantum.initInputs,
                   inputs=updatedInputs,
                   outputs=quantum.outputs)
def _makeQGraph():
    """Make a trivial QuantumGraph with one quantum.

    The only thing that we need to do with this quantum graph is to pickle
    it; the quanta in this graph are not usable for anything else.

    Returns
    -------
    qgraph : `~lsst.pipe.base.QuantumGraph`
    """
    # The task name in TaskDef needs to be a real importable name; use one
    # that is sure to exist.
    taskDef = TaskDef(taskName="lsst.pipe.base.Struct", config=SimpleConfig())
    quanta = [
        Quantum(
            taskName="lsst.pipe.base.Struct",
            inputs={FakeTaskDef("A"): FakeDSRef("A", (1, 2))},  # type: ignore
        )
    ]
    qgraph = QuantumGraph({taskDef: set(quanta)})
    return qgraph
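# A minimal round-trip sketch for the helper above. Per its docstring, the
# only supported use of this graph is pickling; the quanta inside are not
# usable for anything else. Assumes _makeQGraph is in scope.
import pickle

qgraph = _makeQGraph()
restored = pickle.loads(pickle.dumps(qgraph))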
def testConstructor(self):
    """Test of constructor.
    """
    # Quantum-specific arguments
    run = None  # TODO add Run
    taskName = "some.task.object"  # can't use a real PipelineTask due to inverted package dependency
    # Base class arguments
    startTime = datetime(2018, 1, 1)
    endTime = datetime(2018, 1, 2)
    host = "localhost"

    quantum = Quantum(taskName=taskName, run=run,
                      startTime=startTime, endTime=endTime, host=host)
    self.assertEqual(quantum.taskName, taskName)
    self.assertEqual(quantum.run, run)
    self.assertEqual(quantum.predictedInputs, NamedKeyDict())
    self.assertEqual(quantum.actualInputs, NamedKeyDict())
    self.assertIsNone(quantum.dataId)
    self.assertIsNone(quantum.id)
    self.assertEqual(quantum.startTime, startTime)
    self.assertEqual(quantum.endTime, endTime)
    self.assertEqual(quantum.host, host)
def makeQuantum(self) -> Quantum:
    """Transform the scaffolding object into a true `Quantum` instance.

    Returns
    -------
    quantum : `Quantum`
        An actual `Quantum` instance.
    """
    allInputs = self.inputs.unpackMultiRefs()
    allInputs.update(self.prerequisites.unpackMultiRefs())
    # Give the task's Connections class an opportunity to remove some
    # inputs, or complain if they are unacceptable.
    # This will raise if one of the check conditions is not met, which is
    # the intended behavior.
    allInputs = self.task.taskDef.connections.adjustQuantum(allInputs)
    return Quantum(
        taskName=self.task.taskDef.taskName,
        taskClass=self.task.taskDef.taskClass,
        dataId=self.dataId,
        initInputs=self.task.initInputs.unpackSingleRefs(),
        inputs=allInputs,
        outputs=self.outputs.unpackMultiRefs(),
    )
def _makeQuanta(self, config):
    """Create a set of Quanta."""
    universe = DimensionUniverse.fromConfig()
    run = Run(collection=1, environment=None, pipeline=None)
    descriptor = pipeBase.DatasetTypeDescriptor.fromConfig(config.input)
    dstype0 = descriptor.makeDatasetType(universe)
    descriptor = pipeBase.DatasetTypeDescriptor.fromConfig(config.output)
    dstype1 = descriptor.makeDatasetType(universe)
    quanta = []
    for visit in range(100):
        quantum = Quantum(run=run, task=None)
        quantum.addPredictedInput(self._makeDSRefVisit(dstype0, visit))
        quantum.addOutput(self._makeDSRefVisit(dstype1, visit))
        quanta.append(quantum)
    return quanta
def _makeQuanta(self, config):
    """Create a set of Quanta."""
    universe = DimensionUniverse()
    run = Run(collection=1, environment=None, pipeline=None)
    connections = config.connections.ConnectionsClass(config=config)
    dstype0 = connections.input.makeDatasetType(universe)
    dstype1 = connections.output.makeDatasetType(universe)
    quanta = []
    for visit in range(100):
        quantum = Quantum(run=run)
        quantum.addPredictedInput(self._makeDSRefVisit(dstype0, visit, universe))
        quantum.addOutput(self._makeDSRefVisit(dstype1, visit, universe))
        quanta.append(quantum)
    return quanta
def testAddInputsOutputs(self):
    """Test of addPredictedInput() method.
    """
    quantum = Quantum(taskName="some.task.object", run=None)

    # start with empty
    self.assertEqual(quantum.predictedInputs, dict())
    universe = DimensionUniverse()
    instrument = "DummyCam"
    datasetTypeName = "test_ds"
    storageClass = StorageClass("testref_StructuredData")
    datasetType = DatasetType(datasetTypeName,
                              universe.extract(("instrument", "visit")),
                              storageClass)

    # add one ref
    ref = DatasetRef(datasetType, dict(instrument=instrument, visit=42))
    quantum.addPredictedInput(ref)
    self.assertIn(datasetTypeName, quantum.predictedInputs)
    self.assertEqual(len(quantum.predictedInputs[datasetTypeName]), 1)
    # add second ref
    ref = DatasetRef(datasetType, dict(instrument=instrument, visit=43))
    quantum.addPredictedInput(ref)
    self.assertEqual(len(quantum.predictedInputs[datasetTypeName]), 2)

    # mark last ref as actually used
    self.assertEqual(quantum.actualInputs, dict())
    quantum._markInputUsed(ref)
    self.assertIn(datasetTypeName, quantum.actualInputs)
    self.assertEqual(len(quantum.actualInputs[datasetTypeName]), 1)

    # add couple of outputs too
    self.assertEqual(quantum.outputs, dict())
    ref = DatasetRef(datasetType, dict(instrument=instrument, visit=42))
    quantum.addOutput(ref)
    self.assertIn(datasetTypeName, quantum.outputs)
    self.assertEqual(len(quantum.outputs[datasetTypeName]), 1)
    ref = DatasetRef(datasetType, dict(instrument=instrument, visit=43))
    quantum.addOutput(ref)
    self.assertEqual(len(quantum.outputs[datasetTypeName]), 2)
def setUp(self):
    config = Config(
        {
            "version": 1,
            "namespace": "pipe_base_test",
            "skypix": {
                "common": "htm7",
                "htm": {
                    "class": "lsst.sphgeom.HtmPixelization",
                    "max_level": 24,
                },
            },
            "elements": {
                "A": {
                    "keys": [
                        {
                            "name": "id",
                            "type": "int",
                        }
                    ],
                    "storage": {
                        "cls": "lsst.daf.butler.registry.dimensions.table.TableDimensionRecordStorage",
                    },
                },
                "B": {
                    "keys": [
                        {
                            "name": "id",
                            "type": "int",
                        }
                    ],
                    "storage": {
                        "cls": "lsst.daf.butler.registry.dimensions.table.TableDimensionRecordStorage",
                    },
                },
            },
            "packers": {},
        }
    )
    universe = DimensionUniverse(config=config)
    # need to make a mapping of TaskDef to set of quanta
    quantumMap = {}
    tasks = []
    for task, label in (
        (Dummy1PipelineTask, "R"),
        (Dummy2PipelineTask, "S"),
        (Dummy3PipelineTask, "T"),
        (Dummy4PipelineTask, "U"),
    ):
        config = task.ConfigClass()
        taskDef = TaskDef(get_full_type_name(task), config, task, label)
        tasks.append(taskDef)
        quantumSet = set()
        connections = taskDef.connections
        for a, b in ((1, 2), (3, 4)):
            if connections.initInputs:
                initInputDSType = DatasetType(
                    connections.initInput.name,
                    tuple(),
                    storageClass=connections.initInput.storageClass,
                    universe=universe,
                )
                initRefs = [DatasetRef(initInputDSType, DataCoordinate.makeEmpty(universe))]
            else:
                initRefs = None
            inputDSType = DatasetType(
                connections.input.name,
                connections.input.dimensions,
                storageClass=connections.input.storageClass,
                universe=universe,
            )
            inputRefs = [
                DatasetRef(inputDSType, DataCoordinate.standardize({"A": a, "B": b}, universe=universe))
            ]
            outputDSType = DatasetType(
                connections.output.name,
                connections.output.dimensions,
                storageClass=connections.output.storageClass,
                universe=universe,
            )
            outputRefs = [
                DatasetRef(outputDSType, DataCoordinate.standardize({"A": a, "B": b}, universe=universe))
            ]
            quantumSet.add(
                Quantum(
                    taskName=task.__qualname__,
                    dataId=DataCoordinate.standardize({"A": a, "B": b}, universe=universe),
                    taskClass=task,
                    initInputs=initRefs,
                    inputs={inputDSType: inputRefs},
                    outputs={outputDSType: outputRefs},
                )
            )
        quantumMap[taskDef] = quantumSet
    self.tasks = tasks
    self.quantumMap = quantumMap
    self.qGraph = QuantumGraph(quantumMap, metadata=METADATA)
    self.universe = universe
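# Sketch of an assertion a test built on this fixture might make. It assumes
# QuantumGraph supports len() over its quanta and exposes the metadata
# mapping passed at construction; treat both as hedged, not verified here.
def testFixtureShape(self):
    self.assertEqual(len(self.qGraph), 8)  # 4 tasks x 2 data IDs each
    self.assertEqual(dict(self.qGraph.metadata), METADATA)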
def fillQuanta(self, registry, inputCollections, *, skipExisting=True):
    """Define quanta for each task by splitting up the datasets
    associated with each task data ID.

    This method populates `_TaskScaffolding.quanta`.

    Parameters
    ----------
    registry : `lsst.daf.butler.Registry`
        Registry for the data repository; used for all data ID queries.
    inputCollections : `~collections.abc.Mapping`
        Mapping from dataset type name to an ordered sequence of
        collections to search for that dataset.  A `defaultdict` is
        recommended for the case where the same collections should be
        used for most datasets.
    skipExisting : `bool`, optional
        If `True` (default), a Quantum is not created if all its outputs
        already exist.
    """
    for task in self.tasks:
        for quantumDataId in task.dataIds:
            # Identify the (regular) inputs that correspond to the Quantum
            # with this data ID.  These are those whose data IDs have the
            # same values for all dimensions they have in common.
            # We do this with data IDs expanded to include implied
            # dimensions, which is why _DatasetScaffolding.dimensions is
            # thus expanded even though DatasetType.dimensions is not.
            inputs = NamedKeyDict()
            for datasetType, scaffolding in task.inputs.items():
                inputs[datasetType] = [ref for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds)
                                       if quantumDataId.matches(dataId)]
            # Same for outputs.
            outputs = NamedKeyDict()
            allOutputsPresent = True
            for datasetType, scaffolding in task.outputs.items():
                outputs[datasetType] = []
                for ref, dataId in zip(scaffolding.refs, scaffolding.dataIds):
                    if quantumDataId.matches(dataId):
                        if ref.id is None:
                            allOutputsPresent = False
                        else:
                            assert skipExisting, "Existing outputs should have already been identified."
                            if not allOutputsPresent:
                                raise OutputExistsError(f"Output {datasetType.name} with data ID "
                                                        f"{dataId} already exists, but other outputs "
                                                        f"for task with label {task.taskDef.label} "
                                                        f"and data ID {quantumDataId} do not.")
                        outputs[datasetType].append(ref)
            if allOutputsPresent and skipExisting:
                continue
            # Look up prerequisite datasets in the input collection(s).
            # These may have dimensions that extend beyond those we queried
            # for originally, because we want to permit those data ID
            # values to differ across quanta and dataset types.
            # For example, the same quantum may have a flat and bias with
            # a different calibration_label, or a refcat with a skypix
            # value that overlaps the quantum's data ID's region, but not
            # the user expression used for the initial query.
            for datasetType, scaffolding in task.prerequisites.items():
                refs = list(
                    registry.queryDatasets(
                        datasetType,
                        collections=inputCollections[datasetType.name],
                        dataId=quantumDataId,
                        deduplicate=True,
                        expand=True,
                    )
                )
                inputs[datasetType] = refs
            task.addQuantum(
                Quantum(
                    taskName=task.taskDef.taskName,
                    taskClass=task.taskDef.taskClass,
                    dataId=quantumDataId,
                    initInputs=task.initInputs.unpackRefs(),
                    predictedInputs=inputs,
                    outputs=outputs,
                )
            )
def _makeGraph(self, taskDatasets, required, optional, prerequisite,
               initInputs, initOutputs, originInfo, userQuery,
               perDatasetTypeDimensions=()):
    """Make QuantumGraph instance.

    Parameters
    ----------
    taskDatasets : sequence of `_TaskDatasetTypes`
        Tasks with their inputs and outputs.
    required : `set` of `~lsst.daf.butler.DatasetType`
        Datasets that must exist in the repository in order to generate a
        QuantumGraph node that consumes them.
    optional : `set` of `~lsst.daf.butler.DatasetType`
        Datasets that will be produced by the graph, but may exist in the
        repository.  If ``self.skipExisting`` is `True` and all outputs of
        a particular node already exist, it will be skipped.  Otherwise
        pre-existing datasets of these types will cause
        `OutputExistsError` to be raised.
    prerequisite : `set` of `~lsst.daf.butler.DatasetType`
        Datasets that must exist in the repository, but whose absence
        should cause `PrerequisiteMissingError` to be raised if they are
        needed by any graph node that would otherwise be created.
    initInputs : `set` of `DatasetType`
        Datasets which should exist in the input repository, and will be
        used in task initialization.
    initOutputs : `set` of `DatasetType`
        Datasets which will be created in task initialization.
    originInfo : `DatasetOriginInfo`
        Object which provides names of the input/output collections.
    userQuery : `str`
        String which defines the user-defined selection for the registry;
        should be empty or `None` if there are no restrictions on data
        selection.
    perDatasetTypeDimensions : iterable of `Dimension` or `str`
        Dimensions (or names thereof) that may have different values for
        different dataset types within the same quantum.

    Returns
    -------
    `QuantumGraph` instance.
    """
    rows = self.registry.selectMultipleDatasetTypes(
        originInfo, userQuery,
        required=required, optional=optional, prerequisite=prerequisite,
        perDatasetTypeDimensions=perDatasetTypeDimensions
    )

    # store result locally for multi-pass algorithm below
    # TODO: change it to single pass
    dimensionVerse = []
    try:
        for row in rows:
            _LOG.debug("row: %s", row)
            dimensionVerse.append(row)
    except LookupError as err:
        raise PrerequisiteMissingError(str(err)) from err

    # Next step is to group by task quantum dimensions
    qgraph = QuantumGraph()
    qgraph._inputDatasetTypes = (required | prerequisite)
    qgraph._outputDatasetTypes = optional
    for dsType in initInputs:
        for collection in originInfo.getInputCollections(dsType.name):
            result = self.registry.find(collection, dsType)
            if result is not None:
                qgraph.initInputs.append(result)
                break
        else:
            raise GraphBuilderError(f"Could not find initInput {dsType.name} in any input"
                                    " collection")
    for dsType in initOutputs:
        qgraph.initOutputs.append(DatasetRef(dsType, {}))

    for taskDss in taskDatasets:
        taskQuantaInputs = {}   # key is the quantum dataId (as tuple)
        taskQuantaOutputs = {}  # key is the quantum dataId (as tuple)
        qlinks = []
        for dimensionName in taskDss.taskDef.config.quantum.dimensions:
            dimension = self.dimensions[dimensionName]
            qlinks += dimension.links()
        _LOG.debug("task %s qdimensions: %s", taskDss.taskDef.label, qlinks)

        # some rows will be non-unique for subset of dimensions, create
        # temporary structure to remove duplicates
        for row in dimensionVerse:
            qkey = tuple((col, row.dataId[col]) for col in qlinks)
            _LOG.debug("qkey: %s", qkey)

            def _datasetRefKey(datasetRef):
                return tuple(sorted(datasetRef.dataId.items()))

            qinputs = taskQuantaInputs.setdefault(qkey, {})
            for dsType in taskDss.inputs:
                datasetRefs = qinputs.setdefault(dsType, {})
                datasetRef = row.datasetRefs[dsType]
                datasetRefs[_datasetRefKey(datasetRef)] = datasetRef
                _LOG.debug("add input datasetRef: %s %s", dsType.name, datasetRef)

            qoutputs = taskQuantaOutputs.setdefault(qkey, {})
            for dsType in taskDss.outputs:
                datasetRefs = qoutputs.setdefault(dsType, {})
                datasetRef = row.datasetRefs[dsType]
                datasetRefs[_datasetRefKey(datasetRef)] = datasetRef
                _LOG.debug("add output datasetRef: %s %s", dsType.name, datasetRef)

        # all nodes for this task
        quanta = []
        for qkey in taskQuantaInputs:
            # taskQuantaInputs and taskQuantaOutputs have the same keys
            _LOG.debug("make quantum for qkey: %s", qkey)
            quantum = Quantum(run=None, task=None)

            # add all outputs, but check first that outputs don't exist
            outputs = list(chain.from_iterable(datasetRefs.values()
                                               for datasetRefs in taskQuantaOutputs[qkey].values()))
            for ref in outputs:
                _LOG.debug("add output: %s", ref)
            if self.skipExisting and all(ref.id is not None for ref in outputs):
                _LOG.debug("all output datasetRefs already exist, skip quantum")
                continue
            if any(ref.id is not None for ref in outputs):
                # some outputs exist, can't override them
                raise OutputExistsError(taskDss.taskDef.taskName, outputs)

            for ref in outputs:
                quantum.addOutput(ref)

            # add all inputs
            for datasetRefs in taskQuantaInputs[qkey].values():
                for ref in datasetRefs.values():
                    quantum.addPredictedInput(ref)
                    _LOG.debug("add input: %s", ref)

            quanta.append(quantum)

        qgraph.append(QuantumGraphTaskNodes(taskDss.taskDef, quanta))

    return qgraph
def _makeGraph(self, taskDatasets, inputs, outputs, initInputs, initOutputs,
               originInfo, userQuery):
    """Make QuantumGraph instance.

    Parameters
    ----------
    taskDatasets : sequence of `_TaskDatasetTypes`
        Tasks with their inputs and outputs.
    inputs : `set` of `DatasetType`
        Datasets which should already exist in the input repository.
    outputs : `set` of `DatasetType`
        Datasets which will be created by tasks.
    initInputs : `set` of `DatasetType`
        Datasets which should exist in the input repository, and will be
        used in task initialization.
    initOutputs : `set` of `DatasetType`
        Datasets which will be created in task initialization.
    originInfo : `DatasetOriginInfo`
        Object which provides names of the input/output collections.
    userQuery : `str`
        String which defines the user-defined selection for the registry;
        should be empty or `None` if there are no restrictions on data
        selection.

    Returns
    -------
    `QuantumGraph` instance.
    """
    parsedQuery = self._parseUserQuery(userQuery or "")
    expr = None if parsedQuery is None else str(parsedQuery)
    rows = self.registry.selectDimensions(originInfo, expr, inputs, outputs)

    # store result locally for multi-pass algorithm below
    # TODO: change it to single pass
    dimensionVerse = []
    for row in rows:
        _LOG.debug("row: %s", row)
        dimensionVerse.append(row)

    # Next step is to group by task quantum dimensions
    qgraph = QuantumGraph()
    qgraph._inputDatasetTypes = inputs
    qgraph._outputDatasetTypes = outputs
    for dsType in initInputs:
        for collection in originInfo.getInputCollections(dsType.name):
            result = self.registry.find(collection, dsType)
            if result is not None:
                qgraph.initInputs.append(result)
                break
        else:
            raise GraphBuilderError(f"Could not find initInput {dsType.name} in any input"
                                    " collection")
    for dsType in initOutputs:
        qgraph.initOutputs.append(DatasetRef(dsType, {}))

    for taskDss in taskDatasets:
        taskQuantaInputs = {}   # key is the quantum dataId (as tuple)
        taskQuantaOutputs = {}  # key is the quantum dataId (as tuple)
        qlinks = []
        for dimensionName in taskDss.taskDef.config.quantum.dimensions:
            dimension = self.dimensions[dimensionName]
            qlinks += dimension.link
        _LOG.debug("task %s qdimensions: %s", taskDss.taskDef.label, qlinks)

        # some rows will be non-unique for subset of dimensions, create
        # temporary structure to remove duplicates
        for row in dimensionVerse:
            qkey = tuple((col, row.dataId[col]) for col in qlinks)
            _LOG.debug("qkey: %s", qkey)

            def _dataRefKey(dataRef):
                return tuple(sorted(dataRef.dataId.items()))

            qinputs = taskQuantaInputs.setdefault(qkey, {})
            for dsType in taskDss.inputs:
                dataRefs = qinputs.setdefault(dsType, {})
                dataRef = row.datasetRefs[dsType]
                dataRefs[_dataRefKey(dataRef)] = dataRef
                _LOG.debug("add input dataRef: %s %s", dsType.name, dataRef)

            qoutputs = taskQuantaOutputs.setdefault(qkey, {})
            for dsType in taskDss.outputs:
                dataRefs = qoutputs.setdefault(dsType, {})
                dataRef = row.datasetRefs[dsType]
                dataRefs[_dataRefKey(dataRef)] = dataRef
                _LOG.debug("add output dataRef: %s %s", dsType.name, dataRef)

        # pre-flight does not fill dataset components, and graph users
        # may need to know that; re-retrieve all input datasets to have
        # their components properly filled.
        for qinputs in taskQuantaInputs.values():
            for dataRefs in qinputs.values():
                for key in dataRefs.keys():
                    if dataRefs[key].id is not None:
                        dataRefs[key] = self.registry.getDataset(dataRefs[key].id)

        # all nodes for this task
        quanta = []
        for qkey in taskQuantaInputs:
            # taskQuantaInputs and taskQuantaOutputs have the same keys
            _LOG.debug("make quantum for qkey: %s", qkey)
            quantum = Quantum(run=None, task=None)

            # add all outputs, but check first that outputs don't exist
            outputs = list(chain.from_iterable(dataRefs.values()
                                               for dataRefs in taskQuantaOutputs[qkey].values()))
            for ref in outputs:
                _LOG.debug("add output: %s", ref)
            if self.skipExisting and all(ref.id is not None for ref in outputs):
                _LOG.debug("all output dataRefs already exist, skip quantum")
                continue
            if any(ref.id is not None for ref in outputs):
                # some outputs exist, can't override them
                raise OutputExistsError(taskDss.taskDef.taskName, outputs)

            for ref in outputs:
                quantum.addOutput(ref)

            # add all inputs
            for dataRefs in taskQuantaInputs[qkey].values():
                for ref in dataRefs.values():
                    quantum.addPredictedInput(ref)
                    _LOG.debug("add input: %s", ref)

            quanta.append(quantum)

        qgraph.append(QuantumGraphNodes(taskDss.taskDef, quanta))

    return qgraph
def makeQuantum(
    task: PipelineTask,
    butler: Butler,
    dataId: DataId,
    ioDataIds: Mapping[str, Union[DataId, Sequence[DataId]]],
) -> Quantum:
    """Create a Quantum for particular data ID(s).

    Parameters
    ----------
    task : `lsst.pipe.base.PipelineTask`
        The task whose processing the quantum represents.
    butler : `lsst.daf.butler.Butler`
        The collection the quantum refers to.
    dataId : any data ID type
        The data ID of the quantum. Must have the same dimensions as
        ``task``'s connections class.
    ioDataIds : `collections.abc.Mapping` [`str`]
        A mapping keyed by input/output names. Values must be data IDs for
        single connections and sequences of data IDs for multiple
        connections.

    Returns
    -------
    quantum : `lsst.daf.butler.Quantum`
        A quantum for ``task``, when called with ``ioDataIds``.
    """
    connections = task.config.ConnectionsClass(config=task.config)

    try:
        _checkDimensionsMatch(butler.registry.dimensions, connections.dimensions, dataId.keys())
    except ValueError as e:
        raise ValueError("Error in quantum dimensions.") from e

    inputs = defaultdict(list)
    outputs = defaultdict(list)
    for name in itertools.chain(connections.inputs, connections.prerequisiteInputs):
        try:
            connection = connections.__getattribute__(name)
            _checkDataIdMultiplicity(name, ioDataIds[name], connection.multiple)
            ids = _normalizeDataIds(ioDataIds[name])
            for id in ids:
                ref = _refFromConnection(butler, connection, id)
                inputs[ref.datasetType].append(ref)
        except (ValueError, KeyError) as e:
            raise ValueError(f"Error in connection {name}.") from e
    for name in connections.outputs:
        try:
            connection = connections.__getattribute__(name)
            _checkDataIdMultiplicity(name, ioDataIds[name], connection.multiple)
            ids = _normalizeDataIds(ioDataIds[name])
            for id in ids:
                ref = _refFromConnection(butler, connection, id)
                outputs[ref.datasetType].append(ref)
        except (ValueError, KeyError) as e:
            raise ValueError(f"Error in connection {name}.") from e
    quantum = Quantum(
        taskClass=type(task),
        dataId=DataCoordinate.standardize(dataId, universe=butler.registry.dimensions),
        inputs=inputs,
        outputs=outputs,
    )
    return quantum
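# Hedged usage sketch for makeQuantum. The repository path, run name, task
# class, and connection names ("cat", "outputCat") are illustrative
# assumptions; ioDataIds must supply a data ID for every input,
# prerequisiteInput, and output connection (a sequence of data IDs for
# connections declared with multiple=True).
butler = Butler("/path/to/repo", run="test_run")  # hypothetical repo
task = MyPipelineTask()                           # hypothetical task
dataId = {"instrument": "DummyCam", "visit": 42}
quantum = makeQuantum(
    task, butler, dataId,
    ioDataIds={
        "cat": dataId,        # single input connection: one data ID
        "outputCat": dataId,  # outputs need data IDs too
    },
)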
def _pruner(
    datasetRefDict: _DatasetTracker[DatasetRef, QuantumNode],
    refsToRemove: Iterable[DatasetRef],
    *,
    alreadyPruned: Optional[Set[QuantumNode]] = None,
) -> None:
    r"""Prune the supplied dataset refs out of the datasetRefDict container,
    recursing to additional nodes dependent on the pruned refs.

    This function modifies datasetRefDict in-place.

    Parameters
    ----------
    datasetRefDict : `_DatasetTracker[DatasetRef, QuantumNode]`
        The dataset tracker that maps `DatasetRef`\ s to the Quantum Nodes
        that produce/consume that `DatasetRef`.
    refsToRemove : `Iterable` of `DatasetRef`
        The `DatasetRef`\ s which should be pruned from the input dataset
        tracker.
    alreadyPruned : `set` of `QuantumNode`
        A set of nodes which have been pruned from the dataset tracker.
    """
    if alreadyPruned is None:
        alreadyPruned = set()
    for ref in refsToRemove:
        # Make a copy here, because this structure will be modified in the
        # recursion; hitting a node more than once won't be much of an
        # issue, as we skip anything that has been processed.
        nodes = set(datasetRefDict.getConsumers(ref))
        for node in nodes:
            # This node will never be associated with this ref
            datasetRefDict.removeConsumer(ref, node)
            if node in alreadyPruned:
                continue
            # find the connection corresponding to the input ref
            connectionRefs = node.quantum.inputs.get(ref.datasetType)
            if connectionRefs is None:
                # look to see if any inputs are component refs that match
                # the input ref to prune
                others = ref.datasetType.makeAllComponentDatasetTypes()
                # for each other component type, check if there are
                # associated refs
                for other in others:
                    connectionRefs = node.quantum.inputs.get(other)
                    if connectionRefs is not None:
                        # now search the component refs and see which one
                        # matches the ref to trim
                        for cr in connectionRefs:
                            if cr.makeCompositeRef() == ref:
                                toRemove = cr
                        break
                else:
                    # Ref must be an initInput ref and we want to ignore
                    # those
                    raise RuntimeError(
                        f"Cannot prune on non-Input dataset type {ref.datasetType.name}"
                    )
            else:
                toRemove = ref

            tmpRefs = set(connectionRefs).difference((toRemove,))
            tmpConnections = NamedKeyDict[DatasetType, List[DatasetRef]](node.quantum.inputs.items())
            tmpConnections[toRemove.datasetType] = list(tmpRefs)
            helper = AdjustQuantumHelper(inputs=tmpConnections, outputs=node.quantum.outputs)
            assert node.quantum.dataId is not None, (
                "assert to make the type checker happy; it should not "
                "actually be possible for dataId to be None at this point"
            )

            # Try to adjust the quantum with the reduced refs to make sure
            # the node will still satisfy all its conditions.
            #
            # If it can't because NoWorkFound is raised, that means a
            # connection is no longer present, and the node should be
            # removed from the graph.
            try:
                helper.adjust_in_place(node.taskDef.connections, node.taskDef.label, node.quantum.dataId)
                newQuantum = Quantum(
                    taskName=node.quantum.taskName,
                    taskClass=node.quantum.taskClass,
                    dataId=node.quantum.dataId,
                    initInputs=node.quantum.initInputs,
                    inputs=helper.inputs,
                    outputs=helper.outputs,
                )
                # If the inputs or outputs were adjusted to something
                # different than what was supplied by the graph builder,
                # disassociate the node from those refs, and if they are
                # output refs, prune them from downstream tasks.  This
                # means that based on the new inputs the task wants to
                # produce fewer outputs, or consume fewer inputs.
                for condition, existingMapping, newMapping, remover in (
                    (
                        helper.inputs_adjusted,
                        node.quantum.inputs,
                        helper.inputs,
                        datasetRefDict.removeConsumer,
                    ),
                    (
                        helper.outputs_adjusted,
                        node.quantum.outputs,
                        helper.outputs,
                        datasetRefDict.removeProducer,
                    ),
                ):
                    if condition:
                        notNeeded = set()
                        for key in existingMapping:
                            if key not in newMapping:
                                compositeRefs = (
                                    r if not r.isComponent() else r.makeCompositeRef()
                                    for r in existingMapping[key]
                                )
                                notNeeded |= set(compositeRefs)
                                continue
                            notNeeded |= set(existingMapping[key]) - set(newMapping[key])
                        if notNeeded:
                            for ref in notNeeded:
                                if ref.isComponent():
                                    ref = ref.makeCompositeRef()
                                remover(ref, node)
                            if remover is datasetRefDict.removeProducer:
                                _pruner(datasetRefDict, notNeeded, alreadyPruned=alreadyPruned)
                object.__setattr__(node, "quantum", newQuantum)
                noWorkFound = False
            except NoWorkFound:
                noWorkFound = True

            if noWorkFound:
                # This will throw if the length is less than the minimum number
                for tmpRef in chain(chain.from_iterable(node.quantum.inputs.values()),
                                    node.quantum.initInputs.values()):
                    if tmpRef.isComponent():
                        tmpRef = tmpRef.makeCompositeRef()
                    datasetRefDict.removeConsumer(tmpRef, node)
                alreadyPruned.add(node)
                # prune all outputs produced by this node
                # mark that none of these will be produced
                forwardPrunes = set()
                for forwardRef in chain.from_iterable(node.quantum.outputs.values()):
                    datasetRefDict.removeProducer(forwardRef, node)
                    forwardPrunes.add(forwardRef)
                _pruner(datasetRefDict, forwardPrunes, alreadyPruned=alreadyPruned)
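# Self-contained toy model of the cascade at the heart of _pruner: the
# "tracker" is reduced to two plain dicts mapping each dataset to the sets
# of nodes that produce/consume it, and removing a dataset prunes its
# consumers and, recursively, everything downstream. This models only the
# NoWorkFound path; the real function first gives adjustQuantum a chance to
# accept the reduced inputs. All names here are illustrative.
def prune(producers, consumers, refsToRemove, alreadyPruned=None):
    if alreadyPruned is None:
        alreadyPruned = set()
    for ref in refsToRemove:
        for node in set(consumers.get(ref, set())):
            consumers[ref].discard(node)
            if node in alreadyPruned:
                continue
            alreadyPruned.add(node)
            # Outputs of a pruned node will never be produced; recurse.
            outputs = {r for r, makers in producers.items() if node in makers}
            for r in outputs:
                producers[r].discard(node)
            prune(producers, consumers, outputs, alreadyPruned)
    return alreadyPruned

# Toy usage: "q1" consumes dataset "a" and produces "b"; "q2" consumes "b".
producers = {"b": {"q1"}, "c": {"q2"}}
consumers = {"a": {"q1"}, "b": {"q2"}}
assert prune(producers, consumers, ["a"]) == {"q1", "q2"}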
def build_quantum_graph(
    cls,
    task_def,
    registry,
    constraint_order,
    constraint_ranges,
    where=None,
    collections=None,
):
    """Generate a `QuantumGraph` for running just this task.

    This is a temporary workaround for incomplete butler query support for
    HEALPix dimensions.

    Parameters
    ----------
    task_def : `lsst.pipe.base.TaskDef`
        Task definition.
    registry : `lsst.daf.butler.Registry`
        Client for the butler database.  May be read-only.
    constraint_order : `int`
        HEALPix order used to constrain which quanta are generated, via
        ``constraint_ranges``.  This should be a coarser grid (smaller
        order) than the order used for the task's quantum and output data
        IDs, and ideally something between the spatial scale of a patch
        and the data repository's "common skypix" system (usually
        ``htm7``).
    constraint_ranges : `lsst.sphgeom.RangeSet`
        RangeSet which describes constraint pixels (HEALPix NEST, with
        order ``constraint_order``) to constrain generated quanta.
    where : `str`, optional
        A boolean `str` expression of the form accepted by
        `Registry.queryDatasets` to constrain input datasets.  This may
        contain a constraint on tracts, patches, or bands, but not HEALPix
        indices.  Constraints on tracts and patches should usually be
        unnecessary, however: existing coadds that overlap the given
        HEALPix indices will be selected without such a constraint, and
        providing one may reject some that should normally be included.
    collections : `str` or `Iterable` [ `str` ], optional
        Collection or collections to search for input datasets, in order.
        If not provided, ``registry.defaults.collections`` will be
        searched.
    """
    config = task_def.config

    dataset_types = pipeBase.PipelineDatasetTypes.fromPipeline(pipeline=[task_def], registry=registry)
    # Since we know this is the only task in the pipeline, we know there
    # is only one overall input and one overall output.
    (input_dataset_type,) = dataset_types.inputs

    # Extract the main output dataset type (which needs multiple
    # DatasetRefs, and tells us the output HPX level), and make a set of
    # what remains for more mechanical handling later.
    output_dataset_type = dataset_types.outputs[task_def.connections.hips_exposures.name]
    incidental_output_dataset_types = dataset_types.outputs.copy()
    incidental_output_dataset_types.remove(output_dataset_type)
    (hpx_output_dimension,) = (
        d for d in output_dataset_type.dimensions if isinstance(d, SkyPixDimension)
    )

    constraint_hpx_pixelization = registry.dimensions[f"healpix{constraint_order}"].pixelization
    common_skypix_name = registry.dimensions.commonSkyPix.name
    common_skypix_pixelization = registry.dimensions.commonSkyPix.pixelization

    # We will need all the pixels at the quantum resolution as well.
    task_dimensions = registry.dimensions.extract(task_def.connections.dimensions)
    (hpx_dimension,) = (d for d in task_dimensions if d.name != "band")
    hpx_pixelization = hpx_dimension.pixelization

    if hpx_pixelization.level < constraint_order:
        raise ValueError(f"Quantum order {hpx_pixelization.level} must be < {constraint_order}")
    hpx_ranges = constraint_ranges.scaled(4**(hpx_pixelization.level - constraint_order))

    # We can be generous in looking for pixels here, because we constrain
    # by actual patch regions below.
    common_skypix_ranges = RangeSet()
    for begin, end in constraint_ranges:
        for hpx_index in range(begin, end):
            constraint_hpx_region = constraint_hpx_pixelization.pixel(hpx_index)
            common_skypix_ranges |= common_skypix_pixelization.envelope(constraint_hpx_region)

    # To keep the query from getting out of hand (and breaking) we
    # simplify until we have fewer than 100 ranges, which seems to work
    # fine.
    for simp in range(1, 10):
        if len(common_skypix_ranges) < 100:
            break
        common_skypix_ranges.simplify(simp)

    # Use that RangeSet to assemble a WHERE constraint expression.  This
    # could definitely get too big if the "constraint healpix" order is
    # too fine.
    where_terms = []
    bind = {}
    for n, (begin, end) in enumerate(common_skypix_ranges):
        stop = end - 1  # registry range syntax is inclusive
        if begin == stop:
            where_terms.append(f"{common_skypix_name} = cpx{n}")
            bind[f"cpx{n}"] = begin
        else:
            where_terms.append(f"({common_skypix_name} >= cpx{n}a AND {common_skypix_name} <= cpx{n}b)")
            bind[f"cpx{n}a"] = begin
            bind[f"cpx{n}b"] = stop
    if where is None:
        where = " OR ".join(where_terms)
    else:
        where = f"({where}) AND ({' OR '.join(where_terms)})"

    # Query for input datasets with this constraint, and ask for expanded
    # data IDs because we want regions.  Immediately group this by patch
    # so we don't do later geometric stuff n_bands more times than we
    # need to.
    input_refs = registry.queryDatasets(
        input_dataset_type,
        where=where,
        findFirst=True,
        collections=collections,
        bind=bind,
    ).expanded()
    inputs_by_patch = defaultdict(set)
    patch_dimensions = registry.dimensions.extract(["patch"])
    for input_ref in input_refs:
        inputs_by_patch[input_ref.dataId.subset(patch_dimensions)].add(input_ref)
    if not inputs_by_patch:
        message_body = "\n".join(input_refs.explain_no_results())
        raise RuntimeError(f"No inputs found:\n{message_body}")

    # Iterate over patches and compute the set of output healpix pixels
    # that overlap each one.  Use that to associate inputs with output
    # pixels, but only for the output pixels we've already identified.
    inputs_by_hpx = defaultdict(set)
    for patch_data_id, input_refs_for_patch in inputs_by_patch.items():
        patch_hpx_ranges = hpx_pixelization.envelope(patch_data_id.region)
        for begin, end in patch_hpx_ranges & hpx_ranges:
            for hpx_index in range(begin, end):
                inputs_by_hpx[hpx_index].update(input_refs_for_patch)

    # Iterate over the dict we just created and create the actual quanta.
    quanta = []
    for hpx_index, input_refs_for_hpx_index in inputs_by_hpx.items():
        # Group inputs by band.
        input_refs_by_band = defaultdict(list)
        for input_ref in input_refs_for_hpx_index:
            input_refs_by_band[input_ref.dataId["band"]].append(input_ref)
        # Iterate over bands to make quanta.
        for band, input_refs_for_band in input_refs_by_band.items():
            data_id = registry.expandDataId({hpx_dimension: hpx_index, "band": band})

            hpx_pixel_ranges = RangeSet(hpx_index)
            hpx_output_ranges = hpx_pixel_ranges.scaled(4**(config.hips_order - hpx_pixelization.level))
            output_data_ids = []
            for begin, end in hpx_output_ranges:
                for hpx_output_index in range(begin, end):
                    output_data_ids.append(
                        registry.expandDataId({hpx_output_dimension: hpx_output_index, "band": band})
                    )
            outputs = {dt: [DatasetRef(dt, data_id)] for dt in incidental_output_dataset_types}
            outputs[output_dataset_type] = [
                DatasetRef(output_dataset_type, data_id) for data_id in output_data_ids
            ]
            quanta.append(
                Quantum(
                    taskName=task_def.taskName,
                    taskClass=task_def.taskClass,
                    dataId=data_id,
                    initInputs={},
                    inputs={input_dataset_type: input_refs_for_band},
                    outputs=outputs,
                )
            )

    if len(quanta) == 0:
        raise RuntimeError("Given constraints yielded empty quantum graph.")

    return pipeBase.QuantumGraph(quanta={task_def: quanta})
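# Hedged usage sketch for build_quantum_graph. The repo path, collection,
# and pixel index are illustrative assumptions; TheTask stands in for the
# concrete PipelineTask class defining this classmethod, and task_def is a
# TaskDef for it (as in the signature above).
from lsst.daf.butler import Butler
from lsst.sphgeom import RangeSet

butler = Butler("/path/to/repo")       # hypothetical repo
pixels = RangeSet(12345)               # HEALPix NEST index at the constraint order
qgraph = TheTask.build_quantum_graph(
    task_def,
    butler.registry,
    constraint_order=7,
    constraint_ranges=pixels,
    collections=["HSC/runs/example"],  # hypothetical collection
)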