def testConstructor(self):
    """Test that construction preserves and validates values.
    """
    # Construct an unresolved ref.
    ref = DatasetRef(self.datasetType, self.dataId)
    self.assertEqual(ref.datasetType, self.datasetType)
    self.assertEqual(ref.dataId, DataCoordinate.standardize(self.dataId, universe=self.universe),
                     msg=ref.dataId)
    self.assertIsInstance(ref.dataId, DataCoordinate)
    # Constructing an unresolved ref with run and/or components should
    # fail.
    run = "somerun"
    with self.assertRaises(ValueError):
        DatasetRef(self.datasetType, self.dataId, run=run)
    # Passing a data ID that is missing dimensions should fail.
    with self.assertRaises(KeyError):
        DatasetRef(self.datasetType, {"instrument": "DummyCam"})
    # Constructing a resolved ref should preserve run as well as everything
    # else.
    ref = DatasetRef(self.datasetType, self.dataId, id=1, run=run)
    self.assertEqual(ref.datasetType, self.datasetType)
    self.assertEqual(ref.dataId, DataCoordinate.standardize(self.dataId, universe=self.universe),
                     msg=ref.dataId)
    self.assertIsInstance(ref.dataId, DataCoordinate)
    self.assertEqual(ref.id, 1)
    self.assertEqual(ref.run, run)
def splitByStateFlags(self, dataIds: Optional[DataCoordinateSequence] = None, *,
                      expanded: bool = True,
                      complete: bool = True,
                      minimal: bool = True) -> SplitByStateFlags:
    """Given a sequence of data IDs, generate new equivalent sequences
    containing less information.

    Parameters
    ----------
    dataIds : `DataCoordinateSequence`, optional
        Data IDs to start from.  Defaults to ``self.allDataIds``.
        ``dataIds.hasRecords()`` and ``dataIds.hasFull()`` must both return
        `True`.
    expanded : `bool`, optional
        If `True` (default) include the original data IDs that contain all
        information in the result.
    complete : `bool`, optional
        If `True` (default) include data IDs for which ``hasFull()``
        returns `True` but ``hasRecords()`` does not.
    minimal : `bool`, optional
        If `True` (default) include data IDs that only contain values for
        required dimensions, for which ``hasFull()`` may not return `True`.

    Returns
    -------
    split : `SplitByStateFlags`
        A dataclass holding the indicated data IDs in attributes that
        correspond to the boolean keyword arguments.
    """
    if dataIds is None:
        dataIds = self.allDataIds
    assert dataIds.hasFull() and dataIds.hasRecords()
    result = SplitByStateFlags(expanded=dataIds)
    if complete:
        result.complete = DataCoordinateSequence(
            [DataCoordinate.standardize(e.full.byName(), graph=dataIds.graph)
             for e in result.expanded],
            graph=dataIds.graph
        )
        self.assertTrue(result.complete.hasFull())
        self.assertFalse(result.complete.hasRecords())
    if minimal:
        result.minimal = DataCoordinateSequence(
            [DataCoordinate.standardize(e.byName(), graph=dataIds.graph)
             for e in result.expanded],
            graph=dataIds.graph
        )
        self.assertEqual(result.minimal.hasFull(), not dataIds.graph.implied)
        self.assertFalse(result.minimal.hasRecords())
    if not expanded:
        result.expanded = None
    return result
def visits(self):
    butler = Butler(self.root, collections=[self.outputRun])
    return {
        DataCoordinate.standardize(instrument="MegaPrime", visit=1038843,
                                   universe=butler.registry.dimensions): [
            DataCoordinate.standardize(instrument="MegaPrime", exposure=1038843,
                                       universe=butler.registry.dimensions)
        ]
    }
def translate(self, dataId2: dict, *, partial: bool = False,
              log: Log) -> Optional[DataCoordinate]:
    # Docstring inherited from PathElementHandler.
    rawDataId3 = self._translator(dataId2, partial=partial, log=log)
    if partial:
        return DataCoordinate.standardize(
            rawDataId3, universe=self._datasetType.dimensions.universe)
    else:
        return DataCoordinate.standardize(
            rawDataId3, graph=self._datasetType.dimensions)
def makeGraph(self, pipeline, collections, run, userQuery):
    """Create execution graph for a pipeline.

    Parameters
    ----------
    pipeline : `Pipeline`
        Pipeline definition, task names/classes and their configs.
    collections
        Expressions representing the collections to search for input
        datasets.  May be any of the types accepted by
        `lsst.daf.butler.CollectionSearch.fromExpression`.
    run : `str`, optional
        Name of the `~lsst.daf.butler.CollectionType.RUN` collection for
        output datasets, if it already exists.
    userQuery : `str`
        String which defines user-defined selection for registry, should be
        empty or `None` if there are no restrictions on data selection.

    Returns
    -------
    graph : `QuantumGraph`

    Raises
    ------
    UserExpressionError
        Raised when user expression cannot be parsed.
    OutputExistsError
        Raised when output datasets already exist.
    Exception
        Other exception types may be raised by underlying registry
        classes.
    """
    scaffolding = _PipelineScaffolding(pipeline, registry=self.registry)

    instrument = pipeline.getInstrument()
    if isinstance(instrument, str):
        instrument = doImport(instrument)
    if instrument is not None:
        dataId = DataCoordinate.standardize(instrument=instrument.getName(),
                                            universe=self.registry.dimensions)
    else:
        dataId = DataCoordinate.makeEmpty(self.registry.dimensions)
    with scaffolding.connectDataIds(self.registry, collections, userQuery, dataId) as commonDataIds:
        scaffolding.resolveDatasetRefs(self.registry, collections, run, commonDataIds,
                                       skipExisting=self.skipExisting)
    return scaffolding.makeQuantumGraph()
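# Hypothetical call sketch for makeGraph above.  `graph_builder` and `pipeline`
# stand for an already-constructed GraphBuilder and Pipeline; the collection
# names, run name, and user query are placeholders, not values taken from this
# repository.
def demo_make_graph(graph_builder, pipeline):
    return graph_builder.makeGraph(
        pipeline,
        collections=["HSC/defaults"],
        run="u/someone/demo-run",
        userQuery="instrument = 'HSC' AND visit = 903334",
    )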
def translate(self, dataId2: dict, *, partial: bool = False
              ) -> Tuple[Optional[DataCoordinate], Optional[str]]:
    # Docstring inherited from PathElementHandler.
    rawDataId3, calibDate = self._translator(dataId2, partial=partial)
    if partial:
        return (
            DataCoordinate.standardize(rawDataId3, universe=self._datasetType.dimensions.universe),
            calibDate,
        )
    else:
        return (
            DataCoordinate.standardize(rawDataId3, graph=self._datasetType.dimensions),
            calibDate,
        )
def getDataId(self, id: DatasetId) -> DataCoordinate:
    """Return DataId for a dataset.

    Parameters
    ----------
    id : `DatasetId`
        Unique dataset identifier.

    Returns
    -------
    dataId : `DataCoordinate`
        DataId for the dataset.
    """
    # This query could return multiple rows (one for each tagged collection
    # the dataset is in, plus one for its run collection), and we don't
    # care which of those we get.
    sql = self._tags.select().where(
        sqlalchemy.sql.and_(
            self._tags.columns.dataset_id == id,
            self._tags.columns.dataset_type_id == self._dataset_type_id
        )
    ).limit(1)
    row = self._db.query(sql).fetchone()
    assert row is not None, "Should be guaranteed by caller and foreign key constraints."
    return DataCoordinate.standardize(
        {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
        graph=self.datasetType.dimensions
    )
def _calculate_dataset_info(self, header, filename):
    """Calculate a RawFileDatasetInfo from the supplied information.

    Parameters
    ----------
    header : `Mapping`
        Header from the dataset.
    filename : `str`
        Filename to use for error messages.

    Returns
    -------
    dataset : `RawFileDatasetInfo`
        The region, dataId, and observation information associated with
        this dataset.
    """
    obsInfo = ObservationInfo(header)
    dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                        exposure=obsInfo.exposure_id,
                                        detector=obsInfo.detector_num,
                                        universe=self.universe)
    if obsInfo.instrument != self.instrument.getName():
        raise ValueError(f"Incorrect instrument (expected {self.instrument.getName()}, "
                         f"got {obsInfo.instrument}) for file {filename}.")

    FormatterClass = self.instrument.getRawFormatter(dataId)
    region = self._calculate_region_from_dataset_metadata(obsInfo, header, FormatterClass)
    return RawFileDatasetInfo(obsInfo=obsInfo, region=region, dataId=dataId)
def testSkyMapPacking(self):
    """Test that packing Tract+Patch into an integer in Gen3 works and is
    self-consistent.

    Note that this packing does *not* use the same algorithm as Gen2 and
    hence generates different IDs, because the Gen2 algorithm is
    problematically tied to the *default* SkyMap for a particular camera,
    rather than the SkyMap actually used.
    """
    # SkyMap used by ci_hsc has only one tract, so the test coverage in
    # that area isn't great.  That's okay because that's tested in SkyMap;
    # what we care about here is that the converted repo has the necessary
    # metadata to construct and use these packers at all.
    for patch in [0, 43, 52]:
        dataId = self.butler.registry.expandDataId(
            skymap="discrete/ci_hsc", tract=0, patch=patch, band='r')
        packer1 = self.butler.registry.dimensions.makePacker("tract_patch", dataId)
        packer2 = self.butler.registry.dimensions.makePacker("tract_patch_band", dataId)
        self.assertNotEqual(packer1.pack(dataId), packer2.pack(dataId))
        self.assertEqual(packer1.unpack(packer1.pack(dataId)),
                         DataCoordinate.standardize(dataId, graph=packer1.dimensions))
        self.assertEqual(packer2.unpack(packer2.pack(dataId)), dataId)
        self.assertEqual(packer1.pack(dataId, band='i'), packer1.pack(dataId))
        self.assertNotEqual(packer2.pack(dataId, band='i'), packer2.pack(dataId))
def prep(self):
    # Docstring inherited from RepoConverter.
    self.task.log.info(f"Looking for skymaps in root {self.root}.")
    for coaddName, datasetTypeName in SKYMAP_DATASET_TYPES.items():
        if not self.task.isDatasetTypeIncluded(datasetTypeName):
            continue
        try:
            exists = self.butler2.datasetExists(datasetTypeName)
        except AttributeError:
            # This mapper doesn't even define this dataset type.
            continue
        if not exists:
            continue
        instance = self.butler2.get(datasetTypeName)
        name = self.task.useSkyMap(instance, datasetTypeName)
        datasetType = DatasetType(datasetTypeName, dimensions=["skymap"],
                                  storageClass="SkyMap", universe=self.task.universe)
        dataId = DataCoordinate.standardize(skymap=name, universe=self.task.universe)
        struct = FoundSkyMap(name=name, instance=instance, coaddName=coaddName,
                             ref=DatasetRef(datasetType, dataId),
                             filename=self.butler2.getUri(datasetTypeName))
        self._foundSkyMapsByCoaddName[coaddName] = struct
        self.task.log.info("Found skymap %s in %s in %s.", name, datasetTypeName, self.root)
    super().prep()
def handle(self, path: str, nextDataId2,
           datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]], *,
           predicate: Callable[[DataCoordinate], bool]):
    dataId3, calibDate = self.translate(nextDataId2, partial=True)

    def get_detectors(filename):
        fitsData = lsst.afw.fits.Fits(filename, 'r')
        # NOTE: The primary header (HDU=0) does not contain detector data.
        detectors = []
        for i in range(1, fitsData.countHdus()):
            fitsData.setHdu(i)
            metadata = fitsData.readMetadata()
            detectors.append(metadata['CCDNUM'])
        return detectors

    if predicate(dataId3):
        detectors = get_detectors(path)
        refs = []
        for detector in detectors:
            newDataId3 = DataCoordinate.standardize(dataId3,
                                                    graph=self._datasetType.dimensions,
                                                    detector=detector)
            refs.append(DatasetRef(self._datasetType, newDataId3))

        datasets[self._datasetType][calibDate].append(
            FileDataset(refs=refs, path=path, formatter=self._formatter)
        )
def translate(self, dataId2: dict, *, partial: bool = False
              ) -> Tuple[Optional[DataCoordinate], Optional[str]]:
    assert partial is True, "We always require partial, to ignore 'ccdnum'"
    rawDataId3, calibDate = self._translator(dataId2, partial=partial)
    return (
        DataCoordinate.standardize(rawDataId3, universe=self._datasetType.dimensions.universe),
        calibDate,
    )
def setUp(self):
    self.butler = Butler(os.path.join(getPackageDir("ci_hsc_gen3"), "DATA"), writeable=False,
                         collections=["HSC/calib/2013-06-17", "HSC/runs/ci_hsc"])
    # We need to provide a physical_filter value to fully identify a flat,
    # but this still leaves the band as an implied value that this data ID
    # doesn't know.
    self.flatMinimalDataId = DataCoordinate.standardize(
        instrument="HSC", detector=0, physical_filter="HSC-R",
        universe=self.butler.registry.dimensions,
    )
    # For a calexp, the minimal data ID just has exposure and detector,
    # so both band and physical_filter are implied and not known here.
    self.calexpMinimalDataId = DataCoordinate.standardize(
        instrument="HSC", detector=100, visit=903334,
        universe=self.butler.registry.dimensions,
    )
    # Parameters with bbox to test that logic still works on subimage gets.
    self.parameters = {"bbox": Box2I(Point2I(0, 0), Point2I(8, 7))}
def _makeDSRefVisit(self, dstype, visitId, universe):
    return DatasetRef(datasetType=dstype,
                      dataId=DataCoordinate.standardize(detector="X",
                                                        visit=visitId,
                                                        physical_filter='a',
                                                        abstract_filter='b',
                                                        instrument='TestInstrument',
                                                        universe=universe))
def _calculate_dataset_info(self, header, filename):
    """Calculate a RawFileDatasetInfo from the supplied information.

    Parameters
    ----------
    header : `Mapping`
        Header from the dataset.
    filename : `str`
        Filename to use for error messages.

    Returns
    -------
    dataset : `RawFileDatasetInfo`
        The dataId and observation information associated with this
        dataset.
    """
    # To ensure we aren't slowed down for no reason, explicitly
    # list here the properties we need for the schema.
    # Use a dict with boolean values, where True indicates
    # that it is required that we calculate this property.
    ingest_subset = {
        "altaz_begin": False,
        "boresight_rotation_coord": False,
        "boresight_rotation_angle": False,
        "dark_time": False,
        "datetime_begin": True,
        "datetime_end": True,
        "detector_num": True,
        "exposure_group": False,
        "exposure_id": True,
        "exposure_time": True,
        "instrument": True,
        "tracking_radec": False,
        "object": False,
        "observation_counter": False,
        "observation_id": True,
        "observation_reason": False,
        "observation_type": True,
        "observing_day": False,
        "physical_filter": True,
        "science_program": False,
        "visit_id": False,
    }

    obsInfo = ObservationInfo(header, pedantic=False, filename=filename,
                              required={k for k in ingest_subset if ingest_subset[k]},
                              subset=set(ingest_subset))

    dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                        exposure=obsInfo.exposure_id,
                                        detector=obsInfo.detector_num,
                                        universe=self.universe)
    return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
def unpack(self, packedId: int) -> DataCoordinate:
    # Docstring inherited from DataIdPacker.unpack
    d = {"skymap": self._skyMapName}
    if self._filterMax is not None:
        d["band"] = self.getFilterNameFromInt(packedId // self._tractPatchMax)
        packedId %= self._tractPatchMax
    d["tract"] = packedId // self._patchMax
    d["patch"] = packedId % self._patchMax
    return DataCoordinate.standardize(d, graph=self.dimensions)
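# Illustrative, standalone sketch (not from the original source) of the
# mixed-radix arithmetic that the unpack method above reverses: band, tract,
# and patch occupy successive "digits" of the packed integer.  The maxima are
# made-up values for demonstration only.
DEMO_PATCH_MAX = 9            # assumed patches per tract
DEMO_TRACT_PATCH_MAX = 9 * 5  # assumed patches per tract times tracts per skymap

def demo_skymap_pack(band_index, tract, patch):
    return band_index * DEMO_TRACT_PATCH_MAX + tract * DEMO_PATCH_MAX + patch

def demo_skymap_unpack(packed):
    band_index, rest = divmod(packed, DEMO_TRACT_PATCH_MAX)
    tract, patch = divmod(rest, DEMO_PATCH_MAX)
    return band_index, tract, patch

# Round trip: packing then unpacking recovers the original values.
assert demo_skymap_unpack(demo_skymap_pack(2, 3, 7)) == (2, 3, 7)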
def setUp(self):
    self.id = 0
    self.factory = FormatterFactory()
    self.universe = DimensionUniverse()
    self.dataId = DataCoordinate.makeEmpty(self.universe)

    # Dummy FileDescriptor for testing getFormatter
    self.fileDescriptor = FileDescriptor(Location("/a/b/c", "d"),
                                         StorageClass("DummyStorageClass", dict, None))
def unpack(self, packedId):
    # Docstring inherited from DimensionPacker.unpack
    observation, detector = divmod(packedId, self._detectorMax)
    return DataCoordinate.standardize(
        {
            "instrument": self._instrumentName,
            "detector": detector,
            self._observationName: observation,
        },
        graph=self.dimensions
    )
def testWithoutFilter(self):
    dimensions = DimensionGraph(universe=self.universe, names=["tract", "patch"])
    dataId = DataCoordinate.standardize(skymap=self.fixed["skymap"], tract=2, patch=6,
                                        universe=self.universe)
    packer = SkyMapDimensionPacker(self.fixed, dimensions)
    packedId = packer.pack(dataId)
    self.assertLessEqual(packedId.bit_length(), packer.maxBits)
    self.assertEqual(packer.unpack(packedId), dataId)
def _makeDSRefVisit(self, dstype, visitId, universe):
    return DatasetRef(
        datasetType=dstype,
        dataId=DataCoordinate.standardize(
            detector="X",
            visit=visitId,
            physical_filter="a",
            band="b",
            instrument="TestInstrument",
            universe=universe,
        ),
    )
def setUp(self):
    self.universe = DimensionUniverse()
    self.fixed = DataCoordinate.fromFullValues(
        DimensionGraph(universe=self.universe, names=["skymap"]),
        values=("unimportant",),
    ).expanded(
        records={
            "skymap": self.universe["skymap"].RecordClass(
                name="unimportant",
                tract_max=5,
                patch_nx_max=3,
                patch_ny_max=3,
            )
        }
    )
def _makeDatasetId(self, run: RunRecord, dataId: DataCoordinate,
                   idGenerationMode: DatasetIdGenEnum) -> uuid.UUID:
    """Generate dataset ID for a dataset.

    Parameters
    ----------
    run : `RunRecord`
        The record object describing the RUN collection for the dataset.
    dataId : `DataCoordinate`
        Expanded data ID for the dataset.
    idGenerationMode : `DatasetIdGenEnum`
        ID generation option.  `~DatasetIdGenEnum.UNIQUE` makes a random
        UUID4-type ID.  `~DatasetIdGenEnum.DATAID_TYPE` makes a
        deterministic UUID5-type ID based on a dataset type name and
        ``dataId``.  `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
        deterministic UUID5-type ID based on a dataset type name, run
        collection name, and ``dataId``.

    Returns
    -------
    datasetId : `uuid.UUID`
        Dataset identifier.
    """
    if idGenerationMode is DatasetIdGenEnum.UNIQUE:
        return uuid.uuid4()
    else:
        # WARNING: If you modify this code make sure that the order of
        # items in the `items` list below never changes.
        items: List[Tuple[str, str]] = []
        if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
            items = [
                ("dataset_type", self.datasetType.name),
            ]
        elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
            items = [
                ("dataset_type", self.datasetType.name),
                ("run", run.name),
            ]
        else:
            raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

        for name, value in sorted(dataId.byName().items()):
            items.append((name, str(value)))
        data = ",".join(f"{key}={value}" for key, value in items)
        return uuid.uuid5(self.NS_UUID, data)
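# Minimal, standalone sketch (not part of the original class) showing why the
# deterministic DATAID_TYPE_RUN mode above is reproducible: the same dataset
# type, run, and data ID values always hash to the same UUID5, regardless of
# the order in which the data ID keys arrive.  The namespace UUID here is a
# stand-in, not the one used by the real implementation.
import uuid

DEMO_NS_UUID = uuid.uuid5(uuid.NAMESPACE_DNS, "example.invalid")

def demo_deterministic_id(dataset_type: str, run: str, data_id: dict) -> uuid.UUID:
    items = [("dataset_type", dataset_type), ("run", run)]
    items.extend(sorted((k, str(v)) for k, v in data_id.items()))
    data = ",".join(f"{key}={value}" for key, value in items)
    return uuid.uuid5(DEMO_NS_UUID, data)

# Key order in the input mapping does not matter.
assert demo_deterministic_id("raw", "run1", {"exposure": 1, "detector": 2}) \
    == demo_deterministic_id("raw", "run1", {"detector": 2, "exposure": 1})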
def getDataId(self, id: int) -> DataCoordinate:
    # Docstring inherited from DatasetRecordStorageManager.
    # This query could return multiple rows (one for each tagged collection
    # the dataset is in, plus one for its run collection), and we don't
    # care which of those we get.
    sql = self._dynamic.select().where(
        sqlalchemy.sql.and_(
            self._dynamic.columns.dataset_id == id,
            self._dynamic.columns.dataset_type_id == self._dataset_type_id
        )
    ).limit(1)
    row = self._db.query(sql).fetchone()
    assert row is not None, "Should be guaranteed by caller and foreign key constraints."
    return DataCoordinate.standardize(
        {dimension.name: row[dimension.name] for dimension in self.datasetType.dimensions.required},
        graph=self.datasetType.dimensions
    )
def _refFromConnection(butler: Butler, connection: DimensionedConnection,
                       dataId: DataId, **kwargs: Any) -> DatasetRef:
    """Create a DatasetRef for a connection in a collection.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The collection to point to.
    connection : `lsst.pipe.base.connectionTypes.DimensionedConnection`
        The connection defining the dataset type to point to.
    dataId
        The data ID for the dataset to point to.
    **kwargs
        Additional keyword arguments used to augment or construct a
        `~lsst.daf.butler.DataCoordinate`.

    Returns
    -------
    ref : `lsst.daf.butler.DatasetRef`
        A reference to a dataset compatible with ``connection``, with ID
        ``dataId``, in the collection pointed to by ``butler``.
    """
    universe = butler.registry.dimensions
    # DatasetRef only tests whether required dimensions are missing, not
    # whether there are extras.
    _checkDimensionsMatch(universe, set(connection.dimensions), dataId.keys())
    dataId = DataCoordinate.standardize(dataId, **kwargs, universe=universe)

    # skypix is a PipelineTask alias for "some spatial index", Butler doesn't
    # understand it.  Code copied from TaskDatasetTypes.fromTaskDef.
    if "skypix" in connection.dimensions:
        datasetType = butler.registry.getDatasetType(connection.name)
    else:
        datasetType = connection.makeDatasetType(universe)

    try:
        butler.registry.getDatasetType(datasetType.name)
    except KeyError:
        raise ValueError(f"Invalid dataset type {connection.name}.")
    try:
        ref = DatasetRef(datasetType=datasetType, dataId=dataId)
        return ref
    except KeyError as e:
        raise ValueError(
            f"Dataset type ({connection.name}) and ID {dataId.byName()} not compatible."
        ) from e
def testReadingBadNewFileWithFullDataId(self):
    """If we try to read a new calexp with a full data ID, the reader
    should check the filters in the file for consistency with the data ID
    (and in this case, find them inconsistent, which should result in
    warnings and returning what's in the data ID).
    """
    self.skip_mock()
    calexpBadDataId = DataCoordinate.standardize(
        self.calexpMinimalDataId,
        band="g", physical_filter="HSC-G", visit_system=0,
    )
    self.assertTrue(calexpBadDataId.hasFull())

    # Some tests are only relevant when reading full calexps.
    # By definition a disassembled exposure will have a correct
    # filterLabel written out.
    # In this situation the test becomes moot since the filterLabel
    # formatter will not force a correct filter label into an
    # incorrect filter label based on DataId.
    _, components = self.butler.getURIs("calexp", calexpBadDataId)
    if components:
        raise unittest.SkipTest("Test not relevant because composite has been disassembled")

    with self.assertWarns(Warning):
        calexp = self.butler.get("calexp", calexpBadDataId)
    with self.assertWarns(Warning):
        calexpFilterLabel = self.butler.get("calexp.filter", calexpBadDataId)
    self.assertEqual(calexp.getFilter(), calexpFilterLabel)
    self.assertEqual(calexp.getFilter().bandLabel, calexpBadDataId["band"])
    self.assertEqual(calexp.getFilter().physicalLabel, calexpBadDataId["physical_filter"])
    self.assertEqual(calexpFilterLabel.bandLabel, calexpBadDataId["band"])
    self.assertEqual(calexpFilterLabel.physicalLabel, calexpBadDataId["physical_filter"])

    with self.assertWarns(Warning):
        calexpSub = self.butler.get("calexp", calexpBadDataId, parameters=self.parameters)
    self.assertEqual(calexp.getFilter(), calexpSub.getFilter())
def ref_from_connection(butler, connection, data_id):
    """Create a DatasetRef for a connection in a collection.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The collection to point to.
    connection : `lsst.pipe.base.connectionTypes.DimensionedConnection`
        The connection defining the dataset type to point to.
    data_id : `Mapping` [`str`] or `lsst.daf.butler.DataCoordinate`
        The data ID for the dataset to point to.

    Returns
    -------
    ref : `lsst.daf.butler.DatasetRef`
        A reference to a dataset compatible with ``connection``, with ID
        ``data_id``, in the collection pointed to by ``butler``.
    """
    universe = butler.registry.dimensions
    data_id = DataCoordinate.standardize(data_id, universe=universe)
    return DatasetRef(
        datasetType=connection.makeDatasetType(universe),
        dataId=data_id,
    )
def pack_data_id(self, tract, patch, band=None):
    """Pack a skymap-based data ID into an integer.

    Parameters
    ----------
    tract : `int`
        Integer ID for the tract.
    patch : `tuple` (`int`) or `int`
        Either a 2-element (x, y) tuple (Gen2 patch ID) or a single
        integer (Gen3 patch ID, corresponding to the "sequential" patch
        index methods in this package).
    band : `str`, optional
        If provided, a filter name present in
        `SkyMapDimensionPacker.SUPPORTED_FILTERS` (which is aspirationally
        a list of all Gen3 'bands', but in practice may be missing some;
        see RFC-785).  If not provided, the packing algorithm that does
        not include the filter will be used.

    Returns
    -------
    packed : `int`
        Integer that corresponds to the data ID.
    max_bits : `int`
        Maximum number of bits that ``packed`` could have, assuming this
        skymap and presence or absence of ``band``.

    Notes
    -----
    This method uses a Gen3 `lsst.daf.butler.DimensionPacker` object under
    the hood to guarantee consistency with pure Gen3 code, but it does not
    require the caller to actually have a Gen3 butler available.  It does,
    however, require a filter value compatible with the Gen3 "band"
    dimension.

    This is a temporary interface intended to aid with the migration from
    Gen2 to Gen3 middleware.  It will be removed with the Gen2 middleware
    or when DM-31924 provides a longer-term replacement, whichever comes
    first.  Pure Gen3 code should use `lsst.daf.butler.DataCoordinate.pack`
    or other `lsst.daf.butler.DimensionPacker` interfaces.
    """
    from lsst.daf.butler import DataCoordinate, DimensionUniverse
    universe = DimensionUniverse()
    dummy_skymap_name = "unimportant"  # only matters to Gen3 registry
    tract_info = self[tract]
    patch_info = tract_info[patch]
    nx, ny = tract_info.getNumPatches()
    skymap_record = universe["skymap"].RecordClass(
        name=dummy_skymap_name,
        hash=self.getSha1(),
        tract_max=len(self),
        patch_nx_max=nx,  # assuming these are the same for all tracts for now
        patch_ny_max=ny,
    )
    skymap_data_id = DataCoordinate.standardize(
        skymap=dummy_skymap_name,
        universe=universe,
    ).expanded(
        records={"skymap": skymap_record},
    )
    full_data_id = DataCoordinate.standardize(
        skymap=dummy_skymap_name,
        tract=tract_info.getId(),
        patch=tract_info.getSequentialPatchIndex(patch_info),
        universe=universe,
    )
    if band is None:
        packer = universe.makePacker("tract_patch", skymap_data_id)
    else:
        packer = universe.makePacker("tract_patch_band", skymap_data_id)
        full_data_id = DataCoordinate.standardize(full_data_id, band=band)
    return packer.pack(full_data_id, returnMaxBits=True)
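# Hypothetical usage sketch for pack_data_id above.  `skymap` stands for any
# SkyMap instance from this package (for example one loaded from a butler);
# the tract/patch/band values are made up for illustration.
def demo_pack_data_id(skymap):
    # Gen2-style (x, y) patch tuple, with a band.
    packed, max_bits = skymap.pack_data_id(tract=0, patch=(3, 2), band="r")
    # Gen3-style sequential patch index, no band: the tract+patch-only
    # packer is used instead.
    packed_no_band, _ = skymap.pack_data_id(tract=0, patch=5)
    return packed, max_bits, packed_no_band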
def ingestSimulated(repo, locations, regex, output_run, transfer="auto", ingest_type="rawexp"):
    """Ingest raw frames into the butler registry.

    Parameters
    ----------
    repo : `str`
        URI to the repository.
    locations : `list` [`str`]
        Files to ingest and directories to search for files that match
        ``regex`` to ingest.
    regex : `str`
        Regex string used to find files in directories listed in locations.
    output_run : `str`
        The path to the location, the run, where datasets should be put.
    transfer : `str` or None
        The external data transfer type, by default "auto".
    ingest_type : `str`
        Ingest product data type.

    Raises
    ------
    Exception
        Raised if operations on configuration object fail.

    Notes
    -----
    This method inserts all datasets for an exposure within a transaction,
    guaranteeing that partial exposures are never ingested.  The exposure
    dimension record is inserted with `Registry.syncDimensionData` first
    (in its own transaction), which inserts only if a record with the same
    primary key does not already exist.  This allows different files within
    the same exposure to be ingested in different runs.
    """
    butler = Butler(repo, writeable=True)

    # Make sure instrument and detector dimensions are populated.
    with butler.registry.transaction():
        instrument_record = {
            "name": "simulator",
            "exposure_max": 600000,
            "detector_max": 6,
            "class_name": "spherex.instrument.SimulatorInstrument"
        }
        butler.registry.syncDimensionData("instrument", instrument_record)
        for idx in range(1, 7):
            detector_record = {
                "instrument": "simulator",
                "id": idx,
                "full_name": f"array{idx}"
            }
            butler.registry.syncDimensionData("detector", detector_record)

    dimension_universe = butler.registry.dimensions
    datasetType = DatasetType(ingest_type,
                              dimension_universe.extract(("instrument", "detector", "exposure")),
                              "SPHERExImage",
                              universe=dimension_universe)
    # Idempotent dataset type registration.
    butler.registry.registerDatasetType(datasetType)

    # Idempotent collection registration.
    run = f"{ingest_type}r" if (output_run is None) else output_run
    butler.registry.registerCollection(run, type=CollectionType.RUN)

    n_failed = 0
    files = findFileResources(locations, regex)

    # Example: sim_exposure_000000_array_1.fits or
    # sim_exposure_000000_array_2_dark_current.fits
    pattern = re.compile(r"sim_exposure_(\d+)_array_(\d)[_,.]")

    # Do we want to group observations?
    grp = datetime.date.today().strftime("%Y%m%d")

    datasets = []
    for file in files:
        # Parse exposure and detector ids from file name.
        m = pattern.search(file)
        if m is None:
            n_failed += 1
            logging.error(f"{file} does not match simulator file pattern")
            continue
        else:
            g = m.groups()
            if len(g) != 2:
                n_failed += 1
                logging.error(f"Unable to get exposure and detector from file name: {file}")
                continue
            else:
                [exposure_id, detector_id] = list(map(int, g))

        try:
            exposure_record = {
                "instrument": "simulator",
                "id": exposure_id,
                "name": f"{exposure_id:06d}",
                "group_name": f"{grp}",
                "timespan": Timespan(begin=None, end=None)
            }
            # Idempotent insertion of individual dimension rows.
            butler.registry.syncDimensionData("exposure", exposure_record)
        except Exception as e:
            n_failed += 1
            logging.error(f"Unable to insert exposure record for file {file}: {e}")
            continue

        dataId = DataCoordinate.standardize(instrument="simulator",
                                            detector=detector_id,
                                            exposure=exposure_id,
                                            universe=butler.registry.dimensions)
        ref = DatasetRef(datasetType, dataId=dataId)
        datasets.append(FileDataset(refs=ref, path=file, formatter=AstropyImageFormatter))

    with butler.transaction():
        butler.ingest(*datasets, transfer=transfer, run=run)
def makeQuantum(
    task: PipelineTask,
    butler: Butler,
    dataId: DataId,
    ioDataIds: Mapping[str, Union[DataId, Sequence[DataId]]],
) -> Quantum:
    """Create a Quantum for a particular data ID(s).

    Parameters
    ----------
    task : `lsst.pipe.base.PipelineTask`
        The task whose processing the quantum represents.
    butler : `lsst.daf.butler.Butler`
        The collection the quantum refers to.
    dataId : any data ID type
        The data ID of the quantum.  Must have the same dimensions as
        ``task``'s connections class.
    ioDataIds : `collections.abc.Mapping` [`str`]
        A mapping keyed by input/output names.  Values must be data IDs for
        single connections and sequences of data IDs for multiple
        connections.

    Returns
    -------
    quantum : `lsst.daf.butler.Quantum`
        A quantum for ``task``, when called with ``dataIds``.
    """
    connections = task.config.ConnectionsClass(config=task.config)

    try:
        _checkDimensionsMatch(butler.registry.dimensions, connections.dimensions, dataId.keys())
    except ValueError as e:
        raise ValueError("Error in quantum dimensions.") from e

    inputs = defaultdict(list)
    outputs = defaultdict(list)
    for name in itertools.chain(connections.inputs, connections.prerequisiteInputs):
        try:
            connection = connections.__getattribute__(name)
            _checkDataIdMultiplicity(name, ioDataIds[name], connection.multiple)
            ids = _normalizeDataIds(ioDataIds[name])
            for id in ids:
                ref = _refFromConnection(butler, connection, id)
                inputs[ref.datasetType].append(ref)
        except (ValueError, KeyError) as e:
            raise ValueError(f"Error in connection {name}.") from e
    for name in connections.outputs:
        try:
            connection = connections.__getattribute__(name)
            _checkDataIdMultiplicity(name, ioDataIds[name], connection.multiple)
            ids = _normalizeDataIds(ioDataIds[name])
            for id in ids:
                ref = _refFromConnection(butler, connection, id)
                outputs[ref.datasetType].append(ref)
        except (ValueError, KeyError) as e:
            raise ValueError(f"Error in connection {name}.") from e
    quantum = Quantum(
        taskClass=type(task),
        dataId=DataCoordinate.standardize(dataId, universe=butler.registry.dimensions),
        inputs=inputs,
        outputs=outputs,
    )
    return quantum
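# Hypothetical usage sketch for makeQuantum above.  `task` and `butler` stand
# for a configured PipelineTask and a test butler; the connection names
# ("inputExposure", "outputCatalog") and data ID values are invented for
# illustration and must match the task's actual connections in practice.
def demo_make_quantum(task, butler):
    dataId = {"instrument": "HSC", "visit": 903334, "detector": 100}
    return makeQuantum(
        task, butler, dataId,
        ioDataIds={
            "inputExposure": dataId,    # single connection: one data ID
            "outputCatalog": [dataId],  # multiple=True connection: a sequence of data IDs
        },
    )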
def testStandardize(self):
    """Test constructing a DataCoordinate from many different kinds of
    input via `DataCoordinate.standardize` and `DataCoordinate.subset`.
    """
    for n in range(5):
        dimensions = self.randomDimensionSubset()
        dataIds = self.randomDataIds(n=1).subset(dimensions)
        split = self.splitByStateFlags(dataIds)
        for m, dataId in enumerate(split.chain()):
            # Passing in any kind of DataCoordinate alone just returns
            # that object.
            self.assertIs(dataId, DataCoordinate.standardize(dataId))
            # Same if we also explicitly pass the dimensions we want.
            self.assertIs(dataId, DataCoordinate.standardize(dataId, graph=dataId.graph))
            # Same if we pass the dimensions and some irrelevant
            # kwargs.
            self.assertIs(dataId, DataCoordinate.standardize(dataId, graph=dataId.graph, htm7=12))
            # Test constructing a new data ID from this one with a
            # subset of the dimensions.
            # This is not possible for some combinations of
            # dimensions if hasFull is False (see
            # `DataCoordinate.subset` docs).
            newDimensions = self.randomDimensionSubset(n=1, graph=dataId.graph)
            if dataId.hasFull() or dataId.graph.required.issuperset(newDimensions.required):
                newDataIds = [
                    dataId.subset(newDimensions),
                    DataCoordinate.standardize(dataId, graph=newDimensions),
                    DataCoordinate.standardize(dataId, graph=newDimensions, htm7=12),
                ]
                for newDataId in newDataIds:
                    with self.subTest(newDataId=newDataId, type=type(dataId)):
                        commonKeys = dataId.keys() & newDataId.keys()
                        self.assertTrue(commonKeys)
                        self.assertEqual(
                            [newDataId[k] for k in commonKeys],
                            [dataId[k] for k in commonKeys],
                        )
                        # This should never "downgrade" from
                        # Complete to Minimal or Expanded to Complete.
                        if dataId.hasRecords():
                            self.assertTrue(newDataId.hasRecords())
                        if dataId.hasFull():
                            self.assertTrue(newDataId.hasFull())
        # Start from a complete data ID, and pass its values in via several
        # different ways that should be equivalent.
        for dataId in split.complete:
            # Split the keys (dimension names) into two random subsets, so
            # we can pass some as kwargs below.
            keys1 = set(self.rng.sample(list(dataId.graph.dimensions.names),
                                        len(dataId.graph.dimensions)//2))
            keys2 = dataId.graph.dimensions.names - keys1
            newCompleteDataIds = [
                DataCoordinate.standardize(dataId.full.byName(), universe=dataId.universe),
                DataCoordinate.standardize(dataId.full.byName(), graph=dataId.graph),
                DataCoordinate.standardize(DataCoordinate.makeEmpty(dataId.graph.universe),
                                           **dataId.full.byName()),
                DataCoordinate.standardize(DataCoordinate.makeEmpty(dataId.graph.universe),
                                           graph=dataId.graph, **dataId.full.byName()),
                DataCoordinate.standardize(**dataId.full.byName(), universe=dataId.universe),
                DataCoordinate.standardize(graph=dataId.graph, **dataId.full.byName()),
                DataCoordinate.standardize(
                    {k: dataId[k] for k in keys1},
                    universe=dataId.universe,
                    **{k: dataId[k] for k in keys2}
                ),
                DataCoordinate.standardize(
                    {k: dataId[k] for k in keys1},
                    graph=dataId.graph,
                    **{k: dataId[k] for k in keys2}
                ),
            ]
            for newDataId in newCompleteDataIds:
                with self.subTest(dataId=dataId, newDataId=newDataId, type=type(dataId)):
                    self.assertEqual(dataId, newDataId)
                    self.assertTrue(newDataId.hasFull())