Example #1
import logging

import lsst.daf.butler.registry
from lsst.daf.butler import CollectionType, DatasetRef, FileDataset

# src_repo and dest_repo (Butler instances) and the constants DEST_DIR,
# DEST_RUN, REFCATS, HTM_LEVEL, id_ranges, and args are set up earlier in
# the original script.


def _remove_refcat_run(repo, run):
    """Remove the given run collection from ``repo`` if it exists.

    The opening of this helper was cut off in the original snippet; the
    body below is a plausible reconstruction based on how it is called.
    """
    try:
        repo.registry.removeCollection(run)
    except lsst.daf.butler.registry.MissingCollectionError:
        pass  # Already removed; nothing to do


logging.info("Preparing destination repository %s...", DEST_DIR)
_remove_refcat_run(dest_repo, DEST_RUN)
dest_repo.registry.registerCollection(DEST_RUN, CollectionType.RUN)
for src_cat, dest_cat in REFCATS.items():
    src_type = src_repo.registry.getDatasetType(src_cat)
    dest_type = _rename_dataset_type(src_type, dest_cat)
    dest_repo.registry.registerDatasetType(dest_type)
dest_repo.registry.refresh()

logging.info("Searching for refcats in %s:%s...", args.src_dir,
             args.src_collection)
query = f"htm{HTM_LEVEL} in ({','.join(id_ranges)})"
datasets = []
for src_ref in src_repo.registry.queryDatasets(REFCATS.keys(),
                                               where=query,
                                               findFirst=True):
    src_type = src_ref.datasetType
    dest_type = _rename_dataset_type(src_type, REFCATS[src_type.name])
    dest_ref = DatasetRef(dest_type, src_ref.dataId)
    datasets.append(FileDataset(path=src_repo.getURI(src_ref), refs=dest_ref))

logging.info("Copying refcats...")
dest_repo.ingest(*datasets, transfer="copy", run=DEST_RUN)

logging.info("%d refcat shards copied to %s:%s", len(datasets), DEST_DIR,
             DEST_RUN)
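
For context, id_ranges above must already hold the HTM pixel indices (as
strings) covering the sky region of interest. Below is a minimal sketch of
one way to build such a list with lsst.sphgeom; the region center and radius
are illustrative assumptions, not part of the original script:

import math

from lsst.sphgeom import Angle, Circle, HtmPixelization, LonLat, UnitVector3d

HTM_LEVEL = 7  # assumed; must match the depth the refcat was indexed at
pixelization = HtmPixelization(HTM_LEVEL)
# Hypothetical search region: a 1-degree circle around (ra, dec) = (56.65, -32.27).
center = UnitVector3d(LonLat.fromDegrees(56.65, -32.27))
region = Circle(center, Angle(math.radians(1.0)))
# envelope() returns a RangeSet of [begin, end) pixel-index ranges covering the region.
id_ranges = [str(pixel)
             for begin, end in pixelization.envelope(region)
             for pixel in range(begin, end)]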
Example #2
import datetime
import logging
import re

from lsst.daf.butler import (Butler, CollectionType, DataCoordinate,
                             DatasetRef, DatasetType, FileDataset, Timespan)
# Project-specific helpers; the exact module paths are assumptions:
# from spherex.formatters import AstropyImageFormatter
# from lsst.obs.base.utils import findFileResources


def ingestSimulated(repo,
                    locations,
                    regex,
                    output_run,
                    transfer="auto",
                    ingest_type="rawexp"):
    """Ingests raw frames into the butler registry

    Parameters
    ----------
    repo : `str`
        URI to the repository.
    locations : `list` [`str`]
        Files to ingest and directories to search for files that match
        ``regex`` to ingest.
    regex : `str`
        Regex string used to find files in directories listed in locations.
    output_run : `str` or `None`
        The name of the run collection that datasets should be inserted
        into; if `None`, a name is derived from ``ingest_type``.
    transfer : `str` or `None`
        The external data transfer mode, "auto" by default.
    ingest_type : `str`
        Dataset type name for the ingested products.

    Raises
    ------
    Exception
        Raised if operations on configuration object fail.

    Notes
    -----
    This method inserts all datasets for an exposure within a transaction,
    guaranteeing that partial exposures are never ingested.  The exposure
    dimension record is inserted with `Registry.syncDimensionData` first
    (in its own transaction), which inserts only if a record with the same
    primary key does not already exist.  This allows different files within
    the same exposure to be ingested in different runs.
    """

    butler = Butler(repo, writeable=True)

    # make sure instrument and detector dimensions are populated
    with butler.registry.transaction():
        instrument_record = {
            "name": "simulator",
            "exposure_max": 600000,
            "detector_max": 6,
            "class_name": "spherex.instrument.SimulatorInstrument"
        }
        butler.registry.syncDimensionData("instrument", instrument_record)
        for idx in range(1, 7):
            detector_record = {
                "instrument": "simulator",
                "id": idx,
                "full_name": f"array{idx}"
            }
            butler.registry.syncDimensionData("detector", detector_record)

    dimension_universe = butler.registry.dimensions
    datasetType = DatasetType(ingest_type,
                              dimension_universe.extract(
                                  ("instrument", "detector", "exposure")),
                              "SPHERExImage",
                              universe=dimension_universe)
    # idempotent dataset type registration
    butler.registry.registerDatasetType(datasetType)

    # idempotent collection registration
    run = f"{ingest_type}r" if (output_run is None) else output_run
    butler.registry.registerCollection(run, type=CollectionType.RUN)

    n_failed = 0
    files = findFileResources(locations, regex)

    # example: sim_exposure_000000_array_1.fits or
    #   sim_exposure_000000_array_2_dark_current.fits
    pattern = re.compile(r"sim_exposure_(\d+)_array_(\d)[_.]")

    # do we want to group observations?
    grp = datetime.date.today().strftime("%Y%m%d")

    datasets = []
    for file in files:
        # parse exposure and detector ids from file name
        m = pattern.search(file)
        if m is None:
            n_failed += 1
            logging.error(f"{file} does not match simulator file pattern")
            continue
        g = m.groups()
        if len(g) != 2:
            n_failed += 1
            logging.error(
                f"Unable to get exposure and detector from file name: {file}")
            continue
        exposure_id, detector_id = map(int, g)

        try:
            exposure_record = {
                "instrument": "simulator",
                "id": exposure_id,
                "name": f"{exposure_id:06d}",
                "group_name": f"{grp}",
                "timespan": Timespan(begin=None, end=None)
            }
            # idempotent insertion of individual dimension rows
            butler.registry.syncDimensionData("exposure", exposure_record)
        except Exception as e:
            n_failed += 1
            logging.error(
                f"Unable to insert exposure record for file {file}: {e}")
            continue

        dataId = DataCoordinate.standardize(
            instrument="simulator",
            detector=detector_id,
            exposure=exposure_id,
            universe=butler.registry.dimensions)
        ref = DatasetRef(datasetType, dataId=dataId)
        datasets.append(
            FileDataset(refs=ref, path=file, formatter=AstropyImageFormatter))

    with butler.transaction():
        butler.ingest(*datasets, transfer=transfer, run=run)
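
A minimal usage sketch of the function above; the repository path, search
directory, output run name, and filename regex are hypothetical:

ingestSimulated("/path/to/spherex_repo",
                locations=["/path/to/simulated_frames"],
                regex=r"sim_exposure_\d+_array_\d.*\.fits$",
                output_run="sims/run1",
                transfer="copy")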

Example #3
    def testIngest(self):
        butler = Butler(self.tmpConfigFile, run="ingest")

        # Create and register a DatasetType
        dimensions = butler.registry.dimensions.extract(
            ["instrument", "visit", "detector"])

        storageClass = self.storageClassFactory.getStorageClass(
            "StructuredDataDictYaml")
        datasetTypeName = "metric"

        datasetType = self.addDatasetType(datasetTypeName, dimensions,
                                          storageClass, butler.registry)

        # Add needed Dimensions
        butler.registry.insertDimensionData("instrument",
                                            {"name": "DummyCamComp"})
        butler.registry.insertDimensionData("physical_filter", {
            "instrument": "DummyCamComp",
            "name": "d-r",
            "abstract_filter": "R"
        })
        for detector in (1, 2):
            butler.registry.insertDimensionData(
                "detector", {
                    "instrument": "DummyCamComp",
                    "id": detector,
                    "full_name": f"detector{detector}"
                })

        butler.registry.insertDimensionData(
            "visit", {
                "instrument": "DummyCamComp",
                "id": 423,
                "name": "fourtwentythree",
                "physical_filter": "d-r"
            }, {
                "instrument": "DummyCamComp",
                "id": 424,
                "name": "fourtwentyfour",
                "physical_filter": "d-r"
            })

        formatter = doImport(
            "lsst.daf.butler.formatters.yamlFormatter.YamlFormatter")
        dataRoot = os.path.join(TESTDIR, "data", "basic")
        datasets = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            metricFile = os.path.join(dataRoot, f"{detector_name}.yaml")
            dataId = {
                "instrument": "DummyCamComp",
                "visit": 423,
                "detector": detector
            }
            # Create a DatasetRef for ingest
            refIn = DatasetRef(datasetType, dataId, id=None)

            datasets.append(
                FileDataset(path=metricFile, refs=[refIn],
                            formatter=formatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 423}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 423}

        metrics1 = butler.get(datasetTypeName, dataId1)
        metrics2 = butler.get(datasetTypeName, dataId2)
        self.assertNotEqual(metrics1, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertNotEqual(uri1, uri2)

        # Now do a multi-dataset but single file ingest
        metricFile = os.path.join(dataRoot, "detectors.yaml")
        refs = []
        for detector in (1, 2):
            detector_name = f"detector_{detector}"
            dataId = {
                "instrument": "DummyCamComp",
                "visit": 424,
                "detector": detector
            }
            # Create a DatasetRef for ingest
            refs.append(DatasetRef(datasetType, dataId, id=None))

        datasets = []
        datasets.append(
            FileDataset(path=metricFile,
                        refs=refs,
                        formatter=MultiDetectorFormatter))

        butler.ingest(*datasets, transfer="copy")

        dataId1 = {"instrument": "DummyCamComp", "detector": 1, "visit": 424}
        dataId2 = {"instrument": "DummyCamComp", "detector": 2, "visit": 424}

        multi1 = butler.get(datasetTypeName, dataId1)
        multi2 = butler.get(datasetTypeName, dataId2)

        self.assertEqual(multi1, metrics1)
        self.assertEqual(multi2, metrics2)

        # Compare URIs
        uri1 = butler.getURI(datasetTypeName, dataId1)
        uri2 = butler.getURI(datasetTypeName, dataId2)
        self.assertEqual(uri1, uri2)

        # Test that removing one does not break the second
        butler.remove(datasetTypeName, dataId1)
        with self.assertRaises(LookupError):
            butler.datasetExists(datasetTypeName, dataId1)
        self.assertTrue(butler.datasetExists(datasetTypeName, dataId2))
        multi2b = butler.get(datasetTypeName, dataId2)
        self.assertEqual(multi2, multi2b)

Example #4
class FormattersTests(DatasetTestHelper, lsst.utils.tests.TestCase):
    root = None
    storageClassFactory = None

    @classmethod
    def setUpClass(cls):
        """Create a new butler once only."""

        cls.storageClassFactory = StorageClassFactory()

        cls.root = tempfile.mkdtemp(dir=TESTDIR)

        data_ids = {
            "instrument": [INSTRUMENT_NAME],
            "detector": [0, 1, 2, 3, 4, 5],
            "exposure": [11, 22],
        }

        configURI = ButlerURI("resource://spherex/configs",
                              forceDirectory=True)
        butlerConfig = Config(configURI.join("butler.yaml"))
        # in-memory db is being phased out
        # butlerConfig["registry", "db"] = 'sqlite:///:memory:'
        cls.creatorButler = makeTestRepo(
            cls.root,
            data_ids,
            config=butlerConfig,
            dimensionConfig=configURI.join("dimensions.yaml"))
        for formatter in FORMATTERS:
            datasetTypeName, storageClassName = (formatter["dataset_type"],
                                                 formatter["storage_class"])
            storageClass = cls.storageClassFactory.getStorageClass(
                storageClassName)
            addDatasetType(cls.creatorButler, datasetTypeName, set(data_ids),
                           storageClass)

    @classmethod
    def tearDownClass(cls):
        if cls.root is not None:
            shutil.rmtree(cls.root, ignore_errors=True)

    def setUp(self):
        # make test collection
        # self.butler = makeTestCollection(self.creatorButler)
        self.collection = self._testMethodName
        self.butler = Butler(butler=self.creatorButler, run=self.collection)

    def test_putget(self):
        fitsPath = os.path.join(TESTDIR, "data", "small.fits")
        dataid = {"exposure": 11, "detector": 0, "instrument": INSTRUMENT_NAME}
        for formatter in FORMATTERS:
            # in-memory object, representing fits
            inmemobj = formatter["reader"](fitsPath)

            # save in-memory object into butler dataset
            datasetTypeName = formatter["dataset_type"]
            self.butler.put(inmemobj, datasetTypeName, dataid)

            # get butler dataset
            retrievedobj = self.butler.get(datasetTypeName, dataid)
            self.assertIsInstance(retrievedobj, formatter["inmem_cls"])
            self.assertEqual(retrievedobj.__class__.__name__,
                             inmemobj.__class__.__name__)

    def test_ingest(self):

        fitsPath = os.path.join(TESTDIR, "data", "small.fits")

        formatter = FORMATTERS[0]
        datasetTypeName, formatterCls = (formatter["dataset_type"],
                                         formatter["formatter_cls"])

        datasetType = self.butler.registry.getDatasetType(datasetTypeName)
        datasets = []
        for exposure in range(3, 5):
            for detector in range(6):
                # use the same fits to test ingest
                if not os.path.exists(fitsPath):
                    log.warning(
                        f"No data found for detector {detector}, exposure {exposure} @ {fitsPath}."
                    )
                    continue
                ref = DatasetRef(datasetType,
                                 dataId={
                                     "instrument": INSTRUMENT_NAME,
                                     "detector": detector,
                                     "exposure": exposure * 11
                                 })
                datasets.append(
                    FileDataset(refs=ref,
                                path=fitsPath,
                                formatter=formatterCls))

        # register new collection
        # run = "rawIngestedRun"
        # self.butler.registry.registerCollection(run, type=CollectionType.RUN)

        # collection is registered as a part of setUp
        run = self.collection

        with self.butler.transaction():
            for exposure in range(3, 5):
                expid = exposure * 11
                self.butler.registry.insertDimensionData(
                    "exposure", {
                        "instrument": INSTRUMENT_NAME,
                        "id": expid,
                        "name": f"{expid}",
                        "group_name": "day1",
                        "timespan": Timespan(begin=None, end=None)
                    })
            # transfer can be 'auto', 'move', 'copy', 'hardlink', 'relsymlink'
            # or 'symlink'
            self.butler.ingest(*datasets, transfer="symlink", run=run)

        # verify that 12 files were ingested (2 exposures x 6 detectors)
        refsSet = set(
            self.butler.registry.queryDatasets(datasetTypeName,
                                               collections=[run]))
        self.assertEqual(
            len(refsSet), 12,
            f"Collection {run} should have 12 elements after ingest")

        # verify that data id is present
        dataid = {"exposure": 44, "detector": 5, "instrument": INSTRUMENT_NAME}
        refsList = list(
            self.butler.registry.queryDatasets(datasetTypeName,
                                               collections=[run],
                                               dataId=dataid))
        self.assertEqual(
            len(refsList), 1,
            f"Collection {run} should have 1 element with {dataid}")