def test_remote(intake_server):
    cat_local = intake.open_catalog(cat_file)
    cat_remote = intake.open_catalog(intake_server)
    assert 'outer' in cat_remote
    assert 'outer' in cat_local
    print(tuple(cat_remote))
    print(tuple(cat_remote['outer']()))
    print(tuple(cat_remote['outer']()['circle']()))
    print(cat_remote['outer']()['circle']()['green'])
    print(cat_remote['outer']()['circle']()['green'].read())


def bundle(request, intake_server, example_data):  # noqa
    tmp_dir = TMP_DIRS[request.param]
    tmp_data_dir = Path(tmp_dir) / 'data'
    serializer_partial = partial(Serializer, tmp_data_dir)
    serializer = serializer_partial()
    uid, docs = example_data
    for name, doc in docs:
        serializer(name, doc)
    serializer.close()

    fullname = os.path.join(tmp_dir, YAML_FILENAME)
    with open(fullname, 'w') as f:
        f.write(f'''
sources:
  xyz:
    description: Some imaginary beamline
    driver: "bluesky-jsonl-catalog"
    container: catalog
    args:
      paths: {tmp_data_dir / "*.jsonl"}
      handler_registry:
        NPY_SEQ: ophyd.sim.NumpySeqHandler
    metadata:
      beamline: "00-ID"
  xyz_with_transforms:
    description: Some imaginary beamline
    driver: "bluesky-jsonl-catalog"
    container: catalog
    args:
      paths: [{tmp_data_dir / "*.jsonl"}]
      handler_registry:
        NPY_SEQ: ophyd.sim.NumpySeqHandler
      transforms:
        start: databroker.tests.test_v2.transform.transform
        stop: databroker.tests.test_v2.transform.transform
        resource: databroker.tests.test_v2.transform.transform
        descriptor: databroker.tests.test_v2.transform.transform
    metadata:
      beamline: "00-ID"
''')

    time.sleep(2)
    remote = request.param == 'remote'
    if request.param == 'local':
        cat = intake.open_catalog(os.path.join(tmp_dir, YAML_FILENAME))
    elif request.param == 'remote':
        cat = intake.open_catalog(intake_server, page_size=10)
    else:
        raise ValueError(f"Unknown request.param: {request.param!r}")
    return types.SimpleNamespace(cat=cat,
                                 uid=uid,
                                 docs=docs,
                                 remote=remote,
                                 serializer_partial=serializer_partial)


def test_cat_with_no_declared_name_gets_name_from_dir_if_file_named_catalog():
    fn = abspath("catalog.yml")
    cat = open_catalog(fn, name='name_in_func', description='Description in func')
    assert cat.name == 'name_in_func'
    assert cat.description == 'Description in func'

    cat = open_catalog(fn)
    assert cat.name == 'tests'
    assert cat.description is None


def test_cat_with_declared_name():
    fn = abspath("catalog_named.yml")
    description = 'Description declared in the open function'
    cat = open_catalog(fn, name='name_in_func', description=description)
    assert cat.name == 'name_in_func'
    assert cat.description == description
    assert cat.metadata.get('some') == 'thing'

    cat = open_catalog(fn)
    assert cat.name == 'name_in_spec'
    assert cat.description == 'This is a catalog with a description in the yaml'


def test_remote_netcdf(intake_server):
    cat_local = intake.open_catalog(cat_file)
    cat = intake.open_catalog(intake_server)
    assert 'xarray_source' in cat
    source = cat.xarray_source()
    assert isinstance(source._ds, xr.Dataset)
    assert source._schema is None
    source._get_schema()
    assert source._schema is not None
    assert (source.to_dask().rh.data.compute() ==
            cat_local.xarray_source.to_dask().rh.data.compute()).all()


def main(argv=None):
    from intake.config import conf
    from intake import open_catalog

    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(description='Intake Catalog Server')
    parser.add_argument('-p', '--port', type=int, default=conf['port'],
                        help='port number for server to listen on')
    parser.add_argument('--list-entries', action='store_true',
                        help='list catalog entries at startup')
    parser.add_argument('--sys-exit-on-sigterm', action='store_true',
                        help='internal flag used during unit testing to ensure '
                             '.coverage file is written')
    parser.add_argument('catalog_args', metavar='FILE', type=str, nargs='+',
                        help='Name of catalog YAML file')
    parser.add_argument('--flatten', dest='flatten', action='store_true')
    parser.add_argument('--no-flatten', dest='flatten', action='store_false')
    parser.set_defaults(flatten=True)
    args = parser.parse_args(argv[1:])

    if args.sys_exit_on_sigterm:
        signal.signal(signal.SIGTERM, call_exit_on_sigterm)

    logger.info('Creating catalog from:')
    for arg in args.catalog_args:
        logger.info(' - %s' % arg)

    catargs = args.catalog_args
    if len(catargs) == 1:
        catalog = open_catalog(catargs[0])
        logger.info("catalog_args: %s" % catargs[0])
    else:
        catalog = open_catalog(catargs, flatten=args.flatten)
        logger.info("catalog_args: %s" % catargs)

    if args.list_entries:
        # This is not a good idea if the Catalog is huge.
        logger.info('Entries:' + ','.join(list(catalog)))

    logger.info('Listening on port %d' % args.port)

    server = IntakeServer(catalog)
    app = server.make_app()
    server.start_periodic_functions(close_idle_after=3600.0)
    app.listen(args.port)

    try:
        tornado.ioloop.IOLoop.current().start()
    except KeyboardInterrupt:
        logger.critical("Exiting")
    except Exception as e:
        logger.critical("Exiting due to %s" % e)


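# A minimal usage sketch for main() above. The entry-point name
# "intake-server" and the catalog file name are assumptions, but every flag
# shown (-p/--port, --list-entries, --flatten/--no-flatten) is defined by the
# argparse setup in main():
#
#     intake-server -p 5000 --list-entries catalog.yml
#
# main() can also be driven programmatically; parse_args(argv[1:]) skips the
# program name, so a dummy first element is required:
#
#     main(['intake-server', '--port', '5000', 'catalog.yml'])

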
def test_text_export(temp_cache):
    import tempfile
    outdir = tempfile.mkdtemp()
    cat = intake.open_catalog(os.path.join(here, 'sources.yaml'))
    s = cat.sometext()
    out = s.export(outdir)
    fn = os.path.join(outdir, 'cat.yaml')
    with open(fn, 'w') as f:
        f.write(out.yaml())
    cat = intake.open_catalog(fn)
    s2 = cat[s.name]()
    assert s.read() == s2.read()


def test_remote_tiff(intake_server):
    pytest.importorskip('rasterio')
    cat_local = intake.open_catalog(cat_file)
    cat = intake.open_catalog(intake_server)
    assert 'tiff_source' in cat
    source = cat.tiff_source()
    assert isinstance(source._ds, xr.Dataset)
    assert source._schema is None
    source._get_schema()
    assert source._schema is not None
    remote = source.to_dask().data.compute()
    local = cat_local.tiff_source.to_dask().data.compute()
    assert (remote == local).all()


def test_zarr_entries_in_yaml_catalog(temp_zarr):
    import dask.array as da

    # open YAML catalog file
    _, _, _, catalog_file = temp_zarr
    cat = open_catalog(catalog_file)

    # test entries
    assert_items_equal(['root', 'bar', 'eggs'], list(cat))

    # entry pointing to zarr root group
    assert isinstance(cat['root'], ZarrGroupCatalog)
    assert_items_equal(['foo', 'bar', 'baz'], list(cat['root']))
    assert 'catalog' == cat['root'].describe()['container']
    assert isinstance(cat['root'].to_zarr(), zarr.hierarchy.Group)

    # entry pointing to zarr sub-group
    assert isinstance(cat['bar'], ZarrGroupCatalog)
    assert_items_equal(['spam', 'eggs'], list(cat['bar']))
    assert 'catalog' == cat['bar'].describe()['container']
    assert isinstance(cat['bar'].to_zarr(), zarr.hierarchy.Group)

    # entry pointing to zarr array
    assert isinstance(cat['eggs'], ZarrArraySource)
    assert 'ndarray' == cat['eggs'].describe()['container']
    assert isinstance(cat['eggs'].to_dask(), da.Array)


def test_nested_catalog_access(tmp_path_catalog_nested):
    cat = intake.open_catalog(tmp_path_catalog_nested)
    # all four spellings address the same nested entry
    entry1 = cat.nested.ex1
    entry2 = cat["nested.ex1"]
    entry3 = cat[["nested", "ex1"]]
    entry4 = cat["nested", "ex1"]
    assert entry1 == entry2 == entry3 == entry4


def test_plot():
    pytest.importorskip("streamz.dataframe")
    pytest.importorskip("hvplot")
    cat = intake.open_catalog(catfile)
    s = cat.df.read()
    pl = s.plot()
    assert "DynamicMap" in str(pl)


def open_intake_catalog(platform, config):
    """Returns an Intake catalog for a specified platform and config

    Uses the package resources included in the om4labs distribution
    to determine the directory of the intake catalogs, unless it is
    overridden by the "OM4LABS_CATALOG_DIR" environment variable.

    Parameters
    ----------
    platform : str
        Site description, e.g. "gfdl", "orion", "testing"
    config : str
        Model configuration, e.g. "OM4p5", "OM4p25"

    Returns
    -------
    intake.catalog.Catalog
        Intake catalog corresponding to specified platform/config
    """
    catalog_str = f"{config}_catalog_{platform}.yml"
    if "OM4LABS_CATALOG_DIR" in os.environ:
        catfile = f"{os.environ['OM4LABS_CATALOG_DIR']}/{catalog_str}"
    else:
        catfile = pkgr.resource_filename("om4labs", f"catalogs/{catalog_str}")
    cat = intake.open_catalog(catfile)
    return cat


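# A minimal usage sketch of open_intake_catalog() above. The platform and
# config values are the examples named in the docstring; the override path
# and the entry name "ocean_monthly" are hypothetical, not entries known to
# exist in the om4labs catalogs.
#
#     import os
#     os.environ["OM4LABS_CATALOG_DIR"] = "/path/to/catalogs"  # optional override
#     cat = open_intake_catalog("gfdl", "OM4p25")  # resolves OM4p25_catalog_gfdl.yml
#     print(list(cat))                             # inspect available entries
#     src = cat["ocean_monthly"]                   # hypothetical entry name

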
def test_cache_to_cat(tmpdir):
    old = intake.config.conf.copy()
    olddir = intake.config.confdir
    intake.config.confdir = str(tmpdir)
    intake.config.conf.update({'cache_dir': 'catdir',
                               'cache_download_progress': False,
                               'cache_disabled': False})
    try:
        fn0 = os.path.join(here, 'calvert_uk.zip')
        fn1 = os.path.join(tmpdir, 'calvert_uk.zip')
        shutil.copy2(fn0, fn1)
        fn0 = os.path.join(here, 'cached.yaml')
        fn1 = os.path.join(tmpdir, 'cached.yaml')
        shutil.copy2(fn0, fn1)

        cat = intake.open_catalog(fn1)
        s = cat.calvert()
        df = s.read()
        assert len(df)
        md = CacheMetadata()
        f = md[s._urlpath][0]
        assert f['cache_path'].startswith(str(tmpdir))
        assert 'intake_cache' in os.listdir(tmpdir)
        assert os.listdir(os.path.join(tmpdir, 'intake_cache'))
    finally:
        intake.config.confdir = olddir
        intake.config.conf.update(old)


def _databroker():
    mongo_box = MongoBox()
    try:
        mongo_box.start()
        mongo_client = mongo_box.client()
        mongo_host, mongo_port = mongo_client.address
        mongo_uri = f"mongodb://{mongo_host}:{mongo_port}"
        catalog_descriptor_path = tmp_path / Path("mad.yml")
        with open(catalog_descriptor_path, "w") as f:
            f.write(f"""\
sources:
  mad:
    description: Made up beamline
    driver: "bluesky-mongo-normalized-catalog"
    container: catalog
    args:
      metadatastore_db: {mongo_uri}
      asset_registry_db: {mongo_uri}
      handler_registry:
        NPY_SEQ: ophyd.sim.NumpySeqHandler
    metadata:
      beamline: "00-ID"
""")
        yield intake.open_catalog(catalog_descriptor_path)
    finally:
        mongo_box.stop()


def test_flatten_duplicate_error():
    path = tempfile.mkdtemp()
    f1 = os.path.join(path, 'catalog.yaml')
    path = tempfile.mkdtemp()
    f2 = os.path.join(path, 'catalog.yaml')
    for f in [f1, f2]:
        with open(f, 'w') as fo:
            fo.write("""
sources:
  a:
    driver: csv
    args:
      urlpath: /not/a/file
""")
    with pytest.raises(ValueError):
        open_catalog([f1, f2])


def test_fsspec_integration():
    import fsspec
    import pandas as pd
    mem = fsspec.filesystem('memory')
    with mem.open('cat.yaml', 'wt') as f:
        f.write("""
sources:
  implicit:
    driver: csv
    description: o
    args:
      urlpath: "{{CATALOG_DIR}}/file.csv"
  explicit:
    driver: csv
    description: o
    args:
      urlpath: "memory:///file.csv"
  extra:
    driver: csv
    description: o
    args:
      urlpath: "{{CATALOG_DIR}}/file.csv"
      storage_options: {other: option}
""")
    with mem.open('/file.csv', 'wt') as f:
        f.write("a,b\n0,1")
    expected = pd.DataFrame({'a': [0], 'b': [1]})

    cat = open_catalog("memory://cat.yaml")
    assert list(cat) == ['implicit', 'explicit', 'extra']
    assert cat.implicit.read().equals(expected)
    assert cat.explicit.read().equals(expected)
    s = cat.extra()
    assert s._storage_options['other']


def fetch_bokeh_sources(catalog_filename):
    """Define Bokeh Data Sources"""
    catalog = intake.open_catalog(catalog_filename)
    dataframes = {
        'mulch': catalog.solid_waste_mulch.read().pivot(index='Month',
                                                        columns='Type')
    }
    sources = {
        'mulch': ColumnDataSource(dataframes['mulch']),
    }

    comm_df = catalog.solid_waste_commodity_recycling.read()
    comm_df = comm_df.rename(columns={
        "Magnolia & Alice": "East Knoxville Recycling Center",
        "225 Moody": "South Knoxville Recycling Center",
        "4440 Western Av.": "North Knoxville Recycling Center",
        "341 Parkvillage": "West Knoxville Recycling Center",
        "227 Willow Av.": "Downtown Knoxville Recycling Center",
        "Curbside City-Wide": "Curbside City-Wide Pickup",
        "Downtown": "Downtown Pickup",
        "KPD": "KPD",  # what is this?
        "Recycling Trailer": "Recycling Trailer"
    })
    for material in {'Glass', 'Cardboard', 'Mixed Paper', 'Plastics ("Commingled")'}:
        dataframes[f'commodity_{material}'] = (
            comm_df[comm_df.Type == material]
            .fillna(0)
            .pivot(index='Month', columns='Type')
        )
        sources[f'commodity_{material}'] = ColumnDataSource(
            dataframes[f'commodity_{material}'])
    return {'dataframes': dataframes, 'sources': sources}


def test_reload_missing_remote_directory(intake_server):
    shutil.rmtree(TMP_DIR, ignore_errors=True)
    time.sleep(1)

    catalog = open_catalog(intake_server, ttl=0.1)
    assert_items_equal(list(catalog), [])

    os.mkdir(TMP_DIR)
    with open(os.path.join(TMP_DIR, YAML_FILENAME), 'w') as f:
        f.write('''
plugins:
  source:
    - module: intake.catalog.tests.example1_source
    - module: intake.catalog.tests.example_plugin_dir.example2_source
sources:
  use_example1:
    description: example1 source plugin
    driver: example1
    args: {}
''')
    time.sleep(1.2)
    assert_items_equal(list(catalog), ['use_example1'])

    shutil.rmtree(TMP_DIR, ignore_errors=True)


def parse(input_bam, virtual_digest_catalog, output_prefix, n_workers, chunksize):
    """Filter the read-sorted alignments in INPUT_BAM and save the results
    under OUTPUT_PREFIX
    """
    from pore_c.analyses.alignments import parse_alignment_bam

    file_paths = catalogs.AlignmentDfCatalog.generate_paths(output_prefix)

    vd_cat = open_catalog(str(virtual_digest_catalog))
    fragment_df = vd_cat.fragments.read()
    final_stats = parse_alignment_bam(
        input_bam,
        fragment_df,
        alignment_table=file_paths["alignment"],
        read_table=file_paths["read"],
        overlap_table=file_paths["overlap"],
        alignment_summary=file_paths["alignment_summary"],
        read_summary=file_paths["read_summary"],
        n_workers=n_workers,
        chunksize=chunksize,
    )
    metadata = {"final_stats": final_stats}
    file_paths["virtual_digest"] = Path(virtual_digest_catalog)
    file_paths["input_bam"] = Path(input_bam)
    adf_cat = catalogs.AlignmentDfCatalog.create(file_paths, metadata, {})
    logger.info(str(adf_cat))


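# A minimal sketch of calling parse() above directly, grounded only in its
# signature; the file names below are hypothetical placeholders, not paths
# known to the pore_c package.
#
#     parse(
#         input_bam="run01.read_sorted.bam",
#         virtual_digest_catalog="run01.virtual_digest.catalog.yaml",
#         output_prefix="run01",
#         n_workers=4,
#         chunksize=10000,
#     )

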
def test_simple():
    cat = intake.open_catalog(catfile)
    s = cat.simple.read()
    out = s.sink_to_list()
    assert not out
    s.start()
    wait_for(lambda: out == [1, 2, 3], timeout=1)


def test_parquet(temp_cache):
    inp = pytest.importorskip('intake_parquet')
    cat = intake.open_catalog(
        os.path.abspath(os.path.join(path, 'catalog1.yml')))
    s = cat.entry1()
    s2 = s.persist()
    assert isinstance(s2, inp.ParquetSource)


def test_multi_cat_names():
    fn = abspath("catalog_union*.yml")
    cat = open_catalog(fn)
    assert cat.name == fn
    assert fn in repr(cat)

    fn1 = abspath("catalog_union_1.yml")
    fn2 = abspath("catalog_union_2.yml")
    cat = open_catalog([fn1, fn2])
    assert cat.name == '2 files'
    assert cat.description == 'Catalog generated from 2 files'

    cat = open_catalog([fn1, fn2], name='special_name',
                       description='Special description')
    assert cat.name == 'special_name'
    assert cat.description == 'Special description'


def test_reload_updated_config(intake_server_with_config):
    catalog = open_catalog(intake_server_with_config)

    entries = list(catalog)
    assert entries == ['use_example1']

    with open(os.path.join(TMP_DIR, YAML_FILENAME), 'w') as f:
        f.write('''
plugins:
  source:
    - module: intake.catalog.tests.example1_source
    - module: intake.catalog.tests.example_plugin_dir.example2_source
sources:
  use_example1:
    description: example1 source plugin
    driver: example1
    args: {}
  use_example1_1:
    description: example1 other
    driver: example1
    args: {}
''')
    time.sleep(2)
    assert_items_equal(list(catalog), ['use_example1', 'use_example1_1'])


def test_cat_add(tmpdir):
    tmpdir = str(tmpdir)
    fn = os.path.join(tmpdir, 'cat.yaml')
    with open(fn, 'w') as f:
        f.write('sources: {}')
    cat = open_catalog(fn)
    assert list(cat) == []

    # was added in memory
    cat.add(cat)
    cat._load()  # this would happen automatically, but not immediately
    assert list(cat) == ['cat']

    # was added to the file
    cat = open_catalog(fn)
    assert list(cat) == ['cat']


def test_read_direct(intake_server):
    catalog = open_catalog(intake_server)
    d = catalog['entry1_part'].configure_new(part='2')
    test_dir = os.path.dirname(__file__)
    file2 = os.path.join(test_dir, 'entry1_2.csv')
    expected_df = pd.read_csv(file2)
    meta = expected_df[:0]

    info = d.discover()
    assert info['dtype'] == {k: str(v)
                             for k, v in meta.dtypes.to_dict().items()}
    assert info['npartitions'] == 1
    assert info['shape'] == (None, 3)  # Do not know CSV size ahead of time

    md = info['metadata'].copy()
    md.pop('catalog_dir', None)
    assert md == {'bar': [2, 4, 6], 'foo': 'baz'}

    md = d.metadata.copy()
    md.pop('catalog_dir', None)
    assert md == dict(foo='baz', bar=[2, 4, 6])
    assert d.description == 'entry1 part'
    df = d.read()
    assert expected_df.equals(df)


def test_dir_cache(tmpdir, temp_cache):
    for d in ['main', 'main/sub1', 'main/sub2']:
        os.makedirs(os.path.join(tmpdir, d))
    for f in ['main/afile', 'main/sub1/subfile',
              'main/sub2/subfile1', 'main/sub2/subfile2']:
        fn = os.path.join(tmpdir, f)
        with open(fn, 'w') as fo:
            fo.write(f)
    fn = os.path.join(tmpdir, 'cached.yaml')
    shutil.copy2(os.path.join(here, 'cached.yaml'), fn)
    cat = intake.open_catalog(fn)
    s = cat.dirs()
    out = s.cache[0].load(s._urlpath, output=False)
    assert out[0] == os.path.join(tmpdir, s.cache[0]._path(s._urlpath))
    assert open(os.path.join(out[0], 'afile')).read() == 'main/afile'

    md = CacheMetadata()
    got = md[s._urlpath]

    # Avoid re-copy
    s = cat.dirs()
    s.cache[0].load(s._urlpath, output=False)
    md2 = CacheMetadata()
    got2 = md2[s._urlpath]
    assert got == got2


def _initializeCatalog(self, **kwargs):
    file_uri: str = self.catURI
    file_exists = os.path.isfile(file_uri)
    print(f"Opening catalog file: {file_uri}")
    self._catalog = intake.open_catalog(file_uri, driver="yaml_file_cat",
                                        autoreload=file_exists,
                                        name=self.name, **kwargs)
    if file_exists:
        self.validate()
    else:
        self.save()


def test_read_pattern_path_not_as_pattern():
    pytest.importorskip('rasterio')
    cat = intake.open_catalog(os.path.join(here, 'data', 'catalog.yaml'))
    green = cat.pattern_tiff_source_path_not_as_pattern()
    da = green.read()
    assert len(da.band) == 3


def test_read(intake_server):
    catalog = open_catalog(intake_server)
    d = catalog['entry1']
    test_dir = os.path.dirname(__file__)
    file1 = os.path.join(test_dir, 'entry1_1.csv')
    file2 = os.path.join(test_dir, 'entry1_2.csv')
    expected_df = pd.concat((pd.read_csv(file1), pd.read_csv(file2)))
    meta = expected_df[:0]

    info = d.discover()
    assert info['dtype'] == {k: str(v)
                             for k, v in meta.dtypes.to_dict().items()}
    assert info['npartitions'] == 2
    assert info['shape'] == (None, 3)  # Do not know CSV size ahead of time

    md = d.metadata.copy()
    assert md['foo'] == 'bar'
    assert md['bar'] == [1, 2, 3]
    df = d.read()
    assert expected_df.equals(df)