def test_validate_par():
    """Entry args are validated/coerced against the declared parameter type."""
    param = UserParameter('arg1', type='int')
    # A non-numeric string cannot be coerced to int.
    entry = LocalCatalogEntry('', '', driver, args={'arg1': "oi"},
                              parameters=[param], getenv=False)
    with pytest.raises(ValueError):
        entry()
    # A real int is accepted unchanged.
    entry = LocalCatalogEntry('', '', driver, args={'arg1': 1},
                              parameters=[param], getenv=False)
    entry()  # OK
    # A numeric string is coerced to int.
    entry = LocalCatalogEntry('', '', driver, args={'arg1': "1"},
                              parameters=[param], getenv=False)
    source = entry()  # OK
    assert source.kwargs['arg1'] == 1  # a number, not str
def _load(self):
    """Load the STAC Catalog.

    Child catalogs are registered first; only when this catalog has no
    children are its items registered instead.
    """
    found_child = False
    # load first sublevel catalog(s)
    for child in self._stac_obj.children():
        found_child = True
        self._entries[child.id] = LocalCatalogEntry(
            name=child.id,
            description=child.description,
            driver=StacCatalog,
            catalog=self,
            args={'stac_obj': child.filename},
        )
    if not found_child:
        # no sub-catalogs: load the items under this (leaf) catalog
        for item in self._stac_obj.items():
            self._entries[item.id] = LocalCatalogEntry(
                name=item.id,
                description='',
                driver=StacItem,
                catalog=self,
                args={'stac_obj': item},
            )
def test_maybe_default_from_env():
    """Parameter defaults of the form env(...) expand only when getenv=True."""
    param = UserParameter('name', default='env(INTAKE_TEST_VAR)')
    entry = LocalCatalogEntry('', '', driver, args={'arg1': "{{name}}"},
                              parameters=[param], getenv=False)
    source = entry()
    assert source.kwargs['arg1'] == 'env(INTAKE_TEST_VAR)'
    os.environ['INTAKE_TEST_VAR'] = 'oi'
    source = entry()
    # getenv=False: the env() text passes through untouched
    assert source.kwargs['arg1'] == 'env(INTAKE_TEST_VAR)'

    param = UserParameter('name', default='env(INTAKE_TEST_VAR)')
    entry = LocalCatalogEntry('', '', driver, args={'arg1': "{{name}}"},
                              parameters=[param], getenv=True)
    source = entry()
    assert source.kwargs['arg1'] == 'oi'
    del os.environ['INTAKE_TEST_VAR']
    source = entry()
    # a missing env var expands to the empty string
    assert source.kwargs['arg1'] == ''
def _load(self):
    """Load the STAC Catalog, registering children and items as entries."""
    for child in self._stac_obj.get_children():
        # Collection subclasses Catalog, so check it first
        is_collection = isinstance(child, pystac.Collection)
        self._entries[child.id] = LocalCatalogEntry(
            name=child.id,
            description=child.description,
            driver=StacCollection if is_collection else StacCatalog,  # recursive
            catalog=self,
            args={'stac_obj': child.get_self_href()},
        )
    for item in self._stac_obj.get_items():
        self._entries[item.id] = LocalCatalogEntry(
            name=item.id,
            description='',
            driver=StacItem,
            catalog=self,
            args={'stac_obj': item},
        )
def _load(self):
    """Enumerate Spark tables and register one catalog entry per table."""
    if self.spark_cat is None:
        self.spark_cat = SparkHolder(
            True, [('catalog', )], self.context_args).setup()
    self._entries = {}
    databases = ([self.database] if self.database is not None
                 else self.spark_cat.listDatabases())
    for db in databases:
        for table in self.spark_cat.listTables(dbName=db.name):
            if db.name:
                description = f'Spark table {table.name} in database {db.name}'
            else:
                description = f'Spark table {table.name} in default database'
            args = {'args': [('table', (table.name, ))]}
            entry = LocalCatalogEntry(
                table.name, description, 'spark_dataframe', True, args,
                cache=[], parameters=[], metadata={}, catalog_dir="",
                getenv=False, getshell=False)
            entry._plugin = [SparkDataFrame]
            self._entries[table.name] = entry
def test_validate_up():
    """User-parameter values are validated; rendered args stay strings."""
    param = UserParameter('name', default=1, type='int')
    entry = LocalCatalogEntry('', '', driver, args={'arg1': "{{name}}"},
                              parameters=[param], getenv=False)
    source = entry()  # OK
    assert source.kwargs['arg1'] == '1'
    with pytest.raises(ValueError):
        entry(name='oi')

    param = UserParameter('name', type='int')
    entry = LocalCatalogEntry('', '', driver, args={'arg1': "{{name}}"},
                              parameters=[param], getenv=False)
    source = entry()  # OK
    # arg1 is a string: real int gets rendered by jinja
    assert source.kwargs['arg1'] == '0'  # default default for int
    source = entry(arg1='something')
    assert source.kwargs['arg1'] == 'something'
def _load(self):
    """Reflect the database schema and register one auto-partitioned SQL
    source per table that has a primary key; the first primary-key column
    found is used as the partition index.
    """
    import sqlalchemy
    from intake_sql import SQLSourceAutoPartition

    engine = sqlalchemy.create_engine(self.uri)
    # NOTE(review): MetaData(bind=...) was removed in SQLAlchemy 2.0;
    # this assumes a 1.x install — confirm against project pins.
    meta = sqlalchemy.MetaData(bind=engine)
    meta.reflect(views=self.views)
    self._entries = {}
    for name, table in meta.tables.items():
        for column in table.columns:
            if not column.primary_key:
                continue
            description = 'SQL table %s from %s' % (name, self.uri)
            args = {
                'uri': self.uri,
                'table': name,
                'index': column.name,
                'sql_kwargs': self.kwargs,
            }
            entry = LocalCatalogEntry(name, description, 'sql_auto', True,
                                      args, {}, {}, {}, "",
                                      getenv=False, getshell=False)
            entry._plugin = [SQLSourceAutoPartition]
            self._entries[name] = entry
            break  # only the first primary-key column is used
def test_no_instance():
    """Entries compare without instantiating their (bogus) driver."""
    from intake.catalog.local import LocalCatalogEntry
    entry_a = LocalCatalogEntry('foo', '', 'fake')
    entry_b = LocalCatalogEntry('foo0', '', 'fake')
    # this would error on instantiation with driver not found
    assert entry_a != entry_b
def test_explicit_entry_driver():
    """A driver class (not a name) may be passed directly; other types fail."""
    from intake.source.textfiles import TextFilesSource
    entry = LocalCatalogEntry('test', 'desc', TextFilesSource,
                              args={'urlpath': None})
    assert entry.describe()['container'] == 'python'
    assert isinstance(entry(), TextFilesSource)
    # anything that is neither a string nor a DataSource class is rejected
    with pytest.raises(TypeError):
        LocalCatalogEntry('test', 'desc', None)
def test_dict_adddel():
    """Catalogs built from dicts support item add, delete and lookup."""
    from intake.catalog.base import Catalog
    entry = LocalCatalogEntry(name='trial', description='get this back',
                              driver='csv', args=dict(urlpath=""))
    cat = Catalog.from_dict({'trial': entry}, name='mycat')
    assert 'trial' in cat
    cat['trial2'] = entry
    assert list(cat) == ['trial', 'trial2']
    cat.pop('trial')
    assert list(cat) == ['trial2']
    assert cat['trial2'].describe() == entry.describe()
def test_filter():
    """Catalog.filter keeps only the entries matching the predicate."""
    from intake.catalog.base import Catalog
    dropped = LocalCatalogEntry(name='trial', description='get this back',
                                driver='csv', args=dict(urlpath=""))
    kept = LocalCatalogEntry(name='trial', description='pass this through',
                             driver='csv', args=dict(urlpath=""))
    cat = Catalog.from_dict({'trial1': dropped, 'trial2': kept},
                            name='mycat')
    filtered = cat.filter(lambda e: 'pass' in e._description)
    assert list(filtered) == ['trial2']
    assert filtered.trial2 == kept()
def _load(self):
    """Register one StripeTableSource entry per available Stripe resource."""
    for resource in resource_list():
        entry = LocalCatalogEntry(
            name=resource,
            description=resource,
            driver=StripeTableSource,
            catalog=self,
            args={
                'api_key': self.api_key,
                'api_version': self.api_version,
                'resource': resource,
            })
        entry._plugin = [StripeTableSource]
        self._entries[resource] = entry
def test_unknown():
    """Undefined template variables render to the empty string."""
    entry = LocalCatalogEntry('', '', driver, args={'arg1': "{{name}}"})
    source = entry()
    assert source.kwargs['arg1'] == ""

    # parameter has no default
    param = UserParameter('name')
    entry = LocalCatalogEntry('', '', driver, args={'arg1': "{{name}}"},
                              parameters=[param])
    source = entry()
    assert source.kwargs['arg1'] == ""
def test_maybe_default_from_env():
    """Maybe fill in a parameter default from the env, depending on getenv."""
    param = UserParameter('name', default='env(INTAKE_TEST_VAR)')
    entry = LocalCatalogEntry('', '', driver, args={'arg1': "{{name}}"},
                              parameters=[param], getenv=False)
    source = entry()
    assert source.kwargs['arg1'] == 'env(INTAKE_TEST_VAR)'
    os.environ['INTAKE_TEST_VAR'] = 'oi'
    # Clear the cached source so we can (not) pick up the changed
    # environment variable.
    entry.clear_cached_default_source()
    source = entry()
    assert source.kwargs['arg1'] == 'env(INTAKE_TEST_VAR)'

    param = UserParameter('name', default='env(INTAKE_TEST_VAR)')
    entry = LocalCatalogEntry('', '', driver, args={'arg1': "{{name}}"},
                              parameters=[param], getenv=True)
    source = entry()
    assert source.kwargs['arg1'] == 'oi'
    del os.environ['INTAKE_TEST_VAR']
    # Clear the cached source so we can pick up the changed environment
    # variable.
    entry.clear_cached_default_source()
    source = entry()
    assert source.kwargs['arg1'] == ''
def _create_entry(self, row):
    """Build and register a Dremio catalog entry for one table row."""
    name = f'{row.TABLE_SCHEMA}."{row.TABLE_NAME}"'
    description = f'Dremio {row.TABLE_TYPE} {name} from {self._hostname}'
    args = {'uri': self._uri, 'sql_expr': f'select * from {name}'}
    entry = LocalCatalogEntry(name, description, 'dremio', True, args,
                              {}, {}, {}, "", getenv=False, getshell=False)
    entry._plugin = [DremioSource]
    self._entries[name] = entry
def test_cache_default_source():
    """The default source is cached only for parameterless entries."""
    # If the user provides parameters, don't allow default caching
    param = UserParameter('name', default='oi')
    entry = LocalCatalogEntry('', '', driver, getenv=False,
                              parameters=[param])
    first = entry(name="oioi")
    second = entry()
    assert first is not second
    first = entry()
    second = entry(name="oioi")
    assert first is not second
    # Otherwise, we can cache the default source
    entry = LocalCatalogEntry('', '', driver, getenv=False)
    first = entry()
    second = entry()
    assert first is second
def add(self, key, source):
    """Add the persisted source to the store under the given key.

    Parameters
    ----------
    key : str
        The unique token of the un-persisted, original source
    source : DataSource instance
        The thing to add to the persisted catalogue, referring to
        persisted data

    The on-disk store is read, updated with the new source, and written
    back; the entry is also registered on this catalog in memory.
    """
    from intake.catalog.local import LocalCatalogEntry
    try:
        with self.fs.open(self.path, 'rb') as f:
            # safe_load returns None for an empty/blank store file;
            # fall back to a fresh structure instead of crashing below.
            data = yaml.safe_load(f) or {'sources': {}}
    except IOError:
        data = {'sources': {}}
    ds = source._yaml()['sources'][source.name]
    data['sources'][key] = ds
    with self.fs.open(self.path, 'wb') as fo:
        fo.write(yaml.dump(data, default_flow_style=False).encode())
    self._entries[key] = LocalCatalogEntry(
        name=ds['metadata']['original_name'],
        direct_access=True,
        cache=[],
        parameters=[],
        catalog_dir=None,
        **data['sources'][key])
def search(self, **query):
    """Search for entries in the collection catalog.

    Unknown query keys raise ValueError; collection columns absent from
    the query are filled with None. The resulting entry is registered
    under a unique name and returned.
    """
    collection_columns = self.df.columns.tolist()
    for key in query:
        if key not in collection_columns:
            raise ValueError(f'{key} is not in {self.collection_name}')
    for key in collection_columns:
        query.setdefault(key, None)
    name = f'{self.collection_name}_{uuid.uuid4()}'
    args = {'collection_name': self.collection_name, 'query': query}
    driver = config.get('sources')[self.collection_type]
    description = f'Catalog entry from {self.collection_name} collection'
    entry = LocalCatalogEntry(
        name=name,
        description=description,
        driver=driver,
        direct_access=True,
        args=args,
        cache={},
        parameters={},
        metadata=self.metadata.copy(),
        catalog_dir='',
        getenv=False,
        getshell=False,
    )
    self._entries[name] = entry
    return entry
def _load(self):
    """Connect to the OmniSci database, list the available tables, and
    construct a catalog entry for each table."""
    connection = pymapd.connect(**self._init_args)
    self._entries = {}
    if self._ibis_con is None:
        # ibis is optional: without it, entries simply carry no ibis
        # connection.
        try:
            import ibis.omniscidb
            self._ibis_con = ibis.omniscidb.connect(
                uri=self._init_args['uri'],
                user=self._init_args['user'],
                password=self._init_args['password'],
                host=self._init_args['host'],
                port=self._init_args['port'],
                protocol=self._init_args['protocol'],
                database=self._init_args['dbname'],
            )
        except ImportError:
            pass
    for table in connection.get_tables():
        description = "SQL table %s from %s" % (table, str(self))
        # drop falsy init args; pass the rest through to the source
        args = {key: value
                for key, value in self._init_args.items() if value}
        args['ibis_con'] = self._ibis_con
        args["sql_expr"] = table
        self._entries[table] = LocalCatalogEntry(
            table, description, "omnisci", True, args)
def _load(self):
    """Query the Civis database for all the schemas which have tables
    and construct catalog entries for them."""
    fut = civis.io.query_civis(
        "SELECT DISTINCT(table_schema) FROM information_schema.tables WHERE "
        "table_schema != 'pg_catalog' AND table_schema != 'information_schema'",
        database=self._database,
        client=self._client,
    )
    schemas = [row[0] for row in fut.result().result_rows]
    self._entries = {}
    for schema in schemas:
        self._entries[schema] = LocalCatalogEntry(
            schema,
            f"Civis schema {schema} from {self._database}",
            CivisSchema,
            True,
            args={
                "api_key": self._api_key,
                "database": self._database,
                "schema": schema,
            },
            getenv=False,
            getshell=False,
        )
def test_auto_env_expansion():
    """env(...) templates in args expand only when getenv=True."""
    os.environ['INTAKE_TEST_VAR'] = 'oi'
    entry = LocalCatalogEntry('', '', driver,
                              args={'arg1': "{{env(INTAKE_TEST_VAR)}}"},
                              parameters=[], getenv=False)
    source = entry()
    # when getenv is False, you pass through the text
    assert source.kwargs['arg1'] == '{{env(INTAKE_TEST_VAR)}}'

    entry = LocalCatalogEntry('', '', driver,
                              args={'arg1': "{{env(INTAKE_TEST_VAR)}}"},
                              parameters=[], getenv=True)
    source = entry()
    assert source.kwargs['arg1'] == 'oi'

    # same, but with quoted environment name
    entry = LocalCatalogEntry('', '', driver,
                              args={'arg1': '{{env("INTAKE_TEST_VAR")}}'},
                              parameters=[], getenv=True)
    source = entry()
    assert source.kwargs['arg1'] == 'oi'

    del os.environ['INTAKE_TEST_VAR']
    # Clear the cached source so we can pick up the changed environment
    # variable.
    entry.clear_cached_default_source()
    source = entry()
    assert source.kwargs['arg1'] == ''

    entry = LocalCatalogEntry('', '', driver,
                              args={'arg1': "{{env(INTAKE_TEST_VAR)}}"},
                              parameters=[], getenv=False)
    source = entry()
    assert source.kwargs['arg1'] == '{{env(INTAKE_TEST_VAR)}}'
def _load(self):
    """Query the Civis database for all the tables in the schema and
    construct catalog entries for them."""
    fut1 = civis.io.query_civis(
        "SELECT table_name FROM information_schema.tables "
        f"WHERE table_schema = '{self._dbschema}'",
        database=self._database,
        client=self._client,
    )
    # If the database has a geometry_columns table, we prefer that as we
    # can get the SRID for a column from it. Otherwise, we get the
    # geometry columns from the information schema.
    if self._has_geom:
        fut2 = civis.io.query_civis(
            "SELECT f_table_name, f_geometry_column, srid FROM geometry_columns "
            f"WHERE f_table_schema = '{self._dbschema}'",
            database=self._database,
            client=self._client,
        )
    else:
        fut2 = civis.io.query_civis(
            "SELECT table_name, column_name FROM information_schema.columns "
            f"WHERE table_schema = '{self._dbschema}' and udt_name = 'geometry'",
            database=self._database,
            client=self._client,
        )
    done, _ = concurrent.futures.wait((fut1, fut2))
    assert fut1 in done and fut2 in done
    table_rows = fut1.result().result_rows
    geom_rows = fut2.result().result_rows
    self._entries = {}
    for table in (row[0] for row in table_rows):
        geometry = [r[1] for r in geom_rows if r[0] == table]
        # r[2] (srid) only exists on geometry_columns rows, so the
        # _has_geom guard in the condition also protects the index
        srid = [r[2] for r in geom_rows
                if r[0] == table and self._has_geom]
        self._entries[table] = LocalCatalogEntry(
            table,
            f"Civis table {table} from {self._database}",
            CivisSource,
            True,
            args={
                "api_key": self._api_key,
                "civis_kwargs": self._civis_kwargs,
                "database": self._database,
                "table": table,
                "schema": self._dbschema,
                "geometry": geometry if len(geometry) else None,
                "crs": f"EPSG:{srid[0]}" if len(srid) else None,
            },
            getenv=False,
            getshell=False,
        )
def test_from_dict_with_data_source():
    """Check that Catalog.from_dict accepts DataSources not wrapped in Entry."""
    from intake.catalog.base import Catalog
    entry = LocalCatalogEntry(name='trial', description='get this back',
                              driver='csv', args=dict(urlpath=""))
    ds = entry()
    # must not raise: a bare DataSource stands in for an Entry
    # (an unused temp-dir path previously created here was removed —
    # it leaked an empty directory on every test run)
    cat = Catalog.from_dict({'trial': ds}, name='mycat')
def test_auto_env_expansion():
    """env(...) templates in args expand only when getenv=True."""
    os.environ['INTAKE_TEST_VAR'] = 'oi'
    entry = LocalCatalogEntry('', '', driver,
                              args={'arg1': "{{env(INTAKE_TEST_VAR)}}"},
                              parameters=[], getenv=False)
    source = entry()
    # when getenv is False, you pass through the text
    assert source.kwargs['arg1'] == '{{env(INTAKE_TEST_VAR)}}'

    entry = LocalCatalogEntry('', '', driver,
                              args={'arg1': "{{env(INTAKE_TEST_VAR)}}"},
                              parameters=[], getenv=True)
    source = entry()
    assert source.kwargs['arg1'] == 'oi'

    # same, but with quoted environment name
    entry = LocalCatalogEntry('', '', driver,
                              args={'arg1': '{{env("INTAKE_TEST_VAR")}}'},
                              parameters=[], getenv=True)
    source = entry()
    assert source.kwargs['arg1'] == 'oi'

    del os.environ['INTAKE_TEST_VAR']
    source = entry()
    assert source.kwargs['arg1'] == ''

    entry = LocalCatalogEntry('', '', driver,
                              args={'arg1': "{{env(INTAKE_TEST_VAR)}}"},
                              parameters=[], getenv=False)
    source = entry()
    assert source.kwargs['arg1'] == '{{env(INTAKE_TEST_VAR)}}'
def _load(self):
    """Read the remote THREDDS catalog and register sub-catalogs plus
    OPeNDAP data entries."""
    from siphon.catalog import TDSCatalog
    self.cat = TDSCatalog(self.url)
    self.name = self.cat.catalog_name
    self.metadata.update(self.cat.metadata)
    self._entries = {}
    # sub-cats
    for ref in self.cat.catalog_refs.values():
        self._entries[ref.title] = LocalCatalogEntry(
            ref.title, 'THREDDS cat', 'thredds_cat', True,
            {'url': ref.href}, [], [], {}, None,
            catalog=self,
        )
    # data entries (only those with opendap links)
    # NOTE(review): datasets without an 'OPENDAP' access URL would raise
    # KeyError here — confirm all served datasets expose one.
    for ds in self.cat.datasets.values():
        self._entries[ds.name] = LocalCatalogEntry(
            ds.name, 'THREDDS data', 'opendap', True,
            {'urlpath': ds.access_urls['OPENDAP'], 'chunks': None},
            [], [], {}, None,
            catalog=self,
        )
def test_nested_remote(intake_server):
    """A hand-injected remote entry exposes the server's own entries."""
    from intake.catalog.local import LocalCatalogEntry
    catalog = open_catalog()
    remote_entry = LocalCatalogEntry(
        'server', 'remote test', 'intake_remote', True,
        {'url': intake_server}, [], [], {}, None)
    catalog._entries = {'server': remote_entry}
    assert 'entry1' in catalog.server()
def test_parameter_default():
    """A parameter default is rendered into templated args."""
    param = UserParameter('name', default='oi')
    entry = LocalCatalogEntry('', '', driver, args={'arg1': "{{name}}"},
                              parameters=[param])
    assert entry().kwargs['arg1'] == 'oi'
def _instantiate_source(self):
    """Driving method of this class.

    Resolves the configured storage mode to a URL plus driver args,
    builds a LocalCatalogEntry for it, instantiates the source and
    attaches bookkeeping metadata before returning it.
    """
    mode = self.storage[self.storage_mode if self.storage_mode
                        else self.default]
    args = {}
    mode_url = mode
    if isinstance(mode, dict):
        mode_url = mode["url"]
        args = mode.get("args", {})
    parse_result, url_path = self.parse_storage_mode_url(mode_url)
    desc = self.catalog_object[self.name].describe()
    if parse_result.scheme == "parquet":
        # https://github.com/dask/dask/issues/5272:
        # Dask parquet metadata w/ ~2k files very slow
        args.setdefault("gather_statistics", False)
        args.setdefault("engine", "pyarrow")
    entry = LocalCatalogEntry(
        name=desc["name"],
        description=desc["description"],
        driver=parse_result.scheme,
        args={"urlpath": url_path, **args},
        parameters=self.catalog_object[self.name]._user_parameters,
        catalog=self.cat,
    )
    params = {
        "canonical_name": self._canonical_name,
        "storage_mode": self.storage_mode,
        "avro_schema": self._avro_schema,
        "dtypes": self._dtypes,
    }
    source = entry.get(metadata=self.metadata, **self.kwargs)
    source.metadata["url_path"] = url_path
    source.metadata = {**source.metadata, **params}
    return source
def _load(self):
    """Load entries into the catalog: one entry per run in the QCoDeS
    database, plus lookup tables for guids/run ids and the observed
    experiment and sample names."""
    self._entries = {}
    experiment_names = set()
    sample_names = set()
    for row in get_runs(self.conn):
        run_description = json.loads(row['run_description'])
        # move these functions so they can be loaded elsewhere
        exp_name, sample_name = get_names_from_experiment_id(
            self.conn, row['exp_id'])
        dependent_parameters, independent_parameters = \
            parameters_from_description(run_description)
        self._entries[row['guid']] = LocalCatalogEntry(
            name='run {}'.format(row['run_id']),
            description='run {} at {} with guid {}'.format(
                row['run_id'], str(self._db_path), row['guid']),
            driver=self._source_driver,
            direct_access='forbid',
            args={
                'db_path': str(self._db_path),
                'guid': row['guid'],
                'run_id': row['run_id'],
            },
            cache=None,
            parameters=[],
            metadata={
                "start_time": row['run_timestamp'],
                "stop_time": row['completed_timestamp'],
                "dependent_parameters": dependent_parameters,
                "independent_parameters": independent_parameters,
                "experiment_name": exp_name,
                "sample_name": sample_name,
                "table_name": row['result_table_name'],
                'plots': make_default_plots(run_description),
            },
            catalog_dir=str(self._db_path),
            getenv=False,
            getshell=False,
            catalog=self,
        )
        self._guid_lookup[row['run_id']] = row['guid']
        experiment_names.add(exp_name)
        sample_names.add(sample_name)
    self._experiments = list(experiment_names)
    self._samples = list(sample_names)
    # inverse mapping: guid -> run_id
    self._run_id_lookup = {guid: run_id
                           for run_id, guid in self._guid_lookup.items()}
def test_up_override_and_render():
    """An explicitly-passed value overrides the default and is rendered."""
    param = UserParameter('name', default='env(INTAKE_TEST_VAR)')
    entry = LocalCatalogEntry('', '', driver, args={'arg1': "{{name}}"},
                              parameters=[param], getenv=False)
    source = entry(name='other')
    assert source.kwargs['arg1'] == 'other'