def main():
    """Create the sparse array ``my_sparse_array``.

    The schema has two ``uint64`` dimensions, ``d1`` and ``d2``, each over
    the domain ``[1, 4]`` with tile extent 2, and three compressed
    attributes (``a1``, ``a2``, ``a3``).
    """
    ctx = tiledb.Ctx()

    # Both dimensions share domain, tile extent and dtype; only the name differs.
    row_dim, col_dim = [
        tiledb.Dim(ctx, label, domain=(1, 4), tile=2, dtype="uint64")
        for label in ("d1", "d2")
    ]
    array_domain = tiledb.Domain(ctx, row_dim, col_dim)

    # (name, compressor, dtype); every compressor is given level -1.
    attr_specs = (
        ("a1", "blosc-lz", "int32"),
        ("a2", "gzip", "S10"),
        ("a3", "zstd", "float32,float32"),
    )
    attributes = tuple(
        tiledb.Attr(ctx, label, compressor=(codec, -1), dtype=cell_type)
        for label, codec, cell_type in attr_specs
    )

    tiledb.SparseArray(
        ctx,
        "my_sparse_array",
        domain=array_domain,
        attrs=attributes,
        capacity=2,
        cell_order="row-major",
        tile_order="row-major",
    )
def create_tiledb_array(self, n_slots, description):
    """Create an empty dense TileDB array for one sensor description.

    The array path comes from ``self.sensor_data_path(description['code'])``.
    The first dimension is ``delta_t`` over ``[0, n_slots]`` with tile 1;
    it is followed by one ``dim{i}`` dimension per entry of
    ``description['shape']``. One float32 attribute is created per name in
    ``description['controlledProperty']``.

    :param n_slots: upper bound of the ``delta_t`` domain; must be > 0.
    :param description: mapping with the keys ``code``, ``shape`` and
        ``controlledProperty``.
    :return: the path of the created array.
    :raises ValueError: if a TileDB object already exists at that path.
    """
    array_name = self.sensor_data_path(description['code'])
    if tiledb.object_type(array_name) is not None:
        raise ValueError('duplicate object with path %s' % array_name)

    shape = description['shape']
    assert len(shape) > 0 and n_slots > 0

    time_dim = tiledb.Dim(
        name="delta_t", domain=(0, n_slots), tile=1, dtype=np.int32)
    # Each data axis is covered by a single tile spanning its full extent.
    data_dims = [
        tiledb.Dim(name=f"dim{axis}", domain=(0, extent - 1), tile=extent,
                   dtype=np.int32)
        for axis, extent in enumerate(shape)
    ]
    dom = tiledb.Domain(time_dim, *data_dims, ctx=self.tiledb_ctx)

    attrs = []
    for prop_name in description['controlledProperty']:
        attrs.append(tiledb.Attr(name=prop_name, dtype=np.float32))

    schema = tiledb.ArraySchema(
        domain=dom, sparse=False, attrs=attrs, ctx=self.tiledb_ctx)

    # Create the (empty) array on disk.
    tiledb.DenseArray.create(array_name, schema)
    return array_name
def _get_dims(self, trace_size: int) -> Iterable[tiledb.Dim]:
    """Build the three TileDB dimensions: fast lines, slow lines, offsets.

    The fast dimension is ``ilines`` when ``self.segy_file.is_inline`` is
    true and ``xlines`` otherwise; the slow dimension takes the other name.

    :param trace_size: forwarded to ``self._fast_tile`` to obtain the tile
        extent of the fast dimension.
    :return: list of dimensions in the order fast, slow, offsets.
    """
    segy = self.segy_file
    index_dtype = np.uintc
    n_slow = len(segy.slow_lines)

    if segy.is_inline:
        fast_name, slow_name = "ilines", "xlines"
    else:
        fast_name, slow_name = "xlines", "ilines"

    fast = tiledb.Dim(
        name=fast_name,
        domain=(0, len(segy.fast_lines) - 1),
        dtype=index_dtype,
        tile=self._fast_tile(trace_size),
    )
    # The slow dimension is covered by a single tile.
    slow = tiledb.Dim(
        name=slow_name,
        domain=(0, n_slow - 1),
        dtype=index_dtype,
        tile=n_slow,
    )
    offsets = tiledb.Dim(
        name="offsets",
        domain=(0, len(segy.offsets) - 1),
        dtype=index_dtype,
        tile=1,
    )
    return [fast, slow, offsets]
def test_sparse_schema(self):
    """Check that a sparse array reports the schema properties it was built with."""
    ctx = tiledb.Ctx()

    # The first dimension and the first attribute are anonymous (empty name).
    anon_dim = tiledb.Dim(ctx, "", domain=(1, 1000), tile=10, dtype="uint64")
    named_dim = tiledb.Dim(
        ctx, "d2", domain=(101, 10000), tile=100, dtype="uint64")
    domain = tiledb.Domain(ctx, anon_dim, named_dim)

    anon_attr = tiledb.Attr(ctx, "", dtype="int32,int32,int32")
    gzip_attr = tiledb.Attr(
        ctx, "a2", compressor=("gzip", -1), dtype="float32")

    schema = tiledb.SparseArray(
        ctx,
        self.path("sparse_array_schema"),
        domain=domain,
        attrs=(anon_attr, gzip_attr),
        capacity=10,
        cell_order="col-major",
        tile_order="row-major",
        coords_compressor=("zstd", 4),
        offsets_compressor=("blosc-lz", 5),
    )

    expected_properties = (
        ("capacity", 10),
        ("cell_order", "col-major"),
        ("tile_order", "row-major"),
        ("coords_compressor", ("zstd", 4)),
        ("offsets_compressor", ("blosc-lz", 5)),
    )
    for prop, expected in expected_properties:
        self.assertEqual(getattr(schema, prop), expected)
def create_matrix_array(matrix_name, number_of_rows, number_of_columns,
                        encode_as_sparse_array):
    """Create an empty 2-D float32 TileDB array with dimensions ``obs`` and ``var``.

    :param matrix_name: URI at which the array is created.
    :param number_of_rows: extent of the ``obs`` dimension.
    :param number_of_columns: extent of the ``var`` dimension.
    :param encode_as_sparse_array: create a sparse array (tiles capped at
        512 x 2048) when true, otherwise a dense one (tiles capped at
        50 x 100).
    """
    zstd_filters = tiledb.FilterList([tiledb.ZstdFilter()])
    attrs = [tiledb.Attr(dtype=np.float32, filters=zstd_filters)]

    # Upper bounds for the (obs, var) tile extents depend on the array kind.
    if encode_as_sparse_array:
        max_obs_tile, max_var_tile = 512, 2048
    else:
        max_obs_tile, max_var_tile = 50, 100

    obs_dim = tiledb.Dim(
        name="obs",
        domain=(0, number_of_rows - 1),
        tile=min(number_of_rows, max_obs_tile),
        dtype=np.uint32,
    )
    var_dim = tiledb.Dim(
        name="var",
        domain=(0, number_of_columns - 1),
        tile=min(number_of_columns, max_var_tile),
        dtype=np.uint32,
    )

    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(obs_dim, var_dim),
        sparse=encode_as_sparse_array,
        attrs=attrs,
        cell_order="row-major",
        tile_order="col-major",
    )

    array_cls = (
        tiledb.SparseArray if encode_as_sparse_array else tiledb.DenseArray)
    array_cls.create(matrix_name, schema)
def time_tiledb(dataset, batch_size=1, num_batches=1):
    """Time batched access to the dense TileDB array ``<dataset>_tileDB``.

    The array is created first if its path does not exist yet: ``y`` spans
    ``batch_size * num_batches`` rows in a single tile, ``x`` spans
    ``[0, 784]`` in a single tile, with one anonymous int64 attribute.
    The array is then opened in write mode and handed to ``time_batches``.

    :param dataset: base name; the array URI is ``dataset + "_tileDB"``.
    :param batch_size: number of rows per batch.
    :param num_batches: number of batches.
    """
    array_uri = dataset + "_tileDB"

    if not os.path.exists(array_uri):
        n_rows = batch_size * num_batches
        y_dim = tiledb.Dim(
            name="y",
            domain=(0, n_rows - 1),
            tile=n_rows,
            dtype="uint64",
        )
        x_dim = tiledb.Dim(name="x", domain=(0, 784), tile=785, dtype="uint64")
        schema = tiledb.ArraySchema(
            domain=tiledb.Domain(y_dim, x_dim),
            attrs=[tiledb.Attr(name="", dtype="int64", var=False)],
            cell_order="row-major",
            tile_order="row-major",
            sparse=False,
        )
        tiledb.Array.create(array_uri, schema)

    # Open through a context manager so the handle is closed even when the
    # sanity check or the timing run raises.
    with tiledb.open(array_uri, mode="w") as ds_tldb:
        assert isinstance(ds_tldb, tiledb.array.DenseArray)
        time_batches(ds_tldb, batch_size, num_batches)
def create_test_array_sparse_25x12_mult(temp_rootdir):
    """
    Create a sparse 25x12 test array with attributes "a" and "b", written
    twice: at timestamp 1 and again at timestamp 2.
    """
    path = os.path.abspath(os.path.join(temp_rootdir, "sparse_25x12_mult"))
    ctx = tiledb.default_ctx()

    row_axis = tiledb.Dim("row", ctx=ctx, domain=(1, 25), dtype=np.int64)
    col_axis = tiledb.Dim("col", ctx=ctx, domain=(1, 12), dtype=np.int64)
    dom = tiledb.Domain(row_axis, col_axis, ctx=ctx)

    attributes = (
        tiledb.Attr(name="a", ctx=ctx, dtype=np.float64),
        tiledb.Attr(name="b", ctx=ctx, dtype=np.float64),
    )
    schema = tiledb.ArraySchema(
        ctx=ctx, sparse=True, domain=dom, attrs=attributes)
    tiledb.SparseArray.create(path, schema)

    # Every (row, col) pair of the domain, in row-major order.
    all_cells = itertools.product(np.arange(1, 26), np.arange(1, 13))
    coords = np.array(list(all_cells))
    rows, cols = coords[:, 0], coords[:, 1]
    data = np.arange(300)

    # The second write stores different values at a later timestamp.
    writes = (
        (1, {"a": data, "b": data}),
        (2, {"a": data / 2, "b": data * 2}),
    )
    for stamp, payload in writes:
        with tiledb.SparseArray(path, mode="w", timestamp=stamp) as A:
            A[rows, cols] = payload
def _initialize_stat_values_store_if_needed(
        self, shape: Tuple[int, ...]) -> None:
    """
    Initialize storage for the benchmark statistics if it wasn't created yet.

    Does nothing when the stats array URI is set and an array already exists
    there. Otherwise a dense 2-D array with one int32 attribute ``read`` is
    created and filled with zeroes.

    :param shape: Shape of the stats map; the first two entries give the
        extents of the ``n`` and ``f`` dimensions.
    """
    if self.__tiledb_stats_array is not None and tiledb.array_exists(
            self.__tiledb_stats_array):
        return

    # Tile extents keep the original "extent - 1" sizing, but are clamped to
    # at least 1 so that an axis of length 1 does not produce a zero extent.
    n_tile = max(1, shape[0] - 1)
    f_tile = max(1, shape[1] - 1)

    # Create a dense array with two dimensions to store read statistics from
    # the latest benchmark run.
    dom = tiledb.Domain(
        tiledb.Dim(name='n', domain=(0, shape[0] - 1), tile=n_tile,
                   dtype=np.int64),
        tiledb.Dim(name='f', domain=(0, shape[1] - 1), tile=f_tile,
                   dtype=np.int64))

    # Schema contains one attribute for READ count
    schema = tiledb.ArraySchema(
        domain=dom,
        sparse=False,
        attrs=[tiledb.Attr(name='read', dtype=np.int32)])

    # Create the (empty) array on disk.
    tiledb.DenseArray.create(self.__tiledb_stats_array, schema)

    # Fill with zeroes
    with tiledb.DenseArray(self.__tiledb_stats_array, mode='w') as rr:
        rr[:] = np.zeros(shape, dtype=np.int32)
def uri(temp_rootdir):
    """
    Yield the path of a 25x12 test array with attributes "a" and "b",
    written at timestamps 1, 2 and 3; the array is removed afterwards.
    """
    path = os.path.abspath(os.path.join(temp_rootdir, "test_array"))
    ctx = tiledb.default_ctx()

    # Dimensions are created without explicit names.
    dims = [
        tiledb.Dim(ctx=ctx, domain=(1, upper), dtype=np.int64)
        for upper in (25, 12)
    ]
    dom = tiledb.Domain(*dims, ctx=ctx)

    attrs = tuple(
        tiledb.Attr(name=label, ctx=ctx, dtype=np.float64)
        for label in ("a", "b")
    )
    schema = tiledb.ArraySchema(ctx=ctx, domain=dom, attrs=attrs)
    tiledb.Array.create(path, schema)

    data = np.reshape(np.arange(300), (25, 12))
    # Same payload is written once per timestamp.
    for ts in (1, 2, 3):
        with tiledb.open(path, mode="w", timestamp=ts) as A:
            A[:] = {"a": data, "b": data}

    yield path

    shutil.rmtree(path)
def create_X(X_name, shape, is_sparse):
    """
    Create the empty X matrix array at ``X_name``.

    X is read in both row- and column-oriented patterns depending on the
    operation. Because of the data type, default compression works best.
    Tile sizes -- capped at (50, 100) for dense and (512, 2048) for sparse --
    and the global row/col layout were chosen empirically by benchmarking
    the current cellxgene backend.
    """
    n_obs_tile_cap, n_var_tile_cap = (512, 2048) if is_sparse else (50, 100)

    filters = tiledb.FilterList([tiledb.ZstdFilter()])
    attrs = [tiledb.Attr(dtype=np.float32, filters=filters)]

    # (name, extent, tile cap) for each axis of the matrix.
    axis_specs = (
        ("obs", shape[0], n_obs_tile_cap),
        ("var", shape[1], n_var_tile_cap),
    )
    domain = tiledb.Domain(*[
        tiledb.Dim(
            name=axis_name,
            domain=(0, extent - 1),
            tile=min(extent, cap),
            dtype=np.uint32,
        )
        for axis_name, extent, cap in axis_specs
    ])

    schema = tiledb.ArraySchema(
        domain=domain,
        sparse=is_sparse,
        attrs=attrs,
        cell_order="row-major",
        tile_order="col-major",
    )

    if is_sparse:
        tiledb.SparseArray.create(X_name, schema)
        return
    tiledb.DenseArray.create(X_name, schema)
def test_tiledb_test():
    """Check that a sub-range read is unaffected by an intervening full read.

    1000 random uint8 values are written to a 1000x1000 sparse array, which
    is then re-opened with a fresh context. The same slice is read before
    and after a full-array read, and both reads must return the same number
    of ``v`` results.
    """
    import tiledb

    n = 1000
    m = 1000
    num_vals = 1000

    n_idxs = np.sort(np.random.choice(n, num_vals, replace=False))
    m_idxs = np.sort(np.random.choice(m, num_vals, replace=False))
    values = np.random.randint(0, 100, num_vals, np.uint8)

    ctx = tiledb.Ctx()

    n_tile_extent = min(100, n)

    d1 = tiledb.Dim("ndom", domain=(0, n - 1), tile=n_tile_extent,
                    dtype="uint32", ctx=ctx)
    d2 = tiledb.Dim("mdom", domain=(0, m - 1), tile=m, dtype="uint32",
                    ctx=ctx)
    domain = tiledb.Domain(d1, d2, ctx=ctx)

    v = tiledb.Attr(
        "v",
        filters=tiledb.FilterList([tiledb.LZ4Filter(level=-1)]),
        dtype="uint8",
        ctx=ctx,
    )

    schema = tiledb.ArraySchema(
        domain=domain,
        attrs=(v, ),
        capacity=10000,
        cell_order="row-major",
        tile_order="row-major",
        sparse=True,
        ctx=ctx,
    )

    with tempfile.TemporaryDirectory() as tdir:
        path = os.path.join(tdir, "arr.tiledb")

        tiledb.SparseArray.create(path, schema)

        with tiledb.SparseArray(path, mode="w", ctx=ctx) as A:
            A[n_idxs, m_idxs] = values

        # Read back through a separate context. The reader is closed before
        # the temporary directory is removed.
        ctx2 = tiledb.Ctx()
        with tiledb.SparseArray(path, mode="r", ctx=ctx2) as s:
            vs1 = s[1:10, 1:50]
            _ = s[:, :]
            vs2 = s[1:10, 1:50]

        assert vs1["v"].shape[0] == vs2["v"].shape[0]
def testFromTileDB(self):
    """Check ``fromtiledb`` metadata and tiling for sparse and dense arrays.

    For each array kind a 30x20x10 array with tiles (7, 3, 4) is created in
    a fresh temporary directory. The tensor and its chunks must report the
    expected shape, chunk size, URI and axis offsets, and must carry the
    context's config only when a context is passed.
    """
    ctx = tiledb.Ctx()
    # (name, upper bound of the 1-based domain, tile extent)
    dim_specs = (("i", 30, 7), ("j", 20, 3), ("k", 10, 4))

    for sparse in (True, False):
        dims = [
            tiledb.Dim(ctx=ctx, name=label, domain=(1, upper), tile=extent,
                       dtype=np.int32)
            for label, upper, extent in dim_specs
        ]
        dom = tiledb.Domain(*dims, ctx=ctx)
        value_attr = tiledb.Attr(ctx=ctx, name='a', dtype=np.float32)
        schema = tiledb.ArraySchema(
            ctx=ctx, domain=dom, sparse=sparse, attrs=[value_attr])

        tempdir = tempfile.mkdtemp()
        try:
            # create tiledb array
            if sparse:
                tiledb.SparseArray.create(tempdir, schema)
            else:
                tiledb.DenseArray.create(tempdir, schema)

            tensor = fromtiledb(tempdir)
            source_op = tensor.op
            self.assertIsInstance(source_op, TensorTileDBDataSource)
            self.assertEqual(source_op.issparse(), sparse)
            self.assertEqual(tensor.shape, (30, 20, 10))
            self.assertEqual(tensor.extra_params.raw_chunk_size, (7, 3, 4))
            self.assertIsNone(source_op.tiledb_config)
            self.assertEqual(source_op.tiledb_uri, tempdir)
            self.assertIsNone(source_op.tiledb_key)
            self.assertIsNone(source_op.tiledb_timestamp)

            tensor = tensor.tiles()
            self.assertEqual(len(tensor.chunks), 105)

            first_chunk = tensor.chunks[0]
            self.assertIsInstance(first_chunk.op, TensorTileDBDataSource)
            self.assertEqual(first_chunk.op.issparse(), sparse)
            self.assertEqual(first_chunk.shape, (7, 3, 4))
            self.assertIsNone(first_chunk.op.tiledb_config)
            self.assertEqual(first_chunk.op.tiledb_uri, tempdir)
            self.assertIsNone(first_chunk.op.tiledb_key)
            self.assertIsNone(first_chunk.op.tiledb_timestamp)
            self.assertEqual(first_chunk.op.tiledb_dim_starts, (1, 1, 1))

            # test axis_offsets of chunk op
            self.assertEqual(first_chunk.op.axis_offsets, (0, 0, 0))
            self.assertEqual(tensor.chunks[1].op.axis_offsets, (0, 0, 4))
            offsets_by_chunk_index = (
                ((0, 2, 2), (0, 6, 8)),
                ((0, 6, 2), (0, 18, 8)),
                ((4, 6, 2), (28, 18, 8)),
            )
            for chunk_index, expected_offsets in offsets_by_chunk_index:
                self.assertEqual(
                    tensor.cix[chunk_index].op.axis_offsets, expected_offsets)

            # With an explicit context, its config is carried by the ops.
            tensor2 = fromtiledb(tempdir, ctx=ctx)
            self.assertEqual(tensor2.op.tiledb_config, ctx.config().dict())

            tensor2 = tensor2.tiles()
            self.assertEqual(tensor2.chunks[0].op.tiledb_config,
                             ctx.config().dict())
        finally:
            shutil.rmtree(tempdir)
def ccd(_input, bands, output=None, config=None, neighbourhood=7, overlap=1):
    """Run ``calculate_change`` over every tile of a TileDB array.

    If ``output`` is not given or does not exist, a dense float32 array with
    one attribute ``c`` is created. Its two dimensions copy the size and
    tile extent of dimensions 1 and 2 of the input. One ``calculate_change``
    task per tile is then submitted to ``client`` and gathered.

    :param _input: URI of the input dense array (read as three-dimensional).
    :param bands: exactly two band indexes.
    :param output: URI of the result array; when ``None`` a name of the form
        ``<_input>_result_XXXX`` with a random suffix is generated.
    :param config: TileDB configuration, also passed to dask as
        ``storage_options`` and on to every task.
    :param neighbourhood: forwarded to ``calculate_change``.
    :param overlap: not used by this function.
    :return: URI of the output array.
    :raises IndexError: if ``bands`` does not contain exactly two entries.
    """
    if len(bands) != 2:
        raise IndexError('CCD function requires two band indexes')

    if output is None or not os.path.exists(output):
        cfg = tiledb.Config(config)
        ctx = tiledb.Ctx(config=cfg)

        with tiledb.DenseArray(_input, 'r', ctx=ctx) as arr:
            y_dim = arr.schema.domain.dim(1)
            x_dim = arr.schema.domain.dim(2)
            height = y_dim.size
            width = x_dim.size
            tile_y_size = y_dim.tile
            tile_x_size = x_dim.tile

        dom = tiledb.Domain(
            tiledb.Dim(domain=(0, height - 1), tile=tile_y_size,
                       dtype=np.uint64),
            tiledb.Dim(domain=(0, width - 1), tile=tile_x_size,
                       dtype=np.uint64))

        schema = tiledb.ArraySchema(
            domain=dom,
            sparse=False,
            attrs=[tiledb.Attr(name="c", dtype=np.float32)],
            ctx=ctx)

        if output is None:
            suffix = ''.join(
                random.choice(string.ascii_uppercase + string.digits)
                for _ in range(4))  # noqa
            output = _input + '_result_' + suffix

        tiledb.DenseArray.create(output, schema)

    source = da.from_tiledb(_input, storage_options=config)
    _, h, w = source.shape
    _, tile_y_size, tile_x_size = source.chunksize

    # w and h are an exact multiple of tile size
    n_tiles_x = w // tile_x_size
    # Rows must be counted with the y tile size; dividing by the x tile size
    # gives the wrong row count whenever tiles are not square.
    n_tiles_y = h // tile_y_size

    # manually chunk and collect
    futures = [
        client.submit(calculate_change, _input, bands, neighbourhood,
                      tile_col, tile_row, tile_x_size, tile_y_size, output,
                      config)
        for tile_row in range(n_tiles_y)
        for tile_col in range(n_tiles_x)
    ]
    client.gather(futures)

    return output
def create_array(array_name, dim_medium, first_timestamp, last_timestamp,
                 dim_article, tile_extent):
    """Create an empty sparse array with dimensions medium, time and article.

    All three dimensions are ``uint64`` and share ``tile_extent``. The upper
    bounds of "medium" and "article" are ``dim_medium - tile_extent`` and
    ``dim_article - tile_extent``; "time" spans
    ``[first_timestamp, last_timestamp]``. The computed bounds are printed
    before the schema is built.
    """
    medium_upper = int(dim_medium - tile_extent)
    article_upper = int(dim_article - tile_extent)

    for bound in (medium_upper, first_timestamp, last_timestamp,
                  article_upper):
        print(bound)

    # (name, domain) for each dimension, in schema order.
    dim_specs = (
        ("medium", (1, medium_upper)),
        ("time", (first_timestamp, last_timestamp)),
        ("article", (1, article_upper)),
    )
    dom = tiledb.Domain(*[
        tiledb.Dim(name=label, domain=bounds, tile=tile_extent,
                   dtype=np.uint64)
        for label, bounds in dim_specs
    ])

    # The array will be sparse, having following attributes
    single_char = np.dtype("U1")
    fixed_attr_specs = (
        ("modyfication_date", np.uint64),
        ("medium_text", single_char),
        ("medium_group", single_char),
        ("medium_pageviews", np.uint64),
        ("is_blog", np.int8),
        ("url", single_char),
        ("advertising_value_equivalency", np.uint32),
        ("keyword", single_char),
        ("snippet", single_char),
        ("text", single_char),
        ("importance", np.float32),
        ("sentiment", np.float32),
    )
    # "title" is the only attribute declared with var=True.
    attrs = [tiledb.Attr(name="title", var=True, dtype="U")]
    attrs.extend(
        tiledb.Attr(name=label, dtype=cell_type)
        for label, cell_type in fixed_attr_specs
    )

    schema = tiledb.ArraySchema(domain=dom, sparse=True, attrs=attrs)

    # Create the (empty) array on disk.
    tiledb.SparseArray.create(array_name, schema)
def write_tiledb(arr, path, overwrite=True):
    """Write a 1-D or 2-D array to disk as a dense float32 TileDB array.

    :param arr: array to store; its values are cast to float32.
    :param path: output location. An existing directory there is removed
        first when ``overwrite`` is true.
    :param overwrite: whether an existing directory may be replaced.
    :raises FileExistsError: if ``path`` still exists after the overwrite step.
    :raises ValueError: if ``arr`` is neither 1-D nor 2-D.
    """
    if overwrite and os.path.isdir(path):
        shutil.rmtree(path)

    if os.path.exists(path):
        raise FileExistsError(f"Output path {path} already exists")

    ctx = tiledb.Ctx()

    n_rows = arr.shape[0]
    primary_dim = tiledb.Dim(
        ctx,
        GENOME_DOMAIN_NAME,
        domain=(0, n_rows - 1),
        tile=min(DEFAULT_GENOME_TILE_EXTENT, n_rows),
        dtype="uint32",
    )

    if arr.ndim == 1:
        domain = tiledb.Domain(ctx, primary_dim)
    elif arr.ndim == 2:
        n_cols = arr.shape[1]
        # The secondary axis is stored as a single tile.
        secondary_dim = tiledb.Dim(
            ctx,
            SECONDARY_DOMAIN_NAME,
            domain=(0, n_cols - 1),
            tile=n_cols,
            dtype="uint32",
        )
        domain = tiledb.Domain(ctx, primary_dim, secondary_dim)
    else:
        raise ValueError("tiledb backend only supports 1D or 2D arrays")

    value_attr = tiledb.Attr(
        ctx,
        GENOME_VALUE_NAME,
        compressor=(DEFAULT_COMPRESSOR, DEFAULT_COMPRESSOR_LEVEL),
        dtype="float32",
    )
    schema = tiledb.ArraySchema(
        ctx,
        domain=domain,
        attrs=(value_attr, ),
        cell_order="row-major",
        tile_order="row-major",
    )
    tiledb.DenseArray.create(path, schema)

    payload = {GENOME_VALUE_NAME: arr.astype(np.float32)}
    with tiledb.DenseArray(ctx, path, mode="w") as A:
        A[:] = payload
def create_array():
    """Create an empty dense 4x4 array at ``array_name`` with one int32 attribute "a"."""
    # Dimensions "rows" and "cols" both span [1, 4] in a single tile.
    axes = [
        tiledb.Dim(name=label, domain=(1, 4), tile=4, dtype=np.int32)
        for label in ("rows", "cols")
    ]
    dom = tiledb.Domain(*axes)

    # Dense array with a single attribute "a", so each (i, j) cell stores an
    # integer.
    cell_attr = tiledb.Attr(name="a", dtype=np.int32)
    schema = tiledb.ArraySchema(domain=dom, sparse=False, attrs=[cell_attr])

    # Create the (empty) array on disk.
    tiledb.DenseArray.create(array_name, schema)
def main():
    """Create the sparse array ``sparse_array_schema`` and print its schema.

    Prints the schema dump, selected schema properties, the attribute names,
    the domain dump and the dimension names.
    """
    ctx = tiledb.Ctx()

    # The first dimension and the first attribute are anonymous (empty name).
    anon_dim = tiledb.Dim(ctx, "", domain=(1, 1000), tile=10, dtype="uint64")
    named_dim = tiledb.Dim(
        ctx, "d2", domain=(101, 10000), tile=100, dtype="uint64")
    domain = tiledb.Domain(ctx, anon_dim, named_dim)

    anon_attr = tiledb.Attr(ctx, "", dtype="int32,int32,int32")
    gzip_attr = tiledb.Attr(
        ctx, "a2", compressor=("gzip", -1), dtype="float32")

    schema = tiledb.SparseArray(
        ctx,
        "sparse_array_schema",
        domain=domain,
        attrs=(anon_attr, gzip_attr),
        capacity=10,
        tile_order="row-major",
        cell_order="col-major",
        coords_compressor=("zstd", 4),
        offsets_compressor=("blosc-lz", 5),
    )
    schema.dump()

    # Print from schema
    print("From schema properties:")
    print("- Array type: ", "sparse" if schema.sparse else "dense")
    labelled_properties = (
        ("- Cell order: ", "cell_order"),
        ("- Tile order: ", "tile_order"),
        ("- Capacity: ", "capacity"),
        ("- Coordinates compressor: ", "coords_compressor"),
        ("- Offsets compressor: ", "offsets_compressor"),
    )
    for label, prop in labelled_properties:
        print(label, getattr(schema, prop))
    print()

    # Print the attribute names:
    print("Array schema attribute names: ")
    for attr_index in range(schema.nattr):
        print(f"* {schema.attr(attr_index).name!r}")
    print()

    # Print domain
    domain = schema.domain
    domain.dump()

    # print the dimension names
    print("Array schema dimension names: ")
    for dim_index in range(schema.ndim):
        print(f"* {domain.dim(dim_index).name!r}")
    print()
def create_array():
    """Create an empty sparse 10x10 array at ``array_name`` with one int32 attribute "a"."""
    ctx = tiledb.Ctx()

    # Dimensions "rows" and "cols" both span [1, 10] in a single tile.
    rows_dim = tiledb.Dim(
        ctx, name="rows", domain=(1, 10), tile=10, dtype=np.int32)
    cols_dim = tiledb.Dim(
        ctx, name="cols", domain=(1, 10), tile=10, dtype=np.int32)
    dom = tiledb.Domain(ctx, rows_dim, cols_dim)

    cell_attr = tiledb.Attr(ctx, name="a", dtype=np.int32)
    schema = tiledb.ArraySchema(
        ctx, domain=dom, sparse=True, attrs=[cell_attr])

    tiledb.SparseArray.create(array_name, schema)
def get_tiledb_schema_from_tensor(tensor, tiledb_ctx, nsplits, **kw):
    """Build a TileDB array schema that matches a tensor.

    One anonymous int64 dimension is created per tensor axis, spanning
    ``[0, extent - 1]`` and tiled by the largest split of that axis. A single
    attribute carries the tensor's dtype.

    :param tensor: tensor providing ``ndim``, ``shape``, ``dtype``, ``order``
        and ``issparse()``.
    :param tiledb_ctx: TileDB context used for every created object.
    :param nsplits: per-axis sequences of chunk sizes.
    :param kw: extra keyword arguments forwarded to ``tiledb.ArraySchema``.
    :return: the array schema.
    """
    from ..core import TensorOrder

    ctx = tiledb_ctx

    dims = [
        tiledb.Dim(
            name="",
            domain=(0, tensor.shape[axis] - 1),
            tile=max(nsplits[axis]),
            dtype=np.int64,
            ctx=ctx,
        )
        for axis in range(tensor.ndim)
    ]
    dom = tiledb.Domain(*dims, ctx=ctx)
    att = tiledb.Attr(ctx=ctx, dtype=tensor.dtype)

    # C-ordered tensors map to cell order 'C', everything else to 'F'.
    if tensor.order == TensorOrder.C_ORDER:
        cell_order = 'C'
    else:
        cell_order = 'F'

    return tiledb.ArraySchema(
        ctx=ctx,
        domain=dom,
        attrs=(att, ),
        sparse=tensor.issparse(),
        cell_order=cell_order,
        **kw)
def _ingest_in_tiledb(
    uri: str, data: np.ndarray, sparse: bool, batch_size: int, num_attrs: int
) -> None:
    """Create a TileDB array at ``uri`` and write ``data`` into every attribute.

    One int32 dimension ``dim_<i>`` is created per axis of ``data`` with a
    random tile extent. The random upper bound is ``batch_size`` for axis 0
    and the axis length for the others. The array gets ``num_attrs`` float32
    attributes named ``features_<k>``, all holding the same values. A sparse
    array is written only at the non-zero cells of ``data``.
    """
    dims = []
    for axis in range(data.ndim):
        extent = data.shape[axis]
        tile_upper = extent if axis > 0 else batch_size
        dims.append(
            tiledb.Dim(
                name=f"dim_{axis}",
                domain=(0, extent - 1),
                tile=np.random.randint(1, tile_upper),
                dtype=np.int32,
            )
        )

    attr_names = [f"features_{attr}" for attr in range(num_attrs)]

    # TileDB schema
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(*dims),
        sparse=sparse,
        attrs=[tiledb.Attr(name=label, dtype=np.float32) for label in attr_names],
    )

    # Create the (empty) array on disk.
    tiledb.Array.create(uri, schema)

    # Ingest
    with tiledb.open(uri, "w") as tiledb_array:
        if sparse:
            idx = np.nonzero(data)
        else:
            idx = slice(None)
        tiledb_array[idx] = {label: data[idx] for label in attr_names}
def test_int_dtypes(self, runner, temp_rootdir, sparse, dtype):
    """Check that ``dump array`` succeeds for integer dimension/attribute dtypes.

    A 1-D array over ``[1, 10]`` is created with ``dtype`` for both the
    dimension and the attribute, filled with random integers, and dumped
    for a single index and for the full range.
    """
    array_name = (
        "test_int_dtypes_"
        f"{'sparse' if sparse else 'dense'}_"
        f"{np.dtype(dtype).name}"
    )
    # mkdtemp() returns an absolute path, so joining it after temp_rootdir
    # discarded temp_rootdir and left a stray directory in the system temp
    # location. Creating the directory inside temp_rootdir keeps the array
    # under the fixture's root.
    uri = os.path.abspath(
        os.path.join(tempfile.mkdtemp(dir=temp_rootdir), array_name))

    dom = tiledb.Domain(tiledb.Dim(domain=(1, 10), dtype=dtype))
    att = tiledb.Attr(dtype=dtype)
    schema = tiledb.ArraySchema(domain=dom, attrs=(att, ), sparse=sparse)
    tiledb.Array.create(uri, schema)

    with tiledb.open(uri, mode="w") as A:
        if sparse:
            A[np.arange(1, 11)] = np.random.randint(10, size=10, dtype=dtype)
        else:
            A[:] = np.random.randint(10, size=10, dtype=dtype)

    result = runner.invoke(root, ["dump", "array", uri, "5"])
    assert result.exit_code == 0

    result = runner.invoke(root, ["dump", "array", uri, "1:10"])
    assert result.exit_code == 0
def create_tiledb_datetime_example(tmpdir):
    """Create a dense array indexed by a daily datetime dimension.

    Sixteen float64 "temperature" values are stored along a "date" dimension
    running from 2000-01-01 to 2000-01-16 with a tile of four days.

    :param tmpdir: directory object whose ``join`` yields the array location.
    :return: tuple of the array URI and the expected ``xr.Dataset``.
    """
    first_day = np.datetime64("2000-01-01")
    last_day = np.datetime64("2000-01-16")

    _data = np.linspace(-1.0, 20.0, num=16, endpoint=True, dtype=np.float64)
    _date = np.arange(np.datetime64("2000-01-01"), np.datetime64("2000-01-17"))

    # Create expected dataset
    temperature = xr.DataArray(data=_data, dims="date")
    expected = xr.Dataset(
        data_vars={"temperature": temperature},
        coords={"date": _date},
    )

    # Create TileDB array
    array_uri = str(tmpdir.join("tiledb_example_2"))
    date_dim = tiledb.Dim(
        name="date",
        domain=(first_day, last_day),
        tile=np.timedelta64(4, "D"),
        dtype=np.datetime64("", "D"),
    )
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(date_dim),
        attrs=[tiledb.Attr(name="temperature", dtype=np.float64)],
    )
    tiledb.DenseArray.create(array_uri, schema)

    with tiledb.DenseArray(array_uri, mode="w") as array:
        array[:] = {"temperature": _data}

    return array_uri, expected
def _create_array(self) -> None:
    """Create the dense TileDB array that stores a Sklearn model.

    The array has a single cell (dimension "model" over ``[1, 1]``) and one
    variable-length bytes attribute "model_params" compressed with Zstd.
    """
    ctx = self.ctx

    model_dim = tiledb.Dim(
        name="model", domain=(1, 1), tile=1, dtype=np.int32, ctx=ctx)
    dom = tiledb.Domain(model_dim)

    params_attr = tiledb.Attr(
        name="model_params",
        dtype=bytes,
        var=True,
        filters=tiledb.FilterList([tiledb.ZstdFilter()]),
        ctx=ctx,
    )

    schema = tiledb.ArraySchema(
        domain=dom, sparse=False, attrs=[params_attr], ctx=ctx)
    tiledb.Array.create(self.uri, schema, ctx=ctx)

    # In case we are on TileDB-Cloud we have to update model array's file properties
    if not self.namespace:
        return
    update_file_properties(self.uri, self._file_properties)
def test_dim_start_float():
    """Check that ``fromtiledb`` raises ValueError for a float64 dimension."""
    ctx = tiledb.Ctx()

    float_dim = tiledb.Dim(
        ctx=ctx, name="i", domain=(0.0, 6.0), tile=6, dtype=np.float64)
    dom = tiledb.Domain(float_dim, ctx=ctx)
    value_attr = tiledb.Attr(ctx=ctx, name='a', dtype=np.float32)
    schema = tiledb.ArraySchema(
        ctx=ctx, domain=dom, sparse=True, attrs=[value_attr])

    tempdir = tempfile.mkdtemp()
    try:
        # create tiledb array
        tiledb.SparseArray.create(tempdir, schema)

        with pytest.raises(ValueError):
            fromtiledb(tempdir, ctx=ctx)
    finally:
        shutil.rmtree(tempdir)
def test_datetime_dtype(self, runner, temp_rootdir, dtype):
    """Check that ``dump array`` succeeds for datetime dimension/attribute dtypes.

    A sparse 1-D array from 1970-01-01 to 1980-01-01 is created with
    ``dtype`` for both the dimension and the attribute, filled with ten
    random integers, and dumped for a single date and for the full range.
    """
    array_name = f"test_datetime_dtype_{np.dtype(dtype).name}"
    # mkdtemp() returns an absolute path, so joining it after temp_rootdir
    # discarded temp_rootdir and left a stray directory in the system temp
    # location. Creating the directory inside temp_rootdir keeps the array
    # under the fixture's root.
    uri = os.path.abspath(
        os.path.join(tempfile.mkdtemp(dir=temp_rootdir), array_name))

    dom = tiledb.Domain(
        tiledb.Dim(
            domain=(np.datetime64("1970-01-01"), np.datetime64("1980-01-01")),
            dtype=dtype,
        ))
    att = tiledb.Attr(dtype=dtype)
    schema = tiledb.ArraySchema(domain=dom, attrs=(att, ), sparse=True)
    tiledb.Array.create(uri, schema)

    with tiledb.open(uri, mode="w") as A:
        A[np.arange(1, 11)] = np.random.randint(low=1, high=10, size=10)

    result = runner.invoke(root, ["dump", "array", uri, "'1970-01-04'"])
    assert result.exit_code == 0

    result = runner.invoke(
        root, ["dump", "array", uri, "'1970-01-01':'1980-01-01'"])
    assert result.exit_code == 0
def _create_tiledb_dim(self, dim_name, coords):
    """Build the int64 TileDB dimension for ``dim_name``.

    :param dim_name: name of a variable in ``self.data_model.variables``.
    :param coords: when true, the dimension is named ``<dim_name>_coord``.
    :return: a ``tiledb.Dim`` over ``[0, domain_max]`` tiled by the chunks
        of the variable.
    """
    dim_coord = self.data_model.variables[dim_name]
    chunks = self.data_model.get_chunks(dim_name)

    if dim_name == self._scalar_unlimited:
        # Scalar dimensions have length one and a single-element chunk.
        dim_coord_len, chunks = 1, (1, )
    else:
        # TODO: work out nD coords (although a DimCoord will never be nD).
        (dim_coord_len, ) = dim_coord.shape

    # The tdb dimension dtype is always `int64`, regardless of input:
    # dense array schemas need int indices, and all tdb dims in a domain
    # must have exactly the same dtype.
    dim_dtype = np.int64

    # An unlimited dim (flagged either by `self.unlimited_dims` or by the
    # data model) gets a domain reaching almost to the dtype maximum.
    is_unlimited = (dim_name in self.unlimited_dims
                    or dim_name in self.data_model.unlimited_dim_coords)
    if is_unlimited:
        domain_max = np.iinfo(dim_dtype).max - dim_coord_len
    else:
        domain_max = dim_coord_len

    # Array attrs and dimensions must have different names, so the dimension
    # describing the domain of a dim coord array gets a suffix.
    tdb_dim_name = f'{dim_name}_coord' if coords else dim_name

    return tiledb.Dim(
        name=tdb_dim_name,
        domain=(0, domain_max),
        tile=chunks,
        dtype=dim_dtype)
def create_new_array(size, array_out_name, tile_size, attribute_config,
                     compressor='gzip', compression_level=-1):
    '''
    Creates an empty 1-D dense tileDB array over 'genome_coordinate'.

    The tile extent is capped at ``size``. One Gzip-filtered attribute is
    created per entry returned by ``get_attribute_info(attribute_config)``.
    ``compressor`` and ``compression_level`` are accepted but not used here.
    '''
    coord_dim = tiledb.Dim(
        name='genome_coordinate',
        domain=(0, size - 1),
        tile=min(size, tile_size),
        dtype='uint32')
    tiledb_dom = tiledb.Domain(coord_dim, ctx=tdb_Context)

    # generate the attribute information
    attribute_info = get_attribute_info(attribute_config)
    attribs = tuple(
        tiledb.Attr(
            name=key,
            filters=tiledb.FilterList([tiledb.GzipFilter()]),
            dtype=attribute_info[key]['dtype'])
        for key in attribute_info
    )

    tiledb_schema = tiledb.ArraySchema(
        domain=tiledb_dom,
        attrs=attribs,
        cell_order='row-major',
        tile_order='row-major')

    tiledb.DenseArray.create(array_out_name, tiledb_schema, ctx=tdb_Context)
    print("created empty array on disk")
    gc.collect()
def create_new_array(tdb_Context, size, array_out_name, coord_tile_size,
                     task_tile_size, attribute_config, attribute_config_file,
                     compressor='gzip', compression_level=-1, var=False):
    '''
    Creates an empty 2-D dense tileDB array over 'genome_coordinate' and 'task'.

    size = tuple(num_indices, num_tasks)

    The coordinate tile is capped at ``size[0]``; the task tile is capped at
    ``size[1]`` but never below 1. One Gzip-filtered attribute is created per
    entry returned by ``get_attribute_info``. ``compressor`` and
    ``compression_level`` are accepted but not used here.
    '''
    n_indices, n_tasks = size[0], size[1]
    coord_tile = min(n_indices, coord_tile_size)
    task_tile = max(1, min(n_tasks, task_tile_size))

    coords_dim = tiledb.Dim(
        name='genome_coordinate',
        domain=(0, n_indices),
        tile=coord_tile,
        dtype='uint32')
    tasks_dim = tiledb.Dim(
        name='task',
        domain=(0, n_tasks),
        tile=task_tile,
        dtype='uint32')
    tiledb_dom = tiledb.Domain(coords_dim, tasks_dim, ctx=tdb_Context)

    # generate the attribute information
    attribute_info = get_attribute_info(attribute_config,
                                        attribute_config_file)
    attribs = tuple(
        tiledb.Attr(
            name=key,
            var=var,
            filters=tiledb.FilterList([tiledb.GzipFilter()]),
            dtype=attribute_info[key]['dtype'])
        for key in attribute_info
    )

    tiledb_schema = tiledb.ArraySchema(
        domain=tiledb_dom,
        attrs=attribs,
        cell_order='row-major',
        tile_order='row-major')

    tiledb.DenseArray.create(array_out_name, tiledb_schema)
    print("created empty array on disk")
def create_test_array_dense_25x12(temp_rootdir):
    """
    Create a 25x12 test array with one anonymous int64 attribute holding
    the values 0..299.
    """
    path = os.path.abspath(os.path.join(temp_rootdir, "dense_25x12"))
    ctx = tiledb.default_ctx()

    # (name, upper bound of the 1-based domain)
    axes = [
        tiledb.Dim(label, ctx=ctx, domain=(1, upper), dtype=np.int64)
        for label, upper in (("row", 25), ("col", 12))
    ]
    dom = tiledb.Domain(*axes, ctx=ctx)

    value_attr = tiledb.Attr(ctx=ctx, dtype=np.int64)
    schema = tiledb.ArraySchema(ctx=ctx, domain=dom, attrs=(value_attr,))
    tiledb.DenseArray.create(path, schema)

    cell_values = np.reshape(np.arange(300), (25, 12))
    with tiledb.DenseArray(path, mode="w") as A:
        A[:] = cell_values
def create_tiledb_example(tmpdir):
    """Create a dense 8x4 array with two attributes and matching metadata.

    The array has dimensions "time" over ``[1, 8]`` and "x" over ``[1, 4]``
    and the attributes "count" (int32) and "pressure" (float64). Two global
    metadata entries and one ``long_name`` entry per attribute are written.

    :param tmpdir: directory object whose ``join`` yields the array location.
    :return: tuple of the array URI and the expected ``xr.Dataset``.
    """
    # Define data
    float_data = np.linspace(
        -1.0, 1.0, num=32, endpoint=True, dtype=np.float64
    ).reshape(8, 4)
    int_data = np.arange(0, 32, dtype=np.int32).reshape(8, 4)

    # Create expected dataset
    expected = xr.Dataset(
        data_vars={
            "pressure": xr.DataArray(
                data=float_data,
                dims=["time", "x"],
                attrs={"long_name": "example float data"},
            ),
            "count": xr.DataArray(
                data=int_data,
                dims=["time", "x"],
                attrs={"long_name": "example int data"},
            ),
        },
        coords={"time": np.arange(1, 9), "x": np.arange(1, 5)},
        attrs={"global_1": "value1", "global_2": "value2"},
    )

    array_uri = str(tmpdir.join("tiledb_example_1"))
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(
            tiledb.Dim(name="time", domain=(1, 8), tile=4, dtype=np.int32),
            tiledb.Dim(name="x", domain=(1, 4), tile=4, dtype=np.int32),
        ),
        sparse=False,
        attrs=[
            tiledb.Attr(name="count", dtype=np.int32),
            tiledb.Attr(name="pressure", dtype=np.float64),
        ],
    )
    tiledb.DenseArray.create(array_uri, schema)

    with tiledb.DenseArray(array_uri, mode="w") as array:
        array[:, :] = {
            "pressure": float_data,
            "count": int_data,
        }
        array.meta["global_1"] = "value1"
        array.meta["global_2"] = "value2"
        # Per-attribute metadata is keyed by the attribute name, so that it
        # lines up with the "long_name" attrs of "pressure" and "count" in
        # the expected dataset. The keys previously used the local variable
        # names (float_data / int_data), which match no attribute.
        # NOTE(review): confirm the key convention against the reading backend.
        array.meta["__tiledb_attr.pressure.long_name"] = "example float data"
        array.meta["__tiledb_attr.count.long_name"] = "example int data"

    return array_uri, expected