Ejemplo n.º 1
0
def uri(temp_rootdir):
    """
    Build a 25x12 dense test array and yield its absolute path.

    The array has two unnamed int64 dimensions (1..25 and 1..12) and two
    float64 attributes, "a" and "b". The same 0..299 grid is written to both
    attributes at timestamps 1, 2 and 3. Once the generator is resumed after
    the yield, the array directory is deleted.
    """
    array_path = os.path.abspath(os.path.join(temp_rootdir, "test_array"))
    ctx = tiledb.default_ctx()

    dims = [
        tiledb.Dim(ctx=ctx, domain=(1, extent), dtype=np.int64)
        for extent in (25, 12)
    ]
    attrs = tuple(
        tiledb.Attr(name=attr_name, ctx=ctx, dtype=np.float64)
        for attr_name in ("a", "b")
    )
    schema = tiledb.ArraySchema(
        ctx=ctx,
        domain=tiledb.Domain(*dims, ctx=ctx),
        attrs=attrs,
    )
    tiledb.Array.create(array_path, schema)

    values = np.arange(300).reshape(25, 12)
    # One full write per timestamp, so the array ends up with three versions.
    for timestamp in (1, 2, 3):
        with tiledb.open(array_path, mode="w", timestamp=timestamp) as array:
            array[:] = {"a": values, "b": values}

    yield array_path

    shutil.rmtree(array_path)
Ejemplo n.º 2
0
def create_test_array_sparse_25x12_mult(temp_rootdir):
    """
    Build a 25x12 sparse test array that is fully written at two timestamps.

    Dimensions are "row" (1..25) and "col" (1..12); attributes are the float64
    "a" and "b". Timestamp 1 stores 0..299 in both attributes; timestamp 2
    stores half of those values in "a" and double in "b".
    """
    array_path = os.path.abspath(os.path.join(temp_rootdir, "sparse_25x12_mult"))
    ctx = tiledb.default_ctx()

    domain = tiledb.Domain(
        tiledb.Dim("row", ctx=ctx, domain=(1, 25), dtype=np.int64),
        tiledb.Dim("col", ctx=ctx, domain=(1, 12), dtype=np.int64),
        ctx=ctx,
    )
    attrs = (
        tiledb.Attr(name="a", ctx=ctx, dtype=np.float64),
        tiledb.Attr(name="b", ctx=ctx, dtype=np.float64),
    )
    schema = tiledb.ArraySchema(ctx=ctx, sparse=True, domain=domain, attrs=attrs)
    tiledb.SparseArray.create(array_path, schema)

    # Enumerate every (row, col) pair of the domain, rows varying slowest.
    cells = np.array(list(itertools.product(np.arange(1, 26), np.arange(1, 13))))
    row_coords, col_coords = cells[:, 0], cells[:, 1]
    values = np.arange(300)

    writes = (
        (1, {"a": values, "b": values}),
        (2, {"a": values / 2, "b": values * 2}),
    )
    for timestamp, payload in writes:
        with tiledb.SparseArray(array_path, mode="w", timestamp=timestamp) as array:
            array[row_coords, col_coords] = payload
Ejemplo n.º 3
0
def main():
    """Construct the sparse array "my_sparse_array" over a 4x4 uint64 domain.

    The array is given three attributes: "a1" (int32), "a2" (S10) and "a3"
    (a pair of float32 values), each with its own compressor setting.
    """
    ctx = tiledb.Ctx()

    # Two dimensions spanning 1..4, each with a tile extent of 2.
    dims = [
        tiledb.Dim(ctx, dim_name, domain=(1, 4), tile=2, dtype="uint64")
        for dim_name in ("d1", "d2")
    ]
    domain = tiledb.Domain(ctx, *dims)

    attrs = (
        tiledb.Attr(ctx, "a1", compressor=("blosc-lz", -1), dtype="int32"),
        tiledb.Attr(ctx, "a2", compressor=("gzip", -1), dtype="S10"),
        tiledb.Attr(ctx, "a3", compressor=("zstd", -1), dtype="float32,float32"),
    )

    tiledb.SparseArray(
        ctx,
        "my_sparse_array",
        domain=domain,
        attrs=attrs,
        capacity=2,
        cell_order="row-major",
        tile_order="row-major",
    )
Ejemplo n.º 4
0
    def test_sparse_schema(self):
        """Check that a sparse array reports back the layout options it was built with."""
        ctx = tiledb.Ctx()

        domain = tiledb.Domain(
            ctx,
            tiledb.Dim(ctx, "", domain=(1, 1000), tile=10, dtype="uint64"),
            tiledb.Dim(ctx, "d2", domain=(101, 10000), tile=100, dtype="uint64"),
        )
        attrs = (
            tiledb.Attr(ctx, "", dtype="int32,int32,int32"),
            tiledb.Attr(ctx, "a2", compressor=("gzip", -1), dtype="float32"),
        )

        # The same mapping drives both the constructor call and the checks,
        # so each option passed in is compared with the property of that name.
        options = {
            "capacity": 10,
            "cell_order": "col-major",
            "tile_order": "row-major",
            "coords_compressor": ("zstd", 4),
            "offsets_compressor": ("blosc-lz", 5),
        }
        schema = tiledb.SparseArray(
            ctx,
            self.path("sparse_array_schema"),
            domain=domain,
            attrs=attrs,
            **options
        )

        for option_name, expected in options.items():
            self.assertEqual(getattr(schema, option_name), expected)
Ejemplo n.º 5
0
def main():
    """Build the "sparse_array_schema" sparse array and print what its schema reports.

    Output covers the schema dump, the layout properties, the attribute names,
    the domain dump and the dimension names, in that order.
    """
    ctx = tiledb.Ctx()

    domain = tiledb.Domain(
        ctx,
        tiledb.Dim(ctx, "", domain=(1, 1000), tile=10, dtype="uint64"),
        tiledb.Dim(ctx, "d2", domain=(101, 10000), tile=100, dtype="uint64"),
    )
    attrs = (
        tiledb.Attr(ctx, "", dtype="int32,int32,int32"),
        tiledb.Attr(ctx, "a2", compressor=("gzip", -1), dtype="float32"),
    )
    schema = tiledb.SparseArray(
        ctx,
        "sparse_array_schema",
        domain=domain,
        attrs=attrs,
        capacity=10,
        tile_order="row-major",
        cell_order="col-major",
        coords_compressor=("zstd", 4),
        offsets_compressor=("blosc-lz", 5),
    )
    schema.dump()

    # Layout properties as exposed by the schema object.
    print("From schema properties:")
    if schema.sparse:
        array_kind = "sparse"
    else:
        array_kind = "dense"
    print("- Array type: ", array_kind)
    print("- Cell order: ", schema.cell_order)
    print("- Tile order: ", schema.tile_order)
    print("- Capacity: ", schema.capacity)
    print("- Coordinates compressor: ", schema.coords_compressor)
    print("- Offsets compressor: ", schema.offsets_compressor)
    print()

    # Attribute names, looked up by position.
    print("Array schema attribute names: ")
    for attr_index in range(schema.nattr):
        attribute = schema.attr(attr_index)
        print("* {!r}".format(attribute.name))
    print()

    schema_domain = schema.domain
    schema_domain.dump()

    # Dimension names, looked up by position on the schema's domain.
    print("Array schema dimension names: ")
    for dim_index in range(schema.ndim):
        print("* {!r}".format(schema_domain.dim(dim_index).name))
    print()
Ejemplo n.º 6
0
def time_tiledb(dataset, batch_size=1, num_batches=1):
    """
    Run ``time_batches`` against the TileDB array stored next to ``dataset``.

    The array lives at ``dataset + "_tileDB"``. If nothing exists at that path,
    a dense 2-D array is created first: ``batch_size * num_batches`` rows by
    785 columns (uint64 indices), a single anonymous int64 attribute, and one
    tile covering the whole domain. The array is then opened in write mode,
    passed to ``time_batches`` and closed afterwards.

    :param dataset: Path prefix of the dataset; "_tileDB" is appended to it.
    :param batch_size: Number of rows per batch.
    :param num_batches: Number of batches.
    """
    array_uri = dataset + "_tileDB"

    if not os.path.exists(array_uri):
        n_rows = batch_size * num_batches
        y_dim = tiledb.Dim(
            name="y",
            domain=(0, n_rows - 1),
            tile=n_rows,
            dtype="uint64",
        )
        x_dim = tiledb.Dim(name="x", domain=(0, 784), tile=785, dtype="uint64")
        schema = tiledb.ArraySchema(
            domain=tiledb.Domain(y_dim, x_dim),
            attrs=[tiledb.Attr(name="", dtype="int64", var=False)],
            cell_order="row-major",
            tile_order="row-major",
            sparse=False,
        )
        tiledb.Array.create(array_uri, schema)

    # The context manager closes the array even when the type check or the
    # timing run raises; previously the handle was never released.
    with tiledb.open(array_uri, mode="w") as ds_tldb:
        # Exact type comparison kept as in the original check.
        assert type(ds_tldb) == tiledb.array.DenseArray
        time_batches(ds_tldb, batch_size, num_batches)
Ejemplo n.º 7
0
def create_tiledb_datetime_example(tmpdir):
    """Write a 16-day temperature series to TileDB and return it with its xarray twin.

    Returns a ``(array_uri, expected)`` pair: the URI of a dense array under
    ``tmpdir`` with a daily "date" dimension and a float64 "temperature"
    attribute, and the ``xr.Dataset`` built from the same values.
    """
    temperatures = np.linspace(-1.0, 20.0, num=16, endpoint=True, dtype=np.float64)
    dates = np.arange(np.datetime64("2000-01-01"), np.datetime64("2000-01-17"))

    # Reference dataset holding the same values as the array.
    temperature_var = xr.DataArray(data=temperatures, dims="date")
    expected = xr.Dataset(
        data_vars={"temperature": temperature_var},
        coords={"date": dates},
    )

    array_uri = str(tmpdir.join("tiledb_example_2"))

    # Day-resolution dimension covering 2000-01-01 .. 2000-01-16, 4 days per tile.
    date_dim = tiledb.Dim(
        name="date",
        domain=(np.datetime64("2000-01-01"), np.datetime64("2000-01-16")),
        tile=np.timedelta64(4, "D"),
        dtype=np.datetime64("", "D"),
    )
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(date_dim),
        attrs=[tiledb.Attr(name="temperature", dtype=np.float64)],
    )
    tiledb.DenseArray.create(array_uri, schema)

    with tiledb.DenseArray(array_uri, mode="w") as array:
        array[:] = {"temperature": temperatures}

    return array_uri, expected
Ejemplo n.º 8
0
def main():
    """Walk through basic operations on the TileDB key-value store "my_kv".

    Covers creation, bulk update, item lookup (including a missing key), item
    assignment, consolidation and conversion to a Python dict, printing the
    outcome of each step.
    """
    ctx = tiledb.Ctx()

    # KV objects are limited to storing string keys/values for the time being
    value_attr = tiledb.Attr(ctx, "value", compressor=("gzip", -1), dtype=bytes)
    kv = tiledb.KV(ctx, "my_kv", attrs=(value_attr,))
    kv.dump()

    # Bulk insert
    initial_items = {"key1": "a", "key2": "bb", "key3": "dddd"}
    print("Updating KV with values: {!r}\n".format(initial_items))
    kv.update(initial_items)

    # Lookup of an existing key
    print("KV value for 'key3': {}\n".format(kv['key3']))

    # Lookup of a key that was never stored
    try:
        kv["don't exist"]
    except KeyError:
        print("KeyError was raised for key 'don't exist'\n")

    # Overwrite a single item
    kv['key3'] = "eeeee"
    print("Updated KV value for 'key3': {}\n".format(kv['key3']))

    kv.consolidate()

    as_dict = dict(kv)
    print("Convert to Python dict: {!r}\n".format(as_dict))
Ejemplo n.º 9
0
    def _create_array(self) -> None:
        """Create the single-cell dense array that stores a Sklearn model.

        The array has one int32 dimension "model" fixed at (1, 1) and one
        variable-length bytes attribute "model_params" compressed with Zstd.
        """
        model_dim = tiledb.Dim(
            name="model",
            domain=(1, 1),
            tile=1,
            dtype=np.int32,
            ctx=self.ctx,
        )
        params_attr = tiledb.Attr(
            name="model_params",
            dtype=bytes,
            var=True,
            filters=tiledb.FilterList([tiledb.ZstdFilter()]),
            ctx=self.ctx,
        )
        schema = tiledb.ArraySchema(
            domain=tiledb.Domain(model_dim),
            sparse=False,
            attrs=[params_attr],
            ctx=self.ctx,
        )
        tiledb.Array.create(self.uri, schema, ctx=self.ctx)

        # In case we are on TileDB-Cloud we have to update model array's file properties
        if self.namespace:
            update_file_properties(self.uri, self._file_properties)
Ejemplo n.º 10
0
    def create_domain_arrays(self, domain_vars, domain_name, coords=False):
        """Create an empty dense array, with a single attribute, for each named variable.

        For every name in ``domain_vars`` the variable is looked up in
        ``self.data_model.variables``; its dimensions become the array domain
        and its dtype the dtype of the array's only attribute. The array is
        created at the path built from ``domain_name`` and the variable name.
        """
        for var_name in domain_vars:
            variable = self.data_model.variables[var_name]

            dim_names = variable.dimensions
            # A dimensionless variable that is the scalar append dimension
            # is given itself as its only dimension.
            if len(dim_names) == 0 and var_name == self._scalar_unlimited:
                dim_names = [self._scalar_unlimited]

            tdb_dims = []
            for dim_name in dim_names:
                tdb_dims.append(self._create_tiledb_dim(dim_name, coords))
            tdb_domain = tiledb.Domain(*tdb_dims)

            var_attr = tiledb.Attr(name=var_name, dtype=variable.dtype)

            array_uri = self.array_path.construct_path(domain_name, var_name)

            # Only the schema is written here; no data is stored yet.
            schema = tiledb.ArraySchema(
                domain=tdb_domain,
                sparse=False,
                attrs=[var_attr],
                ctx=self.ctx,
            )
            tiledb.Array.create(array_uri, schema)
Ejemplo n.º 11
0
def get_tiledb_schema_from_tensor(tensor, tiledb_ctx, nsplits, **kw):
    """Derive a TileDB array schema from a tensor's shape, dtype, order and chunking.

    Each tensor axis becomes an unnamed int64 dimension over
    ``0 .. shape[axis] - 1`` whose tile extent is the largest entry of
    ``nsplits[axis]``. The schema has one attribute with the tensor's dtype,
    is sparse when ``tensor.issparse()`` says so, and uses cell order 'C' for
    C-ordered tensors and 'F' otherwise. Extra keyword arguments are forwarded
    to ``tiledb.ArraySchema``.
    """
    from ..core import TensorOrder

    dims = [
        tiledb.Dim(
            name="",
            domain=(0, tensor.shape[axis] - 1),
            tile=max(nsplits[axis]),
            dtype=np.int64,
            ctx=tiledb_ctx,
        )
        for axis in range(tensor.ndim)
    ]
    domain = tiledb.Domain(*dims, ctx=tiledb_ctx)
    attribute = tiledb.Attr(ctx=tiledb_ctx, dtype=tensor.dtype)

    if tensor.order == TensorOrder.C_ORDER:
        cell_order = 'C'
    else:
        cell_order = 'F'

    return tiledb.ArraySchema(
        ctx=tiledb_ctx,
        domain=domain,
        attrs=(attribute,),
        sparse=tensor.issparse(),
        cell_order=cell_order,
        **kw
    )
Ejemplo n.º 12
0
    def _initialize_stat_values_store_if_needed(
            self, shape: Tuple[int, ...]) -> None:
        """
        Initialize storage for the benchmark statistics if it wasn't created yet.

        Nothing happens when the stats array URI is set and an array already
        exists there. Otherwise a dense 2-D array with dimensions 'n' and 'f'
        and a single int32 attribute 'read' is created and filled with zeroes.

        :param shape: Shape of the stats map; the first two entries size the
            'n' and 'f' dimensions.
        """

        if self.__tiledb_stats_array is not None and tiledb.array_exists(
                self.__tiledb_stats_array):
            return

        n_size, f_size = shape[0], shape[1]
        # Each dimension is stored as one tile covering the whole domain. The
        # extent used to be ``size - 1``, which is 0 (an invalid extent) for a
        # dimension of size 1 and otherwise split the domain into two tiles.
        dom = tiledb.Domain(
            tiledb.Dim(name='n',
                       domain=(0, n_size - 1),
                       tile=n_size,
                       dtype=np.int64),
            tiledb.Dim(name='f',
                       domain=(0, f_size - 1),
                       tile=f_size,
                       dtype=np.int64))
        # Schema contains one attribute for READ count
        schema = tiledb.ArraySchema(
            domain=dom,
            sparse=False,
            attrs=[tiledb.Attr(name='read', dtype=np.int32)])
        # Create the (empty) array on disk.
        tiledb.DenseArray.create(self.__tiledb_stats_array, schema)
        # Fill with zeroes
        with tiledb.DenseArray(self.__tiledb_stats_array, mode='w') as rr:
            rr[:] = np.zeros(shape, dtype=np.int32)
Ejemplo n.º 13
0
 def create_matrix_array(matrix_name, number_of_rows, number_of_columns,
                         encode_as_sparse_array):
     """Create an empty float32 matrix array with "obs" and "var" dimensions.

     Both dimensions are uint32 and zero-based. Tile extents are capped at
     (512, 2048) for a sparse array and (50, 100) for a dense one, and never
     exceed the dimension length. The single anonymous attribute is
     Zstd-compressed; cells are row-major and tiles column-major.
     """
     if encode_as_sparse_array:
         max_obs_tile, max_var_tile = 512, 2048
     else:
         max_obs_tile, max_var_tile = 50, 100

     zstd_filters = tiledb.FilterList([tiledb.ZstdFilter()])
     attrs = [tiledb.Attr(dtype=np.float32, filters=zstd_filters)]

     obs_dim = tiledb.Dim(name="obs",
                          domain=(0, number_of_rows - 1),
                          tile=min(number_of_rows, max_obs_tile),
                          dtype=np.uint32)
     var_dim = tiledb.Dim(name="var",
                          domain=(0, number_of_columns - 1),
                          tile=min(number_of_columns, max_var_tile),
                          dtype=np.uint32)

     schema = tiledb.ArraySchema(domain=tiledb.Domain(obs_dim, var_dim),
                                 sparse=encode_as_sparse_array,
                                 attrs=attrs,
                                 cell_order="row-major",
                                 tile_order="col-major")

     # Create through the array class that matches the chosen encoding.
     array_cls = tiledb.SparseArray if encode_as_sparse_array else tiledb.DenseArray
     array_cls.create(matrix_name, schema)
Ejemplo n.º 14
0
 def create_tiledb_array(self, n_slots, description):
     """Create an empty dense array for one sensor and return its path.

     The path comes from ``self.sensor_data_path(description['code'])``. The
     leading int32 dimension "delta_t" spans ``0 .. n_slots``; it is followed
     by one int32 dimension "dim<i>" per entry of ``description['shape']``.
     Every name in ``description['controlledProperty']`` becomes a float32
     attribute.

     Raises ValueError when a TileDB object already exists at the path.
     """
     array_name = self.sensor_data_path(description['code'])
     if tiledb.object_type(array_name) is not None:
         raise ValueError('duplicate object with path %s' % array_name)

     shape = description['shape']
     assert len(shape) > 0 and n_slots > 0

     # Time-slot axis first, then one axis per entry of the sensor shape.
     dims = [
         tiledb.Dim(name="delta_t",
                    domain=(0, n_slots),
                    tile=1,
                    dtype=np.int32)
     ]
     for i, n in enumerate(shape):
         dims.append(
             tiledb.Dim(name=f"dim{i}",
                        domain=(0, n - 1),
                        tile=n,
                        dtype=np.int32))
     dom = tiledb.Domain(*dims, ctx=self.tiledb_ctx)

     attrs = []
     for aname in description['controlledProperty']:
         attrs.append(tiledb.Attr(name=aname, dtype=np.float32))

     schema = tiledb.ArraySchema(domain=dom,
                                 sparse=False,
                                 attrs=attrs,
                                 ctx=self.tiledb_ctx)
     # Only the schema is written; the array holds no data yet.
     tiledb.DenseArray.create(array_name, schema)
     return array_name
Ejemplo n.º 15
0
    def create_multiattr_array(self, domain_var_names, domain_dims,
                               domain_name, data_array_name):
        """Create one empty dense array holding an attribute per data variable.

        ``domain_dims`` names the dimensions of the array domain. Each name in
        ``domain_var_names`` becomes an attribute whose dtype is taken from
        the matching variable in ``self.data_model.variables``. The array is
        created at the path built from ``domain_name`` and ``data_array_name``.
        """
        tdb_dims = []
        for dim_name in domain_dims:
            tdb_dims.append(self._create_tiledb_dim(dim_name, coords=False))
        tdb_domain = tiledb.Domain(*tdb_dims)

        # One attribute per variable, in the order the names were given.
        attrs = [
            tiledb.Attr(name=var_name,
                        dtype=self.data_model.variables[var_name].dtype)
            for var_name in domain_var_names
        ]

        array_uri = self.array_path.construct_path(domain_name,
                                                   data_array_name)

        # Only the schema is written here; no data is stored yet.
        schema = tiledb.ArraySchema(domain=tdb_domain,
                                    sparse=False,
                                    attrs=attrs,
                                    ctx=self.ctx)
        tiledb.Array.create(array_uri, schema)
Ejemplo n.º 16
0
def _ingest_in_tiledb(
    uri: str, data: np.ndarray, sparse: bool, batch_size: int, num_attrs: int
) -> None:
    """Create a TileDB array at ``uri`` and write ``data`` into every attribute.

    One int32 dimension "dim_<i>" is created per axis of ``data``, each with a
    randomly drawn tile extent. The upper bound of the draw is ``batch_size``
    for the first axis and the axis length for the others. ``num_attrs``
    float32 attributes "features_<k>" are created and all receive the same
    values: the whole of ``data`` for a dense array, or only its non-zero
    cells for a sparse one.

    :param uri: Location of the array to create.
    :param data: Values to ingest.
    :param sparse: Whether to create (and write) a sparse array.
    :param batch_size: Exclusive upper bound for the first dimension's tile extent.
    :param num_attrs: Number of attributes to create.
    """
    dims = []
    for dim in range(data.ndim):
        tile_bound = data.shape[dim] if dim > 0 else batch_size
        # np.random.randint excludes its upper bound, so a bound of 1 leaves
        # nothing to draw and raises; the only usable extent then is 1.
        tile = 1 if tile_bound == 1 else np.random.randint(1, tile_bound)
        dims.append(
            tiledb.Dim(
                name=f"dim_{dim}",
                domain=(0, data.shape[dim] - 1),
                tile=tile,
                dtype=np.int32,
            )
        )

    # TileDB schema
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(*dims),
        sparse=sparse,
        attrs=[
            tiledb.Attr(name=f"features_{attr}", dtype=np.float32)
            for attr in range(num_attrs)
        ],
    )

    # Create the (empty) array on disk.
    tiledb.Array.create(uri, schema)

    # Ingest
    with tiledb.open(uri, "w") as tiledb_array:
        idx = np.nonzero(data) if sparse else slice(None)
        # The selection is the same for every attribute, so index once.
        values = data[idx]
        tiledb_array[idx] = {f"features_{attr}": values for attr in range(num_attrs)}
Ejemplo n.º 17
0
def test_dim_start_float():
    """Expect ``fromtiledb`` to raise ValueError for an array with a float64 dimension."""
    ctx = tiledb.Ctx()

    float_dim = tiledb.Dim(ctx=ctx,
                           name="i",
                           domain=(0.0, 6.0),
                           tile=6,
                           dtype=np.float64)
    domain = tiledb.Domain(float_dim, ctx=ctx)
    attribute = tiledb.Attr(ctx=ctx, name='a', dtype=np.float32)
    schema = tiledb.ArraySchema(ctx=ctx,
                                domain=domain,
                                sparse=True,
                                attrs=[attribute])

    array_dir = tempfile.mkdtemp()
    try:
        # The array is created inside the fresh temporary directory.
        tiledb.SparseArray.create(array_dir, schema)

        with pytest.raises(ValueError):
            fromtiledb(array_dir, ctx=ctx)
    finally:
        shutil.rmtree(array_dir)
Ejemplo n.º 18
0
    def test_datetime_dtype(self, runner, temp_rootdir, dtype):
        """Check that ``dump array`` accepts datetime selections on a datetime-indexed array.

        A sparse array with one dimension and one attribute of ``dtype`` is
        created in a fresh directory under ``temp_rootdir`` and filled with ten
        random values. The CLI is then invoked with a single date and with a
        date range, and both invocations must exit with code 0.
        """
        # mkdtemp() returns an absolute path, so joining it after temp_rootdir
        # (as was done before) silently dropped temp_rootdir and left the
        # array in a stray system temp directory. Creating the directory
        # inside temp_rootdir keeps the array where the fixture expects it.
        # NOTE(review): assumes temp_rootdir is an existing directory - confirm
        # against the fixture.
        workdir = tempfile.mkdtemp(dir=temp_rootdir)
        uri = os.path.abspath(
            os.path.join(
                workdir,
                f"test_datetime_dtype_{np.dtype(dtype).name}",
            ))

        dom = tiledb.Domain(
            tiledb.Dim(
                domain=(np.datetime64("1970-01-01"),
                        np.datetime64("1980-01-01")),
                dtype=dtype,
            ))
        att = tiledb.Attr(dtype=dtype)
        schema = tiledb.ArraySchema(domain=dom, attrs=(att, ), sparse=True)
        tiledb.Array.create(uri, schema)

        with tiledb.open(uri, mode="w") as A:
            A[np.arange(1, 11)] = np.random.randint(low=1, high=10, size=10)

        # Single-point selection
        result = runner.invoke(root, ["dump", "array", uri, "'1970-01-04'"])
        assert result.exit_code == 0

        # Range selection covering the whole domain
        result = runner.invoke(
            root, ["dump", "array", uri, "'1970-01-01':'1980-01-01'"])
        assert result.exit_code == 0
Ejemplo n.º 19
0
    def test_int_dtypes(self, runner, temp_rootdir, sparse, dtype):
        """Check that ``dump array`` works on integer-typed dense and sparse arrays.

        A one-dimensional array over 1..10 with dimension and attribute of
        ``dtype`` is created in a fresh directory under ``temp_rootdir`` and
        filled with ten random values. The CLI is then invoked with a single
        index and with a range, and both invocations must exit with code 0.
        """
        # mkdtemp() returns an absolute path, so joining it after temp_rootdir
        # (as was done before) silently dropped temp_rootdir and left the
        # array in a stray system temp directory. Creating the directory
        # inside temp_rootdir keeps the array where the fixture expects it.
        # NOTE(review): assumes temp_rootdir is an existing directory - confirm
        # against the fixture.
        workdir = tempfile.mkdtemp(dir=temp_rootdir)
        uri = os.path.abspath(
            os.path.join(
                workdir,
                "test_int_dtypes_"
                f"{'sparse' if sparse else 'dense'}_"
                f"{np.dtype(dtype).name}",
            ))

        dom = tiledb.Domain(tiledb.Dim(domain=(1, 10), dtype=dtype))
        att = tiledb.Attr(dtype=dtype)
        schema = tiledb.ArraySchema(domain=dom, attrs=(att, ), sparse=sparse)
        tiledb.Array.create(uri, schema)

        with tiledb.open(uri, mode="w") as A:
            if sparse:
                # Sparse writes need explicit coordinates.
                A[np.arange(1, 11)] = np.random.randint(10,
                                                        size=10,
                                                        dtype=dtype)
            else:
                A[:] = np.random.randint(10, size=10, dtype=dtype)

        # Single-index selection
        result = runner.invoke(root, ["dump", "array", uri, "5"])
        assert result.exit_code == 0

        # Range selection covering the whole domain
        result = runner.invoke(root, ["dump", "array", uri, "1:10"])
        assert result.exit_code == 0
Ejemplo n.º 20
0
def create_X(X_name, shape, is_sparse):
    """
    Create the empty X matrix array with "obs" and "var" dimensions.

    X is read both by row and by column depending on the operation. Given the
    data type, default compression works best. The tile caps, (50, 100) for
    dense and (512, 2048) for sparse, and the global layout (row-major cells,
    column-major tiles) were chosen empirically by benchmarking the current
    cellxgene backend.
    """
    tile_caps = (512, 2048) if is_sparse else (50, 100)
    n_obs, n_var = shape[0], shape[1]

    attrs = [tiledb.Attr(dtype=np.float32, filters=tiledb.FilterList([tiledb.ZstdFilter()]))]

    # A tile never exceeds the dimension it belongs to.
    dims = [
        tiledb.Dim(name=dim_name, domain=(0, length - 1), tile=min(length, cap), dtype=np.uint32)
        for dim_name, length, cap in zip(("obs", "var"), (n_obs, n_var), tile_caps)
    ]
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(*dims),
        sparse=is_sparse,
        attrs=attrs,
        cell_order="row-major",
        tile_order="col-major",
    )

    creator = tiledb.SparseArray if is_sparse else tiledb.DenseArray
    creator.create(X_name, schema)
Ejemplo n.º 21
0
def create_new_array(size,
                     array_out_name,
                     tile_size,
                     attribute_config,
                     compressor='gzip',
                     compression_level=-1):
    '''
    Creates an empty tileDB array

    The array has a single uint32 dimension 'genome_coordinate' spanning
    0 .. size - 1 and one attribute per entry returned by
    get_attribute_info(attribute_config), each using that entry's 'dtype'.

    size: number of cells along the dimension
    array_out_name: location of the array to create
    tile_size: requested tile extent; capped at size
    attribute_config: passed to get_attribute_info to obtain the attributes
    compressor: 'gzip', 'zstd', 'lz4' or 'bzip2'; any other value falls back
        to gzip, which is what every call used before this argument was
        honoured
    compression_level: level handed to the compression filter (-1 is the
        filter default)
    '''
    # Both arguments used to be ignored in favour of a default GzipFilter().
    compression_filters = {
        'gzip': tiledb.GzipFilter,
        'zstd': tiledb.ZstdFilter,
        'lz4': tiledb.LZ4Filter,
        'bzip2': tiledb.Bzip2Filter,
    }
    filter_cls = compression_filters.get(compressor, tiledb.GzipFilter)

    tile_size = min(size, tile_size)
    tiledb_dim = tiledb.Dim(name='genome_coordinate',
                            domain=(0, size - 1),
                            tile=tile_size,
                            dtype='uint32')
    tiledb_dom = tiledb.Domain(tiledb_dim, ctx=tdb_Context)

    # generate the attribute information
    attribute_info = get_attribute_info(attribute_config)
    attribs = [
        tiledb.Attr(name=key,
                    filters=tiledb.FilterList(
                        [filter_cls(level=compression_level)]),
                    dtype=info['dtype'])
        for key, info in attribute_info.items()
    ]
    tiledb_schema = tiledb.ArraySchema(domain=tiledb_dom,
                                       attrs=tuple(attribs),
                                       cell_order='row-major',
                                       tile_order='row-major')

    tiledb.DenseArray.create(array_out_name, tiledb_schema, ctx=tdb_Context)
    print("created empty array on disk")
    gc.collect()
Ejemplo n.º 22
0
def test_tiledb_test():
    """Check that a sub-region read returns the same count before and after a full read.

    A 1000x1000 sparse uint8 array is populated with 1000 random cells. The
    window [1:10, 1:50] is read, then the whole array, then the window again;
    both window reads must return the same number of values for "v".
    """
    import tiledb

    n = 1000
    m = 1000
    num_vals = 1000

    n_idxs = np.sort(np.random.choice(n, num_vals, replace=False))
    m_idxs = np.sort(np.random.choice(m, num_vals, replace=False))
    values = np.random.randint(0, 100, num_vals, np.uint8)

    ctx = tiledb.Ctx()

    n_tile_extent = min(100, n)

    d1 = tiledb.Dim("ndom",
                    domain=(0, n - 1),
                    tile=n_tile_extent,
                    dtype="uint32",
                    ctx=ctx)
    d2 = tiledb.Dim("mdom", domain=(0, m - 1), tile=m, dtype="uint32", ctx=ctx)

    domain = tiledb.Domain(d1, d2, ctx=ctx)

    v = tiledb.Attr(
        "v",
        filters=tiledb.FilterList([tiledb.LZ4Filter(level=-1)]),
        dtype="uint8",
        ctx=ctx,
    )

    schema = tiledb.ArraySchema(
        domain=domain,
        attrs=(v, ),
        capacity=10000,
        cell_order="row-major",
        tile_order="row-major",
        sparse=True,
        ctx=ctx,
    )

    with tempfile.TemporaryDirectory() as tdir:

        path = os.path.join(tdir, "arr.tiledb")

        tiledb.SparseArray.create(path, schema)

        with tiledb.SparseArray(path, mode="w", ctx=ctx) as A:
            A[n_idxs, m_idxs] = values

        # Read through a separate context from the one used for writing.
        ctx2 = tiledb.Ctx()

        # Close the reader before the temporary directory is removed; the
        # handle used to stay open until garbage collection.
        with tiledb.SparseArray(path, mode="r", ctx=ctx2) as s:
            vs1 = s[1:10, 1:50]

            _ = s[:, :]
            vs2 = s[1:10, 1:50]

        assert vs1["v"].shape[0] == vs2["v"].shape[0]
Ejemplo n.º 23
0
def create_tiledb_example(tmpdir):
    """Write a small two-attribute dense array and return it with its xarray twin.

    Returns a ``(array_uri, expected)`` pair. The array, stored under
    ``tmpdir``, has int32 dimensions "time" (1..8) and "x" (1..4) and the
    attributes "count" (int32) and "pressure" (float64). ``expected`` is the
    ``xr.Dataset`` holding the same values, coordinates and attributes.
    """
    # Define data
    float_data = np.linspace(
        -1.0, 1.0, num=32, endpoint=True, dtype=np.float64
    ).reshape(8, 4)
    int_data = np.arange(0, 32, dtype=np.int32).reshape(8, 4)
    # Create expected dataset
    expected = xr.Dataset(
        data_vars={
            "pressure": xr.DataArray(
                data=float_data,
                dims=["time", "x"],
                attrs={"long_name": "example float data"},
            ),
            "count": xr.DataArray(
                data=int_data,
                dims=["time", "x"],
                attrs={"long_name": "example int data"},
            ),
        },
        coords={"time": np.arange(1, 9), "x": np.arange(1, 5)},
        attrs={"global_1": "value1", "global_2": "value2"},
    )
    array_uri = str(tmpdir.join("tiledb_example_1"))
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(
            tiledb.Dim(name="time", domain=(1, 8), tile=4, dtype=np.int32),
            tiledb.Dim(name="x", domain=(1, 4), tile=4, dtype=np.int32),
        ),
        sparse=False,
        attrs=[
            tiledb.Attr(name="count", dtype=np.int32),
            tiledb.Attr(name="pressure", dtype=np.float64),
        ],
    )
    tiledb.DenseArray.create(array_uri, schema)
    with tiledb.DenseArray(array_uri, mode="w") as array:
        array[:, :] = {
            "pressure": float_data,
            "count": int_data,
        }
        array.meta["global_1"] = "value1"
        array.meta["global_2"] = "value2"
        # Per-attribute metadata has to name the TileDB attributes ("pressure",
        # "count"); the keys used to carry the local variable names
        # ("float_data", "int_data"), which match no attribute of this array.
        # NOTE(review): presumably the reader maps "__tiledb_attr.<name>.<key>"
        # onto the variable's attrs - confirm against the backend.
        array.meta["__tiledb_attr.pressure.long_name"] = "example float data"
        array.meta["__tiledb_attr.count.long_name"] = "example int data"
    return array_uri, expected
Ejemplo n.º 24
0
    def _create_array(self):
        """
        Creates a TileDB array for a Tensorflow model

        The array is dense, with a single int32 dimension "model" fixed at
        (1, 1) and two variable-length "S1" attributes, "model_weights" and
        "optimizer_weights", both compressed with Zstd.

        Raises HTTPError (code 400) when TileDB reports a listing failure, and
        re-raises every other TileDBError, including "already exists", which
        is logged as a warning first.
        """
        try:
            dom = tiledb.Domain(
                tiledb.Dim(name="model", domain=(1, 1), tile=1,
                           dtype=np.int32), )

            attrs = [
                tiledb.Attr(
                    name="model_weights",
                    dtype="S1",
                    var=True,
                    filters=tiledb.FilterList([tiledb.ZstdFilter()]),
                ),
                tiledb.Attr(
                    name="optimizer_weights",
                    dtype="S1",
                    var=True,
                    filters=tiledb.FilterList([tiledb.ZstdFilter()]),
                ),
            ]

            schema = tiledb.ArraySchema(
                domain=dom,
                sparse=False,
                attrs=attrs,
            )

            tiledb.Array.create(self.uri, schema)
        except tiledb.TileDBError as error:
            message = str(error)

            if "Error while listing with prefix" in message:
                # It is possible to land here if user sets wrong default s3 credentials
                # with respect to default s3 path
                raise HTTPError(
                    code=400,
                    msg=
                    f"Error creating file, {error} Are your S3 credentials valid?",
                ) from error

            if "already exists" in message:
                logging.warning(
                    "TileDB array already exists but update=False. "
                    "Next time set update=True. Returning")

            # Any other creation failure used to be swallowed here, leaving
            # the caller to continue without an array; propagate it instead.
            raise
Ejemplo n.º 25
0
    def to_tiledb(self, uri: Union[str, PurePath]) -> None:
        """Write the headers and the trace data into a TileDB group at ``uri``.

        The group is created when ``uri`` is not already a group. Inside it,
        the "headers" and "data" arrays are each created and filled only when
        no array exists at their location yet.
        """
        if not isinstance(uri, PurePath):
            uri = URL(uri)

        group_uri = str(uri)
        if tiledb.object_type(group_uri) != "group":
            tiledb.group_create(group_uri)

        # Trace headers: one attribute per trace field.
        headers_uri = str(uri / "headers")
        if tiledb.object_type(headers_uri) != "array":
            header_dims = self._get_dims(TRACE_FIELDS_SIZE)
            header_attrs = [
                tiledb.Attr(field.name, field.dtype, filters=TRACE_FIELD_FILTERS)
                for field in TRACE_FIELDS
            ]
            header_schema = tiledb.ArraySchema(
                domain=tiledb.Domain(*header_dims),
                sparse=False,
                attrs=header_attrs,
            )
            with self._tiledb_array(headers_uri, header_schema) as tdb:
                self._fill_headers(tdb)

        # Trace samples: the trace dimensions plus a trailing "samples" axis.
        data_uri = str(uri / "data")
        if tiledb.object_type(data_uri) != "array":
            samples = len(self.segy_file.samples)
            sample_dtype = self.segy_file.dtype
            sample_size = sample_dtype.itemsize

            data_dims = list(self._get_dims(sample_size * samples))
            # Tile extent is tile_size divided by the sample size, clipped to
            # the range 1 .. samples.
            samples_dim = tiledb.Dim(
                name="samples",
                domain=(0, samples - 1),
                dtype=data_dims[0].dtype,
                tile=np.clip(self.tile_size // sample_size, 1, samples),
            )
            data_dims.append(samples_dim)

            trace_attr = tiledb.Attr("trace",
                                     sample_dtype,
                                     filters=(tiledb.LZ4Filter(), ))
            data_schema = tiledb.ArraySchema(
                domain=tiledb.Domain(*data_dims),
                sparse=False,
                attrs=[trace_attr],
            )
            with self._tiledb_array(data_uri, data_schema) as tdb:
                self._fill_data(tdb)
Ejemplo n.º 26
0
    def testFromTileDB(self):
        """Check the tensor and chunk metadata ``fromtiledb`` builds for dense and sparse arrays.

        For each kind, a 30x20x10 array with tiles of (7, 3, 4) is created in
        a temporary directory. The test checks the tensor before and after
        tiling, the axis offsets of selected chunks, and that a passed-in
        context's config is recorded on the op.
        """
        ctx = tiledb.Ctx()

        for sparse in (True, False):
            dims = [
                tiledb.Dim(ctx=ctx, name="i", domain=(1, 30), tile=7, dtype=np.int32),
                tiledb.Dim(ctx=ctx, name="j", domain=(1, 20), tile=3, dtype=np.int32),
                tiledb.Dim(ctx=ctx, name="k", domain=(1, 10), tile=4, dtype=np.int32),
            ]
            dom = tiledb.Domain(*dims, ctx=ctx)
            attr = tiledb.Attr(ctx=ctx, name='a', dtype=np.float32)
            schema = tiledb.ArraySchema(ctx=ctx, domain=dom, sparse=sparse,
                                        attrs=[attr])

            tempdir = tempfile.mkdtemp()
            try:
                # The array is created inside the fresh temporary directory.
                array_type = tiledb.SparseArray if sparse else tiledb.DenseArray
                array_type.create(tempdir, schema)

                # Without an explicit context: no config, key or timestamp.
                tensor = fromtiledb(tempdir)
                self.assertIsInstance(tensor.op, TensorTileDBDataSource)
                self.assertEqual(tensor.op.issparse(), sparse)
                self.assertEqual(tensor.shape, (30, 20, 10))
                self.assertEqual(tensor.extra_params.raw_chunk_size, (7, 3, 4))
                self.assertIsNone(tensor.op.tiledb_config)
                self.assertEqual(tensor.op.tiledb_uri, tempdir)
                self.assertIsNone(tensor.op.tiledb_key)
                self.assertIsNone(tensor.op.tiledb_timestamp)

                tensor = tensor.tiles()

                # 5 x 7 x 3 chunks along the three axes.
                self.assertEqual(len(tensor.chunks), 105)
                first_chunk = tensor.chunks[0]
                self.assertIsInstance(first_chunk.op, TensorTileDBDataSource)
                self.assertEqual(first_chunk.op.issparse(), sparse)
                self.assertEqual(first_chunk.shape, (7, 3, 4))
                self.assertIsNone(first_chunk.op.tiledb_config)
                self.assertEqual(first_chunk.op.tiledb_uri, tempdir)
                self.assertIsNone(first_chunk.op.tiledb_key)
                self.assertIsNone(first_chunk.op.tiledb_timestamp)
                self.assertEqual(first_chunk.op.tiledb_dim_starts, (1, 1, 1))

                # test axis_offsets of chunk op
                self.assertEqual(first_chunk.op.axis_offsets, (0, 0, 0))
                self.assertEqual(tensor.chunks[1].op.axis_offsets, (0, 0, 4))
                self.assertEqual(tensor.cix[0, 2, 2].op.axis_offsets, (0, 6, 8))
                self.assertEqual(tensor.cix[0, 6, 2].op.axis_offsets, (0, 18, 8))
                self.assertEqual(tensor.cix[4, 6, 2].op.axis_offsets, (28, 18, 8))

                # With an explicit context its config is stored on the op.
                tensor2 = fromtiledb(tempdir, ctx=ctx)
                self.assertEqual(tensor2.op.tiledb_config, ctx.config().dict())

                tensor2 = tensor2.tiles()

                self.assertEqual(tensor2.chunks[0].op.tiledb_config, ctx.config().dict())
            finally:
                shutil.rmtree(tempdir)
Ejemplo n.º 27
0
def ccd(_input, bands, output=None, config=None, neighbourhood=7, overlap=1):
    """Run ``calculate_change`` over every tile of a TileDB array.

    The input array is read as a 3-D array; its first axis is ignored here
    and axes 1 and 2 are treated as y (height) and x (width). One task per
    tile is submitted to the module-level ``client`` and the call blocks
    until all of them have been gathered.

    Args:
        _input: URI of the input TileDB array.
        bands: Sequence of exactly two band indexes, forwarded to
            ``calculate_change``.
        output: URI of the result array. When ``None`` a name is derived
            from ``_input`` plus a random 4-character suffix. The array is
            created when ``output`` is ``None`` or does not exist on the
            local filesystem.
        config: TileDB configuration, used both for the TileDB context and
            as ``storage_options`` for dask.
        neighbourhood: Forwarded unchanged to ``calculate_change``.
        overlap: Accepted for interface compatibility; not used in this
            function.

    Returns:
        The URI of the output array.

    Raises:
        IndexError: If ``bands`` does not contain exactly two entries.
    """
    if len(bands) != 2:
        raise IndexError('CCD function requires two band indexes')

    if output is None or not os.path.exists(output):
        cfg = tiledb.Config(config)
        ctx = tiledb.Ctx(config=cfg)
        # Mirror the spatial extent and tiling of the input in the output.
        with tiledb.DenseArray(_input, 'r', ctx=ctx) as arr:
            y_dim = arr.schema.domain.dim(1)
            x_dim = arr.schema.domain.dim(2)
            height = y_dim.size
            width = x_dim.size
            tile_y_size = y_dim.tile
            tile_x_size = x_dim.tile

        dom = tiledb.Domain(
            tiledb.Dim(domain=(0, height - 1),
                       tile=tile_y_size,
                       dtype=np.uint64),
            tiledb.Dim(domain=(0, width - 1),
                       tile=tile_x_size,
                       dtype=np.uint64))

        schema = tiledb.ArraySchema(
            domain=dom,
            sparse=False,
            attrs=[tiledb.Attr(name="c", dtype=np.float32)],
            ctx=ctx)
        if output is None:
            output = _input + '_result_' + ''.join(
                random.choice(string.ascii_uppercase + string.digits)
                for _ in range(4))  # noqa

        tiledb.DenseArray.create(output, schema)

    source = da.from_tiledb(_input, storage_options=config)
    _, h, w = source.shape
    _, tile_y_size, tile_x_size = source.chunksize

    # w and h are assumed to be exact multiples of the tile size; any
    # remainder is dropped by the floor division.
    n_tiles_x = w // tile_x_size
    # The row count must be derived from the y tile size. Dividing by the
    # x tile size produced the wrong grid whenever tiles were not square.
    n_tiles_y = h // tile_y_size

    # Manually chunk and collect: one task per tile, row by row.
    futures = [
        client.submit(calculate_change, _input, bands,
                      neighbourhood, tile_x, tile_y, tile_x_size,
                      tile_y_size, output, config)
        for tile_y in range(n_tiles_y)
        for tile_x in range(n_tiles_x)
    ]
    client.gather(futures)
    return output
Ejemplo n.º 28
0
def create_array():
    """Create an empty dense 4x4 array with one int32 attribute "a".

    The array is written to the location held in the module-level
    ``array_name``.
    """
    # Two dimensions, "rows" and "cols", each with domain [1, 4] and a tile
    # extent of 4.
    rows = tiledb.Dim(name="rows", domain=(1, 4), tile=4, dtype=np.int32)
    cols = tiledb.Dim(name="cols", domain=(1, 4), tile=4, dtype=np.int32)
    domain = tiledb.Domain(rows, cols)

    # Dense schema: every (i, j) cell stores a single integer in "a".
    cell_value = tiledb.Attr(name="a", dtype=np.int32)
    schema = tiledb.ArraySchema(domain=domain, sparse=False,
                                attrs=[cell_value])

    # Materialise the (still empty) array on disk.
    tiledb.DenseArray.create(array_name, schema)
Ejemplo n.º 29
0
def write_tiledb(arr, path, overwrite=True):
    """Write a 1D or 2D array to disk as a dense TileDB array.

    Values are stored as float32 in the attribute named by
    ``GENOME_VALUE_NAME``. The first dimension is tiled with an extent of
    at most ``DEFAULT_GENOME_TILE_EXTENT``; a second dimension, when
    present, is stored as a single tile spanning its full width.

    Args:
        arr: Array to store; must expose ``ndim``, ``shape`` and
            ``astype`` (e.g. a numpy array) and be 1D or 2D.
        path: Destination directory of the TileDB array.
        overwrite: When true, an existing directory at ``path`` is removed
            before writing.

    Raises:
        ValueError: If ``arr`` is not 1D or 2D. Raised before anything on
            disk is touched.
        FileExistsError: If ``path`` still exists after the optional
            overwrite step.
    """
    # Validate first so an unsupported array never causes the existing
    # output to be deleted.
    if arr.ndim not in (1, 2):
        raise ValueError("tiledb backend only supports 1D or 2D arrays")

    if overwrite and os.path.isdir(path):
        shutil.rmtree(path)

    if os.path.exists(path):
        raise FileExistsError("Output path {} already exists".format(path))

    ctx = tiledb.Ctx()

    n = arr.shape[0]
    n_tile_extent = min(DEFAULT_GENOME_TILE_EXTENT, n)

    dims = [
        tiledb.Dim(ctx,
                   GENOME_DOMAIN_NAME,
                   domain=(0, n - 1),
                   tile=n_tile_extent,
                   dtype="uint32")
    ]

    if arr.ndim == 2:
        m = arr.shape[1]
        dims.append(
            tiledb.Dim(ctx,
                       SECONDARY_DOMAIN_NAME,
                       domain=(0, m - 1),
                       tile=m,
                       dtype="uint32"))

    domain = tiledb.Domain(ctx, *dims)

    v = tiledb.Attr(
        ctx,
        GENOME_VALUE_NAME,
        compressor=(DEFAULT_COMPRESSOR, DEFAULT_COMPRESSOR_LEVEL),
        dtype="float32",
    )

    schema = tiledb.ArraySchema(ctx,
                                domain=domain,
                                attrs=(v, ),
                                cell_order="row-major",
                                tile_order="row-major")
    # The result of create() is not needed; the array is reopened below.
    tiledb.DenseArray.create(path, schema)

    values = arr.astype(np.float32)

    with tiledb.DenseArray(ctx, path, mode="w") as A:
        A[:] = {GENOME_VALUE_NAME: values}
Ejemplo n.º 30
0
def test_copy_fragments_to_existing(runner, uri, temp_rootdir, start_time,
                                    end_time):
    """
    Test for command

        tiledb fragments copy [old_array_uri] [new_array_uri] [start_time] [end_time]

    when the destination array already exists and already holds fragments.

    The destination is created here as a dense 25x12 array with float64
    attributes "a" and "b" and is written at timestamps 4 and 5 before the
    command is invoked. The source array comes from the ``uri`` argument;
    the assertions below expect it to hold three fragments at timestamps
    1, 2 and 3.
    """
    old_uri = uri
    # Destination path includes the time window so each parameter
    # combination gets its own array.
    new_uri = os.path.abspath(
        os.path.join(
            temp_rootdir,
            f"test_copy_fragments_to_existing_{start_time}_{end_time}",
        ))

    # Build the pre-existing destination array.
    rows_dim = tiledb.Dim(domain=(1, 25), dtype=np.int64)
    cols_dim = tiledb.Dim(domain=(1, 12), dtype=np.int64)
    dom = tiledb.Domain(rows_dim, cols_dim)
    att1 = tiledb.Attr(name="a", dtype=np.float64)
    att2 = tiledb.Attr(name="b", dtype=np.float64)
    schema = tiledb.ArraySchema(domain=dom, attrs=(att1, att2))
    tiledb.Array.create(new_uri, schema)

    # Two full writes at timestamps 4 and 5 give the destination its own
    # fragments before the copy.
    data = np.reshape(np.arange(300), (25, 12))
    for ts in range(4, 6):
        with tiledb.open(new_uri, mode="w", timestamp=ts) as A:
            A[:] = {"a": data, "b": data}

    # NOTE(review): "-f" is presumably the flag that allows copying into an
    # existing array -- confirm against the CLI definition.
    result = runner.invoke(
        root,
        ["fragments", "copy", "-f", old_uri, new_uri, start_time, end_time],
    )
    assert result.exit_code == 0

    # The source array must be left untouched by the copy.
    fragments = tiledb.array_fragments(old_uri)
    assert len(fragments) == 3
    assert fragments.timestamp_range == ((1, 1), (2, 2), (3, 3))

    # The destination is expected to hold its own two fragments plus the
    # source fragments at timestamps 2 and 3.
    fragments = tiledb.array_fragments(new_uri)
    assert len(fragments) == 4
    assert fragments.timestamp_range == ((2, 2), (3, 3), (4, 4), (5, 5))