Ejemplo n.º 1
0
def test_tiledb_test():
    """Check that a sliced sparse read is unaffected by an intervening full read.

    Builds a 1000 x 1000 sparse ``uint8`` TileDB array in a temporary
    directory, writes ``num_vals`` cells, then reads the same slice before
    and after reading the whole array and asserts both slice reads return
    the same number of cells.
    """
    import tiledb

    n = 1000
    m = 1000
    num_vals = 1000

    # Sampling without replacement keeps each coordinate list free of
    # duplicates; sorting gives the cells in ascending coordinate order.
    n_idxs = np.sort(np.random.choice(n, num_vals, replace=False))
    m_idxs = np.sort(np.random.choice(m, num_vals, replace=False))
    values = np.random.randint(0, 100, num_vals, np.uint8)

    ctx = tiledb.Ctx()

    n_tile_extent = min(100, n)

    d1 = tiledb.Dim("ndom",
                    domain=(0, n - 1),
                    tile=n_tile_extent,
                    dtype="uint32",
                    ctx=ctx)
    d2 = tiledb.Dim("mdom", domain=(0, m - 1), tile=m, dtype="uint32", ctx=ctx)

    domain = tiledb.Domain(d1, d2, ctx=ctx)

    v = tiledb.Attr(
        "v",
        filters=tiledb.FilterList([tiledb.LZ4Filter(level=-1)]),
        dtype="uint8",
        ctx=ctx,
    )

    schema = tiledb.ArraySchema(
        domain=domain,
        attrs=(v, ),
        capacity=10000,
        cell_order="row-major",
        tile_order="row-major",
        sparse=True,
        ctx=ctx,
    )

    with tempfile.TemporaryDirectory() as tdir:

        path = os.path.join(tdir, "arr.tiledb")

        tiledb.SparseArray.create(path, schema)

        with tiledb.SparseArray(path, mode="w", ctx=ctx) as A:
            A[n_idxs, m_idxs] = values

        # Read back through a separate context from the one used to write.
        ctx2 = tiledb.Ctx()

        # Open the reader as a context manager so the array handle is closed
        # before the temporary directory is removed.
        with tiledb.SparseArray(path, mode="r", ctx=ctx2) as s:
            vs1 = s[1:10, 1:50]

            # Full read between the two slice reads; its result is unused.
            _ = s[:, :]
            vs2 = s[1:10, 1:50]

        assert vs1["v"].shape[0] == vs2["v"].shape[0]
Ejemplo n.º 2
0
    def to_tiledb(self, uri: Union[str, PurePath]) -> None:
        """Write the SEG-Y headers and trace data as TileDB arrays under ``uri``.

        Makes sure a TileDB group exists at ``uri`` and then creates and fills
        two dense arrays inside it, ``headers`` and ``data``. Each object is
        only created when ``tiledb.object_type`` does not already report the
        expected type at its location.

        :param uri: Target location; a value that is not a ``PurePath`` is
            wrapped in ``URL`` so that ``/`` joins work on it.
        """
        if not isinstance(uri, PurePath):
            uri = URL(uri)

        group_uri = str(uri)
        if tiledb.object_type(group_uri) != "group":
            tiledb.group_create(group_uri)

        # Header array: one attribute per typed trace field.
        headers_uri = str(uri / "headers")
        if tiledb.object_type(headers_uri) != "array":
            header_dims = self._get_dims(TRACE_FIELDS_SIZE)
            header_domain = tiledb.Domain(*header_dims)
            header_attrs = [
                tiledb.Attr(field.name, field.dtype, filters=TRACE_FIELD_FILTERS)
                for field in TRACE_FIELDS
            ]
            header_schema = tiledb.ArraySchema(
                domain=header_domain,
                sparse=False,
                attrs=header_attrs,
            )
            with self._tiledb_array(headers_uri, header_schema) as tdb:
                self._fill_headers(tdb)

        # Data array: same leading dimensions plus a trailing "samples" one.
        data_uri = str(uri / "data")
        if tiledb.object_type(data_uri) != "array":
            num_samples = len(self.segy_file.samples)
            trace_dtype = self.segy_file.dtype
            itemsize = trace_dtype.itemsize
            data_dims = list(self._get_dims(itemsize * num_samples))
            samples_dim = tiledb.Dim(
                name="samples",
                domain=(0, num_samples - 1),
                dtype=data_dims[0].dtype,
                # Tile extent in samples, kept within [1, num_samples].
                tile=np.clip(self.tile_size // itemsize, 1, num_samples),
            )
            data_dims.append(samples_dim)
            data_domain = tiledb.Domain(*data_dims)
            trace_attr = tiledb.Attr("trace",
                                     trace_dtype,
                                     filters=(tiledb.LZ4Filter(), ))
            data_schema = tiledb.ArraySchema(
                domain=data_domain,
                sparse=False,
                attrs=[trace_attr],
            )
            with self._tiledb_array(data_uri, data_schema) as tdb:
                self._fill_data(tdb)
Ejemplo n.º 3
0
    def test_attr_filters_multi(self, runner, temp_rootdir,
                                create_test_simple_csv):
        """
        Test for command

            tiledb convert-from csv [csv_file] [uri] --attr-filters <attr name>:<filter name>,<filter name>,...;<attr name>:...

        Passes two filters each for attributes "a" and "b" and checks that the
        resulting array schema carries exactly those filters, in order, while
        attributes "c" and "date" end up with none.
        """
        test_name, _ = create_test_simple_csv
        csv_path = os.path.join(temp_rootdir, f"{test_name}.csv")
        uri = os.path.join(temp_rootdir, "test_attr_filters_multi.tdb")

        filter_spec = ("a:LZ4Filter=10,BitShuffleFilter;"
                       "b:DoubleDeltaFilter,PositiveDeltaFilter=3")
        cli_args = [
            "convert-from",
            "csv",
            csv_path,
            uri,
            "--attr-filters",
            filter_spec,
        ]
        result = runner.invoke(root, cli_args)

        print(result.stdout)
        assert result.exit_code == 0

        with tiledb.open(uri) as array:
            schema = array.schema

            a_filters = schema.attr("a").filters
            assert a_filters.nfilters == 2
            assert a_filters[0] == tiledb.LZ4Filter(10)
            assert a_filters[1] == tiledb.BitShuffleFilter()

            b_filters = schema.attr("b").filters
            assert b_filters.nfilters == 2
            assert b_filters[0] == tiledb.DoubleDeltaFilter()
            assert b_filters[1] == tiledb.PositiveDeltaFilter(3)

            # Attributes not named in --attr-filters must stay unfiltered.
            for untouched in ("c", "date"):
                assert schema.attr(untouched).filters.nfilters == 0
Ejemplo n.º 4
0
def write_sparse_array(path, n, m, n_idxs, m_idxs, values, clip=True):
    """Create a sparse ``uint8`` TileDB array at ``path`` and write cells to it.

    The array has an ``n`` x ``m`` ``uint32`` domain and a single LZ4-filtered
    attribute ``"v"``. The input triplets are first passed through a scipy
    COO -> CSC -> COO round trip as ``int32``; per the scipy documentation
    the CSC conversion sums entries that share a coordinate.

    Args:
        path: Location of the new array; must not exist yet.
        n: Number of rows (extent of the first dimension).
        m: Number of columns (extent of the second dimension).
        n_idxs: Array of row indexes, each in ``[0, n - 1]``.
        m_idxs: Array of column indexes, each in ``[0, m - 1]``.
        values: Array of cell values, one per index pair.
        clip: When true, values above ``VPLOT_MAX_VALUE`` are capped at it.
            Negative values are never clipped and are rejected below.

    Raises:
        FileExistsError: If ``path`` already exists.
        ValueError: If the index arrays are empty, an index is out of range,
            or a value falls outside ``[0, VPLOT_MAX_VALUE]`` after clipping.
    """
    if os.path.exists(path):
        raise FileExistsError(f"{path} already exists")

    # min()/max() on an empty array fail with an unrelated numpy message;
    # report the actual problem instead (still a ValueError).
    if len(n_idxs) == 0 or len(m_idxs) == 0:
        raise ValueError("row and column indexes must not be empty")

    if n_idxs.min() < 0 or n_idxs.max() >= n:
        raise ValueError("row indexes must be in range [0, n - 1]")

    if m_idxs.min() < 0 or m_idxs.max() >= m:
        raise ValueError("column indexes must be in range [0, m - 1]")

    # Round trip through CSC so that duplicate coordinates are merged
    # before the cells are written.
    sparse = coo_matrix((values, (n_idxs, m_idxs)), dtype=np.int32)
    sparse = sparse.tocsc(copy=False).tocoo(copy=False)

    n_idxs = sparse.row
    m_idxs = sparse.col
    values = sparse.data

    if clip:
        values = np.minimum(values, VPLOT_MAX_VALUE)

    if values.min() < 0 or values.max() > VPLOT_MAX_VALUE:
        raise ValueError(
            f"vplot values must be in range [0, {VPLOT_MAX_VALUE}]")

    # NOTE(review): `ctx` is not defined inside this function (a local
    # `tiledb.Ctx()` was previously commented out); presumably it is a
    # module-level TileDB context -- confirm it exists.

    n_tile_extent = min(DEFAULT_GENOME_TILE_EXTENT, n)

    d1 = tiledb.Dim(
        GENOME_DOMAIN_NAME,
        domain=(0, n - 1),
        tile=n_tile_extent,
        dtype="uint32",
        ctx=ctx,
    )
    # The second dimension uses a single tile spanning all m columns.
    d2 = tiledb.Dim(INSERT_DOMAIN_NAME,
                    domain=(0, m - 1),
                    tile=m,
                    dtype="uint32",
                    ctx=ctx)

    domain = tiledb.Domain(d1, d2, ctx=ctx)

    v = tiledb.Attr(
        "v",
        filters=tiledb.FilterList([tiledb.LZ4Filter(level=-1)]),
        dtype="uint8",
        ctx=ctx,
    )

    schema = tiledb.ArraySchema(
        ctx=ctx,
        domain=domain,
        attrs=(v, ),
        capacity=1000,
        cell_order="row-major",
        tile_order="row-major",
        sparse=True,
    )

    tiledb.SparseArray.create(path, schema)

    # Values were range-checked above, so narrowing to the attribute's
    # uint8 dtype is safe as long as VPLOT_MAX_VALUE fits in uint8
    # (NOTE(review): confirm the constant's value).
    values = values.astype(np.uint8)

    with tiledb.SparseArray(path, mode="w", ctx=ctx) as A:
        A[n_idxs, m_idxs] = values
Ejemplo n.º 5
0
    size2dtype = {2: np.dtype(np.int16), 4: np.dtype(np.int32)}
    for f, f2 in zip(all_fields, all_fields[1:]):
        name = str(f)
        if name in include_names:
            yield TypedTraceField(name, f, size2dtype[int(f2) - int(f)])


# Every typed trace field produced by iter_typed_trace_fields(), materialised
# once so the derived tuples below share the same order.
TRACE_FIELDS = tuple(iter_typed_trace_fields())

# Per-field views, index-aligned with TRACE_FIELDS.
TRACE_FIELD_ENUMS = tuple(int(field.enum) for field in TRACE_FIELDS)
TRACE_FIELD_NAMES = tuple(field.name for field in TRACE_FIELDS)
TRACE_FIELD_DTYPES = tuple(field.dtype for field in TRACE_FIELDS)

# Sum of the item sizes of all field dtypes.
TRACE_FIELDS_SIZE = sum(
    field_dtype.itemsize for field_dtype in TRACE_FIELD_DTYPES)

# TileDB filters grouped for reuse; the tuple order is kept as declared.
TRACE_FIELD_FILTERS = (
    tiledb.BitWidthReductionFilter(),
    tiledb.ByteShuffleFilter(),
    tiledb.LZ4Filter(),
)


class ExtendedSegyFile(segyio.SegyFile):
    """``segyio.SegyFile`` subclass that adds cached convenience properties."""

    @cached_property
    def trace_size(self) -> int:
        """Number of samples multiplied by the sample dtype's item size."""
        sample_count = len(self._samples)
        bytes_per_sample = int(self._dtype.itemsize)
        return sample_count * bytes_per_sample

    @cached_property
    def fast_headerline(self) -> segyio.line.HeaderLine:
        """Header line accessor: ``iline`` if ``is_inline`` is truthy, else ``xline``."""
        if self.is_inline:
            return self._header.iline
        return self._header.xline

    @cached_property
    def fast_lines(self) -> np.ndarray:
        """Line numbers: ``_ilines`` if ``is_inline`` is truthy, else ``_xlines``."""
        if self.is_inline:
            return self._ilines
        return self._xlines