Example #1
def consolidate_fragments(
    uri,
    amplification,
    buffer_size,
    step_max_frags,
    step_min_frags,
    step_size_ratio,
    steps,
    vacuum,
):
    """
    Consolidate the fragments in an array located at uri.
    """
    config = tiledb.Config()
    config["sm.consolidation.mode"] = "fragments"
    config["sm.consolidation.amplification"] = amplification
    config["sm.consolidation.buffer_size"] = buffer_size
    config["sm.consolidation.step_max_frags"] = step_max_frags
    config["sm.consolidation.step_min_frags"] = step_min_frags
    config["sm.consolidation.step_size_ratio"] = step_size_ratio
    config["sm.consolidation.steps"] = steps
    ctx = tiledb.Ctx(config)

    tiledb.consolidate(uri, ctx=ctx)

    if vacuum:
        config = tiledb.Config({"sm.vacuum.mode": "fragments"})
        tiledb.vacuum(uri, ctx=tiledb.Ctx(config))
Example #2
def test_tiledb_test():
    import os
    import tempfile

    import numpy as np
    import tiledb

    n = 1000
    m = 1000
    num_vals = 1000

    n_idxs = np.sort(np.random.choice(n, num_vals, replace=False))
    m_idxs = np.sort(np.random.choice(m, num_vals, replace=False))
    values = np.random.randint(0, 100, num_vals, np.uint8)

    ctx = tiledb.Ctx()

    n_tile_extent = min(100, n)

    d1 = tiledb.Dim("ndom",
                    domain=(0, n - 1),
                    tile=n_tile_extent,
                    dtype="uint32",
                    ctx=ctx)
    d2 = tiledb.Dim("mdom", domain=(0, m - 1), tile=m, dtype="uint32", ctx=ctx)

    domain = tiledb.Domain(d1, d2, ctx=ctx)

    v = tiledb.Attr(
        "v",
        filters=tiledb.FilterList([tiledb.LZ4Filter(level=-1)]),
        dtype="uint8",
        ctx=ctx,
    )

    schema = tiledb.ArraySchema(
        domain=domain,
        attrs=(v, ),
        capacity=10000,
        cell_order="row-major",
        tile_order="row-major",
        sparse=True,
        ctx=ctx,
    )

    with tempfile.TemporaryDirectory() as tdir:

        path = os.path.join(tdir, "arr.tiledb")

        tiledb.SparseArray.create(path, schema)

        with tiledb.SparseArray(path, mode="w", ctx=ctx) as A:
            A[n_idxs, m_idxs] = values

        ctx2 = tiledb.Ctx()

        with tiledb.SparseArray(path, mode="r", ctx=ctx2) as s:
            vs1 = s[1:10, 1:50]

            _ = s[:, :]
            vs2 = s[1:10, 1:50]

        assert vs1["v"].shape[0] == vs2["v"].shape[0]
Example #3
def write_array(args, updating, chunks_to_process):
    try:
        # Build a shared config plus separate read/write contexts.
        tdb_config = tiledb.Config(tdb_config_params)
        tdb_write_context = tiledb.Ctx(config=tdb_config)

        if updating:
            tdb_read_context = tiledb.Ctx(config=tdb_config)
            cur_array_toread = tiledb.DenseArray(args.array_name, ctx=tdb_read_context, mode='r')
        cur_array_towrite = tiledb.DenseArray(args.array_name, ctx=tdb_write_context, mode='w')
        chunks_processed = 0
        while chunks_processed < chunks_to_process:
            # Block until a processed chunk is available on the queue.
            while write_queue.empty():
                time.sleep(10)
            processed_chunk = pickle.loads(write_queue.get())
            task_index = processed_chunk[0]
            start_index = processed_chunk[1]
            end_index = processed_chunk[2]
            dict_to_write = processed_chunk[3]
            if updating:
                # We are only updating some attributes in the array.
                cur_vals = cur_array_toread[start_index:end_index, task_index]
                for key in dict_to_write:
                    cur_vals[key] = dict_to_write[key]
                dict_to_write = cur_vals
                print("updated data dict for writing:" + args.array_name)
            else:
                # First write: make sure all attributes are provided, and fill
                # any missing attribute with a NaN array.
                required_attrib = list(get_attribute_info(args.attribute_config, args.attribute_config_file).keys())
                for attrib in required_attrib:
                    if attrib not in dict_to_write:
                        dict_to_write[attrib] = np.full(end_index - start_index, np.nan)
            # Write in chunks.
            cur_array_towrite[start_index:end_index, task_index] = dict_to_write
            print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
            gc.collect()
            chunks_processed += 1
            print("wrote to disk " + str(task_index) + " for " + str(start_index) + ":" + str(end_index) + ";" + str(chunks_processed) + "/" + str(chunks_to_process))
        assert chunks_processed >= chunks_to_process
        print("closing arrays")
        if updating:
            cur_array_toread.close()
        cur_array_towrite.close()
        return

    except KeyboardInterrupt:
        kill_child_processes(os.getpid())
        # try to delete all tmp files
        raise
    except Exception as e:
        # Note: `e.message` does not exist in Python 3, so re-raise as-is.
        print(e)
        kill_child_processes(os.getpid())
        raise
Example #4
def test_ingest_csv_sparse_array_append_header_mismatch(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    """
    Create a sparse array from a CSV file using ingest_csv() in the default
    ingest mode and then append additional data to it using the append mode.
    The appended data contains header names that do not match the data in the
    sparse array and must be renamed.
    """
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment_sparse1"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        mode="ingest",
        full_domain=True,
        index_col=("x"),
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )

    time.sleep(10)

    with tiledb.SparseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])
        number_of_rows = data.shape[0]
        assert number_of_rows == 20

    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment_sparse2_mismatch"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        mode="append",
        full_domain=True,
        index_col=("x"),
        header=0,
        names=["x", "c", "b", "a"],
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )

    time.sleep(10)

    with tiledb.SparseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])

        for col, attribute in enumerate(("a", "b", "c"), 1):
            assert_array_equal(
                data[attribute],
                np.array([row * 10 + col for row in range(1, 21)] * 2),
            )
Example #5
def consolidate_array_metadata(uri, vacuum):
    """
    Consolidate the array metadata in an array located at uri.
    """
    config = tiledb.Config()
    config["sm.consolidation.mode"] = "array_meta"
    ctx = tiledb.Ctx(config)

    tiledb.consolidate(uri, ctx=ctx)

    if vacuum:
        config = tiledb.Config({"sm.vacuum.mode": "array_meta"})
        tiledb.vacuum(uri, ctx=tiledb.Ctx(config))
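
A companion sketch, not from the original page: newer TileDB releases also document a "fragment_meta" consolidation mode alongside "fragments" and "array_meta"; the mode name is the only assumption here.

def consolidate_fragment_metadata(uri, vacuum):
    """
    Consolidate the fragment metadata in an array located at uri.
    """
    config = tiledb.Config()
    config["sm.consolidation.mode"] = "fragment_meta"
    tiledb.consolidate(uri, ctx=tiledb.Ctx(config))

    if vacuum:
        config = tiledb.Config({"sm.vacuum.mode": "fragment_meta"})
        tiledb.vacuum(uri, ctx=tiledb.Ctx(config))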
Example #6
def main():
    # Note: this example uses an early tiledb-py API in which the Ctx is
    # passed as the first positional argument and compression is set with a
    # `compressor` tuple; later releases take ctx= as a keyword and use
    # FilterLists (see the sketch after this example).
    ctx = tiledb.Ctx()

    # Create dimensions
    d1 = tiledb.Dim(ctx, "d1", domain=(1, 4), tile=2, dtype="uint64")
    d2 = tiledb.Dim(ctx, "d2", domain=(1, 4), tile=2, dtype="uint64")

    # Create domain
    domain = tiledb.Domain(ctx, d1, d2)

    # Create attributes
    a1 = tiledb.Attr(ctx, "a1", compressor=('blosc-lz', -1), dtype="int32")
    a2 = tiledb.Attr(ctx, "a2", compressor=("gzip", -1), dtype="S10")
    a3 = tiledb.Attr(ctx,
                     "a3",
                     compressor=('zstd', -1),
                     dtype='float32,float32')

    # Create sparse array
    tiledb.SparseArray(ctx,
                       "my_sparse_array",
                       domain=domain,
                       attrs=(a1, a2, a3),
                       capacity=2,
                       cell_order='row-major',
                       tile_order='row-major')
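
For comparison, a sketch of the same schema in the keyword-and-filters style that Examples #2 and #14 on this page use. Blosc compressors were dropped from later TileDB releases, so a1 approximates 'blosc-lz' with LZ4 here; a3 (the two-float record attribute) is omitted for brevity:

def create_sparse_array_modern():
    d1 = tiledb.Dim(name="d1", domain=(1, 4), tile=2, dtype="uint64")
    d2 = tiledb.Dim(name="d2", domain=(1, 4), tile=2, dtype="uint64")
    domain = tiledb.Domain(d1, d2)

    a1 = tiledb.Attr(name="a1", dtype="int32",
                     filters=tiledb.FilterList([tiledb.LZ4Filter()]))
    a2 = tiledb.Attr(name="a2", dtype="S10",
                     filters=tiledb.FilterList([tiledb.GzipFilter()]))

    schema = tiledb.ArraySchema(domain=domain,
                                attrs=(a1, a2),
                                capacity=2,
                                cell_order="row-major",
                                tile_order="row-major",
                                sparse=True)
    tiledb.SparseArray.create("my_sparse_array", schema)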
Example #7
    def test_walk_group(self):
        ctx = tiledb.Ctx()

        groups = []

        def append_to_groups(path, obj):
            groups.append((path, obj))

        tiledb.walk(ctx, self.path(""), append_to_groups, order="preorder")

        groups.sort()
        self.assertTrue(groups[0][0].endswith(self.group1)
                        and groups[0][1] == "group")
        self.assertTrue(groups[1][0].endswith(self.group2)
                        and groups[1][1] == "group")
        self.assertTrue(groups[2][0].endswith(self.group3)
                        and groups[2][1] == "group")
        self.assertTrue(groups[3][0].endswith(self.group4)
                        and groups[3][1] == "group")

        groups = []

        tiledb.walk(ctx, self.path(""), append_to_groups, order="postorder")

        self.assertTrue(groups[0][0].endswith(self.group2)
                        and groups[0][1] == "group")
        self.assertTrue(groups[1][0].endswith(self.group4)
                        and groups[1][1] == "group")
        self.assertTrue(groups[2][0].endswith(self.group3)
                        and groups[2][1] == "group")
        self.assertTrue(groups[3][0].endswith(self.group1)
                        and groups[3][1] == "group")
Example #8
def totiledb(uri, x, ctx=None, key=None, timestamp=None):
    import tiledb

    x = astensor(x)
    raw_ctx = ctx
    if raw_ctx is None:
        ctx = tiledb.Ctx()

    tiledb_array_type = tiledb.SparseArray if x.issparse() else tiledb.DenseArray
    try:
        tiledb_array = tiledb_array_type(uri=uri,
                                         key=key,
                                         timestamp=timestamp,
                                         ctx=ctx)
        # if already created, we will check the shape and dtype
        check_tiledb_array_with_tensor(x, tiledb_array)
    except tiledb.TileDBError:
        # not exist, as we don't know the tile,
        # we will create the tiledb array in the tile of tensor
        pass

    tiledb_config = None if raw_ctx is None else raw_ctx.config().dict()
    op = TensorTileDBDataStore(tiledb_config=tiledb_config,
                               tiledb_uri=uri,
                               tiledb_key=key,
                               tiledb_timestamp=timestamp,
                               dtype=x.dtype,
                               sparse=x.issparse())
    return op(x)
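
Usage, following the pattern Example #19 below shows in full: build a tensor with the host library's tensor() helper (assumed to come from the same package as totiledb), store it, and run the deferred graph with execute():

import numpy as np

raw = np.random.rand(8, 4, 3)
t = tensor(raw, chunk_size=(3, 3, 2))
save = totiledb("/tmp/my_tiledb_array", t)  # hypothetical URI
save.execute()  # the TileDB array is created and written at this point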
Example #9
def test_ingest_csv_dense_array(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    """
    Create a dense array from a CSV file using ingest_csv().
    """
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        sparse=False,
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )

    time.sleep(10)

    with tiledb.DenseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])

        for col, attribute in enumerate(("a", "b", "c"), 1):
            assert_array_equal(
                data[attribute],
                np.array([row * 10 + col for row in range(1, 21)]),
            )
Example #10
def main():
    ctx = tiledb.Ctx()
    try:
        tiledb.group_create(ctx, "mygroup")
        tiledb.group_create(ctx, "mygroup")
    except tiledb.TileDBError as err:
        print("TileDB exception: {!r}".format(err))
Example #11
    def test_remove_group(self):
        ctx = tiledb.Ctx()

        tiledb.remove(ctx, self.group3)

        self.assertFalse(self.is_group(ctx, self.group3))
        self.assertFalse(self.is_group(ctx, self.group4))
Example #12
    def testCheckTileDB(self):
        ctx = tiledb.Ctx()

        tempdir = tempfile.mkdtemp()
        try:
            np_a = np.random.rand(2, 3)
            tiledb_a = tiledb.DenseArray.from_numpy(ctx=ctx,
                                                    uri=tempdir,
                                                    array=np_a)

            with self.assertRaises(ValueError):
                # ndim does not match
                check_tiledb_array_with_tensor(random.rand(2, 3, 4), tiledb_a)

            with self.assertRaises(ValueError):
                # shape does not match
                check_tiledb_array_with_tensor(random.rand(2, 4), tiledb_a)

            with self.assertRaises(ValueError):
                # dtype does not match
                check_tiledb_array_with_tensor(
                    random.rand(2, 3, dtype=np.float32), tiledb_a)

            # legal
            check_tiledb_array_with_tensor(random.rand(2, 3), tiledb_a)
        finally:
            shutil.rmtree(tempdir)
Example #13
def main():
    # Create TileDB context
    ctx = tiledb.Ctx()

    # Load the dense array schema
    dense_example = tiledb.DenseArray.load(ctx, "my_dense_array")

    # Retrieve and print the non-empty domain
    nonempty = dense_example.nonempty_domain()
    print("Non-empty domain:")
    for i in range(dense_example.ndim):
        print("{0!s}: {1!r}".format(
            dense_example.domain.dim(i).name, nonempty[i]))

    # Read the entire array. `result` is a dict of numpy arrays
    result = dense_example[:]

    # Print the results
    result_num = result["a1"].size
    print("\nResult num: ", result_num)
    print()
    print("{:<5s}{:<10s}{:<10s}{:<10s}".format("a1", "a2", "a3[0]", "a3[1]"))
    print("------------------------------")

    for i in range(4):
        for j in range(4):
            print("{:<5d}{:<10s}{:<10.1f}{:<10.1f}".format(
                result["a1"][i, j], result["a2"][i, j], result["a3"][i, j][0],
                result["a3"][i, j][1]))
    print()
Example #14
def test_dim_start_float():
    ctx = tiledb.Ctx()

    dom = tiledb.Domain(
        tiledb.Dim(ctx=ctx,
                   name="i",
                   domain=(0.0, 6.0),
                   tile=6,
                   dtype=np.float64),
        ctx=ctx,
    )
    schema = tiledb.ArraySchema(
        ctx=ctx,
        domain=dom,
        sparse=True,
        attrs=[tiledb.Attr(ctx=ctx, name='a', dtype=np.float32)])

    tempdir = tempfile.mkdtemp()
    try:
        # create tiledb array
        tiledb.SparseArray.create(tempdir, schema)

        with pytest.raises(ValueError):
            fromtiledb(tempdir, ctx=ctx)
    finally:
        shutil.rmtree(tempdir)
Example #15
def main():
    # Create TileDB context
    ctx = tiledb.Ctx()

    # Create TileDB VFS
    vfs = tiledb.VFS(ctx)

    # Create directory
    if not vfs.is_dir("dir_A"):
        vfs.create_dir("dir_A")
        print("Created dir_A")
    else:
        print("dir_A already exists")

    # Create an (empty) file
    if not vfs.is_file("dir_A/file_A"):
        vfs.touch("dir_A/file_A")
        print("Created empty file dir_A/file_A")
    else:
        print("dir_A/file_A already exists")

    # Getting the file size
    print("File size: {0!s}".format(vfs.file_size("dir_A/file_A")))

    # Moving files (moving directories is similar)
    print("Moving file dir_A/file_A to dir_A/file_B")
    vfs.move("dir_A/file_A", "dir_A/file_B", force=True)

    # Deleting files and directories
    print("Deleting dir_A/file_B and dir_A")
    vfs.remove_file("dir_A/file_B")
    vfs.remove_dir("dir_A")
Example #16
def get_upsampled_indices_chrom(inputs):
    # Unpack the work item (a flat tuple of query parameters).
    region_start = inputs[0]
    region_end = inputs[1]
    tdb_array_name = inputs[2]
    tdb_ambig_attribute = inputs[3]
    tdb_partition_attribute_for_upsample = inputs[4]
    dataset_indices = inputs[5]
    tdb_partition_thresh_for_upsample = inputs[6]
    print("starting getting indices to upsample in range:" +
          str(region_start) + "-" + str(region_end))
    with tiledb.open(tdb_array_name, 'r',
                     ctx=tiledb.Ctx(get_default_config())) as tdb_array:
        if tdb_ambig_attribute is not None:
            # multi_index bounds are inclusive, hence region_end - 1.
            attr_vals = tdb_array.query(attrs=[
                tdb_ambig_attribute, tdb_partition_attribute_for_upsample
            ]).multi_index[region_start:region_end - 1, dataset_indices]
            ambig_attr_vals = np.sum(attr_vals[tdb_ambig_attribute], axis=1)
        else:
            attr_vals = tdb_array.query(
                attrs=[tdb_partition_attribute_for_upsample]).multi_index[
                    region_start:region_end - 1, dataset_indices]
        upsample_vals = np.sum(attr_vals[tdb_partition_attribute_for_upsample],
                               axis=1)
    if tdb_ambig_attribute is not None:
        cur_upsampled_indices = region_start + np.argwhere(
            (upsample_vals >= tdb_partition_thresh_for_upsample)
            & (ambig_attr_vals == 0))
    else:
        cur_upsampled_indices = region_start + np.argwhere(
            upsample_vals >= tdb_partition_thresh_for_upsample)
    print("finished indices to upsample in range:" + str(region_start) + "-" +
          str(region_end))
    return cur_upsampled_indices
Example #17
def test_ingest_csv_sparse_array_null_replace(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    """
    From a CSV file containing NaNs, produce a sparse array using ingest_csv()
    where the NaNs are replaced with the value given by fillna.
    """
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment_nulls"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        fillna=123,
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )

    time.sleep(10)

    with tiledb.SparseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])

        assert_array_equal(data["a"], np.array([1, 1, 1]))
        assert_array_equal(data["b"], np.array([2, 2, 123]))
        assert_array_equal(data["c"], np.array([3, 123, 123]))
Example #18
    def test_sparse_schema(self):
        ctx = tiledb.Ctx()

        # create dimensions
        d1 = tiledb.Dim(ctx, "", domain=(1, 1000), tile=10, dtype="uint64")
        d2 = tiledb.Dim(ctx,
                        "d2",
                        domain=(101, 10000),
                        tile=100,
                        dtype="uint64")

        # create domain
        domain = tiledb.Domain(ctx, d1, d2)

        # create attributes
        a1 = tiledb.Attr(ctx, "", dtype="int32,int32,int32")
        a2 = tiledb.Attr(ctx, "a2", compressor=("gzip", -1), dtype="float32")

        # create sparse array with schema
        schema = tiledb.SparseArray(ctx,
                                    self.path("sparse_array_schema"),
                                    domain=domain,
                                    attrs=(a1, a2),
                                    capacity=10,
                                    cell_order='col-major',
                                    tile_order='row-major',
                                    coords_compressor=('zstd', 4),
                                    offsets_compressor=('blosc-lz', 5))
        self.assertEqual(schema.capacity, 10)
        self.assertEqual(schema.cell_order, "col-major")
        self.assertEqual(schema.tile_order, "row-major")
        self.assertEqual(schema.coords_compressor, ('zstd', 4))
        self.assertEqual(schema.offsets_compressor, ('blosc-lz', 5))
Example #19
def test_store_tiledb_execution(setup):
    ctx = tiledb.Ctx()

    tempdir = tempfile.mkdtemp()
    try:
        # store TileDB dense array
        expected = np.random.rand(8, 4, 3)
        a = tensor(expected, chunk_size=(3, 3, 2))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(expected, arr.read_direct())
    finally:
        shutil.rmtree(tempdir)

    tempdir = tempfile.mkdtemp()
    try:
        # store tensor with 1 chunk to TileDB dense array
        a = arange(12)
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(np.arange(12), arr.read_direct())
    finally:
        shutil.rmtree(tempdir)

    tempdir = tempfile.mkdtemp()
    try:
        # store 2-d TileDB sparse array
        expected = sps.random(8, 7, density=0.1)
        a = tensor(expected, chunk_size=(3, 5))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.SparseArray(uri=tempdir, ctx=ctx) as arr:
            data = arr[:, :]
            coords = data['coords']
            value = data[arr.attr(0).name]
            ij = tuple(coords[arr.domain.dim(k).name] for k in range(arr.ndim))
            result = sps.coo_matrix((value, ij), shape=arr.shape)

            np.testing.assert_allclose(expected.toarray(), result.toarray())
    finally:
        shutil.rmtree(tempdir)

    tempdir = tempfile.mkdtemp()
    try:
        # store a Fortran-ordered (col-major) TileDB dense array
        expected = np.asfortranarray(np.random.rand(8, 4, 3))
        a = tensor(expected, chunk_size=(3, 3, 2))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()

        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(expected, arr.read_direct())
            assert arr.schema.cell_order == 'col-major'
    finally:
        shutil.rmtree(tempdir)
Example #20
def main():
    # Create TileDB context
    ctx = tiledb.Ctx()

    # KV objects are limited to storing string keys/values for the time being.
    # Note: the KV API shown here existed only in early TileDB releases and
    # has since been removed.
    a1 = tiledb.Attr(ctx, "value", compressor=("gzip", -1), dtype=bytes)
    kv = tiledb.KV(ctx, "my_kv", attrs=(a1, ))

    # Dump the KV schema
    kv.dump()

    # Update the KV with some key-value pairs
    vals = {"key1": "a", "key2": "bb", "key3": "dddd"}
    print("Updating KV with values: {!r}\n".format(vals))
    kv.update(vals)

    # Get kv item
    print("KV value for 'key3': {}\n".format(kv['key3']))

    try:
        kv["don't exist"]
    except KeyError:
        print("KeyError was raised for key 'don't exist'\n")

    # Set kv item
    kv['key3'] = "eeeee"
    print("Updated KV value for 'key3': {}\n".format(kv['key3']))

    # Consolidate kv updates
    kv.consolidate()

    # Convert kv to Python dict
    kv_dict = dict(kv)
    print("Convert to Python dict: {!r}\n".format(kv_dict))
Example #21
    def testStoreTileDB(self):
        ctx = tiledb.Ctx()
        tempdir = tempfile.mkdtemp()
        try:
            t = random.rand(50, 30, chunk_size=13)
            t2 = t + 1

            saved = totiledb(tempdir, t2)
            self.assertEqual(saved.shape, (0, 0))
            self.assertIsNone(saved.op.tiledb_config)
            self.assertEqual(saved.op.tiledb_uri, tempdir)

            with self.assertRaises(tiledb.TileDBError):
                tiledb.DenseArray(ctx=ctx, uri=tempdir)

            # tiledb array is created in the tile
            saved.tiles()

            # no error
            tiledb.DenseArray(ctx=ctx, uri=tempdir)

            self.assertEqual(saved.chunks[0].op.axis_offsets, (0, 0))
            self.assertEqual(saved.chunks[1].op.axis_offsets, (0, 13))
            self.assertEqual(saved.cix[0, 2].op.axis_offsets, (0, 26))
            self.assertEqual(saved.cix[1, 2].op.axis_offsets, (13, 26))
            self.assertEqual(saved.cix[3, 2].op.axis_offsets, (39, 26))

            with self.assertRaises(ValueError):
                t3 = random.rand(30, 50)
                totiledb(tempdir, t3, ctx=ctx)  # shape incompatible
        finally:
            shutil.rmtree(tempdir)
Example #22
def write_cxg(
    adata, container, title, var_names=None, obs_names=None, about=None, extract_colors=False, sparse_threshold=5.0
):
    if not adata.var.index.is_unique:
        raise ValueError("Variable index is not unique - unable to convert.")
    if not adata.obs.index.is_unique:
        raise ValueError("Observation index is not unique - unable to convert.")

    """
    TileDB bug TileDB-Inc/TileDB#1575 requires that we sanitize all column names
    prior to saving.  This can be reverted when the bug is fixed.
    """
    log(0, "Warning: sanitizing all dataframe column names.")
    clean_all_column_names(adata)

    ctx = tiledb.Ctx(
        {
            "sm.num_reader_threads": 32,
            "sm.num_writer_threads": 32,
            "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
        }
    )

    tiledb.group_create(container, ctx=ctx)
    log(1, f"\t...group created, with name {container}")

    # dataset metadata
    metadata_dict = dict(cxg_version=CXG_VERSION, cxg_properties=json.dumps({"title": title, "about": about}))
    if extract_colors:
        try:
            metadata_dict["cxg_category_colors"] = json.dumps(
                convert_anndata_category_colors_to_cxg_category_colors(adata)
            )
        except ColorFormatException:
            log(
                0,
                "Warning: failed to extract colors from h5ad file! "
                "Fix the h5ad file or rerun with --disable-custom-colors. See help for details.",
            )
    save_metadata(container, metadata_dict)
    log(1, "\t...dataset metadata saved")

    # var/gene dataframe
    save_dataframe(container, "var", adata.var, var_names, ctx=ctx)
    log(1, "\t...var dataframe created")

    # obs/cell dataframe
    save_dataframe(container, "obs", adata.obs, obs_names, ctx=ctx)
    log(1, "\t...obs dataframe created")

    # embeddings
    e_container = f"{container}/emb"
    tiledb.group_create(e_container, ctx=ctx)
    save_embeddings(e_container, adata, ctx)
    log(1, "\t...embeddings created")

    # X matrix
    save_X(container, adata.X, ctx, sparse_threshold)
    log(1, "\t...X created")
Example #23
    def __call__(self, coords):
        '''
        coords is a list of named tuples: .chrom, .start, .end, .isplusstrand.
        Returns a numpy array of values associated with the coordinates.
        '''
        assert len(coords) > 0
        self.ctx = tiledb.Ctx()
        return self.call_function(coords)
Example #24
def read_array():
    ctx = tiledb.Ctx()
    with tiledb.SparseArray(ctx, array_name, mode='r') as A:
        data = A[1:11]
        a_vals = data["a"]
        for i, coord in enumerate(data["coords"]):
            print("Cell (%d, %d) has data %d" %
                  (coord[0], coord[1], a_vals[i]))
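
The "coords" key in the result dict comes from an older tiledb-py; newer releases return one coordinate array per dimension, keyed by dimension name. A sketch assuming dimensions named "x" and "y":

def read_array_modern():
    with tiledb.open(array_name, mode="r") as A:
        data = A[1:11]
        for x, y, a in zip(data["x"], data["y"], data["a"]):
            print("Cell (%d, %d) has data %d" % (x, y, a))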
Example #25
def get_region_counts(inputs):
    # Unpack the work item; inputs[1] is the TileDB array URI, opened below.
    start_index = inputs[0][0]
    end_index = inputs[0][1]
    ambig_attribute = inputs[3]
    label_attribute = inputs[2]
    task_index = inputs[4]
    upsample_thresh = inputs[5]
    upsample_attribute = inputs[6]
    flank = inputs[7]
    sample_size = inputs[8]
    counts = []
    print("ambig_attribute:" + str(ambig_attribute))
    print("label_attribute:" + str(label_attribute))
    print("upsample_attribute:" + str(upsample_attribute))
    with tiledb.open(inputs[1], mode='r',
                     ctx=tiledb.Ctx(get_default_config())) as array:
        if ambig_attribute is not None:
            print("starting query for ambig vals:" + str(start_index) + ":" +
                  str(end_index))
            ambig_vals = array.query(
                attrs=[ambig_attribute])[start_index:end_index - 1,
                                         task_index][ambig_attribute]
        if upsample_attribute is not None:
            print("starting query for upsample vals:" + str(start_index) +
                  ":" + str(end_index))
            upsample_vals = array.query(
                attrs=[upsample_attribute])[start_index:end_index - 1,
                                            task_index][upsample_attribute]
        print("starting query for label vals:" + str(start_index) + ":" +
              str(end_index))
        label_vals = array.query(
            attrs=[label_attribute])[start_index:end_index - 1,
                                     task_index][label_attribute]
        print("completed queries")
        if (ambig_attribute is not None) and (upsample_attribute is not None):
            indices_for_training = np.where(
                np.logical_and(ambig_vals == 0,
                               upsample_vals >= upsample_thresh))[0]
        elif (upsample_attribute is not None):
            indices_for_training = np.where(
                upsample_vals >= upsample_thresh)[0]
        elif (ambig_attribute is not None):
            non_ambig_indices = np.where(ambig_vals == 0)[0]
            np.random.seed(1234)
            indices_for_training = np.random.choice(non_ambig_indices,
                                                    sample_size)
        else:
            np.random.seed(1234)
            indices_for_training = np.random.choice(
                np.arange(label_vals.shape[0]), sample_size)
    print("got indices for region")
    for index in indices_for_training:
        try:
            counts.append(np.sum(label_vals[index - flank:index + flank]))
        except Exception:
            # ran off the array edge
            continue
    return counts
Example #26
def get_tiledb_predict_generator(args):
    global test_generator
    if args.upsample_ratio_list_predict is not None:
        upsample_ratio_predict = args.upsample_ratio_list_predict[0]
        print(
            "warning! only a single ratio for upsampling supported for tiledb as of now"
        )
    else:
        upsample_ratio_predict = None
    import tiledb
    tdb_config = get_default_config()
    tdb_ctx = tiledb.Ctx(config=tdb_config)
    # You can specify at most one of args.fold and args.predict_chroms.
    assert (args.fold is None) or (args.predict_chroms is None)
    if args.fold is None:
        predict_chroms = args.predict_chroms
    else:
        predict_chroms = get_chroms(args, split='test')
    test_generator = TiledbPredictGenerator(
        ref_fasta=args.ref_fasta,
        batch_size=args.batch_size,
        tdb_array=args.tdb_array,
        tdb_partition_attribute_for_upsample=args.tdb_partition_attribute_for_upsample,
        tdb_partition_thresh_for_upsample=args.tdb_partition_thresh_for_upsample,
        tdb_partition_datasets_for_upsample=args.tdb_partition_datasets_for_upsample,
        upsample_ratio=upsample_ratio_predict,
        num_threads=args.upsample_threads,
        tdb_ambig_attribute=args.tdb_ambig_attribute,
        tdb_input_source_attribute=args.tdb_input_source_attribute,
        tdb_input_flank=args.tdb_input_flank,
        tdb_input_min=args.tdb_input_min,
        tdb_input_max=args.tdb_input_max,
        tdb_output_source_attribute=args.tdb_output_source_attribute,
        tdb_output_flank=args.tdb_output_flank,
        tdb_output_min=args.tdb_output_min,
        tdb_output_max=args.tdb_output_max,
        num_inputs=args.num_inputs,
        num_outputs=args.num_outputs,
        tdb_input_aggregation=args.tdb_input_aggregation,
        tdb_input_transformation=args.tdb_input_transformation,
        pseudocount=args.tdb_transformation_pseudocount,
        tdb_output_aggregation=args.tdb_output_aggregation,
        tdb_output_transformation=args.tdb_output_transformation,
        tiledb_stride=args.tiledb_stride,
        chrom_sizes=args.chrom_sizes,
        chroms=predict_chroms,
        tdb_input_datasets=args.tdb_input_datasets,
        tdb_output_datasets=args.tdb_output_datasets,
        tdb_config=tdb_config,
        tdb_ctx=tdb_ctx,
        bed_regions=args.bed_regions,
        bed_regions_center=args.bed_regions_center,
        add_revcomp=args.revcomp)
    print("created TiledbPredictGenerator")
    return test_generator
Example #27
def test_ingest_csv_dense_array_append(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        mode="ingest",
        full_domain=True,
        sparse=False,
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )

    time.sleep(10)

    with tiledb.DenseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])
        number_of_rows = data.shape[0]
        assert number_of_rows == 20

    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, array_name),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        mode="append",
        row_start_idx=number_of_rows,
        name=udf_uri,  # unittest/test_ingest_csv --> TileDB-Inc/ingest_csv
    )

    time.sleep(10)

    with tiledb.DenseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])

        for col, attribute in enumerate(("a", "b", "c"), 1):
            assert_array_equal(
                data[attribute],
                np.array([row * 10 + col for row in range(1, 21)] * 2),
            )
Example #28
def read_array_s3(rest_address, array_uri, token):
    config = tiledb.Config()
    config["rest.token"] = token
    config["rest.server_address"] = rest_address
    config["vfs.s3.region"] = "eu-central-1"

    ctx = tiledb.Ctx(config)
    with tiledb.SparseArray(array_uri, ctx=ctx) as A:
        print(A[:]["title"])
Example #29
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="input cxg directory")
    parser.add_argument("output", help="output cxg directory")
    parser.add_argument("--overwrite",
                        action="store_true",
                        help="replace output cxg directory")
    parser.add_argument("--verbose",
                        "-v",
                        action="count",
                        default=0,
                        help="verbose output")
    parser.add_argument(
        "--sparse-threshold",
        "-s",
        type=float,
        default=5.0,  # default is 5% non-zero values
        help="The X array will be sparse if the percent of non-zeros falls below this value",
    )
    args = parser.parse_args()

    if os.path.exists(args.output):
        print("output dir exists:", args.output)
        if args.overwrite:
            print("output dir removed:", args.output)
            shutil.rmtree(args.output)
        else:
            print("use the overwrite option to remove the output directory")
            sys.exit(1)

    if not os.path.isdir(args.input):
        print("input is not a directory", args.input)
        sys.exit(1)

    shutil.copytree(args.input,
                    args.output,
                    ignore=shutil.ignore_patterns("X", "X_col_shift"))

    ctx = tiledb.Ctx({
        "sm.num_reader_threads": 32,
        "sm.num_writer_threads": 32,
        "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
    })

    with tiledb.DenseArray(os.path.join(args.input, "X"), mode="r",
                           ctx=ctx) as X_in:
        is_sparse = cxgtool.save_X(args.output,
                                   X_in,
                                   ctx,
                                   args.sparse_threshold,
                                   expect_sparse=True)

    if is_sparse is False:
        print("The array is not sparse, cleaning up, abort.")
        shutil.rmtree(args.output)
        sys.exit(1)
Example #30
    def testFromTileDB(self):
        ctx = tiledb.Ctx()

        for sparse in (True, False):
            dom = tiledb.Domain(
                tiledb.Dim(ctx=ctx, name="i", domain=(1, 30), tile=7, dtype=np.int32),
                tiledb.Dim(ctx=ctx, name="j", domain=(1, 20), tile=3, dtype=np.int32),
                tiledb.Dim(ctx=ctx, name="k", domain=(1, 10), tile=4, dtype=np.int32),
                ctx=ctx,
            )
            schema = tiledb.ArraySchema(ctx=ctx, domain=dom, sparse=sparse,
                                        attrs=[tiledb.Attr(ctx=ctx, name='a', dtype=np.float32)])

            tempdir = tempfile.mkdtemp()
            try:
                # create tiledb array
                array_type = tiledb.DenseArray if not sparse else tiledb.SparseArray
                array_type.create(tempdir, schema)

                tensor = fromtiledb(tempdir)
                self.assertIsInstance(tensor.op, TensorTileDBDataSource)
                self.assertEqual(tensor.op.issparse(), sparse)
                self.assertEqual(tensor.shape, (30, 20, 10))
                self.assertEqual(tensor.extra_params.raw_chunk_size, (7, 3, 4))
                self.assertIsNone(tensor.op.tiledb_config)
                self.assertEqual(tensor.op.tiledb_uri, tempdir)
                self.assertIsNone(tensor.op.tiledb_key)
                self.assertIsNone(tensor.op.tiledb_timestamp)

                tensor = tensor.tiles()

                self.assertEqual(len(tensor.chunks), 105)
                self.assertIsInstance(tensor.chunks[0].op, TensorTileDBDataSource)
                self.assertEqual(tensor.chunks[0].op.issparse(), sparse)
                self.assertEqual(tensor.chunks[0].shape, (7, 3, 4))
                self.assertIsNone(tensor.chunks[0].op.tiledb_config)
                self.assertEqual(tensor.chunks[0].op.tiledb_uri, tempdir)
                self.assertIsNone(tensor.chunks[0].op.tiledb_key)
                self.assertIsNone(tensor.chunks[0].op.tiledb_timestamp)
                self.assertEqual(tensor.chunks[0].op.tiledb_dim_starts, (1, 1, 1))

                # test axis_offsets of chunk op
                self.assertEqual(tensor.chunks[0].op.axis_offsets, (0, 0, 0))
                self.assertEqual(tensor.chunks[1].op.axis_offsets, (0, 0, 4))
                self.assertEqual(tensor.cix[0, 2, 2].op.axis_offsets, (0, 6, 8))
                self.assertEqual(tensor.cix[0, 6, 2].op.axis_offsets, (0, 18, 8))
                self.assertEqual(tensor.cix[4, 6, 2].op.axis_offsets, (28, 18, 8))

                tensor2 = fromtiledb(tempdir, ctx=ctx)
                self.assertEqual(tensor2.op.tiledb_config, ctx.config().dict())

                tensor2 = tensor2.tiles()

                self.assertEqual(tensor2.chunks[0].op.tiledb_config, ctx.config().dict())
            finally:
                shutil.rmtree(tempdir)