def consolidate_fragments(
    uri,
    amplification,
    buffer_size,
    step_max_frags,
    step_min_frags,
    step_size_ratio,
    steps,
    vacuum,
):
    """
    Consolidate the fragments of the TileDB array located at ``uri``.

    The parameters map one-to-one onto the ``sm.consolidation.*`` TileDB
    config options. If ``vacuum`` is truthy, consolidated fragments are
    also vacuumed afterwards.
    """
    config = tiledb.Config()
    config["sm.consolidation.mode"] = "fragments"
    config["sm.consolidation.amplification"] = amplification
    config["sm.consolidation.buffer_size"] = buffer_size
    config["sm.consolidation.step_max_frags"] = step_max_frags
    config["sm.consolidation.step_min_frags"] = step_min_frags
    config["sm.consolidation.step_size_ratio"] = step_size_ratio
    config["sm.consolidation.steps"] = steps
    ctx = tiledb.Ctx(config)
    tiledb.consolidate(uri, ctx=ctx)
    # Leftover debug prints (`print(vacuum)` / `print("here?")`) removed.
    if vacuum:
        config = tiledb.Config({"sm.vacuum.mode": "fragments"})
        tiledb.vacuum(uri, ctx=tiledb.Ctx(config))
def test_tiledb_test():
    """Repeated identical sparse reads must return the same number of cells."""
    import tiledb

    dim_n = 1000
    dim_m = 1000
    num_vals = 1000
    rows = np.sort(np.random.choice(dim_n, num_vals, replace=False))
    cols = np.sort(np.random.choice(dim_m, num_vals, replace=False))
    cell_data = np.random.randint(0, 100, num_vals, np.uint8)

    ctx = tiledb.Ctx()
    row_dim = tiledb.Dim(
        "ndom", domain=(0, dim_n - 1), tile=min(100, dim_n), dtype="uint32", ctx=ctx
    )
    col_dim = tiledb.Dim(
        "mdom", domain=(0, dim_m - 1), tile=dim_m, dtype="uint32", ctx=ctx
    )
    value_attr = tiledb.Attr(
        "v",
        filters=tiledb.FilterList([tiledb.LZ4Filter(level=-1)]),
        dtype="uint8",
        ctx=ctx,
    )
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(row_dim, col_dim, ctx=ctx),
        attrs=(value_attr,),
        capacity=10000,
        cell_order="row-major",
        tile_order="row-major",
        sparse=True,
        ctx=ctx,
    )

    with tempfile.TemporaryDirectory() as tdir:
        path = os.path.join(tdir, "arr.tiledb")
        tiledb.SparseArray.create(path, schema)
        with tiledb.SparseArray(path, mode="w", ctx=ctx) as writer:
            writer[rows, cols] = cell_data

        read_ctx = tiledb.Ctx()
        reader = tiledb.SparseArray(path, mode="r", ctx=read_ctx)
        first = reader[1:10, 1:50]
        # A full read in between should not perturb the later slice query.
        _ = reader[:, :]
        second = reader[1:10, 1:50]
        assert first["v"].shape[0] == second["v"].shape[0]
def write_array(args, updating, chunks_to_process):
    """
    Writer loop: consume processed chunks from the global ``write_queue`` and
    write them into the TileDB dense array ``args.array_name``.

    :param args: namespace carrying array_name, attribute_config and
        attribute_config_file
    :param updating: truthy when patching attributes of an existing array
        (current values are read and merged first); falsy for a first write
    :param chunks_to_process: number of queue items to consume before closing
    """
    try:
        # config
        tdb_Config = tiledb.Config(tdb_config_params)
        tdb_write_Context = tiledb.Ctx(config=tdb_Config)
        if updating:
            # Separate read context so current cell values can be fetched and
            # merged with the incoming partial updates.
            tdb_read_Context = tiledb.Ctx(config=tdb_Config)
            cur_array_toread = tiledb.DenseArray(args.array_name, ctx=tdb_read_Context, mode='r')
        cur_array_towrite = tiledb.DenseArray(args.array_name, ctx=tdb_write_Context, mode='w')
        chunks_processed = 0
        while chunks_processed < chunks_to_process:
            # Poll until a producer enqueues a chunk.
            while write_queue.empty():
                time.sleep(10)
            processed_chunk = write_queue.get()
            processed_chunk_unpickled = pickle.loads(processed_chunk)
            task_index = processed_chunk_unpickled[0]
            start_index = processed_chunk_unpickled[1]
            end_index = processed_chunk_unpickled[2]
            dict_to_write = processed_chunk_unpickled[3]
            if updating:
                # We are only updating some attributes in the array: overlay
                # the new attribute data on the current cell values.
                cur_vals = cur_array_toread[start_index:end_index, task_index]
                for key in dict_to_write:
                    cur_vals[key] = dict_to_write[key]
                dict_to_write = cur_vals
                print("updated data dict for writing:" + args.array_name)
            else:
                # First-time write: every attribute must be present; fill any
                # missing attribute with a NaN array spanning the chunk.
                required_attrib = list(get_attribute_info(args.attribute_config, args.attribute_config_file).keys())
                for attrib in required_attrib:
                    if attrib not in dict_to_write:
                        print("augmenting")
                        dict_to_write[attrib] = np.full(end_index - start_index, np.nan)
            # write in chunks
            cur_array_towrite[start_index:end_index, task_index] = dict_to_write
            print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
            gc.collect()
            chunks_processed += 1
            print("wrote to disk " + str(task_index) + " for " + str(start_index) + ":" + str(end_index) + ";" + str(chunks_processed) + "/" + str(chunks_to_process))
        assert chunks_processed >= chunks_to_process
        print("closing arrays")
        if updating:
            cur_array_toread.close()
        cur_array_towrite.close()
        return
    except KeyboardInterrupt:
        kill_child_processes(os.getpid())
        # try to delete all tmp files
        raise
    except Exception as e:
        print(e)
        kill_child_processes(os.getpid())
        # BUG FIX: the original `raise Exception(e.message)` fails in Python 3
        # (exceptions have no `.message`), replacing the real error with an
        # AttributeError. Re-raise the original exception instead.
        raise
def test_ingest_csv_sparse_array_apppend_header_mismatch(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    """
    Create a sparse array from a CSV file using ingest_csv() in the default
    ingest mode and then append additional data to it using the append mode.
    The appended data contains header names that do not match the data in the
    sparse array and must be renamed.
    """
    # First ingest: create the sparse array from a CSV stored on S3.
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment_sparse1"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        mode="ingest",
        full_domain=True,
        index_col=("x"),  # NOTE(review): ("x") is the string "x", not a tuple
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )
    time.sleep(10)  # allow the cloud-side ingest to finish before reading back
    with tiledb.SparseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])
        number_of_rows = data.shape[0]
        assert number_of_rows == 20
    # Append: the CSV's mismatched headers are replaced via header=0 + names=...
    # so the columns line up with the array's existing attributes.
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment_sparse2_mismatch"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        mode="append",
        full_domain=True,
        index_col=("x"),
        header=0,
        names=["x", "c", "b", "a"],
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )
    time.sleep(10)
    with tiledb.SparseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])
        # After the append each attribute holds two copies of row*10+col.
        for col, attribute in enumerate(("a", "b", "c"), 1):
            assert_array_equal(
                data[attribute],
                np.array([row * 10 + col for row in range(1, 21)] * 2),
            )
def consolidate_array_metadata(uri, vacuum):
    """
    Consolidate (and optionally vacuum) the array metadata of the array at uri.
    """
    consolidate_ctx = tiledb.Ctx(tiledb.Config({"sm.consolidation.mode": "array_meta"}))
    tiledb.consolidate(uri, ctx=consolidate_ctx)
    if vacuum:
        vacuum_ctx = tiledb.Ctx(tiledb.Config({"sm.vacuum.mode": "array_meta"}))
        tiledb.vacuum(uri, ctx=vacuum_ctx)
def main():
    """Create "my_sparse_array" via the legacy positional-ctx TileDB-Py API."""
    ctx = tiledb.Ctx()

    # Dimensions and domain: a 4x4 grid with 2x2 space tiles.
    dim1 = tiledb.Dim(ctx, "d1", domain=(1, 4), tile=2, dtype="uint64")
    dim2 = tiledb.Dim(ctx, "d2", domain=(1, 4), tile=2, dtype="uint64")
    dom = tiledb.Domain(ctx, dim1, dim2)

    # Attributes with per-attribute compressors.
    attr1 = tiledb.Attr(ctx, "a1", compressor=("blosc-lz", -1), dtype="int32")
    attr2 = tiledb.Attr(ctx, "a2", compressor=("gzip", -1), dtype="S10")
    attr3 = tiledb.Attr(ctx, "a3", compressor=("zstd", -1), dtype="float32,float32")

    # Create the sparse array.
    tiledb.SparseArray(
        ctx,
        "my_sparse_array",
        domain=dom,
        attrs=(attr1, attr2, attr3),
        capacity=2,
        cell_order="row-major",
        tile_order="row-major",
    )
def test_walk_group(self):
    """Walk the group tree in pre- and post-order and verify the visit order."""
    ctx = tiledb.Ctx()
    groups = []

    def collect(path, obj):
        groups.append((path, obj))

    tiledb.walk(ctx, self.path(""), collect, order="preorder")
    groups.sort()
    # Sorted pre-order visits group1..group4 in ascending path order.
    for i, expected in enumerate((self.group1, self.group2, self.group3, self.group4)):
        self.assertTrue(groups[i][0].endswith(expected) and groups[i][1] == "group")

    groups = []
    tiledb.walk(ctx, self.path(""), collect, order="postorder")
    # Post-order visits children before their parents.
    for i, expected in enumerate((self.group2, self.group4, self.group3, self.group1)):
        self.assertTrue(groups[i][0].endswith(expected) and groups[i][1] == "group")
def totiledb(uri, x, ctx=None, key=None, timestamp=None):
    """Build a store op that writes tensor ``x`` into the TileDB array at ``uri``."""
    import tiledb

    x = astensor(x)
    user_ctx = ctx
    if user_ctx is None:
        ctx = tiledb.Ctx()
    array_cls = tiledb.SparseArray if x.issparse() else tiledb.DenseArray
    try:
        existing = array_cls(uri=uri, key=key, timestamp=timestamp, ctx=ctx)
        # Array already exists: its shape and dtype must match the tensor.
        check_tiledb_array_with_tensor(x, existing)
    except tiledb.TileDBError:
        # Array does not exist yet; it will be created at tile time, once the
        # tensor's chunk layout is known.
        pass
    tiledb_config = None if user_ctx is None else user_ctx.config().dict()
    store_op = TensorTileDBDataStore(
        tiledb_config=tiledb_config,
        tiledb_uri=uri,
        tiledb_key=key,
        tiledb_timestamp=timestamp,
        dtype=x.dtype,
        sparse=x.issparse(),
    )
    return store_op(x)
def test_ingest_csv_dense_array(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    """
    Create a dense array from a CSV file using ingest_csv().
    """
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        sparse=False,
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )
    time.sleep(10)  # allow the cloud-side ingest to finish before reading back
    with tiledb.DenseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])
        # Attribute `col` of row `row` must hold row*10+col for rows 1..20.
        for col, attribute in enumerate(("a", "b", "c"), 1):
            assert_array_equal(
                data[attribute],
                np.array([row * 10 + col for row in range(1, 21)]),
            )
def main():
    """Show that creating an already-existing group raises TileDBError."""
    ctx = tiledb.Ctx()
    try:
        tiledb.group_create(ctx, "mygroup")
        # The second creation of the same group must fail.
        tiledb.group_create(ctx, "mygroup")
    except tiledb.TileDBError as exc:
        print("TileDB exception: {!r}".format(exc))
def test_remove_group(self):
    """Removing a group also removes the groups nested inside it."""
    ctx = tiledb.Ctx()
    tiledb.remove(ctx, self.group3)
    # group4 lives inside group3, so both must be gone.
    for removed in (self.group3, self.group4):
        self.assertFalse(self.is_group(ctx, removed))
def testCheckTileDB(self):
    # check_tiledb_array_with_tensor() must reject tensors whose ndim, shape
    # or dtype disagree with an existing TileDB array, and accept a match.
    ctx = tiledb.Ctx()
    tempdir = tempfile.mkdtemp()
    try:
        np_a = np.random.rand(2, 3)
        tiledb_a = tiledb.DenseArray.from_numpy(ctx=ctx, uri=tempdir, array=np_a)

        with self.assertRaises(ValueError):
            # ndim not match
            check_tiledb_array_with_tensor(random.rand(2, 3, 4), tiledb_a)

        with self.assertRaises(ValueError):
            # shape not match
            check_tiledb_array_with_tensor(random.rand(2, 4), tiledb_a)

        with self.assertRaises(ValueError):
            # dtype not match (array is float64 from np.random.rand)
            check_tiledb_array_with_tensor(
                random.rand(2, 3, dtype=np.float32), tiledb_a)

        # legal
        check_tiledb_array_with_tensor(random.rand(2, 3), tiledb_a)
    finally:
        shutil.rmtree(tempdir)
def main():
    # Read back "my_dense_array" and pretty-print its attribute values.
    # Create TileDB context
    ctx = tiledb.Ctx()

    # Load the dense array schema
    dense_example = tiledb.DenseArray.load(ctx, "my_dense_array")

    # Retrieve and print the non-empty domain
    nonempty = dense_example.nonempty_domain()
    print("Non-empty domain:")
    for i in range(dense_example.ndim):
        print("{0!s}: {1!r}".format(
            dense_example.domain.dim(i).name, nonempty[i]))

    # Read the entire array. `result` is a dict of numpy arrays
    result = dense_example[:]

    # Print the results
    result_num = result["a1"].size
    print("\nResult num: ", result_num)
    print()
    print("{:<5s}{:<10s}{:<10s}{:<10s}".format("a1", "a2", "a3[0]", "a3[1]"))
    print("------------------------------")
    # a3 holds two components per cell, hence the [0]/[1] columns.
    for i in range(4):
        for j in range(4):
            print("{:<5d}{:<10s}{:<10.1f}{:<10.1f}".format(
                result["a1"][i, j], result["a2"][i, j],
                result["a3"][i, j][0], result["a3"][i, j][1]))
    print()
def test_dim_start_float():
    """fromtiledb() must reject arrays whose dimensions have float domains."""
    ctx = tiledb.Ctx()
    float_dim = tiledb.Dim(ctx=ctx, name="i", domain=(0.0, 6.0), tile=6, dtype=np.float64)
    schema = tiledb.ArraySchema(
        ctx=ctx,
        domain=tiledb.Domain(float_dim, ctx=ctx),
        sparse=True,
        attrs=[tiledb.Attr(ctx=ctx, name='a', dtype=np.float32)],
    )
    tempdir = tempfile.mkdtemp()
    try:
        # create tiledb array on disk, then attempt to load it as a tensor
        tiledb.SparseArray.create(tempdir, schema)
        with pytest.raises(ValueError):
            fromtiledb(tempdir, ctx=ctx)
    finally:
        shutil.rmtree(tempdir)
def main():
    """Walk through basic TileDB VFS operations: dirs, files, move, delete."""
    ctx = tiledb.Ctx()
    vfs = tiledb.VFS(ctx)

    # Directory creation (skipped when it already exists)
    if vfs.is_dir("dir_A"):
        print("dir_A already exists")
    else:
        vfs.create_dir("dir_A")
        print("Created dir_A")

    # Empty-file creation
    if vfs.is_file("dir_A/file_A"):
        print("dir_A/file_A already exists")
    else:
        vfs.touch("dir_A/file_A")
        print("Created empty file dir_A/file_A")

    # Getting the file size
    print("File size: {0!s}".format(vfs.file_size("dir_A/file_A")))

    # Moving files (moving directories is similar)
    print("Moving file dir_A/file_A to dir_A/file_B")
    vfs.move("dir_A/file_A", "dir_A/file_B", force=True)

    # Deleting files and directories
    print("Deleting dir_A/file_B and dir_A")
    vfs.remove_file("dir_A/file_B")
    vfs.remove_dir("dir_A")
def get_upsampled_indices_chrom(inputs):
    """
    Return absolute positions (region_start added) in [region_start, region_end)
    whose summed signal across the selected datasets meets the upsampling
    threshold, excluding positions flagged ambiguous when an ambiguity
    attribute is supplied.

    ``inputs`` is a positional tuple:
      0: region_start, 1: region_end, 2: tdb_array_name,
      3: tdb_ambig_attribute (or None), 4: tdb_partition_attribute_for_upsample,
      5: dataset_indices, 6: tdb_partition_thresh_for_upsample
    """
    region_start = inputs[0]
    region_end = inputs[1]
    tdb_array_name = inputs[2]
    tdb_ambig_attribute = inputs[3]
    tdb_partition_attribute_for_upsample = inputs[4]
    dataset_indices = inputs[5]
    tdb_partition_thresh_for_upsample = inputs[6]
    print("starting getting indices to upsample in range:" + str(region_start) + "-" + str(region_end))
    with tiledb.open(tdb_array_name, 'r', ctx=tiledb.Ctx(get_default_config())) as tdb_array:
        if tdb_ambig_attribute is not None:
            # Fetch the partition attribute together with the ambiguity mask.
            attr_vals = tdb_array.query(attrs=[
                tdb_ambig_attribute, tdb_partition_attribute_for_upsample
            ]).multi_index[region_start:region_end - 1, dataset_indices]
            # Sum ambiguity flags across datasets; 0 means unambiguous everywhere.
            ambig_attr_vals = np.sum(attr_vals[tdb_ambig_attribute], axis=1)
        else:
            attr_vals = tdb_array.query(
                attrs=[tdb_partition_attribute_for_upsample]).multi_index[
                    region_start:region_end - 1, dataset_indices]
        upsample_vals = np.sum(attr_vals[tdb_partition_attribute_for_upsample], axis=1)
    if tdb_ambig_attribute is not None:
        cur_upsampled_indices = region_start + np.argwhere(
            (upsample_vals >= tdb_partition_thresh_for_upsample)
            & (ambig_attr_vals == 0))
    else:
        cur_upsampled_indices = region_start + np.argwhere(
            upsample_vals >= tdb_partition_thresh_for_upsample)
    print("finished indices to upsample in range:" + str(region_start) + "-" + str(region_end))
    return cur_upsampled_indices
def test_ingest_csv_sparse_array_null_replace(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    """
    From a CSV file containing NaNs, produce a sparse array using ingest_csv()
    where the NaNs are replaced with the value given by fillna.
    """
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment_nulls"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        fillna=123,  # every NaN in the CSV becomes 123
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )
    time.sleep(10)  # allow the cloud-side ingest to finish before reading back
    with tiledb.SparseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])
        # Per the expectations: "a" has no 123s, "b" one, "c" two.
        assert_array_equal(data["a"], np.array([1, 1, 1]))
        assert_array_equal(data["b"], np.array([2, 2, 123]))
        assert_array_equal(data["c"], np.array([3, 123, 123]))
def test_sparse_schema(self): ctx = tiledb.Ctx() # create dimensions d1 = tiledb.Dim(ctx, "", domain=(1, 1000), tile=10, dtype="uint64") d2 = tiledb.Dim(ctx, "d2", domain=(101, 10000), tile=100, dtype="uint64") # create domain domain = tiledb.Domain(ctx, d1, d2) # create attributes a1 = tiledb.Attr(ctx, "", dtype="int32,int32,int32") a2 = tiledb.Attr(ctx, "a2", compressor=("gzip", -1), dtype="float32") # create sparse array with schema schema = tiledb.SparseArray(ctx, self.path("sparse_array_schema"), domain=domain, attrs=(a1, a2), capacity=10, cell_order='col-major', tile_order='row-major', coords_compressor=('zstd', 4), offsets_compressor=('blosc-lz', 5)) self.assertEqual(schema.capacity, 10) self.assertEqual(schema.cell_order, "col-major") self.assertEqual(schema.tile_order, "row-major") self.assertEqual(schema.coords_compressor, ('zstd', 4)) self.assertEqual(schema.offsets_compressor, ('blosc-lz', 5))
def test_store_tiledb_execution(setup):
    # End-to-end totiledb() execution: dense C-order, single-chunk, sparse,
    # and dense Fortran-order tensors all round-trip through TileDB arrays.
    ctx = tiledb.Ctx()

    tempdir = tempfile.mkdtemp()
    try:
        # store TileDB dense array
        expected = np.random.rand(8, 4, 3)
        a = tensor(expected, chunk_size=(3, 3, 2))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()
        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(expected, arr.read_direct())
    finally:
        shutil.rmtree(tempdir)

    tempdir = tempfile.mkdtemp()
    try:
        # store tensor with 1 chunk to TileDB dense array
        a = arange(12)
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()
        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(np.arange(12), arr.read_direct())
    finally:
        shutil.rmtree(tempdir)

    tempdir = tempfile.mkdtemp()
    try:
        # store 2-d TileDB sparse array
        expected = sps.random(8, 7, density=0.1)
        a = tensor(expected, chunk_size=(3, 5))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()
        with tiledb.SparseArray(uri=tempdir, ctx=ctx) as arr:
            data = arr[:, :]
            coords = data['coords']
            value = data[arr.attr(0).name]
            # rebuild a COO matrix from the stored coordinates and values
            ij = tuple(coords[arr.domain.dim(k).name] for k in range(arr.ndim))
            result = sps.coo_matrix((value, ij), shape=arr.shape)
            np.testing.assert_allclose(expected.toarray(), result.toarray())
    finally:
        shutil.rmtree(tempdir)

    tempdir = tempfile.mkdtemp()
    try:
        # store TileDB dense array (Fortran-ordered input)
        expected = np.asfortranarray(np.random.rand(8, 4, 3))
        a = tensor(expected, chunk_size=(3, 3, 2))
        save = totiledb(tempdir, a, ctx=ctx)
        save.execute()
        with tiledb.DenseArray(uri=tempdir, ctx=ctx) as arr:
            np.testing.assert_allclose(expected, arr.read_direct())
            # Fortran-ordered input is expected to yield a col-major array
            assert arr.schema.cell_order == 'col-major'
    finally:
        shutil.rmtree(tempdir)
def main():
    # Demonstrates the legacy TileDB KV (key-value) API: create, update, read,
    # consolidate, and convert to a Python dict.
    # NOTE(review): tiledb.KV was removed in later TileDB-Py releases; this
    # example only runs against the old API it was written for — confirm.
    # Create TileDB context
    ctx = tiledb.Ctx()

    # KV objects are limited to storing string keys/values for the time being
    a1 = tiledb.Attr(ctx, "value", compressor=("gzip", -1), dtype=bytes)
    kv = tiledb.KV(ctx, "my_kv", attrs=(a1, ))

    # Dump the KV schema
    kv.dump()

    # Update the KV with some key-value pairs
    vals = {"key1": "a", "key2": "bb", "key3": "dddd"}
    print("Updating KV with values: {!r}\n".format(vals))
    kv.update(vals)

    # Get kv item
    print("KV value for 'key3': {}\n".format(kv['key3']))

    # Missing keys raise KeyError, like a dict
    try:
        kv["don't exist"]
    except KeyError:
        print("KeyError was raised for key 'don't exist'\n")

    # Set kv item
    kv['key3'] = "eeeee"
    print("Updated KV value for 'key3': {}\n".format(kv['key3']))

    # Consolidate kv updates
    kv.consolidate()

    # Convert kv to Python dict
    kv_dict = dict(kv)
    print("Convert to Python dict: {!r}\n".format(kv_dict))
def testStoreTileDB(self):
    """totiledb() defers array creation to tile time and assigns chunk offsets."""
    ctx = tiledb.Ctx()
    tempdir = tempfile.mkdtemp()
    try:
        t = random.rand(50, 30, chunk_size=13)
        t2 = t + 1

        saved = totiledb(tempdir, t2)
        self.assertEqual(saved.shape, (0, 0))
        self.assertIsNone(saved.op.tiledb_config)
        # FIX: assertEquals is a deprecated alias; use assertEqual.
        self.assertEqual(saved.op.tiledb_uri, tempdir)

        # The TileDB array must not exist before tiling...
        with self.assertRaises(tiledb.TileDBError):
            tiledb.DenseArray(ctx=ctx, uri=tempdir)

        # tiledb array is created in the tile
        saved.tiles()

        # ...and must open without error afterwards.
        tiledb.DenseArray(ctx=ctx, uri=tempdir)

        # Chunk offsets follow the (13, 13) chunk grid over the (50, 30) tensor.
        self.assertEqual(saved.chunks[0].op.axis_offsets, (0, 0))
        self.assertEqual(saved.chunks[1].op.axis_offsets, (0, 13))
        self.assertEqual(saved.cix[0, 2].op.axis_offsets, (0, 26))
        self.assertEqual(saved.cix[1, 2].op.axis_offsets, (13, 26))
        self.assertEqual(saved.cix[3, 2].op.axis_offsets, (39, 26))

        with self.assertRaises(ValueError):
            t3 = random.rand(30, 50)
            totiledb(tempdir, t3, ctx=ctx)  # shape incompatible
    finally:
        shutil.rmtree(tempdir)
def write_cxg(
    adata, container, title, var_names=None, obs_names=None, about=None, extract_colors=False, sparse_threshold=5.0
):
    # Convert an AnnData object into a CXG (TileDB group) layout on disk:
    # metadata, var/obs dataframes, embeddings, then the X matrix.
    if not adata.var.index.is_unique:
        raise ValueError("Variable index is not unique - unable to convert.")
    if not adata.obs.index.is_unique:
        raise ValueError("Observation index is not unique - unable to convert.")

    """
    TileDB bug TileDB-Inc/TileDB#1575 requires that we sanitize all column names
    prior to saving. This can be reverted when the bug is fixed.
    """
    log(0, "Warning: sanitizing all dataframe column names.")
    clean_all_column_names(adata)

    # Context tuned for bulk writes (32 reader/writer threads, 1 GiB buffer).
    ctx = tiledb.Ctx(
        {
            "sm.num_reader_threads": 32,
            "sm.num_writer_threads": 32,
            "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
        }
    )

    tiledb.group_create(container, ctx=ctx)
    log(1, f"\t...group created, with name {container}")

    # dataset metadata
    metadata_dict = dict(cxg_version=CXG_VERSION, cxg_properties=json.dumps({"title": title, "about": about}))
    if extract_colors:
        try:
            metadata_dict["cxg_category_colors"] = json.dumps(
                convert_anndata_category_colors_to_cxg_category_colors(adata)
            )
        except ColorFormatException:
            # Color extraction is best-effort; warn and continue without colors.
            log(
                0,
                "Warning: failed to extract colors from h5ad file! "
                "Fix the h5ad file or rerun with --disable-custom-colors. See help for details.",
            )
    save_metadata(container, metadata_dict)
    log(1, "\t...dataset metadata saved")

    # var/gene dataframe
    save_dataframe(container, "var", adata.var, var_names, ctx=ctx)
    log(1, "\t...var dataframe created")

    # obs/cell dataframe
    save_dataframe(container, "obs", adata.obs, obs_names, ctx=ctx)
    log(1, "\t...obs dataframe created")

    # embeddings
    e_container = f"{container}/emb"
    tiledb.group_create(e_container, ctx=ctx)
    save_embeddings(e_container, adata, ctx)
    log(1, "\t...embeddings created")

    # X matrix (stored sparse when non-zero fraction is under sparse_threshold)
    save_X(container, adata.X, ctx, sparse_threshold)
    log(1, "\t...X created")
def __call__(self, coords):
    """
    Look up values for genomic coordinates.

    coords: non-empty list of named tuples with fields
        .chrom, .start, .end, .isplusstrand
    Returns an nparray of values associated with the coordinates.
    """
    assert len(coords) > 0
    # A fresh context per call; call_function performs the actual lookup.
    self.ctx = tiledb.Ctx()
    return self.call_function(coords)
def read_array():
    """Read cells [1:11] of the sparse array and print coordinate/value pairs."""
    ctx = tiledb.Ctx()
    with tiledb.SparseArray(ctx, array_name, mode='r') as A:
        result = A[1:11]
        values = result["a"]
        for idx, coord in enumerate(result["coords"]):
            print("Cell (%d, %d) has data %d" % (coord[0], coord[1], values[idx]))
def get_region_counts(inputs):
    """
    Sample training indices within [start_index, end_index) from a TileDB array
    and return a list of summed label values in a +/- flank window around each
    sampled index.

    ``inputs`` is a positional tuple:
      0: (start_index, end_index), 1: TileDB array uri, 2: label attribute,
      3: ambiguity attribute (or None), 4: task (column) index,
      5: upsample threshold, 6: upsample attribute (or None),
      7: flank size, 8: sample size
    """
    start_index = inputs[0][0]
    end_index = inputs[0][1]
    ambig_attribute = inputs[3]
    label_attribute = inputs[2]
    task_index = inputs[4]
    upsample_thresh = inputs[5]
    upsample_attribute = inputs[6]
    flank = inputs[7]
    sample_size = inputs[8]
    counts = []
    print("ambig_attribute:" + str(ambig_attribute))
    print("label_attribute:" + str(label_attribute))
    print("upsample_attribute:" + str(upsample_attribute))
    with tiledb.open(inputs[1], mode='r', ctx=tiledb.Ctx(get_default_config())) as array:
        if ambig_attribute is not None:
            print("starting query for ambig vals:" + str(start_index) + ":" + str(end_index))
            ambig_vals = array.query(attrs=[ambig_attribute])[start_index:end_index - 1, task_index][ambig_attribute]
        if upsample_attribute is not None:
            print("starting query for upsample vals:" + str(start_index) + ":" + str(end_index))
            upsample_vals = array.query(attrs=[upsample_attribute])[start_index:end_index - 1, task_index][upsample_attribute]
        print("starting query for label vals:" + str(start_index) + ":" + str(end_index))
        label_vals = array.query(attrs=[label_attribute])[start_index:end_index - 1, task_index][label_attribute]
        print("completed queries")
    # Choose indices depending on which optional attributes were supplied.
    if (ambig_attribute is not None) and (upsample_attribute is not None):
        indices_for_training = np.where(np.logical_and(ambig_vals == 0, upsample_vals >= upsample_thresh))[0]
    elif (upsample_attribute is not None):
        indices_for_training = np.where(upsample_vals >= upsample_thresh)[0]
    elif (ambig_attribute is not None):
        non_ambig_indices = np.where(ambig_vals == 0)[0]
        np.random.seed(1234)
        indices_for_training = np.random.choice(non_ambig_indices, sample_size)
    else:
        np.random.seed(1234)
        indices_for_training = np.random.choice(np.arange(label_vals.shape[0]), sample_size)
    print("got indices for region")
    for index in indices_for_training:
        try:
            # NOTE(review): a negative (index - flank) wraps to the end of the
            # array under numpy slicing — confirm window behavior at edges.
            counts.append(np.sum(label_vals[index - flank:index + flank]))
        except Exception:
            # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit. Windows running off the array edge are skipped.
            continue
    return counts
def get_tiledb_predict_generator(args):
    """
    Build and store (in the module-global ``test_generator``) a
    TiledbPredictGenerator configured from the parsed command-line ``args``,
    and return it.
    """
    global test_generator
    if args.upsample_ratio_list_predict is not None:
        # Only the first ratio in the list is honored for TileDB prediction.
        upsample_ratio_predict = args.upsample_ratio_list_predict[0]
        print(
            "warning! only a single ratio for upsampling supported for tiledb as of now"
        )
    else:
        upsample_ratio_predict = None
    import tiledb
    tdb_config = get_default_config()
    tdb_ctx = tiledb.Ctx(config=tdb_config)
    # you can only specify one (or neither) or args.fold or args.predict chroms
    assert (args.fold is None) or (args.predict_chroms is None)
    if args.fold is None:
        # NOTE(review): when neither fold nor predict_chroms is supplied this
        # passes predict_chroms=None through — confirm that is intended.
        predict_chroms = args.predict_chroms
    else:
        predict_chroms = get_chroms(args, split='test')
    test_generator = TiledbPredictGenerator(
        ref_fasta=args.ref_fasta,
        batch_size=args.batch_size,
        tdb_array=args.tdb_array,
        tdb_partition_attribute_for_upsample=args.tdb_partition_attribute_for_upsample,
        tdb_partition_thresh_for_upsample=args.tdb_partition_thresh_for_upsample,
        tdb_partition_datasets_for_upsample=args.tdb_partition_datasets_for_upsample,
        upsample_ratio=upsample_ratio_predict,
        num_threads=args.upsample_threads,
        tdb_ambig_attribute=args.tdb_ambig_attribute,
        tdb_input_source_attribute=args.tdb_input_source_attribute,
        tdb_input_flank=args.tdb_input_flank,
        tdb_input_min=args.tdb_input_min,
        tdb_input_max=args.tdb_input_max,
        tdb_output_source_attribute=args.tdb_output_source_attribute,
        tdb_output_flank=args.tdb_output_flank,
        tdb_output_min=args.tdb_output_min,
        tdb_output_max=args.tdb_output_max,
        num_inputs=args.num_inputs,
        num_outputs=args.num_outputs,
        tdb_input_aggregation=args.tdb_input_aggregation,
        tdb_input_transformation=args.tdb_input_transformation,
        pseudocount=args.tdb_transformation_pseudocount,
        tdb_output_aggregation=args.tdb_output_aggregation,
        tdb_output_transformation=args.tdb_output_transformation,
        tiledb_stride=args.tiledb_stride,
        chrom_sizes=args.chrom_sizes,
        chroms=predict_chroms,
        tdb_input_datasets=args.tdb_input_datasets,
        tdb_output_datasets=args.tdb_output_datasets,
        tdb_config=tdb_config,
        tdb_ctx=tdb_ctx,
        bed_regions=args.bed_regions,
        bed_regions_center=args.bed_regions_center,
        add_revcomp=args.revcomp)
    print("created TiledbPredictGenerator")
    return test_generator
def test_ingest_csv_dense_array_apppend(
    udf_uri, array_name, key, secret, namespace, bucket, config
):
    # Ingest a CSV into a dense array, then append a second batch starting at
    # row_start_idx and verify the data is present twice.
    # NOTE(review): the name carries the original "apppend" typo; renaming
    # would change the test's public identifier.
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, "increment"),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        mode="ingest",
        full_domain=True,
        sparse=False,
        name=udf_uri,  # "unittest/test_ingest_csv" --> TileDB-Inc/ingest_csv
    )
    time.sleep(10)  # allow the cloud-side ingest to finish before reading back
    with tiledb.DenseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])
        number_of_rows = data.shape[0]
        assert number_of_rows == 20
    # NOTE(review): this append names the CSV after array_name, while the first
    # call used "increment" — confirm a matching CSV exists for each array_name.
    tiledb.cloud.udf.exec(
        "s3://{}/inputs/{}.csv".format(bucket, array_name),
        "tiledb://{}/s3://{}/{}.tdb".format(namespace, bucket, array_name),
        key,
        secret,
        mode="append",
        row_start_idx=number_of_rows,
        name=udf_uri,  # unittest/test_ingest_csv --> TileDB-Inc/ingest_csv
    )
    time.sleep(10)
    with tiledb.DenseArray(
        "tiledb://{}/{}.tdb".format(namespace, array_name), "r", ctx=tiledb.Ctx(config)
    ) as A:
        data = pd.DataFrame(A[:])
        # After the append each attribute holds two copies of row*10+col.
        for col, attribute in enumerate(("a", "b", "c"), 1):
            assert_array_equal(
                data[attribute],
                np.array([row * 10 + col for row in range(1, 21)] * 2),
            )
def read_array_s3(rest_adress, array_uri, token):
    """Open a sparse array via the TileDB REST server and print its "title" attribute."""
    # NOTE(review): the parameter name "rest_adress" keeps its original
    # (misspelled) form for caller compatibility.
    settings = {
        "rest.token": token,
        "rest.server_address": rest_adress,
        "vfs.s3.region": "eu-central-1",
    }
    ctx = tiledb.Ctx(tiledb.Config(settings))
    with tiledb.SparseArray(array_uri, ctx=ctx) as A:
        print(A[:]["title"])
def main():
    # CLI: copy a CXG directory and re-save its X matrix as a sparse array
    # when the non-zero fraction is below the threshold; abort otherwise.
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="input cxg directory")
    parser.add_argument("output", help="output cxg directory")
    parser.add_argument("--overwrite", action="store_true", help="replace output cxg directory")
    parser.add_argument("--verbose", "-v", action="count", default=0, help="verbose output")
    parser.add_argument(
        "--sparse-threshold",
        "-s",
        type=float,
        default=5.0,  # default is 5% non-zero values
        help="The X array will be sparse if the percent of non-zeros falls below this value",
    )
    args = parser.parse_args()

    if os.path.exists(args.output):
        print("output dir exists:", args.output)
        if args.overwrite:
            print("output dir removed:", args.output)
            shutil.rmtree(args.output)
        else:
            print("use the overwrite option to remove the output directory")
            sys.exit(1)

    if not os.path.isdir(args.input):
        print("input is not a directory", args.input)
        sys.exit(1)

    # Copy everything except the X arrays; X is rewritten below.
    shutil.copytree(args.input, args.output, ignore=shutil.ignore_patterns("X", "X_col_shift"))

    # Context tuned for bulk rewrites (32 threads, 1 GiB consolidation buffer).
    ctx = tiledb.Ctx({
        "sm.num_reader_threads": 32,
        "sm.num_writer_threads": 32,
        "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
    })

    with tiledb.DenseArray(os.path.join(args.input, "X"), mode="r", ctx=ctx) as X_in:
        is_sparse = cxgtool.save_X(args.output, X_in, ctx, args.sparse_threshold, expect_sparse=True)
    if is_sparse is False:
        # The matrix did not qualify for sparse storage; remove partial output.
        print("The array is not sparse, cleaning up, abort.")
        shutil.rmtree(args.output)
        sys.exit(1)
def testFromTileDB(self):
    # fromtiledb() must build a lazy tensor whose chunks mirror the TileDB
    # tile extents, for both dense and sparse arrays.
    ctx = tiledb.Ctx()
    for sparse in (True, False):
        dom = tiledb.Domain(
            tiledb.Dim(ctx=ctx, name="i", domain=(1, 30), tile=7, dtype=np.int32),
            tiledb.Dim(ctx=ctx, name="j", domain=(1, 20), tile=3, dtype=np.int32),
            tiledb.Dim(ctx=ctx, name="k", domain=(1, 10), tile=4, dtype=np.int32),
            ctx=ctx,
        )
        schema = tiledb.ArraySchema(ctx=ctx, domain=dom, sparse=sparse,
                                    attrs=[tiledb.Attr(ctx=ctx, name='a', dtype=np.float32)])
        tempdir = tempfile.mkdtemp()
        try:
            # create tiledb array
            array_type = tiledb.DenseArray if not sparse else tiledb.SparseArray
            array_type.create(tempdir, schema)

            tensor = fromtiledb(tempdir)
            self.assertIsInstance(tensor.op, TensorTileDBDataSource)
            self.assertEqual(tensor.op.issparse(), sparse)
            self.assertEqual(tensor.shape, (30, 20, 10))
            # chunk size comes from the dim tile extents (7, 3, 4)
            self.assertEqual(tensor.extra_params.raw_chunk_size, (7, 3, 4))
            self.assertIsNone(tensor.op.tiledb_config)
            self.assertEqual(tensor.op.tiledb_uri, tempdir)
            self.assertIsNone(tensor.op.tiledb_key)
            self.assertIsNone(tensor.op.tiledb_timestamp)

            tensor = tensor.tiles()

            # 5 * 7 * 3 chunks along the three axes
            self.assertEqual(len(tensor.chunks), 105)
            self.assertIsInstance(tensor.chunks[0].op, TensorTileDBDataSource)
            self.assertEqual(tensor.chunks[0].op.issparse(), sparse)
            self.assertEqual(tensor.chunks[0].shape, (7, 3, 4))
            self.assertIsNone(tensor.chunks[0].op.tiledb_config)
            self.assertEqual(tensor.chunks[0].op.tiledb_uri, tempdir)
            self.assertIsNone(tensor.chunks[0].op.tiledb_key)
            self.assertIsNone(tensor.chunks[0].op.tiledb_timestamp)
            # the dim domains start at 1, not 0
            self.assertEqual(tensor.chunks[0].op.tiledb_dim_starts, (1, 1, 1))

            # test axis_offsets of chunk op
            self.assertEqual(tensor.chunks[0].op.axis_offsets, (0, 0, 0))
            self.assertEqual(tensor.chunks[1].op.axis_offsets, (0, 0, 4))
            self.assertEqual(tensor.cix[0, 2, 2].op.axis_offsets, (0, 6, 8))
            self.assertEqual(tensor.cix[0, 6, 2].op.axis_offsets, (0, 18, 8))
            self.assertEqual(tensor.cix[4, 6, 2].op.axis_offsets, (28, 18, 8))

            # an explicit ctx must propagate its config dict onto the ops
            tensor2 = fromtiledb(tempdir, ctx=ctx)
            self.assertEqual(tensor2.op.tiledb_config,
                             ctx.config().dict())
            tensor2 = tensor2.tiles()
            self.assertEqual(tensor2.chunks[0].op.tiledb_config,
                             ctx.config().dict())
        finally:
            shutil.rmtree(tempdir)