def main():
    """Create the TileDB group "mygroup" twice and report any TileDB error.

    Any ``tiledb.TileDBError`` raised by either call is caught and printed
    rather than propagated (presumably the second call fails because the
    group already exists).
    """
    ctx = tiledb.Ctx()
    try:
        # Two back-to-back attempts on the same group name.
        for _ in range(2):
            tiledb.group_create(ctx, "mygroup")
    except tiledb.TileDBError as err:
        print("TileDB exception: {!r}".format(err))
def write_anndata_embeddings_to_cxg(self, output_cxg_directory, ctx):
    """Write every valid embedding in ``self.anndata.obsm`` into the CXG.

    A TileDB group ``<output_cxg_directory>/emb`` is created, and each valid
    embedding ``X_<name>`` is written to ``<output_cxg_directory>/emb/<name>``
    via ``convert_ndarray_to_cxg_dense_array``. Invalid entries are skipped
    silently.

    :param output_cxg_directory: URI/path of the CXG group being written.
    :param ctx: TileDB context passed to the group and array writers.
    """

    def is_valid_embedding(adata, embedding_name, embedding_array):
        """
        Returns true if this layout data is a valid array for front-end
        presentation with the following criteria:
            * ndarray, with shape (n_obs, >= 2), dtype float/int/uint
            * follows ScanPy embedding naming conventions ("X_" prefix
              followed by at least one character)
            * contains no +Inf or -Inf, and is not entirely NaN
        """
        name_ok = (
            isinstance(embedding_name, str)
            and embedding_name.startswith("X_")
            and len(embedding_name) > 2
        )
        if not name_ok:
            return False

        if not isinstance(embedding_array, np.ndarray):
            return False
        if embedding_array.dtype.kind not in "fiu":
            return False

        # Reject 0-D / 1-D arrays up front: they have no shape[1], and
        # indexing it would raise IndexError instead of reporting "invalid".
        if embedding_array.ndim < 2:
            return False
        if embedding_array.shape[0] != adata.n_obs or embedding_array.shape[1] < 2:
            return False

        has_inf = np.any(np.isinf(embedding_array))
        all_nan = np.all(np.isnan(embedding_array))
        return not has_inf and not all_nan

    embedding_container = f"{output_cxg_directory}/emb"
    tiledb.group_create(embedding_container, ctx=ctx)

    for embedding_name, embedding_values in self.anndata.obsm.items():
        if not is_valid_embedding(self.anndata, embedding_name, embedding_values):
            continue
        # Drop the "X_" prefix to form the array name inside the emb group.
        embedding_uri = f"{embedding_container}/{embedding_name[2:]}"
        convert_ndarray_to_cxg_dense_array(embedding_uri, embedding_values, ctx)
        logging.info(f"\t\t...{embedding_uri} embedding created")
def write_cxg(
    adata, container, title, var_names=None, obs_names=None, about=None, extract_colors=False, sparse_threshold=5.0
):
    """Write ``adata`` into a new TileDB group at ``container``.

    Steps, in order: sanitize dataframe column names, create the group, save
    dataset metadata (optionally including category colors), save the ``var``
    and ``obs`` dataframes, create the ``emb`` sub-group and save embeddings,
    then save the X matrix.

    :param adata: AnnData-like object providing ``var``, ``obs`` and ``X``.
    :param container: URI/path of the TileDB group to create.
    :param title: dataset title stored in the ``cxg_properties`` metadata.
    :param var_names: forwarded to ``save_dataframe`` for the var dataframe.
    :param obs_names: forwarded to ``save_dataframe`` for the obs dataframe.
    :param about: "about" value stored in the ``cxg_properties`` metadata.
    :param extract_colors: when true, try to add category colors to metadata.
    :param sparse_threshold: forwarded to ``save_X``.
    :raises ValueError: if the var or obs index contains duplicates.
    """
    var_index_unique = adata.var.index.is_unique
    if not var_index_unique:
        raise ValueError("Variable index is not unique - unable to convert.")
    obs_index_unique = adata.obs.index.is_unique
    if not obs_index_unique:
        raise ValueError("Observation index is not unique - unable to convert.")

    # TileDB bug TileDB-Inc/TileDB#1575 requires that we sanitize all column
    # names prior to saving. This can be reverted when the bug is fixed.
    log(0, "Warning: sanitizing all dataframe column names.")
    clean_all_column_names(adata)

    tiledb_settings = {
        "sm.num_reader_threads": 32,
        "sm.num_writer_threads": 32,
        "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
    }
    ctx = tiledb.Ctx(tiledb_settings)

    tiledb.group_create(container, ctx=ctx)
    log(1, f"\t...group created, with name {container}")

    # dataset metadata
    metadata_dict = {
        "cxg_version": CXG_VERSION,
        "cxg_properties": json.dumps({"title": title, "about": about}),
    }
    if extract_colors:
        try:
            category_colors = convert_anndata_category_colors_to_cxg_category_colors(adata)
            metadata_dict["cxg_category_colors"] = json.dumps(category_colors)
        except ColorFormatException:
            # Color extraction is best-effort: warn and carry on without colors.
            log(
                0,
                "Warning: failed to extract colors from h5ad file! "
                "Fix the h5ad file or rerun with --disable-custom-colors. See help for details.",
            )
    save_metadata(container, metadata_dict)
    log(1, "\t...dataset metadata saved")

    # var/gene dataframe
    save_dataframe(container, "var", adata.var, var_names, ctx=ctx)
    log(1, "\t...var dataframe created")

    # obs/cell dataframe
    save_dataframe(container, "obs", adata.obs, obs_names, ctx=ctx)
    log(1, "\t...obs dataframe created")

    # embeddings live in their own sub-group
    e_container = f"{container}/emb"
    tiledb.group_create(e_container, ctx=ctx)
    save_embeddings(e_container, adata, ctx)
    log(1, "\t...embeddings created")

    # X matrix
    save_X(container, adata.X, ctx, sparse_threshold)
    log(1, "\t...X created")
def handle_group(self, group_name, group):
    """Mirror one source group as a TileDB group directory under ``self.root``.

    Creates the directory (failing if it already exists), registers it as a
    TileDB group, and dumps the group's attributes to ``attrs.json`` inside it.

    :param group_name: name of the group; one leading '/' is stripped.
    :param group: group object exposing an ``attrs`` mapping.
    """
    print(f"group {group} type {type(group)}")
    # TODO group attrs?
    print("group name", group_name)

    root_dir = self.root
    if group_name[0] == '/':
        relative_name = group_name[1:]
    else:
        relative_name = group_name
    path = os.path.join(root_dir, relative_name)

    # exist_ok=False: an already-present directory is treated as an error.
    os.makedirs(path, exist_ok=False)
    tiledb.group_create(path)
    print(f"made_group {group_name} at {path}")

    attrs_path = os.path.join(path, 'attrs.json')
    with open(attrs_path, 'w') as fp:
        attrs_snapshot = dict(group.attrs.items())
        fallback_encoder = HDF5AttrsEncoder(self.file).default
        json.dump(attrs_snapshot, fp, default=fallback_encoder)
    print(f'wrote group attrs for {group_name}')
def to_cxg(self, output_cxg_directory, sparse_threshold, convert_anndata_colors_to_cxg_colors=True):
    """
    Write this dataset out as a CXG rooted at `output_cxg_directory`.

    The pieces are written in this order:
        1) the TileDB group itself,
        2) the dataset metadata (from `generate_cxg_metadata`),
        3) the obs DataFrame,
        4) the var DataFrame,
        5) all valid embeddings stored in obsm,
        6) the main X matrix, stored sparse or dense based on the
           `sparse_threshold` (optionally with a column shift that may turn
           an otherwise dense matrix into a sparse one).
    """
    logging.info("Beginning writing to CXG.")

    tiledb_settings = {
        "sm.num_reader_threads": 32,
        "sm.num_writer_threads": 32,
        "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
    }
    ctx = tiledb.Ctx(tiledb_settings)

    tiledb.group_create(output_cxg_directory, ctx=ctx)
    logging.info(f"\t...group created, with name {output_cxg_directory}")

    cxg_metadata = self.generate_cxg_metadata(convert_anndata_colors_to_cxg_colors)
    convert_dictionary_to_cxg_group(output_cxg_directory, cxg_metadata)
    logging.info("\t...dataset metadata saved")

    convert_dataframe_to_cxg_array(
        output_cxg_directory, "obs", self.obs, self.obs_index_column_name, ctx
    )
    logging.info("\t...dataset obs dataframe saved")

    convert_dataframe_to_cxg_array(
        output_cxg_directory, "var", self.var, self.var_index_column_name, ctx
    )
    logging.info("\t...dataset var dataframe saved")

    self.write_anndata_embeddings_to_cxg(output_cxg_directory, ctx)
    logging.info("\t...dataset embeddings saved")

    self.write_anndata_x_matrix_to_cxg(output_cxg_directory, ctx, sparse_threshold)
    logging.info("\t...dataset X matrix saved")

    logging.info("Completed writing to CXG.")
def to_tiledb(self, uri: Union[str, PurePath]) -> None:
    """Write the trace headers and trace data as TileDB arrays under ``uri``.

    Each piece is created only if it is missing: the group at ``uri``, the
    dense ``headers`` array and the dense ``data`` array. Anything that
    already exists with the expected object type is left untouched.

    :param uri: target group location; non-``PurePath`` values are wrapped
        in ``URL``.
    """
    if not isinstance(uri, PurePath):
        uri = URL(uri)

    group_uri = str(uri)
    if tiledb.object_type(group_uri) != "group":
        tiledb.group_create(group_uri)

    # --- headers array: one attribute per trace field ---
    headers_uri = str(uri / "headers")
    if tiledb.object_type(headers_uri) != "array":
        header_dims = self._get_dims(TRACE_FIELDS_SIZE)
        header_domain = tiledb.Domain(*header_dims)
        header_attrs = [
            tiledb.Attr(field.name, field.dtype, filters=TRACE_FIELD_FILTERS)
            for field in TRACE_FIELDS
        ]
        header_schema = tiledb.ArraySchema(
            domain=header_domain, sparse=False, attrs=header_attrs
        )
        with self._tiledb_array(headers_uri, header_schema) as tdb:
            self._fill_headers(tdb)

    # --- data array: trace dimensions plus a trailing "samples" dimension ---
    data_uri = str(uri / "data")
    if tiledb.object_type(data_uri) != "array":
        samples = len(self.segy_file.samples)
        sample_dtype = self.segy_file.dtype
        sample_size = sample_dtype.itemsize

        data_dims = list(self._get_dims(sample_size * samples))
        # Tile extent is derived from tile_size / sample_size, clipped to
        # the range [1, samples].
        samples_tile = np.clip(self.tile_size // sample_size, 1, samples)
        samples_dim = tiledb.Dim(
            name="samples",
            domain=(0, samples - 1),
            dtype=data_dims[0].dtype,
            tile=samples_tile,
        )
        data_dims.append(samples_dim)

        data_domain = tiledb.Domain(*data_dims)
        trace_attr = tiledb.Attr("trace", sample_dtype, filters=(tiledb.LZ4Filter(), ))
        data_schema = tiledb.ArraySchema(
            domain=data_domain, sparse=False, attrs=[trace_attr]
        )
        with self._tiledb_array(data_uri, data_schema) as tdb:
            self._fill_data(tdb)
def setUp(self):
    """Create a nested hierarchy of four TileDB groups for the tests.

    Layout: ``group1`` contains ``group2`` and ``group3``; ``group3``
    contains ``group4``. The resolved paths are stored on ``self``.
    """
    super().setUp()
    ctx = tiledb.Ctx()

    self.group1 = self.path("group1")
    self.group2 = self.path("group1/group2")
    self.group3 = self.path("group1/group3")
    self.group4 = self.path("group1/group3/group4")

    # Parents are created before their children.
    for group_uri in (self.group1, self.group2, self.group3, self.group4):
        tiledb.group_create(ctx, group_uri)
def ingest_single_threaded(args):
    """Ingest every dataset/chromosome pair into TileDB arrays, sequentially.

    For each row of the TileDB metadata table and each chromosome in the
    chrom-sizes table, the array ``<tiledb_group>/<dataset>.<chrom>`` is
    created if missing (or flagged for update if it already exists) and then
    filled by ``process_chrom``.

    :param args: an argument object, or a dict that is converted with
        ``args_object_from_args_dict``. Attributes read here: ``overwrite``,
        ``tile_size``, ``attribute_config``, ``tiledb_metadata``,
        ``chrom_sizes`` and ``tiledb_group``.
    :raises Exception: if a target array already exists and ``overwrite``
        is False.
    """
    if isinstance(args, dict):
        args = args_object_from_args_dict(args)
    overwrite = args.overwrite
    tile_size = args.tile_size
    attribute_config = args.attribute_config

    attribute_info = get_attribute_info(args.attribute_config)
    tiledb_metadata = pd.read_csv(args.tiledb_metadata, header=0, sep='\t')
    print("loaded tiledb metadata")
    chrom_sizes = pd.read_csv(args.chrom_sizes, header=None, sep='\t')
    print("loaded chrom sizes")

    # Check if the tiledb_group exists, and if not, create it.
    # Strings must be compared with != : `is not 'group'` tests object
    # identity, which is not guaranteed to match for equal strings.
    if tiledb.object_type(args.tiledb_group) != 'group':
        tiledb.group_create(args.tiledb_group)
        print("created tiledb group")
    else:
        print("tiledb group already exists")

    for _, task_row in tiledb_metadata.iterrows():
        dataset = task_row['dataset']
        # read in filenames for bigwigs
        data_dict = open_data_for_parsing(task_row, attribute_info)
        array_outf_prefix = "/".join([args.tiledb_group, dataset])

        for _, chrom_row in chrom_sizes.iterrows():
            chrom = chrom_row[0]
            size = chrom_row[1]
            array_out_name = '.'.join([array_outf_prefix, chrom])

            # Decide per array whether we are updating: the flag must not
            # leak from a previously seen existing array to a new one.
            updating = tiledb.object_type(array_out_name) == "array"
            if updating:
                # `== False` kept as-is so that only an explicit False/0
                # blocks overwriting, exactly as before.
                if overwrite == False:  # noqa: E712
                    raise Exception(
                        "array:" + str(array_out_name)
                        + " already exists; use the --overwrite flag to overwrite it. Exiting"
                    )
                print(
                    "warning: the array: " + str(array_out_name)
                    + " already exists. You provided the --overwrite flag, so it will be updated/overwritten"
                )
            else:
                # create the array:
                create_new_array(
                    size=size,
                    attribute_config=attribute_config,
                    array_out_name=array_out_name,
                    tile_size=tile_size,
                )
                print("created new array:" + str(array_out_name))

            print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
            process_chrom(data_dict, attribute_info, chrom, size, array_out_name, updating, args)
            print("wrote chrom array for task:" + str(dataset))
def inject_config(self, env_config: ConfigEnvironment) -> None:
    """Apply the environment config and prepare this scenario's TileDB storage.

    After delegating to the parent implementation, the TileDB group and
    stats-array URIs are pointed at ``<temp_dir>/def_tdb_arrays`` when a
    temp dir is available, the group is (re)created if it is missing or is
    an array, the arrays are cleared, and the result folder root is stored.

    :param env_config: environment configuration for the experiment.
    """
    super(DefaultScenario, self).inject_config(env_config)

    # Initialize TileDB storage needed for the scenario specific data
    exp_tmp_dir = self._env_config.temp_dir(experiment_id=self.experiment_id)
    # NOTE(review): self._env_config is dereferenced above before the
    # `is not None` check below, so that check cannot protect the call.
    if self._env_config is not None and exp_tmp_dir is not None:
        arrays_root = pathlib.Path(exp_tmp_dir).resolve().joinpath('def_tdb_arrays')
        self.__tiledb_group_name = arrays_root.as_uri()
        self.__tiledb_stats_array = arrays_root.joinpath('stats').as_uri()

    # Create the tileDB group of arrays used by this scenario
    existing_kind = tiledb.object_type(self.__tiledb_group_name)
    must_create_group = existing_kind is None  # Group does not exist
    if not must_create_group and existing_kind == 'array':
        # Something exists at that URI, but it is an array: replace it.
        tiledb.remove(self.__tiledb_group_name)
        must_create_group = True
    if must_create_group:
        tiledb.group_create(self.__tiledb_group_name)

    self._clear_arrays()
    self._mdde_result_folder_root = env_config.result_dir(self, get_root=True)
def create_domains(self, data_array_name='data', domains_mapping=None):
    """
    Build one TileDB group ("domain") per entry of the domains mapping.

    Each domain corresponds to a unique shape / dimensions combination in the
    input Data Model and ends up holding:
      * one array per dimension-describing coordinate of that combination, and
      * one multi-attr array whose attrs are the data variables described by
        that combination of dimensions.

    `domains_mapping` defaults to `self.domains_mapping` when not supplied.

    """
    self._make_shape_domains()
    mapping = self.domains_mapping if domains_mapping is None else domains_mapping

    for domain_name, domain_var_names in mapping.items():
        domain_coord_names = domain_name.split(self.domain_separator)

        # Create group.
        group_dirname = self.array_path.construct_path(domain_name, '')
        # XXX This might be failing because the TileDB root dir doesn't exist...
        # For a POSIX path we must explicitly create the group directory.
        on_local_filesystem = self.array_filepath is not None
        if on_local_filesystem:
            # TODO why is this necessary? Shouldn't tiledb create if this dir does not exist?
            self._create_tdb_directory(group_dirname)
        tiledb.group_create(group_dirname, ctx=self.ctx)

        # Coordinate arrays first: create them, then fill them.
        self.create_domain_arrays(domain_coord_names, domain_name, coords=True)
        self.populate_domain_arrays(domain_coord_names, domain_name)

        # Then the multi-attr array holding this domain's data variables.
        self.create_multiattr_array(
            domain_var_names, domain_coord_names, domain_name, data_array_name
        )
        self.populate_multiattr_array(data_array_name, domain_var_names, domain_name)
def ingest(args):
    """Ingest every dataset into TileDB arrays, one worker pool per dataset.

    For each row of the TileDB metadata table, the per-chromosome arrays
    ``<tiledb_group>/<dataset>.<chrom>`` are created if missing (or flagged
    for update if they already exist), then ``process_chrom`` is mapped over
    all chromosomes in a ``Pool`` of ``chrom_threads`` workers.

    On any failure (including Ctrl-C) the pool, if one was started, is
    terminated, remaining child processes are killed, and the original
    exception is re-raised.

    :param args: an argument object, or a dict that is converted with
        ``args_object_from_args_dict``. Attributes read here: ``overwrite``,
        ``chrom_threads``, ``tile_size``, ``attribute_config``,
        ``tiledb_metadata``, ``chrom_sizes`` and ``tiledb_group``.
    :raises Exception: if a target array already exists and ``overwrite``
        is False.
    """
    # Bound up front so the except handlers can tell whether a pool was
    # ever started; otherwise an early failure would surface as NameError.
    pool = None
    try:
        if isinstance(args, dict):
            args = args_object_from_args_dict(args)
        overwrite = args.overwrite
        chrom_threads = args.chrom_threads
        tile_size = args.tile_size
        attribute_config = args.attribute_config

        attribute_info = get_attribute_info(args.attribute_config)
        tiledb_metadata = pd.read_csv(args.tiledb_metadata, header=0, sep='\t')
        print("loaded tiledb metadata")
        chrom_sizes = pd.read_csv(args.chrom_sizes, header=None, sep='\t')
        print("loaded chrom sizes")

        # Check if the tiledb_group exists, and if not, create it.
        # Strings must be compared with != : `is not 'group'` tests object
        # identity, which is not guaranteed to match for equal strings.
        if tiledb.object_type(args.tiledb_group) != 'group':
            tiledb.group_create(args.tiledb_group)
            print("created tiledb group")
        else:
            print("tiledb group already exists")

        for _, task_row in tiledb_metadata.iterrows():
            dataset = task_row['dataset']
            # read in filenames for bigwigs
            data_dict = open_data_for_parsing(task_row, attribute_info)
            array_outf_prefix = "/".join([args.tiledb_group, dataset])

            pool_inputs = []
            for _, chrom_row in chrom_sizes.iterrows():
                chrom = chrom_row[0]
                size = chrom_row[1]
                array_out_name = '.'.join([array_outf_prefix, chrom])

                # Decide per array whether we are updating: the flag must
                # not leak from an earlier existing array to a new one.
                updating = tiledb.object_type(array_out_name) == "array"
                if updating:
                    # `== False` kept as-is so that only an explicit
                    # False/0 blocks overwriting, exactly as before.
                    if overwrite == False:  # noqa: E712
                        raise Exception(
                            "array:" + str(array_out_name)
                            + " already exists; use the --overwrite flag to overwrite it. Exiting"
                        )
                    print(
                        "warning: the array: " + str(array_out_name)
                        + " already exists. You provided the --overwrite flag, so it will be updated/overwritten"
                    )
                else:
                    # create the array:
                    create_new_array(
                        size=size,
                        attribute_config=attribute_config,
                        array_out_name=array_out_name,
                        tile_size=tile_size,
                    )
                    print("created new array:" + str(array_out_name))

                pool_inputs.append(
                    (data_dict, attribute_info, chrom, size, array_out_name, updating, args)
                )

            with Pool(chrom_threads, initializer=init_worker) as pool:
                print("made pool")
                pool.map(process_chrom, pool_inputs)
                pool.close()
                pool.join()

            print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
            print("wrote chrom array for task:" + str(dataset))
    except KeyboardInterrupt:
        print('detected keyboard interrupt')
        # shutdown the pool (if one was started)
        if pool is not None:
            pool.terminate()
        # Kill remaining child processes
        kill_child_processes(os.getpid())
        raise
    except Exception as e:
        print(repr(e))
        # shutdown the pool (if one was started)
        if pool is not None:
            pool.terminate()
        # Kill remaining child processes
        kill_child_processes(os.getpid())
        # Bare raise keeps the original traceback intact.
        raise
def main():
    """Create a parent TileDB group with two nested child groups.

    Creates ``my_group`` first, then ``my_group/dense_arrays`` and
    ``my_group/sparse_arrays`` inside it.
    """
    ctx = tiledb.Ctx()
    group_uris = (
        "my_group",
        "my_group/dense_arrays",
        "my_group/sparse_arrays",
    )
    # The parent is listed first so it exists before its children.
    for group_uri in group_uris:
        tiledb.group_create(ctx, group_uri)