コード例 #1
0
ファイル: tiledb_error.py プロジェクト: umvarma/TileDB-Py
def main():
    """Demonstrate TileDB error handling by creating the same group twice.

    Both creation attempts run inside one ``try`` block; any
    ``tiledb.TileDBError`` raised is caught and its ``repr`` is printed.
    """
    ctx = tiledb.Ctx()
    try:
        # Two identical attempts: the loop stops at the first one that raises.
        for _attempt in range(2):
            tiledb.group_create(ctx, "mygroup")
    except tiledb.TileDBError as err:
        print("TileDB exception: {!r}".format(err))
コード例 #2
0
    def write_anndata_embeddings_to_cxg(self, output_cxg_directory, ctx):
        """
        Write every valid embedding in `self.anndata.obsm` into an `emb` TileDB group.

        The group `{output_cxg_directory}/emb` is created first. Each valid embedding named `X_<name>` is then
        written to `{output_cxg_directory}/emb/<name>` through `convert_ndarray_to_cxg_dense_array`. Entries
        that fail validation are skipped without error.
        """

        def is_valid_embedding(adata, embedding_name, embedding_array):
            """
            Returns true if this layout data is a valid array for front-end presentation with the following criteria:
                * the name is a str that starts with "X_" and has at least one more character
                  (ScanPy embedding naming conventions)
                * ndarray with at least two dimensions, shape (n_obs, >= 2, ...), dtype float/int/uint
                * no +Inf or -Inf values, and not made up entirely of NaN
            """
            if not isinstance(embedding_name, str):
                return False
            if not embedding_name.startswith("X_") or len(embedding_name) <= 2:
                return False

            if not isinstance(embedding_array, np.ndarray):
                return False
            if embedding_array.dtype.kind not in "fiu":
                return False

            # Check the rank before touching shape[0] / shape[1]: a 0-D or 1-D
            # array would otherwise raise IndexError instead of being rejected.
            if embedding_array.ndim < 2:
                return False
            if embedding_array.shape[0] != adata.n_obs or embedding_array.shape[1] < 2:
                return False

            has_inf = np.any(np.isinf(embedding_array))
            all_nan = np.all(np.isnan(embedding_array))
            return not has_inf and not all_nan

        embedding_container = f"{output_cxg_directory}/emb"
        tiledb.group_create(embedding_container, ctx=ctx)

        for embedding_name, embedding_values in self.anndata.obsm.items():
            if not is_valid_embedding(self.anndata, embedding_name,
                                      embedding_values):
                continue
            # Drop the "X_" prefix; from here on the name holds the array URI.
            embedding_name = f"{embedding_container}/{embedding_name[2:]}"
            convert_ndarray_to_cxg_dense_array(embedding_name,
                                               embedding_values, ctx)
            logging.info(f"\t\t...{embedding_name} embedding created")
コード例 #3
0
ファイル: cxgtool.py プロジェクト: saeedseyyedi/cellxgene
def write_cxg(
    adata, container, title, var_names=None, obs_names=None, about=None, extract_colors=False, sparse_threshold=5.0
):
    """
    Write `adata` into a new TileDB group at `container`.

    Steps, in order: check that the var and obs indexes are unique, sanitize
    dataframe column names, create the top-level group, save dataset metadata
    (optionally with category colors), save the var and obs dataframes, create
    the `emb` sub-group and save embeddings, and finally save the X matrix.

    Raises:
        ValueError: if the var or obs index contains duplicates.
    """
    if not adata.var.index.is_unique:
        raise ValueError("Variable index is not unique - unable to convert.")
    if not adata.obs.index.is_unique:
        raise ValueError("Observation index is not unique - unable to convert.")

    # TileDB bug TileDB-Inc/TileDB#1575 requires that we sanitize all column names
    # prior to saving.  This can be reverted when the bug is fixed.
    log(0, "Warning: sanitizing all dataframe column names.")
    clean_all_column_names(adata)

    tiledb_config = {
        "sm.num_reader_threads": 32,
        "sm.num_writer_threads": 32,
        "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
    }
    ctx = tiledb.Ctx(tiledb_config)

    tiledb.group_create(container, ctx=ctx)
    log(1, f"\t...group created, with name {container}")

    # dataset metadata
    metadata = {
        "cxg_version": CXG_VERSION,
        "cxg_properties": json.dumps({"title": title, "about": about}),
    }
    if extract_colors:
        try:
            category_colors = convert_anndata_category_colors_to_cxg_category_colors(adata)
            metadata["cxg_category_colors"] = json.dumps(category_colors)
        except ColorFormatException:
            # Color extraction is best-effort: warn and carry on without colors.
            log(
                0,
                "Warning: failed to extract colors from h5ad file! "
                "Fix the h5ad file or rerun with --disable-custom-colors. See help for details.",
            )
    save_metadata(container, metadata)
    log(1, "\t...dataset metadata saved")

    # var/gene dataframe
    save_dataframe(container, "var", adata.var, var_names, ctx=ctx)
    log(1, "\t...var dataframe created")

    # obs/cell dataframe
    save_dataframe(container, "obs", adata.obs, obs_names, ctx=ctx)
    log(1, "\t...obs dataframe created")

    # embeddings live in their own sub-group
    embeddings_group = f"{container}/emb"
    tiledb.group_create(embeddings_group, ctx=ctx)
    save_embeddings(embeddings_group, adata, ctx)
    log(1, "\t...embeddings created")

    # X matrix
    save_X(container, adata.X, ctx, sparse_threshold)
    log(1, "\t...X created")
コード例 #4
0
 def handle_group(self, group_name, group):
     """Create a directory and TileDB group for `group`, and dump its attrs to JSON.

     The target path is `self.root` joined with `group_name`, with one leading
     '/' removed. The directory must not already exist (`exist_ok=False`).
     The group's attributes are written to `attrs.json` inside that directory.
     """
     print(f"group {group} type {type(group)}")
     # TODO group attrs?
     print("group name", group_name)

     root = self.root
     if group_name[0] == '/':
         relative_name = group_name[1:]
     else:
         relative_name = group_name
     path = os.path.join(root, relative_name)

     os.makedirs(path, exist_ok=False)
     tiledb.group_create(path)
     print(f"made_group {group_name} at {path}")

     attrs_path = os.path.join(path, 'attrs.json')
     with open(attrs_path, 'w') as fp:
         attrs = dict(group.attrs.items())
         # Values json cannot serialize natively go through the project encoder.
         fallback = HDF5AttrsEncoder(self.file).default
         json.dump(attrs, fp, default=fallback)
     print(f'wrote group attrs for {group_name}')
コード例 #5
0
    def to_cxg(self,
               output_cxg_directory,
               sparse_threshold,
               convert_anndata_colors_to_cxg_colors=True):
        """
        Write this dataset to a CXG TileDB group at `output_cxg_directory`.

        In order, this creates the top-level group, then saves the dataset metadata, the obs dataframe, the var
        dataframe, the embeddings, and the X matrix (passing `sparse_threshold` to the X writer). Progress is
        reported through `logging.info`.
        """

        logging.info("Beginning writing to CXG.")

        tiledb_config = {
            "sm.num_reader_threads": 32,
            "sm.num_writer_threads": 32,
            "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
        }
        ctx = tiledb.Ctx(tiledb_config)

        # Top-level group that holds everything written below.
        tiledb.group_create(output_cxg_directory, ctx=ctx)
        logging.info(f"\t...group created, with name {output_cxg_directory}")

        # Dataset-level metadata.
        cxg_metadata = self.generate_cxg_metadata(
            convert_anndata_colors_to_cxg_colors)
        convert_dictionary_to_cxg_group(output_cxg_directory, cxg_metadata)
        logging.info("\t...dataset metadata saved")

        # obs, then var, dataframes.
        convert_dataframe_to_cxg_array(
            output_cxg_directory,
            "obs",
            self.obs,
            self.obs_index_column_name,
            ctx,
        )
        logging.info("\t...dataset obs dataframe saved")

        convert_dataframe_to_cxg_array(
            output_cxg_directory,
            "var",
            self.var,
            self.var_index_column_name,
            ctx,
        )
        logging.info("\t...dataset var dataframe saved")

        # Embeddings and the main matrix.
        self.write_anndata_embeddings_to_cxg(output_cxg_directory, ctx)
        logging.info("\t...dataset embeddings saved")

        self.write_anndata_x_matrix_to_cxg(
            output_cxg_directory, ctx, sparse_threshold)
        logging.info("\t...dataset X matrix saved")

        logging.info("Completed writing to CXG.")
コード例 #6
0
ファイル: convert.py プロジェクト: TileDB-Inc/TileDB-Segy
    def to_tiledb(self, uri: Union[str, PurePath]) -> None:
        """Write this converter's headers and trace data under a TileDB group at `uri`.

        The group is created unless `uri` already is a group. Under it, a dense
        "headers" array (one attribute per entry in `TRACE_FIELDS`) and a dense
        "data" array (single "trace" attribute) are created and filled. Either
        array is skipped when an array already exists at its URI.
        """
        if not isinstance(uri, PurePath):
            uri = URL(uri)

        group_uri = str(uri)
        if tiledb.object_type(group_uri) != "group":
            tiledb.group_create(group_uri)

        # --- trace headers -------------------------------------------------
        headers_uri = str(uri / "headers")
        if tiledb.object_type(headers_uri) != "array":
            header_dims = self._get_dims(TRACE_FIELDS_SIZE)
            header_domain = tiledb.Domain(*header_dims)
            header_attrs = [
                tiledb.Attr(field.name, field.dtype, filters=TRACE_FIELD_FILTERS)
                for field in TRACE_FIELDS
            ]
            header_schema = tiledb.ArraySchema(
                domain=header_domain,
                sparse=False,
                attrs=header_attrs,
            )
            with self._tiledb_array(headers_uri, header_schema) as tdb:
                self._fill_headers(tdb)

        # --- trace samples -------------------------------------------------
        data_uri = str(uri / "data")
        if tiledb.object_type(data_uri) != "array":
            n_samples = len(self.segy_file.samples)
            sample_dtype = self.segy_file.dtype
            sample_size = sample_dtype.itemsize

            data_dims = list(self._get_dims(sample_size * n_samples))
            # Extra trailing dimension over the samples of each trace; it
            # reuses the dtype of the first dimension.
            samples_dim = tiledb.Dim(
                name="samples",
                domain=(0, n_samples - 1),
                dtype=data_dims[0].dtype,
                tile=np.clip(self.tile_size // sample_size, 1, n_samples),
            )
            data_dims.append(samples_dim)

            trace_attr = tiledb.Attr("trace",
                                     sample_dtype,
                                     filters=(tiledb.LZ4Filter(), ))
            data_schema = tiledb.ArraySchema(
                domain=tiledb.Domain(*data_dims),
                sparse=False,
                attrs=[trace_attr],
            )
            with self._tiledb_array(data_uri, data_schema) as tdb:
                self._fill_data(tdb)
コード例 #7
0
    def setUp(self):
        """Create a nested TileDB group hierarchy for the tests.

        Layout: group1 contains group2 and group3; group3 contains group4.
        The resolved paths are stored on `self.group1` .. `self.group4`.
        """
        super().setUp()

        ctx = tiledb.Ctx()
        self.group1 = self.path("group1")
        self.group2 = self.path("group1/group2")
        self.group3 = self.path("group1/group3")
        self.group4 = self.path("group1/group3/group4")

        # Parents are listed before their children.
        creation_order = (self.group1, self.group2, self.group3, self.group4)
        for group_path in creation_order:
            tiledb.group_create(ctx, group_path)
コード例 #8
0
ファイル: __init__.py プロジェクト: amtseng/seqdataloader
def ingest_single_threaded(args):
    """Ingest every (dataset, chromosome) pair into TileDB arrays sequentially.

    ``args`` may be a dict, which is converted with
    ``args_object_from_args_dict``, or an object exposing ``overwrite``,
    ``tile_size``, ``attribute_config``, ``tiledb_metadata``, ``chrom_sizes``
    and ``tiledb_group``.

    For each row of the metadata table and each row of the chrom-sizes table,
    the array ``<tiledb_group>/<dataset>.<chrom>`` is created if missing and
    then filled by ``process_chrom``.

    Raises:
        Exception: if a target array already exists and ``args.overwrite``
            is False.
    """
    if isinstance(args, dict):
        args = args_object_from_args_dict(args)

    overwrite = args.overwrite
    tile_size = args.tile_size
    attribute_config = args.attribute_config
    updating = False

    attribute_info = get_attribute_info(args.attribute_config)
    tiledb_metadata = pd.read_csv(args.tiledb_metadata, header=0, sep='\t')

    print("loaded tiledb metadata")
    chrom_sizes = pd.read_csv(args.chrom_sizes, header=None, sep='\t')
    print("loaded chrom sizes")

    # Check if the tiledb_group exists, and if not, create it.
    # Compare by value: `is not 'group'` tests object identity, which is not
    # a reliable way to compare strings.
    if tiledb.object_type(args.tiledb_group) != 'group':
        tiledb.group_create(args.tiledb_group)
        print("created tiledb group")
    else:
        print("tiledb group already exists")

    for task_index, task_row in tiledb_metadata.iterrows():
        dataset = task_row['dataset']
        # read in filenames for bigwigs
        data_dict = open_data_for_parsing(task_row, attribute_info)
        array_outf_prefix = "/".join([args.tiledb_group, dataset])

        for chrom_index, chrom_row in chrom_sizes.iterrows():
            chrom = chrom_row[0]
            size = chrom_row[1]
            array_out_name = '.'.join([array_outf_prefix, chrom])

            if tiledb.object_type(array_out_name) == "array":
                if overwrite == False:
                    raise Exception("array:" + str(array_out_name) + " already exists; use the --overwrite flag to overwrite it. Exiting")
                else:
                    print("warning: the array: " + str(array_out_name) + " already exists. You provided the --overwrite flag, so it will be updated/overwritten")
                    # NOTE(review): `updating` is never reset to False, so it
                    # stays True for every later chromosome/dataset, including
                    # freshly created arrays - confirm this is intended.
                    updating = True
            else:
                # create the array:
                create_new_array(size=size,
                                 attribute_config=attribute_config,
                                 array_out_name=array_out_name,
                                 tile_size=tile_size)

                print("created new array:" + str(array_out_name))
            print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
            process_chrom(data_dict, attribute_info, chrom, size, array_out_name, updating, args)
            print("wrote chrom array for task:" + str(dataset))
コード例 #9
0
ファイル: default_scenario.py プロジェクト: akharitonov/mdde
    def inject_config(self, env_config: ConfigEnvironment) -> None:
        """Inject the environment config and prepare this scenario's TileDB group.

        After delegating to the parent class, derive the group and stats-array
        URIs from the experiment temp dir (when one is available), make sure a
        TileDB group exists at the group URI (replacing an array found there),
        clear the scenario arrays and record the result folder root.
        """
        super(DefaultScenario, self).inject_config(env_config)

        # Initialize TileDB storage needed for the scenario specific data
        # NOTE(review): `self._env_config` is dereferenced here before the
        # `is not None` test below, so that test cannot guard this call -
        # confirm whether the check should come first.
        scratch_dir = self._env_config.temp_dir(
            experiment_id=self.experiment_id)
        have_scratch_dir = (self._env_config is not None
                            and scratch_dir is not None)
        if have_scratch_dir:
            arrays_root = pathlib.Path(scratch_dir).resolve() / 'def_tdb_arrays'
            self.__tiledb_group_name = arrays_root.as_uri()
            self.__tiledb_stats_array = (arrays_root / 'stats').as_uri()

        # Create the tileDB group of arrays used by this scenario
        group_uri = self.__tiledb_group_name
        existing_kind = tiledb.object_type(group_uri)
        if existing_kind == 'array':
            # An array occupies the URI: remove it and create a group instead.
            tiledb.remove(group_uri)
            tiledb.group_create(group_uri)
        elif existing_kind is None:
            # Nothing there yet.
            tiledb.group_create(group_uri)
        self._clear_arrays()

        self._mdde_result_folder_root = env_config.result_dir(self,
                                                              get_root=True)
コード例 #10
0
    def create_domains(self, data_array_name='data', domains_mapping=None):
        """
        Create one TileDB group per entry of the domains mapping.

        `domains_mapping` maps a domain name to the names of the data variables
        in that domain; when omitted, `self.domains_mapping` is used. The domain
        name is split on `self.domain_separator` to obtain its coordinate names.

        For each domain this creates the group, then creates and populates:
          * the arrays for the domain-describing coordinates, and
          * one multi-attr array named `data_array_name` holding the domain's
            data variables.

        """
        self._make_shape_domains()
        mapping = self.domains_mapping if domains_mapping is None else domains_mapping

        for domain_name, var_names in mapping.items():
            coord_names = domain_name.split(self.domain_separator)

            # Create group.
            group_path = self.array_path.construct_path(domain_name, '')
            # XXX This might be failing because the TileDB root dir doesn't exist...
            # For a POSIX path we must explicitly create the group directory.
            if self.array_filepath is not None:
                # TODO why is this necessary? Shouldn't tiledb create if this dir does not exist?
                self._create_tdb_directory(group_path)

            tiledb.group_create(group_path, ctx=self.ctx)

            # Create and write arrays for each domain-describing coordinate.
            self.create_domain_arrays(coord_names, domain_name, coords=True)
            self.populate_domain_arrays(coord_names, domain_name)

            # Get data vars in this domain and create and populate a multi-attr array.
            self.create_multiattr_array(
                var_names, coord_names, domain_name, data_array_name)
            self.populate_multiattr_array(
                data_array_name, var_names, domain_name)
コード例 #11
0
def ingest(args):
    """Ingest every dataset into TileDB arrays, one worker pool per dataset.

    ``args`` may be a dict, which is converted with
    ``args_object_from_args_dict``, or an object exposing ``overwrite``,
    ``chrom_threads``, ``batch_size``, ``tile_size``, ``attribute_config``,
    ``tiledb_metadata``, ``chrom_sizes`` and ``tiledb_group``.

    For each row of the metadata table, the array
    ``<tiledb_group>/<dataset>.<chrom>`` is created for every chromosome that
    lacks one, and ``process_chrom`` is then mapped over all chromosomes in a
    ``Pool`` of ``chrom_threads`` workers.

    On any exception the current pool (if one was created) is terminated,
    remaining child processes are killed, and the exception is re-raised.

    Raises:
        Exception: if a target array already exists and ``args.overwrite``
            is False.
    """
    # Bound up front so the handlers below can tell whether a pool exists.
    # Otherwise a failure before the first Pool() call would be masked by an
    # UnboundLocalError raised from `pool.terminate()`.
    pool = None
    try:
        if isinstance(args, dict):
            args = args_object_from_args_dict(args)
        overwrite = args.overwrite
        chrom_threads = args.chrom_threads
        batch_size = args.batch_size
        tile_size = args.tile_size
        attribute_config = args.attribute_config
        updating = False

        attribute_info = get_attribute_info(args.attribute_config)
        tiledb_metadata = pd.read_csv(args.tiledb_metadata, header=0, sep='\t')

        print("loaded tiledb metadata")
        chrom_sizes = pd.read_csv(args.chrom_sizes, header=None, sep='\t')
        print("loaded chrom sizes")

        # Check if the tiledb_group exists, and if not, create it.
        # Compare by value: `is not 'group'` tests object identity, which is
        # not a reliable way to compare strings.
        if tiledb.object_type(args.tiledb_group) != 'group':
            tiledb.group_create(args.tiledb_group)
            print("created tiledb group")
        else:
            print("tiledb group already exists")

        for task_index, task_row in tiledb_metadata.iterrows():
            dataset = task_row['dataset']
            # read in filenames for bigwigs
            data_dict = open_data_for_parsing(task_row, attribute_info)
            array_outf_prefix = "/".join([args.tiledb_group, dataset])
            pool_inputs = []
            for chrom_index, chrom_row in chrom_sizes.iterrows():
                chrom = chrom_row[0]
                size = chrom_row[1]
                array_out_name = '.'.join([array_outf_prefix, chrom])
                if tiledb.object_type(array_out_name) == "array":
                    if overwrite == False:
                        raise Exception(
                            "array:" + str(array_out_name) +
                            " already exists; use the --overwrite flag to overwrite it. Exiting"
                        )
                    else:
                        print(
                            "warning: the array: " + str(array_out_name) +
                            " already exists. You provided the --overwrite flag, so it will be updated/overwritten"
                        )
                        # NOTE(review): `updating` is never reset to False, so
                        # it stays True for all later chromosomes/datasets -
                        # confirm this is intended.
                        updating = True
                else:
                    # create the array:
                    create_new_array(size=size,
                                     attribute_config=attribute_config,
                                     array_out_name=array_out_name,
                                     tile_size=tile_size)
                    print("created new array:" + str(array_out_name))
                pool_inputs.append((data_dict, attribute_info, chrom, size,
                                    array_out_name, updating, args))
            with Pool(chrom_threads, initializer=init_worker) as pool:
                print("made pool")
                # The results are not used; map() is called to run the work
                # and to block until every chromosome has been processed.
                pool.map(process_chrom, pool_inputs)
            pool.close()
            pool.join()
            print('Gigs:', round(psutil.virtual_memory().used / (10**9), 2))
            print("wrote chrom array for task:" + str(dataset))
    except KeyboardInterrupt:
        print('detected keyboard interrupt')
        # shutdown the pool, if one was ever created
        if pool is not None:
            pool.terminate()
        # Kill remaining child processes
        kill_child_processes(os.getpid())
        raise
    except Exception as e:
        print(repr(e))
        # shutdown the pool, if one was ever created
        if pool is not None:
            pool.terminate()
        # Kill remaining child processes
        kill_child_processes(os.getpid())
        # Bare raise re-raises the active exception unchanged.
        raise
コード例 #12
0
def main():
    """Create a top-level TileDB group and two sub-groups inside it."""
    ctx = tiledb.Ctx()
    # The parent group is listed first so it exists before its children.
    group_uris = (
        "my_group",
        "my_group/dense_arrays",
        "my_group/sparse_arrays",
    )
    for group_uri in group_uris:
        tiledb.group_create(ctx, group_uri)