Exemple #1
0
    def execute(cls, ctx, op):
        """Consolidate the TileDB array referenced by *op*.

        Runs fragment consolidation against ``op.tiledb_uri`` using the
        operand's TileDB config and encryption key, then forwards the
        single input chunk to the output slot unchanged (consolidation
        itself produces no data result).
        """
        config = tiledb.Config(op.tiledb_config)
        tiledb.consolidate(config=config,
                           uri=op.tiledb_uri,
                           key=op.tiledb_key)
        # Pass the input data through untouched.
        ctx[op.outputs[0].key] = ctx[op.inputs[0].key]
Exemple #2
0
def convert_ndarray_to_cxg_dense_array(ndarray_name, ndarray, ctx):
    """
    Write *ndarray* to a new TileDB dense array located at *ndarray_name*.

    Typically used for dataset embeddings. Embeddings are read back in very
    large slices (often the whole array), so they gain little from heavy
    compression; a large tile size (1000) with default-level Zstd
    compression is used instead.
    """

    def _create_schema(name, data):
        # One uint32 dimension per ndarray axis, tiles capped at 1000 cells.
        dims = []
        for axis in range(data.ndim):
            extent = data.shape[axis]
            dims.append(tiledb.Dim(domain=(0, extent - 1),
                                   tile=min(extent, 1000),
                                   dtype=np.uint32))
        attr = tiledb.Attr(dtype=data.dtype,
                           filters=tiledb.FilterList([tiledb.ZstdFilter()]))
        schema = tiledb.ArraySchema(domain=tiledb.Domain(*dims),
                                    sparse=False,
                                    attrs=[attr],
                                    capacity=1_000_000,
                                    cell_order="row-major",
                                    tile_order="row-major")
        tiledb.DenseArray.create(name, schema)

    _create_schema(ndarray_name, ndarray)

    # Populate the freshly created array in one write, then consolidate the
    # resulting fragments for faster reads.
    with tiledb.DenseArray(ndarray_name, mode="w", ctx=ctx) as target:
        target[:] = ndarray

    tiledb.consolidate(ndarray_name, ctx=ctx)
Exemple #3
0
    def write_anndata_x_matrix_to_cxg(self, output_cxg_directory, ctx,
                                      sparse_threshold):
        """Convert the AnnData X matrix into a CXG TileDB array.

        Chooses between plain-sparse, column-shift-encoded, and dense
        output based on *sparse_threshold*, writes the result under
        ``<output_cxg_directory>/X``, then consolidates it (and vacuums,
        where the installed tiledb supports it).
        """
        matrix_container = f"{output_cxg_directory}/X"
        x_matrix_data = self.anndata.X

        col_shift = None
        is_sparse = is_matrix_sparse(x_matrix_data, sparse_threshold)
        if not is_sparse:
            # A dense matrix may still qualify as sparse once each column is
            # shifted by a per-column offset; a non-None result means it does.
            col_shift = get_column_shift_encode_for_matrix(
                x_matrix_data, sparse_threshold)
            is_sparse = col_shift is not None

        if col_shift is not None:
            logging.info(
                "Converting matrix X as sparse matrix with column shift encoding"
            )
            # Persist the shift values so readers can reconstruct X.
            convert_ndarray_to_cxg_dense_array(
                f"{output_cxg_directory}/X_col_shift", col_shift, ctx)

        convert_matrix_to_cxg_array(matrix_container, x_matrix_data, is_sparse,
                                    ctx, col_shift)

        tiledb.consolidate(matrix_container, ctx=ctx)
        if hasattr(tiledb, "vacuum"):
            tiledb.vacuum(matrix_container)
Exemple #4
0
 def _tiledb_array(self, uri: str,
                   schema: tiledb.ArraySchema) -> Iterator[tiledb.Array]:
     """Create an array at *uri*, yield it open for writing, then
     consolidate and vacuum it after the caller's block exits.

     NOTE(review): the ``yield`` makes this a generator; presumably it is
     wrapped by ``@contextlib.contextmanager`` at the (unseen) decoration
     site — confirm before calling it directly.
     """
     tiledb.Array.create(uri, schema)
     with tiledb.open(uri, mode="w") as tdb:
         yield tdb
     # Post-write maintenance using the instance's shared TileDB config:
     # merge fragments, then remove the now-consolidated originals.
     tiledb.consolidate(uri, config=self.config)
     tiledb.vacuum(uri, config=self.config)
def consolidate_fragments(
    uri,
    amplification,
    buffer_size,
    step_max_frags,
    step_min_frags,
    step_size_ratio,
    steps,
    vacuum,
):
    """
    Consolidate the fragments in an array located at uri.

    The parameters map one-to-one onto TileDB's ``sm.consolidation.*``
    settings; see the TileDB configuration reference for their exact
    semantics.  When *vacuum* is truthy, the consolidated fragments are
    also vacuumed (physically removed) afterwards.
    """
    config = tiledb.Config()
    config["sm.consolidation.mode"] = "fragments"
    config["sm.consolidation.amplification"] = amplification
    config["sm.consolidation.buffer_size"] = buffer_size
    config["sm.consolidation.step_max_frags"] = step_max_frags
    config["sm.consolidation.step_min_frags"] = step_min_frags
    config["sm.consolidation.step_size_ratio"] = step_size_ratio
    config["sm.consolidation.steps"] = steps
    ctx = tiledb.Ctx(config)

    tiledb.consolidate(uri, ctx=ctx)

    # BUG FIX: removed leftover debug prints (`print(vacuum)` and
    # `print("here?")`) that leaked diagnostic noise to stdout.
    if vacuum:
        # Vacuum takes its own config; only the mode setting applies here.
        vacuum_config = tiledb.Config({"sm.vacuum.mode": "fragments"})
        tiledb.vacuum(uri, ctx=tiledb.Ctx(vacuum_config))
Exemple #6
0
def save_embeddings(container, adata, ctx):
    """Write every valid obsm embedding to its own TileDB dense array.

    The first two characters of each obsm key (presumably an "X_" prefix
    — confirm against the data) are stripped to form the array name under
    *container*; each array is consolidated after its write.
    """
    for name, value in adata.obsm.items():
        if not is_valid_embedding(adata, name, value):
            continue
        e_name = f"{container}/{name[2:]}"
        create_emb(e_name, value)
        with tiledb.DenseArray(e_name, mode="w", ctx=ctx) as A:
            A[:] = value
        tiledb.consolidate(e_name, ctx=ctx)
        log(1, f"\t\t...{name} embedding created")
def consolidate_array_metadata(uri, vacuum):
    """
    Consolidate the array metadata in an array located at uri.

    If *vacuum* is truthy, the consolidated metadata files are also
    vacuumed (removed) afterwards.
    """
    consolidation_config = tiledb.Config()
    consolidation_config["sm.consolidation.mode"] = "array_meta"
    tiledb.consolidate(uri, ctx=tiledb.Ctx(consolidation_config))

    if vacuum:
        # Vacuum takes its own config; only the mode setting applies here.
        vacuum_config = tiledb.Config({"sm.vacuum.mode": "array_meta"})
        tiledb.vacuum(uri, ctx=tiledb.Ctx(vacuum_config))
Exemple #8
0
def convert_dataframe_to_cxg_array(cxg_container, dataframe_name, dataframe,
                                   index_column_name, ctx):
    """
    Saves the contents of the dataframe to the CXG output directory specified.

    Reads are oriented toward very large slices of one attribute at a time,
    and the column data tends to be highly repetitive (bools, categories,
    strings), so a large tile size (1000) is paired with very aggressive
    Zstd compression (level 22).
    """

    def _create_schema(name, df):
        # Attempt aggressive compression as many of these dataframes are very
        # repetitive strings, bools and other non-float data.
        filters = tiledb.FilterList([tiledb.ZstdFilter(level=22)])
        attrs = [
            tiledb.Attr(name=col,
                        dtype=get_dtype_of_array(df[col]),
                        filters=filters)
            for col in df
        ]
        row_dim = tiledb.Dim(domain=(0, df.shape[0] - 1),
                             tile=min(df.shape[0], 1000),
                             dtype=np.uint32)
        schema = tiledb.ArraySchema(domain=tiledb.Domain(row_dim),
                                    sparse=False,
                                    attrs=attrs,
                                    cell_order="row-major",
                                    tile_order="row-major")
        tiledb.DenseArray.create(name, schema)

    array_name = f"{cxg_container}/{dataframe_name}"
    _create_schema(array_name, dataframe)

    with tiledb.DenseArray(array_name, mode="w", ctx=ctx) as array:
        # Convert each column to a numpy array of its CXG dtype, collecting
        # any per-column schema hints along the way.
        value = {}
        schema_hints = {}
        for column_name, column_values in dataframe.items():
            dtype, hints = get_dtype_and_schema_of_array(column_values)
            value[column_name] = column_values.to_numpy(dtype=dtype)
            if hints:
                schema_hints[column_name] = hints

        schema_hints["index"] = index_column_name
        array[:] = value
        array.meta["cxg_schema"] = json.dumps(schema_hints)

    tiledb.consolidate(array_name, ctx=ctx)
Exemple #9
0
def save_dataframe(container, name, df, index_col_name, ctx):
    """Write *df* into a TileDB dense array at ``<container>/<name>``.

    The index column is aliased via ``alias_index_col``, every column is
    converted to its CXG dtype, and the collected schema hints (plus the
    index column name) are stored in the array's "cxg_schema" metadata.
    """
    A_name = f"{container}/{name}"
    df, index_col_name = alias_index_col(df, name, index_col_name)
    create_dataframe(A_name, df, ctx=ctx)

    # Convert columns to numpy arrays of their CXG dtypes, gathering any
    # per-column schema hints as we go.
    value = {}
    schema_hints = {}
    for col, series in df.items():
        dtype, hints = cxg_type(series)
        value[col] = series.to_numpy(dtype=dtype)
        if hints:
            schema_hints[col] = hints
    schema_hints["index"] = index_col_name

    with tiledb.DenseArray(A_name, mode="w", ctx=ctx) as A:
        A[:] = value
        A.meta["cxg_schema"] = json.dumps(schema_hints)

    tiledb.consolidate(A_name, ctx=ctx)
Exemple #10
0
def save_X(container, adata, ctx):
    """Write the AnnData X count matrix to a TileDB dense array.

    Rows are written in strides sized to keep each write around 1e9 cells
    (capped at 10,000 rows); sparse chunks are densified first.  The array
    is consolidated once, after the write handle has been closed.
    """
    # Save X count matrix
    X_name = f"{container}/X"
    shape = adata.X.shape
    create_X(X_name, shape)

    # Stride: nearest power of ten to 1e9 / ncols, capped at 10,000 rows.
    stride = min(int(np.power(10, np.around(np.log10(1e9 / shape[1])))), 10_000)
    with tiledb.DenseArray(X_name, mode="w", ctx=ctx) as X:
        for row in range(0, shape[0], stride):
            lim = min(row + stride, shape[0])
            a = adata.X[row:lim, :]
            # Densify scipy-style sparse chunks before the slab write.
            if type(a) is not np.ndarray:
                a = a.toarray()
            X[row:lim, :] = a
            log(2, "\t...rows", row, "to", lim)

    # BUG FIX: consolidate was previously also called inside the `with`
    # block, while the array was still open for writing; a single
    # consolidation after the writer closes is both sufficient and safe.
    tiledb.consolidate(X_name, ctx=ctx)
 def _run_consolidate(self, domain_names, data_array_name, verbose=False):
     """Consolidate every appended array so later reads are performant.

     A TileDB context with "sm.consolidation.steps" set to 100 is derived
     from ``self.ctx`` (or created fresh when ``self.ctx`` is None) and
     used to consolidate each array named in *domain_names*.
     """
     steps_key = "sm.consolidation.steps"
     steps_value = 100
     if self.ctx is None:
         ctx = tiledb.Ctx(tiledb.Config({steps_key: steps_value}))
     else:
         # Preserve the caller's existing configuration, overriding only
         # the consolidation step count.
         merged = self.ctx.config().dict()
         merged[steps_key] = steps_value
         ctx = tiledb.Ctx(config=tiledb.Config(merged))
     for i, domain_name in enumerate(domain_names):
         if verbose:
             print()  # Clear last carriage-returned print statement.
             print(f'Consolidating array: {i+1}/{len(domain_names)}',
                   end="\r")
         else:
             print('Consolidating...')
         array_path = self.array_path.construct_path(
             domain_name, data_array_name)
         tiledb.consolidate(array_path, ctx=ctx)
Exemple #12
0
def main():
    """CLI entry point: re-encode a CXG's dense X matrix as sparse TileDB.

    Copies the input CXG directory to the output path (skipping the X and
    X_col_shift arrays), reads the dense X matrix, and rewrites it in
    sparse form — with column-shift encoding when that makes an otherwise
    dense matrix sparse enough.  Exits with status 1 (and removes the
    output) when the matrix does not qualify as sparse.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="input cxg directory")
    parser.add_argument("output", help="output cxg directory")
    parser.add_argument("--overwrite",
                        action="store_true",
                        help="replace output cxg directory")
    parser.add_argument("--verbose",
                        "-v",
                        action="count",
                        default=0,
                        help="verbose output")
    parser.add_argument(
        "--sparse-threshold",
        "-s",
        type=float,
        default=5.0,  # default is 5% non-zero values
        help=
        "The X array will be sparse if the percent of non-zeros falls below this value",
    )
    args = parser.parse_args()

    # Refuse to clobber an existing output unless --overwrite was given.
    if os.path.exists(args.output):
        print("output dir exists:", args.output)
        if args.overwrite:
            print("output dir removed:", args.output)
            shutil.rmtree(args.output)
        else:
            print("use the overwrite option to remove the output directory")
            sys.exit(1)

    if not os.path.isdir(args.input):
        print("input is not a directory", args.input)
        sys.exit(1)

    # Copy everything except the matrices we are about to re-encode.
    shutil.copytree(args.input,
                    args.output,
                    ignore=shutil.ignore_patterns("X", "X_col_shift"))

    ctx = tiledb.Ctx({
        "sm.num_reader_threads": 32,
        "sm.num_writer_threads": 32,
        "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
    })

    with tiledb.DenseArray(os.path.join(args.input, "X"), mode="r",
                           ctx=ctx) as X_in:
        # Load the full dense matrix into memory for analysis.
        x_matrix_data = X_in[:, :]
        matrix_container = args.output

        is_sparse = is_matrix_sparse(x_matrix_data, args.sparse_threshold)
        if not is_sparse:
            # A dense matrix may still be sparse after subtracting a
            # per-column shift; a non-None col_shift means it can be encoded.
            col_shift = get_column_shift_encode_for_matrix(
                x_matrix_data, args.sparse_threshold)
            is_sparse = col_shift is not None
        else:
            col_shift = None

        if col_shift is not None:
            x_col_shift_name = f"{args.output}/X_col_shift"
            convert_ndarray_to_cxg_dense_array(x_col_shift_name, col_shift,
                                               ctx)
            # NOTE(review): this consolidates matrix_container (the output
            # directory), not the X_col_shift array just written — confirm
            # the intended target.
            tiledb.consolidate(matrix_container, ctx=ctx)
        if is_sparse:
            convert_matrix_to_cxg_array(matrix_container, x_matrix_data,
                                        is_sparse, ctx, col_shift)
            tiledb.consolidate(matrix_container, ctx=ctx)

    # Not sparse enough to be worth re-encoding: clean up and signal failure.
    if not is_sparse:
        print("The array is not sparse, cleaning up, abort.")
        shutil.rmtree(args.output)
        sys.exit(1)
Exemple #13
0
def save_X(container, xdata, ctx, sparse_threshold, expect_sparse=False):
    """Write the X count matrix to ``<container>/X`` as sparse or dense.

    sparse_threshold semantics: 100 forces sparse output, 0 forces dense;
    any other value triggers an estimate of the non-zero fraction
    (optionally after column-shift encoding) to pick the format.

    Returns the final is_sparse decision, or False immediately when
    expect_sparse is True but the matrix turned out dense.
    """
    # Save X count matrix
    X_name = f"{container}/X"

    shape = xdata.shape
    log(1, "\t...shape:", str(shape))

    col_shift = None
    if sparse_threshold == 100:
        is_sparse = True
    elif sparse_threshold == 0:
        is_sparse = False
    else:
        is_sparse, nnz, nelem = evaluate_for_sparse_encoding(xdata, sparse_threshold)
        percent = 100.0 * nnz / nelem
        # nelem below the full matrix size implies the evaluation sampled
        # rather than scanned everything, so the percentage is an estimate.
        if nelem != shape[0] * shape[1]:
            log(1, "\t...sparse=", is_sparse, "non-zeros percent (estimate): %6.2f" % percent)
        else:
            log(1, "\t...sparse=", is_sparse, "non-zeros:", nnz, "percent: %6.2f" % percent)

        # NOTE(review): this overwrites the is_sparse value returned by
        # evaluate_for_sparse_encoding with a direct percent comparison —
        # confirm the helper's verdict is intentionally discarded.
        is_sparse = percent < sparse_threshold
        if not is_sparse:
            col_shift, nnz, nelem = evaluate_for_sparse_column_shift_encoding(xdata, sparse_threshold)
            is_sparse = col_shift is not None
            percent = 100.0 * nnz / nelem
            if nelem != shape[0] * shape[1]:
                log(1, "\t...sparse=", is_sparse, "col shift non-zeros percent (estimate): %6.2f" % percent)
            else:
                log(1, "\t...sparse=", is_sparse, "col shift non-zeros:", nnz, "percent: %6.2f" % percent)

    # Caller demanded sparse output but the data is dense: bail out early.
    if expect_sparse is True and is_sparse is False:
        return False

    create_X(X_name, shape, is_sparse)
    # Row stride: nearest power of ten to 1e9 / ncols, capped at 10,000 rows.
    stride = min(int(np.power(10, np.around(np.log10(1e9 / shape[1])))), 10_000)
    if is_sparse:
        if col_shift is not None:
            log(1, "\t...output X as sparse matrix with column shift encoding")
            # Persist the per-column shift values alongside X so readers can
            # reconstruct the original matrix.
            X_col_shift_name = f"{container}/X_col_shift"
            filters = tiledb.FilterList([tiledb.ZstdFilter()])
            attrs = [tiledb.Attr(dtype=np.float32, filters=filters)]
            domain = tiledb.Domain(tiledb.Dim(domain=(0, shape[1] - 1), tile=min(shape[1], 5000), dtype=np.uint32))
            schema = tiledb.ArraySchema(domain=domain, attrs=attrs)
            tiledb.DenseArray.create(X_col_shift_name, schema)
            with tiledb.DenseArray(X_col_shift_name, mode="w", ctx=ctx) as X_col_shift:
                X_col_shift[:] = col_shift
            tiledb.consolidate(X_col_shift_name, ctx=ctx)
        else:
            log(1, "\t...output X as sparse matrix")

        with tiledb.SparseArray(X_name, mode="w", ctx=ctx) as X:
            nnz = 0
            for row in range(0, shape[0], stride):
                lim = min(row + stride, shape[0])
                a = xdata[row:lim, :]
                # Densify scipy-style sparse chunks before processing.
                if type(a) is not np.ndarray:
                    a = a.toarray()
                if col_shift is not None:
                    a = a - col_shift
                # Store only the non-zero cells, translating the chunk-local
                # row indices to global row coordinates.
                indices = np.nonzero(a)
                trow = indices[0] + row
                nnz += indices[0].shape[0]
                X[trow, indices[1]] = a[indices[0], indices[1]]
                log(2, "\t...rows", lim, "of", shape[0], "nnz", nnz, "sparse", nnz / (lim * shape[1]))

    else:
        log(1, "\t...output X as dense matrix")
        with tiledb.DenseArray(X_name, mode="w", ctx=ctx) as X:
            for row in range(0, shape[0], stride):
                lim = min(row + stride, shape[0])
                a = xdata[row:lim, :]
                if type(a) is not np.ndarray:
                    a = a.toarray()
                X[row:lim, :] = a
                log(2, "\t...rows", row, "to", lim)

    tiledb.consolidate(X_name, ctx=ctx)
    # vacuum only exists in newer tiledb releases; skip when unavailable.
    if hasattr(tiledb, "vacuum"):
        tiledb.vacuum(X_name)

    return is_sparse