Esempio n. 1
0
    def write_anndata_x_matrix_to_cxg(self, output_cxg_directory, ctx,
                                      sparse_threshold):
        """Write ``self.anndata.X`` to the ``X`` array of a CXG directory.

        The sparse flag handed to ``convert_matrix_to_cxg_array`` is the
        result of ``is_matrix_sparse``. When that check is negative, a column
        shift encoding is requested instead; if one is returned it is saved
        to ``<output_cxg_directory>/X_col_shift`` and the sparse flag is set.
        Afterwards the ``X`` array is consolidated, and vacuumed when the
        installed ``tiledb`` module exposes ``vacuum``.

        Args:
            output_cxg_directory: Directory that receives the ``X`` (and
                possibly ``X_col_shift``) arrays.
            ctx: TileDB context forwarded to the conversion helpers and to
                ``tiledb.consolidate``.
            sparse_threshold: Threshold forwarded to ``is_matrix_sparse`` and
                ``get_column_shift_encode_for_matrix``.
        """
        x_array_uri = f"{output_cxg_directory}/X"
        x_data = self.anndata.X

        # Only fall back to column shift encoding when the plain sparsity
        # check rejects the matrix.
        shift_encoding = None
        store_sparse = is_matrix_sparse(x_data, sparse_threshold)
        if not store_sparse:
            shift_encoding = get_column_shift_encode_for_matrix(
                x_data, sparse_threshold)
            store_sparse = shift_encoding is not None

        if shift_encoding is not None:
            logging.info(
                "Converting matrix X as sparse matrix with column shift encoding"
            )
            convert_ndarray_to_cxg_dense_array(
                f"{output_cxg_directory}/X_col_shift", shift_encoding, ctx)

        convert_matrix_to_cxg_array(x_array_uri, x_data, store_sparse, ctx,
                                    shift_encoding)

        tiledb.consolidate(x_array_uri, ctx=ctx)
        # ``vacuum`` is only invoked when this tiledb module provides it.
        if hasattr(tiledb, "vacuum"):
            tiledb.vacuum(x_array_uri)
    def test__is_matrix_sparse__partially_populated_dense_matrix_returns_false(self):
        """A 2x2 matrix with three non-zero cells is not sparse at threshold 50."""
        mostly_filled = np.array([
            [1.0, 2.2],
            [0.0, 3.7],
        ])

        self.assertFalse(is_matrix_sparse(mostly_filled, 50))
    def test__is_matrix_sparse_with_column_shift_encoding__giant_matrix_returns_false_early(self):
        """A large random matrix is rejected at threshold 1 and logs an estimate."""
        giant_matrix = np.random.rand(20000, 20)

        with self.assertLogs(level="INFO") as captured:
            verdict = is_matrix_sparse(giant_matrix, 1)
            self.assertFalse(verdict)

            # An early return is expected here, so the first log record
            # should report the _estimated_ rather than the _exact_
            # percentage of non-zero elements.
            first_record = captured.output[0]
            self.assertIn("Percentage of non-zero elements (estimate)", first_record)
    def test__is_matrix_sparse__zero_and_one_hundred_percent_threshold(self):
        """A fully populated vector is not sparse at threshold 0 and is at 100."""
        fully_populated = np.array([1, 2, 3])

        at_zero_percent = is_matrix_sparse(fully_populated, 0)
        self.assertFalse(at_zero_percent)

        at_hundred_percent = is_matrix_sparse(fully_populated, 100)
        self.assertTrue(at_hundred_percent)
    def test__is_matrix_sparse__partially_populated_sparse_matrix_returns_true(self):
        """A 3x4 matrix with two non-zero cells is sparse at threshold 50."""
        mostly_empty = np.zeros((3, 4))
        mostly_empty[1, 1] = 2.2
        mostly_empty[2, 3] = 1.0

        self.assertTrue(is_matrix_sparse(mostly_empty, 50))
Esempio n. 6
0
def main():
    """Copy a CXG directory, rewriting its dense ``X`` array as sparse.

    Command-line arguments:
        input: Existing CXG directory whose ``X`` array is read.
        output: Directory to create; everything except ``X`` and
            ``X_col_shift`` is copied over from ``input``.
        --overwrite: Remove ``output`` first if it already exists.
        --verbose / -v: Counted flag; parsed but not consulted here.
        --sparse-threshold / -s: Percent of non-zero values below which the
            X array is written as sparse (default 5.0).

    The process exits with status 1 when ``input`` is not a directory, when
    ``output`` exists and ``--overwrite`` was not given, or when the matrix
    is not judged sparse (in which case ``output`` is removed again).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="input cxg directory")
    parser.add_argument("output", help="output cxg directory")
    parser.add_argument("--overwrite",
                        action="store_true",
                        help="replace output cxg directory")
    parser.add_argument("--verbose",
                        "-v",
                        action="count",
                        default=0,
                        help="verbose output")
    parser.add_argument(
        "--sparse-threshold",
        "-s",
        type=float,
        default=5.0,  # default is 5% non-zero values
        help=
        "The X array will be sparse if the percent of non-zeros falls below this value",
    )
    args = parser.parse_args()

    # Validate the input before touching the output, so that a bad input
    # path combined with --overwrite never destroys an existing output.
    if not os.path.isdir(args.input):
        print("input is not a directory", args.input)
        sys.exit(1)

    if os.path.exists(args.output):
        print("output dir exists:", args.output)
        if args.overwrite:
            print("output dir removed:", args.output)
            shutil.rmtree(args.output)
        else:
            print("use the overwrite option to remove the output directory")
            sys.exit(1)

    # X and X_col_shift are regenerated below, so they are not copied.
    shutil.copytree(args.input,
                    args.output,
                    ignore=shutil.ignore_patterns("X", "X_col_shift"))

    ctx = tiledb.Ctx({
        "sm.num_reader_threads": 32,
        "sm.num_writer_threads": 32,
        "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
    })

    with tiledb.DenseArray(os.path.join(args.input, "X"), mode="r",
                           ctx=ctx) as X_in:
        x_matrix_data = X_in[:, :]
        # The X array lives in the "X" entry of the output directory (the
        # entry skipped by copytree above), not in the directory itself.
        matrix_container = f"{args.output}/X"

        is_sparse = is_matrix_sparse(x_matrix_data, args.sparse_threshold)
        if not is_sparse:
            # Fall back to column shift encoding; a returned encoding means
            # the matrix can still be written as sparse.
            col_shift = get_column_shift_encode_for_matrix(
                x_matrix_data, args.sparse_threshold)
            is_sparse = col_shift is not None
        else:
            col_shift = None

        if col_shift is not None:
            x_col_shift_name = f"{args.output}/X_col_shift"
            convert_ndarray_to_cxg_dense_array(x_col_shift_name, col_shift,
                                               ctx)
            # Consolidate the array that was just written; X itself does not
            # exist yet at this point.
            tiledb.consolidate(x_col_shift_name, ctx=ctx)
        if is_sparse:
            convert_matrix_to_cxg_array(matrix_container, x_matrix_data,
                                        is_sparse, ctx, col_shift)
            tiledb.consolidate(matrix_container, ctx=ctx)

    if not is_sparse:
        print("The array is not sparse, cleaning up, abort.")
        shutil.rmtree(args.output)
        sys.exit(1)