Ejemplo n.º 1
0
    def write_anndata_x_matrix_to_cxg(self, output_cxg_directory, ctx,
                                      sparse_threshold):
        """Write ``self.anndata.X`` to the ``X`` array of a CXG directory.

        The matrix is stored as sparse when ``is_matrix_sparse`` says so for
        the given ``sparse_threshold``. Otherwise a column shift is requested
        from ``get_column_shift_encode_for_matrix``; if one is returned, it is
        written to ``X_col_shift`` and the matrix is stored as sparse using
        that shift. With no shift available the matrix is stored as dense.

        The resulting ``X`` array is consolidated, and vacuumed when the
        installed ``tiledb`` module exposes ``vacuum``.
        """
        x_uri = f"{output_cxg_directory}/X"
        x_values = self.anndata.X

        # Start from "no shift" and only look for one when the raw matrix
        # does not already qualify as sparse.
        column_shift = None
        store_as_sparse = is_matrix_sparse(x_values, sparse_threshold)
        if not store_as_sparse:
            column_shift = get_column_shift_encode_for_matrix(
                x_values, sparse_threshold)
            store_as_sparse = column_shift is not None

        if column_shift is not None:
            logging.info(
                "Converting matrix X as sparse matrix with column shift encoding"
            )
            convert_ndarray_to_cxg_dense_array(
                f"{output_cxg_directory}/X_col_shift", column_shift, ctx)

        convert_matrix_to_cxg_array(x_uri, x_values, store_as_sparse, ctx,
                                    column_shift)

        tiledb.consolidate(x_uri, ctx=ctx)
        # Not every tiledb release ships vacuum, hence the feature check.
        if hasattr(tiledb, "vacuum"):
            tiledb.vacuum(x_uri)
    def test__convert_matrix_to_cxg_array__sparse_array_only_store_nonzeros_empty_array(
            self):
        """Writing an all-zero matrix as sparse yields a sparse array whose
        read-back attribute data is empty."""
        matrix = np.zeros([3, 2])
        matrix_name = f"{self.testing_cxg_temp_directory}/awesome_zero_matrix_{uuid4()}"

        convert_matrix_to_cxg_array(matrix_name, matrix, True, tiledb.Ctx())

        self.assertTrue(path.isdir(matrix_name))
        # Open via a context manager so the array handle is closed even when
        # one of the assertions below fails.
        with tiledb.open(matrix_name) as actual_stored_array:
            self.assertIsInstance(actual_stored_array, tiledb.SparseArray)
            self.assertEqual(actual_stored_array[:, :][""].size, 0)
    def test__convert_matrix_to_cxg_array__dense_array_writes_successfully(
            self):
        """Writing a float32 matrix as dense yields a dense array whose
        contents equal the input matrix."""
        matrix = np.random.rand(3, 2).astype(np.float32)
        matrix_name = f"{self.testing_cxg_temp_directory}/awesome_matrix_{uuid4()}"

        convert_matrix_to_cxg_array(matrix_name, matrix, False, tiledb.Ctx())

        self.assertTrue(path.isdir(matrix_name))
        # Open via a context manager so the array handle is closed even when
        # one of the assertions below fails.
        with tiledb.open(matrix_name) as actual_stored_array:
            self.assertIsInstance(actual_stored_array, tiledb.DenseArray)
            self.assertTrue((actual_stored_array[:, :] == matrix).all())
    def test__convert_matrix_to_cxg_array__sparse_array_only_store_nonzeros(
            self):
        """Writing a matrix with three non-zero diagonal entries as sparse
        yields a sparse array holding exactly those three values."""
        matrix = np.zeros([3, 3])
        matrix[0, 0] = 1
        matrix[1, 1] = 1
        matrix[2, 2] = 2
        matrix_name = f"{self.testing_cxg_temp_directory}/awesome_sparse_matrix_{uuid4()}"

        convert_matrix_to_cxg_array(matrix_name, matrix, True, tiledb.Ctx())

        self.assertTrue(path.isdir(matrix_name))
        # Open via a context manager so the array handle is closed even when
        # one of the assertions below fails.
        with tiledb.open(matrix_name) as actual_stored_array:
            self.assertIsInstance(actual_stored_array, tiledb.SparseArray)
            self.assertEqual(actual_stored_array[0, 0][''], 1)
            self.assertEqual(actual_stored_array[1, 1][''], 1)
            self.assertEqual(actual_stored_array[2, 2][''], 2)
            self.assertEqual(actual_stored_array[:, :][''].size, 3)
    def test__convert_matrix_to_cxg_array__sparse_array_with_column_encoding_empty_array(
            self):
        """Writing a matrix of ones with an all-ones column shift yields a
        sparse array whose read-back attribute data is empty."""
        matrix_name = f"{self.testing_cxg_temp_directory}/awesome_column_shift_matrix_{uuid4()}"
        matrix = np.ones((3, 2))
        # The column shift will be equal to the matrix since subtracting the column shift from the matrix will create
        # a matrix of zeros which is sparse.
        column_shift = np.ones((3, 2))

        convert_matrix_to_cxg_array(
            matrix_name,
            matrix,
            True,
            tiledb.Ctx(),
            column_shift_for_sparse_encoding=column_shift)

        self.assertTrue(path.isdir(matrix_name))
        # Open via a context manager so the array handle is closed even when
        # one of the assertions below fails.
        with tiledb.open(matrix_name) as actual_stored_array:
            self.assertIsInstance(actual_stored_array, tiledb.SparseArray)
            self.assertEqual(actual_stored_array[:, :][""].size, 0)
    def test__convert_matrix_to_cxg_array__sparse_array_with_column_encoding_partial_array(
            self):
        """Writing a 2x2 matrix of ones with only the first column shifted
        yields a sparse array that keeps the two second-column values."""
        matrix_name = f"{self.testing_cxg_temp_directory}/awesome_column_shift_matrix_{uuid4()}"
        matrix = np.ones((2, 2))
        # Only column shift the first column of ones.
        column_shift = np.array([[1, 0], [1, 0]])

        convert_matrix_to_cxg_array(
            matrix_name,
            matrix,
            True,
            tiledb.Ctx(),
            column_shift_for_sparse_encoding=column_shift)

        self.assertTrue(path.isdir(matrix_name))
        # Open via a context manager so the array handle is closed even when
        # one of the assertions below fails.
        with tiledb.open(matrix_name) as actual_stored_array:
            self.assertIsInstance(actual_stored_array, tiledb.SparseArray)
            self.assertEqual(actual_stored_array[0, 1][""], 1)
            self.assertEqual(actual_stored_array[1, 1][""], 1)
            self.assertEqual(actual_stored_array[:, :][""].size, 2)
Ejemplo n.º 7
0
def main():
    """Command-line entry point: re-encode the dense X array of a CXG
    directory as a sparse array in a copy of that directory.

    The input directory is copied to the output location without its ``X``
    and ``X_col_shift`` entries. The input ``X`` is then read as a dense
    TileDB array and, if it qualifies as sparse (directly, or after a column
    shift returned by ``get_column_shift_encode_for_matrix``), written to
    ``<output>/X``; any column shift is written to ``<output>/X_col_shift``.

    Exits with status 1 when the input is not a directory, when the output
    already exists and ``--overwrite`` was not given, or when the matrix is
    not sparse (the freshly copied output directory is removed in that case).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="input cxg directory")
    parser.add_argument("output", help="output cxg directory")
    parser.add_argument("--overwrite",
                        action="store_true",
                        help="replace output cxg directory")
    # NOTE: --verbose is accepted but its value is not read in this function.
    parser.add_argument("--verbose",
                        "-v",
                        action="count",
                        default=0,
                        help="verbose output")
    parser.add_argument(
        "--sparse-threshold",
        "-s",
        type=float,
        default=5.0,  # default is 5% non-zero values
        help=
        "The X array will be sparse if the percent of non-zeros falls below this value",
    )
    args = parser.parse_args()

    # Validate the input before touching the output, so that a bad input
    # path can never cost the user an existing output directory.
    if not os.path.isdir(args.input):
        print("input is not a directory", args.input)
        sys.exit(1)

    if os.path.exists(args.output):
        print("output dir exists:", args.output)
        if args.overwrite:
            print("output dir removed:", args.output)
            shutil.rmtree(args.output)
        else:
            print("use the overwrite option to remove the output directory")
            sys.exit(1)

    # X and X_col_shift are left out of the copy because they are
    # regenerated below.
    shutil.copytree(args.input,
                    args.output,
                    ignore=shutil.ignore_patterns("X", "X_col_shift"))

    ctx = tiledb.Ctx({
        "sm.num_reader_threads": 32,
        "sm.num_writer_threads": 32,
        "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
    })

    with tiledb.DenseArray(os.path.join(args.input, "X"), mode="r",
                           ctx=ctx) as X_in:
        x_matrix_data = X_in[:, :]
        # The matrix belongs in the X entry of the output directory (the one
        # skipped by the copy above), not on the output directory itself.
        matrix_container = f"{args.output}/X"

        is_sparse = is_matrix_sparse(x_matrix_data, args.sparse_threshold)
        if not is_sparse:
            col_shift = get_column_shift_encode_for_matrix(
                x_matrix_data, args.sparse_threshold)
            is_sparse = col_shift is not None
        else:
            col_shift = None

        if col_shift is not None:
            x_col_shift_name = f"{args.output}/X_col_shift"
            convert_ndarray_to_cxg_dense_array(x_col_shift_name, col_shift,
                                               ctx)
            # Consolidate the array that was just written; X does not exist
            # yet at this point.
            tiledb.consolidate(x_col_shift_name, ctx=ctx)
        if is_sparse:
            convert_matrix_to_cxg_array(matrix_container, x_matrix_data,
                                        is_sparse, ctx, col_shift)
            tiledb.consolidate(matrix_container, ctx=ctx)

    if not is_sparse:
        print("The array is not sparse, cleaning up, abort.")
        shutil.rmtree(args.output)
        sys.exit(1)