def write_anndata_x_matrix_to_cxg(self, output_cxg_directory, ctx, sparse_threshold):
    """Write ``self.anndata.X`` into the ``X`` array of a CXG directory.

    The matrix is stored as sparse when ``is_matrix_sparse`` says so, or when
    ``get_column_shift_encode_for_matrix`` returns a column shift (anything
    other than ``None``). In the latter case the shift itself is also written
    to ``<output_cxg_directory>/X_col_shift``.

    Args:
        output_cxg_directory: Directory under which ``X`` (and possibly
            ``X_col_shift``) are created.
        ctx: TileDB context forwarded to the conversion helpers and to
            ``tiledb.consolidate``.
        sparse_threshold: Forwarded unchanged to ``is_matrix_sparse`` and
            ``get_column_shift_encode_for_matrix``.
    """
    x_array_uri = f"{output_cxg_directory}/X"
    expression_matrix = self.anndata.X

    # The column shift is only looked for when the raw matrix is not already
    # considered sparse.
    column_shift = None
    store_as_sparse = is_matrix_sparse(expression_matrix, sparse_threshold)
    if not store_as_sparse:
        column_shift = get_column_shift_encode_for_matrix(expression_matrix, sparse_threshold)
        store_as_sparse = column_shift is not None

    if column_shift is not None:
        logging.info("Converting matrix X as sparse matrix with column shift encoding")
        convert_ndarray_to_cxg_dense_array(
            f"{output_cxg_directory}/X_col_shift", column_shift, ctx
        )

    convert_matrix_to_cxg_array(x_array_uri, expression_matrix, store_as_sparse, ctx, column_shift)

    tiledb.consolidate(x_array_uri, ctx=ctx)
    # vacuum is only invoked when the installed tiledb module exposes it.
    if hasattr(tiledb, "vacuum"):
        tiledb.vacuum(x_array_uri)
def test__convert_matrix_to_cxg_array__sparse_array_only_store_nonzeros_empty_array(self):
    """An all-zero matrix written as sparse yields a SparseArray with no stored cells."""
    matrix = np.zeros([3, 2])
    matrix_name = f"{self.testing_cxg_temp_directory}/awesome_zero_matrix_{uuid4()}"

    convert_matrix_to_cxg_array(matrix_name, matrix, True, tiledb.Ctx())

    # Open through a context manager so the array handle is closed even when
    # an assertion fails (the handle was previously leaked).
    with tiledb.open(matrix_name) as actual_stored_array:
        self.assertTrue(path.isdir(matrix_name))
        self.assertIsInstance(actual_stored_array, tiledb.SparseArray)
        # "" is the key used to read the stored values out of the result.
        self.assertEqual(actual_stored_array[:, :][""].size, 0)
def test__convert_matrix_to_cxg_array__dense_array_writes_successfully(self):
    """A float32 matrix written as dense is read back unchanged from a DenseArray."""
    matrix = np.float32(np.random.rand(3, 2))
    matrix_name = f"{self.testing_cxg_temp_directory}/awesome_matrix_{uuid4()}"

    convert_matrix_to_cxg_array(matrix_name, matrix, False, tiledb.Ctx())

    # Open through a context manager so the array handle is closed even when
    # an assertion fails (the handle was previously leaked).
    with tiledb.open(matrix_name) as actual_stored_array:
        self.assertTrue(path.isdir(matrix_name))
        self.assertIsInstance(actual_stored_array, tiledb.DenseArray)
        self.assertTrue((actual_stored_array[:, :] == matrix).all())
def test__convert_matrix_to_cxg_array__sparse_array_with_column_encoding_empty_array(self):
    """A matrix equal to its column shift is stored as a SparseArray with no cells."""
    matrix_name = f"{self.testing_cxg_temp_directory}/awesome_column_shift_matrix_{uuid4()}"
    matrix = np.ones((3, 2))
    # The column shift will be equal to the matrix since subtracting the column shift from the
    # matrix will create a matrix of zeros which is sparse.
    column_shift = np.ones((3, 2))

    convert_matrix_to_cxg_array(
        matrix_name,
        matrix,
        True,
        tiledb.Ctx(),
        column_shift_for_sparse_encoding=column_shift,
    )

    # Open through a context manager so the array handle is closed even when
    # an assertion fails (the handle was previously leaked).
    with tiledb.open(matrix_name) as actual_stored_array:
        self.assertTrue(path.isdir(matrix_name))
        self.assertIsInstance(actual_stored_array, tiledb.SparseArray)
        self.assertEqual(actual_stored_array[:, :][""].size, 0)
def test__convert_matrix_to_cxg_array__sparse_array_with_column_encoding_partial_array(self):
    """Shifting only the first column leaves exactly the second column's values stored."""
    matrix_name = f"{self.testing_cxg_temp_directory}/awesome_column_shift_matrix_{uuid4()}"
    matrix = np.ones((2, 2))
    # Only column shift the first column of ones.
    column_shift = np.array([[1, 0], [1, 0]])

    convert_matrix_to_cxg_array(
        matrix_name,
        matrix,
        True,
        tiledb.Ctx(),
        column_shift_for_sparse_encoding=column_shift,
    )

    # Open through a context manager so the array handle is closed even when
    # an assertion fails (the handle was previously leaked).
    with tiledb.open(matrix_name) as actual_stored_array:
        self.assertTrue(path.isdir(matrix_name))
        self.assertIsInstance(actual_stored_array, tiledb.SparseArray)
        # The unshifted second column keeps its ones in both rows...
        self.assertTrue(actual_stored_array[0, 1][""] == 1)
        self.assertTrue(actual_stored_array[1, 1][""] == 1)
        # ...and those two cells are the only ones stored.
        self.assertEqual(actual_stored_array[:, :][""].size, 2)
def main():
    """Copy a CXG directory, re-encoding its dense ``X`` array as sparse.

    Command line: ``input output [--overwrite] [-v] [-s SPARSE_THRESHOLD]``.

    Everything in ``input`` except ``X`` and ``X_col_shift`` is copied to
    ``output``. ``input/X`` is then read in full and, if it qualifies as
    sparse (directly, or after column-shift encoding), written to
    ``output/X``; a column shift, when used, is written to
    ``output/X_col_shift``.

    Exits with status 1 when the input is not a directory, when the output
    already exists and ``--overwrite`` was not given, or when the matrix is
    not sparse (in which case the partially built output is removed).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="input cxg directory")
    parser.add_argument("output", help="output cxg directory")
    parser.add_argument("--overwrite", action="store_true", help="replace output cxg directory")
    # NOTE: --verbose is accepted but not consulted anywhere in this function.
    parser.add_argument("--verbose", "-v", action="count", default=0, help="verbose output")
    parser.add_argument(
        "--sparse-threshold",
        "-s",
        type=float,
        default=5.0,  # default is 5% non-zero values
        help="The X array will be sparse if the percent of non-zeros falls below this value",
    )
    args = parser.parse_args()

    # Validate the input before touching the output, so that a mistyped input
    # path combined with --overwrite cannot destroy an existing output.
    if not os.path.isdir(args.input):
        print("input is not a directory", args.input)
        sys.exit(1)

    if os.path.exists(args.output):
        print("output dir exists:", args.output)
        if not args.overwrite:
            print("use the overwrite option to remove the output directory")
            sys.exit(1)
        print("output dir removed:", args.output)
        shutil.rmtree(args.output)

    # X and X_col_shift are regenerated below, so they are not copied.
    shutil.copytree(args.input, args.output, ignore=shutil.ignore_patterns("X", "X_col_shift"))

    ctx = tiledb.Ctx(
        {
            "sm.num_reader_threads": 32,
            "sm.num_writer_threads": 32,
            "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
        }
    )

    with tiledb.DenseArray(os.path.join(args.input, "X"), mode="r", ctx=ctx) as X_in:
        x_matrix_data = X_in[:, :]

        # The X array lives in the "X" entry of the output directory, mirroring
        # where it is read from in the input; targeting the output root itself
        # would collide with the directory just created by copytree.
        matrix_container = f"{args.output}/X"

        col_shift = None
        is_sparse = is_matrix_sparse(x_matrix_data, args.sparse_threshold)
        if not is_sparse:
            col_shift = get_column_shift_encode_for_matrix(x_matrix_data, args.sparse_threshold)
            is_sparse = col_shift is not None

        if col_shift is not None:
            x_col_shift_name = f"{args.output}/X_col_shift"
            convert_ndarray_to_cxg_dense_array(x_col_shift_name, col_shift, ctx)
            # Consolidate the array that was just written; X does not exist yet.
            tiledb.consolidate(x_col_shift_name, ctx=ctx)

        if is_sparse:
            convert_matrix_to_cxg_array(matrix_container, x_matrix_data, is_sparse, ctx, col_shift)
            tiledb.consolidate(matrix_container, ctx=ctx)

    if not is_sparse:
        print("The array is not sparse, cleaning up, abort.")
        shutil.rmtree(args.output)
        sys.exit(1)