def convert_to_cxg(
    input_file,
    output_directory,
    backed,
    title,
    about,
    sparse_threshold,
    obs_names,
    var_names,
    disable_custom_colors,
    disable_corpora_schema,
    overwrite,
):
    """
    Convert a dataset file into CXG.
    """
    h5ad_data_file = H5ADDataFile(
        input_file,
        backed,
        title,
        about,
        obs_names,
        var_names,
        use_corpora_schema=not disable_corpora_schema,
    )

    # Get the directory that will hold all the CXG files
    cxg_output_container = get_output_directory(input_file, output_directory, overwrite)

    h5ad_data_file.to_cxg(
        cxg_output_container,
        sparse_threshold,
        convert_anndata_colors_to_cxg_colors=not disable_custom_colors,
    )

def test__create_h5ad_data_file__non_h5ad_raises_exception(self):
    non_h5ad_filename = "my_fancy_dataset.csv"

    with self.assertRaises(Exception) as exception_context:
        H5ADDataFile(non_h5ad_filename)

    self.assertIn("File must be an H5AD", str(exception_context.exception))

def test__create_h5ad_data_file__inputted_dataset_title_and_about_overrides_extracted(self):
    h5ad_file = H5ADDataFile(
        self.sample_h5ad_filename,
        dataset_about="override_about",
        dataset_title="override_title",
    )

    self.assertEqual(h5ad_file.dataset_title, "override_title")
    self.assertEqual(h5ad_file.dataset_about, "override_about")

def sparse_diffexp(self, apply_col_shift):
    with tempfile.TemporaryDirectory() as dirname:
        # Create a sparse test matrix and convert it to CXG; a sparse_threshold of 11
        # exceeds the 10% data density, so the X array is stored sparse.
        h5adfile_path = os.path.join(dirname, "sparse.h5ad")
        create_test_h5ad(h5adfile_path, 2000, 2000, 10, apply_col_shift)
        h5ad_file_to_convert = H5ADDataFile(h5adfile_path, use_corpora_schema=False)

        sparsename = os.path.join(dirname, "sparse.cxg")
        h5ad_file_to_convert.to_cxg(sparsename, 11, True)

        adaptor_anndata = self.load_dataset(h5adfile_path, extra_dataset_config=dict(embeddings__names=[]))
        adaptor_sparse = self.load_dataset(sparsename)
        assert adaptor_sparse.open_array("X").schema.sparse
        assert adaptor_sparse.has_array("X_col_shift") == apply_col_shift

        # Convert the same data to a dense CXG (a sparse_threshold of 0 disables sparse encoding).
        densename = os.path.join(dirname, "dense.cxg")
        h5ad_file_to_convert.to_cxg(densename, 0, True)
        adaptor_dense = self.load_dataset(densename)
        assert not adaptor_dense.open_array("X").schema.sparse
        assert not adaptor_dense.has_array("X_col_shift")

        # Run differential expression against all three adaptors and compare the results.
        maskA = self.get_mask(adaptor_anndata, 1, 10)
        maskB = self.get_mask(adaptor_anndata, 2, 10)
        diffexp_results_anndata = diffexp_generic.diffexp_ttest(adaptor_anndata, maskA, maskB, 10)
        diffexp_results_sparse = diffexp_cxg.diffexp_ttest(adaptor_sparse, maskA, maskB, 10)
        diffexp_results_dense = diffexp_cxg.diffexp_ttest(adaptor_dense, maskA, maskB, 10)

        self.compare_diffexp_results(diffexp_results_anndata, diffexp_results_sparse)
        self.compare_diffexp_results(diffexp_results_anndata, diffexp_results_dense)

        topcols = np.array([x[0] for x in diffexp_results_anndata])
        cols_anndata = self.get_X_col(adaptor_anndata, topcols)
        cols_sparse = self.get_X_col(adaptor_sparse, topcols)
        cols_dense = self.get_X_col(adaptor_dense, topcols)

        assert cols_anndata.shape[0] == adaptor_sparse.get_shape()[0]
        assert cols_anndata.shape[1] == len(diffexp_results_anndata)

        def convert(mat, cols):
            # Round-trip through the flatbuffer encoding used by the API layer.
            return decode_matrix_fbs(encode_matrix_fbs(mat, col_idx=cols)).to_numpy()

        cols_anndata = convert(cols_anndata, topcols)
        cols_sparse = convert(cols_sparse, topcols)
        cols_dense = convert(cols_dense, topcols)

        x = adaptor_sparse.get_X_array()
        assert x.shape == adaptor_sparse.get_shape()

        # Values in the top differentially expressed columns must match across all three backends.
        for row in range(cols_anndata.shape[0]):
            for col in range(cols_anndata.shape[1]):
                vanndata = cols_anndata[row][col]
                vsparse = cols_sparse[row][col]
                vdense = cols_dense[row][col]
                self.assertTrue(np.isclose(vanndata, vsparse, 1e-6, 1e-6))
                self.assertTrue(np.isclose(vanndata, vdense, 1e-6, 1e-6))

def test__create_h5ad_data_file__no_copy_if_obs_and_var_index_names_specified(self):
    h5ad_file = H5ADDataFile(
        self.sample_h5ad_filename,
        use_corpora_schema=False,
        obs_index_column_name="float_category",
        vars_index_column_name="int_category",
    )

    self.assertNotIn("name_0", h5ad_file.obs.columns)
    self.assertNotIn("name_0", h5ad_file.var.columns)

def test__create_h5ad_data_file__copies_index_of_obs_and_var_to_column(self):
    h5ad_file = H5ADDataFile(self.sample_h5ad_filename, use_corpora_schema=False)

    # The automatic name chosen for the copied index column should be "name_0"
    self.assertNotIn("name_0", self.sample_anndata.obs.columns)
    self.assertIn("name_0", h5ad_file.obs.columns)

    self.assertNotIn("name_0", self.sample_anndata.var.columns)
    self.assertIn("name_0", h5ad_file.var.columns)

def test__create_h5ad_data_file__obs_and_var_index_names_specified_doesnt_exist_raises_exception(self):
    with self.assertRaises(Exception) as exception_context:
        H5ADDataFile(
            self.sample_h5ad_filename,
            use_corpora_schema=False,
            obs_index_column_name="unknown_category",
            vars_index_column_name="i_dont_exist",
        )

    self.assertIn("does not exist", str(exception_context.exception))

def test__create_h5ad_data_file__assert_warning_outputted_if_dataset_title_or_about_given(self):
    with self.assertLogs(level="WARN") as logger:
        H5ADDataFile(
            self.sample_h5ad_filename,
            dataset_title="My Awesome Dataset",
            dataset_about="http://www.awesomedataset.com",
            use_corpora_schema=False,
        )

    self.assertIn("will override any metadata that is extracted", logger.output[0])

def test__create_h5ad_data_file__obs_and_var_index_names_specified_not_unique_raises_exception(self):
    with self.assertRaises(Exception) as exception_context:
        H5ADDataFile(
            self.sample_h5ad_filename,
            use_corpora_schema=False,
            obs_index_column_name="float_category",
            vars_index_column_name="bool_category",
        )

    self.assertIn("Please prepare data to contain unique values", str(exception_context.exception))

def test__to_cxg__with_sparse_column_encoding(self):
    anndata = self._create_sample_anndata_dataset()
    anndata.X = np.ones((3, 4))
    sparse_with_column_shift_filename = self._write_anndata_to_file(anndata)

    h5ad_file = H5ADDataFile(sparse_with_column_shift_filename)
    h5ad_file.to_cxg(self.sample_output_directory, 50)

    self._validate_expected_generated_list_of_tiledb_files(has_column_encoding=True)

    # Clean up
    remove(sparse_with_column_shift_filename)

def test__create_h5ad_data_file__reads_anndata_successfully(self):
    h5ad_file = H5ADDataFile(self.sample_h5ad_filename, use_corpora_schema=False)

    self.assertTrue((h5ad_file.anndata.X == self.sample_anndata.X).all())
    # Compare obs/var on sorted copies; sort_index(inplace=True) returns None, which
    # would make an assertEqual comparison vacuous.
    self.assertTrue(h5ad_file.anndata.obs.sort_index().equals(self.sample_anndata.obs.sort_index()))
    self.assertTrue(h5ad_file.anndata.var.sort_index().equals(self.sample_anndata.var.sort_index()))

    for key in h5ad_file.anndata.obsm.keys():
        self.assertIn(key, self.sample_anndata.obsm.keys())
        self.assertTrue((h5ad_file.anndata.obsm[key] == self.sample_anndata.obsm[key]).all())

    for key in self.sample_anndata.obsm.keys():
        self.assertIn(key, h5ad_file.anndata.obsm.keys())
        self.assertTrue((h5ad_file.anndata.obsm[key] == self.sample_anndata.obsm[key]).all())

def test__to_cxg__simple_anndata_with_corpora_and_dense(self):
    h5ad_file = H5ADDataFile(self.sample_h5ad_filename)
    h5ad_file.to_cxg(self.sample_output_directory, 0)

    self._validate_expected_generated_list_of_tiledb_files()

def test__to_cxg__simple_anndata_no_corpora_and_sparse(self):
    h5ad_file = H5ADDataFile(self.sample_h5ad_filename, use_corpora_schema=False)
    h5ad_file.to_cxg(self.sample_output_directory, 100)

    self._validate_expected_generated_list_of_tiledb_files()

def test__create_h5ad_data_file__extract_about_and_title_from_dataset(self):
    h5ad_file = H5ADDataFile(self.sample_h5ad_filename)

    self.assertEqual(h5ad_file.dataset_title, "random_link_name")
    self.assertEqual(h5ad_file.dataset_about, "www.link.com")