Exemple #1
0
def convert_to_cxg(
    input_file,
    output_directory,
    backed,
    title,
    about,
    sparse_threshold,
    obs_names,
    var_names,
    disable_custom_colors,
    disable_corpora_schema,
    overwrite,
):
    """
    Convert a dataset file into CXG.

    Wraps ``input_file`` in an H5ADDataFile, resolves the output container
    through get_output_directory, and asks the data file to write itself
    there as a CXG.
    """

    # The flags arrive in negated ("disable_*") form; the callees expect the
    # positive form.
    use_corpora_schema = not disable_corpora_schema
    convert_colors = not disable_custom_colors

    source = H5ADDataFile(
        input_file,
        backed,
        title,
        about,
        obs_names,
        var_names,
        use_corpora_schema=use_corpora_schema,
    )

    # Get the directory that will hold all the CXG files
    destination = get_output_directory(input_file, output_directory, overwrite)

    source.to_cxg(
        destination,
        sparse_threshold,
        convert_anndata_colors_to_cxg_colors=convert_colors,
    )
Exemple #2
0
    def test__create_h5ad_data_file__non_h5ad_raises_exception(self):
        """A filename that is not an H5AD must be rejected with a clear message."""
        csv_path = "my_fancy_dataset.csv"

        with self.assertRaises(Exception) as raised:
            H5ADDataFile(csv_path)

        error_text = str(raised.exception)
        self.assertIn("File must be an H5AD", error_text)
Exemple #3
0
    def test__create_h5ad_data_file__inputted_dataset_title_and_about_overrides_extracted(
            self):
        """Title/about passed to the constructor are what the data file reports."""
        overrides = {
            "dataset_title": "override_title",
            "dataset_about": "override_about",
        }

        data_file = H5ADDataFile(self.sample_h5ad_filename, **overrides)

        # Title is checked first, then about.
        for attribute, expected in overrides.items():
            self.assertEqual(getattr(data_file, attribute), expected)
Exemple #4
0
    def test__create_h5ad_data_file__no_copy_if_obs_and_var_index_names_specified(
            self):
        """With explicit index columns, no "name_0" column shows up in obs or var."""
        data_file = H5ADDataFile(
            self.sample_h5ad_filename,
            obs_index_column_name="float_category",
            vars_index_column_name="int_category",
            use_corpora_schema=False,
        )

        for frame_name in ("obs", "var"):
            frame = getattr(data_file, frame_name)
            self.assertNotIn("name_0", frame.columns)
Exemple #5
0
    def test__create_h5ad_data_file__obs_and_var_index_names_specified_doesnt_exist_raises_exception(
            self):
        """Index column names that are not in the dataset must raise "does not exist"."""
        missing_columns = {
            "obs_index_column_name": "unknown_category",
            "vars_index_column_name": "i_dont_exist",
        }

        with self.assertRaises(Exception) as raised:
            H5ADDataFile(
                self.sample_h5ad_filename,
                use_corpora_schema=False,
                **missing_columns,
            )

        error_text = str(raised.exception)
        self.assertIn("does not exist", error_text)
Exemple #6
0
    def test__create_h5ad_data_file__assert_warning_outputted_if_dataset_title_or_about_given(
            self):
        """Supplying a title/about logs a warning that extracted metadata is overridden."""
        with self.assertLogs(level="WARN") as captured:
            H5ADDataFile(
                self.sample_h5ad_filename,
                use_corpora_schema=False,
                dataset_title="My Awesome Dataset",
                dataset_about="http://www.awesomedataset.com",
            )

        # Only the first captured record is inspected.
        first_record = captured.output[0]
        self.assertIn("will override any metadata that is extracted",
                      first_record)
Exemple #7
0
    def test__create_h5ad_data_file__obs_and_var_index_names_specified_not_unique_raises_exception(
            self):
        """Index columns rejected as non-unique must raise the "unique values" error."""
        # Presumably these fixture columns hold repeated values -- verify
        # against the sample dataset builder.
        index_columns = {
            "obs_index_column_name": "float_category",
            "vars_index_column_name": "bool_category",
        }

        with self.assertRaises(Exception) as raised:
            H5ADDataFile(
                self.sample_h5ad_filename,
                use_corpora_schema=False,
                **index_columns,
            )

        error_text = str(raised.exception)
        self.assertIn("Please prepare data to contain unique values",
                      error_text)
Exemple #8
0
    def test__create_h5ad_data_file__copies_index_of_obs_and_var_to_column(
            self):
        """Without explicit index names, obs and var each gain a "name_0" column."""
        data_file = H5ADDataFile(self.sample_h5ad_filename,
                                 use_corpora_schema=False)

        # The automatic name chosen for the index should be "name_0"
        auto_index_column = "name_0"

        for frame_name in ("obs", "var"):
            original_columns = getattr(self.sample_anndata, frame_name).columns
            self.assertNotIn(auto_index_column, original_columns)

            loaded_columns = getattr(data_file, frame_name).columns
            self.assertIn(auto_index_column, loaded_columns)
Exemple #9
0
    def test__to_cxg__with_sparse_column_encoding(self):
        """
        Convert an all-ones (3, 4) matrix with sparse threshold 50 and expect
        the generated TileDB files to include the column encoding.

        The temporary H5AD written for this test is removed in a ``finally``
        block so that a failing conversion or validation does not leave the
        file behind.
        """
        anndata = self._create_sample_anndata_dataset()
        anndata.X = np.ones((3, 4))
        sparse_with_column_shift_filename = self._write_anndata_to_file(
            anndata)

        try:
            h5ad_file = H5ADDataFile(sparse_with_column_shift_filename)
            h5ad_file.to_cxg(self.sample_output_directory, 50)

            self._validate_expected_generated_list_of_tiledb_files(
                has_column_encoding=True)
        finally:
            # Clean up, whether or not the steps above succeeded
            remove(sparse_with_column_shift_filename)
Exemple #10
0
    def test__create_h5ad_data_file__reads_anndata_successfully(self):
        """Compare the AnnData held by the data file with the in-memory sample."""
        data_file = H5ADDataFile(self.sample_h5ad_filename,
                                 use_corpora_schema=False)
        loaded = data_file.anndata
        expected = self.sample_anndata

        matrices_match = (loaded.X == expected.X).all()
        self.assertTrue(matrices_match)

        # NOTE(review): sort_index(inplace=True) returns None, so each
        # assertEqual below compares None with None and cannot fail on a data
        # mismatch; the only effect is that both frames get sorted in place.
        # The intended obs/var comparison should be confirmed and made explicit.
        for frame_name in ("obs", "var"):
            loaded_result = getattr(loaded, frame_name).sort_index(
                inplace=True)
            expected_result = getattr(expected, frame_name).sort_index(
                inplace=True)
            self.assertEqual(loaded_result, expected_result)

        # Every embedding present in the loaded data must exist in the sample
        # with the same values...
        for key in loaded.obsm.keys():
            self.assertIn(key, expected.obsm.keys())
            embeddings_match = (loaded.obsm[key] == expected.obsm[key]).all()
            self.assertTrue(embeddings_match)

        # ...and every embedding in the sample must exist in the loaded data.
        for key in expected.obsm.keys():
            self.assertIn(key, loaded.obsm.keys())
            embeddings_match = (loaded.obsm[key] == expected.obsm[key]).all()
            self.assertTrue(embeddings_match)
Exemple #11
0
    def sparse_diffexp(self, apply_col_shift):
        """
        Check that differential expression agrees between an AnnData-backed
        adaptor and sparse / dense CXG conversions of the same generated file.

        A test H5AD is generated in a temporary directory and converted twice:
        once with sparse threshold 11 (asserted to give a sparse X array) and
        once with sparse threshold 0 (asserted to give a dense X array with no
        column shift). The t-test results and the values of the top columns
        are then compared across all three adaptors.

        :param apply_col_shift: forwarded to create_test_h5ad; the sparse CXG
            is expected to contain an "X_col_shift" array exactly when this
            is true.
        """
        with tempfile.TemporaryDirectory() as dirname:
            # create a sparse matrix
            # NOTE(review): the numeric arguments are presumably rows, columns
            # and percent non-zero -- confirm against create_test_h5ad.
            h5adfile_path = os.path.join(dirname, "sparse.h5ad")
            create_test_h5ad(h5adfile_path, 2000, 2000, 10, apply_col_shift)

            h5ad_file_to_convert = H5ADDataFile(h5adfile_path,
                                                use_corpora_schema=False)

            sparsename = os.path.join(dirname, "sparse.cxg")
            h5ad_file_to_convert.to_cxg(sparsename, 11, True)

            adaptor_anndata = self.load_dataset(
                h5adfile_path, extra_dataset_config=dict(embeddings__names=[]))

            adaptor_sparse = self.load_dataset(sparsename)
            assert adaptor_sparse.open_array("X").schema.sparse
            assert adaptor_sparse.has_array("X_col_shift") == apply_col_shift

            densename = os.path.join(dirname, "dense.cxg")
            # Argument order is (output, sparse_threshold, convert colors),
            # matching the sparse conversion above; a threshold of 0 requests
            # the dense encoding.
            h5ad_file_to_convert.to_cxg(densename, 0, True)
            adaptor_dense = self.load_dataset(densename)
            assert not adaptor_dense.open_array("X").schema.sparse
            assert not adaptor_dense.has_array("X_col_shift")

            maskA = self.get_mask(adaptor_anndata, 1, 10)
            maskB = self.get_mask(adaptor_anndata, 2, 10)

            diffexp_results_anndata = diffexp_generic.diffexp_ttest(
                adaptor_anndata, maskA, maskB, 10)
            diffexp_results_sparse = diffexp_cxg.diffexp_ttest(
                adaptor_sparse, maskA, maskB, 10)
            diffexp_results_dense = diffexp_cxg.diffexp_ttest(
                adaptor_dense, maskA, maskB, 10)

            self.compare_diffexp_results(diffexp_results_anndata,
                                         diffexp_results_sparse)
            self.compare_diffexp_results(diffexp_results_anndata,
                                         diffexp_results_dense)

            # The first element of each result entry is used as a column index
            topcols = np.array([x[0] for x in diffexp_results_anndata])
            cols_anndata = self.get_X_col(adaptor_anndata, topcols)
            cols_sparse = self.get_X_col(adaptor_sparse, topcols)
            cols_dense = self.get_X_col(adaptor_dense, topcols)
            assert cols_anndata.shape[0] == adaptor_sparse.get_shape()[0]
            assert cols_anndata.shape[1] == len(diffexp_results_anndata)

            def convert(mat, cols):
                # Round-trip through the FBS encoding and back to a numpy
                # array so all three matrices are compared in the same form.
                return decode_matrix_fbs(encode_matrix_fbs(
                    mat, col_idx=cols)).to_numpy()

            cols_anndata = convert(cols_anndata, topcols)
            cols_sparse = convert(cols_sparse, topcols)
            cols_dense = convert(cols_dense, topcols)

            x = adaptor_sparse.get_X_array()
            assert x.shape == adaptor_sparse.get_shape()

            for row in range(cols_anndata.shape[0]):
                for col in range(cols_anndata.shape[1]):
                    vanndata = cols_anndata[row][col]
                    vsparse = cols_sparse[row][col]
                    vdense = cols_dense[row][col]
                    self.assertTrue(
                        np.isclose(vanndata, vsparse, rtol=1e-6, atol=1e-6))
                    self.assertTrue(
                        np.isclose(vanndata, vdense, rtol=1e-6, atol=1e-6))
Exemple #12
0
    def test__to_cxg__simple_anndata_with_corpora_and_dense(self):
        """Convert the sample file with default options and a sparse threshold of 0."""
        sparse_threshold = 0

        data_file = H5ADDataFile(self.sample_h5ad_filename)
        data_file.to_cxg(self.sample_output_directory, sparse_threshold)

        self._validate_expected_generated_list_of_tiledb_files()
Exemple #13
0
    def test__to_cxg__simple_anndata_no_corpora_and_sparse(self):
        """Convert the sample file without the corpora schema and a sparse threshold of 100."""
        sparse_threshold = 100

        data_file = H5ADDataFile(self.sample_h5ad_filename,
                                 use_corpora_schema=False)
        data_file.to_cxg(self.sample_output_directory, sparse_threshold)

        self._validate_expected_generated_list_of_tiledb_files()
Exemple #14
0
    def test__create_h5ad_data_file__extract_about_and_title_from_dataset(
            self):
        """With no overrides given, title and about match the sample file's expected values."""
        data_file = H5ADDataFile(self.sample_h5ad_filename)

        expected_metadata = (
            ("dataset_title", "random_link_name"),
            ("dataset_about", "www.link.com"),
        )
        for attribute, expected in expected_metadata:
            self.assertEqual(getattr(data_file, attribute), expected)