Esempio n. 1
0
    def generate_request_hash(self) -> str:
        """
        Build the hash that identifies this request from its input parameters.

        An MD5 digest is fed, in order, with the feature, the format, each metadata field and
        then every cell key, reading the cell query results one slice at a time.
        The cell query results must exist; per the reader's contract
        MatrixQueryResultsNotFound is raised otherwise.
        :return: str Request hash
        """
        bucket = os.environ['MATRIX_QUERY_RESULTS_BUCKET']
        cell_manifest_key = f"s3://{bucket}/{self.request_id}/cell_metadata_manifest"
        reader = CellQueryResultsReader(cell_manifest_key)

        logger.info(f"Generating request hash from {cell_manifest_key}")

        digest = hashlib.md5()

        # Request parameters first: feature, format, then the metadata fields.
        for parameter in (self.feature, self.format):
            digest.update(parameter.encode())
        for metadata_field in self.metadata_fields:
            digest.update(metadata_field.encode())

        # One slice per entry in the manifest's part_urls list.
        slice_count = len(reader.manifest['part_urls'])
        for i in range(slice_count):
            logger.info(f"[Slice {i}] start.")
            cell_df = reader.load_slice(i)
            for cell_key in cell_df.index:
                digest.update(cell_key.encode())
            logger.info(f"[Slice {i}] Hashed all {len(cell_df.index)} keys.")
            # Drop the slice before the next one is loaded.
            del cell_df

        request_hash = digest.hexdigest()
        logger.info(f"Successfully generated request hash {request_hash}.")
        return request_hash
Esempio n. 2
0
    def test_load_empty_results(self, mock_parse_manifest):
        """When the mocked manifest reports zero records, load_results has shape (0, 0)."""
        mock_parse_manifest.return_value = {"record_count": 0}

        reader = CellQueryResultsReader("test_manifest_key")
        loaded = reader.load_results()

        self.assertEqual(loaded.shape, (0, 0))
Esempio n. 3
0
    def test_load_slice(self, mock_open, mock_pd_read_csv):
        """
        Load slice 3 using the manifest fixture and inspect the mocked read_csv call.

        The last call must pass a ``names`` keyword containing the project short-name
        column and a first positional argument that starts with ``s3://``.
        """
        fixture_path = "tests/functional/res/cell_metadata_manifest"
        with open(fixture_path) as manifest_file:
            mock_open.return_value = manifest_file
            CellQueryResultsReader("test_manifest_key").load_slice(3)

        # The final two entries of call_args are the positional and keyword arguments.
        read_csv_args, read_csv_kwargs = mock_pd_read_csv.call_args[-2:]

        self.assertIn("project.project_core.project_short_name", read_csv_kwargs["names"])
        self.assertTrue(read_csv_args[0].startswith("s3://"))
Esempio n. 4
0
    def test_load_results(self, mock_parse_manifest, mock_load_slice):
        """With three part URLs in the manifest, load_results calls load_slice with 0, 1 and 2."""
        manifest = {
            "columns": ["a", "b", "c"],
            "part_urls": ["A", "B", "C"],
            "record_count": 5
        }
        mock_parse_manifest.return_value = manifest
        mock_load_slice.return_value = pandas.DataFrame()

        CellQueryResultsReader("test_manifest_key").load_results()

        mock_load_slice.assert_has_calls([mock.call(index) for index in range(3)])
Esempio n. 5
0
    def test_empty_results(self, mock_parse_manifest,
                           mock_upload_converted_matrix,
                           mock_complete_subtask_execution,
                           mock_complete_request, mock_creation_date,
                           mock_remove):
        """
        Run the converter against a manifest with zero records.

        The local output file must be zero bytes long, the converter subtask must be
        completed exactly once, and the request must be completed exactly once.
        """
        loom_filename = "unit_test_empty_loom.loom"
        manifest_key = "test_manifest_key"

        now_string = date.to_string(datetime.datetime.utcnow())
        mock_creation_date.return_value = now_string

        # The readers are built before the manifest mock's return value is set below.
        readers = {
            QueryType.CELL: CellQueryResultsReader(manifest_key),
            QueryType.EXPRESSION: ExpressionQueryResultsReader(manifest_key),
            QueryType.FEATURE: FeatureQueryResultsReader(manifest_key),
        }
        self.matrix_converter.query_results = readers

        mock_parse_manifest.return_value = {"record_count": 0}

        self.matrix_converter.local_output_filename = loom_filename
        self.matrix_converter.run()

        output_size = os.path.getsize(loom_filename)
        self.assertEqual(output_size, 0)

        mock_complete_subtask_execution.assert_called_once_with(Subtask.CONVERTER)
        mock_complete_request.assert_called_once()

        os.remove(loom_filename)
Esempio n. 6
0
    def run(self):
        """
        Convert the query results for this request and upload the converted matrix.

        Steps, in order: build the cell, expression and feature results readers from the
        manifest keys in ``self.args``; call the ``_to_<format>`` method named by
        ``self.format``; upload the file it returns to ``self.target_path``; delete the local
        copy; then mark the converter subtask and the request as complete on
        ``self.request_tracker``.

        :raises Exception: any failure is logged, recorded through
            ``request_tracker.log_error`` and then re-raised unchanged.
        """
        try:
            LOGGER.debug(f"Beginning matrix conversion run for {self.args.request_id}")
            self.query_results = {
                QueryType.CELL: CellQueryResultsReader(self.args.cell_metadata_manifest_key),
                QueryType.EXPRESSION: ExpressionQueryResultsReader(self.args.expression_manifest_key),
                QueryType.FEATURE: FeatureQueryResultsReader(self.args.gene_metadata_manifest_key)
            }

            LOGGER.debug(f"Beginning conversion to {self.format}")
            # Dispatch by format name, e.g. format "loom" resolves to self._to_loom().
            local_converted_path = getattr(self, f"_to_{self.format}")()
            LOGGER.debug(f"Conversion to {self.format} completed")

            LOGGER.debug("Beginning upload to S3")
            self._upload_converted_matrix(local_converted_path, self.target_path)
            LOGGER.debug("Upload to S3 complete, job finished")

            os.remove(local_converted_path)

            self.request_tracker.complete_subtask_execution(Subtask.CONVERTER)
            # Request duration is measured from the tracker's creation date to now.
            elapsed = date.get_datetime_now() - date.to_datetime(self.request_tracker.creation_date)
            self.request_tracker.complete_request(duration=elapsed.total_seconds())
        except Exception as e:
            LOGGER.info(f"Matrix Conversion failed on {self.args.request_id} with error {str(e)}")
            self.request_tracker.log_error(str(e))
            # Bare raise re-raises the active exception without adding a redundant frame.
            raise
Esempio n. 7
0
    def test__n_slices(self, mock_open):
        """With the cell manifest fixture loaded, the converter's _n_slices() returns 8."""
        fixture_path = "tests/functional/res/cell_metadata_manifest"
        with open(fixture_path) as manifest_file:
            mock_open.return_value = manifest_file
            cell_reader = CellQueryResultsReader("test_manifest_key")
            self.matrix_converter.query_results = {QueryType.CELL: cell_reader}

        # Evaluated after the fixture file is closed, as in the original test.
        slice_count = self.matrix_converter._n_slices()
        self.assertEqual(slice_count, 8)
Esempio n. 8
0
    def test__to_csv(self, mock_parse_manifest, mock_load_cell_results,
                     mock_write_gene_dataframe, mock_make_directory,
                     mock_generate_dfs):
        """
        Convert the generated test data with ``_to_csv`` and verify expression.csv.

        The zip returned by ``_to_csv`` is extracted and ``expression.csv`` is read back.
        Its grand total and the per-cell totals of the first, second and last cells must
        match the expression values that were fed in.

        The results directory and the zip file are registered for cleanup as soon as they
        exist, so a failing assertion no longer leaves artefacts that break the next run.
        """
        results_dir = "unit_test__to_csv"
        # exist_ok tolerates a directory left behind by an interrupted earlier run.
        os.makedirs(results_dir, exist_ok=True)
        self.addCleanup(shutil.rmtree, results_dir, ignore_errors=True)
        mock_make_directory.return_value = results_dir

        test_data = self._create_test_data()
        expr_dfs = test_data["expr_dfs"]
        mock_write_gene_dataframe.return_value = test_data["genes_df"]

        mock_load_cell_results.return_value = test_data["cells_df"]

        expression_manifest = {
            "record_count": sum(d.shape[0] for d in expr_dfs)
        }
        mock_parse_manifest.return_value = expression_manifest

        mock_generate_dfs.return_value = iter(expr_dfs)

        self.matrix_converter.query_results = {
            QueryType.CELL: CellQueryResultsReader("test_manifest_key"),
            QueryType.EXPRESSION:
            ExpressionQueryResultsReader("test_manifest_key")
        }

        # The gene writer is mocked, so write genes.csv into the results directory here.
        test_data["genes_df"].to_csv(os.path.join(results_dir, "genes.csv"),
                                     index_label="featurekey")
        self.matrix_converter.local_output_filename = "unit_test__to_csv.zip"
        zip_path = self.matrix_converter._to_csv()
        self.addCleanup(os.remove, zip_path)

        with zipfile.ZipFile(zip_path) as z:
            z.extractall()

        df = pandas.read_csv(os.path.join(results_dir, "expression.csv"),
                             header=0,
                             index_col="cellkey")

        self.assertAlmostEqual(
            df.sum().sum(),
            sum(d["exprvalue"].sum() for d in expr_dfs), 2)

        # Every cell has 20 genes with non-zero expression. Check first and
        # last cells to make sure that the expression matches.
        # .iloc selects by position, so the check does not depend on how the
        # cellkey index labels are typed.
        cell_totals = df.sum(axis=1)
        self.assertAlmostEqual(
            cell_totals.iloc[0],
            expr_dfs[0]['exprvalue'][:20].sum(), 2)
        self.assertAlmostEqual(
            cell_totals.iloc[1],
            expr_dfs[0]['exprvalue'][20:40].sum(), 2)
        self.assertAlmostEqual(
            cell_totals.iloc[-1],
            expr_dfs[-1]['exprvalue'][-20:].sum(), 2)
Esempio n. 9
0
    def test__to_loom(self, mock_parse_manifest, mock_load_gene_results,
                      mock_load_cell_results, mock_generate_dfs):
        """
        Convert the generated test data with ``_to_loom`` and verify the loom file.

        The grand total of the loom matrix and the column totals of the first, second and
        last cells must match the expression values that were fed in.

        The loom connection is opened in a ``with`` block so the file handle is closed
        before the working directory is deleted. The directory removal is registered as a
        cleanup so it also runs when an assertion fails.
        """
        working_dir = "unit_test__to_loom"
        self.matrix_converter.working_dir = working_dir
        self.addCleanup(shutil.rmtree, working_dir, ignore_errors=True)

        test_data = self._create_test_data()
        expr_dfs = test_data["expr_dfs"]

        self.matrix_converter.query_results = {
            QueryType.CELL: CellQueryResultsReader("test_manifest_key"),
            QueryType.EXPRESSION:
            ExpressionQueryResultsReader("test_manifest_key"),
            QueryType.FEATURE: FeatureQueryResultsReader("test_manifest_key")
        }
        # The cell reader gets its own manifest carrying the number of test cells.
        self.matrix_converter.query_results[QueryType.CELL].manifest = {
            "record_count": test_data["cells_df"].shape[0]
        }

        mock_load_gene_results.return_value = test_data["genes_df"]
        mock_load_cell_results.return_value = test_data["cells_df"]

        expression_manifest = {
            "record_count": sum(d.shape[0] for d in expr_dfs)
        }
        mock_parse_manifest.return_value = expression_manifest

        mock_generate_dfs.return_value = iter(expr_dfs)

        self.matrix_converter.local_output_filename = "unit_test__to_loom.loom"
        loom_path = self.matrix_converter._to_loom()

        with loompy.connect(loom_path) as ds:
            self.assertAlmostEqual(
                ds[:, :].sum(),
                sum(d["exprvalue"].sum() for d in expr_dfs), -1)

            # Every cell has 20 genes with non-zero expression. Check first and
            # last cells to make sure that the expression matches.
            self.assertAlmostEqual(
                ds[:, 0].sum(), expr_dfs[0]['exprvalue'][:20].sum(),
                1)
            self.assertAlmostEqual(
                ds[:, 1].sum(), expr_dfs[0]['exprvalue'][20:40].sum(),
                1)
            self.assertAlmostEqual(
                ds[:, -1].sum(),
                expr_dfs[-1]['exprvalue'][-20:].sum(), 1)
    def generate_request_hash(self) -> str:
        """
        Build the hash that identifies this request from its input parameters.

        All cell query results are loaded at once. An MD5 digest is then fed, in order,
        with the feature, the format, each metadata field and every cell key.
        The cell query results must exist; per the reader's contract
        MatrixQueryResultsNotFound is raised otherwise.
        :return: str Request hash
        """
        bucket = os.environ['MATRIX_QUERY_RESULTS_BUCKET']
        manifest_location = f"s3://{bucket}/{self.request_id}/cell_metadata_manifest"
        cell_results = CellQueryResultsReader(manifest_location).load_results()
        cell_index = cell_results.index

        digest = hashlib.md5()

        # Request parameters first: feature, format, then the metadata fields.
        for parameter in (self.feature, self.format):
            digest.update(parameter.encode())
        for metadata_field in self.metadata_fields:
            digest.update(metadata_field.encode())

        # Then every cell key, in index order.
        for cell_key in cell_index:
            digest.update(cell_key.encode())

        return digest.hexdigest()
Esempio n. 11
0
    def test__generate_expression_dfs(self, mock_load_slice,
                                      mock_parse_manifest):
        """
        Feed two uneven expression chunks through ``_generate_expression_dfs(50)``.

        Each yielded frame may hold at most 50 distinct cell keys. Across all frames the
        distinct-cell counts must add up to 2027 and the expression values must add up to
        the total of the generated data.
        """
        n_cells = 2027
        genes_per_cell = 647
        expr_columns = ["cellkey", "featurekey", "exprvalue"]

        mock_parse_manifest.return_value = {
            "part_urls": ["url1"],
            "columns": list(expr_columns),
            "record_count": 2624879
        }

        self.matrix_converter.query_results = {
            QueryType.CELL:
            CellQueryResultsReader("test_cell_manifest_key"),
            QueryType.EXPRESSION:
            ExpressionQueryResultsReader("test_expression_manifest_key")
        }

        # Fake data: 2027 cells, each with 647 expressed genes, so that chunk
        # and batch boundaries do not line up with cell boundaries.
        cell_names = (f"cell_{n}" for n in range(n_cells))
        cell_keys = (name for name in cell_names for _ in range(genes_per_cell))
        gene_keys = itertools.cycle(f"gene_{n}" for n in range(genes_per_cell))

        rows = [[cell, gene, random.randrange(1, 10000)]
                for cell, gene in zip(cell_keys, gene_keys)]
        full_expr_df = pandas.DataFrame(columns=expr_columns, data=rows)

        # load_slice splits on 1000000 rows
        split_at = 999615
        first_chunk = full_expr_df.iloc[:split_at]
        second_chunk = full_expr_df.iloc[split_at:]

        # The mocked load_slice hands back the two chunks in order.
        mock_load_slice.return_value = iter([first_chunk, second_chunk])

        # Accumulate the distinct cells seen per frame and the expression total.
        cells_seen = 0
        expr_total = 0
        for batch_df in self.matrix_converter._generate_expression_dfs(50):
            distinct_cells = len(set(batch_df["cellkey"]))
            self.assertLessEqual(distinct_cells, 50)
            cells_seen += distinct_cells
            expr_total += batch_df["exprvalue"].sum()

        # Every cell and every expression value must have been seen.
        self.assertEqual(cells_seen, 2027)
        self.assertEqual(expr_total, full_expr_df["exprvalue"].sum())
Esempio n. 12
0
    def test__to_mtx(self, mock_parse_manifest, mock_load_cell_results,
                     mock_write_gene_dataframe_10x, mock_write_gene_dataframe,
                     mock_make_directory, mock_generate_dfs):
        """
        Convert the generated test data with ``_to_mtx`` and verify matrix.mtx.gz.

        The zip returned by ``_to_mtx`` is extracted and ``matrix.mtx.gz`` is read back.
        Its grand total and the column totals of the first, second and last cells must
        match the expression values that were fed in.

        The results directory and the zip file are registered for cleanup as soon as they
        exist, so a failing assertion no longer leaves artefacts that break the next run.
        """
        results_dir = "unit_test__to_mtx"
        # exist_ok tolerates a directory left behind by an interrupted earlier run.
        os.makedirs(results_dir, exist_ok=True)
        self.addCleanup(shutil.rmtree, results_dir, ignore_errors=True)
        mock_make_directory.return_value = results_dir

        test_data = self._create_test_data()
        expr_dfs = test_data["expr_dfs"]
        mock_write_gene_dataframe.return_value = test_data["genes_df"]
        mock_write_gene_dataframe_10x.return_value = test_data["genes_df"]

        mock_load_cell_results.return_value = test_data["cells_df"]

        expression_manifest = {
            "record_count": sum(d.shape[0] for d in expr_dfs)
        }
        mock_parse_manifest.return_value = expression_manifest

        mock_generate_dfs.return_value = iter(expr_dfs)

        self.matrix_converter.query_results = {
            QueryType.CELL: CellQueryResultsReader("test_manifest_key"),
            QueryType.EXPRESSION:
            ExpressionQueryResultsReader("test_manifest_key")
        }

        # Both gene writers are mocked, so write the two gene files here.
        test_data["genes_df"].to_csv(os.path.join(results_dir,
                                                  "features.tsv.gz"),
                                     index_label="featurekey",
                                     sep="\t",
                                     compression="gzip")
        test_data["genes_df"].to_csv(os.path.join(results_dir, "genes.tsv.gz"),
                                     index_label="featurekey",
                                     sep="\t",
                                     compression="gzip")
        self.matrix_converter.local_output_filename = "unit_test__to_mtx.zip"
        zip_path = self.matrix_converter._to_mtx()
        self.addCleanup(os.remove, zip_path)

        with zipfile.ZipFile(zip_path) as z:
            z.extractall()

        matrix = scipy.io.mmread(os.path.join(results_dir,
                                              "matrix.mtx.gz")).todense()
        self.assertAlmostEqual(
            matrix.sum(),
            sum(d["exprvalue"].sum() for d in expr_dfs), 2)

        # Every cell has 20 genes with non-zero expression. Check first and
        # last cells to make sure that the expression matches.
        self.assertAlmostEqual(
            matrix[:, 0].sum(),
            expr_dfs[0]['exprvalue'][:20].sum(), 2)
        self.assertAlmostEqual(
            matrix[:, 1].sum(),
            expr_dfs[0]['exprvalue'][20:40].sum(), 2)
        self.assertAlmostEqual(
            matrix[:, -1].sum(),
            expr_dfs[-1]['exprvalue'][-20:].sum(), 2)