Exemple #1
0
 def test_load_censoring(self):
     """``_load_censoring`` returns the expected column for ``bucket_size=365``."""
     driver = ConvSccsFeatureDriver(**dict(self.kwargs, bucket_size=365))
     censoring = driver._load_censoring()
     np.testing.assert_array_equal(censoring,
                                   np.array([[3], [3], [1], [2]]))
Exemple #2
0
    def test_properties_exposures(self):
        """Exercise the ``exposures`` getter and its checked setter.

        The getter must return a cohort whose subjects and events equal those
        of ``self.exposures``. With ``run_checks`` enabled and
        ``_find_events_not_in_followup_bounds`` patched to return a cohort
        whose ``take`` is always truthy, assigning ``exposures`` must raise a
        ``ValueError`` whose text is the patched return value of
        ``_log_invalid_events_cohort``.
        """
        driver = ConvSccsFeatureDriver(**self.kwargs)
        cohort = driver.exposures
        self.assertTrue(
            data_frame_equality(cohort.subjects, self.exposures.subjects))
        self.assertTrue(
            data_frame_equality(cohort.events, self.exposures.events))

        # Stand-in cohort whose dataframes answer any take() with a truthy
        # value.
        mock_dataframe = MagicMock()
        mock_dataframe.take = lambda x: True
        mock_cohort = MagicMock()
        mock_cohort.subjects = mock_dataframe
        mock_cohort.events = mock_dataframe

        find_patch = patch.object(
            ConvSccsFeatureDriver,
            "_find_events_not_in_followup_bounds",
            return_value=mock_cohort,
        )
        log_patch = patch.object(
            ConvSccsFeatureDriver,
            "_log_invalid_events_cohort",
            return_value="Ooops, error here!",
        )
        with find_patch as mock_find_events, log_patch as mock_log_invalid:
            checked_driver = ConvSccsFeatureDriver(**self.kwargs)
            checked_driver.run_checks = True
            with self.assertRaises(ValueError) as context:
                checked_driver.exposures = mock_cohort
            mock_find_events.assert_called_once_with(mock_cohort)
            mock_log_invalid.assert_called_once_with(
                mock_cohort, log_invalid_events=True)
            # assertEqual reports both strings on failure, unlike
            # assertTrue(a == b).
            self.assertEqual("Ooops, error here!", str(context.exception))
Exemple #3
0
 def test_properties_outcomes_split_column(self):
     """``outcomes_split_column`` reads "value" and rejects the name "foo"."""
     driver = ConvSccsFeatureDriver(**self.kwargs)
     self.assertEqual(driver.outcomes_split_column, "value")

     with self.assertRaises(ValueError) as context:
         driver.outcomes_split_column = "foo"
     # assertIn shows the actual message when the expected text is missing.
     self.assertIn(
         "outcomes_split_column should be either 'category', 'groupID', or 'value'",
         str(context.exception))
Exemple #4
0
    def test_properties_bucket_rounding(self):
        """``bucket_rounding`` echoes the constructor value and rejects "foo"."""
        driver = ConvSccsFeatureDriver(**self.kwargs)
        self.assertEqual(driver.bucket_rounding,
                         self.kwargs["bucket_rounding"])

        with self.assertRaises(ValueError) as context:
            driver.bucket_rounding = "foo"
        # assertIn shows the actual message when the expected text is missing.
        self.assertIn(
            "bucket_rounding should be equal to either 'ceil' or 'floor'",
            str(context.exception))
Exemple #5
0
 def test_create_csr_matrix(self):
     """``_create_csr_matrix`` builds a CSR matrix from row/column index lists.

     The input row carries parallel ``rowIndexes`` / ``colIndexes`` lists;
     the result must be a ``csr_matrix`` of the requested shape with a 1 at
     each listed (row, column) position.
     """
     df_row = pd.Series({
         "rowIndexes": [0, 1, 2, 2],
         "colIndexes": [0, 1, 0, 1]
     })
     loader = ConvSccsFeatureDriver(**self.kwargs)
     m = loader._create_csr_matrix(df_row, csr_matrix_shape=(5, 2))
     expected = np.array([[1, 0], [0, 1], [1, 1], [0, 0], [0, 0]])

     # Dedicated assertions give readable diffs on failure; a bare
     # assertTrue(...) only reports "False is not true".
     self.assertIsInstance(m, csr_matrix)
     self.assertEqual(m.shape, (5, 2))
     np.testing.assert_array_equal(m.toarray(), expected)
Exemple #6
0
    def test_properties_mapping(self):
        """``mappings`` exposes the private mappings and cannot be assigned.

        Index 0 must return ``_feature_mapping`` and index 1
        ``_outcome_mapping``; assigning to ``mappings`` must raise
        ``PermissionError``.
        """
        driver = ConvSccsFeatureDriver(**self.kwargs)
        driver._feature_mapping = ["feature 1", "feature 2"]
        driver._outcome_mapping = ["outcome 1"]
        self.assertListEqual(driver.mappings[0], ["feature 1", "feature 2"])
        self.assertListEqual(driver.mappings[1], ["outcome 1"])

        with self.assertRaises(PermissionError) as context:
            driver.mappings = "some value"
        # The two adjacent literals are kept exactly as in the original
        # assertion (they concatenate without a space after the comma).
        self.assertIn(
            "mappings should not be set manually,"
            "they are computed from initial cohorts.",
            str(context.exception))
Exemple #7
0
 def test_load_labels(self):
     """``_load_labels`` returns the four expected arrays for ``bucket_size=365``."""
     driver = ConvSccsFeatureDriver(**dict(self.kwargs, bucket_size=365))
     labels = driver._load_labels()

     outcome_in_first_row = [[1], [0], [0], [0]]
     outcome_in_second_row = [[0], [1], [0], [0]]
     expected = [
         np.array(column, dtype=np.int64)
         for column in (
             outcome_in_first_row,
             outcome_in_first_row,
             outcome_in_second_row,
             outcome_in_second_row,
         )
     ]
     # Index into the result rather than zipping, so a result that is too
     # short still raises.
     for position, wanted in enumerate(expected):
         np.testing.assert_array_equal(labels[position], wanted)
Exemple #8
0
    def test_properties_censoring(self):
        """``censoring`` delegates to ``_load_censoring`` and cannot be assigned."""
        with patch.object(ConvSccsFeatureDriver,
                          "_load_censoring",
                          return_value="some censoring") as mocked_method:
            driver = ConvSccsFeatureDriver(**self.kwargs)
            self.assertEqual(driver.censoring, "some censoring")
            mocked_method.assert_called_once_with()

            with self.assertRaises(PermissionError) as context:
                driver.censoring = "some value"
            # The two adjacent literals are kept exactly as in the original
            # assertion (they concatenate without a space after the comma).
            self.assertIn(
                "censoring should not be set manually,"
                "it is computed from initial cohorts.",
                str(context.exception))
Exemple #9
0
    def test_properties_features(self):
        """``features`` delegates to ``_load_features`` and cannot be assigned."""
        with patch.object(ConvSccsFeatureDriver,
                          "_load_features",
                          return_value="some features") as mocked_method:
            driver = ConvSccsFeatureDriver(**self.kwargs)
            loaded = driver.features
            mocked_method.assert_called_once_with()
            self.assertEqual(loaded, "some features")

            with self.assertRaises(PermissionError) as context:
                driver.features = "some value"
            # The two adjacent literals are kept exactly as in the original
            # assertion (they concatenate without a space after the comma).
            self.assertIn(
                "features should not be set manually,"
                "they are computed from initial cohorts.",
                str(context.exception))
Exemple #10
0
    def test_get_bucketized_events(self):
        """``_get_bucketized_events`` on the exposures with ``bucket_size=365``.

        Checks the returned column count, the mapping, and the values of the
        returned frame once converted through ``toPandas``.
        """
        driver = ConvSccsFeatureDriver(**dict(self.kwargs, bucket_size=365))
        bucketized, n_cols, mapping = driver._get_bucketized_events(
            driver.exposures, "value")

        self.assertEqual(n_cols, 1)
        self.assertListEqual(mapping, ["foo"])

        wanted = np.array([[0, 0, 0], [3, 1, 0], [4, 1, 0],
                           [2, 0, 0]]).astype("int")
        actual = bucketized.toPandas().values.astype("int")
        np.testing.assert_array_equal(actual, wanted)
Exemple #11
0
 def test_discretize_start_end(self):
     """``_discretize_start_end`` hands the start and end columns on.

     ``_discretize_time`` is patched; every recorded call must have received
     a ``Column`` whose name is, in call order, "start" then "end".
     """
     with patch.object(
             ConvSccsFeatureDriver,
             "_discretize_time",
             return_value=sf.lit("mocked_value"),
     ) as mocked_method:
         loader_ = ConvSccsFeatureDriver(**self.kwargs)
         loader_._discretize_start_end(self.exposures.events)
         colnames = ["start", "end"]
         # Check that '_discretize_time' has been called on the right cols.
         # NOTE(review): if the mock is never called this loop body does not
         # run and the test passes vacuously; consider also asserting on
         # mocked_method.call_count once the expected count is confirmed.
         for i, call in enumerate(mocked_method.call_args_list):
             column = call[0][0]
             # Check the type first so that a wrong argument fails with a
             # clear message instead of an AttributeError on ``_jc``.
             self.assertIsInstance(column, Column)
             self.assertEqual(colnames[i], column._jc.toString())
Exemple #12
0
 def test_load_features(self):
     """``_load_features`` returns the four expected matrices for ``bucket_size=365``.

     Each returned matrix is densified with ``toarray`` before comparison.
     """
     driver = ConvSccsFeatureDriver(**dict(self.kwargs, bucket_size=365))
     dense = [matrix.toarray() for matrix in driver._load_features()]

     expected_rows = (
         [
             [1, 0, 0, 0, 0, 1, 0],
             [0, 0, 0, 0, 0, 1, 0],
             [0, 0, 0, 0, 0, 1, 0],
             [0, 0, 0, 0, 0, 0, 1],
         ],
         [
             [1, 0, 0, 1, 0, 0, 0],
             [0, 0, 0, 0, 1, 0, 0],
             [0, 0, 0, 0, 1, 0, 0],
             [0, 0, 0, 0, 1, 0, 0],
         ],
         [
             [0, 0, 0, 0, 0, 0, 0],
             [1, 0, 0, 0, 0, 1, 0],
             [0, 0, 0, 0, 0, 0, 0],
             [0, 0, 0, 0, 0, 0, 0],
         ],
         [
             [0, 0, 0, 0, 0, 0, 0],
             [1, 0, 0, 0, 1, 0, 0],
             [0, 0, 0, 0, 0, 1, 0],
             [0, 0, 0, 0, 0, 0, 0],
         ],
     )
     expected = [np.array(rows, dtype=np.int64) for rows in expected_rows]
     # Index into the result rather than zipping, so a result that is too
     # short still raises.
     for position, wanted in enumerate(expected):
         np.testing.assert_array_equal(dense[position], wanted)
Exemple #13
0
    def test_get_csr_matrices(self):
        """Check ``_get_csr_matrices`` input validation and its output.

        A frame without ``rowIndex``, then one without ``colIndex``, must each
        raise ``ValueError``. For the valid frame, the first expected array
        holds Alice's (rowIndex, colIndex) entries and the second holds Bob's.
        """
        loader = ConvSccsFeatureDriver(**self.kwargs)
        no_row_index_df, _ = self.create_spark_df({"blah": [1, 2, 3]})
        no_col_index_df, _ = self.create_spark_df({
            "rowIndex": [1, 2, 3],
            "blah": [1, 2, 3]
        })
        csr_shape = (2, 5)

        with self.assertRaises(ValueError) as context:
            loader._get_csr_matrices(no_row_index_df, csr_shape)
        # assertIn shows the actual message when the expected text is missing.
        self.assertIn("rowIndex should be in events columns.",
                      str(context.exception))

        with self.assertRaises(ValueError) as context:
            loader._get_csr_matrices(no_col_index_df, csr_shape)
        self.assertIn("colIndex should be in events columns.",
                      str(context.exception))

        valid_df, _ = self.create_spark_df({
            "patientID": ["Alice", "Bob", "Alice", "Alice", "Bob"],
            "rowIndex": [0, 0, 0, 1, 1],
            "colIndex": [0, 1, 2, 3, 4],
        })
        result = loader._get_csr_matrices(valid_df, csr_shape)
        result = [res.toarray() for res in result]
        expected = [
            np.array([[1, 0, 1, 0, 0], [0, 0, 0, 1, 0]], dtype=np.int64),
            np.array([[0, 1, 0, 0, 0], [0, 0, 0, 0, 1]], dtype=np.int64),
        ]
        for i, exp in enumerate(expected):
            np.testing.assert_array_equal(result[i], exp)
Exemple #14
0
    def test_compute_longitudinal_age_groups(self):
        """Check ``_compute_longitudinal_age_groups`` with ``bucket_size=365``.

        A cohort whose subjects only carry ``patientID`` must trigger an
        ``AssertionError``; the base population must give the expected
        mapping and feature values for ``col_offset=2``.
        """
        loader = ConvSccsFeatureDriver(**dict(self.kwargs, bucket_size=365))

        with self.assertRaises(AssertionError) as context:
            bad_cohort = Cohort(
                "base_population",
                "base_population",
                self.base_population.subjects.select("patientID"),
                None,
            )
            _ = loader._compute_longitudinal_age_groups(bad_cohort,
                                                        col_offset=2)
        # assertIn shows the actual message when the expected text is missing.
        self.assertIn(
            "Cohort subjects should have gender and birthdate information",
            str(context.exception))

        features, mapping = loader._compute_longitudinal_age_groups(
            self.base_population, col_offset=2)
        expected_mapping = [
            "[55.0, 60.0)",
            "[60.0, 65.0)",
            "[65.0, 70.0)",
            "[70.0, 75.0)",
            "[75.0, 80.0)",
            "[80.0, 85.0)",
        ]
        expected_data = np.array([
            [3, 1, 6],
            [0, 0, 6],
            [0, 1, 6],
            [0, 2, 6],
            [0, 3, 7],
            [4, 1, 5],
            [4, 2, 6],
            [2, 0, 4],
            [2, 1, 5],
            [2, 2, 5],
            [2, 3, 5],
        ]).astype("int")
        self.assertListEqual(mapping, expected_mapping)
        np.testing.assert_array_equal(features.toPandas().values.astype("int"),
                                      expected_data)
Exemple #15
0
    def test_find_subjects_with_many_outcomes(self):
        """Check the cohort returned by ``_find_subjects_with_many_outcomes``.

        Patients "0" and "1" each have two events in the input and patient
        "2" has one; the returned cohort must be named and described as
        expected and contain only the subjects and events of "0" and "1".
        """

        def utc(year, month, day):
            return pytz.datetime.datetime(year, month, day, tzinfo=pytz.UTC)

        event_columns = {
            "patientID": ["0", "0", "1", "1", "2"],  # uuid
            "start": [
                utc(1934, 7, 27),
                utc(2012, 7, 27),
                utc(2017, 7, 27),
                utc(2012, 7, 27),
                utc(2011, 7, 27),
            ],
            "end": [
                utc(2012, 10, 12),
                utc(2006, 6, 20),
                utc(2012, 10, 12),
                utc(2014, 6, 20),
                utc(2012, 7, 27),
            ],
            "value": [0, 1, 2, 3, 4],
        }
        events_df, _ = self.create_spark_df(event_columns)
        cohort = Cohort("some_cohort", "Some cohort",
                        events_df.select("patientID"), events_df)

        driver = ConvSccsFeatureDriver(**self.kwargs)
        flagged = driver._find_subjects_with_many_outcomes(cohort)

        self.assertEqual(
            flagged.name,
            "some_cohort_inconsistent_w_single_outcome_constraint")
        self.assertEqual(
            flagged.describe(),
            "Events are some_cohort_inconsistent_w_single_outcome_constraint. Events "
            "contain only events showing there are more than one outcome per patient.",
        )

        # Order of the returned rows is not asserted, hence the sorting.
        subject_ids = flagged.subjects.toPandas().values.ravel().tolist()
        self.assertListEqual(sorted(subject_ids), ["0", "1"])
        event_values = flagged.events.toPandas().value.values.ravel().tolist()
        self.assertListEqual(sorted(event_values), [0, 1, 2, 3])
Exemple #16
0
    def test_properties_final_cohort(self):
        """``final_cohort`` cannot be assigned and must not be empty.

        Assigning the property must raise ``PermissionError``. After the
        base population is replaced by patients whose IDs are shifted by
        1000, reading ``final_cohort`` must raise an ``AssertionError``
        about an empty final cohort.
        """
        loader = ConvSccsFeatureDriver(**self.kwargs)
        with self.assertRaises(PermissionError) as context:
            loader.final_cohort = "some value"
        # The two adjacent literals are kept exactly as in the original
        # assertion (they concatenate without a space after the comma).
        self.assertIn(
            "final_cohort should not be set manually,"
            "it is computed from initial cohorts.", str(context.exception))

        # Build the frame outside assertRaises so that only the cohort swap
        # and the property access are guarded.
        patients_df, _ = self.create_spark_df(self.patients)
        shifted_patients = patients_df.select(
            (sf.col("patientID") + 1000).alias("patientID"),
            sf.col("gender"),
            sf.col("birthDate"),
            sf.col("deathDate"),
        )
        with self.assertRaises(AssertionError) as context:
            loader.base_population = Cohort("base_population",
                                            "base_population",
                                            shifted_patients, None)
            loader.final_cohort.subjects.count()
        self.assertIn(
            "Final cohort is empty, please check that "
            "the intersection of the provided cohorts "
            "is nonempty", str(context.exception))
Exemple #17
0
    def test_discretize_time(self):
        """Check the start/end buckets produced by ``_discretize_start_end``.

        First with the fixture's kwargs on the fixture exposures, then with
        ``bucket_rounding="floor"`` and a ``study_end`` of 2011-07-05 on a
        hand-built set of events.
        """
        nan = np.nan

        def utc(year, month, day):
            return pytz.datetime.datetime(year, month, day, tzinfo=pytz.UTC)

        fixture_driver = ConvSccsFeatureDriver(**self.kwargs)
        fixture_buckets = fixture_driver._discretize_start_end(
            self.exposures.events).toPandas()
        np.testing.assert_array_equal(fixture_buckets.startBucket.values,
                                      np.array([4, 13, 17, 9]))
        np.testing.assert_array_equal(fixture_buckets.endBucket.values,
                                      np.array([nan, nan, nan, 21]))

        floor_kwargs = dict(self.kwargs,
                            bucket_rounding="floor",
                            study_end=utc(2011, 7, 5))
        floor_driver = ConvSccsFeatureDriver(**floor_kwargs)
        event_columns = {
            "patientID": ["0", "3", "4", "2"],  # uuid
            "start": [
                utc(2010, 6, 8),
                utc(2011, 3, 29),
                utc(2011, 7, 4),
                utc(2010, 11, 23),
            ],
            "end": [None, None, None, utc(2010, 11, 24)],
        }
        events_df, _ = self.create_spark_df(event_columns)
        floor_buckets = floor_driver._discretize_start_end(
            events_df).toPandas()
        np.testing.assert_array_equal(floor_buckets.startBucket.values,
                                      np.array([4, 13, 16, 9]))
        np.testing.assert_array_equal(floor_buckets.endBucket.values,
                                      np.array([nan, nan, nan, 9]))
Exemple #18
0
    def test_load(self):
        """``load`` succeeds on matching lengths and asserts on mismatches.

        ``features``, ``labels`` and ``censoring`` are replaced on the class.
        With three entries each, ``load`` must run; with only two labels, or
        only two censoring values, it must raise an ``AssertionError`` with
        the matching message.
        """
        mock_features = PropertyMock(return_value=[1, 2, 3])
        mock_labels = PropertyMock(return_value=[1, 2, 3])
        mock_censoring = PropertyMock(return_value=[1, 2, 3])
        mock_missing_labels = PropertyMock(return_value=[1, 2])
        mock_missing_censoring = PropertyMock(return_value=[1, 2])

        def stub(attribute, factory):
            # NOTE(review): per the unittest.mock docs, patch *calls*
            # ``new_callable`` to build the replacement object, so each
            # PropertyMock here is invoked by patch itself and the attribute
            # becomes its return value. The call assertions below may
            # therefore hold regardless of what ``load`` does -- confirm
            # whether ``new_callable=PropertyMock`` was intended.
            return patch.object(ConvSccsFeatureDriver,
                                attribute,
                                new_callable=factory)

        with stub("features", mock_features), \
                stub("labels", mock_labels), \
                stub("censoring", mock_censoring):
            loader_ = ConvSccsFeatureDriver(**self.kwargs)
            loader_.load()
            mock_features.assert_called_with()
            mock_labels.assert_called_once_with()
            mock_censoring.assert_called_once_with()

        with stub("features", mock_features), \
                stub("labels", mock_missing_labels), \
                stub("censoring", mock_censoring):
            loader_ = ConvSccsFeatureDriver(**self.kwargs)
            with self.assertRaises(AssertionError) as context:
                loader_.load()
            # assertIn shows the actual message when the expected text is
            # missing.
            self.assertIn(
                "Number of feature matrices does not match "
                "number of label matrices. You might want to"
                " investigate this", str(context.exception))

        with stub("features", mock_features), \
                stub("labels", mock_labels), \
                stub("censoring", mock_missing_censoring):
            loader_ = ConvSccsFeatureDriver(**self.kwargs)
            with self.assertRaises(AssertionError) as context:
                loader_.load()
            self.assertIn(
                "Number of feature matrices does not match "
                "number of censoring values. You might want to"
                " investigate this", str(context.exception))
Exemple #19
0
    def test_properties_outcomes(self):
        """Exercise the ``outcomes`` getter and its checked setter.

        Covers, in order: the getter returning a cohort equal to
        ``self.outcomes``; an ``AssertionError`` when the assigned events
        carry two different ``value`` entries; a ``ValueError`` when
        ``_find_events_not_in_followup_bounds`` returns a cohort whose
        ``take`` is truthy; and a ``ValueError`` when that check returns a
        cohort whose ``take`` is empty but
        ``_find_subjects_with_many_outcomes`` returns a truthy one.
        """
        loader = ConvSccsFeatureDriver(**self.kwargs)
        outcomes_ = loader.outcomes
        self.assertTrue(
            data_frame_equality(outcomes_.subjects, self.outcomes.subjects))
        self.assertTrue(
            data_frame_equality(outcomes_.events, self.outcomes.events))

        loader_ = ConvSccsFeatureDriver(**self.kwargs)
        loader_.run_checks = True

        bad_outcomes_df, _ = self.create_spark_df({
            "patientID": ["0", "4"],  # uuid
            "start": [
                pytz.datetime.datetime(2010, 6, 8, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2011, 3, 29, tzinfo=pytz.UTC),
            ],
            "end":
            [None, pytz.datetime.datetime(2010, 11, 24, tzinfo=pytz.UTC)],
            "value": ["bar", "baz"],
            "category": ["outcome"] * 2,
            "groupID": [0] * 2,
            "weight": [1] * 2,
        })
        bad_outcomes_cohort = Cohort(
            "", "",
            bad_outcomes_df.select("patientID").distinct(), bad_outcomes_df)

        with self.assertRaises(AssertionError) as context:
            loader_.outcomes = bad_outcomes_cohort

        self.assertIn(
            "There are more than one type of outcomes, check the 'value' field of "
            "outcomes cohort events.", str(context.exception))

        # Cohort whose dataframes answer any take() with a truthy value.
        mock_dataframe = MagicMock()
        mock_dataframe.take = lambda x: True
        mock_cohort = MagicMock()
        mock_cohort.subjects = mock_dataframe
        mock_cohort.events = mock_dataframe

        # Cohort whose dataframes answer any take() with an empty list.
        mock_empty_df = MagicMock()
        mock_empty_df.take = lambda x: []
        # Instantiate the mock: binding the bare ``MagicMock`` class here
        # would set ``subjects``/``events`` on the class itself and leak
        # them into every mock created afterwards.
        mock_empty_cohort = MagicMock()
        mock_empty_cohort.subjects = mock_empty_df
        mock_empty_cohort.events = mock_empty_df

        def stub(method_name, return_value):
            return patch.object(ConvSccsFeatureDriver,
                                method_name,
                                return_value=return_value)

        with stub("_log_invalid_events_cohort",
                  "Ooops, error here!") as mock_log_invalid, \
                stub("_find_events_not_in_followup_bounds",
                     mock_cohort) as mock_find_events_outcome_bounds:
            loader = ConvSccsFeatureDriver(**self.kwargs)
            loader.run_checks = True
            with self.assertRaises(ValueError) as context:
                loader.outcomes = self.outcomes
            mock_find_events_outcome_bounds.assert_called_once_with(
                self.outcomes)
            mock_log_invalid.assert_called_once_with(
                mock_cohort, log_invalid_events=True)
            self.assertEqual("Ooops, error here!", str(context.exception))

        with stub("_log_invalid_events_cohort",
                  "Ooops, error here!") as mock_log_invalid, \
                stub("_find_events_not_in_followup_bounds",
                     mock_empty_cohort) as mock_did_not_find_outcome_bounds, \
                stub("_find_subjects_with_many_outcomes",
                     mock_cohort) as mock_find_many_outcomes:
            loader = ConvSccsFeatureDriver(**self.kwargs)
            loader.run_checks = True
            with self.assertRaises(ValueError) as context:
                loader.outcomes = self.outcomes
            mock_did_not_find_outcome_bounds.assert_called_once_with(
                self.outcomes)
            mock_find_many_outcomes.assert_called_once_with(self.outcomes)
            mock_log_invalid.assert_called_once_with(
                mock_cohort, log_invalid_subjects=True)
            self.assertEqual("Ooops, error here!", str(context.exception))