def test_union(self):
    patients_pd = pd.DataFrame({"patientID": [1, 2, 3]})
    events_pd = pd.DataFrame({
        "patientID": [1, 2, 3],
        "value": ["DP", "DAS", "DR"]
    })
    patients = self.spark.createDataFrame(patients_pd)
    events = self.spark.createDataFrame(events_pd)
    cohort1 = Cohort("liberal_fractures", "liberal_fractures", patients,
                     events)
    cohort2 = Cohort("hospit_fractures", "hospit_fractures", patients,
                     events)
    result = cohort1.union(cohort2)
    expected_patients = self.spark.createDataFrame(
        pd.concat([patients_pd, patients_pd]))
    expected_events = self.spark.createDataFrame(
        pd.concat([events_pd, events_pd]))
    expected = Cohort("result", "result", expected_patients, expected_events)
    self.assertEqual(result, expected)
def test_dump_metadata(self, mock_writing):
    df, _ = self.create_spark_df({"patientID": [1, 2]})
    cohort_1 = Cohort("test", "test", df, None)
    df_events, _ = self.create_spark_df({
        "patientID": [1, 2],
        "category": ["test", "test"]
    })
    cohort_2 = Cohort("events", "events", df, df_events)
    cc = CohortCollection({"test": cohort_1, "events": cohort_2})
    expected = sorted({
        "operations": [
            {
                "output_type": "events",
                "name": "events",
                "output_path": "../../output/events/data",
                "population_path": "../../output/events/subjects",
            },
            {
                "output_type": "patients",
                "output_path": "../../output/test/subjects",
                "name": "test",
            },
        ]
    })
    result = sorted(cc.save("../../output"))
    self.assertEqual(expected, result)
def test_difference_all(self):
    patients_1, patients_pd_1 = self.create_spark_df({"patientID": [1, 2]})
    events_1, events_pd_1 = self.create_spark_df(
        OrderedDict([("patientID", [1, 2]), ("value", ["DP", "DAS"])]))
    patients_2, patients_pd_2 = self.create_spark_df({"patientID": [1]})
    events_2, events_pd_2 = self.create_spark_df(
        OrderedDict([("patientID", [1]), ("value", ["DP"])]))
    patients_3, patients_pd_3 = self.create_spark_df({"patientID": [1, 3]})
    cohort1 = Cohort("liberal_fractures", "liberal_fractures", patients_1,
                     events_1)
    cohort2 = Cohort("hospit_fractures", "hospit_fractures", patients_2,
                     events_2)
    cohort3 = Cohort("imb_fractures", "imb_fractures", patients_3, None)
    result = Cohort.difference_all([cohort1, cohort2, cohort3])
    patients_4, _ = self.create_spark_df({"patientID": [2]})
    expected = Cohort("hospit_fractures", "hospit_fractures", patients_4,
                      None)
    self.assertEqual(result, expected)
def test_cohort_collection_from_cohort_flow(self):
    input = """
    {
        "intermediate_operations": {
            "operation": {
                "type": "union",
                "name": "outcome",
                "parents": ["liberal_fractures", "hospit_fractures"]
            }
        },
        "cohorts": [
            "extract_patients",
            "exposures",
            "filter_patients",
            "outcome"
        ]
    }
    """
    df, _ = self.create_spark_df({"patientID": [1, 2, 3]})
    cc = CohortCollection({
        "liberal_fractures":
            Cohort("liberal_fractures", "liberal_fractures", df, None),
        "hospit_fractures":
            Cohort("hospit_fractures", "hospit_fractures", df, None),
    })
    result = cohort_collection_from_cohort_flow(cc, input)
    self.assertSetEqual(
        set(result.cohorts.keys()),
        {"liberal_fractures", "hospit_fractures", "outcome"},
    )
def test_is_duration_events(self):
    schema = StructType([
        StructField("patientID", IntegerType(), True),
        StructField("start", TimestampType(), True),
        StructField("end", TimestampType(), True),
    ])
    patients_pd = pd.DataFrame({"patientID": [1, 2, 3]})
    patients = self.spark.createDataFrame(patients_pd)
    cohort1 = Cohort("patients", "patients", patients, None)
    self.assertFalse(cohort1.is_duration_events())
    data = [(1, datetime(1993, 10, 9), datetime(1993, 10, 9))]
    events = self.spark.createDataFrame(data=data, schema=schema)
    cohort2 = Cohort("patients", "patients", patients, events)
    self.assertTrue(cohort2.is_duration_events())
    data = [(1, datetime(1993, 10, 9), None),
            (2, datetime(1993, 10, 9), None)]
    events = self.spark.createDataFrame(data=data, schema=schema)
    cohort2 = Cohort("patients", "patients", patients, events)
    self.assertFalse(cohort2.is_duration_events())
def test_add_age_information(self):
    subjects, df = self.create_spark_df(
        {"birthDate": [datetime(1993, 10, 9), datetime(1992, 3, 14)]})
    input = Cohort("liberal_fractures", "liberal_fractures", subjects, None)
    input.add_age_information(datetime(2013, 1, 1))
    result = input
    expected_subjects, _ = self.create_spark_df({"age": [19, 20]})
    expected = Cohort("liberal_fractures", "liberal_fractures",
                      expected_subjects, None)
    self.assertTrue(
        data_frame_equality(result.subjects.select("age"),
                            expected.subjects.select("age")))
def load(input: Dict) -> "CohortCollection":
    """Load a CohortCollection object from a dict."""
    operations = input["operations"]
    return CohortCollection({
        operation["name"]: Cohort.load(operation)
        for operation in operations
    })
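# Usage sketch (assumption, not part of the original source): `load` expects the
# same kind of metadata dict that CohortCollection.save / Cohort.save_cohort
# produce, i.e. an "operations" list whose entries carry at least a "name" plus
# the fields Cohort.load needs to read the data back. Field names and paths
# below are placeholders inferred from the save tests in this file.
#
# metadata = {
#     "operations": [
#         {"name": "test", "output_type": "patients",
#          "output_path": "../../output/test/subjects"},
#         {"name": "events", "output_type": "events",
#          "output_path": "../../output/events/data",
#          "population_path": "../../output/events/subjects"},
#     ]
# }
# cc = CohortCollection.load(metadata)  # assumes `load` is exposed as a staticmethod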
def _find_inconsistent_start_end_ordering(cohort: Cohort) -> Cohort:
    events = cohort.events
    invalid_events = events.where(sf.col("start") >= sf.col("end"))
    return Cohort(
        cohort.name + "_inconsistent_w_start_end_ordering",
        "events where start >= end dates are inconsistent",
        invalid_events.select("patientID").distinct(),
        invalid_events,
    )
def test_intersect(self):
    patients_1, patients_pd_1 = self.create_spark_df({"patientID": [1, 2]})
    events_1, events_pd_1 = self.create_spark_df(
        OrderedDict([("patientID", [1, 2]), ("value", ["DP", "DAS"])]))
    patients_2, patients_pd_2 = self.create_spark_df({"patientID": [1]})
    events_2, events_pd_2 = self.create_spark_df(
        OrderedDict([("patientID", [1]), ("value", ["DP"])]))
    cohort1 = Cohort("liberal_fractures", "liberal_fractures", patients_1,
                     events_1)
    cohort2 = Cohort("hospit_fractures", "hospit_fractures", patients_2,
                     events_2)
    result = cohort1.intersection(cohort2)
    expected = cohort2
    self.assertEqual(result, expected)
def plot_mean_duration_per_value(figure: Figure, cohort: Cohort) -> Figure:
    assert cohort.is_duration_events()
    df = agg_by_col(cohort.events, frozenset(["value"]), "duration",
                    "mean").sort_values("value")
    ax = figure.gca()
    ax.barh(range(len(df.value)), df["avg(duration)"].values)
    ax.set_yticks(range(len(df.value)))
    ax.set_yticklabels(df.value.values)
    return figure
def _find_subjects_with_age_inconsistent_w_age_groups(
        self, cohort: Cohort) -> Cohort:
    """Check if min and max age_groups are consistent with subjects ages."""
    if not cohort.has_subject_information():
        raise ValueError("Cohort should have subject information.")
    duplicate = copy(cohort)
    duplicate.add_age_information(self.age_reference_date)  # add starting age
    study_length = (np.ceil(
        (self.study_end - self.study_start).days /
        365.25) if self.is_using_longitudinal_age_groups else 0)
    min_starting_age = min(self.age_groups)
    max_starting_age = max(self.age_groups) - np.ceil(study_length)
    invalid_subjects = duplicate.subjects.where(
        ~sf.col("age").between(min_starting_age, max_starting_age))
    return Cohort(
        cohort.name + "_inconsistent_w_ages_and_age_groups",
        "subjects inconsistent with age groups",
        invalid_subjects,
    )
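# Worked example (not from the original source, using the setUp values further
# below as an assumed configuration): with age_groups [55, 60, 65, 70, 75, 80, 85],
# study_start 2010-02-05 and study_end 2013-10-12, the study spans
# ceil(1345 / 365.25) = 4 years when longitudinal age groups are used. Subjects
# are then flagged unless their age at age_reference_date lies in
# [55, 85 - 4] = [55, 81], i.e. unless they stay inside the age groups for the
# whole study period.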
def get_from_description(self, description: Dict) -> Cohort:
    # TODO : this should be a method called from cohorts
    operation_type = description["type"]  # type: str
    if operation_type.lower() not in ALLOWED_OPERATIONS:
        raise KeyError(
            "{} not permitted. Only available operations: {}".format(
                operation_type, ALLOWED_OPERATIONS))
    else:
        parents = [self.get(parent) for parent in description["parents"]]
        new_cohort = Cohort.union_all(parents)
        new_cohort.name = description["name"]
        return new_cohort
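# Usage sketch (assumption, not part of the original source): the description
# dict mirrors an entry of "intermediate_operations" in a cohort-flow JSON, as
# in test_cohort_collection_from_cohort_flow above. Note that whatever the
# allowed operation type, the current implementation always builds the result
# with Cohort.union_all.
#
# description = {
#     "type": "union",
#     "name": "outcome",
#     "parents": ["liberal_fractures", "hospit_fractures"],
# }
# outcome = cc.get_from_description(description)  # `cc` is a CohortCollection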
def test_save_cohort(self, mock_method):
    df, _ = self.create_spark_df({"patientID": [1, 2]})
    cohort = Cohort("test", "test", df, None)
    self.assertEqual(
        {
            "name": "test",
            "output_path": "../../output/test/subjects",
            "output_type": "patients",
        },
        cohort.save_cohort("../../output"),
    )
    df_events, _ = self.create_spark_df({
        "patientID": [1, 2],
        "category": ["test", "test"]
    })
    cohort_2 = Cohort("events", "events", df, df_events)
    self.assertEqual(
        {
            "name": "events",
            "output_path": "../../output/events/data",
            "population_path": "../../output/events/subjects",
            "output_type": "events",
        },
        cohort_2.save_cohort("../../output"),
    )
def _find_events_not_in_study_dates(self, cohort: Cohort) -> Cohort:
    # between returns false when col is null
    invalid_events = cohort.events.where(
        ~(sf.col("start").between(sf.lit(self.study_start),
                                  sf.lit(self.study_end))
          & sf.col("end").between(sf.lit(self.study_start),
                                  sf.lit(self.study_end))))
    return Cohort(
        cohort.name + "_inconsistent_w_study_dates",
        "events inconsistent with study dates",
        invalid_events.select("patientID").distinct(),
        invalid_events,
    )
def test_difference(self):
    patients_1, _ = self.create_spark_df({"patientID": [1, 2]})
    events_1, _ = self.create_spark_df(
        OrderedDict([("patientID", [1, 2]), ("value", ["DP", "DAS"])]))
    patients_2, _ = self.create_spark_df({"patientID": [1]})
    events_2, _ = self.create_spark_df(
        OrderedDict([("patientID", [1]), ("value", ["DP"])]))
    cohort1 = Cohort("liberal_fractures", "liberal_fractures", patients_1,
                     events_1)
    cohort2 = Cohort("hospit_fractures", "hospit_fractures", patients_2,
                     events_2)
    result = cohort1.difference(cohort2)
    patients_3, _ = self.create_spark_df({"patientID": [2]})
    events_3, _ = self.create_spark_df(
        OrderedDict([("patientID", [2]), ("value", ["DAS"])]))
    expected = Cohort("hospit_fractures", "hospit_fractures", patients_3,
                      events_3)
    self.assertEqual(result, expected)
def _find_events_not_in_followup_bounds(self, cohort: Cohort) -> Cohort:
    fups = copy(self.followups)
    fups.events = rename_df_columns(fups.events, prefix="fup_")
    events = cohort.events.join(fups.events, "patientID")
    # between returns false when col is null
    invalid_events = events.where(
        ~(sf.col("start").between(sf.col("fup_start"), sf.col("fup_end"))
          & sf.col("end").between(sf.col("fup_start"), sf.col("fup_end"))))
    return Cohort(
        cohort.name + "_inconsistent_w_followup_bounds",
        "events inconsistent with followup bounds",
        invalid_events.select("patientID").distinct(),
        invalid_events,
    )
def plot_duration_distribution_per_month_as_bar(figure: Figure,
                                                cohort: Cohort) -> Figure:
    assert cohort.is_duration_events()
    df = event_duration_agg(cohort, "count").sort_values("duration")
    df.duration = np.ceil(df.duration / 30)
    df.duration = df.duration.astype("int32")
    df = df.groupby("duration").sum().reset_index()
    ax = figure.gca()
    ax.bar(range(len(df)), df["count(1)"].values)
    ax.set_xticks(range(len(df)))
    ax.set_xticklabels(df.duration.values)
    return figure
def _find_subjects_with_many_outcomes(cohort: Cohort) -> Cohort:
    subjects_w_many_outcomes = (
        cohort.events.groupby("patientId").count().where(
            sf.col("count") > 1).select("patientId").drop_duplicates())
    invalid_events = cohort.events.join(subjects_w_many_outcomes,
                                        "patientId").sort("patientId")
    return Cohort(
        cohort.name + "_inconsistent_w_single_outcome_constraint",
        "events showing there are more than one outcome per patient",
        subjects_w_many_outcomes,
        invalid_events,
    )
def test_eq(self):
    df, _ = self.create_spark_df({"patientID": [1, 2]})
    cohort_1 = Cohort("test", "test", df, None)
    df_events, _ = self.create_spark_df({
        "patientID": [1, 2],
        "category": ["test", "test"]
    })
    cohort_2 = Cohort("events", "events", df, df_events)
    cc1 = CohortCollection({"test": cohort_1, "events": cohort_2})
    cc2 = CohortCollection({"test": cohort_1, "events": cohort_2})
    self.assertEqual(cc1, cc2)
    cc3 = CohortCollection({"test1": cohort_1, "events": cohort_2})
    self.assertNotEqual(cc1, cc3)
    df, _ = self.create_spark_df({"patientID": [1, 45]})
    cohort_3 = Cohort("test", "test", df, None)
    cc4 = CohortCollection({"test": cohort_3, "events": cohort_2})
    self.assertNotEqual(cc1, cc4)
    self.assertNotEqual(cc1, df)
def plot_duration_distribution_per_day_as_line(figure: Figure,
                                               cohort: Cohort) -> Figure:
    assert cohort.is_duration_events()
    df = event_duration_agg(cohort, "count").sort_values("duration")
    ax = figure.gca()
    ax.plot(df.duration, df["count(1)"])
    ax.set_yscale("log")
    major = IndexLocator(365, +0.0)
    minor = IndexLocator(30, +0.0)
    ax.xaxis.set_minor_locator(minor)
    ax.xaxis.set_major_locator(major)
    ax.grid(True, which="major", axis="x")
    return figure
def test_union_all(self):
    patients_pd = pd.DataFrame({"patientID": [1, 2, 3]})
    events_pd = pd.DataFrame({
        "patientID": [1, 2, 3],
        "value": ["DP", "DAS", "DR"]
    })
    patients = self.spark.createDataFrame(patients_pd)
    events = self.spark.createDataFrame(events_pd)
    cohort1 = Cohort("liberal_fractures", "liberal_fractures", patients,
                     events)
    cohort2 = Cohort("hospit_fractures", "hospit_fractures", patients,
                     events)
    cohort3 = Cohort("fractures", "fractures", patients, None)
    result = Cohort.union_all([cohort1, cohort2, cohort3])
    expected_patients = self.spark.createDataFrame(
        pd.concat([patients_pd] * 3))
    expected = Cohort("result", "result", expected_patients, None)
    self.assertEqual(result, expected)
def test_compute_longitudinal_age_groups(self):
    kwargs = copy(self.kwargs)
    kwargs["bucket_size"] = 365
    loader = ConvSccsFeatureDriver(**kwargs)
    with self.assertRaises(AssertionError) as context:
        bad_cohort = Cohort(
            "base_population",
            "base_population",
            self.base_population.subjects.select("patientID"),
            None,
        )
        _ = loader._compute_longitudinal_age_groups(bad_cohort,
                                                    col_offset=int(2))
    self.assertTrue(
        "Cohort subjects should have gender and birthdate information" in
        str(context.exception))
    features, mapping = loader._compute_longitudinal_age_groups(
        self.base_population, col_offset=int(2))
    expected_mapping = [
        "[55.0, 60.0)",
        "[60.0, 65.0)",
        "[65.0, 70.0)",
        "[70.0, 75.0)",
        "[75.0, 80.0)",
        "[80.0, 85.0)",
    ]
    expected_data = np.array([
        [3, 1, 6],
        [0, 0, 6],
        [0, 1, 6],
        [0, 2, 6],
        [0, 3, 7],
        [4, 1, 5],
        [4, 2, 6],
        [2, 0, 4],
        [2, 1, 5],
        [2, 2, 5],
        [2, 3, 5],
    ]).astype("int")
    self.assertListEqual(mapping, expected_mapping)
    np.testing.assert_array_equal(features.toPandas().values.astype("int"),
                                  expected_data)
def test_find_subjects_with_many_outcomes(self):
    invalid_events = {
        "patientID": ["0", "0", "1", "1", "2"],  # uuid
        "start": [
            pytz.datetime.datetime(1934, 7, 27, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 7, 27, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2017, 7, 27, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 7, 27, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2011, 7, 27, tzinfo=pytz.UTC),
        ],
        "end": [
            pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2006, 6, 20, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2014, 6, 20, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 7, 27, tzinfo=pytz.UTC),
        ],
        "value": [0, 1, 2, 3, 4],
    }
    invalid_df, _ = self.create_spark_df(invalid_events)
    invalid_cohort = Cohort("some_cohort", "Some cohort",
                            invalid_df.select("patientID"), invalid_df)
    loader = ConvSccsFeatureDriver(**self.kwargs)
    invalid = loader._find_subjects_with_many_outcomes(invalid_cohort)
    self.assertEqual(
        invalid.name, "some_cohort_inconsistent_w_single_outcome_constraint")
    self.assertEqual(
        invalid.describe(),
        "Events are some_cohort_inconsistent_w_single_outcome_constraint. Events "
        "contain only events showing there are more than one outcome per patient.",
    )
    self.assertListEqual(
        sorted(invalid.subjects.toPandas().values.ravel().tolist()),
        sorted(["0", "1"]),
    )
    self.assertListEqual(
        sorted(invalid.events.toPandas().value.values.ravel().tolist()),
        sorted([0, 1, 2, 3]),
    )
def test_find_events_not_in_study_dates(self):
    invalid_events = {
        "patientID": ["0", "1", "2", "2", "3"],  # uuid
        "start": [
            pytz.datetime.datetime(1934, 7, 27, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 7, 27, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2017, 7, 27, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 7, 27, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2011, 7, 27, tzinfo=pytz.UTC),
        ],
        "end": [
            pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2006, 6, 20, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2014, 6, 20, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 7, 27, tzinfo=pytz.UTC),
        ],
        "value": [0, 1, 2, 3, 4],
    }
    invalid_df, _ = self.create_spark_df(invalid_events)
    invalid_cohort = Cohort("some_cohort", "Some cohort",
                            invalid_df.select("patientID"), invalid_df)
    loader = BaseFeatureDriver(**self.kwargs)
    invalid = loader._find_events_not_in_study_dates(invalid_cohort)
    self.assertEqual(invalid.name, "some_cohort_inconsistent_w_study_dates")
    self.assertEqual(
        invalid.describe(),
        "Events are some_cohort_inconsistent_w_study_dates. Events contain only "
        "events inconsistent with study dates.",
    )
    self.assertListEqual(
        sorted(invalid.subjects.toPandas().values.ravel().tolist()),
        sorted(["0", "1", "2"]),
    )
    self.assertListEqual(
        sorted(invalid.events.toPandas().value.values.ravel().tolist()),
        sorted([0, 1, 2, 3]),
    )
def test_find_events_not_in_followup_bounds(self):
    data = {
        "patientID": ["0", "6", "2", "3", "5"],  # uuid
        "start": [
            pytz.datetime.datetime(2011, 7, 1, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 7, 27, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2017, 7, 27, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2011, 4, 12, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2006, 7, 27, tzinfo=pytz.UTC),
        ],
        "end": [
            pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2006, 6, 20, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2014, 6, 20, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2015, 7, 27, tzinfo=pytz.UTC),
        ],
        "value": [0, 1, 2, 3, 4],
    }
    df, _ = self.create_spark_df(data)
    cohort = Cohort("some_cohort", "Some cohort",
                    df.select("patientID").distinct(), df)
    loader = BaseFeatureDriver(**self.kwargs)
    invalid = loader._find_events_not_in_followup_bounds(cohort)
    self.assertEqual(invalid.name,
                     "some_cohort_inconsistent_w_followup_bounds")
    self.assertEqual(
        invalid.describe(),
        "Events are some_cohort_inconsistent_w_followup_bounds. Events contain "
        "only events inconsistent with followup bounds.",
    )
    self.assertListEqual(
        sorted(invalid.subjects.toPandas().values.ravel().tolist()),
        sorted(["2", "3"]),
    )
    self.assertListEqual(
        sorted(invalid.events.toPandas().value.values.ravel().tolist()),
        sorted([2, 3]),
    )
def test_find_inconsistent_start_end_ordering(self):
    events = {
        "patientID": ["0", "1", "2", "2"],  # uuid
        "start": [
            pytz.datetime.datetime(2011, 7, 2, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 9, 30, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2011, 7, 2, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 9, 30, tzinfo=pytz.UTC),
        ],
        "end": [
            pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2013, 6, 20, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2011, 6, 20, tzinfo=pytz.UTC),
        ],
        "value": [0, 1, 2, 3],
    }
    df, _ = self.create_spark_df(events)
    cohort = Cohort("some_cohort", "Some cohort",
                    df.select("patientID").distinct(), df)
    invalid = BaseFeatureDriver._find_inconsistent_start_end_ordering(cohort)
    self.assertEqual(invalid.name,
                     "some_cohort_inconsistent_w_start_end_ordering")
    self.assertEqual(
        invalid.describe(),
        "Events are some_cohort_inconsistent_w_start_end_ordering. Events contain "
        "only events where start >= end dates are inconsistent.",
    )
    self.assertListEqual(
        sorted(invalid.subjects.toPandas().values.ravel().tolist()),
        sorted(["2"]))
    self.assertListEqual(
        sorted(invalid.events.toPandas().value.values.ravel().tolist()),
        sorted([3]))
def test_properties_final_cohort(self):
    loader = ConvSccsFeatureDriver(**self.kwargs)
    with self.assertRaises(PermissionError) as context:
        loader.final_cohort = "some value"
    self.assertTrue(
        "final_cohort should not be set manually,"
        "it is computed from initial cohorts." in str(context.exception))
    with self.assertRaises(AssertionError) as context:
        patients_wo_events, _ = self.create_spark_df(self.patients)
        patients_wo_events = patients_wo_events.select(
            (sf.col("patientID") + 1000).alias("patientID"),
            sf.col("gender"),
            sf.col("birthDate"),
            sf.col("deathDate"),
        )
        loader.base_population = Cohort("base_population", "base_population",
                                        patients_wo_events, None)
        loader.final_cohort.subjects.count()
    self.assertTrue("Final cohort is empty, please check that "
                    "the intersection of the provided cohorts "
                    "is nonempty" in str(context.exception))
def test_has_subject_information(self):
    patients_1, _ = self.create_spark_df({"patientID": [1, 2]})
    cohort1 = Cohort("liberal_fractures", "liberal_fractures", patients_1)
    patients_2, _ = self.create_spark_df({
        "patientID": [1, 2],
        "gender": [1, 1],
        "birthDate": [
            pd.to_datetime("1993-10-09"),
            pd.to_datetime("1992-03-14"),
        ],
        "deathDate": [
            pd.to_datetime("1993-10-09"),
            pd.to_datetime("1992-03-14"),
        ],
    })
    cohort2 = Cohort("liberal_fractures", "liberal_fractures", patients_2,
                     None)
    self.assertFalse(cohort1.has_subject_information())
    self.assertTrue(cohort2.has_subject_information())
def test_properties_outcomes(self):
    loader = ConvSccsFeatureDriver(**self.kwargs)
    outcomes_ = loader.outcomes
    self.assertTrue(
        data_frame_equality(outcomes_.subjects, self.outcomes.subjects))
    self.assertTrue(
        data_frame_equality(outcomes_.events, self.outcomes.events))
    loader_ = ConvSccsFeatureDriver(**self.kwargs)
    loader_.run_checks = True
    bad_outcomes_df, _ = self.create_spark_df({
        "patientID": ["0", "4"],  # uuid
        "start": [
            pytz.datetime.datetime(2010, 6, 8, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2011, 3, 29, tzinfo=pytz.UTC),
        ],
        "end": [None,
                pytz.datetime.datetime(2010, 11, 24, tzinfo=pytz.UTC)],
        "value": ["bar", "baz"],
        "category": ["outcome"] * 2,
        "groupID": [0] * 2,
        "weight": [1] * 2,
    })
    bad_outcomes_cohort = Cohort(
        "", "",
        bad_outcomes_df.select("patientID").distinct(), bad_outcomes_df)
    with self.assertRaises(AssertionError) as context:
        loader_.outcomes = bad_outcomes_cohort
    self.assertTrue(
        "There are more than one type of outcomes, check the 'value' field of "
        "outcomes cohort events." in str(context.exception))
    mock_dataframe = MagicMock()
    mock_dataframe.take = lambda x: True
    mock_cohort = MagicMock()
    mock_cohort.subjects = mock_dataframe
    mock_cohort.events = mock_dataframe
    mock_empty_df = MagicMock()
    mock_empty_df.take = lambda x: []
    mock_empty_cohort = MagicMock()
    mock_empty_cohort.subjects = mock_empty_df
    mock_empty_cohort.events = mock_empty_df
    with patch.object(
            ConvSccsFeatureDriver,
            "_log_invalid_events_cohort",
            return_value="Ooops, error here!",
    ) as mock_log_invalid:
        with patch.object(
                ConvSccsFeatureDriver,
                "_find_events_not_in_followup_bounds",
                return_value=mock_cohort,
        ) as mock_find_events_outcome_bounds:
            loader = ConvSccsFeatureDriver(**self.kwargs)
            loader.run_checks = True
            with self.assertRaises(ValueError) as context:
                loader.outcomes = self.outcomes
            mock_find_events_outcome_bounds.assert_called_once_with(
                self.outcomes)
            mock_log_invalid.assert_called_once_with(
                mock_cohort, log_invalid_events=True)
            self.assertTrue("Ooops, error here!" == str(context.exception))
    with patch.object(
            ConvSccsFeatureDriver,
            "_log_invalid_events_cohort",
            return_value="Ooops, error here!",
    ) as mock_log_invalid:
        with patch.object(
                ConvSccsFeatureDriver,
                "_find_events_not_in_followup_bounds",
                return_value=mock_empty_cohort,
        ) as mock_did_not_find_outcome_bounds:
            with patch.object(
                    ConvSccsFeatureDriver,
                    "_find_subjects_with_many_outcomes",
                    return_value=mock_cohort,
            ) as mock_find_many_outcomes:
                loader = ConvSccsFeatureDriver(**self.kwargs)
                loader.run_checks = True
                with self.assertRaises(ValueError) as context:
                    loader.outcomes = self.outcomes
                mock_did_not_find_outcome_bounds.assert_called_once_with(
                    self.outcomes)
                mock_find_many_outcomes.assert_called_once_with(
                    self.outcomes)
                mock_log_invalid.assert_called_once_with(
                    mock_cohort, log_invalid_subjects=True)
                self.assertTrue(
                    "Ooops, error here!" == str(context.exception))
def setUp(self):
    super().setUp()
    self.study_start = pytz.datetime.datetime(2010, 2, 5, tzinfo=pytz.UTC)
    self.study_end = pytz.datetime.datetime(2013, 10, 12, tzinfo=pytz.UTC)
    self.age_reference_date = pytz.datetime.datetime(2011, 9, 21,
                                                     tzinfo=pytz.UTC)
    self.age_groups = [55, 65, 60, 75, 70, 80, 85]
    self.sorted_age_groups = sorted(self.age_groups)
    self.patients = {
        "patientID": ["0", "1", "2", "3", "4"],  # uuid
        "gender": [1, 2, 2, 2, 1],  # in {1, 2}
        "birthDate": [
            pytz.datetime.datetime(1934, 7, 27, tzinfo=pytz.UTC),
            pytz.datetime.datetime(1951, 5, 1, tzinfo=pytz.UTC),
            pytz.datetime.datetime(1942, 1, 12, tzinfo=pytz.UTC),
            pytz.datetime.datetime(1933, 10, 3, tzinfo=pytz.UTC),
            pytz.datetime.datetime(1937, 12, 31, tzinfo=pytz.UTC),
        ],
        "deathDate": [
            None,
            None,
            None,
            pytz.datetime.datetime(2011, 6, 20, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 12, 10, tzinfo=pytz.UTC),
        ],  # can be null
    }
    self.followup_events = {
        "patientID": ["0", "3", "4", "2"],  # uuid
        "start": [
            pytz.datetime.datetime(2010, 6, 5, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2011, 3, 27, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2011, 7, 2, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2010, 11, 21, tzinfo=pytz.UTC),
        ],
        "end": [
            pytz.datetime.datetime(2013, 10, 12, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2011, 6, 20, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2012, 7, 3, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2013, 10, 12, tzinfo=pytz.UTC),
        ],
        "endReason": ["ObservationEnd", "Trackloss", "Death",
                      "ObservationEnd"],
    }
    self.exposure_events = {
        "patientID": ["0", "3", "4", "2"],  # uuid
        "start": [
            pytz.datetime.datetime(2010, 6, 7, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2011, 3, 28, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2011, 7, 3, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2010, 11, 22, tzinfo=pytz.UTC),
        ],
        "end": [
            None,
            None,
            None,
            pytz.datetime.datetime(2011, 11, 22, tzinfo=pytz.UTC),
        ],
        "value": ["foo"] * 4,
        "category": ["exposure"] * 4,
        "groupID": [0] * 4,
        "weight": [1] * 4,
    }
    self.outcome_events = {
        "patientID": ["0", "3", "4", "2"],  # uuid
        "start": [
            pytz.datetime.datetime(2010, 6, 8, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2011, 3, 29, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2011, 7, 4, tzinfo=pytz.UTC),
            pytz.datetime.datetime(2010, 11, 23, tzinfo=pytz.UTC),
        ],
        "end": [
            None,
            None,
            None,
            pytz.datetime.datetime(2010, 11, 24, tzinfo=pytz.UTC),
        ],
        "value": ["bar"] * 4,
        "category": ["outcome"] * 4,
        "groupID": [0] * 4,
        "weight": [1] * 4,
    }
    patients_df, _ = self.create_spark_df(self.patients)
    fup_events_df, _ = self.create_spark_df(self.followup_events)
    exp_events_df, _ = self.create_spark_df(self.exposure_events)
    out_events_df, _ = self.create_spark_df(self.outcome_events)
    self.base_population = Cohort("base_population", "base_population",
                                  patients_df, None)
    self.followups = Cohort(
        "followups",
        "followups",
        fup_events_df.select("patientID").distinct(),
        fup_events_df,
    )
    self.exposures = Cohort(
        "exposures",
        "exposures",
        exp_events_df.select("patientID").distinct(),
        exp_events_df,
    )
    self.outcomes = Cohort(
        "outcomes",
        "outcomes",
        out_events_df.select("patientID").distinct(),
        out_events_df,
    )
    self.bucket_size = 30
    self.kwargs = {
        "base_population": self.base_population,
        "followups": self.followups,
        "exposures": self.exposures,
        "outcomes": self.outcomes,
        "bucket_size": self.bucket_size,
        "study_start": self.study_start,
        "study_end": self.study_end,
        "age_reference_date": self.age_reference_date,
        "age_groups": self.sorted_age_groups,
        "bucket_rounding": "ceil",
        "run_checks": False,
    }