    def test_union(self):
        patients_pd = pd.DataFrame({"patientID": [1, 2, 3]})
        events_pd = pd.DataFrame({
            "patientID": [1, 2, 3],
            "value": ["DP", "DAS", "DR"]
        })

        patients = self.spark.createDataFrame(patients_pd)

        events = self.spark.createDataFrame(events_pd)
        cohort1 = Cohort("liberal_fractures", "liberal_fractures", patients,
                         events)

        cohort2 = Cohort("hospit_fractures", "hospit_fractures", patients,
                         events)

        result = cohort1.union(cohort2)

        expected_patients = self.spark.createDataFrame(
            pd.concat([patients_pd, patients_pd]))
        expected_events = self.spark.createDataFrame(
            pd.concat([events_pd, events_pd]))
        expected = Cohort("result", "result", expected_patients,
                          expected_events)
        self.assertEqual(result, expected)
Example #2
    def test_dump_metadata(self, mock_writing):
        df, _ = self.create_spark_df({"patientID": [1, 2]})
        cohort_1 = Cohort("test", "test", df, None)
        df_events, _ = self.create_spark_df({
            "patientID": [1, 2],
            "category": ["test", "test"]
        })

        cohort_2 = Cohort("events", "events", df, df_events)

        cc = CohortCollection({"test": cohort_1, "events": cohort_2})
        # NB: sorted() over a dict yields its key list, so this only
        # compares the top-level keys of the save() metadata
        expected = sorted({
            "operations": [
                {
                    "output_type": "events",
                    "name": "events",
                    "output_path": "../../output/events/data",
                    "population_path": "../../output/events/subjects",
                },
                {
                    "output_type": "patients",
                    "output_path": "../../output/test/subjects",
                    "name": "test",
                },
            ]
        })

        result = sorted(cc.save("../../output"))
        self.assertEqual(expected, result)
    def test_difference_all(self):
        patients_1, patients_pd_1 = self.create_spark_df({"patientID": [1, 2]})
        events_1, events_pd_1 = self.create_spark_df(
            OrderedDict([("patientID", [1, 2]), ("value", ["DP", "DAS"])]))

        patients_2, patients_pd_2 = self.create_spark_df({"patientID": [1]})
        events_2, events_pd_2 = self.create_spark_df(
            OrderedDict([("patientID", [1]), ("value", ["DP"])]))

        patients_3, patients_pd_3 = self.create_spark_df({"patientID": [1, 3]})

        cohort1 = Cohort("liberal_fractures", "liberal_fractures", patients_1,
                         events_1)

        cohort2 = Cohort("hospit_fractures", "hospit_fractures", patients_2,
                         events_2)

        cohort3 = Cohort("imb_fractures", "imb_fractures", patients_3, None)

        result = Cohort.difference_all([cohort1, cohort2, cohort3])

        patients_4, _ = self.create_spark_df({"patientID": [2]})
        expected = Cohort("hospit_fractures", "hospit_fractures", patients_4,
                          None)
        self.assertEqual(result, expected)
    def test_cohort_collection_from_cohort_flow(self):
        input = """
        {
            "intermediate_operations": {
                "operation": {
                    "type": "union",
                    "name": "outcome",
                    "parents": ["liberal_fractures", "hospit_fractures"]
                }
            },
            "cohorts": [
                "extract_patients",
                "exposures",
                "filter_patients",
                "outcome"
            ]
        }
        """

        df, _ = self.create_spark_df({"patientID": [1, 2, 3]})

        cc = CohortCollection({
            "liberal_fractures":
            Cohort("liberal_fractures", "liberal_fractures", df, None),
            "hospit_fractures":
            Cohort("hospit_fractures", "hospit_fractures", df, None),
        })

        result = cohort_collection_from_cohort_flow(cc, input)

        self.assertSetEqual(
            set(result.cohorts.keys()),
            {"liberal_fractures", "hospit_fractures", "outcome"},
        )
    def test_is_duration_events(self):
        schema = StructType([
            StructField("patientID", IntegerType(), True),
            StructField("start", TimestampType(), True),
            StructField("end", TimestampType(), True),
        ])

        patients_pd = pd.DataFrame({"patientID": [1, 2, 3]})
        patients = self.spark.createDataFrame(patients_pd)

        cohort1 = Cohort("patients", "patients", patients, None)
        self.assertFalse(cohort1.is_duration_events())

        data = [(1, datetime(1993, 10, 9), datetime(1993, 10, 9))]

        events = self.spark.createDataFrame(data=data, schema=schema)

        cohort2 = Cohort("patients", "patients", patients, events)
        self.assertTrue(cohort2.is_duration_events())

        data = [(1, datetime(1993, 10, 9), None),
                (2, datetime(1993, 10, 9), None)]

        events = self.spark.createDataFrame(data=data, schema=schema)

        cohort2 = Cohort("patients", "patients", patients, events)
        self.assertFalse(cohort2.is_duration_events())
    def test_add_age_information(self):
        subjects, df = self.create_spark_df(
            {"birthDate": [datetime(1993, 10, 9),
                           datetime(1992, 3, 14)]})

        input = Cohort("liberal_fractures", "liberal_fractures", subjects,
                       None)

        input.add_age_information(datetime(2013, 1, 1))
        result = input
        expected_subjects, _ = self.create_spark_df({"age": [19, 20]})
        expected = Cohort("liberal_fractures", "liberal_fractures",
                          expected_subjects, None)
        self.assertTrue(
            data_frame_equality(result.subjects.select("age"),
                                expected.subjects.select("age")))
    def load(input: Dict) -> "CohortCollection":
        """Load a CohortCollection object from a dict."""
        operations = input["operations"]
        return CohortCollection({
            operation["name"]: Cohort.load(operation)
            for operation in operations
        })
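For context, a sketch of the dict shape `load` consumes, assuming it is exposed as a static method on CohortCollection as the signature suggests. The shape mirrors the metadata emitted by `save` in test_dump_metadata above; the paths are hypothetical and must point at existing parquet output.

# Hypothetical metadata, shaped like the output of CohortCollection.save()
# shown in test_dump_metadata above; paths must point at existing parquet.
metadata = {
    "operations": [
        {
            "name": "test",
            "output_type": "patients",
            "output_path": "../../output/test/subjects",
        },
        {
            "name": "events",
            "output_type": "events",
            "output_path": "../../output/events/data",
            "population_path": "../../output/events/subjects",
        },
    ]
}
cc = CohortCollection.load(metadata)  # builds one Cohort per operation entry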
Example #8
    def _find_inconsistent_start_end_ordering(cohort: Cohort) -> Cohort:
        events = cohort.events
        invalid_events = events.where(sf.col("start") >= sf.col("end"))
        return Cohort(
            cohort.name + "_inconsistent_w_start_end_ordering",
            "events where start >= end dates are inconsistent",
            invalid_events.select("patientID").distinct(),
            invalid_events,
        )
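A minimal sketch of the check above in action, assuming a local SparkSession and that Cohort and BaseFeatureDriver are importable from this library; the data is made up.

from datetime import datetime
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
rows = [
    (1, datetime(2012, 1, 1), datetime(2012, 6, 1)),  # start < end: valid
    (2, datetime(2012, 6, 1), datetime(2012, 1, 1)),  # start >= end: flagged
]
events = spark.createDataFrame(rows, ["patientID", "start", "end"])
cohort = Cohort("demo", "demo", events.select("patientID").distinct(), events)
invalid = BaseFeatureDriver._find_inconsistent_start_end_ordering(cohort)
invalid.events.show()  # only patientID 2's inverted event remains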
    def test_intersect(self):
        patients_1, patients_pd_1 = self.create_spark_df({"patientID": [1, 2]})
        events_1, events_pd_1 = self.create_spark_df(
            OrderedDict([("patientID", [1, 2]), ("value", ["DP", "DAS"])]))

        patients_2, patients_pd_2 = self.create_spark_df({"patientID": [1]})
        events_2, events_pd_2 = self.create_spark_df(
            OrderedDict([("patientID", [1]), ("value", ["DP"])]))

        cohort1 = Cohort("liberal_fractures", "liberal_fractures", patients_1,
                         events_1)

        cohort2 = Cohort("hospit_fractures", "hospit_fractures", patients_2,
                         events_2)

        result = cohort1.intersection(cohort2)

        expected = cohort2
        self.assertEqual(result, expected)
def plot_mean_duration_per_value(figure: Figure, cohort: Cohort) -> Figure:
    assert cohort.is_duration_events()

    df = agg_by_col(cohort.events, frozenset(["value"]), "duration",
                    "mean").sort_values("value")
    ax = figure.gca()
    ax.barh(range(len(df.value)), df["avg(duration)"].values)
    ax.set_yticks(range(len(df.value)))  # fix tick positions first
    ax.set_yticklabels(df.value.values)  # then attach one label per tick
    return figure
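A hypothetical call site for the helper above; the figure size and output path are illustrative, and `exposures` stands for any Cohort whose events carry durations.

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(8, 4))
plot_mean_duration_per_value(fig, exposures)  # exposures: a duration Cohort
fig.tight_layout()
fig.savefig("mean_duration_per_value.png")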
Example #11
    def _find_subjects_with_age_inconsistent_w_age_groups(
            self, cohort: Cohort) -> Cohort:
        """Check if min and max age_groups are consistent with subjects ages."""
        if not cohort.has_subject_information():
            raise ValueError("Cohort should have subject information.")
        duplicate = copy(cohort)
        duplicate.add_age_information(self.age_reference_date)  # starting age
        study_length = (np.ceil(
            (self.study_end - self.study_start).days / 365.25)
            if self.is_using_longitudinal_age_groups else 0)
        min_starting_age = min(self.age_groups)
        max_starting_age = max(self.age_groups) - np.ceil(study_length)
        invalid_subjects = duplicate.subjects.where(
            ~sf.col("age").between(min_starting_age, max_starting_age))
        return Cohort(
            cohort.name + "_inconsistent_w_ages_and_age_groups",
            "subjects inconsistent with age groups",
            invalid_subjects,
        )
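A numeric sketch of the bounds computed above, with hypothetical values (a ~1345-day study, longitudinal age groups enabled):

import numpy as np

age_groups = [55, 60, 65, 70, 75, 80, 85]
study_length = np.ceil(1345 / 365.25)              # ~3.68 years -> 4.0
min_starting_age = min(age_groups)                 # 55
max_starting_age = max(age_groups) - study_length  # 85 - 4 = 81.0
# Subjects aged outside [55, 81] at age_reference_date would be flagged:
# they cannot remain within the age-group grid for the whole study.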
    def get_from_description(self, description: Dict) -> Cohort:
        # TODO: this should be a method called from cohorts
        operation_type = description["type"]  # type: str
        if operation_type.lower() not in ALLOWED_OPERATIONS:
            raise KeyError(
                "{} not permitted. Only available operations: {}".format(
                    operation_type, ALLOWED_OPERATIONS))
        else:
            parents = [self.get(parent) for parent in description["parents"]]
            new_cohort = Cohort.union_all(parents)
            new_cohort.name = description["name"]
            return new_cohort
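A sketch of the description dict this method consumes, matching the "operation" entry in the JSON of test_cohort_collection_from_cohort_flow above; `cc` stands for a CohortCollection that already holds both parent cohorts.

description = {
    "type": "union",
    "name": "outcome",
    "parents": ["liberal_fractures", "hospit_fractures"],
}
outcome = cc.get_from_description(description)  # union of both parents,
# renamed to "outcome"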
    def test_save_cohort(self, mock_method):
        df, _ = self.create_spark_df({"patientID": [1, 2]})
        cohort = Cohort("test", "test", df, None)
        self.assertEqual(
            {
                "name": "test",
                "output_path": "../../output/test/subjects",
                "output_type": "patients",
            },
            cohort.save_cohort("../../output"),
        )

        df_events, _ = self.create_spark_df({
            "patientID": [1, 2],
            "category": ["test", "test"]
        })

        cohort_2 = Cohort("events", "events", df, df_events)
        self.assertEqual(
            {
                "name": "events",
                "output_path": "../../output/events/data",
                "population_path": "../../output/events/subjects",
                "output_type": "events",
            },
            cohort_2.save_cohort("../../output"),
        )
Example #14
    def _find_events_not_in_study_dates(self, cohort: Cohort) -> Cohort:
        # between returns false when col is null
        invalid_events = cohort.events.where(
            ~(sf.col("start").between(sf.lit(self.study_start),
                                      sf.lit(self.study_end))
              & sf.col("end").between(sf.lit(self.study_start),
                                      sf.lit(self.study_end))))
        return Cohort(
            cohort.name + "_inconsistent_w_study_dates",
            "events inconsistent with study dates",
            invalid_events.select("patientID").distinct(),
            invalid_events,
        )
    def test_difference(self):
        patients_1, _ = self.create_spark_df({"patientID": [1, 2]})
        events_1, _ = self.create_spark_df(
            OrderedDict([("patientID", [1, 2]), ("value", ["DP", "DAS"])]))

        patients_2, _ = self.create_spark_df({"patientID": [1]})
        events_2, _ = self.create_spark_df(
            OrderedDict([("patientID", [1]), ("value", ["DP"])]))

        cohort1 = Cohort("liberal_fractures", "liberal_fractures", patients_1,
                         events_1)

        cohort2 = Cohort("hospit_fractures", "hospit_fractures", patients_2,
                         events_2)

        result = cohort1.difference(cohort2)

        patients_3, _ = self.create_spark_df({"patientID": [2]})
        events_3, _ = self.create_spark_df(
            OrderedDict([("patientID", [2]), ("value", ["DAS"])]))
        expected = Cohort("hospit_fractures", "hospit_fractures", patients_3,
                          events_3)
        self.assertEqual(result, expected)
Example #16
    def _find_events_not_in_followup_bounds(self, cohort: Cohort) -> Cohort:
        fups = copy(self.followups)
        fups.events = rename_df_columns(fups.events, prefix="fup_")
        events = cohort.events.join(fups.events, "patientID")
        # between returns false when col is null
        invalid_events = events.where(
            ~(sf.col("start").between(sf.col("fup_start"), sf.col("fup_end"))
              & sf.col("end").between(sf.col("fup_start"), sf.col("fup_end"))))
        return Cohort(
            cohort.name + "_inconsistent_w_followup_bounds",
            "events inconsistent with followup bounds",
            invalid_events.select("patientID").distinct(),
            invalid_events,
        )
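The prefix-rename before the join above avoids ambiguous start/end columns. A self-contained sketch of the same pattern in plain PySpark (the session, data, and the fup_ prefix are illustrative):

from pyspark.sql import SparkSession, functions as sf

spark = SparkSession.builder.master("local[1]").getOrCreate()
fup = spark.createDataFrame([(1, "2010-06-05", "2013-10-12")],
                            ["patientID", "start", "end"])
ev = spark.createDataFrame([(1, "2009-01-01", "2010-07-01")],
                           ["patientID", "start", "end"])
# prefix the follow-up columns so they survive the join unambiguously
fup_prefixed = fup.select(
    "patientID",
    *[sf.col(c).alias("fup_" + c) for c in fup.columns if c != "patientID"])
joined = ev.join(fup_prefixed, "patientID")
outside = joined.where(
    ~(sf.col("start").between(sf.col("fup_start"), sf.col("fup_end"))
      & sf.col("end").between(sf.col("fup_start"), sf.col("fup_end"))))
outside.show()  # the event starting 2009-01-01 lies outside the follow-up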
def plot_duration_distribution_per_month_as_bar(figure: Figure,
                                                cohort: Cohort) -> Figure:
    assert cohort.is_duration_events()

    df = event_duration_agg(cohort, "count").sort_values("duration")
    df.duration = np.ceil(df.duration / 30)
    df.duration = df.duration.astype("int32")
    df = df.groupby("duration").sum().reset_index()
    ax = figure.gca()
    ax.bar(range(len(df)), df["count(1)"].values)
    ax.set_xticks(range(len(df)))  # fix tick positions first
    ax.set_xticklabels(df.duration.values)  # then attach one label per tick

    return figure
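The 30-day bucketing above, in isolation: durations in days are rounded up to whole months before the counts are summed. A minimal pandas sketch with made-up numbers:

import numpy as np
import pandas as pd

df = pd.DataFrame({"duration": [1, 29, 30, 31, 65],
                   "count(1)": [5, 2, 4, 1, 3]})
df.duration = np.ceil(df.duration / 30).astype("int32")
print(df.groupby("duration").sum().reset_index())
# month 1 collects durations 1, 29 and 30 (count 11); 31 -> month 2; 65 -> month 3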
Example #18
    def _find_subjects_with_many_outcomes(cohort: Cohort) -> Cohort:
        subjects_w_many_outcomes = (
            cohort.events.groupby("patientId").count().where(
                sf.col("count") > 1).select("patientId").drop_duplicates())
        # keep only the events of subjects that have more than one outcome
        invalid_events = cohort.events.join(subjects_w_many_outcomes,
                                            "patientId").sort("patientId")

        return Cohort(
            cohort.name + "_inconsistent_w_single_outcome_constraint",
            "events showing there are more than one outcome per patient",
            subjects_w_many_outcomes,
            invalid_events,
        )
Example #19
    def test_eq(self):
        df, _ = self.create_spark_df({"patientID": [1, 2]})
        cohort_1 = Cohort("test", "test", df, None)
        df_events, _ = self.create_spark_df({
            "patientID": [1, 2],
            "category": ["test", "test"]
        })

        cohort_2 = Cohort("events", "events", df, df_events)

        cc1 = CohortCollection({"test": cohort_1, "events": cohort_2})
        cc2 = CohortCollection({"test": cohort_1, "events": cohort_2})
        self.assertEqual(cc1, cc2)

        cc3 = CohortCollection({"test1": cohort_1, "events": cohort_2})
        self.assertNotEqual(cc1, cc3)

        df, _ = self.create_spark_df({"patientID": [1, 45]})
        cohort_3 = Cohort("test", "test", df, None)
        cc4 = CohortCollection({"test": cohort_3, "events": cohort_2})

        self.assertNotEqual(cc1, cc4)

        self.assertNotEqual(cc1, df)
def plot_duration_distribution_per_day_as_line(figure: Figure,
                                               cohort: Cohort) -> Figure:
    assert cohort.is_duration_events()

    df = event_duration_agg(cohort, "count").sort_values("duration")
    ax = figure.gca()
    ax.plot(df.duration, df["count(1)"])
    ax.set_yscale("log")

    major = IndexLocator(365, +0.0)
    minor = IndexLocator(30, +0.0)
    ax.xaxis.set_minor_locator(minor)
    ax.xaxis.set_major_locator(major)
    ax.grid(True, which="major", axis="x")
    return figure
    def test_union_all(self):
        patients_pd = pd.DataFrame({"patientID": [1, 2, 3]})
        events_pd = pd.DataFrame({
            "patientID": [1, 2, 3],
            "value": ["DP", "DAS", "DR"]
        })

        patients = self.spark.createDataFrame(patients_pd)

        events = self.spark.createDataFrame(events_pd)
        cohort1 = Cohort("liberal_fractures", "liberal_fractures", patients,
                         events)

        cohort2 = Cohort("hospit_fractures", "hospit_fractures", patients,
                         events)

        cohort3 = Cohort("fractures", "fractures", patients, None)

        result = Cohort.union_all([cohort1, cohort2, cohort3])

        expected_patients = self.spark.createDataFrame(
            pd.concat([patients_pd] * 3))
        expected = Cohort("result", "result", expected_patients, None)
        self.assertEqual(result, expected)
Example #22
    def test_compute_longitudinal_age_groups(self):
        kwargs = copy(self.kwargs)
        kwargs["bucket_size"] = 365
        loader = ConvSccsFeatureDriver(**kwargs)

        with self.assertRaises(AssertionError) as context:
            bad_cohort = Cohort(
                "base_population",
                "base_population",
                self.base_population.subjects.select("patientID"),
                None,
            )
            _ = loader._compute_longitudinal_age_groups(bad_cohort,
                                                        col_offset=int(2))
        self.assertTrue(
            "Cohort subjects should have gender and birthdate information" in
            str(context.exception))

        features, mapping = loader._compute_longitudinal_age_groups(
            self.base_population, col_offset=int(2))
        expected_mapping = [
            "[55.0, 60.0)",
            "[60.0, 65.0)",
            "[65.0, 70.0)",
            "[70.0, 75.0)",
            "[75.0, 80.0)",
            "[80.0, 85.0)",
        ]
        expected_data = np.array([
            [3, 1, 6],
            [0, 0, 6],
            [0, 1, 6],
            [0, 2, 6],
            [0, 3, 7],
            [4, 1, 5],
            [4, 2, 6],
            [2, 0, 4],
            [2, 1, 5],
            [2, 2, 5],
            [2, 3, 5],
        ]).astype("int")
        self.assertListEqual(mapping, expected_mapping)
        np.testing.assert_array_equal(features.toPandas().values.astype("int"),
                                      expected_data)
Example #23
    def test_find_subjects_with_many_outcomes(self):
        invalid_events = {
            "patientID": ["0", "0", "1", "1", "2"],  # uuid
            "start": [
                pytz.datetime.datetime(1934, 7, 27, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 7, 27, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2017, 7, 27, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 7, 27, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2011, 7, 27, tzinfo=pytz.UTC),
            ],
            "end": [
                pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2006, 6, 20, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2014, 6, 20, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 7, 27, tzinfo=pytz.UTC),
            ],
            "value": [0, 1, 2, 3, 4],
        }

        invalid_df, _ = self.create_spark_df(invalid_events)
        invalid_cohort = Cohort("some_cohort", "Some cohort",
                                invalid_df.select("patientID"), invalid_df)

        loader = ConvSccsFeatureDriver(**self.kwargs)
        invalid = loader._find_subjects_with_many_outcomes(invalid_cohort)

        self.assertEqual(
            invalid.name,
            "some_cohort_inconsistent_w_single_outcome_constraint")
        self.assertEqual(
            invalid.describe(),
            "Events are some_cohort_inconsistent_w_single_outcome_constraint. Events "
            "contain only events showing there are more than one outcome per patient.",
        )
        self.assertListEqual(
            sorted(invalid.subjects.toPandas().values.ravel().tolist()),
            sorted(["0", "1"]),
        )
        self.assertListEqual(
            sorted(invalid.events.toPandas().value.values.ravel().tolist()),
            sorted([0, 1, 2, 3]),
        )
Example #24
    def test_find_events_not_in_study_dates(self):
        invalid_events = {
            "patientID": ["0", "1", "2", "2", "3"],  # uuid
            "start": [
                pytz.datetime.datetime(1934, 7, 27, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 7, 27, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2017, 7, 27, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 7, 27, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2011, 7, 27, tzinfo=pytz.UTC),
            ],
            "end": [
                pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2006, 6, 20, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2014, 6, 20, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 7, 27, tzinfo=pytz.UTC),
            ],
            "value": [0, 1, 2, 3, 4],
        }

        invalid_df, _ = self.create_spark_df(invalid_events)
        invalid_cohort = Cohort("some_cohort", "Some cohort",
                                invalid_df.select("patientID"), invalid_df)

        loader = BaseFeatureDriver(**self.kwargs)
        invalid = loader._find_events_not_in_study_dates(invalid_cohort)

        self.assertEqual(invalid.name,
                         "some_cohort_inconsistent_w_study_dates")
        self.assertEqual(
            invalid.describe(),
            "Events are some_cohort_inconsistent_w_study_dates. Events contain only "
            "events inconsistent with study dates.",
        )
        self.assertListEqual(
            sorted(invalid.subjects.toPandas().values.ravel().tolist()),
            sorted(["0", "1", "2"]),
        )
        self.assertListEqual(
            sorted(invalid.events.toPandas().value.values.ravel().tolist()),
            sorted([0, 1, 2, 3]),
        )
Example #25
    def test_find_events_not_in_followup_bounds(self):
        data = {
            "patientID": ["0", "6", "2", "3", "5"],  # uuid
            "start": [
                pytz.datetime.datetime(2011, 7, 1, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 7, 27, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2017, 7, 27, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2011, 4, 12, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2006, 7, 27, tzinfo=pytz.UTC),
            ],
            "end": [
                pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2006, 6, 20, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2014, 6, 20, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2015, 7, 27, tzinfo=pytz.UTC),
            ],
            "value": [0, 1, 2, 3, 4],
        }

        df, _ = self.create_spark_df(data)
        cohort = Cohort("some_cohort", "Some cohort",
                        df.select("patientID").distinct(), df)
        loader = BaseFeatureDriver(**self.kwargs)
        invalid = loader._find_events_not_in_followup_bounds(cohort)
        self.assertEqual(invalid.name,
                         "some_cohort_inconsistent_w_followup_bounds")
        self.assertEqual(
            invalid.describe(),
            "Events are some_cohort_inconsistent_w_followup_bounds. Events contain "
            "only events inconsistent with followup bounds.",
        )
        self.assertListEqual(
            sorted(invalid.subjects.toPandas().values.ravel().tolist()),
            sorted(["2", "3"]),
        )
        self.assertListEqual(
            sorted(invalid.events.toPandas().value.values.ravel().tolist()),
            sorted([2, 3]),
        )
Example #26
    def test_find_inconsistent_start_end_ordering(self):
        events = {
            "patientID": ["0", "1", "2", "2"],  # uuid
            "start": [
                pytz.datetime.datetime(2011, 7, 2, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 9, 30, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2011, 7, 2, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 9, 30, tzinfo=pytz.UTC),
            ],
            "end": [
                pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2013, 6, 20, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 10, 12, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2011, 6, 20, tzinfo=pytz.UTC),
            ],
            "value": [0, 1, 2, 3],
        }

        df, _ = self.create_spark_df(events)
        cohort = Cohort("some_cohort", "Some cohort",
                        df.select("patientID").distinct(), df)

        invalid = BaseFeatureDriver._find_inconsistent_start_end_ordering(
            cohort)

        self.assertEqual(invalid.name,
                         "some_cohort_inconsistent_w_start_end_ordering")
        self.assertEqual(
            invalid.describe(),
            "Events are some_cohort_inconsistent_w_start_end_ordering. Events contain "
            "only events where start >= end dates are inconsistent.",
        )
        self.assertListEqual(
            sorted(invalid.subjects.toPandas().values.ravel().tolist()),
            sorted(["2"]))
        self.assertListEqual(
            sorted(invalid.events.toPandas().value.values.ravel().tolist()),
            sorted([3]))
Example #27
    def test_properties_final_cohort(self):
        loader = ConvSccsFeatureDriver(**self.kwargs)
        with self.assertRaises(PermissionError) as context:
            loader.final_cohort = "some value"
        self.assertTrue(
            "final_cohort should not be set manually,"
            "it is computed from initial cohorts." in str(context.exception))

        with self.assertRaises(AssertionError) as context:
            patients_wo_events, _ = self.create_spark_df(self.patients)
            patients_wo_events = patients_wo_events.select(
                (sf.col("patientID") + 1000).alias("patientID"),
                sf.col("gender"),
                sf.col("birthDate"),
                sf.col("deathDate"),
            )
            loader.base_population = Cohort("base_population",
                                            "base_population",
                                            patients_wo_events, None)
            loader.final_cohort.subjects.count()
        self.assertTrue("Final cohort is empty, please check that "
                        "the intersection of the provided cohorts "
                        "is nonempty" in str(context.exception))
    def test_has_subject_information(self):
        patients_1, _ = self.create_spark_df({"patientID": [1, 2]})
        cohort1 = Cohort("liberal_fractures", "liberal_fractures", patients_1)
        patients_2, _ = self.create_spark_df({
            "patientID": [1, 2],
            "gender": [1, 1],
            "birthDate": [
                pd.to_datetime("1993-10-09"),
                pd.to_datetime("1992-03-14"),
            ],
            "deathDate": [
                pd.to_datetime("1993-10-09"),
                pd.to_datetime("1992-03-14"),
            ],
        })

        cohort2 = Cohort("liberal_fractures", "liberal_fractures", patients_2,
                         None)

        self.assertFalse(cohort1.has_subject_information())
        self.assertTrue(cohort2.has_subject_information())
Example #29
    def test_properties_outcomes(self):
        loader = ConvSccsFeatureDriver(**self.kwargs)
        outcomes_ = loader.outcomes
        self.assertTrue(
            data_frame_equality(outcomes_.subjects, self.outcomes.subjects))
        self.assertTrue(
            data_frame_equality(outcomes_.events, self.outcomes.events))

        loader_ = ConvSccsFeatureDriver(**self.kwargs)
        loader_.run_checks = True

        bad_outcomes_df, _ = self.create_spark_df({
            "patientID": ["0", "4"],  # uuid
            "start": [
                pytz.datetime.datetime(2010, 6, 8, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2011, 3, 29, tzinfo=pytz.UTC),
            ],
            "end":
            [None, pytz.datetime.datetime(2010, 11, 24, tzinfo=pytz.UTC)],
            "value": ["bar", "baz"],
            "category": ["outcome"] * 2,
            "groupID": [0] * 2,
            "weight": [1] * 2,
        })
        bad_outcomes_cohort = Cohort(
            "", "",
            bad_outcomes_df.select("patientID").distinct(), bad_outcomes_df)

        with self.assertRaises(AssertionError) as context:
            loader_.outcomes = bad_outcomes_cohort

        self.assertTrue(
            "There are more than one type of outcomes, check the 'value' field of "
            "outcomes cohort events." in str(context.exception))

        mock_dataframe = MagicMock()
        mock_dataframe.take = lambda x: True
        mock_cohort = MagicMock()
        mock_cohort.subjects = mock_dataframe
        mock_cohort.events = mock_dataframe
        mock_empty_df = MagicMock()
        mock_empty_df.take = lambda x: []
        mock_empty_cohort = MagicMock()  # instantiate; a bare MagicMock would patch the class
        mock_empty_cohort.subjects = mock_empty_df
        mock_empty_cohort.events = mock_empty_df
        with patch.object(
                ConvSccsFeatureDriver,
                "_log_invalid_events_cohort",
                return_value="Ooops, error here!",
        ) as mock_log_invalid:
            with patch.object(
                    ConvSccsFeatureDriver,
                    "_find_events_not_in_followup_bounds",
                    return_value=mock_cohort,
            ) as mock_find_events_outcome_bounds:
                loader = ConvSccsFeatureDriver(**self.kwargs)
                loader.run_checks = True
                with self.assertRaises(ValueError) as context:
                    loader.outcomes = self.outcomes
                mock_find_events_outcome_bounds.assert_called_once_with(
                    self.outcomes)
                mock_log_invalid.assert_called_once_with(
                    mock_cohort, log_invalid_events=True)
                self.assertTrue("Ooops, error here!" == str(context.exception))
        with patch.object(
                ConvSccsFeatureDriver,
                "_log_invalid_events_cohort",
                return_value="Ooops, error here!",
        ) as mock_log_invalid:
            with patch.object(
                    ConvSccsFeatureDriver,
                    "_find_events_not_in_followup_bounds",
                    return_value=mock_empty_cohort,
            ) as mock_did_not_find_outcome_bounds:
                with patch.object(
                        ConvSccsFeatureDriver,
                        "_find_subjects_with_many_outcomes",
                        return_value=mock_cohort,
                ) as mock_find_many_outcomes:
                    loader = ConvSccsFeatureDriver(**self.kwargs)
                    loader.run_checks = True
                    with self.assertRaises(ValueError) as context:
                        loader.outcomes = self.outcomes
                    mock_did_not_find_outcome_bounds.assert_called_once_with(
                        self.outcomes)
                    mock_find_many_outcomes.assert_called_once_with(
                        self.outcomes)
                    mock_log_invalid.assert_called_once_with(
                        mock_cohort, log_invalid_subjects=True)
                    self.assertTrue(
                        "Ooops, error here!" == str(context.exception))
Example #30
    def setUp(self):
        super().setUp()
        self.study_start = pytz.datetime.datetime(2010, 2, 5, tzinfo=pytz.UTC)
        self.study_end = pytz.datetime.datetime(2013, 10, 12, tzinfo=pytz.UTC)
        self.age_reference_date = pytz.datetime.datetime(2011,
                                                         9,
                                                         21,
                                                         tzinfo=pytz.UTC)
        self.age_groups = [55, 65, 60, 75, 70, 80, 85]
        self.sorted_age_groups = sorted(self.age_groups)

        self.patients = {
            "patientID": ["0", "1", "2", "3", "4"],  # uuid
            "gender": [1, 2, 2, 2, 1],  # in {1, 2}
            "birthDate": [
                pytz.datetime.datetime(1934, 7, 27, tzinfo=pytz.UTC),
                pytz.datetime.datetime(1951, 5, 1, tzinfo=pytz.UTC),
                pytz.datetime.datetime(1942, 1, 12, tzinfo=pytz.UTC),
                pytz.datetime.datetime(1933, 10, 3, tzinfo=pytz.UTC),
                pytz.datetime.datetime(1937, 12, 31, tzinfo=pytz.UTC),
            ],
            "deathDate": [
                None,
                None,
                None,
                pytz.datetime.datetime(2011, 6, 20, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 12, 10, tzinfo=pytz.UTC),
            ],  # can be null
        }

        self.followup_events = {
            "patientID": ["0", "3", "4", "2"],  # uuid
            "start": [
                pytz.datetime.datetime(2010, 6, 5, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2011, 3, 27, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2011, 7, 2, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2010, 11, 21, tzinfo=pytz.UTC),
            ],
            "end": [
                pytz.datetime.datetime(2013, 10, 12, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2011, 6, 20, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2012, 7, 3, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2013, 10, 12, tzinfo=pytz.UTC),
            ],
            "endReason":
            ["ObservationEnd", "Trackloss", "Death", "ObservationEnd"],
        }

        self.exposure_events = {
            "patientID": ["0", "3", "4", "2"],  # uuid
            "start": [
                pytz.datetime.datetime(2010, 6, 7, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2011, 3, 28, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2011, 7, 3, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2010, 11, 22, tzinfo=pytz.UTC),
            ],
            "end": [
                None,
                None,
                None,
                pytz.datetime.datetime(2011, 11, 22, tzinfo=pytz.UTC),
            ],
            "value": ["foo"] * 4,
            "category": ["exposure"] * 4,
            "groupID": [0] * 4,
            "weight": [1] * 4,
        }

        self.outcome_events = {
            "patientID": ["0", "3", "4", "2"],  # uuid
            "start": [
                pytz.datetime.datetime(2010, 6, 8, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2011, 3, 29, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2011, 7, 4, tzinfo=pytz.UTC),
                pytz.datetime.datetime(2010, 11, 23, tzinfo=pytz.UTC),
            ],
            "end": [
                None,
                None,
                None,
                pytz.datetime.datetime(2010, 11, 24, tzinfo=pytz.UTC),
            ],
            "value": ["bar"] * 4,
            "category": ["outcome"] * 4,
            "groupID": [0] * 4,
            "weight": [1] * 4,
        }

        patients_df, _ = self.create_spark_df(self.patients)
        fup_events_df, _ = self.create_spark_df(self.followup_events)
        exp_events_df, _ = self.create_spark_df(self.exposure_events)
        out_events_df, _ = self.create_spark_df(self.outcome_events)

        self.base_population = Cohort("base_population", "base_population",
                                      patients_df, None)

        self.followups = Cohort(
            "followups",
            "followups",
            fup_events_df.select("patientID").distinct(),
            fup_events_df,
        )

        self.exposures = Cohort(
            "exposures",
            "exposures",
            exp_events_df.select("patientID").distinct(),
            exp_events_df,
        )

        self.outcomes = Cohort(
            "outcomes",
            "outcomes",
            out_events_df.select("patientID").distinct(),
            out_events_df,
        )

        self.bucket_size = 30

        self.kwargs = {
            "base_population": self.base_population,
            "followups": self.followups,
            "exposures": self.exposures,
            "outcomes": self.outcomes,
            "bucket_size": self.bucket_size,
            "study_start": self.study_start,
            "study_end": self.study_end,
            "age_reference_date": self.age_reference_date,
            "age_groups": self.sorted_age_groups,
            "bucket_rounding": "ceil",
            "run_checks": False,
        }