Example #1
# Assumed imports (module paths may vary with the installed ExeTera version).
from exetera.core.session import Session
from exetera.core import operations as ops

def first_test_date_per_patient(session: Session,
                                patient_table,
                                test_table,
                                test_date_name,
                                dest_patient_table,
                                dest_patient_name):
    """
    Find the first test date for each patient id and write it to the destination table.

    :param session: The Exetera session instance.
    :param patient_table: The patient dataframe.
    :param test_table: The tests dataframe.
    :param test_date_name: The name of the test date field; unused by the current implementation.
    :param dest_patient_table: The destination dataframe to store the results.
    :param dest_patient_name: The name of the destination field to store the results.
    """

    pid = 'id'
    pids = session.get(patient_table[pid])
    pids_ = pids.data[:]
    if not ops.is_ordered(pids_):
        raise ValueError("The patient table must be ordered by '{}'".format(pid))

    t_pid = 'patient_id'
    t_pids = session.get(test_table[t_pid])
    t_pids_ = t_pids.data[:]
    if not ops.is_ordered(t_pids_):
        raise ValueError("The test table must be ordered by '{}'".format(t_pid))

    # collapse the test data by patient_id and take the first 'created_at' per patient
    dates = session.get(test_table['created_at'])
    spans_ = session.get_spans(t_pids_)
    s_t_pids_ = session.apply_spans_first(spans_, t_pids_)
    first_dates_ = session.apply_spans_first(spans_, dates)

    # merge the per-patient first dates from the test table into the patient table
    dest = session.create_numeric(dest_patient_table, dest_patient_name, 'int32')
    session.ordered_merge_left(left_on=pids_, right_on=s_t_pids_, right_field_sources=(first_dates_,),
                               left_field_sinks=(dest,), left_unique=True, right_unique=True)
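
A minimal usage sketch for the function above (the file path, group names and destination field name are illustrative; both tables are assumed to already be sorted by 'id' and 'patient_id' respectively):

import h5py
from exetera.core.session import Session

with h5py.File('dataset.hdf5', 'r+') as hf:  # hypothetical dataset path
    s = Session()
    first_test_date_per_patient(s,
                                hf['patients'],     # sorted by 'id'
                                hf['tests'],        # sorted by 'patient_id'
                                'created_at',       # unused by the current implementation
                                hf['patients'],     # write results back onto the patient table
                                'first_test_date')  # hypothetical destination field name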
Example #2
# Assumed imports (module paths are illustrative and may differ between ExeTera versions).
import time
import numpy as np
import h5py
from exetera.core.session import Session
from exetera.core import fields, persistence, utils
from exetera.core import readerwriter as rw
# `log` and the *_v1 processing functions (calculate_age_from_year_of_birth_v1,
# weight_height_bmi_v1, combined_hcw_with_contact_v1, validate_temperature_v1,
# check_inconsistent_symptoms_v1) are imported from the project's own algorithm modules.

def postprocess(dataset, destination, timestamp=None, flags=None):
    """
    Sort and post-process the patients, assessments, tests and diet tables from
    `dataset`, writing cleaned and derived fields into `destination`. Passing
    'daily' in `flags` additionally builds per-day aggregated assessments.
    """

    if flags is None:
        flags = set()

    do_daily_asmts = 'daily' in flags
    has_patients = 'patients' in dataset.keys()
    has_assessments = 'assessments' in dataset.keys()
    has_tests = 'tests' in dataset.keys()
    has_diet = 'diet' in dataset.keys()

    # Placeholder toggles; replace with checks against `flags` to enable/disable stages.
    sort_enabled = lambda x: True
    process_enabled = lambda x: True

    sort_patients = sort_enabled(flags) and True
    sort_assessments = sort_enabled(flags) and True
    sort_tests = sort_enabled(flags) and True
    sort_diet = sort_enabled(flags) and True

    make_assessment_patient_id_fkey = process_enabled(flags) and True
    year_from_age = process_enabled(flags) and True
    clean_weight_height_bmi = process_enabled(flags) and True
    health_worker_with_contact = process_enabled(flags) and True
    clean_temperatures = process_enabled(flags) and True
    check_symptoms = process_enabled(flags) and True
    create_daily = process_enabled(flags) and do_daily_asmts
    make_patient_level_assessment_metrics = process_enabled(flags) and True
    make_patient_level_daily_assessment_metrics = process_enabled(
        flags) and do_daily_asmts
    make_new_test_level_metrics = process_enabled(flags) and True
    make_diet_level_metrics = True
    make_healthy_diet_index = True

    # ds = DataStore(timestamp=timestamp)
    s = Session()

    # patients ================================================================

    sorted_patients_src = None

    if has_patients:
        patients_src = dataset['patients']

        write_mode = 'write'

        if 'patients' not in destination.keys():
            patients_dest = s.get_or_create_group(destination, 'patients')
            sorted_patients_src = patients_dest

            # Patient sort
            # ============
            if sort_patients:
                duplicate_filter = \
                    persistence.filter_duplicate_fields(s.get(patients_src['id']).data[:])

                for k in patients_src.keys():
                    t0 = time.time()
                    r = s.get(patients_src[k])
                    w = r.create_like(patients_dest, k)
                    s.apply_filter(duplicate_filter, r, w)
                    print(f"'{k}' filtered in {time.time() - t0}s")

                print(np.count_nonzero(duplicate_filter),
                      np.count_nonzero(~duplicate_filter))
                sort_keys = ('id', )
                s.sort_on(patients_dest,
                          patients_dest,
                          sort_keys,
                          write_mode='overwrite')

            # Patient processing
            # ==================
            if year_from_age:
                log("year of birth -> age; 18 to 90 filter")
                t0 = time.time()
                yobs = s.get(patients_dest['year_of_birth'])
                yob_filter = s.get(patients_dest['year_of_birth_valid'])
                age = s.create_numeric(patients_dest, 'age', 'uint32')
                age_filter = s.create_numeric(patients_dest, 'age_filter',
                                              'bool')
                age_16_to_90 = s.create_numeric(patients_dest,
                                                '16_to_90_years', 'bool')
                print('year_of_birth:', patients_dest['year_of_birth'])
                for k in patients_dest['year_of_birth'].attrs.keys():
                    print(k, patients_dest['year_of_birth'].attrs[k])
                calculate_age_from_year_of_birth_v1(yobs, yob_filter, 16, 90,
                                                    age, age_filter,
                                                    age_16_to_90, 2020)
                log(f"completed in {time.time() - t0}")

                print('age_filter count:',
                      np.sum(patients_dest['age_filter']['values'][:]))
                print('16_to_90_years count:',
                      np.sum(patients_dest['16_to_90_years']['values'][:]))

            if clean_weight_height_bmi:
                log("height / weight / bmi; standard range filters")
                t0 = time.time()

                weights_clean = s.create_numeric(patients_dest,
                                                 'weight_kg_clean', 'float32')
                weights_filter = s.create_numeric(patients_dest,
                                                  '40_to_200_kg', 'bool')
                heights_clean = s.create_numeric(patients_dest,
                                                 'height_cm_clean', 'float32')
                heights_filter = s.create_numeric(patients_dest,
                                                  '110_to_220_cm', 'bool')
                bmis_clean = s.create_numeric(patients_dest, 'bmi_clean',
                                              'float32')
                bmis_filter = s.create_numeric(patients_dest, '15_to_55_bmi',
                                               'bool')

                weight_height_bmi_v1(s, 40, 200, 110, 220, 15, 55, None, None,
                                     None, None, patients_dest['weight_kg'],
                                     patients_dest['weight_kg_valid'],
                                     patients_dest['height_cm'],
                                     patients_dest['height_cm_valid'],
                                     patients_dest['bmi'],
                                     patients_dest['bmi_valid'], weights_clean,
                                     weights_filter, None, heights_clean,
                                     heights_filter, None, bmis_clean,
                                     bmis_filter, None)
                log(f"completed in {time.time() - t0}")

            if health_worker_with_contact:
                with utils.Timer("health_worker_with_contact field"):
                    #writer = ds.get_categorical_writer(patients_dest, 'health_worker_with_contact', 'int8')
                    combined_hcw_with_contact_v1(
                        s, s.get(patients_dest['healthcare_professional']),
                        s.get(patients_dest['contact_health_worker']),
                        s.get(patients_dest['is_carer_for_community']),
                        patients_dest, 'health_worker_with_contact')

    # assessments =============================================================

    sorted_assessments_src = None
    if has_assessments:
        assessments_src = dataset['assessments']
        if 'assessments' not in destination.keys():
            assessments_dest = s.get_or_create_group(destination,
                                                     'assessments')
            sorted_assessments_src = assessments_dest

            if sort_assessments:
                sort_keys = ('patient_id', 'created_at')
                with utils.Timer("sorting assessments"):
                    s.sort_on(assessments_src, assessments_dest, sort_keys)

            if has_patients:
                if make_assessment_patient_id_fkey:
                    print(
                        "creating 'assessment_patient_id_fkey' foreign key index for 'patient_id'"
                    )
                    t0 = time.time()
                    patient_ids = s.get(sorted_patients_src['id'])
                    assessment_patient_ids =\
                        s.get(sorted_assessments_src['patient_id'])
                    assessment_patient_id_fkey =\
                        s.create_numeric(assessments_dest, 'assessment_patient_id_fkey', 'int64')
                    s.get_index(patient_ids.data[:],
                                assessment_patient_ids.data[:],
                                assessment_patient_id_fkey)
                    print(f"completed in {time.time() - t0}s")

            if clean_temperatures:
                print("clean temperatures")
                t0 = time.time()
                temps = s.get(sorted_assessments_src['temperature'])
                temp_units = s.get(sorted_assessments_src['temperature_unit'])
                temps_valid = s.get(
                    sorted_assessments_src['temperature_valid'])
                dest_temps = temps.create_like(assessments_dest,
                                               'temperature_c_clean')
                dest_temps_valid = temps_valid.create_like(
                    assessments_dest, 'temperature_35_to_42_inclusive')
                dest_temps_modified = temps_valid.create_like(
                    assessments_dest, 'temperature_modified')
                validate_temperature_v1(s, 35.0, 42.0, temps, temp_units,
                                        temps_valid, dest_temps,
                                        dest_temps_valid, dest_temps_modified)
                print(f"temperature cleaning done in {time.time() - t0}")

            if check_symptoms:
                print('check inconsistent symptoms')
                t0 = time.time()
                check_inconsistent_symptoms_v1(s, sorted_assessments_src,
                                               assessments_dest)
                print(time.time() - t0)

    # tests ===================================================================

    if has_tests:
        if sort_tests:
            tests_src = dataset['tests']
            tests_dest = s.get_or_create_group(destination, 'tests')
            sort_keys = ('patient_id', 'created_at')
            s.sort_on(tests_src, tests_dest, sort_keys)

    # diet ====================================================================

    if has_diet:
        diet_src = dataset['diet']
        if 'diet' not in destination.keys():
            diet_dest = s.get_or_create_group(destination, 'diet')
            sorted_diet_src = diet_dest
            if sort_diet:
                sort_keys = ('patient_id', 'display_name', 'id')
                s.sort_on(diet_src, diet_dest, sort_keys)

    if has_assessments:
        if do_daily_asmts:
            daily_assessments_dest = s.get_or_create_group(
                destination, 'daily_assessments')

    # post process patients
    # TODO: needs a transaction table

    if has_patients:
        print(patients_src.keys())
    if has_assessments:
        print(dataset['assessments'].keys())
    if has_tests:
        print(dataset['tests'].keys())

    # write_mode = 'overwrite'
    write_mode = 'write'

    # Daily assessments
    # =================

    if has_assessments:
        if create_daily:
            print("generate daily assessments")
            patient_ids = s.get(sorted_assessments_src['patient_id'])
            created_at_days = s.get(sorted_assessments_src['created_at_day'])
            raw_created_at_days = created_at_days.data[:]

            if 'assessment_patient_id_fkey' in assessments_src.keys():
                patient_id_index = assessments_src[
                    'assessment_patient_id_fkey']
            else:
                patient_id_index = assessments_dest[
                    'assessment_patient_id_fkey']
            patient_id_indices = s.get(patient_id_index)
            raw_patient_id_indices = patient_id_indices.data[:]

            print("Calculating patient id index spans")
            t0 = time.time()
            patient_id_index_spans = s.get_spans(
                fields=(raw_patient_id_indices, raw_created_at_days))
            print(
                f"Calculated {len(patient_id_index_spans)-1} spans in {time.time() - t0}s"
            )

            print("Applying spans to 'health_status'")
            t0 = time.time()
            default_behaviour_overrides = {
                'id': s.apply_spans_last,
                'patient_id': s.apply_spans_last,
                'patient_index': s.apply_spans_last,
                'created_at': s.apply_spans_last,
                'created_at_day': s.apply_spans_last,
                'updated_at': s.apply_spans_last,
                'updated_at_day': s.apply_spans_last,
                'version': s.apply_spans_max,
                'country_code': s.apply_spans_first,
                'date_test_occurred': None,
                'date_test_occurred_guess': None,
                'date_test_occurred_day': None,
                'date_test_occurred_set': None,
            }
            for k in sorted_assessments_src.keys():
                t1 = time.time()
                reader = s.get(sorted_assessments_src[k])
                if k in default_behaviour_overrides:
                    apply_span_fn = default_behaviour_overrides[k]
                    if apply_span_fn is not None:
                        apply_span_fn(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    else:
                        print(f"  Skipping field {k}")
                else:
                    if isinstance(reader, fields.CategoricalField):
                        s.apply_spans_max(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    elif isinstance(reader, rw.IndexedStringReader):
                        s.apply_spans_concat(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    elif isinstance(reader, rw.NumericReader):
                        s.apply_spans_max(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    else:
                        print(f"  No function for {k}")

            print(f"apply_spans completed in {time.time() - t0}s")

    if has_patients and has_assessments:
        if make_patient_level_assessment_metrics:
            if 'assessment_patient_id_fkey' in assessments_dest:
                src = assessments_dest['assessment_patient_id_fkey']
            else:
                src = assessments_src['assessment_patient_id_fkey']
            assessment_patient_id_fkey = s.get(src)
            # generate spans from the assessment-space patient_id foreign key
            spans = s.get_spans(field=assessment_patient_id_fkey.data[:])

            ids = s.get(patients_dest['id'])

            print('calculate assessment counts per patient')
            t0 = time.time()
            writer = s.create_numeric(patients_dest, 'assessment_count',
                                      'uint32')
            aggregated_counts = s.apply_spans_count(spans)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated assessment counts per patient in {time.time() - t0}"
            )

            print('calculate first assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest,
                                           'first_assessment_day', 10)
            aggregated_counts = s.apply_spans_first(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated first assessment days per patient in {time.time() - t0}"
            )

            print('calculate last assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest,
                                           'last_assessment_day', 10)
            aggregated_counts = s.apply_spans_last(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated last assessment days per patient in {time.time() - t0}"
            )

            print('calculate maximum assessment test result per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['tested_covid_positive'])
            writer = reader.create_like(patients_dest,
                                        'max_assessment_test_result')
            max_result_value = s.apply_spans_max(spans, reader)
            s.join(ids, assessment_patient_id_fkey, max_result_value, writer,
                   spans)
            print(
                f"calculated maximum assessment test result in {time.time() - t0}"
            )

    if has_assessments and do_daily_asmts and make_patient_level_daily_assessment_metrics:
        print(
            "creating 'daily_assessment_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        daily_assessment_patient_ids =\
            s.get(daily_assessments_dest['patient_id'])
        daily_assessment_patient_id_fkey =\
            s.create_numeric(daily_assessments_dest, 'daily_assessment_patient_id_fkey', 'int64')
        s.get_index(patient_ids, daily_assessment_patient_ids,
                    daily_assessment_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")

        spans = s.get_spans(field=s.get(
            daily_assessments_dest['daily_assessment_patient_id_fkey']))

        print('calculate daily assessment counts per patient')
        t0 = time.time()
        writer = s.create_numeric(patients_dest, 'daily_assessment_count',
                                  'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        daily_assessment_patient_id_fkey =\
            s.get(daily_assessments_dest['daily_assessment_patient_id_fkey'])
        s.join(ids, daily_assessment_patient_id_fkey, aggregated_counts,
               writer, spans)
        print(
            f"calculated daily assessment counts per patient in {time.time() - t0}"
        )

    if has_tests and make_new_test_level_metrics:
        print(
            "creating 'test_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        test_patient_ids = s.get(tests_dest['patient_id'])
        test_patient_id_fkey = s.create_numeric(tests_dest,
                                                'test_patient_id_fkey',
                                                'int64')
        s.get_index(patient_ids, test_patient_ids, test_patient_id_fkey)
        test_patient_id_fkey = s.get(tests_dest['test_patient_id_fkey'])
        spans = s.get_spans(field=test_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")

        print('calculate test_counts per patient')
        t0 = time.time()
        writer = s.create_numeric(patients_dest, 'test_count', 'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        s.join(ids, test_patient_id_fkey, aggregated_counts, writer, spans)
        print(f"calculated test counts per patient in {time.time() - t0}")

        print('calculate test_result per patient')
        t0 = time.time()
        test_results = s.get(tests_dest['result'])
        writer = test_results.create_like(patients_dest, 'max_test_result')
        aggregated_results = s.apply_spans_max(spans, test_results)
        s.join(ids, test_patient_id_fkey, aggregated_results, writer, spans)
        print(f"calculated max_test_result per patient in {time.time() - t0}")

    if has_diet and make_diet_level_metrics:
        with utils.Timer("Making patient-level diet questions count",
                         new_line=True):
            d_pids_ = s.get(diet_dest['patient_id']).data[:]
            d_pid_spans = s.get_spans(d_pids_)
            d_distinct_pids = s.apply_spans_first(d_pid_spans, d_pids_)
            d_pid_counts = s.apply_spans_count(d_pid_spans)
            p_diet_counts = s.create_numeric(patients_dest, 'diet_counts',
                                             'int32')
            s.merge_left(left_on=s.get(patients_dest['id']).data[:],
                         right_on=d_distinct_pids,
                         right_fields=(d_pid_counts, ),
                         right_writers=(p_diet_counts, ))
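
A hedged end-to-end usage sketch for postprocess (the file paths are placeholders; including 'daily' in flags enables the daily-assessment branches):

import h5py

src_path = '/path/to/source_dataset.hdf5'   # placeholder path
dst_path = '/path/to/processed.hdf5'        # placeholder path

with h5py.File(src_path, 'r') as dataset:
    with h5py.File(dst_path, 'w') as destination:
        postprocess(dataset, destination, flags={'daily'})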

src_file = '/home/ben/covid/ds_20200929_full.hdf5'
dest_file = '/home/ben/covid/ds_diet_tmp.hdf5'
with h5py.File(src_file, 'r') as hf:
    with h5py.File(dest_file, 'w') as dest:
        s = Session()

        ptnts = hf['patients']
        print(hf['diet'].keys())
        diet = hf['diet']

        p_ids_ = s.get(ptnts['id']).data[:]
        d_pids_ = s.get(diet['patient_id']).data[:]
        d_pid_spans = s.get_spans(d_pids_)
        d_distinct_pids = s.apply_spans_first(d_pid_spans, d_pids_)
        d_pid_counts = s.apply_spans_count(d_pid_spans)
        print(np.unique(d_pid_counts, return_counts=True))
        p_diet_counts_new = s.create_numeric(dest, 'diet_counts_new', 'int32')
        dcs = s.merge_left(left_on=p_ids_,
                           right_on=d_distinct_pids,
                           right_fields=(d_pid_counts, ),
                           right_writers=(p_diet_counts_new, ))
        # res = np.unique(s.get(patients_dest['diet_counts']).data[:], return_counts=True)
        print(np.unique(p_diet_counts_new.data[:], return_counts=True))

        # ddtest = defaultdict(int)
        # for p in d_pids_:
        #     ddtest[p] += 1
        #
        # p_diet_counts_new_ = p_diet_counts_new.data[:]