Code example #1
def covid_test_date_v1(session: Session,
                       test_table,
                       dest_test_table,
                       dest_field_name='test_date',
                       dest_field_flags_name='test_date_valid'):
    """
    Infer the test date from 'date_taken_specific', 'date_taken_between_start' and 'date_taken_between_end' columns.

    :param session: The Exetera session instance.
    :param test_table: The tests dataframe, which contains the 'date_taken_specific',
        'date_taken_between_start' and 'date_taken_between_end' columns.
    :param dest_test_table: The destination dataframe to write the result to.
    :param dest_field_name: The name of the result date column.
    :param dest_field_flags_name: The name of the column for the flag indicating whether the test date could be validly determined.
    """
    exact = session.get(test_table['date_taken_specific'])
    exact_ = exact.data[:]
    between_start_ = session.get(
        test_table['date_taken_between_start']).data[:]
    between_end_ = session.get(test_table['date_taken_between_end']).data[:]

    # a test date is valid when the exact date is unset and a consistent
    # 'between' range is set, or when the exact date is set and the range is unset
    test_date_valid = (exact_ == 0.0) & (between_start_ != 0.0) & (between_end_ != 0.0) &\
                      (between_end_ >= between_start_)
    test_date_valid = test_date_valid | ((exact_ != 0.0) &
                                         (between_start_ == 0.0) &
                                         (between_end_ == 0.0))

    # use the exact date where set; otherwise take the midpoint of the
    # 'between' range
    test_date = np.where(exact_ != 0.0, exact_,
                         between_start_ + (between_end_ - between_start_) / 2)

    exact.create_like(dest_test_table, dest_field_name).data.write(test_date)
    session.create_numeric(dest_test_table, dest_field_flags_name,
                           'bool').data.write(test_date_valid)
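
A minimal usage sketch for the function above, assuming numpy is imported as np and that ExeTera's Session lives at exetera.core.session; the file path and the 'tests' / 'tests_out' group names are placeholders, not part of the original code:

# Hypothetical usage of covid_test_date_v1; the path and group names are
# placeholders for illustration only.
import h5py
from exetera.core.session import Session

with h5py.File('/path/to/dataset.hdf5', 'r+') as hf:
    session = Session()
    covid_test_date_v1(session, hf['tests'], hf.require_group('tests_out'))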
Code example #2
def iterator_test_1(length):
    a_ids, a_vals, b_ids = generate_dataset_1(length)
    s = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'w') as hf:
        wa_vals = s.create_numeric(hf, 'a_vals', 'int32')
        wa_vals.data.write(a_vals)

        wa_vals2 = s.get(hf['a_vals'])
        print(fast_sum(iter(ops.data_iterator(wa_vals2))))
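
fast_sum and generate_dataset_1 are helpers defined elsewhere in the original source. A purely illustrative stand-in for fast_sum, consistent with the call above, might look like this:

# Hypothetical stand-in for the fast_sum helper; the original (possibly
# accelerated) implementation is not part of this listing.
def fast_sum(it):
    total = 0
    for v in it:  # consume the field's data iterator value by value
        total += v
    return total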
Code example #3
def new_hs_test(vcount):
    s = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'r') as hf:
        with h5py.File('/home/ben/covid/benchmark_dest.hdf5', 'w') as dest:

            print(hf.keys())

            a_ids_f = s.get(hf['fk_ids'])
            b_ids_f = s.get(hf['ids'])

            all_b_val_fields = list()
            for v in range(vcount):
                b_vals_f = s.create_numeric(dest, 'left_data_{}'.format(v),
                                            'int32')
                all_b_val_fields.append(b_vals_f)

            a_to_b = s.create_numeric(dest, 'a_to_b', 'int64')

            all_a_val_fields = list()
            for v in range(vcount):
                a_vals_f = s.get(hf['right_data_{}'.format(v)])
                all_a_val_fields.append(a_vals_f)

            print("running test")
            t0 = time.time()
            # s.ordered_left_merge(a_ids, b_ids, a_to_b, left_unique=True,
            #                      left_field_sources=(a_vals_f,), left_field_sinks=(b_vals_f,))
            print(a_ids_f.data[:100])
            print(b_ids_f.data[:100])
            print(all_a_val_fields[0].data[:100])
            s.ordered_merge_left(a_ids_f,
                                 b_ids_f,
                                 left_to_right_map=a_to_b,
                                 right_unique=True,
                                 right_field_sources=tuple(all_a_val_fields),
                                 left_field_sinks=tuple(all_b_val_fields))
            print(a_to_b.data[:100])
            results = s.merge_left(a_ids_f,
                                   b_ids_f,
                                   right_fields=tuple(all_a_val_fields))
            elapsed = time.time() - t0
            print("total:", elapsed)
            print(all_b_val_fields[0].data[:100])
            print(results[0][:100])
Code example #4
def first_test_date_per_patient(session: Session,
                                patient_table,
                                test_table,
                                test_date_name,
                                dest_patient_table,
                                dest_patient_name):
    """
    Extract the first test date ('created_at') for each patient id and write it to the patient table.

    :param session: The Exetera session instance.
    :param patient_table: The patient dataframe.
    :param test_table: The tests dataframe.
    :param test_date_name: The name of the test date field (currently unused).
    :param dest_patient_table: The destination dataframe to store the results.
    :param dest_patient_name: The name of the destination field to store the results.
    """

    pid = 'id'
    pids = session.get(patient_table[pid])
    pids_ = pids.data[:]
    if not ops.is_ordered(pids_):
        raise ValueError("The patient table must be ordered by '{}'".format(pid))

    t_pid = 'patient_id'
    t_pids = session.get(test_table[t_pid])
    t_pids_ = t_pids.data[:]
    if not ops.is_ordered(t_pids_):
        raise ValueError("The test table must be ordered by '{}'".format(t_pid))

    # collapse the test data by patient_id and take the first 'created_at'
    # value for each patient
    cats = session.get(test_table['created_at'])
    spans_ = session.get_spans(t_pids_)
    s_t_pids_ = session.apply_spans_first(spans_, t_pids_)
    first_test_dates_ = session.apply_spans_first(spans_, cats)

    # merge the first test dates from the test table into the patient table
    dest = session.create_numeric(dest_patient_table, dest_patient_name, 'int32')
    session.ordered_merge_left(left_on=pids_, right_on=s_t_pids_,
                               right_field_sources=(first_test_dates_,),
                               left_field_sinks=(dest,),
                               left_unique=True, right_unique=True)
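
A usage sketch for the function above; both tables must already be sorted on their id columns, as the function enforces, and the file path and field names are placeholder assumptions:

# Hypothetical usage of first_test_date_per_patient; the path and names
# are placeholders, and both tables are assumed pre-sorted on their ids.
import h5py
from exetera.core.session import Session

with h5py.File('/path/to/dataset.hdf5', 'r+') as hf:
    session = Session()
    first_test_date_per_patient(session, hf['patients'], hf['tests'],
                                'created_at', hf['patients'],
                                'first_test_date')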
Code example #5
def read_fields_from_hdf5(file_name, field_count):
    fields = ('id', 'created_at', 'updated_at', 'version', 'country_code',
              'reported_by_another', 'same_household_as_reporter',
              'contact_additional_studies', 'year_of_birth', 'height_cm',
              'weight_kg', 'gender', 'race_other', 'ethnicity',
              'profile_attributes_updated_at', 'has_diabetes')
    print(len(fields))
    s = Session()
    with h5py.File(file_name, 'r') as hf:
        with utils.Timer("reading {} fields from dataset".format(field_count)):
            for f in range(field_count):
                field = s.get(hf['patients'][fields[f]])
                if isinstance(field, flds.IndexedStringField):
                    # slicing with [:] forces a full read of the backing data
                    indices = field.indices[:]
                    values = field.values[:]
                else:
                    data = field.data[:]
Code example #6
        #     print(substrs)
        substrs = replace_multi_with_str("#!,\"(){}[].:;", substrs)
        substrs = substrs.split()
        for s in substrs:
            if s in words_to_check:
                total_count += 1
                break
    print(total_count)


with h5py.File('/home/ben/covid/ds_20200901_full.hdf5', 'r') as hf:
    with h5py.File('/home/ben/covid/ds_20200901_othersymp.hdf5', 'w') as tmp:
        s = Session()
        print([k for k in hf['patients'].keys() if 'result' in k])

        old_test = s.get(hf['patients']['max_assessment_test_result']).data[:]
        new_test = s.get(hf['patients']['max_test_result']).data[:]
        test_results = np.where((old_test == 3) | (new_test == 4), 2, 0)
        test_results = np.where(
            (test_results == 0) & ((old_test == 2) | (new_test == 3)), 1,
            test_results)
        p_test_results = s.create_numeric(tmp, 'p_test_results', 'int8')
        p_test_results.data.write(test_results)
        print("overall tests:", np.unique(test_results, return_counts=True))

        other = s.get(hf['assessments']['other_symptoms'])
        cc = s.get(hf['assessments']['country_code']).data[:]
        otherstart = other.indices[:-1]
        otherend = other.indices[1:]
        ofilter = otherend - otherstart > 0
        print("ofilter:", ofilter.sum(), len(ofilter))
Code example #7
def hs_test_1(length, val_column_count):
    # rng = np.random.RandomState(12345678)
    # id_base = 1000000000
    # mapping = [0, 1, 2, 1]
    s = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'r') as hf:
        with h5py.File('/home/ben/covid/benchmark_dest.hdf5', 'w') as dest:
            # print('creating a_ids')
            # a_ids = generate_a_ids(length, id_base)
            # a_ids_f = s.create_numeric(hf, 'a_ids', 'int64')
            # a_ids_f.data.write(a_ids)
            # del a_ids
            #
            # print('creating a_vals')
            # # all_a_val_fields = list()
            # for v in range(val_column_count):
            #     a_vals = generate_a_vals(length, 0, 100, rng)
            #     a_vals_f = s.create_numeric(hf, 'a_vals_{}'.format(v), 'int64')
            #     a_vals_f.data.write(a_vals)
            #     # all_a_val_fields.append(a_vals_f)
            #     del a_vals
            #
            # print('creating b_ids')
            # b_ids = generate_b_ids(length, id_base, mapping)
            # b_ids_f = s.create_numeric(hf, 'b_ids', 'int64')
            # b_ids_f.data.write(b_ids)
            # del b_ids

            a_ids_f = s.get(hf['a_ids'])
            b_ids_f = s.get(hf['b_ids'])

            all_b_val_fields = list()
            for v in range(val_column_count):
                b_vals_f = s.create_numeric(dest, 'b_vals_{}'.format(v),
                                            'int32')
                all_b_val_fields.append(b_vals_f)

            a_to_b = s.create_numeric(dest, 'a_to_b', 'int64')

            all_a_val_fields = list()
            for v in range(val_column_count):
                a_vals_f = s.get(hf['a_vals_{}'.format(v)])
                all_a_val_fields.append(a_vals_f)

            print("running test")
            t0 = time.time()
            # s.ordered_left_merge(a_ids, b_ids, a_to_b, left_unique=True,
            #                      left_field_sources=(a_vals_f,), left_field_sinks=(b_vals_f,))
            print(b_ids_f.data[:100])
            print(a_ids_f.data[:100])
            s.ordered_merge_left(b_ids_f,
                                 a_ids_f,
                                 right_field_sources=tuple(all_a_val_fields),
                                 left_field_sinks=tuple(all_b_val_fields),
                                 left_to_right_map=a_to_b,
                                 right_unique=True)
            print(a_to_b.data[:100])
            results = s.merge_left(b_ids_f,
                                   a_ids_f,
                                   right_fields=tuple(all_a_val_fields))
            elapsed = time.time() - t0
            print(elapsed)
            print(all_b_val_fields[0].data[:100])
            print(results[0][:100])
Code example #8
def postprocess(dataset, destination, timestamp=None, flags=None):

    if flags is None:
        flags = set()

    do_daily_asmts = 'daily' in flags
    has_patients = 'patients' in dataset.keys()
    has_assessments = 'assessments' in dataset.keys()
    has_tests = 'tests' in dataset.keys()
    has_diet = 'diet' in dataset.keys()

    # stubbed enable/disable hooks; everything is currently switched on
    sort_enabled = lambda x: True
    process_enabled = lambda x: True

    sort_patients = sort_enabled(flags) and True
    sort_assessments = sort_enabled(flags) and True
    sort_tests = sort_enabled(flags) and True
    sort_diet = sort_enabled(flags) and True

    make_assessment_patient_id_fkey = process_enabled(flags) and True
    year_from_age = process_enabled(flags) and True
    clean_weight_height_bmi = process_enabled(flags) and True
    health_worker_with_contact = process_enabled(flags) and True
    clean_temperatures = process_enabled(flags) and True
    check_symptoms = process_enabled(flags) and True
    create_daily = process_enabled(flags) and do_daily_asmts
    make_patient_level_assessment_metrics = process_enabled(flags) and True
    make_patient_level_daily_assessment_metrics = process_enabled(
        flags) and do_daily_asmts
    make_new_test_level_metrics = process_enabled(flags) and True
    make_diet_level_metrics = True
    make_healthy_diet_index = True

    # ds = DataStore(timestamp=timestamp)
    s = Session()

    # patients ================================================================

    sorted_patients_src = None

    if has_patients:
        patients_src = dataset['patients']

        write_mode = 'write'

        if 'patients' not in destination.keys():
            patients_dest = s.get_or_create_group(destination, 'patients')
            sorted_patients_src = patients_dest

            # Patient sort
            # ============
            if sort_patients:
                duplicate_filter = \
                    persistence.filter_duplicate_fields(s.get(patients_src['id']).data[:])

                for k in patients_src.keys():
                    t0 = time.time()
                    r = s.get(patients_src[k])
                    w = r.create_like(patients_dest, k)
                    s.apply_filter(duplicate_filter, r, w)
                    print(f"'{k}' filtered in {time.time() - t0}s")

                print(np.count_nonzero(duplicate_filter),
                      np.count_nonzero(~duplicate_filter))
                sort_keys = ('id', )
                s.sort_on(patients_dest,
                          patients_dest,
                          sort_keys,
                          write_mode='overwrite')

            # Patient processing
            # ==================
            if year_from_age:
                log("year of birth -> age; 18 to 90 filter")
                t0 = time.time()
                yobs = s.get(patients_dest['year_of_birth'])
                yob_filter = s.get(patients_dest['year_of_birth_valid'])
                age = s.create_numeric(patients_dest, 'age', 'uint32')
                age_filter = s.create_numeric(patients_dest, 'age_filter',
                                              'bool')
                age_16_to_90 = s.create_numeric(patients_dest,
                                                '16_to_90_years', 'bool')
                print('year_of_birth:', patients_dest['year_of_birth'])
                for k in patients_dest['year_of_birth'].attrs.keys():
                    print(k, patients_dest['year_of_birth'].attrs[k])
                calculate_age_from_year_of_birth_v1(yobs, yob_filter, 16, 90,
                                                    age, age_filter,
                                                    age_16_to_90, 2020)
                log(f"completed in {time.time() - t0}")

                print('age_filter count:',
                      np.sum(patients_dest['age_filter']['values'][:]))
                print('16_to_90_years count:',
                      np.sum(patients_dest['16_to_90_years']['values'][:]))

            if clean_weight_height_bmi:
                log("height / weight / bmi; standard range filters")
                t0 = time.time()

                weights_clean = s.create_numeric(patients_dest,
                                                 'weight_kg_clean', 'float32')
                weights_filter = s.create_numeric(patients_dest,
                                                  '40_to_200_kg', 'bool')
                heights_clean = s.create_numeric(patients_dest,
                                                 'height_cm_clean', 'float32')
                heights_filter = s.create_numeric(patients_dest,
                                                  '110_to_220_cm', 'bool')
                bmis_clean = s.create_numeric(patients_dest, 'bmi_clean',
                                              'float32')
                bmis_filter = s.create_numeric(patients_dest, '15_to_55_bmi',
                                               'bool')

                weight_height_bmi_v1(s, 40, 200, 110, 220, 15, 55, None, None,
                                     None, None, patients_dest['weight_kg'],
                                     patients_dest['weight_kg_valid'],
                                     patients_dest['height_cm'],
                                     patients_dest['height_cm_valid'],
                                     patients_dest['bmi'],
                                     patients_dest['bmi_valid'], weights_clean,
                                     weights_filter, None, heights_clean,
                                     heights_filter, None, bmis_clean,
                                     bmis_filter, None)
                log(f"completed in {time.time() - t0}")

            if health_worker_with_contact:
                with utils.Timer("health_worker_with_contact field"):
                    #writer = ds.get_categorical_writer(patients_dest, 'health_worker_with_contact', 'int8')
                    combined_hcw_with_contact_v1(
                        s, s.get(patients_dest['healthcare_professional']),
                        s.get(patients_dest['contact_health_worker']),
                        s.get(patients_dest['is_carer_for_community']),
                        patients_dest, 'health_worker_with_contact')

    # assessments =============================================================

    sorted_assessments_src = None
    if has_assessments:
        assessments_src = dataset['assessments']
        if 'assessments' not in destination.keys():
            assessments_dest = s.get_or_create_group(destination,
                                                     'assessments')
            sorted_assessments_src = assessments_dest

            if sort_assessments:
                sort_keys = ('patient_id', 'created_at')
                with utils.Timer("sorting assessments"):
                    s.sort_on(assessments_src, assessments_dest, sort_keys)

            if has_patients:
                if make_assessment_patient_id_fkey:
                    print(
                        "creating 'assessment_patient_id_fkey' foreign key index for 'patient_id'"
                    )
                    t0 = time.time()
                    patient_ids = s.get(sorted_patients_src['id'])
                    assessment_patient_ids =\
                        s.get(sorted_assessments_src['patient_id'])
                    assessment_patient_id_fkey =\
                        s.create_numeric(assessments_dest, 'assessment_patient_id_fkey', 'int64')
                    s.get_index(patient_ids.data[:],
                                assessment_patient_ids.data[:],
                                assessment_patient_id_fkey)
                    print(f"completed in {time.time() - t0}s")

            if clean_temperatures:
                print("clean temperatures")
                t0 = time.time()
                temps = s.get(sorted_assessments_src['temperature'])
                temp_units = s.get(sorted_assessments_src['temperature_unit'])
                temps_valid = s.get(
                    sorted_assessments_src['temperature_valid'])
                dest_temps = temps.create_like(assessments_dest,
                                               'temperature_c_clean')
                dest_temps_valid = temps_valid.create_like(
                    assessments_dest, 'temperature_35_to_42_inclusive')
                dest_temps_modified = temps_valid.create_like(
                    assessments_dest, 'temperature_modified')
                validate_temperature_v1(s, 35.0, 42.0, temps, temp_units,
                                        temps_valid, dest_temps,
                                        dest_temps_valid, dest_temps_modified)
                print(f"temperature cleaning done in {time.time() - t0}")

            if check_symptoms:
                print('check inconsistent health_status')
                t0 = time.time()
                check_inconsistent_symptoms_v1(s, sorted_assessments_src,
                                               assessments_dest)
                print(time.time() - t0)

    # tests ===================================================================

    if has_tests:
        if sort_tests:
            tests_src = dataset['tests']
            tests_dest = s.get_or_create_group(destination, 'tests')
            sort_keys = ('patient_id', 'created_at')
            s.sort_on(tests_src, tests_dest, sort_keys)

    # diet ====================================================================

    if has_diet:
        diet_src = dataset['diet']
        if 'diet' not in destination.keys():
            diet_dest = s.get_or_create_group(destination, 'diet')
            sorted_diet_src = diet_dest
            if sort_diet:
                sort_keys = ('patient_id', 'display_name', 'id')
                s.sort_on(diet_src, diet_dest, sort_keys)

    if has_assessments:
        if do_daily_asmts:
            daily_assessments_dest = s.get_or_create_group(
                destination, 'daily_assessments')

    # post process patients
    # TODO: need a transaction table

    print(patients_src.keys())
    print(dataset['assessments'].keys())
    print(dataset['tests'].keys())

    # write_mode = 'overwrite'
    write_mode = 'write'

    # Daily assessments
    # =================

    if has_assessments:
        if create_daily:
            print("generate daily assessments")
            patient_ids = s.get(sorted_assessments_src['patient_id'])
            created_at_days = s.get(sorted_assessments_src['created_at_day'])
            raw_created_at_days = created_at_days.data[:]

            if 'assessment_patient_id_fkey' in assessments_src.keys():
                patient_id_index = assessments_src[
                    'assessment_patient_id_fkey']
            else:
                patient_id_index = assessments_dest[
                    'assessment_patient_id_fkey']
            patient_id_indices = s.get(patient_id_index)
            raw_patient_id_indices = patient_id_indices.data[:]

            print("Calculating patient id index spans")
            t0 = time.time()
            patient_id_index_spans = s.get_spans(
                fields=(raw_patient_id_indices, raw_created_at_days))
            print(
                f"Calculated {len(patient_id_index_spans)-1} spans in {time.time() - t0}s"
            )

            print("Applying spans to 'health_status'")
            t0 = time.time()
            default_behaviour_overrides = {
                'id': s.apply_spans_last,
                'patient_id': s.apply_spans_last,
                'patient_index': s.apply_spans_last,
                'created_at': s.apply_spans_last,
                'created_at_day': s.apply_spans_last,
                'updated_at': s.apply_spans_last,
                'updated_at_day': s.apply_spans_last,
                'version': s.apply_spans_max,
                'country_code': s.apply_spans_first,
                'date_test_occurred': None,
                'date_test_occurred_guess': None,
                'date_test_occurred_day': None,
                'date_test_occurred_set': None,
            }
            for k in sorted_assessments_src.keys():
                t1 = time.time()
                reader = s.get(sorted_assessments_src[k])
                if k in default_behaviour_overrides:
                    apply_span_fn = default_behaviour_overrides[k]
                    if apply_span_fn is not None:
                        apply_span_fn(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    else:
                        print(f"  Skipping field {k}")
                else:
                    if isinstance(reader, fields.CategoricalField):
                        s.apply_spans_max(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    elif isinstance(reader, rw.IndexedStringReader):
                        s.apply_spans_concat(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    elif isinstance(reader, rw.NumericReader):
                        s.apply_spans_max(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f"  Field {k} aggregated in {time.time() - t1}s")
                    else:
                        print(f"  No function for {k}")

            print(f"apply_spans completed in {time.time() - t0}s")

    if has_patients and has_assessments:
        if make_patient_level_assessment_metrics:
            if 'assessment_patient_id_fkey' in assessments_dest:
                src = assessments_dest['assessment_patient_id_fkey']
            else:
                src = assessments_src['assessment_patient_id_fkey']
            assessment_patient_id_fkey = s.get(src)
            # generate spans from the assessment-space patient_id foreign key
            spans = s.get_spans(field=assessment_patient_id_fkey.data[:])

            ids = s.get(patients_dest['id'])

            print('calculate assessment counts per patient')
            t0 = time.time()
            writer = s.create_numeric(patients_dest, 'assessment_count',
                                      'uint32')
            aggregated_counts = s.apply_spans_count(spans)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated assessment counts per patient in {time.time() - t0}"
            )

            print('calculate first assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest,
                                           'first_assessment_day', 10)
            aggregated_counts = s.apply_spans_first(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated first assessment days per patient in {time.time() - t0}"
            )

            print('calculate last assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest,
                                           'last_assessment_day', 10)
            aggregated_counts = s.apply_spans_last(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated last assessment days per patient in {time.time() - t0}"
            )

            print('calculate maximum assessment test result per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['tested_covid_positive'])
            writer = reader.create_like(patients_dest,
                                        'max_assessment_test_result')
            max_result_value = s.apply_spans_max(spans, reader)
            s.join(ids, assessment_patient_id_fkey, max_result_value, writer,
                   spans)
            print(
                f"calculated maximum assessment test result in {time.time() - t0}"
            )

    if has_assessments and do_daily_asmts and make_patient_level_daily_assessment_metrics:
        print(
            "creating 'daily_assessment_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        daily_assessment_patient_ids =\
            s.get(daily_assessments_dest['patient_id'])
        daily_assessment_patient_id_fkey =\
            s.create_numeric(daily_assessments_dest, 'daily_assessment_patient_id_fkey', 'int64')
        s.get_index(patient_ids, daily_assessment_patient_ids,
                    daily_assessment_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")

        spans = s.get_spans(field=s.get(
            daily_assessments_dest['daily_assessment_patient_id_fkey']))

        print('calculate daily assessment counts per patient')
        t0 = time.time()
        # 'ids' is only defined in the assessment-metrics block above, so
        # fetch the patient ids again here
        ids = s.get(patients_dest['id'])
        writer = s.create_numeric(patients_dest, 'daily_assessment_count',
                                  'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        daily_assessment_patient_id_fkey =\
            s.get(daily_assessments_dest['daily_assessment_patient_id_fkey'])
        s.join(ids, daily_assessment_patient_id_fkey, aggregated_counts,
               writer, spans)
        print(
            f"calculated daily assessment counts per patient in {time.time() - t0}"
        )

    if has_tests and make_new_test_level_metrics:
        print(
            "creating 'test_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        test_patient_ids = s.get(tests_dest['patient_id'])
        test_patient_id_fkey = s.create_numeric(tests_dest,
                                                'test_patient_id_fkey',
                                                'int64')
        s.get_index(patient_ids, test_patient_ids, test_patient_id_fkey)
        test_patient_id_fkey = s.get(tests_dest['test_patient_id_fkey'])
        spans = s.get_spans(field=test_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")

        print('calculate test_counts per patient')
        t0 = time.time()
        # fetch the patient ids; 'ids' is otherwise only set in the
        # assessment-metrics block
        ids = s.get(patients_dest['id'])
        writer = s.create_numeric(patients_dest, 'test_count', 'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        s.join(ids, test_patient_id_fkey, aggregated_counts, writer, spans)
        print(f"calculated test counts per patient in {time.time() - t0}")

        print('calculate test_result per patient')
        t0 = time.time()
        test_results = s.get(tests_dest['result'])
        writer = test_results.create_like(patients_dest, 'max_test_result')
        aggregated_results = s.apply_spans_max(spans, test_results)
        s.join(ids, test_patient_id_fkey, aggregated_results, writer, spans)
        print(f"calculated max_test_result per patient in {time.time() - t0}")

    if has_diet and make_diet_level_metrics:
        with utils.Timer("Making patient-level diet questions count",
                         new_line=True):
            d_pids_ = s.get(diet_dest['patient_id']).data[:]
            d_pid_spans = s.get_spans(d_pids_)
            d_distinct_pids = s.apply_spans_first(d_pid_spans, d_pids_)
            d_pid_counts = s.apply_spans_count(d_pid_spans)
            p_diet_counts = s.create_numeric(patients_dest, 'diet_counts',
                                             'int32')
            s.merge_left(left_on=s.get(patients_dest['id']).data[:],
                         right_on=d_distinct_pids,
                         right_fields=(d_pid_counts, ),
                         right_writers=(p_diet_counts, ))
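
A hypothetical driver for postprocess, sketched from its signature above; the file paths and the 'daily' flag are placeholder assumptions:

# Hypothetical driver for postprocess; the paths are placeholders and the
# 'daily' flag switches on the daily-assessment branch.
import h5py

with h5py.File('/path/to/src.hdf5', 'r') as src:
    with h5py.File('/path/to/dest.hdf5', 'w') as dst:
        postprocess(src, dst, flags={'daily'})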
Code example #9
        self.values[9] |= 1 if d else 0
        self.values[10] |= 0 if z else 1
        self.values[11] |= 1 if z else 0


src_file = '/home/ben/covid/ds_20200929_full.hdf5'
dest_file = '/home/ben/covid/ds_diet_tmp.hdf5'
with h5py.File(src_file, 'r') as hf:
    with h5py.File(dest_file, 'w') as dest:
        s = Session()

        ptnts = hf['patients']
        print(hf['diet'].keys())
        diet = hf['diet']

        p_ids_ = s.get(hf['patients']['id']).data[:]
        d_pids_ = s.get(hf['diet']['patient_id']).data[:]
        d_pid_spans = s.get_spans(d_pids_)
        d_distinct_pids = s.apply_spans_first(d_pid_spans, d_pids_)
        d_pid_counts = s.apply_spans_count(d_pid_spans)
        print(np.unique(d_pid_counts, return_counts=True))
        p_diet_counts_new = s.create_numeric(dest, 'diet_counts_new', 'int32')
        dcs = s.merge_left(left_on=p_ids_,
                           right_on=d_distinct_pids,
                           right_fields=(d_pid_counts, ),
                           right_writers=(p_diet_counts_new, ))
        # res = np.unique(s.get(patients_dest['diet_counts']).data[:], return_counts=True)
        print(np.unique(p_diet_counts_new.data[:], return_counts=True))

        # ddtest = defaultdict(int)
        # for p in d_pids_: