Python Timer Examples

Programming Language: Python

Namespace/Package Name: exetera.core.utils

Method/Function: Timer

Examples at hotexamples.com: 15

Python Timer - 15 examples found. These are the top-rated real-world Python examples of exetera.core.utils.Timer, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

def generate_dataset(length, val_column_count):
    """
    Write a synthetic benchmarking dataset to a fixed HDF5 file.

    '/home/ben/covid/benchmarking.hdf5' is opened in 'w' mode and filled
    with an 'a_ids' field, ``val_column_count`` fields named 'a_vals_<n>'
    and a 'b_ids' field, each created as an 'int64' numeric field. Every
    generate-and-write step runs inside a ``utils.Timer``.

    :param length: Size argument forwarded to the generate_* helpers.
    :param val_column_count: Number of 'a_vals_<n>' fields to create.
    """
    rng = np.random.RandomState(12345678)
    id_base = 0  #1000000000
    mapping = [0, 1, 2, 1]
    session = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'w') as h5file:
        with utils.Timer('creating a_ids'):
            ids = generate_a_ids(length, id_base)
            session.create_numeric(h5file, 'a_ids', 'int64').data.write(ids)
            # Release the generated array before the next one is built.
            del ids

        print('creating a_vals')
        for column in range(val_column_count):
            with utils.Timer("creating a_vals[{}]".format(column)):
                column_values = generate_a_vals(length, 0, 100, rng)
                field_name = 'a_vals_{}'.format(column)
                session.create_numeric(
                    h5file, field_name, 'int64').data.write(column_values)
                del column_values

        with utils.Timer('creating b_ids'):
            ids = generate_b_ids(length, id_base, mapping)
            session.create_numeric(h5file, 'b_ids', 'int64').data.write(ids)
            del ids

Example #2

Show file

def raw_np_test_1(length, count):
    """
    Benchmark raw NumPy save/load round trips under a fixed directory.

    Phase one generates ``count`` arrays and saves each one as
    '/home/ben/covid/test_save/vals_<c>' (``np.save`` appends '.npy').
    Phase two reloads each saved array, doubles it in place and saves the
    result as '/home/ben/covid/test_save/dest_vals_<c>'. Every save and
    load is wrapped in a ``utils.Timer``.

    :param length: Size argument forwarded to ``generate_a_vals``.
    :param count: Number of arrays to write and then round-trip.
    """
    rng = np.random.RandomState(12345678)
    for c in range(count):
        vals = generate_a_vals(length, 0, 100, rng)
        with utils.Timer("writing source vals {}".format(c)):
            np.save('/home/ben/covid/test_save/vals_{}'.format(c), vals)

    for c in range(count):
        vname = '/home/ben/covid/test_save/vals_{}.npy'.format(c)
        with utils.Timer("reading {}".format(vname)):
            vals = np.load(vname)
        vals *= 2
        v2name = '/home/ben/covid/test_save/dest_vals_{}'.format(c)
        with utils.Timer("writing {}".format(v2name)):
            # Save to the destination path named in the timer label, not to
            # vname, so the source file is left intact.
            np.save(v2name, vals)

Example #3

Show file

File: method_paper_model.py Project: deng113jie/ExeTeraCovid

def method_paper_model(ds, symptoms_reader_dict, prediction):
    """
    Score every assessment with a fixed linear model over symptom fields.

    For each key in ``symptoms_reader_dict`` the field contents are scaled
    by the matching coefficient and accumulated into a float32 array; the
    intercept is then added and the raw score is written to ``prediction``.
    No further transformation is applied to the score here.

    :param ds: The Exetera session instance (not used in this function).
    :param symptoms_reader_dict: Mapping of symptom name to a sliceable
        field; must contain 'persistent_cough', whose length sizes the
        output, and every key must have a coefficient below.
    :param prediction: Destination whose ``write`` receives the scores.
    """
    bias = -1.19015973
    coefficients = {
        'persistent_cough': 0.23186655,
        'fatigue': 0.56532346,
        'delirium': -0.12935112,
        'shortness_of_breath': 0.58273967,
        'fever': 0.16580974,
        'diarrhoea': 0.10236126,
        'abdominal_pain': -0.11204163,
        'chest_pain': -0.12318634,
        'hoarse_voice': -0.17818597,
        'skipped_meals': 0.25902482,
        'loss_of_smell': 1.82895239,
    }

    with utils.Timer("predicting covid by assessment", new_line=True):
        row_count = len(symptoms_reader_dict['persistent_cough'])
        score = np.zeros(row_count, dtype='float32')
        for symptom in symptoms_reader_dict:
            contribution = symptoms_reader_dict[symptom][:] * coefficients[symptom]
            score += contribution
        score += bias
        prediction.write(score)

Example #4

Show file

def minimal_test_1(length, count):
    """
    Benchmark plain h5py dataset writes and reads on a fixed HDF5 file.

    The file is first opened in 'w' mode and ``count`` datasets named
    'vals_<c>' are created from generated arrays. It is then reopened in
    'r+' mode; each 'vals_<c>' is read back in full, doubled in place and
    stored as 'dest_vals_<c>'. Each create/read is wrapped in a
    ``utils.Timer``.

    :param length: Size argument forwarded to ``generate_a_vals``.
    :param count: Number of datasets to write and then round-trip.
    """
    rng = np.random.RandomState(12345678)
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'w') as out_file:
        for idx in range(count):
            source_vals = generate_a_vals(length, 0, 100, rng)
            with utils.Timer("writing source vals {}".format(idx)):
                out_file.create_dataset("vals_{}".format(idx),
                                        chunks=(1 << 20, ),
                                        data=source_vals)

    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'r+') as io_file:
        for idx in range(count):
            src_name = "vals_{}".format(idx)
            with utils.Timer("reading {}".format(src_name)):
                doubled = io_file[src_name][:]
            doubled *= 2
            dest_name = "dest_vals_{}".format(idx)
            with utils.Timer("writing {}".format(dest_name)):
                io_file.create_dataset(dest_name,
                                       chunks=(1 << 20, ),
                                       data=doubled)

Example #5

Show file

def read_id_from_csv(file_name, field_count):
    """
    Time how long it takes to read leading CSV columns into Python lists.

    The first row is treated as a header and skipped. When ``field_count``
    is 1 only column 0 of each row is collected; otherwise the first
    ``field_count`` columns are collected into one list per column. The
    collected values are discarded and nothing is returned; only the
    ``utils.Timer`` output is of interest.

    :param file_name: Path of the CSV file to read.
    :param field_count: Number of leading columns to collect per row.
    :raises IndexError: If a row has fewer columns than requested.
    """
    import csv
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires for file objects passed to csv.reader.
    with open(file_name, newline='') as f:
        rdr = csv.reader(f)
        # Skip the header row; the default avoids StopIteration on an
        # empty file.
        next(rdr, None)
        if field_count == 1:
            with utils.Timer("reading id from dataset"):
                ids = [r[0] for r in rdr]
        else:
            values = [[] for _ in range(field_count)]
            with utils.Timer(
                    "reading {} fields from dataset".format(field_count)):
                for r in rdr:
                    for i, column in enumerate(values):
                        column.append(r[i])

Example #6

Show file

def read_fields_from_hdf5(file_name, field_count):
    """
    Time reading the first ``field_count`` patient fields from an HDF5 file.

    The number of known field names is printed first. Each field is then
    looked up under the 'patients' group and read in full: indexed string
    fields have both their indices and values read, every other field has
    its data read. The loaded arrays are not returned.

    :param file_name: Path of the HDF5 file, opened read-only.
    :param field_count: How many names from the fixed field tuple to read.
    """
    fields = (
        'id', 'created_at', 'updated_at', 'version', 'country_code',
        'reported_by_another', 'same_household_as_reporter',
        'contact_additional_studies', 'year_of_birth', 'height_cm',
        'weight_kg', 'gender', 'race_other', 'ethnicity',
        'profile_attributes_updated_at', 'has_diabetes',
    )
    print(len(fields))
    session = Session()
    with h5py.File(file_name, 'r') as hf:
        with utils.Timer("reading {} fields from dataset".format(field_count)):
            for position in range(field_count):
                field = session.get(hf['patients'][fields[position]])
                if not isinstance(field, flds.IndexedStringField):
                    data = field.data[:]
                    continue
                indices = field.indices[:]
                values = field.values[:]

Example #7

Show file

File: merging_results_5.1.py Project: deng113jie/ExeTeraCovid

def merging_results(s, source, output):
    """
    Derive positive-test and patient tables from `source` into `output`.

    Reads the 'tests', 'assessments' and 'patients' entries of `source`,
    creates the following containers on `output` and fills them:
    'tests', 'tests_fin', 'usable_asmt_tests' (a group), 'out_pos',
    'out_pos_copy', 'out_pos_hs', 'out_pos_asymp', 'patient_pos' and
    'patient_asymp'.

    It also writes these CSV files via `save_df_to_csv` / pandas, using
    relative file names: 'TestedPositiveTestDetails.csv',
    'OtherSymptoms.csv', 'PositiveSympStartHealthyAllSymptoms.csv',
    'PositiveAsympAllSymptoms.csv',
    'PositiveSympStartHealthy_PatDetails.csv' and
    'PositiveAsymp_PatDetails.csv'.

    :param s: Session-like object; used through `s.get(...)` and passed to
        the versioned helper functions called below.
    :param source: Container indexed by 'tests', 'assessments', 'patients'.
    :param output: Container on which `create_dataframe` / `create_group`
        are called to hold the results.
    :return: None. Progress information is printed along the way.
    """
    list_symptoms = [
        'abdominal_pain', 'altered_smell', 'blisters_on_feet', 'brain_fog',
        'chest_pain', 'chills_or_shivers', 'delirium', 'diarrhoea',
        'diarrhoea_frequency', 'dizzy_light_headed', 'ear_ringing', 'earache',
        'eye_soreness', 'fatigue', 'feeling_down', 'fever', 'hair_loss',
        'headache', 'headache_frequency', 'hoarse_voice',
        'irregular_heartbeat', 'loss_of_smell', 'nausea', 'persistent_cough',
        'rash', 'red_welts_on_face_or_lips', 'runny_nose',
        'shortness_of_breath', 'skin_burning', 'skipped_meals', 'sneezing',
        'sore_throat', 'swollen_glands', 'typical_hayfever',
        'unusual_muscle_pains'
    ]

    #path = '/home/jd21/data'
    #ds = DataStore()
    # Single timestamp string reused for the fields created with `ts` below.
    ts = str(datetime.now(timezone.utc))

    # # Same but for test
    src_test = source['tests']
    # NOTE(review): list_testid and list_testcreate are not used again in
    # this function.
    list_testid = src_test['patient_id']
    list_testcreate = src_test['created_at']
    out_test = output.create_dataframe('tests')
    # ====
    # out_test step 1 copy from src_test
    # ====
    # NOTE(review): the timer label says 'applying sort', but the timed body
    # only copies fields - confirm the label.
    with utils.Timer('applying sort'):
        for k in src_test.keys():
            dataframe.copy(src_test[k], out_test, k)

    # convert test date
    covid_test_date_v1(s, out_test, out_test, 'date_effective_test')

    # Filtering only definite results

    # Keep only rows whose 'result' is 3 or 4; the filter is applied in
    # place to every field of out_test.
    results_raw = out_test['result'].data[:]
    results_filt = np.where(np.logical_or(results_raw == 4, results_raw == 3),
                            True, False)
    for k in out_test.keys():
        out_test[k].apply_filter(results_filt, in_place=True)

    # Filter check
    # sanity_filter = (date_fin == 0)
    # print(np.sum(sanity_filter))

    # Creating clean mechanism
    # NOTE(review): of the four locals below only reader_mec is used again
    # (in the print that follows).
    reader_mec = out_test['mechanism'].data
    s_reader_mec = s.get(out_test['mechanism'])

    print(len(reader_mec), len(out_test['patient_id'].data))

    reader_ftmec = out_test['mechanism_freetext'].data
    s_reader_ftmec = s.get(out_test['mechanism_freetext'])

    test_type_from_mechanism_v1_standard_input(s, out_test)

    pcr_standard_summarize_v1(s, out_test)

    out_test_fin = output.create_dataframe('tests_fin')
    # ====
    # out_test_fin step 1 copy from out_test
    # ====
    # Maps field name -> the `.data` of the field created in out_test_fin;
    # reused further down to append the assessment-derived rows.
    writers_dict = {}
    # other fields
    for k in ('patient_id', 'date_effective_test', 'result', 'pcr_standard'):
        values = out_test[k].data[:]
        if k == 'result':
            # Shift 'result' down by 3; rows were filtered to 3 or 4 above,
            # so these presumably become 0/1 - TODO confirm.
            values -= 3
        writers_dict[k] = out_test[k].create_like(out_test_fin, k, ts).data
        print(len(values), k)
        writers_dict[k].write_part(values)
    # converted_test
    # All-False flag for the rows written so far.
    values = np.zeros(len(out_test_fin['patient_id'].data), dtype='bool')
    writers_dict['converted_test'] = out_test_fin.create_numeric(
        'converted_test', 'bool', timestamp=ts).data
    writers_dict['converted_test'].write_part(values)

    # Taking care of the old test
    src_asmt = source['assessments']
    print(src_asmt.keys())

    # # Remap had_covid_test to 0/1 2 to binary 0,1
    # tcp_flat = np.where(src_asmt['tested_covid_positive'].data[:] < 1, 0, 1)
    # spans = src_asmt['patient_id'].get_spans()
    # # Get the first index at which the hct field is maximum
    # firstnz_tcp_ind = ds.apply_spans_index_of_max(spans, tcp_flat)
    # # Get the index of first element of patient_id when sorted
    # first_hct_ind = spans[:-1]
    # filt_tl = first_hct_ind != firstnz_tcp_ind
    # # Get the indices for which hct changed value (indicating that test happened after the first input)
    # sel_max_ind = ds.apply_filter(filter_to_apply=filt_tl, reader=firstnz_tcp_ind)
    # # Get the index at which test is maximum and for which that hct is possible
    # # max_tcp_ind = ds.apply_spans_index_of_max(spans, src_asmt['tested_covid_positive'].data[:])
    # # filt_max_test = ds.apply_indices(filt_tl, max_tcp )
    # sel_max_tcp = ds.apply_indices(filt_tl, firstnz_tcp_ind)
    # sel_maxtcp_ind = ds.apply_filter(filter_to_apply=filt_tl, reader=firstnz_tcp_ind)
    # # Define usable assessments with correct test based on previous filter on indices

    # The helper returns two index arrays that are applied to assessment
    # fields below (presumably replacing the commented-out logic above -
    # TODO confirm).
    sel_max_ind, sel_max_tcp = multiple_tests_start_with_negative_v1(
        s, src_asmt)

    usable_asmt_tests = output.create_group('usable_asmt_tests')
    # ====
    # usable_asmt_tests step 1: copy from src_asmt, filter patients w/ multiple test and first ok
    # ====
    for k in ('id', 'patient_id', 'created_at', 'had_covid_test'):
        fld = src_asmt[k].create_like(usable_asmt_tests, k)
        src_asmt[k].apply_index(sel_max_ind, target=fld)
        print(usable_asmt_tests[k].data[0])

    src_asmt['created_at'].create_like(usable_asmt_tests, 'eff_result_time')
    src_asmt['created_at'].apply_index(
        sel_max_tcp, target=usable_asmt_tests['eff_result_time'])

    src_asmt['tested_covid_positive'].create_like(usable_asmt_tests,
                                                  'eff_result')
    src_asmt['tested_covid_positive'].apply_index(
        sel_max_tcp, target=usable_asmt_tests['eff_result'])

    src_asmt['tested_covid_positive'].create_like(usable_asmt_tests,
                                                  'tested_covid_positive')
    src_asmt['tested_covid_positive'].apply_index(
        sel_max_tcp, target=usable_asmt_tests['tested_covid_positive'])

    # ====
    # usable_asmt_tests step 2: filter only positive
    # ====
    # Making sure that the test is definite (either positive or negative)
    filt_deftest = usable_asmt_tests['tested_covid_positive'].data[:] > 1
    # print(len(ds.get_reader(usable_asmt_tests['patient_id'])))
    for k in ('id', 'patient_id', 'created_at', 'had_covid_test',
              'tested_covid_positive', 'eff_result_time', 'eff_result'):
        usable_asmt_tests[k].apply_filter(filt_deftest, in_place=True)

    # ====
    # usable_asmt_tests step 3: add delta_days_test, date_final_test, and pcr_standard fields
    # ====
    # Getting difference between created at (max of hct date) and max of test result (eff_result_time)
    reader_hct = usable_asmt_tests['created_at'].data[:]
    reader_tcp = usable_asmt_tests['eff_result_time'].data[:]
    with utils.Timer('doing delta time'):
        delta_time = reader_tcp - reader_hct
        # 86400 = seconds per day; assumes the timestamps are in seconds
        # (TODO confirm).
        delta_days = delta_time / 86400
    print(delta_days[:10], delta_time[:10])
    writer = usable_asmt_tests.create_numeric('delta_days_test', 'float32')
    writer.data.write(delta_days)

    # Final day of test
    # Under 7 days of difference: use created_at; otherwise use the result
    # time minus two days.
    date_final_test = np.where(delta_days < 7, reader_hct,
                               reader_tcp - 2 * 86400)
    writer = usable_asmt_tests.create_timestamp('date_final_test')
    writer.data.write(date_final_test)
    # print(ds.get_reader(usable_asmt_tests['date_final_test'])[:10], date_final_test[:10])

    # NOTE(review): np.ones defaults to float64 while the field is created
    # as 'int' - confirm the write converts as intended.
    pcr_standard = np.ones(len(usable_asmt_tests['patient_id'].data))
    writer = usable_asmt_tests.create_numeric('pcr_standard', 'int')
    writer.data.write(pcr_standard)

    # ====
    # out_test_fin step 2 copy from usable_asmt_tests
    # ====
    # list_init names the source fields in usable_asmt_tests; list_final
    # names the matching destination fields in out_test_fin.
    list_init = ('patient_id', 'date_final_test', 'tested_covid_positive',
                 'pcr_standard')
    list_final = ('patient_id', 'date_effective_test', 'result',
                  'pcr_standard')
    # Join
    for (i, f) in zip(list_init, list_final):
        values = usable_asmt_tests[i].data[:]
        if f == 'result':
            # Shift down by 2; rows were filtered to values > 1 above.
            values -= 2
        # writers_dict[f] = reader.get_writer(out_test_fin, f, ts)
        print(len(values), f)
        writers_dict[f].write(values)
    # Rows appended from assessments get converted_test = True.
    writers_dict['converted_test'].write(
        np.ones(len(usable_asmt_tests['patient_id'].data), dtype='bool'))

    # ====
    # out_pos step 1: copy from out_test_fin, filter valid result, and write to csv
    # ====
    result_fin = out_test_fin['result'].data[:]
    filt_pos = result_fin == 1
    out_pos = output.create_dataframe('out_pos')
    for k in out_test_fin.keys():
        out_test_fin[k].create_like(out_pos, k)
        out_test_fin[k].apply_filter(filt_pos, target=out_pos[k])
        print(k, len(out_test_fin[k].data), len(filt_pos))

    # Span count minus one, kept for the comparison print after out_pos is
    # refilled from the assessments below.
    pat_pos_len = len(out_pos['patient_id'].get_spans()) - 1
    dataset.copy(out_pos, output, 'out_pos_copy')
    save_df_to_csv(out_pos, 'TestedPositiveTestDetails.csv')

    # ====
    # out_pos step 2 filter patient that has assessment
    # ====
    with utils.Timer('Mapping index asmt to pos only'):
        test2pat = prst.foreign_key_is_in_primary_key(
            out_pos['patient_id'].data[:],
            foreign_key=src_asmt['patient_id'].data[:])

    # Fields already present in out_pos are cleared and refilled from the
    # assessments; missing ones are created first.
    for f in [
            'created_at', 'patient_id', 'treatment', 'other_symptoms',
            'country_code', 'location', 'updated_at'
    ] + list_symptoms:
        #print(f)
        if (f in list(out_pos.keys())):
            out_pos[f].data.clear()
            src_asmt[f].apply_filter(test2pat, target=out_pos[f])
        else:
            src_asmt[f].create_like(out_pos, f)
            src_asmt[f].apply_filter(test2pat, target=out_pos[f])

    # print(len(np.unique(ds.get_reader(out_pos['patient_id'])[:])), len(np.unique(pat_pos[:])))
    print(len(out_pos['patient_id'].get_spans()) - 1, pat_pos_len)
    # Count each distinct 'other_symptoms' value and dump the counts to CSV.
    unique_other, counts = np.unique(out_pos['other_symptoms'].data[:],
                                     return_counts=True)
    dict_other = {'other': unique_other, 'counts': counts}

    df_other = pd.DataFrame.from_dict(dict_other)
    df_other.to_csv('OtherSymptoms.csv')

    #  this is duplicated with 265-273
    # for k in list_symptoms:
    #     print(k)
    #     if k in list(out_pos.keys()):
    #         src_asmt[k].apply_filter(test2pat, target=out_pos[k])
    #     else:
    #         src_asmt[k].create_like(out_pos, k)
    #         src_asmt[k].apply_filter(test2pat, target=out_pos[k])
    # reader = ds.get_reader(src_asmt[k])
    # writer = reader.get_writer(out_pos, k,ts,write_mode='overwrite')
    # ds.apply_filter(test2pat, reader,writer)

    # ====
    # summarize the symptoms
    # ====

    # sum_symp = np.zeros(len(out_pos['patient_id'].data))
    # for k in list_symptoms:
    #     values = out_pos[k].data[:]
    #     if k == 'fatigue' or k == 'shortness_of_breath':
    #         values = np.where(values > 2, np.ones_like(values), np.zeros_like(values))
    #     else:
    #         values = np.where(values > 1, np.ones_like(values), np.zeros_like(values))
    #     sum_symp += values
    sum_symp = sum_up_symptons_v1(out_pos)
    out_pos.create_numeric('sum_symp', 'int').data.write(sum_symp)
    # writer = ds.get_numeric_writer(out_pos, 'sum_symp', dtype='int', timestamp=ts, writemode='overwrite')
    # writer.write(sum_symp)

    # ====
    # filter the symptoms
    # ====
    # symp_flat = np.where(out_pos['sum_symp'].data[:] < 1, 0, 1)
    # spans = out_pos['patient_id'].get_spans()
    # print('Number definitie positive is', len(spans) - 1)
    #
    # # Get the first index at which the hct field is maximum
    # firstnz_symp_ind = ds.apply_spans_index_of_max(spans, symp_flat)
    # max_symp_check = symp_flat[firstnz_symp_ind]
    # # Get the index of first element of patient_id when sorted
    #
    # filt_asymptomatic = max_symp_check == 0
    # print('Number asymptomatic is ', len(spans) - 1 - np.sum(max_symp_check), np.sum(filt_asymptomatic))
    #
    # first_symp_ind = spans[:-1]
    # not_healthy_first = first_symp_ind != firstnz_symp_ind
    # print('Number not healthy first is ', len(spans) - 1 - np.sum(not_healthy_first))
    #
    # spans_valid = ds.apply_filter(not_healthy_first, first_symp_ind)
    # pat_sel = ds.apply_indices(spans_valid, out_pos['patient_id'].data[:])
    # filt_sel = prst.foreign_key_is_in_primary_key(pat_sel, out_pos['patient_id'].data[:])
    #
    # spans_asymp = ds.apply_filter(filt_asymptomatic, first_symp_ind)
    # spans_asymp is used as an index into out_pos['patient_id']; filt_sel
    # is used as a row filter on out_pos.
    spans_asymp, filt_sel = filter_asymp_and_firstnz_v1(s, out_pos)
    # ====
    # out_pos step 3 filter asymptomatic
    # ====
    pat_asymp = out_pos['patient_id'].apply_index(spans_asymp)
    #pat_asymp = ds.apply_indices(spans_asymp, ds.get_reader(out_pos['patient_id']))
    filt_pata = prst.foreign_key_is_in_primary_key(
        pat_asymp.data[:], out_pos['patient_id'].data[:])

    # ====
    # out_pos_hs step 1 copy from out_pos and apply filter not healthy first
    # ====
    out_pos_hs = output.create_dataframe('out_pos_hs')
    for k in list_symptoms + [
            'created_at', 'patient_id', 'sum_symp', 'country_code', 'location',
            'treatment', 'updated_at'
    ]:
        #print(k)
        out_pos[k].create_like(out_pos_hs, k)
        out_pos[k].apply_filter(filt_sel, target=out_pos_hs[k])
        # reader = ds.get_reader(out_pos[k])
        # writer = reader.get_writer(out_pos_hs, k, ts)
        # ds.apply_filter(filt_sel, reader, writer)

    # dict_final = {}
    # for k in out_pos_hs.keys():
    #     dict_final[k] = out_pos_hs[k].data[:]
    #
    # df_final = pd.DataFrame.from_dict(dict_final)
    # df_final.to_csv(path + '/PositiveSympStartHealthyAllSymptoms.csv')
    save_df_to_csv(out_pos_hs, 'PositiveSympStartHealthyAllSymptoms.csv')

    print('out_pos_asymp')
    # ====
    # out_pos_as 1 out_pos filter asymptomatic
    # ====
    # NOTE(review): unlike the out_pos_hs loop above, this field list omits
    # 'updated_at' - confirm this is intended.
    out_pos_as = output.create_dataframe('out_pos_asymp')
    for k in list_symptoms + [
            'created_at', 'patient_id', 'sum_symp', 'country_code', 'location',
            'treatment'
    ]:
        out_pos[k].create_like(out_pos_as, k)
        out_pos[k].apply_filter(filt_pata, target=out_pos_as[k])
        # reader = ds.get_reader(out_pos[k])
        # writer = reader.get_writer(out_pos_as, k, ts)
        # ds.apply_filter(filt_pata, reader, writer)

    # dict_finala = {}
    # for k in out_pos_as.keys():
    #     dict_finala[k] = out_pos_as[k].data[:]
    #
    # df_finala = pd.DataFrame.from_dict(dict_finala)
    # df_finala.to_csv(path + '/PositiveAsympAllSymptoms.csv')
    save_df_to_csv(out_pos_as, 'PositiveAsympAllSymptoms.csv')

    # Based on the final selected patient_id, select the appropriate rows of the patient_table
    src_pat = source['patients']
    filt_pat = prst.foreign_key_is_in_primary_key(
        out_pos_hs['patient_id'].data[:], src_pat['id'].data[:])
    # Patient fields copied into both 'patient_pos' and 'patient_asymp'.
    list_interest = [
        'has_cancer', 'has_diabetes', 'has_lung_disease', 'has_heart_disease',
        'has_kidney_disease', 'has_asthma', 'race_is_other',
        'race_is_prefer_not_to_say', 'race_is_uk_asian', 'race_is_uk_black',
        'race_is_uk_chinese', 'race_is_uk_middle_eastern',
        'race_is_uk_mixed_other', 'race_is_uk_mixed_white_black',
        'race_is_uk_white', 'race_is_us_asian', 'race_is_us_black',
        'race_is_us_hawaiian_pacific', 'race_is_us_indian_native',
        'race_is_us_white', 'race_other', 'year_of_birth', 'is_smoker',
        'smoker_status', 'bmi_clean', 'is_in_uk_twins',
        'healthcare_professional', 'gender', 'id', 'blood_group', 'lsoa11cd',
        'already_had_covid'
    ]
    out_pat = output.create_dataframe('patient_pos')
    print('patient_pos')
    for k in list_interest:
        src_pat[k].create_like(out_pat, k)
        src_pat[k].apply_filter(filt_pat, target=out_pat[k])
        # reader = ds.get_reader(src_pat[k])
        # writer = reader.get_writer(out_pat, k, ts)
        # ds.apply_filter(filt_pat, reader, writer)

    # dict_pat = {}
    # for k in list_interest:
    #     values = out_pat[k].data[:]
    #     dict_pat[k] = values
    #
    # df_pat = pd.DataFrame.from_dict(dict_pat)
    # df_pat.to_csv(path + '/PositiveSympStartHealthy_PatDetails.csv')
    save_df_to_csv(out_pat, 'PositiveSympStartHealthy_PatDetails.csv')

    #spans_asymp = ds.apply_filter(filt_asymptomatic, first_symp_ind)
    #pat_asymp = ds.apply_indices(spans_asymp, ds.get_reader(out_pos['patient_id']))
    # Same expression as the earlier pat_asymp computation, evaluated again.
    pat_asymp = out_pos['patient_id'].apply_index(spans_asymp)
    filt_asymp = prst.foreign_key_is_in_primary_key(pat_asymp.data[:],
                                                    src_pat['id'].data[:])
    out_pat_asymp = output.create_dataframe('patient_asymp')
    for k in list_interest:
        src_pat[k].create_like(out_pat_asymp, k)
        src_pat[k].apply_filter(filt_asymp, target=out_pat_asymp[k])
        # reader = ds.get_reader(src_pat[k])
        # writer = reader.get_writer(out_pat_asymp, k, ts)
        # ds.apply_filter(filt_asymp, reader, writer)

    # dict_pata = {}
    # for k in list_interest:
    #     values = out_pat_asymp[k].data[:]
    #     dict_pata[k] = values
    #
    # df_pata = pd.DataFrame.from_dict(dict_pata)
    # df_pata.to_csv(path + '/PositiveAsymp_PatDetails.csv')
    save_df_to_csv(out_pat_asymp, 'PositiveAsymp_PatDetails.csv')

Example #8

Show file

File: csv_reader_speedup.py Project: KCL-BMEIS/ExeTera

def read_file_using_fast_csv_reader(source,
                                    chunk_row_size,
                                    column_offsets,
                                    index_map,
                                    field_importer_list=None,
                                    stop_after_rows=None):
    """
    Parse a CSV file chunk by chunk and hand the columns to field importers.

    The file is read as raw bytes in chunks whose byte size comes from
    ``get_file_stat``. Each chunk is parsed by ``fast_csv_reader`` into the
    shared ``column_inds`` / ``column_vals`` buffers, after which every
    non-empty importer writes its part. When the parser reports that the
    index or value buffer is full, that buffer is reallocated larger and
    parsing resumes from the reported offset within the same chunk instead
    of re-reading the file. Progress and total time are printed.

    :param source: File to read; passed to ``get_file_stat`` and
        ``np.fromfile``.
    :param chunk_row_size: Rows per chunk; doubled before use.
    :param column_offsets: Array of offsets into the value buffer; its last
        element gives the buffer size. Grown locally when a column fills up.
    :param index_map: One entry per imported field; each entry is passed to
        the importer at the same position.
    :param field_importer_list: Optional sequence of importers aligned with
        ``index_map``; ``None`` or falsy entries are skipped.
    :param stop_after_rows: If truthy, stop once at least this many rows
        have been accumulated.
    :return: None.
    """
    ESCAPE_VALUE = np.frombuffer(b'"', dtype='S1')[0][0]
    SEPARATOR_VALUE = np.frombuffer(b',', dtype='S1')[0][0]
    NEWLINE_VALUE = np.frombuffer(b'\n', dtype='S1')[0][0]
    WHITE_SPACE_VALUE = np.frombuffer(b' ', dtype='S1')[0][0]

    chunk_row_size *= 2
    time0 = time.time()

    total_byte_size, count_columns, count_rows, chunk_byte_size = get_file_stat(
        source, chunk_row_size)

    column_val_total_count = column_offsets[-1]

    with utils.Timer("read_file_using_fast_csv_reader"):
        chunk_index = 0
        hasHeader = True

        accumulated_written_rows = 0

        # initialize column_inds, column_vals outside of the while-loop
        column_inds = np.zeros(
            (count_columns, count_rows + 1),
            dtype=np.int64)  # add one more row for initial index 0

        column_vals = np.zeros(np.int64(column_val_total_count),
                               dtype=np.uint8)

        # growth factor used when a buffer has to be reallocated
        larger_factor = 2
        is_indices_full, is_values_full = False, False

        content = None
        start_index = 0

        ch = 0
        while chunk_index < total_byte_size:
            if stop_after_rows and accumulated_written_rows >= stop_after_rows:
                break

            # Read the next chunk only when the previous pass completed; if
            # a buffer filled up, the same content is parsed again from
            # start_index rather than being re-read from disk.
            if not is_indices_full and not is_values_full:
                content = np.fromfile(source,
                                      count=chunk_byte_size,
                                      offset=chunk_index,
                                      dtype=np.uint8)
                start_index = 0

                length_content = content.shape[0]
                if length_content == 0:
                    break

                # check if there's a newline at EOF in the last chunk; add
                # one if it's missing
                if chunk_index + length_content == total_byte_size and content[
                        -1] != NEWLINE_VALUE:
                    content = np.append(content, NEWLINE_VALUE)

            offset_pos, written_row_count, is_indices_full, is_values_full, val_full_col_idx = fast_csv_reader(
                content, start_index, column_inds, column_vals, column_offsets,
                hasHeader, ESCAPE_VALUE, SEPARATOR_VALUE, NEWLINE_VALUE,
                WHITE_SPACE_VALUE)

            # convert and write
            for ith, i_c in enumerate(index_map):
                if field_importer_list and field_importer_list[ith]:
                    field_importer_list[ith].transform_and_write_part(
                        column_inds, column_vals, column_offsets, i_c,
                        written_row_count)

            # make column_inds larger if it gets full before reaching the
            # end of the chunk
            if is_indices_full:
                indices_row_count = column_inds.shape[1] - 1
                column_inds = np.zeros(
                    (count_columns,
                     np.uint32(indices_row_count * larger_factor + 1)),
                    dtype=np.int64)

            # make column_vals larger if it gets full before reaching the
            # end of the chunk: widen the full column's region and shift all
            # later offsets by the same amount
            if is_values_full and val_full_col_idx != -1:
                col_val_count = column_offsets[
                    val_full_col_idx + 1] - column_offsets[val_full_col_idx]
                delta = col_val_count * (larger_factor - 1)
                column_offsets = np.concatenate(
                    (column_offsets[:val_full_col_idx + 1],
                     column_offsets[val_full_col_idx + 1:] + np.int64(delta)))
                column_val_total_count = column_offsets[-1]
                column_vals = np.zeros(np.int64(column_val_total_count),
                                       dtype=np.uint8)

            # Resume inside the current content after a buffer overflow;
            # otherwise advance the file offset to the next chunk.
            if is_indices_full or is_values_full:
                start_index = offset_pos
            else:
                chunk_index += offset_pos

            hasHeader = False
            accumulated_written_rows += written_row_count
            ch += 1

            print(
                f"{ch} chunks, {accumulated_written_rows} accumulated_written_rows parsed in {time.time() - time0}s"
            )

        # Flush at the end, with the same guard as the write step:
        # field_importer_list defaults to None and may hold falsy entries.
        if field_importer_list:
            for ith in range(len(index_map)):
                if field_importer_list[ith]:
                    field_importer_list[ith].flush()

    print(f"Total time {time.time() - time0}s")

Example #9

Show file

File: other_symptoms.py Project: deng113jie/ExeTeraCovid

        ofilter = otherend - otherstart > 0
        print("ofilter:", ofilter.sum(), len(ofilter))
        cfilter = cc == b"GB"
        print("cfilter:", cfilter.sum(), len(cfilter))
        filter_ = ofilter & cfilter
        print("filter_:", filter_.sum(), len(filter_))

        filt_asmt = tmp.create_group('filt_assessments')
        filt_other_symptoms = other.create_like(filt_asmt, 'other_symptoms')
        s.apply_filter(filter_, other, filt_other_symptoms)
        patient_id = s.get(hf['assessments']['patient_id'])
        filt_patient_id = patient_id.create_like(filt_asmt, 'patient_id')
        s.apply_filter(filter_, patient_id, filt_patient_id)
        print('filtered symptoms len =', len(filt_other_symptoms.data))

        with utils.Timer("merging test_results"):
            p_to_a = s.create_numeric(tmp, 'p_to_a', 'int64')
            a_test_results = s.create_numeric(tmp, 'a_test_results', 'int8')
            s.ordered_merge_left(left_on=s.get(
                tmp['filt_assessments']['patient_id']),
                                 right_on=s.get(hf['patients']['id']),
                                 left_field_sources=(p_test_results, ),
                                 left_field_sinks=(a_test_results, ),
                                 left_to_right_map=p_to_a,
                                 right_unique=True)
        print(len(a_test_results.data))
        print(np.unique(a_test_results.data[:], return_counts=True))

        a_test_results_ = a_test_results.data[:]
        #     filtered_test_results = test_results[filter_]
        #     print("filtered tests:", np.unique(filtered_test_results, return_counts=True))

Example #10

Show file

File: postprocess.py Project: deng113jie/ExeTeraCovid

def postprocess(dataset, destination, timestamp=None, flags=None):
    """Sort the tables of ``dataset`` into ``destination`` and derive fields.

    The stages run in this order, each only when its table is present in
    ``dataset``:

    * patients: drop duplicate ids, sort on ``id``, then add age, cleaned
      weight / height / bmi and ``health_worker_with_contact`` fields;
    * assessments: sort on ``(patient_id, created_at)``, build the
      ``assessment_patient_id_fkey`` index, clean temperatures and check
      symptom consistency;
    * tests: sort on ``(patient_id, created_at)``;
    * diet: sort on ``(patient_id, display_name, id)``;
    * daily assessments (only when ``'daily'`` is in ``flags``): aggregate
      assessment fields over spans of (patient index, created_at_day);
    * patient-level metrics: assessment / daily-assessment / test / diet
      counts, first and last assessment day, and maximum test results.

    Args:
        dataset: Source container; must support ``keys()`` and indexing by
            table name (``'patients'``, ``'assessments'``, ``'tests'``,
            ``'diet'``).
        destination: Container that receives the sorted tables and all
            derived fields.
        timestamp: Accepted for interface compatibility; not used by this
            function.
        flags: Optional collection of option strings. Only membership of
            ``'daily'`` is checked. ``None`` is treated as an empty set.

    Returns:
        None. All results are written into ``destination``.

    NOTE(review): ``patients_dest``, ``assessments_dest`` and ``diet_dest``
    are only bound when the corresponding group is not already present in
    ``destination``; the later metric stages use them unconditionally, so
    running against an already-populated destination raises ``NameError``.
    """
    if flags is None:
        flags = set()

    do_daily_asmts = 'daily' in flags
    has_patients = 'patients' in dataset.keys()
    has_assessments = 'assessments' in dataset.keys()
    has_tests = 'tests' in dataset.keys()
    has_diet = 'diet' in dataset.keys()

    # Both predicates currently enable every stage regardless of the flags.
    sort_enabled = lambda x: True
    process_enabled = lambda x: True

    sort_patients = sort_enabled(flags) and True
    sort_assessments = sort_enabled(flags) and True
    sort_tests = sort_enabled(flags) and True
    sort_diet = sort_enabled(flags) and True

    make_assessment_patient_id_fkey = process_enabled(flags) and True
    year_from_age = process_enabled(flags) and True
    clean_weight_height_bmi = process_enabled(flags) and True
    health_worker_with_contact = process_enabled(flags) and True
    clean_temperatures = process_enabled(flags) and True
    check_symptoms = process_enabled(flags) and True
    create_daily = process_enabled(flags) and do_daily_asmts
    make_patient_level_assessment_metrics = process_enabled(flags) and True
    make_patient_level_daily_assessment_metrics = process_enabled(
        flags) and do_daily_asmts
    make_new_test_level_metrics = process_enabled(flags) and True
    make_diet_level_metrics = True
    make_healthy_diet_index = True

    # ds = DataStore(timestamp=timestamp)
    s = Session()

    # patients ================================================================

    sorted_patients_src = None

    if has_patients:
        patients_src = dataset['patients']

        if 'patients' not in destination.keys():
            patients_dest = s.get_or_create_group(destination, 'patients')
            sorted_patients_src = patients_dest

            # Patient sort
            # ============
            if sort_patients:
                duplicate_filter = \
                    persistence.filter_duplicate_fields(s.get(patients_src['id']).data[:])

                # Copy every patient field into the destination through the
                # duplicate filter, timing each field individually.
                for k in patients_src.keys():
                    t0 = time.time()
                    r = s.get(patients_src[k])
                    w = r.create_like(patients_dest, k)
                    s.apply_filter(duplicate_filter, r, w)
                    print(f"'{k}' filtered in {time.time() - t0}s")

                print(np.count_nonzero(duplicate_filter == True),
                      np.count_nonzero(duplicate_filter == False))
                # The filtered fields are sorted in place, hence 'overwrite'.
                sort_keys = ('id', )
                s.sort_on(patients_dest,
                          patients_dest,
                          sort_keys,
                          write_mode='overwrite')

            # Patient processing
            # ==================
            if year_from_age:
                # The bounds passed below are 16 and 90; the message used to
                # say 18, which did not match the filter actually applied.
                log("year of birth -> age; 16 to 90 filter")
                t0 = time.time()
                yobs = s.get(patients_dest['year_of_birth'])
                yob_filter = s.get(patients_dest['year_of_birth_valid'])
                age = s.create_numeric(patients_dest, 'age', 'uint32')
                age_filter = s.create_numeric(patients_dest, 'age_filter',
                                              'bool')
                age_16_to_90 = s.create_numeric(patients_dest,
                                                '16_to_90_years', 'bool')
                print('year_of_birth:', patients_dest['year_of_birth'])
                for k in patients_dest['year_of_birth'].attrs.keys():
                    print(k, patients_dest['year_of_birth'].attrs[k])
                calculate_age_from_year_of_birth_v1(yobs, yob_filter, 16, 90,
                                                    age, age_filter,
                                                    age_16_to_90, 2020)
                log(f"completed in {time.time() - t0}")

                print('age_filter count:',
                      np.sum(patients_dest['age_filter']['values'][:]))
                print('16_to_90_years count:',
                      np.sum(patients_dest['16_to_90_years']['values'][:]))

            if clean_weight_height_bmi:
                log("height / weight / bmi; standard range filters")
                t0 = time.time()

                weights_clean = s.create_numeric(patients_dest,
                                                 'weight_kg_clean', 'float32')
                weights_filter = s.create_numeric(patients_dest,
                                                  '40_to_200_kg', 'bool')
                heights_clean = s.create_numeric(patients_dest,
                                                 'height_cm_clean', 'float32')
                heights_filter = s.create_numeric(patients_dest,
                                                  '110_to_220_cm', 'bool')
                bmis_clean = s.create_numeric(patients_dest, 'bmi_clean',
                                              'float32')
                bmis_filter = s.create_numeric(patients_dest, '15_to_55_bmi',
                                               'bool')

                weight_height_bmi_v1(s, 40, 200, 110, 220, 15, 55, None, None,
                                     None, None, patients_dest['weight_kg'],
                                     patients_dest['weight_kg_valid'],
                                     patients_dest['height_cm'],
                                     patients_dest['height_cm_valid'],
                                     patients_dest['bmi'],
                                     patients_dest['bmi_valid'], weights_clean,
                                     weights_filter, None, heights_clean,
                                     heights_filter, None, bmis_clean,
                                     bmis_filter, None)
                log(f"completed in {time.time() - t0}")

            if health_worker_with_contact:
                with utils.Timer("health_worker_with_contact field"):
                    #writer = ds.get_categorical_writer(patients_dest, 'health_worker_with_contact', 'int8')
                    combined_hcw_with_contact_v1(
                        s, s.get(patients_dest['healthcare_professional']),
                        s.get(patients_dest['contact_health_worker']),
                        s.get(patients_dest['is_carer_for_community']),
                        patients_dest, 'health_worker_with_contact')

    # assessments =============================================================

    sorted_assessments_src = None
    if has_assessments:
        assessments_src = dataset['assessments']
        if 'assessments' not in destination.keys():
            assessments_dest = s.get_or_create_group(destination,
                                                     'assessments')
            sorted_assessments_src = assessments_dest

            if sort_assessments:
                sort_keys = ('patient_id', 'created_at')
                with utils.Timer("sorting assessments"):
                    s.sort_on(assessments_src, assessments_dest, sort_keys)

            if has_patients:
                if make_assessment_patient_id_fkey:
                    print(
                        "creating 'assessment_patient_id_fkey' foreign key index for 'patient_id'"
                    )
                    t0 = time.time()
                    patient_ids = s.get(sorted_patients_src['id'])
                    assessment_patient_ids =\
                        s.get(sorted_assessments_src['patient_id'])
                    assessment_patient_id_fkey =\
                        s.create_numeric(assessments_dest, 'assessment_patient_id_fkey', 'int64')
                    s.get_index(patient_ids.data[:],
                                assessment_patient_ids.data[:],
                                assessment_patient_id_fkey)
                    print(f"completed in {time.time() - t0}s")

            if clean_temperatures:
                print("clean temperatures")
                t0 = time.time()
                temps = s.get(sorted_assessments_src['temperature'])
                temp_units = s.get(sorted_assessments_src['temperature_unit'])
                temps_valid = s.get(
                    sorted_assessments_src['temperature_valid'])
                dest_temps = temps.create_like(assessments_dest,
                                               'temperature_c_clean')
                dest_temps_valid = temps_valid.create_like(
                    assessments_dest, 'temperature_35_to_42_inclusive')
                dest_temps_modified = temps_valid.create_like(
                    assessments_dest, 'temperature_modified')
                validate_temperature_v1(s, 35.0, 42.0, temps, temp_units,
                                        temps_valid, dest_temps,
                                        dest_temps_valid, dest_temps_modified)
                print(f"temperature cleaning done in {time.time() - t0}")

            if check_symptoms:
                print('check inconsistent health_status')
                t0 = time.time()
                check_inconsistent_symptoms_v1(s, sorted_assessments_src,
                                               assessments_dest)
                print(time.time() - t0)

    # tests ===================================================================

    if has_tests:
        if sort_tests:
            tests_src = dataset['tests']
            tests_dest = s.get_or_create_group(destination, 'tests')
            sort_keys = ('patient_id', 'created_at')
            s.sort_on(tests_src, tests_dest, sort_keys)

    # diet ====================================================================

    if has_diet:
        diet_src = dataset['diet']
        if 'diet' not in destination.keys():
            diet_dest = s.get_or_create_group(destination, 'diet')
            if sort_diet:
                sort_keys = ('patient_id', 'display_name', 'id')
                s.sort_on(diet_src, diet_dest, sort_keys)

    if has_assessments and do_daily_asmts:
        daily_assessments_dest = s.get_or_create_group(
            destination, 'daily_assessments')

    # post process patients
    # TODO: need an transaction table

    # Diagnostic output only; each table is printed only when it exists so a
    # dataset lacking one of them no longer raises NameError / KeyError here.
    if has_patients:
        print(patients_src.keys())
    if has_assessments:
        print(dataset['assessments'].keys())
    if has_tests:
        print(dataset['tests'].keys())

    # Daily assessments
    # =================

    if has_assessments and create_daily:
        print("generate daily assessments")
        patient_ids = s.get(sorted_assessments_src['patient_id'])
        created_at_days = s.get(sorted_assessments_src['created_at_day'])
        raw_created_at_days = created_at_days.data[:]

        # Prefer a foreign key already present in the source; otherwise use
        # the one written to the destination earlier in this function.
        if 'assessment_patient_id_fkey' in assessments_src.keys():
            patient_id_index = assessments_src[
                'assessment_patient_id_fkey']
        else:
            patient_id_index = assessments_dest[
                'assessment_patient_id_fkey']
        patient_id_indices = s.get(patient_id_index)
        raw_patient_id_indices = patient_id_indices.data[:]

        print("Calculating patient id index spans")
        t0 = time.time()
        patient_id_index_spans = s.get_spans(
            fields=(raw_patient_id_indices, raw_created_at_days))
        print(
            f"Calculated {len(patient_id_index_spans)-1} spans in {time.time() - t0}s"
        )

        print("Applying spans to 'health_status'")
        t0 = time.time()
        # Per-field aggregation overrides; a value of None means the field
        # is deliberately left out of the daily table.
        default_behaviour_overrides = {
            'id': s.apply_spans_last,
            'patient_id': s.apply_spans_last,
            'patient_index': s.apply_spans_last,
            'created_at': s.apply_spans_last,
            'created_at_day': s.apply_spans_last,
            'updated_at': s.apply_spans_last,
            'updated_at_day': s.apply_spans_last,
            'version': s.apply_spans_max,
            'country_code': s.apply_spans_first,
            'date_test_occurred': None,
            'date_test_occurred_guess': None,
            'date_test_occurred_day': None,
            'date_test_occurred_set': None,
        }
        for k in sorted_assessments_src.keys():
            t1 = time.time()
            reader = s.get(sorted_assessments_src[k])

            # Choose the aggregation: an explicit override wins, otherwise
            # it is selected from the reader's type.
            if k in default_behaviour_overrides:
                apply_span_fn = default_behaviour_overrides[k]
                if apply_span_fn is None:
                    print(f"  Skipping field {k}")
                    continue
            elif isinstance(reader, fields.CategoricalField):
                apply_span_fn = s.apply_spans_max
            elif isinstance(reader, rw.IndexedStringReader):
                apply_span_fn = s.apply_spans_concat
            elif isinstance(reader, rw.NumericReader):
                apply_span_fn = s.apply_spans_max
            else:
                print(f"  No function for {k}")
                continue

            apply_span_fn(patient_id_index_spans, reader,
                          reader.create_like(daily_assessments_dest, k))
            print(f"  Field {k} aggregated in {time.time() - t1}s")

        print(f"apply_spans completed in {time.time() - t0}s")

    if has_patients and has_assessments:
        if make_patient_level_assessment_metrics:
            if 'assessment_patient_id_fkey' in assessments_dest:
                src = assessments_dest['assessment_patient_id_fkey']
            else:
                src = assessments_src['assessment_patient_id_fkey']
            assessment_patient_id_fkey = s.get(src)
            # generate spans from the assessment-space patient_id foreign key
            spans = s.get_spans(field=assessment_patient_id_fkey.data[:])

            ids = s.get(patients_dest['id'])

            print('calculate assessment counts per patient')
            t0 = time.time()
            writer = s.create_numeric(patients_dest, 'assessment_count',
                                      'uint32')
            aggregated_counts = s.apply_spans_count(spans)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated assessment counts per patient in {time.time() - t0}"
            )

            print('calculate first assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest,
                                           'first_assessment_day', 10)
            aggregated_counts = s.apply_spans_first(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated first assessment days per patient in {time.time() - t0}"
            )

            print('calculate last assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest,
                                           'last_assessment_day', 10)
            aggregated_counts = s.apply_spans_last(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer,
                   spans)
            print(
                f"calculated last assessment days per patient in {time.time() - t0}"
            )

            print('calculate maximum assessment test result per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['tested_covid_positive'])
            writer = reader.create_like(patients_dest,
                                        'max_assessment_test_result')
            max_result_value = s.apply_spans_max(spans, reader)
            s.join(ids, assessment_patient_id_fkey, max_result_value, writer,
                   spans)
            print(
                f"calculated maximum assessment test result in {time.time() - t0}"
            )

    # This stage writes into the patients group, so it also needs patients.
    if (has_patients and has_assessments and do_daily_asmts
            and make_patient_level_daily_assessment_metrics):
        print(
            "creating 'daily_assessment_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        daily_assessment_patient_ids =\
            s.get(daily_assessments_dest['patient_id'])
        daily_assessment_patient_id_fkey =\
            s.create_numeric(daily_assessments_dest, 'daily_assessment_patient_id_fkey', 'int64')
        s.get_index(patient_ids, daily_assessment_patient_ids,
                    daily_assessment_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")

        spans = s.get_spans(field=s.get(
            daily_assessments_dest['daily_assessment_patient_id_fkey']))

        # Fetched here rather than relying on the binding made inside the
        # patient-level assessment metrics block above.
        ids = s.get(patients_dest['id'])

        print('calculate daily assessment counts per patient')
        t0 = time.time()
        writer = s.create_numeric(patients_dest, 'daily_assessment_count',
                                  'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        daily_assessment_patient_id_fkey =\
            s.get(daily_assessments_dest['daily_assessment_patient_id_fkey'])
        s.join(ids, daily_assessment_patient_id_fkey, aggregated_counts,
               writer, spans)
        print(
            f"calculated daily assessment counts per patient in {time.time() - t0}"
        )

    # Test metrics are joined onto patients, so patients must be present.
    if has_patients and has_tests and make_new_test_level_metrics:
        print(
            "creating 'test_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        test_patient_ids = s.get(tests_dest['patient_id'])
        test_patient_id_fkey = s.create_numeric(tests_dest,
                                                'test_patient_id_fkey',
                                                'int64')
        s.get_index(patient_ids, test_patient_ids, test_patient_id_fkey)
        test_patient_id_fkey = s.get(tests_dest['test_patient_id_fkey'])
        spans = s.get_spans(field=test_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")

        # Previously `ids` was only bound when assessments were processed,
        # so a dataset with tests but no assessments failed with NameError.
        ids = s.get(patients_dest['id'])

        print('calculate test_counts per patient')
        t0 = time.time()
        writer = s.create_numeric(patients_dest, 'test_count', 'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        s.join(ids, test_patient_id_fkey, aggregated_counts, writer, spans)
        print(f"calculated test counts per patient in {time.time() - t0}")

        print('calculate test_result per patient')
        t0 = time.time()
        test_results = s.get(tests_dest['result'])
        writer = test_results.create_like(patients_dest, 'max_test_result')
        aggregated_results = s.apply_spans_max(spans, test_results)
        s.join(ids, test_patient_id_fkey, aggregated_results, writer, spans)
        print(f"calculated max_test_result per patient in {time.time() - t0}")

    # Diet counts are merged onto patients, so patients must be present.
    if has_patients and has_diet and make_diet_level_metrics:
        with utils.Timer("Making patient-level diet questions count",
                         new_line=True):
            d_pids_ = s.get(diet_dest['patient_id']).data[:]
            d_pid_spans = s.get_spans(d_pids_)
            d_distinct_pids = s.apply_spans_first(d_pid_spans, d_pids_)
            d_pid_counts = s.apply_spans_count(d_pid_spans)
            p_diet_counts = s.create_numeric(patients_dest, 'diet_counts',
                                             'int32')
            s.merge_left(left_on=s.get(patients_dest['id']).data[:],
                         right_on=d_distinct_pids,
                         right_fields=(d_pid_counts, ),
                         right_writers=(p_diet_counts, ))

Example #11

Show file

File: journal.py Project: clyyuanzi-london/ExeTera

def journal_table(session, schema, old_src, new_src, src_pk, result):
    """Merge ``old_src`` and ``new_src`` field by field into ``result``.

    Both tables are ordered by primary key (the old table additionally by
    ``j_valid_from``), row maps between them are generated, and every
    field common to both is compared to decide which rows of the new table
    are kept. The merged contents of each compared field are then written
    to a like-typed field in ``result``.

    Args:
        session: Session used to fetch fields, build sort indices and apply
            them.
        schema: Object whose ``fields.keys()`` gives the order in which the
            common fields are processed; fields not in it are ignored.
        old_src: Existing journalled table; must contain ``src_pk`` and
            ``j_valid_from``.
        new_src: Incoming table; must contain ``src_pk``.
        src_pk: Name of the primary key field.
        result: Destination group for the merged fields.

    Returns:
        None. Merged fields are written to ``result`` and summary counts
        are printed.

    NOTE(review): ``src_pk``, ``j_valid_from`` and ``j_valid_to`` are
    skipped in both passes, so this function does not write them to
    ``result``.
    """
    journal_columns = ('j_valid_from', 'j_valid_to')

    common_keys = set(old_src.keys()).intersection(new_src.keys())
    # discard() rather than remove(): a journalling column that is not
    # common to both tables must not raise KeyError here.
    for column in journal_columns:
        common_keys.discard(column)

    with utils.Timer("sorting old ids"):
        old_ids = session.get(old_src[src_pk])
        old_ids_ = old_ids.data[:]
        old_ids_valid_from = session.get(old_src['j_valid_from']).data[:]
        old_sorted_index = session.dataset_sort_index((old_ids_, old_ids_valid_from))
    old_count = len(old_ids_)

    with utils.Timer("sorting new_ids"):
        new_ids_ = session.get(new_src[src_pk]).data[:]
        new_sorted_index = session.dataset_sort_index((new_ids_,))
    new_count = len(new_ids_)

    # print("old_ids:", old_ids_[old_sorted_index[:20]])
    # print("new_ids:", new_ids_[new_sorted_index[:20]])

    # get the row maps for rows that we need to compare
    with utils.Timer("generating row_maps for merging"):
        old_ids_ = old_ids_[old_sorted_index]
        new_ids_ = new_ids_[new_sorted_index]
        old_map, new_map = ops.ordered_generate_journalling_indices(old_ids_, new_ids_)

    # `np.bool` was removed from NumPy; the builtin `bool` is the same dtype.
    to_keep = np.zeros(len(old_map), dtype=bool)

    # Process fields in schema order, leaving out the primary key and the
    # journalling columns, which are never compared or merged.
    excluded = (src_pk,) + journal_columns
    journalled_keys = [k for k in schema.fields.keys()
                       if k in common_keys and k not in excluded]
    print("old_map:", old_map)
    print("new_map:", new_map)

    # Pass 1: compare every field; the ops routines update `to_keep` in place.
    for k in journalled_keys:
        old_f = session.get(old_src[k])
        new_f = session.get(new_src[k])
        print(k)
        if isinstance(old_f, flds.IndexedStringField):
            old_f_i_, old_f_v_ = session.apply_index(old_sorted_index, old_f)
            new_f_i_, new_f_v_ = session.apply_index(new_sorted_index, new_f)
            ops.compare_indexed_rows_for_journalling(old_map, new_map,
                                                     old_f_i_, old_f_v_, new_f_i_, new_f_v_,
                                                     to_keep)
        else:
            old_f_ = session.apply_index(old_sorted_index, old_f)
            new_f_ = session.apply_index(new_sorted_index, new_f)
            ops.compare_rows_for_journalling(old_map, new_map, old_f_, new_f_, to_keep)

        print("to_keep:", to_keep.astype(np.uint8))
        print(to_keep.sum(), len(to_keep))

    merged_length = len(old_ids.data) + to_keep.sum()

    # Summary statistics, computed as array reductions instead of a per-row
    # Python loop. Only the first len(old_map) entries of new_map are
    # examined, as the loop this replaces did.
    row_count = len(old_map)
    old_map_ = np.asarray(old_map)
    new_map_ = np.asarray(new_map)[:row_count]
    only_in_new = int(np.count_nonzero(old_map_ == -1))
    only_in_old = int(np.count_nonzero(new_map_ == -1))
    updated = int(np.count_nonzero((old_map_ != -1) & to_keep))
    not_updated = int(np.count_nonzero((new_map_ != -1) & ~to_keep))

    # Pass 2: merge every field into a freshly allocated destination array
    # and write it to `result`.
    for k in journalled_keys:
        old_f = session.get(old_src[k])
        new_f = session.get(new_src[k])
        print(k)
        if isinstance(old_f, flds.IndexedStringField):
            old_f_i_, old_f_v_ = session.apply_index(old_sorted_index, old_f)
            new_f_i_, new_f_v_ = session.apply_index(new_sorted_index, new_f)
            # The index array has one more entry than there are merged rows.
            dest_i_ = np.zeros(merged_length + 1, old_f_i_.dtype)
            val_count = ops.merge_indexed_journalled_entries_count(old_map, new_map, to_keep,
                                                                   old_f_i_, new_f_i_)
            dest_v_ = np.zeros(val_count, old_f_v_.dtype)
            ops.merge_indexed_journalled_entries(old_map, new_map, to_keep,
                                                 old_f_i_, old_f_v_, new_f_i_, new_f_v_,
                                                 dest_i_, dest_v_)
            dest_f = new_f.create_like(result, k)
            dest_f.indices.write(dest_i_)
            dest_f.values.write(dest_v_)

        else:
            old_f_v_ = session.apply_index(old_sorted_index, old_f)
            new_f_v_ = session.apply_index(new_sorted_index, new_f)
            dest_ = np.zeros(merged_length, old_f_v_.dtype)
            ops.merge_journalled_entries(old_map, new_map, to_keep, old_f_v_, new_f_v_, dest_)
            dest_f = new_f.create_like(result, k)
            dest_f.data.write(dest_)

    print("old_count:", old_count)
    print("new_count:", new_count)
    print("only in old:", only_in_old)
    print("only in new:", only_in_new)
    print("updated:", updated)
    print("not updated:", not_updated)
    print("post journal count:", merged_length)

Example #12

Show file

def method_paper_prediction_pipeline(ds, src_data, dest_data, first_timestamp, last_timestamp):
    """Print a day-by-day table of predicted and tested covid positives.

    The pipeline filters tests and assessments, stores the flattened results in
    ``dest_data`` (groups 'flat_tests' and 'flat_asmts'), scores every remaining
    assessment with a fixed linear symptom model, and finally prints, for each
    day counted from ``first_timestamp``, how many patients carry a prediction
    or test status and how many of those are positive.

    Args:
        ds: object supplying the reader/writer and filtering helpers used below
            (``get_reader``, ``apply_filter``, ``get_spans``, ...).
        src_data: mapping with 'patients', 'assessments' and 'tests' groups.
        dest_data: group in which 'flat_tests' and 'flat_asmts' are created or,
            when already present, reused.
        first_timestamp: lower bound as a POSIX timestamp; also day zero of the
            printed table.
        last_timestamp: upper bound applied to the test date fields.

    Returns:
        None. Results are written to ``dest_data`` and printed.
    """
    s_ptnts = src_data['patients']
    s_asmts = src_data['assessments']
    s_tests = src_data['tests']
    print(s_tests.keys())

    # Filter patients to be only from England
    # =======================================

    p_ids_ = ds.get_reader(s_ptnts['id'])[:]
    p_lsoas_ = ds.get_reader(s_ptnts['lsoa11cd'])[:]
    eng_pats = set()
    for p_id, lsoa in zip(p_ids_, p_lsoas_):
        if len(lsoa) > 0 and lsoa[0] == 69:  # 69 == ord('E')
            eng_pats.add(p_id)
    print("eng pats:", len(eng_pats))

    # The guard must test for the group that is actually created here; testing
    # for 'flat_asmts' crashed whenever only one of the two groups existed.
    if "flat_tests" not in dest_data.keys():
        flat_tests = dest_data.create_group('flat_tests')

        # Filter tests
        # ============

        raw_t_cats = ds.get_reader(s_tests['created_at'])[:]
        raw_t_dts = ds.get_reader(s_tests['date_taken_specific'])[:]
        raw_t_dsbs = ds.get_reader(s_tests['date_taken_between_start'])[:]
        raw_t_dsbe = ds.get_reader(s_tests['date_taken_between_end'])[:]

        # remove non GB tests
        test_filter = ds.get_reader(s_tests['country_code'])[:] == b'GB'
        print("standard test filter GB:", np.count_nonzero(test_filter), len(test_filter))

        # remove non england tests
        t_pids_ = ds.get_reader(s_tests['patient_id'])[:]
        # builtin bool: the np.bool alias was removed in NumPy 1.24
        cur_filter = np.zeros(len(t_pids_), dtype=bool)
        for i, t_pid in enumerate(t_pids_):
            cur_filter[i] = t_pid in eng_pats
        test_filter = test_filter & cur_filter
        print("standard test filter Eng:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where no dates are set
        cur_filter = np.logical_not((raw_t_dts == 0) & (raw_t_dsbs == 0) & (raw_t_dsbe == 0))
        test_filter = test_filter & cur_filter
        print("standard test filter 1:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where all three dates are set
        cur_filter = np.logical_not((raw_t_dts != 0) & (raw_t_dsbs != 0) & (raw_t_dsbe != 0))
        test_filter = test_filter & cur_filter
        print("standard test filter 2:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where only one of the date range tests is set
        cur_filter = np.logical_not(((raw_t_dsbs != 0) & (raw_t_dsbe == 0)) |
                                    ((raw_t_dsbs == 0) & (raw_t_dsbe != 0)))
        test_filter = test_filter & cur_filter
        print("standard test filter 3:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where specific date is set but out of range
        cur_filter =\
            (raw_t_dts == 0) | ((raw_t_dts >= first_timestamp) & (raw_t_dts <= last_timestamp))
        test_filter = test_filter & cur_filter
        print("standard test filter 4:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where beginning date is set but out of range
        cur_filter =\
            (raw_t_dsbs == 0) | ((raw_t_dsbs >= first_timestamp) & (raw_t_dsbs <= last_timestamp))
        test_filter = test_filter & cur_filter
        print("standard test filter 5:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where ending date is set but out of range
        cur_filter =\
            (raw_t_dsbe == 0) | ((raw_t_dsbe >= first_timestamp) & (raw_t_dsbe <= last_timestamp))
        test_filter = test_filter & cur_filter
        print("standard test filter 6:", np.count_nonzero(test_filter), len(test_filter))

        # effective test date: the specific date when set, otherwise the
        # midpoint of the between-start / between-end range
        test_timestamps = np.where(raw_t_dts != 0,
                                   raw_t_dts,
                                   raw_t_dsbs + (raw_t_dsbe - raw_t_dsbs) / 2)

        # remove tests where the test date is after the created at date
        cur_filter = test_timestamps <= raw_t_cats
        test_filter = test_filter & cur_filter
        print("standard test filter 7:", np.count_nonzero(test_filter), len(test_filter))

        t_rsts = ds.get_reader(s_tests['result'])
        t_rsts.get_writer(flat_tests, 'result').write(ds.apply_filter(test_filter, t_rsts))
        t_pids = ds.get_reader(s_tests['patient_id'])
        t_pids.get_writer(flat_tests, 'patient_id').write(ds.apply_filter(test_filter, t_pids))
        ds.get_timestamp_writer(flat_tests, 'eff_test_date').write(
            ds.apply_filter(test_filter, test_timestamps))
    else:
        flat_tests = dest_data["flat_tests"]

    symptoms = ('persistent_cough', 'fatigue', 'delirium', 'shortness_of_breath', 'fever',
                'diarrhoea', 'abdominal_pain', 'chest_pain', 'hoarse_voice', 'skipped_meals',
                'loss_of_smell')

    if "flat_asmts" not in dest_data.keys():
        flat_asmts = dest_data.create_group('flat_asmts')

        # Filter assessments
        # ------------------

        # a symptom is stored as True when its raw value reaches its threshold
        symptom_thresholds = {s: 2 for s in symptoms}
        symptom_thresholds['fatigue'] = 3
        symptom_thresholds['shortness_of_breath'] = 3

        with utils.Timer("filter all out of date range assessments and non-uk assessments",
                         new_line=True):
            a_cats = ds.get_reader(s_asmts['created_at'])[:]
            in_date_range = a_cats >= first_timestamp
            in_date_range = in_date_range & (ds.get_reader(s_asmts['country_code'])[:] == b'GB')

            a_pids = ds.get_reader(s_asmts['patient_id'])[:]
            in_eng = np.zeros(len(a_pids), dtype=bool)
            for i, a_pid in enumerate(a_pids):
                if a_pid in eng_pats:
                    in_eng[i] = True
            print("in_eng:", in_eng.sum(), len(in_eng))
            in_date_range = in_date_range & in_eng

        with utils.Timer("get indices of final assessments of each day for each person"):
            f_a_pids = ds.apply_filter(in_date_range, a_pids)
            f_a_catds = ds.apply_filter(in_date_range, ds.get_reader(s_asmts['created_at_day'])[:])
            spans = ds.get_spans(f_a_pids)

            last_daily_asmt_filter = np.zeros(len(f_a_pids), dtype=bool)
            for s in range(len(spans)-1):
                sb = spans[s]
                se = spans[s+1]
                # sub-spans of equal 'created_at_day' inside one patient's span
                subspans = ds.get_spans(f_a_catds[sb:se])
                if s < 3:
                    print(subspans)
                for s2 in range(1, len(subspans)):
                    # flag the last row of each day's sub-span
                    last_daily_asmt_filter[sb + subspans[s2]-1] = True
            print("last_daily_asmt_filter:", last_daily_asmt_filter.sum())
            print(last_daily_asmt_filter[:50])

        with utils.Timer("flattening and filtering symptoms"):
            for s in symptoms:
                reader = ds.get_reader(s_asmts[s])
                writer = ds.get_numeric_writer(flat_asmts, s, 'bool')
                filtered = ds.apply_filter(last_daily_asmt_filter,
                                           ds.apply_filter(in_date_range, reader[:]))
                writer.write(filtered >= symptom_thresholds[s])

        with utils.Timer("flattening and filtering other fields", new_line=True):
            for f in ('id', 'patient_id', 'created_at', 'created_at_day', 'tested_covid_positive'):
                # two passes: first the date/country/England filter, then the
                # last-assessment-of-day filter, overwriting the first result
                reader = ds.get_reader(s_asmts[f])
                writer = reader.get_writer(flat_asmts, f)
                ds.apply_filter(in_date_range, reader, writer)
                reader = ds.get_reader(flat_asmts[f])
                writer = reader.get_writer(flat_asmts, f, write_mode='overwrite')
                ds.apply_filter(last_daily_asmt_filter, reader, writer)
                print("  {}".format(f), len(ds.get_reader(flat_asmts[f])))

        # telemetry only
        for s in symptoms:
            print(s, len(ds.get_reader(flat_asmts[s])),
                  np.count_nonzero(ds.get_reader(flat_asmts[s])[:]))
    else:
        flat_asmts = dest_data["flat_asmts"]

    # Calculate prevalence
    # --------------------

    if 'prediction' not in flat_asmts:
        intercept = -1.19015973
        weights = {'persistent_cough': 0.23186655,
                   'fatigue': 0.56532346,
                   'delirium': -0.12935112,
                   'shortness_of_breath': 0.58273967,
                   'fever': 0.16580974,
                   'diarrhoea': 0.10236126,
                   'abdominal_pain': -0.11204163,
                   'chest_pain': -0.12318634,
                   'hoarse_voice': -0.17818597,
                   'skipped_meals': 0.25902482,
                   'loss_of_smell': 1.82895239}

        with utils.Timer("predicting covid by assessment", new_line=True):
            # weighted sum of the boolean symptom flags plus the intercept
            cumulative = np.zeros(len(ds.get_reader(flat_asmts['persistent_cough'])),
                                  dtype='float64')
            for s in symptoms:
                reader = ds.get_reader(flat_asmts[s])
                cumulative += reader[:] * weights[s]
            cumulative += intercept
            print("  {}".format(len(cumulative)))
            ds.get_numeric_writer(flat_asmts, 'prediction', 'float32',
                                  writemode='overwrite').write(cumulative)
            pos_filter = cumulative > 0.0
            print("pos_filter: ", np.count_nonzero(pos_filter), len(pos_filter))
    else:
        cumulative = ds.get_reader(flat_asmts['prediction'])[:]

    # apply
    # positive test -> imputed positive -> negative test
    spans = ds.get_spans(ds.get_reader(flat_asmts['patient_id'])[:])
    print('spans:', len(spans))

    # count the remaining assessments per day offset from first_timestamp
    first_day = datetime.fromtimestamp(first_timestamp)
    daydict = defaultdict(int)
    with utils.Timer("checking date deltas", new_line=True):
        a_cats = ds.get_reader(flat_asmts['created_at'])[:]
        for a_cat in a_cats:
            daydict[(datetime.fromtimestamp(a_cat) - first_day).days] += 1
        sdaydict = sorted(daydict.items())
        print(sdaydict)

    # build a combined id index for assessments and tests
    # ---------------------------------------------------
    remaining_a_pids = ds.get_reader(flat_asmts['patient_id'])[:]
    remaining_t_pids = ds.get_reader(flat_tests['patient_id'])[:]
    print("pids from assessments and tests:", len(remaining_a_pids), len(remaining_t_pids),
          len(set(remaining_a_pids).union(set(remaining_t_pids))))
    a_pid_index, t_pid_index = ds.get_shared_index((remaining_a_pids, remaining_t_pids))
    print("merging indices:", len(a_pid_index), len(t_pid_index),
          max(np.max(a_pid_index), np.max(t_pid_index)))

    max_index = max(a_pid_index[-1], t_pid_index[-1])
    print('max indices:', a_pid_index[-1], t_pid_index[-1])

    # calculate offset days for assessments
    # -------------------------------------

    a_tcps = ds.get_reader(flat_asmts['tested_covid_positive'])[:]
    a_offset_days = np.zeros(len(a_cats), dtype='int16')

    with utils.Timer("calculate offset days for assessments", new_line=True):
        for i_r, a_cat in enumerate(a_cats):
            a_offset_days[i_r] = (datetime.fromtimestamp(a_cat) - first_day).days
        print("assessment_dates:", sorted(utils.build_histogram(a_offset_days)))

    # calculate offset days for tests
    # -------------------------------

    raw_t_etds = ds.get_reader(flat_tests['eff_test_date'])[:]
    # read the results once up front instead of once per row through the reader
    raw_t_rsts = ds.get_reader(flat_tests['result'])[:]

    t_offset_days = np.zeros(len(raw_t_etds), dtype='int16')
    t_offset_dates = [None] * len(raw_t_etds)
    for i_r, etd in enumerate(raw_t_etds):
        test_dt = datetime.fromtimestamp(etd)
        t_offset_days[i_r] = (test_dt - first_day).days
        t_offset_dates[i_r] = test_dt.date()
    print("test_dates:", sorted(utils.build_histogram(t_offset_days)))
    print("test_dates2:", sorted(utils.build_histogram(t_offset_dates)))

    # create the destination arrays to hold daily data per patient
    # ------------------------------------------------------------
    daycount = max(a_offset_days.max(), t_offset_days.max()) + 1
    print("daycount:", daycount)
    # i_days holds prediction-based status, t_days holds test-based status;
    # one int16 slot per shared patient index for every day
    i_days = [np.zeros(max_index+1, dtype='int16') for _ in range(daycount)]
    t_days = [np.zeros(max_index+1, dtype='int16') for _ in range(daycount)]

    # incorporate assessment predictions and positive test results
    # note: a_offset_days is in assessment space
    print("len(a_offset_days):", len(a_offset_days))
    print("len(a_pid_index):", len(a_pid_index))
    with utils.Timer("incorporating assessments and assessment-based tests"):
        for i_r, a_day in enumerate(a_offset_days):
            # +7 / -7 seed a counter that the progression step below moves
            # one step towards zero per day
            from_prediction = 7 if cumulative[i_r] > 0.0 else -7
            i_days[a_day][a_pid_index[i_r]] = from_prediction
            from_tcp = 7 if a_tcps[i_r] == 3 else -7 if a_tcps[i_r] == 2 else 0
            t_days[a_day][a_pid_index[i_r]] = from_tcp

    # incorporate test results into the appropriate day's entry
    with utils.Timer("incorporating test_results"):
        for i_r, t_day in enumerate(t_offset_days):
            day = t_days[t_day]
            if raw_t_rsts[i_r] == 4:
                day[t_pid_index[i_r]] = 7
            elif raw_t_rsts[i_r] == 3:
                day[t_pid_index[i_r]] = -7

    for i_d, d in enumerate(i_days):
        print(i_d, np.count_nonzero(d))

    with utils.Timer("calculating progression"):
        for da in (i_days, t_days):
            for i_d in range(len(da)-1):
                prior_d = da[i_d]
                next_d = da[i_d + 1]
                # keep a value explicitly set for the next day; otherwise carry
                # the prior day's value one step closer to zero
                next_d[:] = np.where(next_d != 0,
                                     next_d,
                                     np.where(prior_d > 0, prior_d-1, np.minimum(prior_d+1, 0)))

    for d, (i_d, t_d) in enumerate(zip(i_days, t_days)):
        i_present = np.count_nonzero(i_d != 0)
        i_positive = np.count_nonzero(i_d > 0)
        t_present = np.count_nonzero(t_d != 0)
        t_positive = np.count_nonzero(t_d > 0)
        # combined view: a test status overrides the prediction status
        c_d = np.where(t_d == 0, i_d, t_d)
        c_present = np.count_nonzero(c_d != 0)
        c_positive = np.count_nonzero(c_d > 0)

        day = first_day + timedelta(days=d)
        if c_present != 0:
            print(day, i_present, i_positive, t_present, t_positive, c_present, c_positive,
                  c_positive / c_present)
        else:
            print(day, i_present, i_positive, t_present, t_positive, c_present, c_positive,
                  "NA")

Example #13

Show file

def method_paper_summary_pipeline(ds, src_data, dest_data, first_timestamp,
                                  last_timestamp):
    """Build patient/assessment filters and print summary statistics.

    Patient and assessment filters are created under ``dest_data['filters']``
    when missing, filtered patient and assessment fields are written to
    'filtered_patients' and 'filtered_assessments', a linear symptom model is
    evaluated, and a summary (subject count, gender split, predicted
    positives, age mean/std, condition and smoker rates) is printed.

    Args:
        ds: object supplying the reader/writer and filtering helpers used below
            (``get_reader``, ``apply_filter``, ``get_spans``, ...).
        src_data: mapping with 'patients', 'assessments' and 'tests' groups.
        dest_data: group that receives 'filters', 'filtered_patients' and
            'filtered_assessments'; existing groups are reused.
        first_timestamp: inclusive lower bound on assessment 'created_at'.
        last_timestamp: exclusive upper bound on assessment 'created_at'.

    Returns:
        None. Results are written to ``dest_data`` and printed.
    """
    s_ptnts = src_data['patients']
    s_asmts = src_data['assessments']
    filters = ds.get_or_create_group(dest_data, 'filters')
    print(s_ptnts.keys())
    print(src_data['tests'].keys())

    conditions = ('has_kidney_disease', 'has_lung_disease',
                  'has_heart_disease', 'has_diabetes', 'has_hayfever',
                  'has_cancer')

    symptoms = ('persistent_cough', 'fatigue', 'delirium',
                'shortness_of_breath', 'fever', 'diarrhoea', 'abdominal_pain',
                'chest_pain', 'hoarse_voice', 'skipped_meals', 'loss_of_smell')
    # a symptom is stored as True when its raw value reaches its threshold
    symptom_thresholds = {s: 2 for s in symptoms}
    symptom_thresholds['fatigue'] = 3
    symptom_thresholds['shortness_of_breath'] = 3

    intercept = -1.19015973
    weights = {
        'persistent_cough': 0.23186655,
        'fatigue': 0.56532346,
        'delirium': -0.12935112,
        'shortness_of_breath': 0.58273967,
        'fever': 0.16580974,
        'diarrhoea': 0.10236126,
        'abdominal_pain': -0.11204163,
        'chest_pain': -0.12318634,
        'hoarse_voice': -0.17818597,
        'skipped_meals': 0.25902482,
        'loss_of_smell': 1.82895239,
    }

    # Count patients from England (telemetry only; not applied as a filter)
    # =====================================================================

    p_ids_ = ds.get_reader(s_ptnts['id'])[:]
    p_lsoas_ = ds.get_reader(s_ptnts['lsoa11cd'])[:]
    eng_pats = set()
    for p_id, lsoa in zip(p_ids_, p_lsoas_):
        if len(lsoa) > 0 and lsoa[0] == 69:  # 69 == ord('E')
            eng_pats.add(p_id)
    print("eng pats:", len(eng_pats))

    # generating patient filter
    # -------------------------
    if 'patient_filter' not in filters.keys():
        with utils.Timer("generating patient filter", new_line=True):
            p_filter = ds.get_reader(s_ptnts['year_of_birth_valid'])[:]

            # valid age ranges
            r_ = ds.get_reader(s_ptnts['age'])[:]
            f_ = (r_ >= 18) & (r_ <= 100)
            p_filter = p_filter & f_

            # gender filter
            r_ = ds.get_reader(s_ptnts['gender'])[:]
            f_ = (r_ == 1) | (r_ == 2)
            p_filter = p_filter & f_

            # country code
            r_ = ds.get_reader(s_ptnts['country_code'])[:]
            f_ = r_ == b'GB'
            p_filter = p_filter & f_
            print("UK:", p_filter.sum(), len(p_filter))

            # drop patients with no assessments
            r_ = ds.get_reader(s_ptnts['assessment_count'])[:]
            f_ = r_ > 0
            p_filter = p_filter & f_
            print("No asmts:", p_filter.sum(), len(p_filter))

            print("  {}, {}".format(np.count_nonzero(p_filter),
                                    np.count_nonzero(np.logical_not(p_filter))))
            ds.get_numeric_writer(filters, 'patient_filter',
                                  'bool').write(p_filter)

    # generating assessment filter
    # ----------------------------
    if 'assessment_filter' not in filters.keys():
        with utils.Timer("generating assessment filter", new_line=True):
            # builtin bool: the np.bool alias was removed in NumPy 1.24
            a_filter = np.ones(len(ds.get_reader(s_asmts['id'])), dtype=bool)

            # created_at in range
            r_ = ds.get_reader(s_asmts['created_at'])[:]
            f_ = (r_ >= first_timestamp) & (r_ < last_timestamp)
            a_filter = a_filter & f_

            # country code
            r_ = ds.get_reader(s_asmts['country_code'])[:]
            f_ = r_ == b'GB'
            a_filter = a_filter & f_

            with utils.Timer("filtering out orphaned assessments"):
                # keep assessments whose patient id survives the patient filter
                p_ids_ = ds.get_reader(s_ptnts['id'])[:]
                p_ids_ = ds.apply_filter(
                    ds.get_reader(filters['patient_filter'])[:], p_ids_)
                a_pids_ = ds.get_reader(s_asmts['patient_id'])[:]
                f_ = persistence.foreign_key_is_in_primary_key(p_ids_, a_pids_)
            a_filter = a_filter & f_

            print("  {}, {}".format(np.count_nonzero(a_filter),
                                    np.count_nonzero(np.logical_not(a_filter))))
            ds.get_numeric_writer(filters, 'assessment_filter',
                                  'bool').write(a_filter)

    # filtering patients
    # ------------------
    if 'filtered_patients' not in dest_data.keys():
        flt_ptnts = dest_data.create_group('filtered_patients')
        with utils.Timer("filtering/flattening patient fields", new_line=True):
            p_filter = ds.get_reader(filters['patient_filter'])[:]

            r = ds.get_reader(s_ptnts['age'])
            r.get_writer(flt_ptnts,
                         'age').write(ds.apply_filter(p_filter, r[:]))

            for k in conditions:
                r = ds.get_reader(s_ptnts[k])
                ds.get_numeric_writer(
                    flt_ptnts, k,
                    'bool').write(ds.apply_filter(p_filter, r[:]) == 2)

            smoker1 = ds.get_reader(s_ptnts['is_smoker'])
            smoker2 = ds.get_reader(s_ptnts['smoker_status'])
            smoker = (smoker1[:] == 2) | (smoker2[:] == 3)
            # Apply the patient filter like every other field in this group;
            # writing the unfiltered array made the smoker rate use a
            # different population from the other summary rows.
            ds.get_numeric_writer(flt_ptnts, 'smoker', 'bool').write(
                ds.apply_filter(p_filter, smoker))

            gender_ = ds.get_reader(s_ptnts['gender'])
            # the patient filter only keeps gender 1 or 2, so this stores 0/1
            ds.get_numeric_writer(
                flt_ptnts, 'gender',
                'uint8').write(ds.apply_filter(p_filter, gender_) - 1)
    else:
        flt_ptnts = dest_data['filtered_patients']

    # filtering assessments
    # ---------------------
    if 'filtered_assessments' not in dest_data.keys():
        flt_asmts = dest_data.create_group('filtered_assessments')
        with utils.Timer("filtering/flattening symptoms", new_line=True):
            a_filter = ds.get_reader(filters['assessment_filter'])[:]
            for s in symptoms:
                r_ = ds.get_reader(s_asmts[s])[:]
                ds.get_numeric_writer(flt_asmts, s, 'bool').write(
                    ds.apply_filter(a_filter, r_) >= symptom_thresholds[s])
            a_pids = ds.get_reader(s_asmts['patient_id'])
            a_pids.get_writer(flt_asmts, 'patient_id').write(
                ds.apply_filter(a_filter, a_pids[:]))
    else:
        flt_asmts = dest_data['filtered_assessments']

    # predicting covid
    # ----------------
    if 'prediction' not in flt_asmts:
        with utils.Timer("generating covid prediction", new_line=True):
            # weighted sum of the boolean symptom flags plus the intercept
            cumulative = np.zeros(
                len(ds.get_reader(flt_asmts['persistent_cough'])),
                dtype='float64')
            for s in symptoms:
                reader = ds.get_reader(flt_asmts[s])
                cumulative += reader[:] * weights[s]
            cumulative += intercept
            print("positive predictions", np.count_nonzero(cumulative > 0.0),
                  len(cumulative))

            # keep one value per patient-id span: the highest score
            a_pids_ = ds.get_reader(flt_asmts['patient_id'])[:]
            spans = ds.get_spans(a_pids_)
            max_prediction_inds = ds.apply_spans_index_of_max(
                spans, cumulative)
            max_predictions = cumulative[max_prediction_inds]

            ds.get_numeric_writer(flt_asmts, 'prediction',
                                  'float32').write(max_predictions)
            pos_filter = max_predictions > 0.0
            print("pos_filter: ", np.count_nonzero(pos_filter),
                  len(pos_filter))

    # generating table results
    print('total_assessments:',
          np.count_nonzero(ds.get_reader(filters['assessment_filter'])[:]))
    subjects = np.count_nonzero(ds.get_reader(filters['patient_filter'])[:])
    genders = ds.get_reader(flt_ptnts['gender'])[:]
    predictions = ds.get_reader(flt_asmts['prediction'])
    predicted_c19 = np.count_nonzero(predictions[:] > 0.0)
    ages = ds.get_reader(flt_ptnts['age'])[:]
    age_mean = np.mean(ages)
    age_std = np.std(ages)
    print('subjects:', subjects)
    # the labels follow the original output: non-zero gender values are
    # reported first, zero values second
    male = np.count_nonzero(genders)
    female = np.count_nonzero(np.logical_not(genders))
    print('gender: {}:{}, {:.2%}:{:.2%}'.format(male, female,
                                                male / len(genders),
                                                female / len(genders)))
    print('predicted covid-19:',
          '{} {:.2%}'.format(predicted_c19, predicted_c19 / len(predictions)))
    print('age {:.2f} ({:.2f})'.format(age_mean, age_std))
    for k in conditions + ('smoker', ):
        kr_ = ds.get_reader(flt_ptnts[k])[:]
        pos = np.count_nonzero(kr_)
        print('{}:'.format(k), '{} {:.2%}'.format(pos, pos / len(kr_)))

Example #14

Show file

def ppe_use_and_travel(ds, src, tmp, start_timestamp):
    """Print predicted-positive rates grouped by isolation level and mask use.

    Assessments created at or after ``start_timestamp`` with country code GB
    are filtered into ``tmp['filtered_assessments']`` and scored with
    ``method_paper_model``. Per-patient maxima of the mask, isolation and
    prediction fields are stored in ``tmp['patient_assessment_summaries']``.
    Finally, positives / totals are printed for each
    (isolation_lots_of_people capped at 7, mask_cloth_or_scarf) combination.

    Both groups are reused when they already exist in ``tmp``.

    Args:
        ds: object supplying the reader/writer and span helpers used below
            (``get_reader``, ``apply_filter``, ``apply_spans_max``, ...).
        src: mapping with 'assessments' and 'patients' groups.
        tmp: group that holds the intermediate results.
        start_timestamp: inclusive lower bound on assessment 'created_at'.

    Returns:
        None. Results are written to ``tmp`` and printed.
    """
    s_asmts = src['assessments']

    # Field names are defined up front because the summary stage needs them
    # even when the filtering stage is skipped.
    symptom_keys = ('persistent_cough', 'fatigue', 'delirium',
                    'shortness_of_breath', 'fever', 'diarrhoea',
                    'abdominal_pain', 'chest_pain', 'hoarse_voice',
                    'skipped_meals', 'loss_of_smell')
    mask_keys = ('mask_cloth_or_scarf', 'mask_surgical', 'mask_n95_ffp')
    isolation_keys = ('isolation_healthcare_provider',
                      'isolation_little_interaction',
                      'isolation_lots_of_people')
    other_keys = ('patient_id', )

    if 'filtered_assessments' not in tmp.keys():
        f_asmts = tmp.create_group('filtered_assessments')
        cats = ds.get_reader(s_asmts['created_at'])
        asmt_filter = cats[:] >= start_timestamp

        ccs = ds.get_reader(s_asmts['country_code'])
        asmt_filter = asmt_filter & (ccs[:] == b'GB')

        # symptom and mask fields are reduced to 0/1 against these thresholds
        symptom_thresholds = {s: 2 for s in symptom_keys}
        symptom_thresholds.update({m: 2 for m in mask_keys})
        symptom_thresholds['fatigue'] = 3
        symptom_thresholds['shortness_of_breath'] = 3

        for k in symptom_keys + mask_keys + isolation_keys + other_keys:
            with utils.Timer("filtering {}".format(k)):
                reader = ds.get_reader(s_asmts[k])
                if k in symptom_thresholds:
                    raw = reader[:]
                    values = np.where(raw >= symptom_thresholds[k], 1, 0)
                    ds.get_numeric_writer(f_asmts, k, 'int8').write(
                        ds.apply_filter(asmt_filter, values))
                    # histograms of the raw and the thresholded values
                    hist = np.unique(raw, return_counts=True)
                    print(sorted(zip(hist[0], hist[1])))
                    hist = np.unique(values, return_counts=True)
                    print(sorted(zip(hist[0], hist[1])))
                else:
                    reader.get_writer(f_asmts, k).write(
                        ds.apply_filter(asmt_filter, reader))

        print('filtered assessments:', np.count_nonzero(asmt_filter),
              len(asmt_filter))

        symptom_readers = {s: ds.get_reader(f_asmts[s]) for s in symptom_keys}
        predictions = ds.get_numeric_writer(f_asmts, 'prediction', 'float32')
        method_paper_model(ds, symptom_readers, predictions)
        predictions = ds.get_reader(f_asmts['prediction'])
        print('predictions:', np.count_nonzero(predictions[:] > 0),
              len(predictions))
    else:
        # reuse a previous run's output so the summary stage can still run
        f_asmts = tmp['filtered_assessments']

    if 'patient_assessment_summaries' not in tmp.keys():
        asmt_psum = tmp.create_group('patient_assessment_summaries')
        pids = ds.get_reader(f_asmts['patient_id'])
        predictions = ds.get_reader(f_asmts['prediction'])
        with utils.Timer("generating patient_id spans"):
            asmt_spans = ds.get_spans(field=pids[:])

        for k in mask_keys:
            with utils.Timer(
                    "getting per patient mask summary for {}".format(k)):
                writer = ds.get_numeric_writer(asmt_psum, k, 'int8')
                ds.apply_spans_max(asmt_spans,
                                   ds.get_reader(f_asmts[k])[:], writer)
                print(
                    sorted(
                        utils.build_histogram(ds.get_reader(asmt_psum[k])[:])))

        for k in isolation_keys:
            with utils.Timer(
                    "getting per patient isolation summary for {}".format(k)):
                writer = ds.get_numeric_writer(asmt_psum, k, 'int32')
                ds.apply_spans_max(asmt_spans,
                                   ds.get_reader(f_asmts[k])[:], writer)
                print(
                    sorted(
                        utils.build_histogram(ds.get_reader(asmt_psum[k])[:])))

        with utils.Timer("getting prediction maxes for patients"):
            p_predictions = predictions.get_writer(asmt_psum, 'prediction')
            ds.apply_spans_max(asmt_spans, predictions, p_predictions)
            # Read back the field just written; the previous code indexed with
            # the leftover loop variable and so read the last isolation field.
            p_predictions = ds.get_reader(asmt_psum['prediction'])
            positives = p_predictions[:] > 0
            print("max covid prediction:", np.count_nonzero(positives),
                  len(positives))

        with utils.Timer("getting patient ids from assessments"):
            writer = pids.get_writer(asmt_psum, 'patient_id')
            writer.write(pd.unique(pids[:]))
    else:
        asmt_psum = tmp['patient_assessment_summaries']

    s_ptnts = src['patients']
    print(s_ptnts.keys())

    # left-join summary patient ids onto the patient table (telemetry only)
    pdf = pd.DataFrame({
        'id':
        ds.get_reader(s_ptnts['id'])[:],
        'hwwc':
        ds.get_reader(s_ptnts['health_worker_with_contact'])[:]
    })
    adf = pd.DataFrame(
        {'patient_id': ds.get_reader(asmt_psum['patient_id'])[:]})
    jdf = pd.merge(left=adf,
                   right=pdf,
                   left_on='patient_id',
                   right_on='id',
                   how='left')
    print(len(jdf['hwwc']))

    class TestResults:
        """Running count of positive and total results for one grouping."""

        def __init__(self):
            self.positive = 0
            self.total = 0

        def add(self, result):
            if result:
                self.positive += 1
            self.total += 1

    results = defaultdict(TestResults)
    positives = ds.get_reader(asmt_psum['prediction'])[:]
    positives = positives > 0
    mask_0 = ds.get_reader(asmt_psum['mask_cloth_or_scarf'])[:]
    mask_1 = ds.get_reader(asmt_psum['mask_surgical'])[:]
    # was a second read of 'mask_cloth_or_scarf' (copy-paste slip)
    mask_2 = ds.get_reader(asmt_psum['mask_n95_ffp'])[:]
    # Only the cloth/scarf mask is used for grouping at present; the combined
    # alternative would be: mask = mask_0 | mask_1 | mask_2
    mask = mask_0
    print(np.unique(mask, return_counts=True))
    isol_lots = ds.get_reader(asmt_psum['isolation_lots_of_people'])[:]
    # cap the isolation value at 7 so larger values share one bucket
    isol_lots_7 = np.where(isol_lots > 7, 7, isol_lots)
    print(np.unique(isol_lots_7, return_counts=True))
    print(len(mask), len(positives), len(isol_lots_7))

    # tally positives per (capped isolation value, mask flag) grouping
    for i_r in range(len(mask)):
        results[(isol_lots_7[i_r], mask[i_r])].add(positives[i_r])

    groupings = sorted(
        (key, (res.positive, res.total)) for key, res in results.items())

    for key, (positive, total) in groupings:
        print(key, positive, total, positive / total)

Example #15

Show file

def ppe_use_and_travel_2(ds, src, dest, start_ts):
    """Filter patient / assessment / test ids into `dest` and bucket assessments by day.

    On the first run (no 'filtered_patients' group in `dest`) this creates the
    'filtered_patients', 'filtered_assessments' and 'filtered_tests' groups and
    writes:
      * patient ids whose 'assessment_count' is greater than zero,
      * assessment 'patient_id', 'created_at' and 'tested_covid_positive'
        values selected by the assessment foreign-key filter,
      * test 'patient_id' values selected by the test foreign-key filter.
    On later runs the existing groups and id fields are reused.

    It then computes a shared index across the three id fields, prints several
    diagnostic counts, and prints a sorted histogram of whole days elapsed
    between `start_ts` and each assessment created at or after `start_ts`.

    Args:
        ds: Ignored; replaced by a new ``session.Session()`` on entry.
        src: Mapping-like source with 'patients', 'assessments' and 'tests'
            entries.
        dest: Group-like destination supporting ``keys()``, ``create_group``
            and item lookup.
        start_ts: Timestamp compared against 'created_at' values and passed to
            ``datetime.fromtimestamp``.
    """
    # NOTE(review): the `ds` argument is discarded here, so whatever the caller
    # passes is never used -- confirm this override is intentional.
    ds = session.Session()
    s_ptnts = src['patients']
    s_asmts = src['assessments']
    print(s_asmts.keys())
    s_tests = src['tests']

    if 'filtered_patients' not in dest.keys():
        f_ptnts = dest.create_group('filtered_patients')
        f_asmts = dest.create_group('filtered_assessments')
        f_tests = dest.create_group('filtered_tests')

        # load the raw id / count columns into memory
        raw_p_ids = ds.get(s_ptnts['id']).data[:]
        raw_p_acts = ds.get(s_ptnts['assessment_count']).data[:]
        raw_a_pids = ds.get(s_asmts['patient_id']).data[:]
        raw_t_pids = ds.get(s_tests['patient_id']).data[:]

        # filter out anyone without assessments
        patient_filter = raw_p_acts > 0

        print("patient_filter:", np.count_nonzero(patient_filter),
              np.count_nonzero(np.logical_not(patient_filter)))

        # filter patients
        f_p_ids = ds.get(s_ptnts['id']).create_like(f_ptnts, 'id')
        f_p_ids.data.write(ds.apply_filter(patient_filter, raw_p_ids))

        # filter out any orphaned assessments
        # NOTE(review): the check runs against the unfiltered patient ids
        # (raw_p_ids), not the ids kept by patient_filter -- confirm intended.
        with utils.Timer("fk in pk"):
            assessment_filter = persistence.foreign_key_is_in_primary_key(
                raw_p_ids, raw_a_pids)
        print("assessment_filter:", np.count_nonzero(assessment_filter),
              np.count_nonzero(np.logical_not(assessment_filter)))
        f_a_pids = ds.get(s_asmts['patient_id']).create_like(
            f_asmts, 'patient_id')
        f_a_pids.data.write(ds.apply_filter(assessment_filter, raw_a_pids))
        for k in ('created_at', 'tested_covid_positive'):
            # look the source field up once and reuse it for both the
            # destination field creation and the data read
            src_field = ds.get(s_asmts[k])
            field = src_field.create_like(f_asmts, k)
            field.data.write(
                ds.apply_filter(assessment_filter, src_field.data[:]))

        # filter out any orphaned tests
        test_filter = persistence.foreign_key_is_in_primary_key(
            raw_p_ids, raw_t_pids)
        print("test_filter:", np.count_nonzero(test_filter),
              np.count_nonzero(np.logical_not(test_filter)))
        f_t_pids = ds.get(s_tests['patient_id']).create_like(
            f_tests, 'patient_id')
        f_t_pids.data.write(ds.apply_filter(test_filter, raw_t_pids))

    else:
        # filtered groups already exist: reuse the previously written id fields
        f_ptnts = dest['filtered_patients']
        f_asmts = dest['filtered_assessments']
        f_tests = dest['filtered_tests']
        f_p_ids = ds.get(f_ptnts['id'])
        f_a_pids = ds.get(f_asmts['patient_id'])
        f_t_pids = ds.get(f_tests['patient_id'])

    # calculate the shared set of indices for assessments / tests back to patients
    with utils.Timer("get_shared_index"):
        p_inds, a_pinds, t_pinds = ds.get_shared_index(
            (f_p_ids, f_a_pids, f_t_pids))
    print(max(p_inds.max(), a_pinds.max(), t_pinds.max()))

    # flag assessments whose 'tested_covid_positive' value equals 3
    pos_asmt_tests = ds.get(f_asmts['tested_covid_positive']).data[:] == 3
    print("old tests positive:", np.count_nonzero(pos_asmt_tests),
          np.count_nonzero(np.logical_not(pos_asmt_tests)))

    # keep only assessments created at or after start_ts, then express each
    # creation time as a whole number of days since start_ts
    a_cats = ds.get(f_asmts['created_at'])
    a_cat_values = a_cats.data[:]
    asmt_filter = a_cat_values >= start_ts
    print(np.count_nonzero(asmt_filter), len(asmt_filter))
    raw_a_cats = ds.apply_filter(asmt_filter, a_cat_values)
    a_days = np.zeros(len(raw_a_cats), dtype=np.int32)
    # both sides go through datetime.fromtimestamp so the subtraction is
    # between datetimes built the same way
    start_dt = datetime.fromtimestamp(start_ts)
    for i_r, created_at in enumerate(raw_a_cats):
        a_days[i_r] = (datetime.fromtimestamp(created_at) - start_dt).days
    print(sorted(utils.build_histogram(a_days)))