def generate_dataset(length, val_column_count):
    """Write a synthetic benchmarking dataset to a fixed HDF5 file.

    The file '/home/ben/covid/benchmarking.hdf5' is opened in write mode and
    populated with an 'a_ids' field, ``val_column_count`` fields named
    'a_vals_<n>', and a 'b_ids' field. Every field is created as an 'int64'
    numeric field through a ``Session``.

    :param length: Passed through to the generate_* helpers.
    :param val_column_count: Number of 'a_vals_<n>' fields to create.
    """
    rng = np.random.RandomState(12345678)
    id_base = 0  # alternative base previously tried: 1000000000
    mapping = [0, 1, 2, 1]
    s = Session()

    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'w') as hf:

        def store_int64(field_name, values):
            # The array is only referenced by this frame, so it is released
            # as soon as the write returns.
            s.create_numeric(hf, field_name, 'int64').data.write(values)

        with utils.Timer('creating a_ids'):
            store_int64('a_ids', generate_a_ids(length, id_base))

        print('creating a_vals')
        for column in range(val_column_count):
            with utils.Timer("creating a_vals[{}]".format(column)):
                store_int64('a_vals_{}'.format(column),
                            generate_a_vals(length, 0, 100, rng))

        with utils.Timer('creating b_ids'):
            store_int64('b_ids', generate_b_ids(length, id_base, mapping))
def raw_np_test_1(length, count):
    """Benchmark raw numpy save / load / save of ``count`` value arrays.

    Phase one generates ``count`` arrays with ``generate_a_vals`` and saves
    each one as '/home/ben/covid/test_save/vals_<c>' (numpy appends '.npy').
    Phase two loads each file back, doubles the values in place and saves the
    result as '/home/ben/covid/test_save/dest_vals_<c>'. Each save and load
    is wrapped in a ``utils.Timer``.

    :param length: Passed through to ``generate_a_vals``.
    :param count: Number of arrays to write and then process.
    """
    rng = np.random.RandomState(12345678)

    for c in range(count):
        vals = generate_a_vals(length, 0, 100, rng)
        with utils.Timer("writing source vals {}".format(c)):
            np.save('/home/ben/covid/test_save/vals_{}'.format(c), vals)

    for c in range(count):
        vname = '/home/ben/covid/test_save/vals_{}.npy'.format(c)
        with utils.Timer("reading {}".format(vname)):
            vals = np.load(vname)
        vals *= 2
        v2name = '/home/ben/covid/test_save/dest_vals_{}'.format(c)
        with utils.Timer("writing {}".format(v2name)):
            # Save to the destination name that is being timed; the previous
            # code saved to `vname`, overwriting the source file and never
            # producing the dest_vals_<c> output.
            np.save(v2name, vals)
def method_paper_model(ds, symptoms_reader_dict, prediction):
    """
    A linear model to predict Covid positiveness from symptoms.

    For every key found in ``symptoms_reader_dict`` the stored values are
    multiplied by that symptom's fixed weight and accumulated; the intercept
    is then added and the resulting float32 array is written to
    ``prediction``.

    :param ds: The Exetera session instance (not used by this function).
    :param symptoms_reader_dict: The dataframe which stores symptoms data. It
        must contain 'persistent_cough', whose length sizes the result, and
        every key it yields must have an entry in the weight table.
    :param prediction: A field to store the prediction result.
    """
    intercept = -1.19015973
    weights = dict(
        persistent_cough=0.23186655,
        fatigue=0.56532346,
        delirium=-0.12935112,
        shortness_of_breath=0.58273967,
        fever=0.16580974,
        diarrhoea=0.10236126,
        abdominal_pain=-0.11204163,
        chest_pain=-0.12318634,
        hoarse_voice=-0.17818597,
        skipped_meals=0.25902482,
        loss_of_smell=1.82895239,
    )

    with utils.Timer("predicting covid by assessment", new_line=True):
        row_count = len(symptoms_reader_dict['persistent_cough'])
        score = np.zeros(row_count, dtype='float32')
        for symptom in symptoms_reader_dict:
            symptom_values = symptoms_reader_dict[symptom][:]
            score += symptom_values * weights[symptom]
        score += intercept
        prediction.write(score)
def minimal_test_1(length, count):
    """Benchmark raw h5py dataset writes and a read / double / write pass.

    First pass: open '/home/ben/covid/benchmarking.hdf5' in 'w' mode and
    create ``count`` chunked datasets named 'vals_<c>'. Second pass: reopen
    the file in 'r+' mode, read each 'vals_<c>' fully, double it in place and
    store it as 'dest_vals_<c>'. Dataset creation and reads are timed with
    ``utils.Timer``.

    :param length: Passed through to ``generate_a_vals``.
    :param count: Number of datasets to create and then process.
    """
    rng = np.random.RandomState(12345678)
    hdf5_path = '/home/ben/covid/benchmarking.hdf5'
    chunk_shape = (1 << 20, )

    with h5py.File(hdf5_path, 'w') as hf:
        for index in range(count):
            generated = generate_a_vals(length, 0, 100, rng)
            with utils.Timer("writing source vals {}".format(index)):
                hf.create_dataset("vals_{}".format(index),
                                  chunks=chunk_shape,
                                  data=generated)

    with h5py.File(hdf5_path, 'r+') as hf:
        for index in range(count):
            source_name = "vals_{}".format(index)
            with utils.Timer("reading {}".format(source_name)):
                doubled = hf[source_name][:]
            doubled *= 2
            dest_name = "dest_vals_{}".format(index)
            with utils.Timer("writing {}".format(dest_name)):
                hf.create_dataset(dest_name, chunks=chunk_shape, data=doubled)
def read_id_from_csv(file_name, field_count):
    """Benchmark reading the leading column(s) of a csv file into lists.

    The first row is treated as a header and skipped. With ``field_count``
    equal to 1 only column 0 of every row is collected; otherwise the first
    ``field_count`` columns are collected into one list per column. The
    collected values are discarded: the function returns nothing and exists
    for the timing reported by ``utils.Timer``.

    :param file_name: Path of the csv file to read.
    :param field_count: Number of leading columns to read from each row.
    :raises IndexError: If a data row has fewer columns than requested.
    """
    import csv

    # The csv module requires newline='' so that newlines embedded in quoted
    # fields are parsed correctly.
    with open(file_name, newline='') as f:
        rdr = csv.reader(f)
        # Skip the header row. The default keeps an empty file from surfacing
        # as a bare StopIteration.
        next(rdr, None)

        if field_count == 1:
            ids = list()
            with utils.Timer("reading id from dataset"):
                for r in rdr:
                    ids.append(r[0])
        else:
            values = [list() for _ in range(field_count)]
            with utils.Timer(
                    "reading {} fields from dataset".format(field_count)):
                for r in rdr:
                    for i in range(field_count):
                        values[i].append(r[i])
            # The former trailing `del r` was dropped: `r` is a plain local,
            # and deleting it raised UnboundLocalError when the file had no
            # data rows.
def read_fields_from_hdf5(file_name, field_count):
    """Benchmark reading the first ``field_count`` patient fields from HDF5.

    The field names come from a fixed tuple; its length is printed first.
    Each field is looked up under the 'patients' group of the file and
    wrapped via ``Session.get``. For an ``IndexedStringField`` both the
    indices and the values are read in full; for any other field its data is
    read in full. The loaded arrays are discarded, since only the elapsed
    time reported by ``utils.Timer`` is of interest.

    :param file_name: Path of the HDF5 file, opened read-only.
    :param field_count: How many of the listed fields to read, in order.
    """
    fields = (
        'id', 'created_at', 'updated_at', 'version', 'country_code',
        'reported_by_another', 'same_household_as_reporter',
        'contact_additional_studies', 'year_of_birth', 'height_cm',
        'weight_kg', 'gender', 'race_other', 'ethnicity',
        'profile_attributes_updated_at', 'has_diabetes',
    )
    print(len(fields))
    s = Session()

    with h5py.File(file_name, 'r') as hf, \
            utils.Timer("reading {} fields from dataset".format(field_count)):
        for position in range(field_count):
            field = s.get(hf['patients'][fields[position]])
            if not isinstance(field, flds.IndexedStringField):
                data = field.data[:]
                continue
            indices = field.indices[:]
            values = field.values[:]
def merging_results(s, source, output):
    """Merge test results and assessment-reported tests, then export subsets.

    Outline of what the code below does, in order:

    1. Copy every field of ``source['tests']`` into a new 'tests' dataframe
       in ``output``, derive 'date_effective_test', keep only rows whose
       'result' is 3 or 4, and run the mechanism / pcr-standard helpers.
    2. Create 'tests_fin' from selected 'tests' fields plus an all-False
       'converted_test' field.
    3. Build the 'usable_asmt_tests' group from ``source['assessments']``
       using the indices returned by
       ``multiple_tests_start_with_negative_v1``, filter it, add
       'delta_days_test', 'date_final_test' and 'pcr_standard', and write its
       values to the 'tests_fin' writers as well.
    4. Select the 'tests_fin' rows whose result equals 1 into 'out_pos',
       attach assessment fields, and derive the 'out_pos_hs',
       'out_pos_asymp', 'patient_pos' and 'patient_asymp' dataframes.

    CSV files written to the current working directory:
    'TestedPositiveTestDetails.csv', 'OtherSymptoms.csv',
    'PositiveSympStartHealthyAllSymptoms.csv', 'PositiveAsympAllSymptoms.csv',
    'PositiveSympStartHealthy_PatDetails.csv' and
    'PositiveAsymp_PatDetails.csv'.

    :param s: Session object; passed to the *_v1 helper functions and used
        for ``s.get``.
    :param source: Container indexed with 'tests', 'assessments' and
        'patients'.
    :param output: Container on which ``create_dataframe`` /
        ``create_group`` are called to hold the results.
    """
    list_symptoms = [
        'abdominal_pain', 'altered_smell', 'blisters_on_feet', 'brain_fog',
        'chest_pain', 'chills_or_shivers', 'delirium', 'diarrhoea',
        'diarrhoea_frequency', 'dizzy_light_headed', 'ear_ringing', 'earache',
        'eye_soreness', 'fatigue', 'feeling_down', 'fever', 'hair_loss',
        'headache', 'headache_frequency', 'hoarse_voice',
        'irregular_heartbeat', 'loss_of_smell', 'nausea', 'persistent_cough',
        'rash', 'red_welts_on_face_or_lips', 'runny_nose',
        'shortness_of_breath', 'skin_burning', 'skipped_meals', 'sneezing',
        'sore_throat', 'swollen_glands', 'typical_hayfever',
        'unusual_muscle_pains'
    ]
    # Timestamp string handed to create_like / create_numeric for 'tests_fin'.
    ts = str(datetime.now(timezone.utc))

    src_test = source['tests']
    # NOTE(review): list_testid and list_testcreate are not used again in
    # this function.
    list_testid = src_test['patient_id']
    list_testcreate = src_test['created_at']
    out_test = output.create_dataframe('tests')

    # ====
    # out_test step 1: copy from src_test
    # ====
    # NOTE(review): the timer label says 'applying sort' but the timed body
    # only copies fields.
    with utils.Timer('applying sort'):
        for k in src_test.keys():
            dataframe.copy(src_test[k], out_test, k)

    # Convert test date (source and destination are both out_test).
    covid_test_date_v1(s, out_test, out_test, 'date_effective_test')

    # Keep only rows whose result is 3 or 4.
    # NOTE(review): presumably these are the definite negative / positive
    # codes - confirm against the schema.
    results_raw = out_test['result'].data[:]
    results_filt = np.where(np.logical_or(results_raw == 4, results_raw == 3), True, False)
    for k in out_test.keys():
        out_test[k].apply_filter(results_filt, in_place=True)

    # Creating clean mechanism.
    # NOTE(review): apart from the print below, these four reader variables
    # are not used again in this function.
    reader_mec = out_test['mechanism'].data
    s_reader_mec = s.get(out_test['mechanism'])
    print(len(reader_mec), len(out_test['patient_id'].data))
    reader_ftmec = out_test['mechanism_freetext'].data
    s_reader_ftmec = s.get(out_test['mechanism_freetext'])

    test_type_from_mechanism_v1_standard_input(s, out_test)
    pcr_standard_summarize_v1(s, out_test)

    out_test_fin = output.create_dataframe('tests_fin')

    # ====
    # out_test_fin step 1: copy from out_test
    # ====
    # Maps field name -> the .data writer of the matching 'tests_fin' field,
    # so that step 2 below can write further values to the same fields.
    writers_dict = {}
    for k in ('patient_id', 'date_effective_test', 'result', 'pcr_standard'):
        values = out_test[k].data[:]
        if k == 'result':
            # Only 3 and 4 survive the filter above, so this yields 0 / 1.
            values -= 3
        writers_dict[k] = out_test[k].create_like(out_test_fin, k, ts).data
        print(len(values), k)
        writers_dict[k].write_part(values)

    # converted_test: all False for rows that came from the tests table.
    values = np.zeros(len(out_test_fin['patient_id'].data), dtype='bool')
    writers_dict['converted_test'] = out_test_fin.create_numeric('converted_test', 'bool', timestamp=ts).data
    writers_dict['converted_test'].write_part(values)

    # Taking care of the old test (tests reported through assessments).
    src_asmt = source['assessments']
    print(src_asmt.keys())

    # Define usable assessments with correct test; the index selection is
    # delegated to the helper, which returns two index arrays.
    sel_max_ind, sel_max_tcp = multiple_tests_start_with_negative_v1(s, src_asmt)
    usable_asmt_tests = output.create_group('usable_asmt_tests')

    # ====
    # usable_asmt_tests step 1: copy from src_asmt, filter patients w/
    # multiple test and first ok
    # ====
    for k in ('id', 'patient_id', 'created_at', 'had_covid_test'):
        fld = src_asmt[k].create_like(usable_asmt_tests, k)
        src_asmt[k].apply_index(sel_max_ind, target=fld)
        print(usable_asmt_tests[k].data[0])

    # These three fields are indexed with sel_max_tcp rather than sel_max_ind.
    src_asmt['created_at'].create_like(usable_asmt_tests, 'eff_result_time')
    src_asmt['created_at'].apply_index(sel_max_tcp, target=usable_asmt_tests['eff_result_time'])
    src_asmt['tested_covid_positive'].create_like(usable_asmt_tests, 'eff_result')
    src_asmt['tested_covid_positive'].apply_index(sel_max_tcp, target=usable_asmt_tests['eff_result'])
    src_asmt['tested_covid_positive'].create_like(usable_asmt_tests, 'tested_covid_positive')
    src_asmt['tested_covid_positive'].apply_index(sel_max_tcp, target=usable_asmt_tests['tested_covid_positive'])

    # ====
    # usable_asmt_tests step 2: filter only positive
    # ====
    # Making sure that the test is definite (either positive or negative):
    # rows with tested_covid_positive > 1 are kept.
    # NOTE(review): the heading says "only positive" while this comment says
    # "positive or negative" - confirm which is intended.
    filt_deftest = usable_asmt_tests['tested_covid_positive'].data[:] > 1
    for k in ('id', 'patient_id', 'created_at', 'had_covid_test', 'tested_covid_positive', 'eff_result_time', 'eff_result'):
        usable_asmt_tests[k].apply_filter(filt_deftest, in_place=True)

    # ====
    # usable_asmt_tests step 3: add delta_days_test, date_final_test, and
    # pcr_standard fields
    # ====
    # Getting difference between created at (max of hct date) and max of test
    # result (eff_result_time).
    reader_hct = usable_asmt_tests['created_at'].data[:]
    reader_tcp = usable_asmt_tests['eff_result_time'].data[:]
    with utils.Timer('doing delta time'):
        delta_time = reader_tcp - reader_hct
        # 86400 seconds per day; assumes the timestamps are in seconds -
        # TODO confirm.
        delta_days = delta_time / 86400
    print(delta_days[:10], delta_time[:10])
    writer = usable_asmt_tests.create_numeric('delta_days_test', 'float32')
    writer.data.write(delta_days)

    # Final day of test: created_at when the delta is under 7 days, otherwise
    # the result time minus two days.
    date_final_test = np.where(delta_days < 7, reader_hct, reader_tcp - 2 * 86400)
    writer = usable_asmt_tests.create_timestamp('date_final_test')
    writer.data.write(date_final_test)

    # pcr_standard is set to 1 for every assessment-derived row.
    pcr_standard = np.ones(len(usable_asmt_tests['patient_id'].data))
    writer = usable_asmt_tests.create_numeric('pcr_standard', 'int')
    writer.data.write(pcr_standard)

    # ====
    # out_test_fin step 2: copy from usable_asmt_tests
    # ====
    # list_init names the field in usable_asmt_tests; list_final names the
    # 'tests_fin' writer that receives it.
    list_init = ('patient_id', 'date_final_test', 'tested_covid_positive', 'pcr_standard')
    list_final = ('patient_id', 'date_effective_test', 'result', 'pcr_standard')
    # Join
    for (i, f) in zip(list_init, list_final):
        values = usable_asmt_tests[i].data[:]
        if f == 'result':
            # Shift tested_covid_positive (all > 1 after the filter above) to
            # the same base as the test-table results.
            values -= 2
        print(len(values), f)
        writers_dict[f].write(values)

    # Rows taken from assessments are flagged converted_test = True.
    writers_dict['converted_test'].write(np.ones(len(usable_asmt_tests['patient_id'].data), dtype='bool'))

    # ====
    # out_pos step 1: copy from out_test_fin, filter valid result, and write
    # to csv
    # ====
    result_fin = out_test_fin['result'].data[:]
    filt_pos = result_fin == 1
    out_pos = output.create_dataframe('out_pos')
    for k in out_test_fin.keys():
        out_test_fin[k].create_like(out_pos, k)
        out_test_fin[k].apply_filter(filt_pos, target=out_pos[k])
        print(k, len(out_test_fin[k].data), len(filt_pos))

    # Span count at this point; printed again after step 2 for comparison.
    pat_pos_len = len(out_pos['patient_id'].get_spans()) - 1

    dataset.copy(out_pos, output, 'out_pos_copy')
    save_df_to_csv(out_pos, 'TestedPositiveTestDetails.csv')

    # ====
    # out_pos step 2: filter patient that has assessment
    # ====
    with utils.Timer('Mapping index asmt to pos only'):
        test2pat = prst.foreign_key_is_in_primary_key(out_pos['patient_id'].data[:], foreign_key=src_asmt['patient_id'].data[:])
    for f in ['created_at', 'patient_id', 'treatment', 'other_symptoms', 'country_code', 'location', 'updated_at'] + list_symptoms:
        if (f in list(out_pos.keys())):
            # Field already exists in out_pos: clear it before refilling it
            # from the filtered assessments.
            out_pos[f].data.clear()
            src_asmt[f].apply_filter(test2pat, target=out_pos[f])
        else:
            src_asmt[f].create_like(out_pos, f)
            src_asmt[f].apply_filter(test2pat, target=out_pos[f])
    print(len(out_pos['patient_id'].get_spans()) - 1, pat_pos_len)

    # Frequency table of the free-text 'other_symptoms' values.
    unique_other, counts = np.unique(out_pos['other_symptoms'].data[:], return_counts=True)
    dict_other = {'other': unique_other, 'counts': counts}
    df_other = pd.DataFrame.from_dict(dict_other)
    df_other.to_csv('OtherSymptoms.csv')

    # ====
    # summarize the symptoms
    # ====
    sum_symp = sum_up_symptons_v1(out_pos)
    out_pos.create_numeric('sum_symp', 'int').data.write(sum_symp)

    # ====
    # filter the symptoms
    # ====
    # The helper returns the indices used for the asymptomatic selection and
    # the row filter used for 'out_pos_hs'.
    spans_asymp, filt_sel = filter_asymp_and_firstnz_v1(s, out_pos)

    # ====
    # out_pos step 3: filter asymptomatic
    # ====
    pat_asymp = out_pos['patient_id'].apply_index(spans_asymp)
    filt_pata = prst.foreign_key_is_in_primary_key(pat_asymp.data[:], out_pos['patient_id'].data[:])

    # ====
    # out_pos_hs step 1: copy from out_pos and apply filter not healthy first
    # ====
    out_pos_hs = output.create_dataframe('out_pos_hs')
    for k in list_symptoms + ['created_at', 'patient_id', 'sum_symp', 'country_code', 'location', 'treatment', 'updated_at']:
        out_pos[k].create_like(out_pos_hs, k)
        out_pos[k].apply_filter(filt_sel, target=out_pos_hs[k])
    save_df_to_csv(out_pos_hs, 'PositiveSympStartHealthyAllSymptoms.csv')

    print('out_pos_asymp')
    # ====
    # out_pos_as step 1: out_pos filter asymptomatic
    # ====
    # Same field list as for out_pos_hs, except 'updated_at' is not copied.
    out_pos_as = output.create_dataframe('out_pos_asymp')
    for k in list_symptoms + ['created_at', 'patient_id', 'sum_symp', 'country_code', 'location', 'treatment']:
        out_pos[k].create_like(out_pos_as, k)
        out_pos[k].apply_filter(filt_pata, target=out_pos_as[k])
    save_df_to_csv(out_pos_as, 'PositiveAsympAllSymptoms.csv')

    # Based on the final selected patient_id, select the appropriate rows of
    # the patient_table.
    src_pat = source['patients']
    filt_pat = prst.foreign_key_is_in_primary_key(out_pos_hs['patient_id'].data[:], src_pat['id'].data[:])

    list_interest = [
        'has_cancer', 'has_diabetes', 'has_lung_disease', 'has_heart_disease',
        'has_kidney_disease', 'has_asthma', 'race_is_other',
        'race_is_prefer_not_to_say', 'race_is_uk_asian', 'race_is_uk_black',
        'race_is_uk_chinese', 'race_is_uk_middle_eastern',
        'race_is_uk_mixed_other', 'race_is_uk_mixed_white_black',
        'race_is_uk_white', 'race_is_us_asian', 'race_is_us_black',
        'race_is_us_hawaiian_pacific', 'race_is_us_indian_native',
        'race_is_us_white', 'race_other', 'year_of_birth', 'is_smoker',
        'smoker_status', 'bmi_clean', 'is_in_uk_twins',
        'healthcare_professional', 'gender', 'id', 'blood_group', 'lsoa11cd',
        'already_had_covid'
    ]

    out_pat = output.create_dataframe('patient_pos')
    print('patient_pos')
    for k in list_interest:
        src_pat[k].create_like(out_pat, k)
        src_pat[k].apply_filter(filt_pat, target=out_pat[k])
    save_df_to_csv(out_pat, 'PositiveSympStartHealthy_PatDetails.csv')

    # Same patient-table selection for the asymptomatic patient ids
    # (pat_asymp is recomputed with the same expression as above).
    pat_asymp = out_pos['patient_id'].apply_index(spans_asymp)
    filt_asymp = prst.foreign_key_is_in_primary_key(pat_asymp.data[:], src_pat['id'].data[:])
    out_pat_asymp = output.create_dataframe('patient_asymp')
    for k in list_interest:
        src_pat[k].create_like(out_pat_asymp, k)
        src_pat[k].apply_filter(filt_asymp, target=out_pat_asymp[k])
    save_df_to_csv(out_pat_asymp, 'PositiveAsymp_PatDetails.csv')
def read_file_using_fast_csv_reader(source,
                                    chunk_row_size,
                                    column_offsets,
                                    index_map,
                                    field_importer_list=None,
                                    stop_after_rows=None):
    """Parse a csv file chunk by chunk and hand parsed columns to importers.

    The file is read in byte chunks sized by ``get_file_stat``. Each chunk is
    parsed by ``fast_csv_reader`` into two shared buffers, ``column_inds``
    and ``column_vals``. After every parse, each importer that is present in
    ``field_importer_list`` gets ``transform_and_write_part`` called with
    those buffers. When the parser reports that a buffer filled up before the
    end of the chunk, the buffer is re-allocated larger and parsing resumes
    on the same chunk content from the reported offset, without re-reading
    the file. Finally every present importer is flushed.

    :param source: Path (or file object) accepted by ``get_file_stat`` and
        ``numpy.fromfile``.
    :param chunk_row_size: Requested rows per chunk; doubled before use.
    :param column_offsets: Array of offsets into ``column_vals``; its last
        entry is used as the total size of the value buffer.
    :param index_map: Iterable of column indices, aligned position by
        position with ``field_importer_list``.
    :param field_importer_list: Optional list of importer objects. ``None``
        or falsy entries are skipped, both when writing and when flushing.
    :param stop_after_rows: Optional limit; reading stops once at least this
        many rows have been written.
    """
    ESCAPE_VALUE = np.frombuffer(b'"', dtype='S1')[0][0]
    SEPARATOR_VALUE = np.frombuffer(b',', dtype='S1')[0][0]
    NEWLINE_VALUE = np.frombuffer(b'\n', dtype='S1')[0][0]
    WHITE_SPACE_VALUE = np.frombuffer(b' ', dtype='S1')[0][0]

    chunk_row_size *= 2

    time0 = time.time()
    total_byte_size, count_columns, count_rows, chunk_byte_size = get_file_stat(
        source, chunk_row_size)
    column_val_total_count = column_offsets[-1]

    with utils.Timer("read_file_using_fast_csv_reader"):
        chunk_index = 0
        hasHeader = True
        accumulated_written_rows = 0

        # Initialize column_inds / column_vals outside of the while-loop so
        # the buffers are reused across chunks.
        # One extra row is added for the initial index 0.
        column_inds = np.zeros((count_columns, count_rows + 1),
                               dtype=np.int64)
        column_vals = np.zeros(np.int64(column_val_total_count),
                               dtype=np.uint8)

        # Growth factor applied when a buffer turns out to be too small.
        larger_factor = 2

        is_indices_full, is_values_full = False, False
        content = None
        start_index = 0
        ch = 0

        while chunk_index < total_byte_size:
            if stop_after_rows and accumulated_written_rows >= stop_after_rows:
                break

            # Read the next chunk of file content. When indices or values
            # were full on the previous pass, fast_csv_reader is called again
            # on the same content, so nothing is read here.
            if not is_indices_full and not is_values_full:
                content = np.fromfile(source,
                                      count=chunk_byte_size,
                                      offset=chunk_index,
                                      dtype=np.uint8)
                start_index = 0

            length_content = content.shape[0]
            if length_content == 0:
                break

            # If the last chunk does not end with a newline, add one.
            if (chunk_index + length_content == total_byte_size
                    and content[-1] != NEWLINE_VALUE):
                content = np.append(content, NEWLINE_VALUE)

            (offset_pos, written_row_count, is_indices_full, is_values_full,
             val_full_col_idx) = fast_csv_reader(
                 content, start_index, column_inds, column_vals,
                 column_offsets, hasHeader, ESCAPE_VALUE, SEPARATOR_VALUE,
                 NEWLINE_VALUE, WHITE_SPACE_VALUE)

            # Convert and write the rows parsed in this pass.
            for ith, i_c in enumerate(index_map):
                if field_importer_list and field_importer_list[ith]:
                    field_importer_list[ith].transform_and_write_part(
                        column_inds, column_vals, column_offsets, i_c,
                        written_row_count)

            # Make column_inds larger if it filled up before the end of the
            # chunk.
            if is_indices_full:
                indices_row_count = column_inds.shape[1] - 1
                column_inds = np.zeros(
                    (count_columns,
                     np.uint32(indices_row_count * larger_factor + 1)),
                    dtype=np.int64)

            # Make column_vals larger if one column's slot filled up: widen
            # that column's slot and shift every later offset by the delta.
            if is_values_full and val_full_col_idx != -1:
                col_val_count = (column_offsets[val_full_col_idx + 1] -
                                 column_offsets[val_full_col_idx])
                delta = col_val_count * (larger_factor - 1)
                column_offsets = np.concatenate(
                    (column_offsets[:val_full_col_idx + 1],
                     column_offsets[val_full_col_idx + 1:] + np.int64(delta)))
                column_val_total_count = column_offsets[-1]
                column_vals = np.zeros(np.int64(column_val_total_count),
                                       dtype=np.uint8)

            # Either resume inside the current content, or advance to the
            # next chunk of the file.
            if is_indices_full or is_values_full:
                start_index = offset_pos
            else:
                chunk_index += offset_pos

            hasHeader = False
            accumulated_written_rows += written_row_count
            ch += 1

        print(
            f"{ch} chunks, {accumulated_written_rows} accumulated_written_rows parsed in {time.time() - time0}s"
        )

    # Flush at the end. This uses the same guard as the write path above;
    # previously the default field_importer_list=None (or a None entry)
    # raised here.
    for ith in range(len(index_map)):
        if field_importer_list and field_importer_list[ith]:
            field_importer_list[ith].flush()

    print(f"Total time {time.time() - time0}s")
ofilter = otherend - otherstart > 0 print("ofilter:", ofilter.sum(), len(ofilter)) cfilter = cc == b"GB" print("cfilter:", cfilter.sum(), len(cfilter)) filter_ = ofilter & cfilter print("filter_:", filter_.sum(), len(filter_)) filt_asmt = tmp.create_group('filt_assessments') filt_other_symptoms = other.create_like(filt_asmt, 'other_symptoms') s.apply_filter(filter_, other, filt_other_symptoms) patient_id = s.get(hf['assessments']['patient_id']) filt_patient_id = patient_id.create_like(filt_asmt, 'patient_id') s.apply_filter(filter_, patient_id, filt_patient_id) print('filtered symptoms len =', len(filt_other_symptoms.data)) with utils.Timer("merging test_results"): p_to_a = s.create_numeric(tmp, 'p_to_a', 'int64') a_test_results = s.create_numeric(tmp, 'a_test_results', 'int8') s.ordered_merge_left(left_on=s.get( tmp['filt_assessments']['patient_id']), right_on=s.get(hf['patients']['id']), left_field_sources=(p_test_results, ), left_field_sinks=(a_test_results, ), left_to_right_map=p_to_a, right_unique=True) print(len(a_test_results.data)) print(np.unique(a_test_results.data[:], return_counts=True)) a_test_results_ = a_test_results.data[:] # filtered_test_results = test_results[filter_] # print("filtered tests:", np.unique(filtered_test_results, return_counts=True))
def postprocess(dataset, destination, timestamp=None, flags=None): if flags is None: flags = set() do_daily_asmts = 'daily' in flags has_patients = 'patients' in dataset.keys() has_assessments = 'assessments' in dataset.keys() has_tests = 'tests' in dataset.keys() has_diet = 'diet' in dataset.keys() sort_enabled = lambda x: True process_enabled = lambda x: True sort_patients = sort_enabled(flags) and True sort_assessments = sort_enabled(flags) and True sort_tests = sort_enabled(flags) and True sort_diet = sort_enabled(flags) and True make_assessment_patient_id_fkey = process_enabled(flags) and True year_from_age = process_enabled(flags) and True clean_weight_height_bmi = process_enabled(flags) and True health_worker_with_contact = process_enabled(flags) and True clean_temperatures = process_enabled(flags) and True check_symptoms = process_enabled(flags) and True create_daily = process_enabled(flags) and do_daily_asmts make_patient_level_assessment_metrics = process_enabled(flags) and True make_patient_level_daily_assessment_metrics = process_enabled( flags) and do_daily_asmts make_new_test_level_metrics = process_enabled(flags) and True make_diet_level_metrics = True make_healthy_diet_index = True # ds = DataStore(timestamp=timestamp) s = Session() # patients ================================================================ sorted_patients_src = None if has_patients: patients_src = dataset['patients'] write_mode = 'write' if 'patients' not in destination.keys(): patients_dest = s.get_or_create_group(destination, 'patients') sorted_patients_src = patients_dest # Patient sort # ============ if sort_patients: duplicate_filter = \ persistence.filter_duplicate_fields(s.get(patients_src['id']).data[:]) for k in patients_src.keys(): t0 = time.time() r = s.get(patients_src[k]) w = r.create_like(patients_dest, k) s.apply_filter(duplicate_filter, r, w) print(f"'{k}' filtered in {time.time() - t0}s") print(np.count_nonzero(duplicate_filter == True), 
np.count_nonzero(duplicate_filter == False)) sort_keys = ('id', ) s.sort_on(patients_dest, patients_dest, sort_keys, write_mode='overwrite') # Patient processing # ================== if year_from_age: log("year of birth -> age; 18 to 90 filter") t0 = time.time() yobs = s.get(patients_dest['year_of_birth']) yob_filter = s.get(patients_dest['year_of_birth_valid']) age = s.create_numeric(patients_dest, 'age', 'uint32') age_filter = s.create_numeric(patients_dest, 'age_filter', 'bool') age_16_to_90 = s.create_numeric(patients_dest, '16_to_90_years', 'bool') print('year_of_birth:', patients_dest['year_of_birth']) for k in patients_dest['year_of_birth'].attrs.keys(): print(k, patients_dest['year_of_birth'].attrs[k]) calculate_age_from_year_of_birth_v1(yobs, yob_filter, 16, 90, age, age_filter, age_16_to_90, 2020) log(f"completed in {time.time() - t0}") print('age_filter count:', np.sum(patients_dest['age_filter']['values'][:])) print('16_to_90_years count:', np.sum(patients_dest['16_to_90_years']['values'][:])) if clean_weight_height_bmi: log("height / weight / bmi; standard range filters") t0 = time.time() weights_clean = s.create_numeric(patients_dest, 'weight_kg_clean', 'float32') weights_filter = s.create_numeric(patients_dest, '40_to_200_kg', 'bool') heights_clean = s.create_numeric(patients_dest, 'height_cm_clean', 'float32') heights_filter = s.create_numeric(patients_dest, '110_to_220_cm', 'bool') bmis_clean = s.create_numeric(patients_dest, 'bmi_clean', 'float32') bmis_filter = s.create_numeric(patients_dest, '15_to_55_bmi', 'bool') weight_height_bmi_v1(s, 40, 200, 110, 220, 15, 55, None, None, None, None, patients_dest['weight_kg'], patients_dest['weight_kg_valid'], patients_dest['height_cm'], patients_dest['height_cm_valid'], patients_dest['bmi'], patients_dest['bmi_valid'], weights_clean, weights_filter, None, heights_clean, heights_filter, None, bmis_clean, bmis_filter, None) log(f"completed in {time.time() - t0}") if health_worker_with_contact: with 
utils.Timer("health_worker_with_contact field"): #writer = ds.get_categorical_writer(patients_dest, 'health_worker_with_contact', 'int8') combined_hcw_with_contact_v1( s, s.get(patients_dest['healthcare_professional']), s.get(patients_dest['contact_health_worker']), s.get(patients_dest['is_carer_for_community']), patients_dest, 'health_worker_with_contact') # assessments ============================================================= sorted_assessments_src = None if has_assessments: assessments_src = dataset['assessments'] if 'assessments' not in destination.keys(): assessments_dest = s.get_or_create_group(destination, 'assessments') sorted_assessments_src = assessments_dest if sort_assessments: sort_keys = ('patient_id', 'created_at') with utils.Timer("sorting assessments"): s.sort_on(assessments_src, assessments_dest, sort_keys) if has_patients: if make_assessment_patient_id_fkey: print( "creating 'assessment_patient_id_fkey' foreign key index for 'patient_id'" ) t0 = time.time() patient_ids = s.get(sorted_patients_src['id']) assessment_patient_ids =\ s.get(sorted_assessments_src['patient_id']) assessment_patient_id_fkey =\ s.create_numeric(assessments_dest, 'assessment_patient_id_fkey', 'int64') s.get_index(patient_ids.data[:], assessment_patient_ids.data[:], assessment_patient_id_fkey) print(f"completed in {time.time() - t0}s") if clean_temperatures: print("clean temperatures") t0 = time.time() temps = s.get(sorted_assessments_src['temperature']) temp_units = s.get(sorted_assessments_src['temperature_unit']) temps_valid = s.get( sorted_assessments_src['temperature_valid']) dest_temps = temps.create_like(assessments_dest, 'temperature_c_clean') dest_temps_valid = temps_valid.create_like( assessments_dest, 'temperature_35_to_42_inclusive') dest_temps_modified = temps_valid.create_like( assessments_dest, 'temperature_modified') validate_temperature_v1(s, 35.0, 42.0, temps, temp_units, temps_valid, dest_temps, dest_temps_valid, dest_temps_modified) 
print(f"temperature cleaning done in {time.time() - t0}") if check_symptoms: print('check inconsistent health_status') t0 = time.time() check_inconsistent_symptoms_v1(s, sorted_assessments_src, assessments_dest) print(time.time() - t0) # tests =================================================================== if has_tests: if sort_tests: tests_src = dataset['tests'] tests_dest = s.get_or_create_group(destination, 'tests') sort_keys = ('patient_id', 'created_at') s.sort_on(tests_src, tests_dest, sort_keys) # diet ==================================================================== if has_diet: diet_src = dataset['diet'] if 'diet' not in destination.keys(): diet_dest = s.get_or_create_group(destination, 'diet') sorted_diet_src = diet_dest if sort_diet: sort_keys = ('patient_id', 'display_name', 'id') s.sort_on(diet_src, diet_dest, sort_keys) if has_assessments: if do_daily_asmts: daily_assessments_dest = s.get_or_create_group( destination, 'daily_assessments') # post process patients # TODO: need an transaction table print(patients_src.keys()) print(dataset['assessments'].keys()) print(dataset['tests'].keys()) # write_mode = 'overwrite' write_mode = 'write' # Daily assessments # ================= if has_assessments: if create_daily: print("generate daily assessments") patient_ids = s.get(sorted_assessments_src['patient_id']) created_at_days = s.get(sorted_assessments_src['created_at_day']) raw_created_at_days = created_at_days.data[:] if 'assessment_patient_id_fkey' in assessments_src.keys(): patient_id_index = assessments_src[ 'assessment_patient_id_fkey'] else: patient_id_index = assessments_dest[ 'assessment_patient_id_fkey'] patient_id_indices = s.get(patient_id_index) raw_patient_id_indices = patient_id_indices.data[:] print("Calculating patient id index spans") t0 = time.time() patient_id_index_spans = s.get_spans( fields=(raw_patient_id_indices, raw_created_at_days)) print( f"Calculated {len(patient_id_index_spans)-1} spans in {time.time() - t0}s" ) 
print("Applying spans to 'health_status'") t0 = time.time() default_behavour_overrides = { 'id': s.apply_spans_last, 'patient_id': s.apply_spans_last, 'patient_index': s.apply_spans_last, 'created_at': s.apply_spans_last, 'created_at_day': s.apply_spans_last, 'updated_at': s.apply_spans_last, 'updated_at_day': s.apply_spans_last, 'version': s.apply_spans_max, 'country_code': s.apply_spans_first, 'date_test_occurred': None, 'date_test_occurred_guess': None, 'date_test_occurred_day': None, 'date_test_occurred_set': None, } for k in sorted_assessments_src.keys(): t1 = time.time() reader = s.get(sorted_assessments_src[k]) if k in default_behavour_overrides: apply_span_fn = default_behavour_overrides[k] if apply_span_fn is not None: apply_span_fn( patient_id_index_spans, reader, reader.create_like(daily_assessments_dest, k)) print(f" Field {k} aggregated in {time.time() - t1}s") else: print(f" Skipping field {k}") else: if isinstance(reader, fields.CategoricalField): s.apply_spans_max( patient_id_index_spans, reader, reader.create_like(daily_assessments_dest, k)) print(f" Field {k} aggregated in {time.time() - t1}s") elif isinstance(reader, rw.IndexedStringReader): s.apply_spans_concat( patient_id_index_spans, reader, reader.create_like(daily_assessments_dest, k)) print(f" Field {k} aggregated in {time.time() - t1}s") elif isinstance(reader, rw.NumericReader): s.apply_spans_max( patient_id_index_spans, reader, reader.create_like(daily_assessments_dest, k)) print(f" Field {k} aggregated in {time.time() - t1}s") else: print(f" No function for {k}") print(f"apply_spans completed in {time.time() - t0}s") if has_patients and has_assessments: if make_patient_level_assessment_metrics: if 'assessment_patient_id_fkey' in assessments_dest: src = assessments_dest['assessment_patient_id_fkey'] else: src = assessments_src['assessment_patient_id_fkey'] assessment_patient_id_fkey = s.get(src) # generate spans from the assessment-space patient_id foreign key spans = 
s.get_spans(field=assessment_patient_id_fkey.data[:]) ids = s.get(patients_dest['id']) print('calculate assessment counts per patient') t0 = time.time() writer = s.create_numeric(patients_dest, 'assessment_count', 'uint32') aggregated_counts = s.apply_spans_count(spans) s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer, spans) print( f"calculated assessment counts per patient in {time.time() - t0}" ) print('calculate first assessment days per patient') t0 = time.time() reader = s.get(sorted_assessments_src['created_at_day']) writer = s.create_fixed_string(patients_dest, 'first_assessment_day', 10) aggregated_counts = s.apply_spans_first(spans, reader) s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer, spans) print( f"calculated first assessment days per patient in {time.time() - t0}" ) print('calculate last assessment days per patient') t0 = time.time() reader = s.get(sorted_assessments_src['created_at_day']) writer = s.create_fixed_string(patients_dest, 'last_assessment_day', 10) aggregated_counts = s.apply_spans_last(spans, reader) s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer, spans) print( f"calculated last assessment days per patient in {time.time() - t0}" ) print('calculate maximum assessment test result per patient') t0 = time.time() reader = s.get(sorted_assessments_src['tested_covid_positive']) writer = reader.create_like(patients_dest, 'max_assessment_test_result') max_result_value = s.apply_spans_max(spans, reader) s.join(ids, assessment_patient_id_fkey, max_result_value, writer, spans) print( f"calculated maximum assessment test result in {time.time() - t0}" ) if has_assessments and do_daily_asmts and make_patient_level_daily_assessment_metrics: print( "creating 'daily_assessment_patient_id_fkey' foreign key index for 'patient_id'" ) t0 = time.time() patient_ids = s.get(sorted_patients_src['id']) daily_assessment_patient_ids =\ s.get(daily_assessments_dest['patient_id']) 
daily_assessment_patient_id_fkey =\ s.create_numeric(daily_assessments_dest, 'daily_assessment_patient_id_fkey', 'int64') s.get_index(patient_ids, daily_assessment_patient_ids, daily_assessment_patient_id_fkey) print(f"completed in {time.time() - t0}s") spans = s.get_spans(field=s.get( daily_assessments_dest['daily_assessment_patient_id_fkey'])) print('calculate daily assessment counts per patient') t0 = time.time() writer = s.create_numeric(patients_dest, 'daily_assessment_count', 'uint32') aggregated_counts = s.apply_spans_count(spans) daily_assessment_patient_id_fkey =\ s.get(daily_assessments_dest['daily_assessment_patient_id_fkey']) s.join(ids, daily_assessment_patient_id_fkey, aggregated_counts, writer, spans) print( f"calculated daily assessment counts per patient in {time.time() - t0}" ) if has_tests and make_new_test_level_metrics: print( "creating 'test_patient_id_fkey' foreign key index for 'patient_id'" ) t0 = time.time() patient_ids = s.get(sorted_patients_src['id']) test_patient_ids = s.get(tests_dest['patient_id']) test_patient_id_fkey = s.create_numeric(tests_dest, 'test_patient_id_fkey', 'int64') s.get_index(patient_ids, test_patient_ids, test_patient_id_fkey) test_patient_id_fkey = s.get(tests_dest['test_patient_id_fkey']) spans = s.get_spans(field=test_patient_id_fkey) print(f"completed in {time.time() - t0}s") print('calculate test_counts per patient') t0 = time.time() writer = s.create_numeric(patients_dest, 'test_count', 'uint32') aggregated_counts = s.apply_spans_count(spans) s.join(ids, test_patient_id_fkey, aggregated_counts, writer, spans) print(f"calculated test counts per patient in {time.time() - t0}") print('calculate test_result per patient') t0 = time.time() test_results = s.get(tests_dest['result']) writer = test_results.create_like(patients_dest, 'max_test_result') aggregated_results = s.apply_spans_max(spans, test_results) s.join(ids, test_patient_id_fkey, aggregated_results, writer, spans) print(f"calculated max_test_result per 
patient in {time.time() - t0}") if has_diet and make_diet_level_metrics: with utils.Timer("Making patient-level diet questions count", new_line=True): d_pids_ = s.get(diet_dest['patient_id']).data[:] d_pid_spans = s.get_spans(d_pids_) d_distinct_pids = s.apply_spans_first(d_pid_spans, d_pids_) d_pid_counts = s.apply_spans_count(d_pid_spans) p_diet_counts = s.create_numeric(patients_dest, 'diet_counts', 'int32') s.merge_left(left_on=s.get(patients_dest['id']).data[:], right_on=d_distinct_pids, right_fields=(d_pid_counts, ), right_writers=(p_diet_counts, ))
def journal_table(session, schema, old_src, new_src, src_pk, result):
    """
    Merge a journalled table (`old_src`) with a new snapshot (`new_src`).

    Both tables are ordered by primary key (the old table additionally by
    'j_valid_from'), their rows are paired up by
    `ops.ordered_generate_journalling_indices`, and every schema field that is
    present in both tables is compared to flag the rows that must be kept from
    the new snapshot. The merged columns are then written to `result` and
    summary counts are printed.

    The primary key and the 'j_valid_from' / 'j_valid_to' columns are neither
    compared nor written by this function.

    :param session: session used to load fields, build sort indices and apply them
    :param schema: object whose `fields` mapping supplies the (ordered) field names
    :param old_src: group containing the existing journalled table
    :param new_src: group containing the new snapshot
    :param src_pk: name of the primary key field
    :param result: group that receives the merged fields
    :raises KeyError: if 'j_valid_from' or 'j_valid_to' is not present in both tables
    """
    journal_only_keys = (src_pk, 'j_valid_from', 'j_valid_to')

    common_keys = set(old_src.keys()).intersection(new_src.keys())
    common_keys.remove('j_valid_from')
    common_keys.remove('j_valid_to')

    with utils.Timer("sorting old ids"):
        old_ids = session.get(old_src[src_pk])
        old_ids_ = old_ids.data[:]
        old_ids_valid_from = session.get(old_src['j_valid_from']).data[:]
        old_sorted_index = session.dataset_sort_index((old_ids_, old_ids_valid_from))
        old_count = len(old_ids_)

    with utils.Timer("sorting new_ids"):
        new_ids_ = session.get(new_src[src_pk]).data[:]
        new_sorted_index = session.dataset_sort_index((new_ids_,))
        new_count = len(new_ids_)

    # get the row maps for rows that we need to compare
    with utils.Timer("generating row_maps for merging"):
        old_ids_ = old_ids_[old_sorted_index]
        new_ids_ = new_ids_[new_sorted_index]
        old_map, new_map = ops.ordered_generate_journalling_indices(old_ids_, new_ids_)

    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is the same dtype
    to_keep = np.zeros(len(old_map), dtype=bool)

    # keep the schema's field order, restricted to fields both tables share
    common_keys = [k for k in schema.fields.keys() if k in common_keys]

    print("old_map:", old_map)
    print("new_map:", new_map)

    # pass 1: flag rows whose content differs between old and new
    for k in common_keys:
        if k in journal_only_keys:
            continue

        old_f = session.get(old_src[k])
        new_f = session.get(new_src[k])
        print(k)
        if isinstance(old_f, flds.IndexedStringField):
            old_f_i_, old_f_v_ = session.apply_index(old_sorted_index, old_f)
            new_f_i_, new_f_v_ = session.apply_index(new_sorted_index, new_f)
            ops.compare_indexed_rows_for_journalling(
                old_map, new_map, old_f_i_, old_f_v_, new_f_i_, new_f_v_, to_keep)
        else:
            old_f_ = session.apply_index(old_sorted_index, old_f)
            new_f_ = session.apply_index(new_sorted_index, new_f)
            ops.compare_rows_for_journalling(old_map, new_map, old_f_, new_f_, to_keep)

        print("to_keep:", to_keep.astype(np.uint8))
    print(to_keep.sum(), len(to_keep))

    merged_length = len(old_ids.data) + to_keep.sum()

    # summary statistics over the paired rows; -1 in a map marks "no such row"
    row_count = len(old_map)
    old_map_ = np.asarray(old_map)[:row_count]
    new_map_ = np.asarray(new_map)[:row_count]
    only_in_new = np.count_nonzero(old_map_ == -1)
    only_in_old = np.count_nonzero(new_map_ == -1)
    updated = np.count_nonzero((old_map_ != -1) & to_keep)
    not_updated = np.count_nonzero((new_map_ != -1) & np.logical_not(to_keep))

    # pass 2: write the merged columns
    for k in common_keys:
        if k in journal_only_keys:
            continue

        old_f = session.get(old_src[k])
        new_f = session.get(new_src[k])
        print(k)
        if isinstance(old_f, flds.IndexedStringField):
            old_f_i_, old_f_v_ = session.apply_index(old_sorted_index, old_f)
            new_f_i_, new_f_v_ = session.apply_index(new_sorted_index, new_f)
            # an indexed string field needs one more index entry than it has rows
            dest_i_ = np.zeros(merged_length + 1, old_f_i_.dtype)
            val_count = ops.merge_indexed_journalled_entries_count(
                old_map, new_map, to_keep, old_f_i_, new_f_i_)
            dest_v_ = np.zeros(val_count, old_f_v_.dtype)
            ops.merge_indexed_journalled_entries(
                old_map, new_map, to_keep,
                old_f_i_, old_f_v_, new_f_i_, new_f_v_, dest_i_, dest_v_)
            dest_f = new_f.create_like(result, k)
            dest_f.indices.write(dest_i_)
            dest_f.values.write(dest_v_)
        else:
            old_f_v_ = session.apply_index(old_sorted_index, old_f)
            new_f_v_ = session.apply_index(new_sorted_index, new_f)
            dest_ = np.zeros(merged_length, old_f_v_.dtype)
            ops.merge_journalled_entries(
                old_map, new_map, to_keep, old_f_v_, new_f_v_, dest_)
            dest_f = new_f.create_like(result, k)
            dest_f.data.write(dest_)

    print("old_count:", old_count)
    print("new_count:", new_count)
    print("only in old:", only_in_old)
    print("only in new:", only_in_new)
    print("updated:", updated)
    print("not updated:", not_updated)
    print("post journal count:", merged_length)
def method_paper_prediction_pipeline(ds, src_data, dest_data, first_timestamp, last_timestamp):
    """
    Print a day-by-day prevalence table built from symptom-based predictions
    and test results for patients in England.

    The pipeline
      1. collects the ids of patients whose 'lsoa11cd' starts with 'E',
      2. filters tests into the 'flat_tests' group of `dest_data` (created on
         first use, reused afterwards),
      3. filters assessments to the last one per patient per day and writes
         thresholded symptoms into the 'flat_asmts' group (created on first
         use, reused afterwards),
      4. scores each remaining assessment with a linear symptom model,
      5. spreads imputed / tested status over consecutive days and prints the
         per-day counts.

    :param ds: datastore providing readers, writers, filters and span operations
    :param src_data: source group containing 'patients', 'assessments' and 'tests'
    :param dest_data: destination group for the 'flat_tests' and 'flat_asmts' groups
    :param first_timestamp: lower bound (seconds since the epoch) for tests and
        assessments, and day zero of the output table
    :param last_timestamp: upper bound (seconds since the epoch) for the test dates
    """
    s_ptnts = src_data['patients']
    s_asmts = src_data['assessments']
    s_tests = src_data['tests']

    print(s_tests.keys())

    # Filter patients to be only from England
    # =======================================
    eng_pats = set()
    p_ids_ = ds.get_reader(s_ptnts['id'])[:]
    p_lsoas_ = ds.get_reader(s_ptnts['lsoa11cd'])[:]
    for p_id, lsoa in zip(p_ids_, p_lsoas_):
        if len(lsoa) > 0 and lsoa[0] == 69:  # 69 == ord('E')
            eng_pats.add(p_id)
    print("eng pats:", len(eng_pats))

    # Filter tests
    # ============
    # The guard must test for the group that is about to be created; testing
    # for 'flat_asmts' made a rerun fail when only 'flat_tests' existed.
    if "flat_tests" not in dest_data.keys():
        flat_tests = dest_data.create_group('flat_tests')

        raw_t_cats = ds.get_reader(s_tests['created_at'])[:]
        raw_t_dts = ds.get_reader(s_tests['date_taken_specific'])[:]
        raw_t_dsbs = ds.get_reader(s_tests['date_taken_between_start'])[:]
        raw_t_dsbe = ds.get_reader(s_tests['date_taken_between_end'])[:]

        # remove non GB tests
        test_filter = ds.get_reader(s_tests['country_code'])[:] == b'GB'
        print("standard test filter GB:", np.count_nonzero(test_filter), len(test_filter))

        # remove non england tests
        t_pids_ = ds.get_reader(s_tests['patient_id'])[:]
        cur_filter = np.zeros(len(t_pids_), dtype=bool)
        for i, t_pid in enumerate(t_pids_):
            cur_filter[i] = t_pid in eng_pats
        test_filter = test_filter & cur_filter
        print("standard test filter Eng:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where no dates are set
        cur_filter = np.logical_not(
            (raw_t_dts == 0) & (raw_t_dsbs == 0) & (raw_t_dsbe == 0))
        test_filter = test_filter & cur_filter
        print("standard test filter 1:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where all three dates are set
        cur_filter = np.logical_not(
            (raw_t_dts != 0) & (raw_t_dsbs != 0) & (raw_t_dsbe != 0))
        test_filter = test_filter & cur_filter
        print("standard test filter 2:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where only one of the date range tests is set
        cur_filter = np.logical_not(
            ((raw_t_dsbs != 0) & (raw_t_dsbe == 0))
            | ((raw_t_dsbs == 0) & (raw_t_dsbe != 0)))
        test_filter = test_filter & cur_filter
        print("standard test filter 3:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where specific date is set but out of range
        cur_filter = (raw_t_dts == 0) | \
            ((raw_t_dts >= first_timestamp) & (raw_t_dts <= last_timestamp))
        test_filter = test_filter & cur_filter
        print("standard test filter 4:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where beginning date is set but out of range
        cur_filter = (raw_t_dsbs == 0) | \
            ((raw_t_dsbs >= first_timestamp) & (raw_t_dsbs <= last_timestamp))
        test_filter = test_filter & cur_filter
        print("standard test filter 5:", np.count_nonzero(test_filter), len(test_filter))

        # remove tests where ending date is set but out of range
        cur_filter = (raw_t_dsbe == 0) | \
            ((raw_t_dsbe >= first_timestamp) & (raw_t_dsbe <= last_timestamp))
        test_filter = test_filter & cur_filter
        print("standard test filter 6:", np.count_nonzero(test_filter), len(test_filter))

        # effective test date: the specific date if set, else the range midpoint
        test_timestamps = np.where(raw_t_dts != 0,
                                   raw_t_dts,
                                   raw_t_dsbs + (raw_t_dsbe - raw_t_dsbs) / 2)

        # remove tests where the test date is after the created at date
        cur_filter = test_timestamps <= raw_t_cats
        test_filter = test_filter & cur_filter
        print("standard test filter 7:", np.count_nonzero(test_filter), len(test_filter))

        t_rsts = ds.get_reader(s_tests['result'])
        t_rsts.get_writer(flat_tests, 'result').write(ds.apply_filter(test_filter, t_rsts))
        t_pids = ds.get_reader(s_tests['patient_id'])
        t_pids.get_writer(flat_tests, 'patient_id').write(ds.apply_filter(test_filter, t_pids))
        ds.get_timestamp_writer(flat_tests, 'eff_test_date').write(
            ds.apply_filter(test_filter, test_timestamps))
    else:
        flat_tests = dest_data["flat_tests"]

    symptoms = ('persistent_cough', 'fatigue', 'delirium', 'shortness_of_breath', 'fever',
                'diarrhoea', 'abdominal_pain', 'chest_pain', 'hoarse_voice',
                'skipped_meals', 'loss_of_smell')

    # Filter assessments
    # ==================
    if "flat_asmts" not in dest_data.keys():
        flat_asmts = dest_data.create_group('flat_asmts')

        # a symptom counts as present when its raw value reaches the threshold
        symptom_thresholds = {s: 2 for s in symptoms}
        symptom_thresholds['fatigue'] = 3
        symptom_thresholds['shortness_of_breath'] = 3

        with utils.Timer("filter all out of date range assessments and non-uk assessments",
                         new_line=True):
            a_cats = ds.get_reader(s_asmts['created_at'])[:]
            # NOTE: only the lower bound is applied; `last_timestamp` is not used here
            in_date_range = a_cats >= first_timestamp
            in_date_range = in_date_range & (ds.get_reader(s_asmts['country_code'])[:] == b'GB')

            a_pids = ds.get_reader(s_asmts['patient_id'])[:]
            in_eng = np.zeros(len(a_pids), dtype=bool)
            for i, a_pid in enumerate(a_pids):
                if a_pid in eng_pats:
                    in_eng[i] = True
            print("in_eng:", in_eng.sum(), len(in_eng))
            in_date_range = in_date_range & in_eng

        with utils.Timer("get indices of final assessments of each day for each person"):
            f_a_pids = ds.apply_filter(in_date_range, a_pids)
            f_a_catds = ds.apply_filter(in_date_range,
                                        ds.get_reader(s_asmts['created_at_day'])[:])
            spans = ds.get_spans(f_a_pids)
            last_daily_asmt_filter = np.zeros(len(f_a_pids), dtype=bool)
            for span_i in range(len(spans) - 1):
                sb = spans[span_i]
                se = spans[span_i + 1]
                # spans of equal day within this patient's assessments
                subspans = ds.get_spans(f_a_catds[sb:se])
                if span_i < 3:
                    print(subspans)
                # the row just before each sub-span boundary is the last of its day
                for s2 in range(1, len(subspans)):
                    last_daily_asmt_filter[sb + subspans[s2] - 1] = True
            print("last_daily_asmt_filter:", last_daily_asmt_filter.sum())
            print(last_daily_asmt_filter[:50])

        with utils.Timer("flattening and filtering symptoms"):
            for s in symptoms:
                reader = ds.get_reader(s_asmts[s])
                writer = ds.get_numeric_writer(flat_asmts, s, 'bool')
                filtered = ds.apply_filter(last_daily_asmt_filter,
                                           ds.apply_filter(in_date_range, reader[:]))
                writer.write(filtered >= symptom_thresholds[s])

        with utils.Timer("flattening and filtering other fields", new_line=True):
            for f in ('id', 'patient_id', 'created_at', 'created_at_day',
                      'tested_covid_positive'):
                # first pass: date / country / England filter
                reader = ds.get_reader(s_asmts[f])
                writer = reader.get_writer(flat_asmts, f)
                ds.apply_filter(in_date_range, reader, writer)
                # second pass: keep the last assessment of each day, in place
                reader = ds.get_reader(flat_asmts[f])
                writer = reader.get_writer(flat_asmts, f, write_mode='overwrite')
                ds.apply_filter(last_daily_asmt_filter, reader, writer)
                print(" {}".format(f), len(ds.get_reader(flat_asmts[f])))

        # telemetry only
        for s in symptoms:
            print(s, len(ds.get_reader(flat_asmts[s])),
                  np.count_nonzero(ds.get_reader(flat_asmts[s])[:]))
    else:
        flat_asmts = dest_data["flat_asmts"]

    # Calculate prevalence
    # --------------------
    if 'prediction' not in flat_asmts:
        intercept = -1.19015973
        weights = {'persistent_cough': 0.23186655, 'fatigue': 0.56532346,
                   'delirium': -0.12935112, 'shortness_of_breath': 0.58273967,
                   'fever': 0.16580974, 'diarrhoea': 0.10236126,
                   'abdominal_pain': -0.11204163, 'chest_pain': -0.12318634,
                   'hoarse_voice': -0.17818597, 'skipped_meals': 0.25902482,
                   'loss_of_smell': 1.82895239}

        with utils.Timer("predicting covid by assessment", new_line=True):
            cumulative = np.zeros(len(ds.get_reader(flat_asmts['persistent_cough'])),
                                  dtype='float64')
            for s in symptoms:
                reader = ds.get_reader(flat_asmts[s])
                cumulative += reader[:] * weights[s]
            cumulative += intercept
            print(" {}".format(len(cumulative)))
            ds.get_numeric_writer(flat_asmts, 'prediction', 'float32',
                                  writemode='overwrite').write(cumulative)
            pos_filter = cumulative > 0.0
            print("pos_filter: ", np.count_nonzero(pos_filter), len(pos_filter))
    else:
        cumulative = ds.get_reader(flat_asmts['prediction'])[:]

    # apply
    # positive test -> imputed positive -> negative test
    spans = ds.get_spans(ds.get_reader(flat_asmts['patient_id'])[:])
    print('spans:', len(spans))

    first_day = datetime.fromtimestamp(first_timestamp)

    # histogram of remaining assessments per day offset (telemetry)
    daydict = defaultdict(int)
    with utils.Timer("checking date deltas", new_line=True):
        a_cats = ds.get_reader(flat_asmts['created_at'])[:]
        for a_cat in a_cats:
            daydict[(datetime.fromtimestamp(a_cat) - first_day).days] += 1
        sdaydict = sorted(daydict.items())
        print(sdaydict)

    # build a combined id index for assessments and tests
    # ---------------------------------------------------
    remaining_a_pids = ds.get_reader(flat_asmts['patient_id'])[:]
    remaining_t_pids = ds.get_reader(flat_tests['patient_id'])[:]
    print("pids from assessments and tests:",
          len(remaining_a_pids), len(remaining_t_pids),
          len(set(remaining_a_pids).union(set(remaining_t_pids))))
    a_pid_index, t_pid_index = ds.get_shared_index((remaining_a_pids, remaining_t_pids))
    # size the per-day arrays from the true maximum index; the last element is
    # only the maximum when the ids happen to be sorted
    max_index = max(np.max(a_pid_index), np.max(t_pid_index))
    print("merging indices:", len(a_pid_index), len(t_pid_index), max_index)
    print('max indices:', a_pid_index[-1], t_pid_index[-1])

    # calculate offset days for assessments
    # -------------------------------------
    a_tcps = ds.get_reader(flat_asmts['tested_covid_positive'])[:]
    a_offset_days = np.zeros(len(a_cats), dtype='int16')
    with utils.Timer("calculate offset days for assessments", new_line=True):
        for i_r, a_cat in enumerate(a_cats):
            a_offset_days[i_r] = (datetime.fromtimestamp(a_cat) - first_day).days
    print("assessment_dates:", sorted(utils.build_histogram(a_offset_days)))

    # calculate offset days for tests
    # -------------------------------
    raw_t_etds = ds.get_reader(flat_tests['eff_test_date'])[:]
    # read the results once rather than indexing the reader per row
    raw_t_rsts = ds.get_reader(flat_tests['result'])[:]
    t_offset_days = np.zeros(len(raw_t_etds), dtype='int16')
    t_offset_dates = [None] * len(raw_t_etds)
    for i_r, t_etd in enumerate(raw_t_etds):
        test_dt = datetime.fromtimestamp(t_etd)
        t_offset_days[i_r] = (test_dt - first_day).days
        t_offset_dates[i_r] = test_dt.date()
    print("test_dates:", sorted(utils.build_histogram(t_offset_days)))
    print("test_dates2:", sorted(utils.build_histogram(t_offset_dates)))

    # create the destination arrays to hold daily data per patient
    # ------------------------------------------------------------
    daycount = max(a_offset_days.max(), t_offset_days.max()) + 1
    print("daycount:", daycount)
    # i_days: status imputed from symptoms; t_days: status from test results
    i_days = [np.zeros(max_index + 1, dtype='int16') for _ in range(daycount)]
    t_days = [np.zeros(max_index + 1, dtype='int16') for _ in range(daycount)]

    # incorporate assessment predictions and positive test results
    # note: a_offset_days is in assessment space
    print("len(a_offset_days):", len(a_offset_days))
    print("len(a_pid_index):", len(a_pid_index))
    with utils.Timer("incorporating assessments and assessment-based tests"):
        for i_r, offset in enumerate(a_offset_days):
            pid_ind = a_pid_index[i_r]
            # +7 / -7 seeds a positive / negative status that decays by one per day
            i_days[offset][pid_ind] = 7 if cumulative[i_r] > 0.0 else -7
            # presumably 3 = positive and 2 = negative in 'tested_covid_positive'
            # -- TODO confirm against the schema
            tcp = a_tcps[i_r]
            t_days[offset][pid_ind] = 7 if tcp == 3 else -7 if tcp == 2 else 0

    # incorporate test results into the appropriate day's entry
    with utils.Timer("incorporating test_results"):
        for i_r, offset in enumerate(t_offset_days):
            day = t_days[offset]
            # presumably 4 = positive and 3 = negative in 'result' -- TODO confirm
            if raw_t_rsts[i_r] == 4:
                day[t_pid_index[i_r]] = 7
            elif raw_t_rsts[i_r] == 3:
                day[t_pid_index[i_r]] = -7

    for i_d, d in enumerate(i_days):
        print(i_d, np.count_nonzero(d))

    with utils.Timer("calculating progression"):
        for da in (i_days, t_days):
            for i_d in range(len(da) - 1):
                prior_d = da[i_d]
                next_d = da[i_d + 1]
                # a day without its own entry inherits the previous day's status,
                # moved one step towards zero
                next_d[:] = np.where(next_d != 0,
                                     next_d,
                                     np.where(prior_d > 0,
                                              prior_d - 1,
                                              np.minimum(prior_d + 1, 0)))

    for d, (i_d, t_d) in enumerate(zip(i_days, t_days)):
        i_present = np.count_nonzero(i_d != 0)
        i_positive = np.count_nonzero(i_d > 0)
        t_present = np.count_nonzero(t_d != 0)
        t_positive = np.count_nonzero(t_d > 0)
        # combined status: a test result takes precedence over the imputed one
        c_d = np.where(t_d == 0, i_d, t_d)
        c_present = np.count_nonzero(c_d != 0)
        c_positive = np.count_nonzero(c_d > 0)
        day = first_day + timedelta(days=d)
        if c_present != 0:
            print(day, i_present, i_positive, t_present, t_positive,
                  c_present, c_positive, c_positive / c_present)
        else:
            print(day, i_present, i_positive, t_present, t_positive,
                  c_present, c_positive, "NA")
def method_paper_summary_pipeline(ds, src_data, dest_data, first_timestamp, last_timestamp):
    """
    Print summary statistics for the filtered patient / assessment cohort.

    Patient and assessment filters are computed once and stored in the
    'filters' group of `dest_data`; the filtered fields are stored in the
    'filtered_patients' and 'filtered_assessments' groups. Each stage is
    skipped when its output already exists. Finally the cohort size, gender
    split, predicted covid-19 count, age and per-condition counts are printed.

    :param ds: datastore providing readers, writers, filters and span operations
    :param src_data: source group containing 'patients', 'assessments' and 'tests'
    :param dest_data: destination group for the filter and filtered-data groups
    :param first_timestamp: inclusive lower bound on assessment 'created_at'
    :param last_timestamp: exclusive upper bound on assessment 'created_at'
    """
    s_ptnts = src_data['patients']
    s_asmts = src_data['assessments']
    filters = ds.get_or_create_group(dest_data, 'filters')
    print(s_ptnts.keys())
    print(src_data['tests'].keys())

    conditions = ('has_kidney_disease', 'has_lung_disease', 'has_heart_disease',
                  'has_diabetes', 'has_hayfever', 'has_cancer')
    symptoms = ('persistent_cough', 'fatigue', 'delirium', 'shortness_of_breath', 'fever',
                'diarrhoea', 'abdominal_pain', 'chest_pain', 'hoarse_voice',
                'skipped_meals', 'loss_of_smell')
    # a symptom counts as present when its raw value reaches the threshold
    symptom_thresholds = {s: 2 for s in symptoms}
    symptom_thresholds['fatigue'] = 3
    symptom_thresholds['shortness_of_breath'] = 3

    intercept = -1.19015973
    weights = {
        'persistent_cough': 0.23186655,
        'fatigue': 0.56532346,
        'delirium': -0.12935112,
        'shortness_of_breath': 0.58273967,
        'fever': 0.16580974,
        'diarrhoea': 0.10236126,
        'abdominal_pain': -0.11204163,
        'chest_pain': -0.12318634,
        'hoarse_voice': -0.17818597,
        'skipped_meals': 0.25902482,
        'loss_of_smell': 1.82895239,
    }

    # Count patients from England
    # ===========================
    # NOTE: the set is only reported; the patient filter below does not
    # restrict to England.
    eng_pats = set()
    p_ids_ = ds.get_reader(s_ptnts['id'])[:]
    p_lsoas_ = ds.get_reader(s_ptnts['lsoa11cd'])[:]
    for p_id, lsoa in zip(p_ids_, p_lsoas_):
        if len(lsoa) > 0 and lsoa[0] == 69:  # 69 == ord('E')
            eng_pats.add(p_id)
    print("eng pats:", len(eng_pats))

    # generating patient filter
    # -------------------------
    if 'patient_filter' not in filters.keys():
        with utils.Timer("generating patient filter", new_line=True):
            p_filter = ds.get_reader(s_ptnts['year_of_birth_valid'])[:]

            # valid age ranges
            r_ = ds.get_reader(s_ptnts['age'])[:]
            p_filter = p_filter & ((r_ >= 18) & (r_ <= 100))

            # gender filter
            r_ = ds.get_reader(s_ptnts['gender'])[:]
            p_filter = p_filter & ((r_ == 1) | (r_ == 2))

            # country code
            r_ = ds.get_reader(s_ptnts['country_code'])[:]
            p_filter = p_filter & (r_ == b'GB')
            print("UK:", p_filter.sum(), len(p_filter))

            # no assessments
            r_ = ds.get_reader(s_ptnts['assessment_count'])[:]
            p_filter = p_filter & (r_ > 0)
            print("No asmts:", p_filter.sum(), len(p_filter))

            print(" {}, {}".format(np.count_nonzero(p_filter),
                                   np.count_nonzero(np.logical_not(p_filter))))
            ds.get_numeric_writer(filters, 'patient_filter', 'bool').write(p_filter)

    # generating assessment filter
    # ----------------------------
    if 'assessment_filter' not in filters.keys():
        with utils.Timer("generating assessment filter", new_line=True):
            # `np.bool` was removed in NumPy 1.24; the builtin `bool` is the same dtype
            a_filter = np.ones(len(ds.get_reader(s_asmts['id'])), dtype=bool)

            # created_at in range
            r_ = ds.get_reader(s_asmts['created_at'])[:]
            a_filter = a_filter & ((r_ >= first_timestamp) & (r_ < last_timestamp))

            # country code
            r_ = ds.get_reader(s_asmts['country_code'])[:]
            a_filter = a_filter & (r_ == b'GB')

            with utils.Timer("filtering out orphaned assessments"):
                # only assessments whose patient passed the patient filter survive
                p_ids_ = ds.get_reader(s_ptnts['id'])[:]
                p_ids_ = ds.apply_filter(
                    ds.get_reader(filters['patient_filter'])[:], p_ids_)
                a_pids_ = ds.get_reader(s_asmts['patient_id'])[:]
                f_ = persistence.foreign_key_is_in_primary_key(p_ids_, a_pids_)
            a_filter = a_filter & f_

            print(" {}, {}".format(np.count_nonzero(a_filter),
                                   np.count_nonzero(np.logical_not(a_filter))))
            ds.get_numeric_writer(filters, 'assessment_filter', 'bool').write(a_filter)

    # filtering patients
    # ------------------
    if 'filtered_patients' not in dest_data.keys():
        flt_ptnts = dest_data.create_group('filtered_patients')
        with utils.Timer("filtering/flattening patient fields", new_line=True):
            p_filter = ds.get_reader(filters['patient_filter'])[:]

            r = ds.get_reader(s_ptnts['age'])
            r.get_writer(flt_ptnts, 'age').write(ds.apply_filter(p_filter, r[:]))

            for k in conditions:
                r = ds.get_reader(s_ptnts[k])
                ds.get_numeric_writer(flt_ptnts, k, 'bool').write(
                    ds.apply_filter(p_filter, r[:]) == 2)

            smoker1 = ds.get_reader(s_ptnts['is_smoker'])
            smoker2 = ds.get_reader(s_ptnts['smoker_status'])
            smoker = (smoker1[:] == 2) | (smoker2[:] == 3)
            # apply the patient filter like every other field in this group, so
            # 'smoker' lines up with the filtered cohort
            ds.get_numeric_writer(flt_ptnts, 'smoker', 'bool').write(
                ds.apply_filter(p_filter, smoker))

            gender_ = ds.get_reader(s_ptnts['gender'])
            # the filter only admits genders 1 and 2, so this stores 0 / 1
            ds.get_numeric_writer(flt_ptnts, 'gender', 'uint8').write(
                ds.apply_filter(p_filter, gender_) - 1)
    else:
        flt_ptnts = dest_data['filtered_patients']

    # filtering assessments
    # ---------------------
    if 'filtered_assessments' not in dest_data.keys():
        flt_asmts = dest_data.create_group('filtered_assessments')
        with utils.Timer("filtering/flattening symptoms", new_line=True):
            a_filter = ds.get_reader(filters['assessment_filter'])[:]
            for s in symptoms:
                r_ = ds.get_reader(s_asmts[s])[:]
                ds.get_numeric_writer(flt_asmts, s, 'bool').write(
                    ds.apply_filter(a_filter, r_) >= symptom_thresholds[s])

            a_pids = ds.get_reader(s_asmts['patient_id'])
            a_pids.get_writer(flt_asmts, 'patient_id').write(
                ds.apply_filter(a_filter, a_pids[:]))
    else:
        flt_asmts = dest_data['filtered_assessments']

    # predicting covid
    # ----------------
    if 'prediction' not in flt_asmts:
        with utils.Timer("generating covid prediction", new_line=True):
            cumulative = np.zeros(len(ds.get_reader(flt_asmts['persistent_cough'])),
                                  dtype='float64')
            for s in symptoms:
                reader = ds.get_reader(flt_asmts[s])
                cumulative += reader[:] * weights[s]
            cumulative += intercept
            print("positive predictions", np.count_nonzero(cumulative > 0.0), len(cumulative))

            # keep one value per patient span: that patient's highest score
            a_pids_ = ds.get_reader(flt_asmts['patient_id'])[:]
            spans = ds.get_spans(a_pids_)
            max_prediction_inds = ds.apply_spans_index_of_max(spans, cumulative)
            max_predictions = cumulative[max_prediction_inds]
            ds.get_numeric_writer(flt_asmts, 'prediction', 'float32').write(max_predictions)

            pos_filter = max_predictions > 0.0
            print("pos_filter: ", np.count_nonzero(pos_filter), len(pos_filter))

    # generating table results
    print('total_assessments:',
          np.count_nonzero(ds.get_reader(filters['assessment_filter'])[:]))

    subjects = np.count_nonzero(ds.get_reader(filters['patient_filter'])[:])
    genders = ds.get_reader(flt_ptnts['gender'])[:]
    predictions = ds.get_reader(flt_asmts['prediction'])
    predicted_c19 = np.count_nonzero(predictions[:] > 0.0)
    ages = ds.get_reader(flt_ptnts['age'])[:]
    age_mean = np.mean(ages)
    age_std = np.std(ages)

    print('subjects:', subjects)
    male = np.count_nonzero(genders)
    female = np.count_nonzero(genders == False)
    print('gender: {}:{}, {:.2%}:{:.2%}'.format(
        male, female, male / len(genders), female / len(genders)))
    print('{}:'.format('predicted covid-19'),
          '{} {:.2%}'.format(predicted_c19, predicted_c19 / len(predictions)))
    print('age {:.2f} ({:.2f})'.format(age_mean, age_std))
    for k in conditions + ('smoker', ):
        kr_ = ds.get_reader(flt_ptnts[k])[:]
        pos = np.count_nonzero(kr_)
        print('{}:'.format(k), '{} {:.2%}'.format(pos, pos / len(kr_)))
def ppe_use_and_travel(ds, src, tmp, start_timestamp):
    """
    Print predicted covid positivity grouped by isolation level and mask use.

    Stage 1 (skipped when `tmp` already has 'filtered_assessments') filters
    GB assessments created at or after `start_timestamp`, thresholds the
    symptom and mask fields to 0 / 1, and scores each assessment with
    `method_paper_model`.
    Stage 2 (skipped when `tmp` already has 'patient_assessment_summaries')
    reduces the filtered assessments to one row per patient-id span by taking
    the maximum of each mask, isolation and prediction field.
    Finally the per-patient rows are grouped by
    (capped 'isolation_lots_of_people', 'mask_cloth_or_scarf') and the
    positive / total counts and their ratio are printed for each group.

    :param ds: datastore providing readers, writers, filters and span operations
    :param src: source group containing 'assessments' and 'patients'
    :param tmp: group holding the intermediate results
    :param start_timestamp: inclusive lower bound on assessment 'created_at'
    """
    s_asmts = src['assessments']

    # Defined up front so that both stages can run independently of each other
    symptom_keys = ('persistent_cough', 'fatigue', 'delirium', 'shortness_of_breath',
                    'fever', 'diarrhoea', 'abdominal_pain', 'chest_pain',
                    'hoarse_voice', 'skipped_meals', 'loss_of_smell')
    mask_keys = ('mask_cloth_or_scarf', 'mask_surgical', 'mask_n95_ffp')
    isolation_keys = ('isolation_healthcare_provider', 'isolation_little_interaction',
                      'isolation_lots_of_people')
    other_keys = ('patient_id', )

    if 'filtered_assessments' not in tmp.keys():
        f_asmts = tmp.create_group('filtered_assessments')

        cats = ds.get_reader(s_asmts['created_at'])
        asmt_filter = cats[:] >= start_timestamp
        ccs = ds.get_reader(s_asmts['country_code'])
        asmt_filter = asmt_filter & (ccs[:] == b'GB')

        # raw values at or above the threshold are flattened to 1, others to 0
        symptom_thresholds = {s: 2 for s in symptom_keys}
        symptom_thresholds.update({m: 2 for m in mask_keys})
        symptom_thresholds['fatigue'] = 3
        symptom_thresholds['shortness_of_breath'] = 3

        for k in symptom_keys + mask_keys + isolation_keys + other_keys:
            with utils.Timer("filtering {}".format(k)):
                reader = ds.get_reader(s_asmts[k])
                if k in mask_keys + symptom_keys:
                    raw_values = reader[:]
                    values = np.where(raw_values >= symptom_thresholds[k], 1, 0)
                    ds.get_numeric_writer(f_asmts, k, 'int8').write(
                        ds.apply_filter(asmt_filter, values))
                    # telemetry: value histograms before and after thresholding
                    hist = np.unique(raw_values, return_counts=True)
                    print(sorted(zip(hist[0], hist[1])))
                    hist = np.unique(values, return_counts=True)
                    print(sorted(zip(hist[0], hist[1])))
                else:
                    reader.get_writer(f_asmts, k).write(
                        ds.apply_filter(asmt_filter, reader))
        print('filtered assessments:', np.count_nonzero(asmt_filter), len(asmt_filter))

        symptom_readers = {s: ds.get_reader(f_asmts[s]) for s in symptom_keys}
        predictions = ds.get_numeric_writer(f_asmts, 'prediction', 'float32')
        method_paper_model(ds, symptom_readers, predictions)
        predictions = ds.get_reader(f_asmts['prediction'])
        print('predictions:', np.count_nonzero(predictions[:] > 0), len(predictions))
    else:
        # reuse the earlier result instead of failing on an undefined name
        f_asmts = tmp['filtered_assessments']

    if 'patient_assessment_summaries' not in tmp.keys():
        asmt_psum = tmp.create_group('patient_assessment_summaries')
        pids = ds.get_reader(f_asmts['patient_id'])
        predictions = ds.get_reader(f_asmts['prediction'])

        with utils.Timer("generating patient_id spans"):
            asmt_spans = ds.get_spans(field=pids[:])

        for k in mask_keys:
            with utils.Timer("getting per patient mask summary for {}".format(k)):
                writer = ds.get_numeric_writer(asmt_psum, k, 'int8')
                ds.apply_spans_max(asmt_spans, ds.get_reader(f_asmts[k])[:], writer)
                print(sorted(utils.build_histogram(ds.get_reader(asmt_psum[k])[:])))

        for k in isolation_keys:
            with utils.Timer("getting per patient isolation summary for {}".format(k)):
                writer = ds.get_numeric_writer(asmt_psum, k, 'int32')
                ds.apply_spans_max(asmt_spans, ds.get_reader(f_asmts[k])[:], writer)
                print(sorted(utils.build_histogram(ds.get_reader(asmt_psum[k])[:])))

        with utils.Timer("getting prediction maxes for patients"):
            p_predictions = predictions.get_writer(asmt_psum, 'prediction')
            ds.apply_spans_max(asmt_spans, predictions, p_predictions)
            # read back the field just written; the stale loop variable `k`
            # pointed at 'isolation_lots_of_people' here
            p_predictions = ds.get_reader(asmt_psum['prediction'])
            positives = p_predictions[:] > 0
            print("max covid prediction:", np.count_nonzero(positives), len(positives))

        with utils.Timer("getting patient ids from assessments"):
            writer = pids.get_writer(asmt_psum, 'patient_id')
            writer.write(pd.unique(pids[:]))
    else:
        asmt_psum = tmp['patient_assessment_summaries']

    s_ptnts = src['patients']
    print(s_ptnts.keys())

    pdf = pd.DataFrame({
        'id': ds.get_reader(s_ptnts['id'])[:],
        'hwwc': ds.get_reader(s_ptnts['health_worker_with_contact'])[:],
    })
    adf = pd.DataFrame({'patient_id': ds.get_reader(asmt_psum['patient_id'])[:]})
    # currently only used to report the joined row count
    jdf = pd.merge(left=adf, right=pdf, left_on='patient_id', right_on='id', how='left')
    print(len(jdf['hwwc']))

    class TestResults:
        """Running count of positive and total results for one group."""

        def __init__(self):
            self.positive = 0
            self.total = 0

        def add(self, result):
            if result:
                self.positive += 1
            self.total += 1

    results = defaultdict(TestResults)
    positives = ds.get_reader(asmt_psum['prediction'])[:]
    positives = positives > 0

    mask_0 = ds.get_reader(asmt_psum['mask_cloth_or_scarf'])[:]
    # mask_1 / mask_2 are only needed for the combined-mask variant below
    mask_1 = ds.get_reader(asmt_psum['mask_surgical'])[:]
    mask_2 = ds.get_reader(asmt_psum['mask_n95_ffp'])[:]
    # mask = mask_0 | mask_1 | mask_2
    mask = mask_0
    print(np.unique(mask, return_counts=True))

    isol_lots = ds.get_reader(asmt_psum['isolation_lots_of_people'])[:]
    # cap the isolation value at 7 so that larger values share one group
    isol_lots_7 = np.where(isol_lots > 7, 7, isol_lots)
    print(np.unique(isol_lots_7, return_counts=True))
    print(len(mask), len(positives), len(isol_lots_7))

    # isolation lots of users
    for i_r in range(len(mask)):
        results[(isol_lots_7[i_r], mask[i_r])].add(positives[i_r])

    groupings = sorted(
        (key, (res.positive, res.total)) for key, res in results.items())

    for key, (positive, total) in groupings:
        print(key, positive, total, positive / total)
def ppe_use_and_travel_2(ds, src, dest, start_ts):
    """Build (or reuse) filtered patient/assessment/test ids and print stats.

    On the first run, three groups are created in ``dest``
    ('filtered_patients', 'filtered_assessments', 'filtered_tests') and
    populated with:

    * patient ids whose 'assessment_count' is greater than zero,
    * assessment 'patient_id', 'created_at' and 'tested_covid_positive'
      rows whose patient id appears in the source patient ids,
    * test 'patient_id' rows whose patient id appears in the source
      patient ids.

    On later runs (when 'filtered_patients' already exists in ``dest``) the
    previously written fields are read back instead.

    The function then prints a number of diagnostics: the largest shared
    index across the three id fields, the number of assessments whose
    'tested_covid_positive' equals 3, the number of assessments created at or
    after ``start_ts``, and a sorted histogram of whole-day offsets of those
    assessments from ``start_ts``.

    :param ds: Ignored; it is immediately replaced by a new
        ``session.Session()`` (see the note in the body).
    :param src: Container providing the 'patients', 'assessments' and
        'tests' groups.
    :param dest: Container in which the filtered groups are created or from
        which they are read back.
    :param start_ts: Timestamp (as accepted by ``datetime.fromtimestamp``)
        used as the lower bound on assessment 'created_at' values and as the
        origin for the day offsets.
    :return: None; all results are printed or written to ``dest``.
    """
    # NOTE(review): the caller-supplied ``ds`` is discarded here. Presumably
    # the caller still passes an older datastore object while this function
    # needs the Session API (``ds.get(...)``) - confirm before removing.
    ds = session.Session()

    s_ptnts = src['patients']
    s_asmts = src['assessments']
    print(s_asmts.keys())
    s_tests = src['tests']

    if 'filtered_patients' not in dest.keys():
        f_ptnts = dest.create_group('filtered_patients')
        f_asmts = dest.create_group('filtered_assessments')
        f_tests = dest.create_group('filtered_tests')

        raw_p_ids = ds.get(s_ptnts['id']).data[:]
        raw_p_acts = ds.get(s_ptnts['assessment_count']).data[:]
        raw_a_pids = ds.get(s_asmts['patient_id']).data[:]
        raw_t_pids = ds.get(s_tests['patient_id']).data[:]

        # filter out anyone without assessments
        patient_filter = raw_p_acts > 0
        _print_filter_counts("patient_filter:", patient_filter)

        # filter patients
        f_p_ids = ds.get(s_ptnts['id']).create_like(f_ptnts, 'id')
        f_p_ids.data.write(ds.apply_filter(patient_filter, raw_p_ids))

        # filter out any orphaned assessments
        # NOTE(review): membership is checked against the *unfiltered*
        # patient ids, not the filtered ones written above - confirm that
        # this is intended.
        with utils.Timer("fk in pk"):
            assessment_filter = persistence.foreign_key_is_in_primary_key(
                raw_p_ids, raw_a_pids)
        _print_filter_counts("assessment_filter:", assessment_filter)

        f_a_pids = ds.get(s_asmts['patient_id']).create_like(
            f_asmts, 'patient_id')
        f_a_pids.data.write(ds.apply_filter(assessment_filter, raw_a_pids))
        for k in ('created_at', 'tested_covid_positive'):
            field = ds.get(s_asmts[k]).create_like(f_asmts, k)
            field.data.write(
                ds.apply_filter(assessment_filter, ds.get(s_asmts[k]).data[:]))

        # filter out any orphaned tests
        test_filter = persistence.foreign_key_is_in_primary_key(
            raw_p_ids, raw_t_pids)
        _print_filter_counts("test_filter:", test_filter)

        f_t_pids = ds.get(s_tests['patient_id']).create_like(
            f_tests, 'patient_id')
        f_t_pids.data.write(ds.apply_filter(test_filter, raw_t_pids))
    else:
        # The filtered groups were written by an earlier run; reuse them.
        f_ptnts = dest['filtered_patients']
        f_asmts = dest['filtered_assessments']
        f_tests = dest['filtered_tests']
        f_p_ids = ds.get(f_ptnts['id'])
        f_a_pids = ds.get(f_asmts['patient_id'])
        f_t_pids = ds.get(f_tests['patient_id'])

    # calculate the shared set of indices for assessments / tests back to
    # patients
    with utils.Timer("get_shared_index"):
        p_inds, a_pinds, t_pinds = ds.get_shared_index(
            (f_p_ids, f_a_pids, f_t_pids))
    print(max(p_inds.max(), a_pinds.max(), t_pinds.max()))

    # count assessments that report a positive test result (value 3)
    pos_asmt_tests = ds.get(f_asmts['tested_covid_positive']).data[:] == 3
    _print_filter_counts("old tests positive:", pos_asmt_tests)

    # keep only assessments created at or after start_ts
    a_cats = ds.get(f_asmts['created_at'])
    asmt_filter = a_cats.data[:] >= start_ts
    print(np.count_nonzero(asmt_filter), len(asmt_filter))
    raw_a_cats = ds.apply_filter(asmt_filter, a_cats.data[:])

    # Whole days elapsed between start_ts and each remaining assessment.
    # NOTE(review): datetime.fromtimestamp yields naive local-time values, so
    # day boundaries depend on the machine's timezone - confirm this is
    # acceptable. The per-element conversion is kept (rather than integer
    # division of the raw timestamps) so results match that local-time logic.
    start_dt = datetime.fromtimestamp(start_ts)
    a_days = np.fromiter(
        ((datetime.fromtimestamp(ts) - start_dt).days for ts in raw_a_cats),
        dtype=np.int32,
        count=len(raw_a_cats))
    print(sorted(utils.build_histogram(a_days)))


def _print_filter_counts(label, mask):
    """Print ``label`` followed by the counts of set and unset mask entries."""
    kept = np.count_nonzero(mask)
    print(label, kept, len(mask) - kept)