def match_assessment_v1(test_df, assessment_df, dest_df, gap, positive_only=False):
    """
    Pair every test with the assessments recorded for the same patient shortly before it.

    The tests and assessments are inner-joined on ``patient_id`` into ``dest_df``; the
    merged rows are then filtered so that only rows whose assessment timestamp lies
    between ``test - gap days`` and the test timestamp (both ends inclusive) remain.

    :param test_df: The tests dataframe (left side of the merge).
    :param assessment_df: The assessment dataframe (right side of the merge).
    :param dest_df: The destination dataframe that receives the merged rows and is filtered.
    :param gap: Maximum number of days an assessment may precede the test.
    :param positive_only: When true, also require the ``result`` column to equal 4.
    :return: ``dest_df`` after the merge and the filter have been applied.
    """
    edf.merge(test_df, assessment_df, dest=dest_df,
              left_on='patient_id', right_on='patient_id', how='inner')

    # In the merged frame the '_r' suffix carries the assessment date and '_l' the test date.
    assessment_time = dest_df['created_at_r']
    test_time = dest_df['created_at_l']

    # The assessment must not be later than the test ...
    keep = assessment_time <= test_time
    # ... and the test must fall within `gap` days of it (gap converted from days to
    # seconds; assumes the created_at fields are second-resolution timestamps).
    keep &= assessment_time + gap * 3600 * 24 >= test_time

    if positive_only:
        keep &= dest_df['result'] == 4

    dest_df.apply_filter(keep)
    return dest_df
def tests_merge_left_compound_key(self):
    """
    Left-merge two dataframes on a two-field compound key and verify the joined rows.

    Both sides hold the same eight (id_1, id_2) pairs, listed in a different order on
    the right. After the merge, the string values and both components of the key must
    line up row by row between the left and right columns of the destination.
    """
    l_id_1 = np.asarray([0, 0, 0, 0, 1, 1, 1, 1], dtype='int32')
    l_id_2 = np.asarray([0, 1, 2, 3, 0, 1, 2, 3], dtype='int32')
    r_id_1 = np.asarray([0, 1, 0, 1, 0, 1, 0, 1], dtype='int32')
    r_id_2 = np.asarray([0, 0, 1, 1, 2, 2, 3, 3], dtype='int32')
    # Each value spells out its own key ("<id_1><id_2>"), so a correct join makes the
    # left and right value columns identical.
    l_vals = ['00', '01', '02', '03', '10', '11', '12', '13']
    r_vals = ['00', '10', '01', '11', '02', '12', '03', '13']
    expected = ['00', '01', '02', '03', '10', '11', '12', '13']

    bio = BytesIO()
    with session.Session() as s:
        dst = s.open_dataset(bio, 'w', 'dst')
        ldf = dst.create_dataframe('ldf')
        rdf = dst.create_dataframe('rdf')
        ldf.create_numeric('l_id_1', 'int32').data.write(l_id_1)
        ldf.create_numeric('l_id_2', 'int32').data.write(l_id_2)
        ldf.create_indexed_string('l_vals').data.write(l_vals)
        rdf.create_numeric('r_id_1', 'int32').data.write(r_id_1)
        rdf.create_numeric('r_id_2', 'int32').data.write(r_id_2)
        rdf.create_indexed_string('r_vals').data.write(r_vals)
        ddf = dst.create_dataframe('ddf')
        dataframe.merge(ldf, rdf, ddf, ('l_id_1', 'l_id_2'), ('r_id_1', 'r_id_2'), how='left')

        self.assertEqual(expected, ddf['l_vals'].data[:])
        self.assertEqual(expected, ddf['r_vals'].data[:])
        self.assertEqual(ddf['l_id_1'].data[:].tolist(), ddf['r_id_1'].data[:].tolist())
        # Compare the LEFT second key component against the right one; the previous
        # version compared 'r_id_2' with itself, which could never fail.
        self.assertEqual(ddf['l_id_2'].data[:].tolist(), ddf['r_id_2'].data[:].tolist())
def join_tests():
    """
    Join the previously merged (assessments, vaccine) dataframe with the tests.

    The 'asmt_v' dataframe of the destination dataset is inner-joined with the source
    'tests' dataframe into a new 'tests_m' dataframe, which is then filtered so that
    only rows whose test date is strictly after the vaccine date and strictly less
    than ten days after it are kept.
    """
    # Ten days expressed in seconds (assumes the date fields are second-resolution
    # timestamps).
    ten_days = 3600 * 24 * 10

    with sess.Session() as s:
        # Source tests, and the destination dataset holding the earlier merge result.
        tests_src = s.open_dataset(ADATA, 'r', 'asmt')['tests']
        dst = s.open_dataset(DSTDATA, 'r+', 'dst')
        vacc = dst['asmt_v']

        merged = dst.create_dataframe('tests_m')
        dataframe.merge(vacc, tests_src, merged, 'patient_id_l', 'patient_id', how='inner')

        # In the merged frame '_l' is the vaccine date and '_r' is the test date.
        vaccine_date = merged['date_taken_specific_l']
        test_date = merged['date_taken_specific_r']

        keep = vaccine_date < test_date                  # test taken after the vaccine
        keep &= vaccine_date > (test_date - ten_days)    # ... and within ten days of it
        merged.apply_filter(keep)
def tests_merge_outer(self):
    """
    Outer-merge a left frame with repeated and missing keys against a right frame
    holding keys 0..7, and check the string columns of the result.

    Right-hand keys with no left-hand partner (1 and 5) are expected at the end of
    the result with an empty left value.
    """
    right_keys = np.arange(8, dtype='int32')
    left_keys = np.asarray([2, 3, 0, 4, 7, 6, 2, 0, 3], dtype='int32')
    right_strings = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven']
    left_strings = ['bb1', 'ccc1', '', 'dddd1', 'ggggggg1', 'ffffff1', 'bb2', '', 'ccc2']

    want_left = ['bb1', 'bb2', 'ccc1', 'ccc2', '', '',
                 'dddd1', 'ggggggg1', 'ffffff1', '', '']
    want_right = ['two', 'two', 'three', 'three', 'zero', 'zero',
                  'four', 'seven', 'six', 'one', 'five']

    buffer = BytesIO()
    with session.Session() as s:
        dataset = s.open_dataset(buffer, 'w', 'dst')
        left = dataset.create_dataframe('ldf')
        right = dataset.create_dataframe('rdf')

        left.create_numeric('l_id', 'int32').data.write(left_keys)
        left.create_indexed_string('l_vals').data.write(left_strings)
        right.create_numeric('r_id', 'int32').data.write(right_keys)
        right.create_indexed_string('r_vals').data.write(right_strings)

        merged = dataset.create_dataframe('ddf')
        dataframe.merge(left, right, merged, 'l_id', 'r_id', how='outer')

        self.assertEqual(want_left, merged['l_vals'].data[:])
        self.assertEqual(want_right, merged['r_vals'].data[:])
def tests_merge_right(self):
    """
    Right-merge a left frame with repeated and missing keys against a right frame
    holding keys 0..7, and check the left string column of the result.

    Additionally, every result row must either have equal left and right keys or be
    flagged as not valid in 'valid_l' (presumably rows without a left-hand match).
    """
    right_keys = np.arange(8, dtype='int32')
    left_keys = np.asarray([2, 3, 0, 4, 7, 6, 2, 0, 3], dtype='int32')
    left_strings = ['bb1', 'ccc1', '', 'dddd1', 'ggggggg1', 'ffffff1', 'bb2', '', 'ccc2']
    want = ['', '', '', 'bb1', 'bb2', 'ccc1', 'ccc2', 'dddd1', '', 'ffffff1', 'ggggggg1']

    buffer = BytesIO()
    with session.Session() as s:
        dataset = s.open_dataset(buffer, 'w', 'dst')
        left = dataset.create_dataframe('ldf')
        right = dataset.create_dataframe('rdf')

        left.create_numeric('l_id', 'int32').data.write(left_keys)
        left.create_indexed_string('l_vals').data.write(left_strings)
        right.create_numeric('r_id', 'int32').data.write(right_keys)

        merged = dataset.create_dataframe('ddf')
        dataframe.merge(left, right, merged, 'l_id', 'r_id', how='right')

        self.assertEqual(want, merged['l_vals'].data[:])

        # Keys must agree wherever the left side is marked valid.
        keys_agree = merged['l_id'].data[:] == merged['r_id'].data[:]
        left_invalid = np.logical_not(merged['valid_l'].data[:])
        self.assertTrue(np.all(keys_agree | left_invalid))
def asmt_merge_vacc():
    """
    Merge the assessments with the vaccine doses and filter the merged rows.

    Steps, in order:
      1. Copy the vaccine doses whose brand is 2 or 3 into a 'vacc' dataframe in the
         destination dataset.
      2. Inner-join the assessments with those doses on 'patient_id' into 'asmt_v'.
      3. Filter 'asmt_v' to symptom-free assessments taken strictly before the vaccine
         date and strictly less than 10 days before it.
      4. Filter 'asmt_v' again to assessments reporting at least one symptom, taken
         strictly after the vaccine date and strictly less than 10 days after it.

    NOTE(review): step 4 builds its mask from 'asmt_v' after step 3 has already been
    applied to it. If apply_filter without a destination filters in place, the two
    conditions cannot both hold for a row - confirm this is the intended behaviour.
    """
    # Ten days expressed in seconds (assumes second-resolution timestamps).
    ten_days = 3600 * 24 * 10

    symptoms = [
        'persistent_cough', 'fever', 'fatigue', 'delirium',
        'shortness_of_breath', 'diarrhoea', 'abdominal_pain', 'chest_pain',
        'hoarse_voice', 'skipped_meals', 'loss_of_smell', 'headache',
        'sore_throat', 'chills_or_shivers', 'eye_soreness', 'nausea',
        'blisters_on_feet', 'unusual_muscle_pains', 'runny_nose',
        'red_welts_on_face_or_lips', 'dizzy_light_headed', 'swollen_glands',
        'sneezing', 'skin_burning', 'earache', 'altered_smell', 'brain_fog',
        'irregular_heartbeat'
    ]

    with sess.Session() as s:
        # Open the related datasets.
        src = s.open_dataset(ADATA, 'r', 'asmt')
        asmt = src['assessments']
        vacc = s.open_dataset(VDATA, 'r', 'vacc')
        dst = s.open_dataset(DSTDATA, 'w', 'dst')

        # Keep only the vaccine brands coded 2 or 3.
        doses = vacc['vaccine_doses']
        brand = doses['brand'].data[:]
        wanted_brand = (brand == 2) | (brand == 3)
        dvacc = dst.create_dataframe('vacc')
        doses.apply_filter(wanted_brand, ddf=dvacc)

        # Join assessments with the filtered doses on patient_id.
        asmt_v = dst.create_dataframe('asmt_v')
        dataframe.merge(asmt, dvacc, asmt_v, 'patient_id', 'patient_id', how='inner')

        def any_symptom():
            """Return a mask of the current asmt_v rows where any symptom field is > 1."""
            mask = asmt_v[symptoms[0]].data[:] > 1
            for name in symptoms[1:]:
                mask |= asmt_v[name].data[:] > 1
            return mask

        # Symptom-free assessments in the 10 days before the vaccine date.
        vaccine_date = asmt_v['date_taken_specific'].data[:]
        asmt_date = asmt_v['updated_at_l'].data[:]
        healthy_before = ~any_symptom()
        healthy_before &= vaccine_date > asmt_date             # assessment before vaccine
        healthy_before &= asmt_date > vaccine_date - ten_days  # ... by less than 10 days
        asmt_v.apply_filter(healthy_before)

        # Symptomatic assessments in the 10 days after the vaccine date; the columns are
        # re-read because asmt_v has just been filtered.
        vaccine_date = asmt_v['date_taken_specific'].data[:]
        asmt_date = asmt_v['updated_at_l'].data[:]
        sick_after = any_symptom()
        sick_after &= vaccine_date < asmt_date                 # assessment after vaccine
        sick_after &= vaccine_date + ten_days > asmt_date      # ... by less than 10 days
        asmt_v.apply_filter(sick_after)

        print("finish asmt join vaccine.")
def SymptomJoinTests():
    """
    Report how many patients with NHS-criteria symptoms were tested soon afterwards.

    Assessments updated between 2020-12-08 and 2021-05-17 (inclusive) that report
    loss of smell, fever or persistent cough with the value 2 are copied to 'dst_asmt'.
    They are inner-joined with the tests on 'patient_id', keeping rows whose test
    update time is strictly after the assessment and strictly less than 10 days
    after it. Three counts are printed: unique patients after each stage, and the
    number of per-patient aggregated results equal to 4.
    """
    date_format = '%Y-%m-%d'
    begin = datetime.strptime("2020-12-08", date_format).timestamp()
    end = datetime.strptime("2021-05-17", date_format).timestamp()

    with sess.Session() as s:
        # Open the related datasets.
        src = s.open_dataset(ADATA, 'r', 'src')
        asmt = src['assessments']
        tests = src['tests']
        dst = s.open_dataset(DSTDATA, 'w', 'dst')

        # Copy the two assessment fields needed downstream.
        dst_asmt = dst.create_dataframe('dst_asmt')
        for field_name in ('patient_id', 'updated_at'):
            dst_asmt[field_name] = asmt[field_name]

        # Assessments inside the study window ...
        in_window = asmt['updated_at'] >= begin
        in_window &= asmt['updated_at'] <= end

        # ... that report at least one of the NHS criteria symptoms.
        nhs_criteria = asmt['loss_of_smell'] == 2
        for symptom in ('fever', 'persistent_cough'):
            nhs_criteria |= asmt[symptom] == 2
        in_window &= nhs_criteria

        dst_asmt.apply_filter(in_window)
        print('number of unique patient for 1) ',
              len(dst_asmt['patient_id'].get_spans()) - 1)

        # Join the selected assessments with the tests.
        dst_tests = dst.create_dataframe('dst_tests')
        dataframe.merge(dst_asmt, tests, dest=dst_tests,
                        left_on='patient_id', right_on='patient_id', how='inner')

        # In the merged frame updated_at_l is the symptom date and updated_at_r the
        # test date; keep tests within 10 days after the symptoms.
        symptom_time = dst_tests['updated_at_l']
        test_time = dst_tests['updated_at_r']
        near_test = symptom_time < test_time
        near_test &= symptom_time + 10 * 24 * 3600 > test_time
        dst_tests.apply_filter(near_test)
        print('number of unique patient for 2) ',
              len(dst_tests['patient_id_l'].get_spans()) - 1)

        # One index per patient, selected with how='max' over the result field.
        aggidx = get_aggregate_index(dst_tests['patient_id_l'],
                                     dst_tests['result'],
                                     how='max')
        is_positive = np.where(dst_tests['result'].data[aggidx] == 4, True, False)
        print('unique tested positive: ', np.sum(is_positive))
src_ptnts = src['patients'] src_asmts = src['assessments'] print(pd.unique([2, 1, 3, 2, 3])) print(np.unique([2, 1, 3, 2, 3])) with Timer('calculating shared index'): v = s.get_shared_index((src_ptnts['id'], src_asmts['patient_id'])) dest_asmts = dest.create_dataframe('assessments') with Timer("left join assessments <- patients"): merge(src_asmts, src_ptnts, dest_asmts, 'patient_id', 'id', ('patient_id', ), ('age', 'height_cm', 'weight_kg'), how='left', hint_left_keys_ordered=True, hint_right_keys_ordered=True) print('done!') # from io import BytesIO # bio = BytesIO() # src = s.open_dataset(bio, 'w', 'src') # # ptnts = src.create_dataframe('patients') # ptnts.create_fixed_string('id', 2).data.write( # np.asarray(['a', 'b', 'c', 'd', 'e'], dtype='S2')) # ptnts.create_numeric('age', 'int32').data.write( # np.asarray([20, 40, 60, 80, 100], dtype=np.int32)) #
r_df2 = ds.create_dataframe('r_df') r_df2.create_numeric('r_key', 'int32').data.write(r_key_) r_df2.create_numeric('lf_key', 'int32').data.write(lf_key_) r_df2.create_numeric('r_a', 'int32').data.write(r_a_) m_df2 = ds.create_dataframe('m_df_2') # for f in [l_df2['l_key'], l_df2['l_a'], r_df2['r_key'], r_df2['lf_key'], r_df2['r_a']]: # print(f.name, f.data[:20]) with Timer("exetera unordered {} merge:".format(how)): merge(l_df2, r_df2, m_df2, left_on='l_key', right_on='lf_key', left_fields=['l_a'], right_fields=['r_a'], how=how) # print(m_df['r_a'].to_numpy(dtype='int32')[:100]) print("pd/m 'l_a':", np.array_equal(m_df['l_a'].to_numpy(), m_df2['l_a'].data[:])) print( "pd/m 'r_a':", np.array_equal(get_from_pandas(m_df['r_a'], 0, 'int32'), m_df2['r_a'].data[:])) m_df3 = ds.create_dataframe('m_df_3') with Timer("exetera ordered {} merge:".format(how)):