def covid_test_date_v1(session: Session, test_table, dest_test_table, dest_field_name='test_date', dest_field_flags_name='test_date_valid'):
    """
    Infer a single test date from the 'date_taken_specific',
    'date_taken_between_start' and 'date_taken_between_end' columns.

    A row is considered valid when exactly one of the two date forms is set:
    either only an exact date, or only a well-ordered between-range. The
    inferred date is the exact date when present, otherwise the midpoint of
    the between-range.

    :param session: The Exetera session instance.
    :param test_table: The tests dataframe containing 'date_taken_specific',
        'date_taken_between_start' and 'date_taken_between_end' columns.
    :param dest_test_table: The destination dataframe to write the result to.
    :param dest_field_name: The name of the result date column.
    :param dest_field_flags_name: The name of the column receiving the flag
        that indicates whether the date is set or inferred validly.
    """
    exact_field = session.get(test_table['date_taken_specific'])
    exact_dates = exact_field.data[:]
    range_starts = session.get(test_table['date_taken_between_start']).data[:]
    range_ends = session.get(test_table['date_taken_between_end']).data[:]

    # valid when only a consistent between-range is present...
    range_only = ((exact_dates == 0.0) & (range_starts != 0.0) &
                  (range_ends != 0.0) & (range_ends >= range_starts))
    # ...or when only an exact date is present
    exact_only = (exact_dates != 0.0) & (range_starts == 0.0) & (range_ends == 0.0)
    date_flags = range_only | exact_only

    # exact date when set; otherwise the midpoint of the between-range
    inferred_dates = np.where(exact_dates != 0.0,
                              exact_dates,
                              range_starts + (range_ends - range_starts) / 2)

    exact_field.create_like(dest_test_table, dest_field_name).data.write(inferred_dates)
    session.create_numeric(dest_test_table, dest_field_flags_name, 'bool').data.write(date_flags)
def iterator_test_1(length):
    """
    Benchmark: write a generated value column to an hdf5 field, re-open it,
    and sum its contents through a field data iterator.
    """
    ids_a, vals_a, ids_b = generate_dataset_1(length)
    session = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'w') as hf:
        written = session.create_numeric(hf, 'a_vals', 'int32')
        written.data.write(vals_a)
        reread = session.get(hf['a_vals'])
        total = fast_sum(iter(ops.data_iterator(reread)))
        print(total)
def new_hs_test(vcount):
    """
    Merge benchmark: joins `vcount` 'right_data_*' columns from the benchmark
    dataset onto the foreign-key id space using both ordered_merge_left and
    merge_left, timing the pair and printing input/output samples.
    """
    session = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'r') as hf, \
            h5py.File('/home/ben/covid/benchmark_dest.hdf5', 'w') as dest:
        print(hf.keys())
        left_ids = session.get(hf['fk_ids'])
        right_ids = session.get(hf['ids'])

        # destination sinks for the merged columns
        sinks = [session.create_numeric(dest, 'left_data_{}'.format(i), 'int32')
                 for i in range(vcount)]
        left_to_right = session.create_numeric(dest, 'a_to_b', 'int64')
        # source columns to merge across
        sources = [session.get(hf['right_data_{}'.format(i)])
                   for i in range(vcount)]

        print("running test")
        t0 = time.time()
        print(left_ids.data[:100])
        print(right_ids.data[:100])
        print(sources[0].data[:100])
        session.ordered_merge_left(left_ids, right_ids,
                                   left_to_right_map=left_to_right,
                                   right_unique=True,
                                   right_field_sources=tuple(sources),
                                   left_field_sinks=tuple(sinks))
        print(left_to_right.data[:100])
        results = session.merge_left(left_ids, right_ids,
                                     right_fields=tuple(sources))
        elapsed = time.time() - t0
        print("total:", elapsed)
        print(sinks[0].data[:100])
        print(results[0][:100])
def first_test_date_per_patient(session: Session, patient_table, test_table, test_date_name, dest_patient_table, dest_patient_name):
    """
    Compute, for each patient id, the 'created_at' value of their first test,
    and merge it into the destination patient table.

    Both tables must already be ordered by their patient id columns.

    :param session: The Exetera session instance.
    :param patient_table: The patient dataframe.
    :param test_table: The tests dataframe.
    :param test_date_name: The name of the test date field; not used.
    :param dest_patient_table: The destination dataframe to store the results.
    :param dest_patient_name: The name of the destination field for the results.
    :raises ValueError: if either table is not ordered by its patient id column.
    """
    pid = 'id'
    t_pid = 'patient_id'

    patient_ids_ = session.get(patient_table[pid]).data[:]
    if not ops.is_ordered(patient_ids_):
        raise ValueError("The patient table must be ordered by '{}'".format(pid))

    test_pids_ = session.get(test_table[t_pid]).data[:]
    if not ops.is_ordered(test_pids_):
        raise ValueError("The test table must be ordered by '{}'".format(t_pid))

    # collapse the test rows to one entry per patient_id, keeping the first
    # 'created_at' of each span
    created_ats = session.get(test_table['created_at'])
    pid_spans = session.get_spans(test_pids_)
    distinct_test_pids = session.apply_spans_first(pid_spans, test_pids_)
    first_created_ats = session.apply_spans_first(pid_spans, created_ats)

    # merge the per-patient first test dates into the patient table
    result = session.create_numeric(dest_patient_table, dest_patient_name, 'int32')
    session.ordered_merge_left(left_on=patient_ids_, right_on=distinct_test_pids,
                               right_field_sources=(first_created_ats,),
                               left_field_sinks=(result,),
                               left_unique=True, right_unique=True)
def read_fields_from_hdf5(file_name, field_count):
    """
    Benchmark: read the first `field_count` entries of a fixed list of patient
    fields from an hdf5 dataset, timing the whole read.
    """
    field_names = ('id', 'created_at', 'updated_at', 'version', 'country_code',
                   'reported_by_another', 'same_household_as_reporter',
                   'contact_additional_studies', 'year_of_birth', 'height_cm',
                   'weight_kg', 'gender', 'race_other', 'ethnicity',
                   'profile_attributes_updated_at', 'has_diabetes')
    print(len(field_names))
    session = Session()
    with h5py.File(file_name, 'r') as hf:
        with utils.Timer("reading {} fields from dataset".format(field_count)):
            for idx in range(field_count):
                fld = session.get(hf['patients'][field_names[idx]])
                # indexed string fields are read via indices/values; everything
                # else through the plain data buffer. Results are discarded —
                # only the read cost matters here.
                if isinstance(fld, flds.IndexedStringField):
                    _ = fld.indices[:]
                    _ = fld.values[:]
                else:
                    _ = fld.data[:]
# print(substrs) substrs = replace_multi_with_str("#!,\"(){}[].:;", substrs) substrs = [s_.strip() for s_ in substrs.split() if len(s_) > 0] for s in substrs: if s in words_to_check: total_count += 1 break print(total_count) with h5py.File('/home/ben/covid/ds_20200901_full.hdf5', 'r') as hf: with h5py.File('/home/ben/covid/ds_20200901_othersymp.hdf5', 'w') as tmp: s = Session() print([k for k in hf['patients'].keys() if 'result' in k]) old_test = s.get(hf['patients']['max_assessment_test_result']).data[:] new_test = s.get(hf['patients']['max_test_result']).data[:] test_results = np.where((old_test == 3) | (new_test == 4), 2, 0) test_results = np.where( (test_results == 0) & ((old_test == 2) | (new_test == 3)), 1, test_results) p_test_results = s.create_numeric(tmp, 'p_test_results', 'int8') p_test_results.data.write(test_results) print("overall tests:", np.unique(test_results, return_counts=True)) other = s.get(hf['assessments']['other_symptoms']) cc = s.get(hf['assessments']['country_code']).data[:] otherstart = other.indices[:-1] otherend = other.indices[1:] ofilter = otherend - otherstart > 0 print("ofilter:", ofilter.sum(), len(ofilter))
def hs_test_1(length, val_column_count):
    """
    Merge benchmark: joins `val_column_count` pre-generated 'a_vals_*' columns
    onto the 'b' id space using both ordered_merge_left and merge_left, timing
    the pair and printing input/output samples. `length` is unused by the
    active code (the dataset-generation code that consumed it is retired).
    """
    session = Session()
    with h5py.File('/home/ben/covid/benchmarking.hdf5', 'r') as hf, \
            h5py.File('/home/ben/covid/benchmark_dest.hdf5', 'w') as dest:
        a_ids = session.get(hf['a_ids'])
        b_ids = session.get(hf['b_ids'])

        # destination sinks for the merged columns
        sinks = [session.create_numeric(dest, 'b_vals_{}'.format(i), 'int32')
                 for i in range(val_column_count)]
        a_to_b = session.create_numeric(dest, 'a_to_b', 'int64')
        # source columns to merge across
        sources = [session.get(hf['a_vals_{}'.format(i)])
                   for i in range(val_column_count)]

        print("running test")
        t0 = time.time()
        print(b_ids.data[:100])
        print(a_ids.data[:100])
        session.ordered_merge_left(b_ids, a_ids,
                                   right_field_sources=tuple(sources),
                                   left_field_sinks=tuple(sinks),
                                   left_to_right_map=a_to_b,
                                   right_unique=True)
        print(a_to_b.data[:100])
        results = session.merge_left(b_ids, a_ids, right_fields=tuple(sources))
        elapsed = time.time() - t0
        print(elapsed)
        print(sinks[0].data[:100])
        print(results[0][:100])
def postprocess(dataset, destination, timestamp=None, flags=None):
    """
    Post-import processing pipeline for a covid study hdf5 dataset.

    Depending on which tables are present in `dataset` ('patients',
    'assessments', 'tests', 'diet'), this sorts each table into `destination`,
    derives cleaned fields (age, weight/height/bmi, temperature), checks
    symptom consistency, optionally builds daily assessment aggregates, and
    computes per-patient summary metrics (assessment/test/diet counts,
    first/last assessment days, maximum test results).

    :param dataset: Source hdf5 group containing the imported tables.
    :param destination: Destination hdf5 group for the processed tables.
    :param timestamp: Unused by the visible code (the DataStore construction
        that consumed it is commented out).
    :param flags: Optional set of flag strings; only 'daily' is consulted and
        it enables the daily-assessment steps.
    """
    if flags is None:
        flags = set()

    do_daily_asmts = 'daily' in flags
    has_patients = 'patients' in dataset.keys()
    has_assessments = 'assessments' in dataset.keys()
    has_tests = 'tests' in dataset.keys()
    has_diet = 'diet' in dataset.keys()

    # These always return True, so every step below is currently enabled;
    # they appear to be placeholders for finer-grained flag control.
    sort_enabled = lambda x: True
    process_enabled = lambda x: True

    sort_patients = sort_enabled(flags) and True
    sort_assessments = sort_enabled(flags) and True
    sort_tests = sort_enabled(flags) and True
    sort_diet = sort_enabled(flags) and True

    make_assessment_patient_id_fkey = process_enabled(flags) and True
    year_from_age = process_enabled(flags) and True
    clean_weight_height_bmi = process_enabled(flags) and True
    health_worker_with_contact = process_enabled(flags) and True
    clean_temperatures = process_enabled(flags) and True
    check_symptoms = process_enabled(flags) and True
    create_daily = process_enabled(flags) and do_daily_asmts
    make_patient_level_assessment_metrics = process_enabled(flags) and True
    make_patient_level_daily_assessment_metrics = process_enabled(
        flags) and do_daily_asmts
    make_new_test_level_metrics = process_enabled(flags) and True
    make_diet_level_metrics = True
    make_healthy_diet_index = True

    # ds = DataStore(timestamp=timestamp)
    s = Session()

    # patients ================================================================

    sorted_patients_src = None

    if has_patients:
        patients_src = dataset['patients']

        write_mode = 'write'

        # NOTE(review): all patient sorting/processing only runs when the
        # destination does not already contain a 'patients' group.
        if 'patients' not in destination.keys():
            patients_dest = s.get_or_create_group(destination, 'patients')
            sorted_patients_src = patients_dest

            # Patient sort
            # ============
            if sort_patients:
                # drop rows whose 'id' duplicates an earlier row
                duplicate_filter = \
                    persistence.filter_duplicate_fields(s.get(patients_src['id']).data[:])

                for k in patients_src.keys():
                    t0 = time.time()
                    r = s.get(patients_src[k])
                    w = r.create_like(patients_dest, k)
                    s.apply_filter(duplicate_filter, r, w)
                    print(f"'{k}' filtered in {time.time() - t0}s")

                # kept vs dropped row counts
                print(np.count_nonzero(duplicate_filter == True),
                      np.count_nonzero(duplicate_filter == False))

                sort_keys = ('id', )
                s.sort_on(patients_dest, patients_dest, sort_keys, write_mode='overwrite')

            # Patient processing
            # ==================
            if year_from_age:
                log("year of birth -> age; 18 to 90 filter")
                t0 = time.time()
                yobs = s.get(patients_dest['year_of_birth'])
                yob_filter = s.get(patients_dest['year_of_birth_valid'])
                age = s.create_numeric(patients_dest, 'age', 'uint32')
                age_filter = s.create_numeric(patients_dest, 'age_filter', 'bool')
                age_16_to_90 = s.create_numeric(patients_dest, '16_to_90_years', 'bool')
                print('year_of_birth:', patients_dest['year_of_birth'])
                for k in patients_dest['year_of_birth'].attrs.keys():
                    print(k, patients_dest['year_of_birth'].attrs[k])
                # derive age (as of 2020) plus validity filters from year_of_birth
                calculate_age_from_year_of_birth_v1(
                    yobs, yob_filter, 16, 90, age, age_filter, age_16_to_90, 2020)
                log(f"completed in {time.time() - t0}")

                print('age_filter count:',
                      np.sum(patients_dest['age_filter']['values'][:]))
                print('16_to_90_years count:',
                      np.sum(patients_dest['16_to_90_years']['values'][:]))

            if clean_weight_height_bmi:
                log("height / weight / bmi; standard range filters")
                t0 = time.time()
                weights_clean = s.create_numeric(patients_dest, 'weight_kg_clean', 'float32')
                weights_filter = s.create_numeric(patients_dest, '40_to_200_kg', 'bool')
                heights_clean = s.create_numeric(patients_dest, 'height_cm_clean', 'float32')
                heights_filter = s.create_numeric(patients_dest, '110_to_220_cm', 'bool')
                bmis_clean = s.create_numeric(patients_dest, 'bmi_clean', 'float32')
                bmis_filter = s.create_numeric(patients_dest, '15_to_55_bmi', 'bool')

                # clean weight/height/bmi into the *_clean fields and flag the
                # standard plausible ranges (40-200kg, 110-220cm, 15-55 bmi)
                weight_height_bmi_v1(s, 40, 200, 110, 220, 15, 55,
                                     None, None, None, None,
                                     patients_dest['weight_kg'], patients_dest['weight_kg_valid'],
                                     patients_dest['height_cm'], patients_dest['height_cm_valid'],
                                     patients_dest['bmi'], patients_dest['bmi_valid'],
                                     weights_clean, weights_filter, None,
                                     heights_clean, heights_filter, None,
                                     bmis_clean, bmis_filter, None)
                log(f"completed in {time.time() - t0}")

            if health_worker_with_contact:
                with utils.Timer("health_worker_with_contact field"):
                    #writer = ds.get_categorical_writer(patients_dest, 'health_worker_with_contact', 'int8')
                    combined_hcw_with_contact_v1(
                        s, s.get(patients_dest['healthcare_professional']),
                        s.get(patients_dest['contact_health_worker']),
                        s.get(patients_dest['is_carer_for_community']),
                        patients_dest, 'health_worker_with_contact')

    # assessments =============================================================

    sorted_assessments_src = None

    if has_assessments:
        assessments_src = dataset['assessments']

        if 'assessments' not in destination.keys():
            assessments_dest = s.get_or_create_group(destination, 'assessments')
            sorted_assessments_src = assessments_dest

            if sort_assessments:
                sort_keys = ('patient_id', 'created_at')
                with utils.Timer("sorting assessments"):
                    s.sort_on(assessments_src, assessments_dest, sort_keys)

            if has_patients:
                if make_assessment_patient_id_fkey:
                    print(
                        "creating 'assessment_patient_id_fkey' foreign key index for 'patient_id'"
                    )
                    t0 = time.time()
                    patient_ids = s.get(sorted_patients_src['id'])
                    assessment_patient_ids =\
                        s.get(sorted_assessments_src['patient_id'])
                    assessment_patient_id_fkey =\
                        s.create_numeric(assessments_dest, 'assessment_patient_id_fkey', 'int64')
                    # index of each assessment's patient in the patient table
                    s.get_index(patient_ids.data[:], assessment_patient_ids.data[:],
                                assessment_patient_id_fkey)
                    print(f"completed in {time.time() - t0}s")

            if clean_temperatures:
                print("clean temperatures")
                t0 = time.time()
                temps = s.get(sorted_assessments_src['temperature'])
                temp_units = s.get(sorted_assessments_src['temperature_unit'])
                temps_valid = s.get(
                    sorted_assessments_src['temperature_valid'])
                dest_temps = temps.create_like(assessments_dest, 'temperature_c_clean')
                dest_temps_valid = temps_valid.create_like(
                    assessments_dest, 'temperature_35_to_42_inclusive')
                dest_temps_modified = temps_valid.create_like(
                    assessments_dest, 'temperature_modified')
                # normalise to Celsius and flag values within [35, 42]
                validate_temperature_v1(s, 35.0, 42.0,
                                        temps, temp_units, temps_valid,
                                        dest_temps, dest_temps_valid, dest_temps_modified)
                print(f"temperature cleaning done in {time.time() - t0}")

            if check_symptoms:
                print('check inconsistent health_status')
                t0 = time.time()
                check_inconsistent_symptoms_v1(s, sorted_assessments_src, assessments_dest)
                print(time.time() - t0)

    # tests ===================================================================

    if has_tests:
        if sort_tests:
            tests_src = dataset['tests']
            tests_dest = s.get_or_create_group(destination, 'tests')
            sort_keys = ('patient_id', 'created_at')
            s.sort_on(tests_src, tests_dest, sort_keys)

    # diet ====================================================================

    if has_diet:
        diet_src = dataset['diet']
        if 'diet' not in destination.keys():
            diet_dest = s.get_or_create_group(destination, 'diet')
            sorted_diet_src = diet_dest
            if sort_diet:
                sort_keys = ('patient_id', 'display_name', 'id')
                s.sort_on(diet_src, diet_dest, sort_keys)

    if has_assessments:
        if do_daily_asmts:
            daily_assessments_dest = s.get_or_create_group(
                destination, 'daily_assessments')

    # post process patients
    # TODO: need an transaction table

    # NOTE(review): these prints assume 'patients', 'assessments' and 'tests'
    # all exist; they would raise if one of the has_* flags above were False.
    print(patients_src.keys())
    print(dataset['assessments'].keys())
    print(dataset['tests'].keys())

    # write_mode = 'overwrite'
    write_mode = 'write'

    # Daily assessments
    # =================

    if has_assessments:
        if create_daily:
            print("generate daily assessments")
            patient_ids = s.get(sorted_assessments_src['patient_id'])
            created_at_days = s.get(sorted_assessments_src['created_at_day'])
            raw_created_at_days = created_at_days.data[:]

            if 'assessment_patient_id_fkey' in assessments_src.keys():
                patient_id_index = assessments_src[
                    'assessment_patient_id_fkey']
            else:
                patient_id_index = assessments_dest[
                    'assessment_patient_id_fkey']
            patient_id_indices = s.get(patient_id_index)
            raw_patient_id_indices = patient_id_indices.data[:]

            print("Calculating patient id index spans")
            t0 = time.time()
            # one span per (patient, day) pair
            patient_id_index_spans = s.get_spans(
                fields=(raw_patient_id_indices, raw_created_at_days))
            print(
                f"Calculated {len(patient_id_index_spans)-1} spans in {time.time() - t0}s"
            )

            print("Applying spans to 'health_status'")
            t0 = time.time()

            # per-field aggregation overrides; None means "skip this field"
            default_behavour_overrides = {
                'id': s.apply_spans_last,
                'patient_id': s.apply_spans_last,
                'patient_index': s.apply_spans_last,
                'created_at': s.apply_spans_last,
                'created_at_day': s.apply_spans_last,
                'updated_at': s.apply_spans_last,
                'updated_at_day': s.apply_spans_last,
                'version': s.apply_spans_max,
                'country_code': s.apply_spans_first,
                'date_test_occurred': None,
                'date_test_occurred_guess': None,
                'date_test_occurred_day': None,
                'date_test_occurred_set': None,
            }

            for k in sorted_assessments_src.keys():
                t1 = time.time()
                reader = s.get(sorted_assessments_src[k])
                if k in default_behavour_overrides:
                    apply_span_fn = default_behavour_overrides[k]
                    if apply_span_fn is not None:
                        apply_span_fn(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f" Field {k} aggregated in {time.time() - t1}s")
                    else:
                        print(f" Skipping field {k}")
                else:
                    # fall back on a per-type default aggregation
                    if isinstance(reader, fields.CategoricalField):
                        s.apply_spans_max(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f" Field {k} aggregated in {time.time() - t1}s")
                    elif isinstance(reader, rw.IndexedStringReader):
                        s.apply_spans_concat(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f" Field {k} aggregated in {time.time() - t1}s")
                    elif isinstance(reader, rw.NumericReader):
                        s.apply_spans_max(
                            patient_id_index_spans, reader,
                            reader.create_like(daily_assessments_dest, k))
                        print(f" Field {k} aggregated in {time.time() - t1}s")
                    else:
                        print(f" No function for {k}")

            print(f"apply_spans completed in {time.time() - t0}s")

    if has_patients and has_assessments:
        if make_patient_level_assessment_metrics:
            if 'assessment_patient_id_fkey' in assessments_dest:
                src = assessments_dest['assessment_patient_id_fkey']
            else:
                src = assessments_src['assessment_patient_id_fkey']
            assessment_patient_id_fkey = s.get(src)
            # generate spans from the assessment-space patient_id foreign key
            spans = s.get_spans(field=assessment_patient_id_fkey.data[:])

            ids = s.get(patients_dest['id'])

            print('calculate assessment counts per patient')
            t0 = time.time()
            writer = s.create_numeric(patients_dest, 'assessment_count', 'uint32')
            aggregated_counts = s.apply_spans_count(spans)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer, spans)
            print(
                f"calculated assessment counts per patient in {time.time() - t0}"
            )

            print('calculate first assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest, 'first_assessment_day', 10)
            aggregated_counts = s.apply_spans_first(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer, spans)
            print(
                f"calculated first assessment days per patient in {time.time() - t0}"
            )

            print('calculate last assessment days per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['created_at_day'])
            writer = s.create_fixed_string(patients_dest, 'last_assessment_day', 10)
            aggregated_counts = s.apply_spans_last(spans, reader)
            s.join(ids, assessment_patient_id_fkey, aggregated_counts, writer, spans)
            print(
                f"calculated last assessment days per patient in {time.time() - t0}"
            )

            print('calculate maximum assessment test result per patient')
            t0 = time.time()
            reader = s.get(sorted_assessments_src['tested_covid_positive'])
            writer = reader.create_like(patients_dest, 'max_assessment_test_result')
            max_result_value = s.apply_spans_max(spans, reader)
            s.join(ids, assessment_patient_id_fkey, max_result_value, writer, spans)
            print(
                f"calculated maximum assessment test result in {time.time() - t0}"
            )

    if has_assessments and do_daily_asmts and make_patient_level_daily_assessment_metrics:
        print(
            "creating 'daily_assessment_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        daily_assessment_patient_ids =\
            s.get(daily_assessments_dest['patient_id'])
        daily_assessment_patient_id_fkey =\
            s.create_numeric(daily_assessments_dest, 'daily_assessment_patient_id_fkey', 'int64')
        s.get_index(patient_ids, daily_assessment_patient_ids,
                    daily_assessment_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")

        spans = s.get_spans(field=s.get(
            daily_assessments_dest['daily_assessment_patient_id_fkey']))

        print('calculate daily assessment counts per patient')
        t0 = time.time()
        writer = s.create_numeric(patients_dest, 'daily_assessment_count', 'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        daily_assessment_patient_id_fkey =\
            s.get(daily_assessments_dest['daily_assessment_patient_id_fkey'])
        # NOTE(review): 'ids' is only bound by the patient-level metrics branch
        # above; this would raise NameError if that branch did not run.
        s.join(ids, daily_assessment_patient_id_fkey, aggregated_counts,
               writer, spans)
        print(
            f"calculated daily assessment counts per patient in {time.time() - t0}"
        )

    if has_tests and make_new_test_level_metrics:
        print(
            "creating 'test_patient_id_fkey' foreign key index for 'patient_id'"
        )
        t0 = time.time()
        patient_ids = s.get(sorted_patients_src['id'])
        test_patient_ids = s.get(tests_dest['patient_id'])
        test_patient_id_fkey = s.create_numeric(tests_dest, 'test_patient_id_fkey', 'int64')
        s.get_index(patient_ids, test_patient_ids, test_patient_id_fkey)
        test_patient_id_fkey = s.get(tests_dest['test_patient_id_fkey'])
        spans = s.get_spans(field=test_patient_id_fkey)
        print(f"completed in {time.time() - t0}s")

        print('calculate test_counts per patient')
        t0 = time.time()
        writer = s.create_numeric(patients_dest, 'test_count', 'uint32')
        aggregated_counts = s.apply_spans_count(spans)
        # NOTE(review): 'ids' dependency as above — bound only when the
        # patient-level assessment metrics branch ran.
        s.join(ids, test_patient_id_fkey, aggregated_counts, writer, spans)
        print(f"calculated test counts per patient in {time.time() - t0}")

        print('calculate test_result per patient')
        t0 = time.time()
        test_results = s.get(tests_dest['result'])
        writer = test_results.create_like(patients_dest, 'max_test_result')
        aggregated_results = s.apply_spans_max(spans, test_results)
        s.join(ids, test_patient_id_fkey, aggregated_results, writer, spans)
        print(f"calculated max_test_result per patient in {time.time() - t0}")

    if has_diet and make_diet_level_metrics:
        with utils.Timer("Making patient-level diet questions count",
                         new_line=True):
            d_pids_ = s.get(diet_dest['patient_id']).data[:]
            d_pid_spans = s.get_spans(d_pids_)
            d_distinct_pids = s.apply_spans_first(d_pid_spans, d_pids_)
            d_pid_counts = s.apply_spans_count(d_pid_spans)
            p_diet_counts = s.create_numeric(patients_dest, 'diet_counts', 'int32')
            s.merge_left(left_on=s.get(patients_dest['id']).data[:],
                         right_on=d_distinct_pids,
                         right_fields=(d_pid_counts, ),
                         right_writers=(p_diet_counts, ))
# NOTE(review): fragment — these `self.values` updates are the tail of a class
# method whose definition is not visible in this chunk; indentation is a best
# guess and should be confirmed against the full file.
self.values[9] |= 1 if d else 0
self.values[10] |= 0 if z else 1
self.values[11] |= 1 if z else 0

# Script: count diet question rows per patient and merge the counts into a
# new per-patient 'diet_counts_new' field.
src_file = '/home/ben/covid/ds_20200929_full.hdf5'
dest_file = '/home/ben/covid/ds_diet_tmp.hdf5'

with h5py.File(src_file, 'r') as hf:
    with h5py.File(dest_file, 'w') as dest:
        s = Session()
        ptnts = hf['patients']
        print(hf['diet'].keys())
        diet = hf['diet']
        p_ids_ = s.get(hf['patients']['id']).data[:]
        d_pids_ = s.get(hf['diet']['patient_id']).data[:]
        # one span per distinct patient_id in the diet table
        d_pid_spans = s.get_spans(d_pids_)
        d_distinct_pids = s.apply_spans_first(d_pid_spans, d_pids_)
        d_pid_counts = s.apply_spans_count(d_pid_spans)
        print(np.unique(d_pid_counts, return_counts=True))
        p_diet_counts_new = s.create_numeric(dest, 'diet_counts_new', 'int32')
        dcs = s.merge_left(left_on=p_ids_, right_on=d_distinct_pids,
                           right_fields=(d_pid_counts, ),
                           right_writers=(p_diet_counts_new, ))
        # res = np.unique(s.get(patients_dest['diet_counts']).data[:], return_counts=True)
        print(np.unique(p_diet_counts_new.data[:], return_counts=True))

# ddtest = defaultdict(int)
# for p in d_pids_: