def new_hs_test(vcount,
                src_path='/home/ben/covid/benchmarking.hdf5',
                dest_path='/home/ben/covid/benchmark_dest.hdf5'):
    """
    Benchmark ``Session.ordered_merge_left`` followed by ``Session.merge_left``.

    Reads the key fields ``fk_ids`` and ``ids`` and the value fields
    ``right_data_0`` .. ``right_data_{vcount - 1}`` from the source file,
    creates the ``left_data_{v}`` (int32) sink fields and the ``a_to_b``
    (int64) map field in the destination file, runs both merges and prints the
    elapsed time together with the first 100 entries of several inputs and
    outputs.

    :param vcount: The number of value columns to merge. Must be at least 1,
        because the first source column, sink column and result are printed.
    :param src_path: Path of the HDF5 file to read (opened with mode ``'r'``).
    :param dest_path: Path of the HDF5 file to write (opened with mode
        ``'w'``).
    """
    s = Session()
    with h5py.File(src_path, 'r') as hf, h5py.File(dest_path, 'w') as dest:
        print(hf.keys())
        a_ids_f = s.get(hf['fk_ids'])
        b_ids_f = s.get(hf['ids'])

        # Destination fields are created before the source value fields are
        # fetched, keeping the original creation order.
        all_b_val_fields = [
            s.create_numeric(dest, 'left_data_{}'.format(v), 'int32')
            for v in range(vcount)
        ]
        a_to_b = s.create_numeric(dest, 'a_to_b', 'int64')
        all_a_val_fields = [
            s.get(hf['right_data_{}'.format(v)]) for v in range(vcount)
        ]

        print("running test")
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments. Note that the timed region includes the
        # diagnostic prints below as well as both merges.
        t0 = time.perf_counter()
        print(a_ids_f.data[:100])
        print(b_ids_f.data[:100])
        print(all_a_val_fields[0].data[:100])

        s.ordered_merge_left(a_ids_f, b_ids_f,
                             left_to_right_map=a_to_b,
                             right_unique=True,
                             right_field_sources=tuple(all_a_val_fields),
                             left_field_sinks=tuple(all_b_val_fields))
        print(a_to_b.data[:100])

        results = s.merge_left(a_ids_f, b_ids_f,
                               right_fields=tuple(all_a_val_fields))
        elapsed = time.perf_counter() - t0
        print("total:", elapsed)

        # Show the first column produced by each merge so the two outputs can
        # be compared by eye.
        print(all_b_val_fields[0].data[:100])
        print(results[0][:100])
def first_test_date_per_patient(session: Session, patient_table, test_table,
                                test_date_name, dest_patient_table,
                                dest_patient_name):
    """
    Store, for each patient, the 'created_at' value of their first test row.

    The test table is collapsed into one entry per 'patient_id' by taking the
    first row of each run of equal ids, and the resulting values are merged
    into the patient table on ``patient_table['id']``. The value is the
    earliest date only if the rows are also ordered by date within each
    patient; that is not checked here.

    :param session: The Exetera session instance.
    :param patient_table: The patient dataframe; must be ordered by 'id'.
    :param test_table: The tests dataframe; must be ordered by 'patient_id'.
    :param test_date_name: Currently unused; the test date is always read from
        the 'created_at' field of the test table.
    :param dest_patient_table: The destination dataframe to store the results.
    :param dest_patient_name: The name of the destination field to store the
        results.
    :raises ValueError: If the patient table is not ordered by 'id' or the
        test table is not ordered by 'patient_id'.
    """
    pid = 'id'
    # Load the patient ids once and reuse the array for both the ordering
    # check and the merge.
    pids_ = session.get(patient_table[pid]).data[:]
    if not ops.is_ordered(pids_):
        raise ValueError("The patient table must be ordered by '{}'".format(pid))

    t_pid = 'patient_id'
    t_pids_ = session.get(test_table[t_pid]).data[:]
    if not ops.is_ordered(t_pids_):
        raise ValueError("The test table must be ordered by '{}'".format(t_pid))

    # Collapse the test data by patient_id, keeping the first patient id and
    # the first 'created_at' entry of each span.
    created_at = session.get(test_table['created_at'])
    spans_ = session.get_spans(t_pids_)
    first_t_pids_ = session.apply_spans_first(spans_, t_pids_)
    first_dates_ = session.apply_spans_first(spans_, created_at)

    # Merge the per-patient first dates into the patient table.
    # NOTE(review): the destination field is created as 'int32'; confirm that
    # this dtype can hold the 'created_at' values.
    dest = session.create_numeric(dest_patient_table, dest_patient_name, 'int32')
    session.ordered_merge_left(left_on=pids_, right_on=first_t_pids_,
                               right_field_sources=(first_dates_,),
                               left_field_sinks=(dest,),
                               left_unique=True, right_unique=True)
filt_asmt = tmp.create_group('filt_assessments') filt_other_symptoms = other.create_like(filt_asmt, 'other_symptoms') s.apply_filter(filter_, other, filt_other_symptoms) patient_id = s.get(hf['assessments']['patient_id']) filt_patient_id = patient_id.create_like(filt_asmt, 'patient_id') s.apply_filter(filter_, patient_id, filt_patient_id) print('filtered symptoms len =', len(filt_other_symptoms.data)) with utils.Timer("merging test_results"): p_to_a = s.create_numeric(tmp, 'p_to_a', 'int64') a_test_results = s.create_numeric(tmp, 'a_test_results', 'int8') s.ordered_merge_left(left_on=s.get( tmp['filt_assessments']['patient_id']), right_on=s.get(hf['patients']['id']), left_field_sources=(p_test_results, ), left_field_sinks=(a_test_results, ), left_to_right_map=p_to_a, right_unique=True) print(len(a_test_results.data)) print(np.unique(a_test_results.data[:], return_counts=True)) a_test_results_ = a_test_results.data[:] # filtered_test_results = test_results[filter_] # print("filtered tests:", np.unique(filtered_test_results, return_counts=True)) indices, text = s.apply_filter(filter_, other) istart = indices[:-1] iend = indices[1:] print(len(indices), len(text))
def hs_test_1(length, val_column_count,
              src_path='/home/ben/covid/benchmarking.hdf5',
              dest_path='/home/ben/covid/benchmark_dest.hdf5'):
    """
    Benchmark ``Session.ordered_merge_left`` followed by ``Session.merge_left``.

    The source file must already contain the key fields ``a_ids`` and
    ``b_ids`` and the value fields ``a_vals_0`` ..
    ``a_vals_{val_column_count - 1}``; this function does not generate them.
    It creates the ``b_vals_{v}`` (int32) sink fields and the ``a_to_b``
    (int64) map field in the destination file, merges with ``b_ids`` as the
    left key and ``a_ids`` as the right key, and prints the elapsed time
    together with the first 100 entries of several inputs and outputs.

    :param length: Not used by this function; kept so existing callers keep
        working.
    :param val_column_count: The number of value columns to merge. Must be at
        least 1, because the first sink column and result are printed.
    :param src_path: Path of the HDF5 file to read (opened with mode ``'r'``).
    :param dest_path: Path of the HDF5 file to write (opened with mode
        ``'w'``).
    """
    s = Session()
    with h5py.File(src_path, 'r') as hf, h5py.File(dest_path, 'w') as dest:
        a_ids_f = s.get(hf['a_ids'])
        b_ids_f = s.get(hf['b_ids'])

        # Destination fields are created before the source value fields are
        # fetched, keeping the original creation order.
        all_b_val_fields = [
            s.create_numeric(dest, 'b_vals_{}'.format(v), 'int32')
            for v in range(val_column_count)
        ]
        a_to_b = s.create_numeric(dest, 'a_to_b', 'int64')
        all_a_val_fields = [
            s.get(hf['a_vals_{}'.format(v)]) for v in range(val_column_count)
        ]

        print("running test")
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments. Note that the timed region includes the
        # diagnostic prints below as well as both merges.
        t0 = time.perf_counter()
        print(b_ids_f.data[:100])
        print(a_ids_f.data[:100])

        s.ordered_merge_left(b_ids_f, a_ids_f,
                             right_field_sources=tuple(all_a_val_fields),
                             left_field_sinks=tuple(all_b_val_fields),
                             left_to_right_map=a_to_b,
                             right_unique=True)
        print(a_to_b.data[:100])

        results = s.merge_left(b_ids_f, a_ids_f,
                               right_fields=tuple(all_a_val_fields))
        elapsed = time.perf_counter() - t0
        print(elapsed)

        # Show the first column produced by each merge so the two outputs can
        # be compared by eye.
        print(all_b_val_fields[0].data[:100])
        print(results[0][:100])