def generate_row_info(in_files, join_key):
    """Merge selected columns from several CSV files, keyed by a join column.

    For every file named in ``in_files``, reads the header row, resolves which
    columns to keep, and folds each data row's kept values into one dict per
    distinct ``join_key`` value (rows from later files extend/overwrite the
    dict built from earlier files).

    :param in_files: dict mapping file name -> per-file config dict holding
        either ``columns_to_keep`` or ``multiple_columns_to_keep``, plus
        ``skip_first_line``.
    :param join_key: header name whose cell value identifies a row across files.
    :returns: dict mapping join-key value -> {header name: cell value}.
    """
    row_info = {}
    for file_name, config in in_files.items():
        with open(resolve(file_name), mode='r', encoding='utf-8-sig') as file:
            reader = csv.reader(file)
            headers = next(reader, None)
            headers = [x.strip(' ') for x in headers]
            if 'columns_to_keep' in config:
                headers_to_keep = config['columns_to_keep']
                # NOTE(review): skip_first_line is only honoured in this
                # branch; files configured via multiple_columns_to_keep never
                # skip a line. Confirm this asymmetry is intentional.
                if config['skip_first_line']:
                    next(reader, None)
            else:
                headers_to_keep = config['multiple_columns_to_keep']
            row_indices = get_row_indices(headers, headers_to_keep)
            # The join-key column position depends only on the header row, so
            # resolve it once per file rather than once per data row.
            join_key_col_index = headers.index(join_key)
            for row in reader:
                current_row_info = get_current_row(row_info,
                                                   row[join_key_col_index])
                for index in row_indices:
                    current_row_info[headers[index]] = row[index]
                row_info[row[join_key_col_index]] = current_row_info
    return row_info
def generate_row_info(in_files, join_key):
    """Merge selected columns from several CSV files, keyed by a join column.

    Like the sibling variant, but additionally captures an optional metadata
    line that follows the header when a file's config sets
    ``first_line_metadata``.

    :param in_files: dict mapping file name -> per-file config dict holding
        ``skip_first_line``, ``first_line_metadata``, and either
        ``columns_to_keep`` or ``multiple_columns_to_keep``.
    :param join_key: header name whose cell value identifies a row across files.
    :returns: tuple ``(row_info, header_meta_dict)`` where ``row_info`` maps
        join-key value -> {header name: cell value}, and ``header_meta_dict``
        maps header name -> metadata description (from the last file that
        supplied metadata; empty dict if none did).
    """
    row_info = {}
    header_meta_dict = {}
    for file_name, config in in_files.items():
        with open(resolve(file_name), mode="r", encoding="utf-8-sig") as file:
            reader = csv.reader(file)
            headers = next(reader, None)
            headers = [x.strip(" ") for x in headers]
            if config["skip_first_line"]:
                next(reader, None)
            if config["first_line_metadata"]:
                # str.split already yields a list, so no comprehension copy is
                # needed. Reads straight from the underlying file, bypassing
                # the csv reader, as the original did.
                header_meta_dict = dict(zip(headers,
                                            file.readline().split(",")))
            if "columns_to_keep" in config:
                headers_to_keep = config["columns_to_keep"]
            else:
                headers_to_keep = config["multiple_columns_to_keep"]
            row_indices = get_row_indices(headers, headers_to_keep)
            # The join-key column position depends only on the header row, so
            # resolve it once per file rather than once per data row.
            join_key_col_index = headers.index(join_key)
            for row in reader:
                current_row_info = get_current_row(row_info,
                                                   row[join_key_col_index])
                for index in row_indices:
                    current_row_info[headers[index]] = row[index]
                row_info[row[join_key_col_index]] = current_row_info
    return row_info, header_meta_dict
from datetime import timedelta
import petl as etl
from fhir_petl.fhir import to_json
from fhir_petl.util import resolve, mkdirp, number, join, year, dateparser, ISOFormat

# HL7 v3 Race (system, code, display) triples keyed by the source file's RACE
# strings. Built once at module level so map_race does not rebuild the dict on
# every call.
# NOTE(review): 'HISPANIC OR LATINO' maps to the White race code (2106-3);
# Hispanic/Latino is an ethnicity in HL7 v3, not a race — confirm this mapping
# is intentional.
_RACE_CODES = {
    'AMERICAN INDIAN AND ALASKA NATIVE':
        ('http://hl7.org/fhir/v3/Race', '1002-5',
         'American Indian or Alaska Native'),
    'ASIAN':
        ('http://hl7.org/fhir/v3/Race', '2028-9', 'Asian'),
    'BLACK OR AFRICAN AMERICAN':
        ('http://hl7.org/fhir/v3/Race', '2054-5', 'Black or African American'),
    'HISPANIC OR LATINO':
        ('http://hl7.org/fhir/v3/Race', '2106-3', 'White'),
    'WHITE':
        ('http://hl7.org/fhir/v3/Race', '2106-3', 'White'),
    'NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER':
        ('http://hl7.org/fhir/v3/Race', '2076-8',
         'Native Hawaiian or Other Pacific Islander')
}


def map_race(race):
    """Return the (system, code, display) triple for a RACE string, or None."""
    # dict.get already defaults to None; no explicit default needed.
    return _RACE_CODES.get(race)


# Map the preprocessed case-cohort Patient CSV into FHIR-ready patient fields.
patients = (etl.io.csv.fromcsv(resolve('work/Patient.csv'))
            .fieldmap({
                'id': 'ID',
                'STUDYID': 'STUDYID',
                'subject_id': ('STUDYID', lambda x: 'CASE-' + x),
                'race': ('RACE', map_race),
                'gender': ('SEX', {'F': 'female', 'M': 'male'}),
                'birth_date': ('BIRTH_YR', year),
                'index_date': ('INDEX_YEAR', dateparser('%Y', ISOFormat.DAY)),
                'tag': lambda rec: ('subject-type', 'case')
            }, True))

# Narrow index used to join each study id's index date to its patient id.
index = (patients
         .cut('STUDYID', 'id', 'index_date')
         .rename('id', 'subject'))
import petl as etl
from fhir_petl.util import preprocess, resolve, mkdirp
import random

# Stage the raw case-cohort extracts into work/ as preprocessed CSVs.
mkdirp(resolve('work'))

# Sample 1000 case study ids from the sa1 case list; random is not seeded, so
# the selection differs between runs.
selection = etl.io.csv.fromtsv(resolve('sa1cases.txt')).columns()['STUDYID']
selection = set(random.sample(selection, 1000))

# Restrict each source extract to the sampled ids, then preprocess and write
# it to the work directory under its FHIR resource name.
preprocess(
    etl.io.csv.fromcsv(resolve('case_demog_27065.csv')).selectin(
        'STUDYID', selection),
    'STUDYID').tocsv(resolve('work/Patient.csv'))
preprocess(
    etl.io.csv.fromtsv(resolve('dx_case_inst.txt')).selectin(
        'STUDYID', selection),
    'STUDYID',
).tocsv(resolve('work/Condition.csv'))
preprocess(
    etl.io.csv.fromtsv(resolve('lab_case_inst.txt')).selectin(
        'STUDYID', selection),
    'STUDYID').tocsv(resolve('work/Observation.csv'))
# NOTE: the medication extracts key on CASE_ID rather than STUDYID.
preprocess(
    etl.io.csv.fromtsv(resolve('med_case_inst_gpi.txt')).selectin(
        'CASE_ID', selection),
    'CASE_ID').tocsv(resolve('work/MedicationDispense.csv'))
# NOTE(review): the statement below is truncated at this chunk boundary; its
# id column and tocsv target lie outside the visible portion of the file.
preprocess(
    etl.io.csv.fromtsv(resolve('order_case_inst_gpi.txt')).selectin(
# (fragment) Tail of a race-mapping dict whose opening lines are outside this
# view of the file; the enclosing map_race-style function returns an HL7 v3
# Race (system, code, display) triple or None.
            ('http://hl7.org/fhir/v3/Race', '2106-3', 'White'),
        'WHITE':
            ('http://hl7.org/fhir/v3/Race', '2106-3', 'White'),
        'NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER':
            ('http://hl7.org/fhir/v3/Race', '2076-8',
             'Native Hawaiian or Other Pacific Islander')
    }.get(race, None)


def index_date(rec):
    """Derive a control's index date from birth year plus age at index.

    Adds INDEX_AGE to BIRTH_YR and parses the resulting year as an ISO day
    via the project's dateparser helper.
    """
    birth = number(rec['BIRTH_YR'])
    index_age = number(rec['INDEX_AGE'])
    index_date = str(birth + index_age)
    return dateparser('%Y', ISOFormat.DAY)(index_date)


# Map the preprocessed control-cohort Patient CSV into FHIR-ready fields.
patients = (etl.io.csv.fromcsv(resolve('work/Patient.csv')).fieldmap(
    {
        'id': 'ID',
        'CONTROL_ID': 'control_id',
        'subject_id': ('control_id', lambda x: 'CONTROL-' + x),
        'race': ('RACE', map_race),
        'gender': ('SEX', {
            'F': 'female',
            'M': 'male'
        }),
        'birth_date': ('BIRTH_YR', year),
        'index_date': index_date,
        'tag': lambda rec: ('subject-type', 'control')
    }, True))

# NOTE(review): the statement below is truncated at this chunk boundary; the
# remaining cut/rename arguments lie outside the visible portion of the file.
index = (patients.cut('CONTROL_ID', 'id',
# (fragment) Tail of a date-parsing function whose header and earlier branches
# are outside this view; it picks a format string matching how much of the
# date (month/day/year) is present, then parses with the project dateparser.
    elif month and text and not day:
        # Month and year known, day missing: parse at month precision.
        parser = dateparser("%m/%Y", ISOFormat.MONTH)
        date_string = "{0}/{1}".format(month, text)
    elif not complete_date_bool:
        # Only a year available: parse at year precision.
        parser = dateparser("%Y", ISOFormat.YEAR)
        date_string = "{0}".format(text)
    else:
        # Full date available: parse at day precision.
        parser = dateparser("%m/%d/%Y", ISOFormat.DAY)
        date_string = "{0}".format(text)
    return parser(date_string)


# Map the updated Patient CSV into FHIR-ready fields. Several mappings are
# currently disabled (commented out) — presumably pending data cleanup;
# verify before re-enabling.
patients = etl.io.csv.fromcsv(resolve("work/Patient_ktb_updated.csv")).fieldmap(
    {
        "id": "ID",
        "subject_id": "SID",
        "SID": "SID",
        "race": ("RACE", map_race),
        "ethnicity": ("ETHNICITY", map_ethnicity),
        "marital_status": ("MARITAL_STATUS", map_marital_status),
        # "gender": ("GENDER", {"F": "female", "M": "male"}),
        # "birth_date": ("BIRTH_DATE", parse_date),
        # 'death_date': ('DEATH_YR', year),
        # 'sample_date': ('SAMPLE_DATE', sample_date),
        # 'tag': ('Cohort', lambda cohort: ('cohort', cohort.upper()))
    },
    True,
)
import petl as etl
from fhir_petl.util import resolve


def _work_table(name):
    """Load a preprocessed CSV from the work directory as a petl table."""
    return etl.io.csv.fromcsv(resolve('work/' + name + '.csv'))


# Open every staged resource table from the work directory.
observations = _work_table('Observation')
patients = _work_table('Patient')
procedures = _work_table('Procedure')
conditions = _work_table('Condition')
requests = _work_table('MedicationRequest')
dispenses = _work_table('MedicationDispense')

# Quick sanity check: show the first rows of the medication requests.
print(requests.look(10))
import petl as etl
from fhir_petl.util import preprocess, resolve, mkdirp

# Stage the raw cohort extracts into work/ as preprocessed CSVs.
mkdirp(resolve('work'))

# (source CSV, target FHIR resource name) pairs, processed in order.
_SOURCES = (
    ('Table_1_Demographics_New_Cohorts.csv', 'Patient'),
    ('Diagnoses.csv', 'Condition'),
    ('fairbanks_cv.dedup.csv', 'Observation'),
    ('Prescriptions.csv', 'MedicationRequest'),
    ('Procedures.csv', 'Procedure'),
)

for source, resource in _SOURCES:
    preprocess(etl.io.csv.fromcsv(resolve(source))).tocsv(
        resolve('work/' + resource + '.csv'))
import petl as etl
from fhir_petl.util import preprocess, resolve, mkdirp
import random

# Stage the raw control-cohort extracts into work/ as preprocessed CSVs.
mkdirp(resolve('work'))

# Sample 1000 control ids from the sa1 control list; random is not seeded,
# so the selection differs between runs.
control_ids = etl.io.csv.fromtsv(
    resolve('sa1controls.txt')).columns()['control_id']
selection = set(random.sample(control_ids, 1000))

# (source file, id column, target FHIR resource name) triples, in order.
# The demographics file keys on lowercase 'control_id'; the rest use the
# uppercase 'CONTROL_ID' column.
_INPUTS = (
    ('controls.txt', 'control_id', 'Patient'),
    ('dx_control_inst.txt', 'CONTROL_ID', 'Condition'),
    ('lab_control_inst.txt', 'CONTROL_ID', 'Observation'),
    ('med_control_inst_gpi.txt', 'CONTROL_ID', 'MedicationDispense'),
)

for source, id_column, resource in _INPUTS:
    table = etl.io.csv.fromtsv(resolve(source)).selectin(id_column, selection)
    preprocess(table, id_column).tocsv(
        resolve('work/' + resource + '.csv'))