stays = add_age_to_icustays(stays) stays = add_inunit_mortality_to_icustays(stays) stays = add_inhospital_mortality_to_icustays(stays) stays = filter_icustays_on_age(stays) if args.verbose: print('REMOVE PATIENTS AGE < 18:', stays.ICUSTAY_ID.unique().shape[0], stays.HADM_ID.unique().shape[0], stays.SUBJECT_ID.unique().shape[0]) stays.to_csv(os.path.join(args.output_path, 'all_stays.csv'), index=False) diagnoses = read_icd_diagnoses_table(args.mimic3_path) diagnoses = filter_diagnoses_on_stays(diagnoses, stays) diagnoses.to_csv(os.path.join(args.output_path, 'all_diagnoses.csv'), index=False) count_icd_codes(diagnoses, output_path=os.path.join(args.output_path, 'diagnosis_counts.csv')) phenotypes = add_hcup_ccs_2015_groups(diagnoses, yaml.load(open(args.phenotype_definitions, 'r'))) make_phenotype_label_matrix(phenotypes, stays).to_csv(os.path.join(args.output_path, 'phenotype_labels.csv'), index=False, quoting=csv.QUOTE_NONNUMERIC) if args.test: pat_idx = np.random.choice(patients.shape[0], size=1000) patients = patients.iloc[pat_idx] stays = stays.merge(patients[['SUBJECT_ID']], left_on='SUBJECT_ID', right_on='SUBJECT_ID') args.event_tables = [args.event_tables[0]] print('Using only', stays.shape[0], 'stays and only', args.event_tables[0], 'table') subjects = stays.SUBJECT_ID.unique() break_up_stays_by_subject(stays, args.output_path, subjects=subjects, verbose=args.verbose) break_up_diagnoses_by_subject(phenotypes, args.output_path, subjects=subjects, verbose=args.verbose) items_to_keep = set( [int(itemid) for itemid in dataframe_from_csv(args.itemids_file)['ITEMID'].unique()]) if args.itemids_file else None for table in args.event_tables: read_events_table_and_break_up_by_subject(args.mimic3_path, table, args.output_path, items_to_keep=items_to_keep,
# NOTE(review): this line is a fragment with its line breaks lost. It opens with the final
# argument of a call whose start is not visible here, and it stops partway through the
# break_up_stays_by_subject(...) call, so neither edge can be safely restructured.
# In between, it writes all_stays.csv, all_diagnoses.csv, diagnosis_counts.csv and
# phenotype_labels.csv under args.output_path; when args.test is set it samples 1000 row
# positions from `patients`, merges `stays` against the sampled SUBJECT_IDs and keeps only the
# first entry of args.event_tables.
# NOTE(review): yaml.load(open(...)) never closes the file and passes no explicit Loader -
# confirm whether a `with open(...)` block and yaml.safe_load are appropriate here.
# NOTE(review): np.random.choice(..., size=1000) samples with replacement by default, so the
# same patient row can be drawn repeatedly and the merge would then repeat that patient's
# stays - confirm whether replace=False is intended.
stays.SUBJECT_ID.unique().shape[0]) stays.to_csv(os.path.join(args.output_path, 'all_stays.csv'), index=False) diagnoses = read_icd_diagnoses_table(args.mimic3_path) diagnoses = filter_diagnoses_on_stays(diagnoses, stays) diagnoses.to_csv(os.path.join(args.output_path, 'all_diagnoses.csv'), index=False) count_icd_codes(diagnoses, output_path=os.path.join(args.output_path, 'diagnosis_counts.csv')) phenotypes = add_hcup_ccs_2015_groups( diagnoses, yaml.load(open(args.phenotype_definitions, 'r'))) make_phenotype_label_matrix(phenotypes, stays).to_csv(os.path.join(args.output_path, 'phenotype_labels.csv'), index=False, quoting=csv.QUOTE_NONNUMERIC) if args.test: pat_idx = np.random.choice(patients.shape[0], size=1000) patients = patients.iloc[pat_idx] stays = stays.merge(patients[['SUBJECT_ID']], left_on='SUBJECT_ID', right_on='SUBJECT_ID') args.event_tables = [args.event_tables[0]] print('Using only', stays.shape[0], 'stays and only', args.event_tables[0], 'table') subjects = stays.SUBJECT_ID.unique() break_up_stays_by_subject(stays,
# Spark-flavoured variant of the diagnoses step: reads the ICD diagnoses through
# read_icd_diagnoses_table_df (passing sqlContext), then filters both the pandas `diagnoses`
# frame and the `diagnoses_df` frame against the stays.
# NOTE(review): line breaks have been lost on this line. As written, everything after the first
# '#' is comment text, including statements that look meant to be live (codes_df = ...,
# phenotypes_label_df = ..., the spark.createDataFrame(...) conversions,
# subjects = stays.SUBJECT_ID.unique() and start = time.time()). Restore the original line
# structure before relying on any of them.
# NOTE(review): the text later reassigns diagnoses_df = spark.createDataFrame(diagnoses), which
# would replace the result of filter_diagnoses_on_stays_df(...) - confirm which one is intended.
diagnoses_df = read_icd_diagnoses_table_df(args.mimic3_path, sqlContext) diagnoses = filter_diagnoses_on_stays(diagnoses, stays) diagnoses_df = filter_diagnoses_on_stays_df(diagnoses_df, stays_df) # diagnoses.to_csv(os.path.join(args.output_path, 'all_diagnoses.csv'), index=False) codes_df = count_icd_codes(diagnoses, output_path=os.path.join(args.output_path, 'diagnosis_counts.csv')) # codes_df = count_icd_codes_df(diagnoses_df, output_path=os.path.join(args.output_path, 'diagnosis_counts.csv')) # phenotypes = add_hcup_ccs_2015_groups( diagnoses, yaml.load(open(args.phenotype_definitions, 'r'))) # phenotypes_df = add_hcup_ccs_2015_groups(diagnoses, yaml.load(open(args.phenotype_definitions, 'r'))) # phenotypes_df = spark.createDataFrame(phenotypes_df) # make_phenotype_label_matrix(phenotypes, stays).to_csv(os.path.join(args.output_path, 'phenotype_labels.csv'), # index=False, quoting=csv.QUOTE_NONNUMERIC) phenotypes_label_df = make_phenotype_label_matrix(phenotypes, stays) phenotypes_df = spark.createDataFrame(phenotypes) diagnoses_df = spark.createDataFrame(diagnoses) codes_df = spark.createDataFrame(codes_df) # stays_df = spark.createDataFrame(stays) # # if args.test: # pat_idx = np.random.choice(patients.shape[0], size=1000) # patients = patients.iloc[pat_idx] # stays = stays.merge(patients[['SUBJECT_ID']], left_on='SUBJECT_ID', right_on='SUBJECT_ID') # args.event_tables = [args.event_tables[0]] # print('Using only', stays.shape[0], 'stays and only', args.event_tables[0], 'table') subjects = stays.SUBJECT_ID.unique() start = time.time()