import os
from glob import glob

import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Repo-local modules; exact import paths may differ in the original project.
import data_loader
import model
import py_op

# `args`, `variable_map_dict`, `item_id_dict`, `n_variables`, `inf`, and the
# train/test/compute_dtw helpers are defined elsewhere in the repo.


def main():
    # Build train/valid loaders from the same DataBowl dataset class.
    dataset = data_loader.DataBowl(args, phase='train')
    train_loader = DataLoader(dataset, batch_size=args.batch_size,
                              shuffle=True, num_workers=args.workers,
                              pin_memory=True)
    dataset = data_loader.DataBowl(args, phase='valid')
    valid_loader = DataLoader(dataset, batch_size=args.batch_size,
                              shuffle=False, num_workers=args.workers,
                              pin_memory=True)
    args.vocab = dataset.vocab
    args.relation = dataset.relation

    # net, loss = model.Net(args), model.Loss()
    net, loss = model.FCModel(args), model.Loss()
    net = _cuda(net, 0)
    loss = _cuda(loss, 0)

    parameters_all = list(net.parameters())
    optimizer = torch.optim.Adam(parameters_all, args.lr)

    best_auc = [0, 0, 0, 0, 0, 0]
    cui_con_dict = {}

    if args.phase == 'train':
        for epoch in range(args.epochs):
            train(train_loader, net, loss, epoch, optimizer, best_auc)
            best_auc, cui_con_dict = test(valid_loader, net, loss, epoch,
                                          best_auc, 'valid', cui_con_dict)
            print(args.words)

            # Save the concept-consistency results collected so far, one
            # numbered JSON file per epoch.
            cons_dir = '../result/cons/{:s}/{:d}'.format(args.model,
                                                         args.predict_day)
            py_op.mkdir(cons_dir)
            num = len(os.listdir(cons_dir))
            py_op.mywritejson(
                os.path.join(cons_dir, '{:d}.json'.format(num)), cui_con_dict)

        print('best auc', best_auc)
        auc = best_auc[0]
        with open('../result/log.txt', 'a') as f:
            f.write('#model {:s} #auc {:3.4f}\n'.format(args.model, auc))

    elif args.phase == 'test':
        net.load_state_dict(torch.load(args.resume))
        test(valid_loader, net, loss, 0, best_auc, 'valid', cui_con_dict)
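
# Hedged sketch of the `_cuda` helper used in main(); the original is defined
# elsewhere in the repo. This version assumes it simply moves a module (or
# loss) to the given GPU when CUDA is available and is a no-op otherwise.
def _cuda(module, device_id=0):
    if torch.cuda.is_available():
        return module.cuda(device_id)
    return module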
def generate_feature_mm_dict():
    '''Scan the ground-truth CSVs and record, per feature: percentile split
    points, robust min/max bounds, and mean/std statistics.'''
    files = sorted(glob(os.path.join(args.data_dir, args.dataset,
                                     'train_groundtruth/*')))
    feature_value_dict = dict()
    for ifi, fi in enumerate(tqdm(files)):
        if 'csv' not in fi:
            continue
        for iline, line in enumerate(open(fi)):
            line = line.strip()
            if iline == 0:
                feat_list = line.split(',')
            else:
                data = line.split(',')
                for iv, v in enumerate(data):
                    if v in ('NA', ''):
                        continue
                    feat = feat_list[iv]
                    feature_value_dict.setdefault(feat, []).append(float(v))

    feature_mm_dict = dict()
    feature_ms_dict = dict()
    feature_range_dict = dict()
    for feat, vs in feature_value_dict.items():
        vs = sorted(vs)
        # Percentile split points: args.split_num quantiles plus the maximum.
        value_split = []
        for i in range(args.split_num):
            n = int(i * len(vs) / args.split_num)
            value_split.append(vs[n])
        value_split.append(vs[-1])
        feature_range_dict[feat] = value_split
        # Robust min/max: trim the lowest/highest 1/split_num of the values.
        n = int(len(vs) / args.split_num)
        feature_mm_dict[feat] = [vs[n], vs[-n - 1]]
        feature_ms_dict[feat] = [np.mean(vs), np.std(vs)]

    py_op.mkdir(args.file_dir)
    py_op.mywritejson(
        os.path.join(args.file_dir, args.dataset + '_feature_mm_dict.json'),
        feature_mm_dict)
    py_op.mywritejson(
        os.path.join(args.file_dir, args.dataset + '_feature_ms_dict.json'),
        feature_ms_dict)
    py_op.mywritejson(
        os.path.join(args.file_dir, args.dataset + '_feature_list.json'),
        feat_list)
    py_op.mywritejson(
        os.path.join(
            args.file_dir,
            args.dataset +
            '_feature_value_dict_{:d}.json'.format(args.split_num)),
        feature_range_dict)
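
# Hedged illustration (not part of the original repo) of how the saved
# `feature_mm_dict` percentile bounds would typically be applied downstream:
# clip a raw value to the robust [min, max] range and scale it to [0, 1].
def minmax_normalize(value, feat, feature_mm_dict):
    lo, hi = feature_mm_dict[feat]
    if hi <= lo:
        return 0.0
    value = min(max(value, lo), hi)
    return (value - lo) / (hi - lo)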
def generate_ehr_files():
    '''Write one JSON per admission with drug events (hour-aligned to the
    admission time) and ICD/demographic codes; count code frequencies.'''
    hadm_time_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_time_dict.json'))
    hadm_demo_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_demo_dict.json'))
    hadm_sid_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_sid_dict.json'))
    hadm_icd_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_icd_dict.json'))
    hadm_time_drug_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_time_drug_dict.json'))

    groundtruth_dir = os.path.join(args.data_dir, 'train_groundtruth')
    py_op.mkdir(groundtruth_dir)

    ehr_count_dict = dict()
    for hadm_id in hadm_sid_dict:
        time_drug_dict = hadm_time_drug_dict.get(hadm_id, {})
        icd_list = hadm_icd_dict.get(hadm_id, [])

        # Encode demographics as tokens: gender plus a 9-year age bucket.
        demo = hadm_demo_dict[hadm_id]
        demo[0] = demo[0] + '1'
        demo[1] = 'A' + str(int(demo[1] / 9))
        icd_demo = icd_list + demo
        for icd in icd_demo:
            ehr_count_dict[icd] = ehr_count_dict.get(icd, 0) + 1

        ehr_dict = {'drug': {}, 'icd_demo': icd_demo}
        for setime, drug_list in time_drug_dict.items():
            try:
                # Convert absolute start/end times to hours since admission.
                stime, etime = setime.split(' -- ')
                start_second = time_to_second(hadm_time_dict[hadm_id])
                stime = str((time_to_second(stime) - start_second) / 3600)
                etime = str((time_to_second(etime) - start_second) / 3600)
                setime = stime + ' -- ' + etime
                for drug in drug_list:
                    ehr_count_dict[drug] = ehr_count_dict.get(drug, 0) + 1
                ehr_dict['drug'][setime] = list(set(drug_list))
            except Exception:
                # Skip malformed time ranges or missing admission times.
                pass
        py_op.mywritejson(
            os.path.join(groundtruth_dir, hadm_id + '.json'), ehr_dict)

    py_op.mywritejson(
        os.path.join(args.data_dir, 'ehr_count_dict.json'), ehr_count_dict)
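
# Hedged sketch of the `time_to_second` helper used above and in
# merge_pivoted_data(); the real implementation lives elsewhere in the repo.
# It assumes MIMIC-style 'YYYY-MM-DD HH:MM:SS' timestamps.
import time


def time_to_second(t):
    return time.mktime(time.strptime(t, '%Y-%m-%d %H:%M:%S'))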
def sort_pivoted_data():
    '''Sort each merged per-admission CSV by time, collapsing rows that fall
    in the same 3-hour bin (last non-empty value per column wins).'''
    sort_dir = os.path.join(args.data_dir, args.dataset, 'sort_pivoted')
    os.system('rm -r ' + sort_dir)
    os.system('mkdir ' + sort_dir)
    merge_dir = os.path.join(args.data_dir, args.dataset, 'merge_pivoted')

    for i_fi, fi in enumerate(tqdm(os.listdir(merge_dir))):
        wf = open(os.path.join(sort_dir, fi), 'w')
        time_line_dict = dict()
        for i_line, line in enumerate(open(os.path.join(merge_dir, fi))):
            if i_line:
                # Bucket each data row into a 3-hour bin keyed by charttime.
                line_data = line.strip().split(',')
                delta = 3
                ctime = delta * int(int(line_data[1]) / delta)
                if ctime not in time_line_dict:
                    time_line_dict[ctime] = []
                time_line_dict[ctime].append(line_data)
            else:
                # Header row: drop hadm_id, rename charttime to 'time'.
                line_data = line.split(',')[1:]
                line_data[0] = 'time'
                wf.write(','.join(line_data))

        for t in sorted(time_line_dict):
            # Merge all rows in the bin; later non-empty values overwrite.
            line_list = time_line_dict[t]
            new_line = line_list[0]
            for line_data in line_list[1:]:
                for iv, v in enumerate(line_data):
                    if len(v.strip()):
                        new_line[iv] = v
            new_line = ','.join(new_line[1:]) + '\n'
            wf.write(new_line)
        wf.close()

    py_op.mkdir('../../data/MIMIC/train_groundtruth')
    py_op.mkdir('../../data/MIMIC/train_with_missing')
    os.system('rm ../../data/MIMIC/train_groundtruth/*.csv')
    os.system(
        'cp ../../data/MIMIC/sort_pivoted/* ../../data/MIMIC/train_groundtruth/'
    )
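
# Toy, self-contained illustration (not repo code) of the merge rule used in
# sort_pivoted_data(): rows in the same 3-hour bin collapse column-wise, with
# the last non-empty value winning.
def _merge_binned_rows(rows):
    merged = list(rows[0])
    for row in rows[1:]:
        for i, v in enumerate(row):
            if v.strip():
                merged[i] = v
    return merged

# _merge_binned_rows([['101', '4', '80', ''], ['101', '5', '', '98']])
# -> ['101', '5', '80', '98']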
def merge_pivoted_data(csv_list):
    '''Merge the pivoted MIMIC CSVs into one file per admission, with columns
    ordered by name_list and charttime rewritten as hours since admission.
    Note: each input file's header row is parsed first (the `else` branch
    below), which defines `head` and `sofa_head` before any data row is read.'''
    name_list = ['hadm_id', 'charttime']
    for k, v in variable_map_dict.items():
        if k not in ['age', 'gender']:
            if len(v):
                name_list.append(v)
            elif k in item_id_dict:
                name_list.append(k)
    name_index_dict = {name: i for i, name in enumerate(name_list)}

    hadm_time_dict = py_op.myreadjson(
        os.path.join(args.data_dir, args.dataset, 'hadm_time_dict.json'))
    icu_hadm_dict = py_op.myreadjson(
        os.path.join(args.data_dir, args.dataset, 'icu_hadm_dict.json'))

    merge_dir = os.path.join(args.data_dir, args.dataset, 'merge_pivoted')
    os.system('rm -r ' + merge_dir)
    os.system('mkdir ' + merge_dir)
    pivoted_dir = os.path.join(args.result_dir, 'mimic/pivoted_sofa')
    py_op.mkdir(pivoted_dir)

    for fi in csv_list:
        print(fi)
        for i_line, line in enumerate(open(os.path.join(args.mimic_dir, fi))):
            if i_line:
                if not line.strip():
                    continue
                line_data = line.strip().split(',')
                # Map non-empty cells to their column names.
                line_dict = dict()
                for iv, v in enumerate(line_data):
                    if len(v.strip()):
                        name = head[iv]
                        line_dict[name] = v

                # pivoted_sofa.csv is keyed by icustay_id; map it to hadm_id.
                if fi == 'pivoted_sofa.csv':
                    icu_id = line_dict.get('icustay_id', 'xxx')
                    if icu_id not in icu_hadm_dict:
                        continue
                    hadm_id = str(icu_hadm_dict[icu_id])
                    line_dict['hadm_id'] = hadm_id
                    line_dict['charttime'] = line_dict['starttime']

                hadm_id = line_dict.get('hadm_id', 'xxx')
                if hadm_id not in hadm_time_dict:
                    continue
                # Rewrite charttime as whole hours since admission.
                hadm_time = time_to_second(hadm_time_dict[hadm_id])
                now_time = time_to_second(line_dict['charttime'])
                delta_hour = int((now_time - hadm_time) / 3600)
                line_dict['charttime'] = str(delta_hour)

                if fi == 'pivoted_sofa.csv':
                    # Also keep a per-admission copy of the SOFA columns.
                    sofa_file = os.path.join(pivoted_dir, hadm_id + '.csv')
                    if not os.path.exists(sofa_file):
                        with open(sofa_file, 'w') as f:
                            f.write(sofa_head)
                    wf = open(sofa_file, 'a')
                    sofa_line = [str(delta_hour)] + line.split(',')[4:]
                    wf.write(','.join(sofa_line))
                    wf.close()

                assert 'hadm_id' in line_dict
                assert 'charttime' in line_dict
                new_line = []
                for name in name_list:
                    new_line.append(line_dict.get(name, ''))
                new_line = ','.join(new_line) + '\n'
                hadm_file = os.path.join(merge_dir, hadm_id + '.csv')
                if not os.path.exists(hadm_file):
                    with open(hadm_file, 'w') as f:
                        f.write(','.join(name_list) + '\n')
                wf = open(hadm_file, 'a')
                wf.write(new_line)
                wf.close()
            else:
                # Header row: remember the column names for this file.
                if fi == 'pivoted_sofa.csv':
                    # Columns: "icustay_id","hr","starttime","endtime",
                    # "pao2fio2ratio_novent","pao2fio2ratio_vent",
                    # "rate_epinephrine","rate_norepinephrine","rate_dopamine",
                    # "rate_dobutamine","meanbp_min","gcs_min","urineoutput",
                    # "bilirubin_max","creatinine_max","platelet_min",
                    # "respiration","coagulation","liver","cardiovascular",
                    # "cns","renal","respiration_24hours","coagulation_24hours",
                    # "liver_24hours","cardiovascular_24hours","cns_24hours",
                    # "renal_24hours","sofa_24hours"
                    sofa_head = ','.join(
                        ['time'] + line.replace('"', '').split(',')[4:])
                head = line.replace('"', '').strip().split(',')
                head = [h.strip() for h in head]
                # Warn about columns that are not in name_list.
                for h in head:
                    if h not in name_index_dict:
                        print(h)
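
# Example invocation. Only pivoted_sofa.csv is confirmed by the code above;
# the other file names are hypothetical pivoted-concept exports:
# merge_pivoted_data(['pivoted_vital.csv', 'pivoted_lab.csv',
#                     'pivoted_sofa.csv'])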
def compute_dist_mat():
    '''Compute a pairwise DTW distance matrix over the first 24 hours of each
    admission's imputed time series, for subtyping.'''
    files = glob(
        os.path.join(args.result_dir, args.dataset, 'imputation_result/*.csv'))
    feature_ms_dict = py_op.myreadjson(
        os.path.join(args.file_dir, args.dataset + '_feature_ms_dict.json'))
    subtyping_dir = os.path.join(args.result_dir, args.dataset, 'subtyping')
    py_op.mkdir(subtyping_dir)

    hadm_id_list = []
    hadm_variable_dict = {}
    all_values = []
    for i_fi, fi in enumerate(tqdm(files)):
        hadm_id = fi.split('/')[-1].split('.')[0]
        hadm_data = []
        for i_line, line in enumerate(open(fi)):
            if i_line:
                line_data = line.strip().split(',')
                line_data = np.array([float(x) for x in line_data])
                if len(line_data) != n_variables + 1:
                    print(i_fi, fi)
                # Keep only rows in the first 24 hours after admission.
                if line_data[0] < 0:
                    continue
                elif line_data[0] < 24:
                    hadm_data.append(line_data)
                else:
                    break
            else:
                head = line.strip().split(',')[1:]
                assert len(head) == n_variables

        values = np.array(hadm_data, dtype=np.float32)
        values = values[-24:]
        values = values[:, 1:]  # drop the time column
        assert len(values.shape) == 2
        assert values.shape[1] == n_variables
        hadm_variable_dict[hadm_id] = values
        hadm_id_list.append(hadm_id)
        all_values.append(values)

    # Mean/std over all admissions, used to z-score each variable.
    all_values = np.concatenate(all_values, 0)
    ms = [all_values.mean(0), all_values.std(0)]

    hadm_dist_matrix = np.zeros((len(hadm_id_list), len(hadm_id_list))) - 1
    for i in tqdm(range(len(hadm_id_list))):
        hadm_dist_matrix[i, i] = 0
        for j in range(i + 1, len(hadm_id_list)):
            if hadm_dist_matrix[i, j] >= 0 or i == j:
                continue
            s1 = norm(hadm_variable_dict[hadm_id_list[i]], ms)
            s2 = norm(hadm_variable_dict[hadm_id_list[j]], ms)
            dist_mat = dist_func(s1, s2)
            # `inf` is a module-level constant; compute_dtw fills
            # hadm_dist_matrix[i, j] with the DTW alignment cost.
            path = np.zeros([dist_mat.shape[0], dist_mat.shape[1], 3]) - inf - 1
            compute_dtw(dist_mat, path, hadm_dist_matrix, i, j)

    py_op.mywritejson(
        os.path.join(subtyping_dir, 'hadm_id_list.json'), hadm_id_list)
    np.save(
        os.path.join(subtyping_dir, 'hadm_dist_matrix.npy'), hadm_dist_matrix)
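
# Hedged sketches of the helpers used by compute_dist_mat(); the originals are
# defined elsewhere in the repo. `norm` is assumed to z-score each variable
# with the precomputed mean/std pair, and `dist_func` to build the pairwise
# Euclidean cost matrix that the DTW routine runs over.
def norm(values, ms):
    mean, std = ms
    return (values - mean) / (std + 1e-10)


def dist_func(s1, s2):
    # cost[i, j] = Euclidean distance between time step i of s1 and j of s2
    diff = s1[:, None, :] - s2[None, :, :]
    return np.sqrt((diff ** 2).sum(axis=2))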