def generate_diagnosis_data():
    # Map each admission to the patient's later admissions, then attach its
    # ICD codes to those later admissions as diagnosis history.
    sid_hadm_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'sid_hadm_dict.json'))
    hadm_sid_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_sid_dict.json'))
    hadm_map_dict = dict()
    for hadm in hadm_sid_dict:
        sid = hadm_sid_dict[hadm]
        hadm_list = sid_hadm_dict[sid]
        if len(hadm_list) > 1:
            hadm_list = sorted(hadm_list, key=lambda k: int(k))
            idx = hadm_list.index(hadm)
            if idx > 0:
                for h in hadm_list[:idx]:
                    if h not in hadm_map_dict:
                        hadm_map_dict[h] = []
                    hadm_map_dict[h].append(hadm)
    hadm_icd_dict = dict()
    for i_line, line in enumerate(
            open(os.path.join(args.mimic_dir, 'DIAGNOSES_ICD.csv'))):
        if i_line:
            if i_line % 10000 == 0:
                print(i_line)
            line_data = [x.strip('"') for x in py_op.csv_split(line.strip())]
            ROW_ID, SUBJECT_ID, hadm_id, SEQ_NUM, icd = line_data
            if hadm_id in hadm_map_dict:
                for h in hadm_map_dict[hadm_id]:
                    if h not in hadm_icd_dict:
                        hadm_icd_dict[h] = []
                    hadm_icd_dict[h].append(icd)
    hadm_icd_dict = {h: list(set(icds)) for h, icds in hadm_icd_dict.items()}
    py_op.mywritejson(os.path.join(args.data_dir, 'hadm_icd_dict.json'),
                      hadm_icd_dict)

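# Hedged sketch of py_op.csv_split, the repo's own helper, which is not
# defined in this file. Assuming it splits a CSV line on commas while
# respecting double-quoted fields (the callers strip quotes themselves), the
# stdlib csv module gives a close approximation, though csv.reader also
# removes the surrounding quotes:
import csv

def csv_split_sketch(line):
    """Split one CSV line into fields, honoring '"'-quoted commas."""
    return next(csv.reader([line]))
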
def stat_drug_effect():
    for fi in ['train.json', 'valid.json', 'test.json']:
        ehr_data = json.load(
            open(os.path.join(args.data_dir, args.dataset, fi)))
        new_ehr_data = []
        has_drug = []
        has_hf = [[], []]
        for pdata in ehr_data:
            patient_dict = pdata[0]
            hf = pdata[1]
            vis, new_patient_dict = find_drug(patient_dict)
            if len(new_patient_dict):
                has_drug.append(vis)
                has_hf[vis].append(hf)
                new_ehr_data.append([new_patient_dict, hf])
        print('')
        print('In {:s}:'.format(fi.split('.')[0]))
        print('There are {:d} patients: {:d} with drug records and {:d} without.'
              .format(len(new_ehr_data), sum(has_drug),
                      len(ehr_data) - sum(has_drug)))
        print('Drug patients with hf: {:3.4f}. No-drug patients with hf: {:3.4f}.'
              .format(np.mean(has_hf[1]), np.mean(has_hf[0])))
        py_op.mywritejson(
            os.path.join(args.data_dir, args.dataset, 'new_' + fi),
            new_ehr_data)

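# Hedged sketch of the find_drug helper used above; it is not defined in this
# file, and the patient_dict layout is assumed to match the
# {'drug': {...}, 'icd_demo': [...]} records written by generate_ehr_files.
# A minimal version that returns a 0/1 drug flag plus the (unmodified) record:
def find_drug_sketch(patient_dict):
    """Return (vis, patient_dict) where vis is 1 if any drug events exist."""
    has_any = any(patient_dict.get('drug', {}).values())
    return int(has_any), patient_dict
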
def main():
    dataset = data_loader.DataBowl(args, phase='train')
    train_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)
    dataset = data_loader.DataBowl(args, phase='valid')
    valid_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.workers,
                              pin_memory=True)
    args.vocab = dataset.vocab
    args.relation = dataset.relation

    # net, loss = model.Net(args), model.Loss()
    net, loss = model.FCModel(args), model.Loss()
    net = _cuda(net, 0)
    loss = _cuda(loss, 0)

    parameters_all = []
    for p in net.parameters():
        parameters_all.append(p)
    optimizer = torch.optim.Adam(parameters_all, args.lr)

    best_auc = [0, 0, 0, 0, 0, 0]
    cui_con_dict = {}
    if args.phase == 'train':
        for epoch in range(args.epochs):
            train(train_loader, net, loss, epoch, optimizer, best_auc)
            best_auc, cui_con_dict = test(valid_loader, net, loss, epoch,
                                          best_auc, 'valid', cui_con_dict)
            print(args.words)
            if 1:
                cons_dir = '../result/cons/{:s}/{:d}'.format(
                    args.model, args.predict_day)
                py_op.mkdir(cons_dir)
                num = len(os.listdir(cons_dir))
                py_op.mywritejson(
                    os.path.join(cons_dir, '{:d}.json'.format(num)),
                    cui_con_dict)
            # break
        print('best auc', best_auc)
        auc = best_auc[0]
        with open('../result/log.txt', 'a') as f:
            f.write('#model {:s} #auc {:3.4f}\n'.format(args.model, auc))
    elif args.phase == 'test':
        net.load_state_dict(torch.load(args.resume))
        test(valid_loader, net, loss, 0, best_auc, 'valid', cui_con_dict)

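# Hedged sketch of the _cuda helper called in main(); it is not defined in
# this file. Assuming it just moves a module to GPU `idx` when CUDA is
# available and otherwise leaves it on the CPU:
def _cuda_sketch(module, idx=0):
    """Move a torch module to cuda:<idx> if available, else keep it on CPU."""
    if torch.cuda.is_available():
        return module.cuda(idx)
    return module
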
def generate_icu_mortality_dict(icustay_id_list):
    icu_mortality_dict = dict()
    for i_line, line in enumerate(
            open(os.path.join(args.mimic_dir, 'sepsis_mortality.csv'))):
        if i_line:
            if i_line % 10000 == 0:
                print(i_line)
            line_data = line.strip().split(',')
            icustay_id = line_data[0]
            icu_mortality_dict[icustay_id] = int(line_data[-1])
    py_op.mywritejson(
        os.path.join(args.data_dir, 'icu_mortality_dict.json'),
        icu_mortality_dict)

def map_ehr_id():
    print('start')
    ehr_count_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'ehr_count_dict.json'))
    ehr_list = [ehr for ehr, c in ehr_count_dict.items() if c > 100]
    ns = set('0123456789')
    print(ns)
    drug_list = [e for e in ehr_list if e[1] in ns]
    med_list = [e for e in ehr_list if e[1] not in ns]
    print(len(drug_list))
    print(len(med_list))
    py_op.mywritejson(os.path.join(args.data_dir, 'ehr_list.json'), ehr_list)

def split_data_to_ten_set():
    files = sorted(
        glob(os.path.join(args.data_dir, args.dataset,
                          'train_with_missing/*')))
    np.random.shuffle(files)
    splits = []
    for i in range(10):
        st = int(len(files) * i / 10)
        en = int(len(files) * (i + 1) / 10)
        splits.append(files[st:en])
    py_op.mywritejson(
        os.path.join(args.file_dir, args.dataset + '_splits.json'), splits)

def compute_pred_clean_psnr(epoch='ensemble',
                            clean_dir='../../data/AI/testB/',
                            result_dir='../../data/result'):
    psnr_list = []
    file_psnr_dict = dict()
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)
    f = open(os.path.join(result_dir, epoch + '.csv'), 'w')
    for i, clean in enumerate(os.listdir(clean_dir)):
        file = clean
        clean = os.path.join(clean_dir, clean)
        clean_file = clean
        pred = clean.replace('.jpg', '.png').replace(
            'data', 'data/test_clean/{:s}'.format(epoch))
        stain = clean.replace('trainB', 'trainA').replace(
            'testB', 'testA').replace('.jpg', '_.jpg')
        try:
            pred = Image.open(pred)
            pred = pred.resize((256, 256))
            pred = pred.resize((250, 250))
            pred = np.array(pred).astype(np.float32)
            clean = np.array(Image.open(clean)).astype(np.float32)
            stain = np.array(Image.open(stain)).astype(np.float32)
            psnr_pred = psnr(clean, pred)
            psnr_stain = psnr(clean, stain)
            psnr_list.append([psnr_stain, psnr_pred])
            file_psnr_dict[file] = psnr_pred
        except Exception:
            traceback.print_exc()
            continue
        print(i, 1000)
        f.write(clean_file.split('/')[-1])
        f.write(',')
        f.write(str(psnr_stain))
        f.write(',')
        f.write(str(psnr_pred))
        f.write(',')
        f.write(str(psnr_pred / psnr_stain - 1))
        f.write('\n')
    psnr_list = np.array(psnr_list)
    psnr_mean = ((psnr_list[:, 1] - psnr_list[:, 0]) / psnr_list[:, 0]).mean()
    f.write(str(psnr_mean))
    f.close()
    py_op.mywritejson(
        os.path.join(result_dir, epoch + '.json'),
        py_op.mysorteddict(file_psnr_dict, key=lambda s: file_psnr_dict[s]))
    print('stained image PSNR', psnr_list[:, 0].mean())
    print('predicted image PSNR', psnr_list[:, 1].mean())
    print('gain ratio', psnr_mean)
    return psnr_mean

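# Hedged sketch of the psnr helper assumed above: the standard peak
# signal-to-noise ratio for 8-bit images, 10 * log10(255^2 / MSE). The repo's
# implementation may differ in peak value or channel averaging.
def psnr_sketch(ref, img):
    """PSNR in dB between two float arrays holding 0..255 pixel values."""
    mse = np.mean((ref - img) ** 2)
    if mse == 0:
        return float('inf')
    return 10.0 * np.log10(255.0 ** 2 / mse)
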
def generate_ehr_files():
    hadm_time_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_time_dict.json'))
    hadm_demo_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_demo_dict.json'))
    hadm_sid_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_sid_dict.json'))
    hadm_icd_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_icd_dict.json'))
    hadm_time_drug_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_time_drug_dict.json'))
    groundtruth_dir = os.path.join(args.data_dir, 'train_groundtruth')
    py_op.mkdir(groundtruth_dir)
    ehr_count_dict = dict()
    for hadm_id in hadm_sid_dict:
        time_drug_dict = hadm_time_drug_dict.get(hadm_id, {})
        icd_list = hadm_icd_dict.get(hadm_id, [])
        demo = hadm_demo_dict[hadm_id]
        demo[0] = demo[0] + '1'                # gender token
        demo[1] = 'A' + str(int(demo[1] / 9))  # age bucketed into 9-year bands
        icd_demo = icd_list + demo
        for icd in icd_demo:
            ehr_count_dict[icd] = ehr_count_dict.get(icd, 0) + 1
        ehr_dict = {'drug': {}, 'icd_demo': icd_demo}
        for setime, drug_list in time_drug_dict.items():
            try:
                # express each drug window in hours relative to admission time
                stime, etime = setime.split(' -- ')
                start_second = time_to_second(hadm_time_dict[hadm_id])
                stime = str((time_to_second(stime) - start_second) / 3600)
                etime = str((time_to_second(etime) - start_second) / 3600)
                setime = stime + ' -- ' + etime
                for drug in drug_list:
                    ehr_count_dict[drug] = ehr_count_dict.get(drug, 0) + 1
                ehr_dict['drug'][setime] = list(set(drug_list))
            except Exception:
                pass
        py_op.mywritejson(os.path.join(groundtruth_dir, hadm_id + '.json'),
                          ehr_dict)
        # break
    py_op.mywritejson(os.path.join(args.data_dir, 'ehr_count_dict.json'),
                      ehr_count_dict)

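# Hedged sketch of time_to_second as used above; it is not defined in this
# file. MIMIC-III timestamps look like '2101-10-20 19:08:02' (years shifted
# for de-identification), so a plausible implementation is:
from datetime import datetime

def time_to_second_sketch(t):
    """Convert a 'YYYY-MM-DD HH:MM:SS' timestamp to epoch seconds."""
    return datetime.strptime(t, '%Y-%m-%d %H:%M:%S').timestamp()
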
def generate_feature_mm_dict():
    files = sorted(
        glob(os.path.join(args.data_dir, args.dataset,
                          'train_groundtruth/*')))
    feature_value_dict = dict()
    for ifi, fi in enumerate(tqdm(files)):
        if 'csv' not in fi:
            continue
        for iline, line in enumerate(open(fi)):
            line = line.strip()
            if iline == 0:
                feat_list = line.split(',')
            else:
                data = line.split(',')
                for iv, v in enumerate(data):
                    if v in ['NA', '']:
                        continue
                    else:
                        feat = feat_list[iv]
                        if feat not in feature_value_dict:
                            feature_value_dict[feat] = []
                        feature_value_dict[feat].append(float(v))
    feature_mm_dict = dict()
    feature_ms_dict = dict()
    feature_range_dict = dict()
    for feat, vs in feature_value_dict.items():
        vs = sorted(vs)
        # quantile boundaries used to discretize each variable
        value_split = []
        for i in range(args.split_num):
            n = int(i * len(vs) / args.split_num)
            value_split.append(vs[n])
        value_split.append(vs[-1])
        feature_range_dict[feat] = value_split
        # trimmed min/max: drop the lowest/highest 1/split_num of values
        n = int(len(vs) / args.split_num)
        feature_mm_dict[feat] = [vs[n], vs[-n - 1]]
        feature_ms_dict[feat] = [np.mean(vs), np.std(vs)]
    py_op.mkdir(args.file_dir)
    py_op.mywritejson(
        os.path.join(args.file_dir, args.dataset + '_feature_mm_dict.json'),
        feature_mm_dict)
    py_op.mywritejson(
        os.path.join(args.file_dir, args.dataset + '_feature_ms_dict.json'),
        feature_ms_dict)
    py_op.mywritejson(
        os.path.join(args.file_dir, args.dataset + '_feature_list.json'),
        feat_list)
    py_op.mywritejson(
        os.path.join(
            args.file_dir, args.dataset +
            '_feature_value_dict_{:d}.json'.format(args.split_num)),
        feature_range_dict)

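# Usage sketch (assumed, not from the repo): the *_feature_mm_dict.json
# written above stores a trimmed [min, max] per variable, which downstream
# code can use for min-max normalization, e.g.:
def minmax_normalize_sketch(value, feat, feature_mm_dict):
    """Scale value into [0, 1] with the trimmed min/max of this feature."""
    lo, hi = feature_mm_dict[feat]
    if hi == lo:
        return 0.0
    return min(max((value - lo) / (hi - lo), 0.0), 1.0)
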
def wkmeans(n_cluster):
    subtyping_dir = os.path.join(args.result_dir, args.dataset, 'subtyping')
    hadm_id_list = py_op.myreadjson(
        os.path.join(subtyping_dir, 'hadm_id_list.json'))
    hadm_dist_matrix = np.load(
        os.path.join(subtyping_dir, 'hadm_dist_matrix.npy'))
    assert len(hadm_dist_matrix) == len(hadm_id_list)

    # initialization: seed each cluster with 10 random admissions
    indices = list(range(len(hadm_id_list)))  # list() so shuffle works in Python 3
    np.random.shuffle(indices)
    init_groups = [indices[i * 10: i * 10 + 10] for i in range(n_cluster)]

    groups = init_groups
    for epoch in range(100):
        groups = wkmeans_epoch(hadm_dist_matrix, groups)
        print([len(g) for g in groups])
        if epoch and epoch % 10 == 0:
            cluster_results = []
            for g in groups:
                cluster_results.append([hadm_id_list[i] for i in g])
            py_op.mywritejson(
                os.path.join(subtyping_dir, 'cluster_results.json'),
                cluster_results)

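# Hedged sketch of wkmeans_epoch, which is not defined in this file. Assuming
# a k-means-style pass over the precomputed distance matrix, each admission
# is reassigned to the cluster with the smallest mean distance to its members:
def wkmeans_epoch_sketch(dist_matrix, groups):
    """One reassignment pass; returns the new list of index groups."""
    new_groups = [[] for _ in groups]
    for i in range(len(dist_matrix)):
        # mean distance from point i to each current cluster (inf if empty)
        costs = [np.mean(dist_matrix[i, g]) if len(g) else np.inf
                 for g in groups]
        new_groups[int(np.argmin(costs))].append(i)
    return new_groups
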
def generate_drug_data():
    hadm_sid_dict = py_op.myreadjson(
        os.path.join(args.data_dir, 'hadm_sid_dict.json'))
    hadm_id_set = set(hadm_sid_dict)
    hadm_time_drug_dict = dict()
    for i_line, line in enumerate(
            open(os.path.join(args.mimic_dir, 'PRESCRIPTIONS.csv'))):
        if i_line:
            if i_line % 10000 == 0:
                print(i_line)
            line_data = [x.strip('"') for x in py_op.csv_split(line.strip())]
            (_, SUBJECT_ID, hadm_id, _, startdate, enddate, _, drug,
             DRUG_NAME_POE, DRUG_NAME_GENERIC, FORMULARY_DRUG_CD, gsn, ndc,
             PROD_STRENGTH, DOSE_VAL_RX, DOSE_UNIT_RX, FORM_VAL_DISP,
             FORM_UNIT_DISP, ROUTE) = line_data
            if len(hadm_id) and hadm_id in hadm_id_set:
                if hadm_id not in hadm_time_drug_dict:
                    hadm_time_drug_dict[hadm_id] = dict()
                time = startdate + ' -- ' + enddate
                if time not in hadm_time_drug_dict[hadm_id]:
                    hadm_time_drug_dict[hadm_id][time] = []
                hadm_time_drug_dict[hadm_id][time].append(drug)
                # hadm_time_drug_dict[hadm_id][time].append(ndc)
    py_op.mywritejson(
        os.path.join(args.data_dir, 'hadm_time_drug_dict.json'),
        hadm_time_drug_dict)

def generate_demo():
    icu_hadm_dict = py_op.myreadjson('../../src/icu_hadm_dict.json')
    py_op.mywritejson(os.path.join(args.data_dir, 'icu_hadm_dict.json'),
                      icu_hadm_dict)
    sid_demo_dict = dict()
    sid_hadm_dict = dict()
    for i_line, line in enumerate(
            open(os.path.join(args.mimic_dir, 'PATIENTS.csv'))):
        if i_line:
            data = line.split(',')
            sid = data[1]
            gender = data[2].replace('"', '')
            dob = data[3][:4]  # year of birth
            sid_demo_dict[sid] = [gender, int(dob)]
    py_op.mywritejson(os.path.join(args.data_dir, 'sid_demo_dict.json'),
                      sid_demo_dict)

    hadm_sid_dict = dict()
    hadm_demo_dict = dict()
    hadm_time_dict = dict()
    for i_line, line in enumerate(
            open(os.path.join(args.mimic_dir, 'ICUSTAYS.csv'))):
        if i_line:
            line = line.replace('"', '')
            data = line.split(',')
            sid = data[1]
            hadm_id = data[2]
            icu_id = data[3]
            intime = data[-3]
            sid_hadm_dict[sid] = sid_hadm_dict.get(sid, []) + [hadm_id]
            if icu_id not in icu_hadm_dict:
                continue
            hadm_sid_dict[hadm_id] = sid
            gender = sid_demo_dict[sid][0]
            dob = sid_demo_dict[sid][1]
            age = int(intime[:4]) - dob
            if age < 18:
                print(age)
            assert age >= 18
            if age > 150:
                # MIMIC-III shifts the DOB of patients older than 89, which
                # yields implausible ages; map them to 90.
                age = 90
            hadm_demo_dict[hadm_id] = [gender, age]
            hadm_time_dict[hadm_id] = intime
    py_op.mywritejson(os.path.join(args.data_dir, 'hadm_demo_dict.json'),
                      hadm_demo_dict)
    py_op.mywritejson(os.path.join(args.data_dir, 'hadm_time_dict.json'),
                      hadm_time_dict)
    py_op.mywritejson(os.path.join(args.data_dir, 'sid_hadm_dict.json'),
                      sid_hadm_dict)
    py_op.mywritejson(os.path.join(args.data_dir, 'hadm_sid_dict.json'),
                      hadm_sid_dict)

def compute_dist_mat():
    files = glob(
        os.path.join(args.result_dir, args.dataset,
                     'imputation_result/*.csv'))  # [:100]
    feature_ms_dict = py_op.myreadjson(
        os.path.join(args.file_dir, args.dataset + '_feature_ms_dict.json'))
    subtyping_dir = os.path.join(args.result_dir, args.dataset, 'subtyping')
    py_op.mkdir(subtyping_dir)

    hadm_id_list = []
    mean_variables = []
    hadm_variable_dict = {}
    all_values = []
    for i_fi, fi in enumerate(tqdm(files)):
        hadm_id = fi.split('/')[-1].split('.')[0]
        hadm_data = []
        for i_line, line in enumerate(open(fi)):
            if i_line:
                line_data = line.strip().split(',')
                line_data = np.array([float(x) for x in line_data])
                if len(line_data) != n_variables + 1:
                    print(i_fi, fi)
                if line_data[0] < 0:
                    continue
                elif line_data[0] < 24:
                    # keep only records from the first 24 hours
                    hadm_data.append(line_data)
                else:
                    break
            else:
                head = line.strip().split(',')[1:]
                assert len(head) == n_variables
        values = np.array(hadm_data, dtype=np.float32)
        values = values[-24:]
        times = values[:, 0]
        values = values[:, 1:]
        assert len(values.shape) == 2
        assert values.shape[1] == n_variables
        hadm_variable_dict[hadm_id] = values
        hadm_id_list.append(hadm_id)
        all_values.append(values)
    all_values = np.concatenate(all_values, 0)
    ms = [all_values.mean(0), all_values.std(0)]

    # pairwise DTW distances over the normalized 24h series
    hadm_dist_matrix = np.zeros((len(hadm_id_list), len(hadm_id_list))) - 1
    for i in tqdm(range(len(hadm_id_list))):
        hadm_dist_matrix[i, i] = 0
        for j in range(i + 1, len(hadm_id_list)):
            if hadm_dist_matrix[i, j] >= 0 or i == j:
                continue
            s1 = hadm_variable_dict[hadm_id_list[i]]
            s2 = hadm_variable_dict[hadm_id_list[j]]
            s1 = norm(s1, ms)
            s2 = norm(s2, ms)
            dist_mat = dist_func(s1, s2)
            path = np.zeros([dist_mat.shape[0], dist_mat.shape[1], 3]) - inf - 1
            compute_dtw(dist_mat, path, hadm_dist_matrix, i, j)
    py_op.mywritejson(os.path.join(subtyping_dir, 'hadm_id_list.json'),
                      hadm_id_list)
    np.save(os.path.join(subtyping_dir, 'hadm_dist_matrix.npy'),
            hadm_dist_matrix)
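
# Hedged sketches of the helpers compute_dist_mat relies on (norm, dist_func,
# compute_dtw); none are defined in this file. norm is taken to be
# per-variable z-scoring, dist_func a pairwise Euclidean distance matrix, and
# compute_dtw a standard dynamic-time-warping fill that stores the alignment
# cost symmetrically. The repo's compute_dtw also threads a `path` buffer for
# backtracking, which this sketch ignores.
def norm_sketch(values, ms):
    """Z-score each variable column with the global (mean, std) pair."""
    mean, std = ms
    return (values - mean) / (std + 1e-8)

def dist_func_sketch(s1, s2):
    """Pairwise Euclidean distances: dist[a, b] = ||s1[a] - s2[b]||."""
    diff = s1[:, None, :] - s2[None, :, :]
    return np.sqrt((diff ** 2).sum(-1))

def compute_dtw_sketch(dist_mat, path, hadm_dist_matrix, i, j):
    """Classic DTW recurrence; writes the cost at [i, j] and [j, i]."""
    n, m = dist_mat.shape
    acc = np.full((n + 1, m + 1), np.inf)
    acc[0, 0] = 0.0
    for a in range(1, n + 1):
        for b in range(1, m + 1):
            acc[a, b] = dist_mat[a - 1, b - 1] + min(
                acc[a - 1, b], acc[a, b - 1], acc[a - 1, b - 1])
    hadm_dist_matrix[i, j] = hadm_dist_matrix[j, i] = acc[n, m]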