# Module-level imports inferred from usage in this file; cu (file utilities),
# pp (preprocessing helpers) and run / rn (the Experiment module, referenced
# under both aliases) are project-local modules assumed to be on the path.
import pickle

import pandas as pd


def check_interpolation_and_subgraphs():
    ft = 'raw'
    minp = 0.5
    minc = 0.6
    seed = 2222
    standardize_method = "cz"
    is_cz = True
    # standardize_method = "z"
    # is_cz = False
    freq_list = ['001', '002', '003', '004', '005', '006', '008', '009', '01', '011']
    freq_to_trainFreq_map = {
        '001': '001', '002': '002', '003': '004', '004': '005', '005': '007',
        '006': '008', '008': '01', '009': '011', '01': '013', '011': '014'
    }
    nel_graph_length = 13
    fout = '../observer/check_interpolation_and_subgraphs/seed%s_%s_mice_mp%s_mc%s_%s' % (
        seed, ft, minp, minc, standardize_method)
    cu.checkAndCreate(fout)
    cu.checkAndCreate('../data/seed%s/%s/mice/mp%s_mc%s/%s' % (
        seed, ft, minp, minc, standardize_method))
    e = run.Experiment(
        '../data/seed%s/%s/mice/mp%s_mc%s/%s' % (seed, ft, minp, minc, standardize_method),
        '../data/seed%s/%s/mice/mp%s_mc%s/dataset' % (seed, ft, minp, minc),
        seed, is_cz, standardize_method, freq_list, freq_to_trainFreq_map,
        nel_graph_length)
    # inspect a single fold only
    foldi = 2
    train = e.ftrain % (e.dataset_folder, foldi, e.standardize_method)
    test = e.ftest % (e.dataset_folder, foldi, e.standardize_method)
    print train
    print test
    ftrnel = "%s/mimic_train_fold%d.nel" % (fout, foldi)
    ftrnode = "%s/mimic_train_fold%d.node" % (fout, foldi)
    fnel = "%s/mimic_fold%d.nel" % (fout, foldi)
    fnode = "%s/mimic_fold%d.node" % (fout, foldi)
    # e.interpolation(trcsv=train, tecsv=test, ftrnel=ftrnel, ftrnode=ftrnode,
    #                 fnel=fnel, fnode=fnode)
    e.subgraph_mining(tr_nel=ftrnel, tr_te_nel=fnel, freq_t='011', foldi=foldi,
                      cfolder=fout)
def split_test_by_patient(cdn, out_folder, suffix=''):
    '''
    Helper function for MICE imputation.
    Given a fold folder, write each patient's rows to a separate CSV so that
    MICE can be run per patient.
    '''
    for i in range(5):
        test = pd.read_csv('%s/test_fold%d%s.csv' % (cdn, i, suffix))
        gp = test.groupby('sid')
        cu.checkAndCreate('%s/test_fold%d' % (out_folder, i))
        fn = open('%s/test_fold%d/sid_list.txt' % (out_folder, i), 'w')
        for sid, group in gp:
            group.to_csv('%s/test_fold%d/%d.csv' % (out_folder, i, sid), index=False)
            fn.write('%d\n' % sid)
        fn.close()
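# Example usage (paths and suffix are illustrative, not from the original pipeline):
#   split_test_by_patient('../data/seed2222/raw', '../data/seed2222/raw/by_patient',
#                         suffix='_cz')
# For each of the 5 folds this writes <out_folder>/test_fold<i>/<sid>.csv per
# patient, plus a sid_list.txt index of all patient ids in that fold.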
def split_by_feature_type(cdn, fn_prefix):
    '''
    Split the data into two sets: one with raw features + medical features,
    the other with standardized features + medical features.
    Only the raw split is written out here; the standardized split is
    commented out.
    '''
    print 'split_by_feature_type'
    for i in range(5):
        training = pd.read_csv('%s_train_fold%d.csv' % (fn_prefix, i))
        testing = pd.read_csv('%s_test_fold%d.csv' % (fn_prefix, i))
        raw_train = training[raw_features_for_classify]
        raw_test = testing[raw_features_for_classify]
        # z_train = training[standardized_features_for_classify]
        # z_test = testing[standardized_features_for_classify]
        cu.checkAndCreate('%s/raw/' % cdn)
        # cu.checkAndCreate('%s/z/' % cdn)
        raw_train.to_csv('%s/raw/train_fold%d.csv' % (cdn, i), index=False)
        raw_test.to_csv('%s/raw/test_fold%d.csv' % (cdn, i), index=False)
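# Example usage (mirrors the call in run_best_model below; paths are illustrative):
#   split_by_feature_type(cdn='../data/seed2222',
#                         fn_prefix='../data/seed2222/alldata_readmit')
# Reads <fn_prefix>_train_fold<i>.csv / <fn_prefix>_test_fold<i>.csv and writes
# the raw-feature subsets under <cdn>/raw/.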
def split_by_feature_type(cdn, fn_prefix, raw_colname, z_colname):
    '''
    Variant that takes the raw and standardized column lists explicitly.
    Split the data into two sets: one with raw features + medical features,
    the other with standardized features + medical features.
    '''
    for i in range(5):
        training = pd.read_csv('%s_train_fold%d.csv' % (fn_prefix, i))
        testing = pd.read_csv('%s_test_fold%d.csv' % (fn_prefix, i))
        raw_train = training[raw_colname]
        raw_test = testing[raw_colname]
        z_train = training[z_colname]
        z_test = testing[z_colname]
        cu.checkAndCreate('%s/raw/' % cdn)
        cu.checkAndCreate('%s/z/' % cdn)
        raw_train.to_csv('%s/raw/train_fold%d.csv' % (cdn, i), index=False)
        raw_test.to_csv('%s/raw/test_fold%d.csv' % (cdn, i), index=False)
        z_train.to_csv('%s/z/train_fold%d.csv' % (cdn, i), index=False)
        z_test.to_csv('%s/z/test_fold%d.csv' % (cdn, i), index=False)
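# Example usage (column names are illustrative; the real lists live with the
# feature definitions):
#   raw_cols = ['sid', 'readmit', 'HR', 'mSBP']
#   z_cols = ['sid', 'readmit', 'HR_z', 'mSBP_z']
#   split_by_feature_type('../data/seed2222', '../data/seed2222/alldata_readmit',
#                         raw_cols, z_cols)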
def nmfClassfyExperiments(l):
    # NOTE: self.cdn, nmfclassify and mean_auc are not defined in this function;
    # they are assumed to come from the enclosing class / module scope.
    minp = l[0]
    minc = l[1]
    print minp, minc
    output = ''
    for isg in [0, 3]:
        output += 'isg %d:\n' % isg
        cu.checkAndCreate('%s/isg%d/nmf_piks' % (self.cdn, isg))
        for freq_t in ['001', '002', '003', '004', '005', '006', '008', '009', '01', '011']:
            output += 'freq_t %s:\n' % freq_t
            bauc = 0.
            htauc = 0.
            bnc = 0
            for nc in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]:
                for foldi in range(5):
                    nmfclassify(isg, freq_t, 2, foldi, nc=nc, minp=minp, minc=minc)
                best_auc, highest_tr_auc = mean_auc(isg, freq_t, 2, nc=nc, nmf=True)
                if best_auc > bauc:
                    bnc = nc
                bauc = max(bauc, best_auc)
                htauc = max(htauc, highest_tr_auc)
            output += 'nc %s: %s\n%s\n' % (bnc, bauc, htauc)
            # output += '\n'
        output += '\n'
    fn = open('../data/mice/nmfresult_mp%s_mc%s.txt' % (minp, minc), 'w')
    fn.write(output)
    fn.close()
def tuneSGParamForClassification(nmf=False):
    # NOTE: l, cdn, self.cdn, read_prediction_matrics and
    # tuneCLFParamForClassification are not defined in this function; they are
    # assumed to come from the enclosing class / calling scope.
    output = ''
    for isg in [0, 3]:
        output += 'isg %d: ' % isg
        if nmf:
            cu.checkAndCreate('%s/isg%d/nmf_piks' % (self.cdn, isg))
        for freq_t in ['001', '002', '003', '004', '005', '006', '008', '009', '01', '011']:
            prediction_matrics = read_prediction_matrics(isg, freq_t)
            if nmf:
                bauc = 0.
                tbtauc = 0.
                bparams = ''
                for nc in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]:
                    (tbauc, tbparams, tbtauc) = tuneCLFParamForClassification(
                        l, prediction_matrics, nmf=True, nc=nc)
                    if tbauc > bauc:
                        bauc = tbauc
                        bparams = tbparams
                        btauc = tbtauc
            else:
                (bauc, bparams, btauc) = tuneCLFParamForClassification(
                    l, prediction_matrics, nmf=False)
            output += '%f (%s)\n' % (bauc, bparams)
    if nmf:
        fn = open('%s/nmfResult.txt' % cdn, 'w')
    else:
        fn = open('%s/dirResult.txt' % cdn, 'w')
    fn.write(output)
    fn.close()
def run(feature_type, minp, minc):
    # NOTE: the self.* references below suggest this was written as a method;
    # interpolation, get_freq_to_trainFreq_map, subgraph_mining and
    # gen_pt_sg_files are assumed to be in scope.
    # self.cdn = '../data/mean_last12h'
    # self.cdn = '../data/seed2222/%s/mice/mp%s_mc%s' % (feature_type, minp, minc)
    # print self.cdn
    cu.checkAndCreate(self.cdn)
    for isg in [0, 3]:
        cu.checkAndCreate('%s/isg%d' % (self.cdn, isg))
        cu.checkAndCreate('%s/isg%d/pt_sg_w' % (self.cdn, isg))
        cu.checkAndCreate('%s/isg%d/res' % (self.cdn, isg))
    for foldi in range(5):
        train = self.ftrain % (self.cdn, foldi)
        test = self.ftest % (self.cdn, foldi)
        ftrnel = "%s/mimic_train_fold%d.nel" % (self.cdn, foldi)
        ftrnode = "%s/mimic_train_fold%d.node" % (self.cdn, foldi)
        fnel = "%s/mimic_fold%d.nel" % (self.cdn, foldi)
        fnode = "%s/mimic_fold%d.node" % (self.cdn, foldi)
        interpolation(trcsv=train, tecsv=test, ftrnel=ftrnel, ftrnode=ftrnode,
                      fnel=fnel, fnode=fnode)
        get_freq_to_trainFreq_map(foldi)
        for freq_t in ['001', '002', '003', '004', '005', '006', '008', '009', '01', '011']:
            subgraph_mining(tr_nel=ftrnel, tr_te_nel=fnel, freq_t=freq_t, foldi=foldi)
            for isg in [0, 3]:
                gen_pt_sg_files(isg, freq_t, foldi)
def run_best_model(cdn):
    ft = 'raw'
    seed = 2222
    standardize_method = 'z'
    is_cz = False
    cu.checkAndCreate('%s/seed%d' % (cdn, seed))
    pp.split_nfolds('%s/alldata_readmit.csv' % cdn,
                    '%s/seed%d/alldata_readmit' % (cdn, seed),
                    shuffle=True, seed=seed)
    pp.split_by_feature_type(cdn='%s/seed%d' % (cdn, seed),
                             fn_prefix='%s/seed%d/alldata_readmit' % (cdn, seed))
    cu.checkAndCreate('%s/seed%d/raw/interp' % (cdn, seed))
    cu.checkAndCreate('%s/seed%d/raw/interp/mean/dataset' % (cdn, seed))
    for i in range(5):
        pp.impute_by_interpolation_on_last12h(
            '%s/seed%d/raw/test_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/test_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/extrapolation_log_test_fold%d.txt' % (cdn, seed, i))
        pp.impute_by_interpolation_on_last12h(
            '%s/seed%d/raw/train_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/train_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/extrapolation_log_train_fold%d.txt' % (cdn, seed, i))
        pp.impute_by_mean(
            '%s/seed%d/raw/interp/train_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/test_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/mean/dataset/train_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/mean/dataset/test_fold%d.csv' % (cdn, seed, i))
        pp.standardize_data(
            '%s/seed%d/raw/interp/mean/dataset/train_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/mean/dataset/test_fold%d.csv' % (cdn, seed, i),
            '%s/seed%d/raw/interp/mean/dataset/train_fold%d_%s.csv' % (cdn, seed, i, standardize_method),
            '%s/seed%d/raw/interp/mean/dataset/test_fold%d_%s.csv' % (cdn, seed, i, standardize_method))

    # run temporal model
    freq_list = ['011']
    freq_to_trainFreq_map = {'011': '014'}
    nel_graph_length = 13
    cu.checkAndCreate('%s/seed%d/%s/interp/mean/%s' % (cdn, seed, ft, standardize_method))
    e = rn.Experiment(
        '%s/seed%d/%s/interp/mean/%s' % (cdn, seed, ft, standardize_method),
        '%s/seed%d/%s/interp/mean/dataset' % (cdn, seed, ft),
        seed, is_cz, standardize_method, freq_list, freq_to_trainFreq_map,
        nel_graph_length)
    isg = 0
    freq_t = '011'
    nc = 110
    c = 2
    pl = 'l1'
    cw = 'balanced'
    ntestth = 2
    cu.checkAndCreate('%s/isg%d' % (e.cdn, isg))
    cu.checkAndCreate('%s/isg%d/pt_sg_w' % (e.cdn, isg))
    cu.checkAndCreate('%s/isg%d/res' % (e.cdn, isg))
    cu.checkAndCreate('%s/isg%d/nmf_piks' % (e.cdn, isg))
    for foldi in range(5):
        train = e.ftrain % (e.dataset_folder, foldi, e.standardize_method)
        test = e.ftest % (e.dataset_folder, foldi, e.standardize_method)
        print train
        print test
        ftrnel = "%s/mimic_train_fold%d.nel" % (e.cdn, foldi)
        ftrnode = "%s/mimic_train_fold%d.node" % (e.cdn, foldi)
        fnel = "%s/mimic_fold%d.nel" % (e.cdn, foldi)
        fnode = "%s/mimic_fold%d.node" % (e.cdn, foldi)
        e.interpolation(trcsv=train, tecsv=test, ftrnel=ftrnel, ftrnode=ftrnode,
                        fnel=fnel, fnode=fnode)
        e.get_freq_to_trainFreq_map(foldi)
        for freq_t in e.moss_freq_threshold_list:
            e.subgraph_mining(tr_nel=ftrnel, tr_te_nel=fnel, freq_t=freq_t, foldi=foldi)
            e.gen_pt_sg_files(isg, freq_t, foldi)

    # run baseline model
    cu.checkAndCreate('%s/seed%d/raw/interp/mean/last_measures/dataset' % (cdn, seed))
    for i in range(5):
        pp.get_last_measurements(
            '%s/seed%d/raw/interp/mean/dataset/train_fold%d_%s.csv' % (cdn, seed, i, standardize_method),
            '%s/seed%d/raw/interp/mean/last_measures/dataset/train_fold%d_%s.csv' % (cdn, seed, i, standardize_method))
        pp.get_last_measurements(
            '%s/seed%d/raw/interp/mean/dataset/test_fold%d_%s.csv' % (cdn, seed, i, standardize_method),
            '%s/seed%d/raw/interp/mean/last_measures/dataset/test_fold%d_%s.csv' % (cdn, seed, i, standardize_method))
    best_features = rfe(
        '%s/seed%d/raw/interp/mean/last_measures' % (cdn, seed),
        50, standardize_method, 5, 'l1', 'balanced')
    print best_features
    # best_features = ['urineByHrByWeight', 'HCT', 'INR', 'Platelets', 'RBC',
    #                  'DeliveredTidalVolume', 'PlateauPres', 'RAW', 'RSBI', 'mDBP', 'CV_HR',
    #                  'Art_BE', 'Art_CO2', 'Art_PaCO2', 'Art_pH', 'Cl', 'Mg', 'Anticoagulant',
    #                  'beta.Blocking_agent', 'Somatostatin_preparation', 'Vasodilating_agent',
    #                  'AIDS', 'MetCarcinoma']
    baseline_auc = lr('%s/seed%d/raw/interp/mean/last_measures' % (cdn, seed),
                      standardize_method, 5, 'l1', 'balanced', 50)
    print 'baseline AUC: %s' % baseline_auc

    res_list = []
    for foldi in range(5):
        fnaddtr = '../data/seed2222/raw/interp/mean/last_measures/dataset/train_fold%d_%s.csv' % (
            foldi, standardize_method)
        fnaddte = '../data/seed2222/raw/interp/mean/last_measures/dataset/test_fold%d_%s.csv' % (
            foldi, standardize_method)
        prediction_matrics = e.read_prediction_matrics(isg, freq_t)
        (res, gt_te, pt_te, res_baseline) = e.nmfClassify_ob(
            prediction_matrics['ptsg'][foldi],
            prediction_matrics['ptwd'][foldi],
            prediction_matrics['sgs'][foldi],
            prediction_matrics['pt'][foldi],
            prediction_matrics['gt'][foldi],
            '%s/isg%d/nmf_piks/nmf_%s_fold%d_%d.pik' % (e.cdn, isg, freq_t, foldi, nc),
            ntestth, foldi, nc, c, pl, cw, fnaddtr, fnaddte, best_features)
        res_list.append(res)
    (auc, tr_auc) = e.get_mean_auc(res_list)
    print auc, tr_auc
    for i in range(len(res_list)):
        with open('../data/seed2222/raw/interp/mean/z/isg0/res/c_pre_te_fold%d' % i, 'wb') as f:
            pickle.dump(res_list[i]['c_pre_te'], f)
        with open('../data/seed2222/raw/interp/mean/z/isg0/res/res_fold%d' % i, 'wb') as f:
            pickle.dump(res_list[i], f)
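# Example entry point (illustrative; cdn is expected to contain alldata_readmit.csv
# and to be consistent with the hard-coded '../data/seed2222/...' paths above):
#   run_best_model('../data')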