import json
import os
import glob
import pickle as pkl

import numpy as np
import torch
from scipy import stats
from scipy.stats import spearmanr

# Project-specific loaders, explainers, and model classes (load_data, load_ghg_data,
# load_simulated_data, FeatureGeneratorExplainer, EncoderPredictor, Baseline,
# BaselineExplainer, DeepKnn, StateClassifier, JointFeatureGenerator, FITExplainer,
# RETAIN, TrueFeatureGenerator, parse_lime_results, ...) are assumed to be imported
# from the repository's own modules.


def main(data, generator_type, output_path, predictor_model):
    print('********** Running Generator Baseline Experiment **********')
    with open('config.json') as config_file:
        configs = json.load(config_file)[data]['feature_generator_explainer']
    experiment = 'feature_generator_explainer'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load the dataset and determine the number of input features.
    if data == 'mimic':
        p_data, train_loader, valid_loader, test_loader = load_data(
            batch_size=configs['batch_size'], path='./data')
        feature_size = p_data.feature_size
    elif data == 'ghg':
        p_data, train_loader, valid_loader, test_loader = load_ghg_data(
            configs['batch_size'])
        feature_size = p_data.feature_size
    elif data == 'simulation_spike':
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=configs['batch_size'],
            path='./data_generator/data/simulated_data',
            data_type='spike')
        feature_size = p_data.shape[1]
    elif data == 'simulation':
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=configs['batch_size'], path='./data/simulated_data')
        feature_size = p_data.shape[1]

    # `exp` is assumed to be a trained FeatureGeneratorExplainer constructed elsewhere
    # (see the other entry points in this file); its learned conditional generator is
    # compared against the known ground-truth generator below.
    testset = list(exp.test_loader.dataset)
    test_signals = torch.stack(([x[0] for x in testset])).to(device)
    true_generator = TrueFeatureGenerator()

    S = 100
    # Accumulate generated and ground-truth samples across all S test signals,
    # then compare their distributions feature by feature.
    ffc_sample = np.zeros((test_signals.shape[1], test_signals.shape[-1] * S))
    true_sample = np.zeros((test_signals.shape[1], test_signals.shape[-1] * S))
    for s in range(S):
        print('generating sample: ', s)
        signal = test_signals[s]
        for t in range(1, test_signals.shape[-1], 3):
            if t % 3 == 0:
                print('t: ', t)
            ffc_sample_t = exp.generator.forward_joint(signal[:, 0:t].unsqueeze(0))
            ffc_sample[:, s * test_signals.shape[-1] + t] = \
                ffc_sample_t.cpu().detach().numpy()[0]
            true_sample[:, s * test_signals.shape[-1] + t] = \
                true_generator.sample(signal[:, 0:t], t)

    # Two-sample Kolmogorov-Smirnov test between the learned-generator samples and
    # the ground-truth samples, per feature.
    for f in range(test_signals.shape[1]):
        ks_stat_f, p_value = stats.ks_2samp(ffc_sample[f, :], true_sample[f, :])
        print('feature: ', f, 'KS_stat: ', ks_stat_f, 'p_value: ', p_value)
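# The generator check above boils down to a two-sample Kolmogorov-Smirnov test per
# feature: if the learned conditional generator matches the ground-truth generator,
# the two sample sets should come from the same distribution, so the KS statistic
# should be small and the p-value large. A minimal, self-contained sketch of that
# comparison on synthetic data (the function and variable names are illustrative
# only, not part of the repository API):
def _demo_ks_generator_check(n_samples=1000, seed=0):
    rng = np.random.default_rng(seed)
    true_samples = rng.normal(loc=0.0, scale=1.0, size=n_samples)
    good_generator = rng.normal(loc=0.0, scale=1.0, size=n_samples)  # matches the truth
    bad_generator = rng.normal(loc=1.5, scale=1.0, size=n_samples)   # shifted distribution
    for name, gen_samples in [('good', good_generator), ('bad', bad_generator)]:
        ks_stat, p_value = stats.ks_2samp(gen_samples, true_samples)
        print('%s generator: KS_stat=%.3f, p_value=%.3g' % (name, ks_stat, p_value))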
def main(data, generator_type, all_samples, cv=0):
    print('********** Experiment with the %s data **********' % ('feature_generator_explainer'))
    with open('config.json') as config_file:
        configs = json.load(config_file)[data]['feature_generator_explainer']

    if data == 'mimic':
        p_data, train_loader, valid_loader, test_loader = load_data(
            batch_size=configs['batch_size'], path='./data', cv=cv)
        feature_size = p_data.feature_size
        # samples_to_analyze = {'mimic': MIMIC_TEST_SAMPLES, 'simulation': SIMULATION_SAMPLES,
        #                       'ghg': [], 'simulation_spike': []}
    elif data == 'ghg':
        p_data, train_loader, valid_loader, test_loader = load_ghg_data(
            configs['batch_size'], cv=cv)
        feature_size = p_data.feature_size
    elif data == 'simulation_spike':
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=configs['batch_size'],
            path='./data/simulated_spike_data',
            data_type='spike',
            cv=cv)
        feature_size = p_data.shape[1]
    elif data == 'simulation':
        percentage = 100.
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=configs['batch_size'],
            path='./data/simulated_data',
            percentage=percentage / 100,
            cv=cv)
        # generator_type = generator_type + '_%d' % percentage
        feature_size = p_data.shape[1]

    exp = FeatureGeneratorExplainer(
        train_loader, valid_loader, test_loader, feature_size,
        patient_data=p_data,
        generator_hidden_size=configs['encoding_size'],
        prediction_size=1,
        historical=(configs['historical'] == 1),
        generator_type=generator_type,
        data=data,
        experiment='feature_generator_explainer_' + generator_type)

    if all_samples:
        print('Experiment on all test data')
        print('Number of test samples: ', len(exp.test_loader.dataset))
        exp.select_top_features(
            samples_to_analyze=range(0, len(exp.test_loader.dataset) // 2),
            sub_features=[[0], [1], [2], [0, 1], [0, 2], [1, 2], [0, 1, 2]])
    else:
        imp = exp.select_top_features(
            samples_to_analyze[data],
            sub_features=[[0], [1], [2], [0, 1], [0, 2], [1, 2], [0, 1, 2]])
        print(imp[1])
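# The `sub_features` argument above enumerates every non-empty subset of the three
# simulated features, so select_top_features can score individual features as well
# as feature groups. A small illustrative helper (not part of the repository API)
# that builds the same list for an arbitrary feature count:
def _enumerate_feature_subsets(n_features):
    from itertools import combinations
    subsets = []
    for size in range(1, n_features + 1):
        subsets.extend([list(c) for c in combinations(range(n_features), size)])
    return subsets

# _enumerate_feature_subsets(3) == [[0], [1], [2], [0, 1], [0, 2], [1, 2], [0, 1, 2]]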
def main(experiment, train, data, generator_type, predictor_model, all_samples, cv, output_path):
    print('********** Experiment with the %s data **********' % experiment)
    with open('config.json') as config_file:
        configs = json.load(config_file)[data][experiment]
    if not os.path.exists('./data'):
        os.mkdir('./data')

    ## Load the data
    if data == 'mimic':
        p_data, train_loader, valid_loader, test_loader = load_data(
            batch_size=configs['batch_size'], path='./data', cv=cv)
        feature_size = p_data.feature_size
    elif data == 'ghg':
        p_data, train_loader, valid_loader, test_loader = load_ghg_data(
            configs['batch_size'], cv=cv)
        feature_size = p_data.feature_size
    elif data == 'simulation_spike':
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=configs['batch_size'],
            path='./data/simulated_spike_data',
            data_type='spike',
            cv=cv)
        feature_size = p_data.shape[1]
    elif data == 'simulation':
        percentage = 100.
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=configs['batch_size'],
            path='./data/simulated_data',
            percentage=percentage / 100,
            cv=cv)
        feature_size = p_data.shape[1]

    ## Create the experiment class
    if experiment == 'baseline':
        exp = Baseline(train_loader, valid_loader, test_loader, p_data.feature_size)
    elif experiment == 'risk_predictor':
        exp = EncoderPredictor(train_loader, valid_loader, test_loader, feature_size,
                               configs['encoding_size'],
                               rnn_type=configs['rnn_type'],
                               data=data,
                               model=predictor_model)
    elif experiment == 'feature_generator_explainer':
        exp = FeatureGeneratorExplainer(
            train_loader, valid_loader, test_loader, feature_size,
            patient_data=p_data,
            output_path=output_path,
            predictor_model=predictor_model,
            generator_hidden_size=configs['encoding_size'],
            prediction_size=1,
            generator_type=generator_type,
            data=data,
            experiment=experiment + '_' + generator_type)
    elif experiment == 'lime_explainer':
        exp = BaselineExplainer(train_loader, valid_loader, test_loader, feature_size,
                                data_class=p_data, data=data, baseline_method='lime')

    if all_samples:
        print('Experiment on all test data')
        print('Number of test samples: ', len(exp.test_loader.dataset))
        exp.run(train=False,
                n_epochs=configs['n_epochs'],
                samples_to_analyze=list(range(0, len(exp.test_loader.dataset))),
                plot=False,
                cv=cv)
    else:
        exp.run(train=train,
                n_epochs=configs['n_epochs'],
                samples_to_analyze=samples_to_analyze[data])
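# A minimal sketch of how the dispatcher defined directly above might be invoked
# from the command line. The flag names and defaults are assumptions made for
# illustration and are not taken from the repository's actual argument parser:
def _illustrative_cli():
    import argparse
    parser = argparse.ArgumentParser(description='Run a TSX experiment (illustrative wrapper)')
    parser.add_argument('--experiment', type=str, default='feature_generator_explainer')
    parser.add_argument('--data', type=str, default='simulation')
    parser.add_argument('--generator', type=str, default='joint_RNN_generator')
    parser.add_argument('--predictor', type=str, default='RNN')
    parser.add_argument('--out', type=str, default='./output')
    parser.add_argument('--all_samples', action='store_true')
    parser.add_argument('--cv', type=int, default=0)
    parser.add_argument('--train', action='store_true')
    args = parser.parse_args()
    main(experiment=args.experiment, train=args.train, data=args.data,
         generator_type=args.generator, predictor_model=args.predictor,
         all_samples=args.all_samples, cv=args.cv, output_path=args.out)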
def main(experiment, train, user, data, n_features_to_use=3):
    # sys.stdout = open('/scratch/gobi1/shalmali/global_importance_' + data + '.txt', 'w')
    filelist = glob.glob(
        os.path.join('/scratch/gobi1/%s/TSX_results' % user, data, 'results_*.pkl'))
    N = len(filelist)
    with open(filelist[0], 'rb') as f:
        arr = pkl.load(f)
    n_features = arr['FFC']['imp'].shape[0]
    Tt = arr['FFC']['imp'].shape[1]

    # Aggregate per-feature importance (summed over time) for every method and
    # every test instance.
    y_ffc = np.zeros((N, n_features))
    y_afo = np.zeros((N, n_features))
    y_suresh = np.zeros((N, n_features))
    y_sens = np.zeros((N, n_features))
    y_lime = np.zeros((N, n_features))
    for n, file in enumerate(filelist):
        with open(file, 'rb') as f:
            arr = pkl.load(f)
        y_ffc[n, :] = arr['FFC']['imp'].sum(1)
        y_afo[n, :] = arr['AFO']['imp'].sum(1)
        y_suresh[n, :] = arr['Suresh_et_al']['imp'].sum(1)
        y_sens[n, :] = arr['Sens']['imp'][:len(arr['FFC']['imp']), 1:].sum(1)
        y_lime[n, :] = parse_lime_results(arr, Tt, n_features, data=data).sum(1)

    # Rank features for each method, most relevant first.
    y_rank_ffc = np.flip(np.argsort(y_ffc.sum(0)).flatten())
    y_rank_afo = np.flip(np.argsort(y_afo.sum(0)).flatten())
    y_rank_suresh = np.flip(np.argsort(y_suresh.sum(0)).flatten())
    y_rank_sens = np.flip(np.argsort(y_sens.sum(0)).flatten())
    y_rank_lime = np.flip(np.argsort(y_lime.sum(0)).flatten())
    ranked_features = {
        'ffc': y_rank_ffc,
        'afo': y_rank_afo,
        'suresh': y_rank_suresh,
        'sens': y_rank_sens,
        'lime': y_rank_lime
    }

    with open('config.json') as config_file:
        configs = json.load(config_file)[data][experiment]
    methods = ranked_features.keys()

    # Retrain the predictor using only each method's top-ranked features.
    for m in methods:
        print('Experiment with the %d most relevant features, method: %s' % (n_features_to_use, m))
        feature_rank = ranked_features[m]
        for ff in [n_features_to_use]:
            features = feature_rank[:ff]
            print('using features', features)
            if data == 'mimic':
                p_data, train_loader, valid_loader, test_loader = load_data(
                    batch_size=configs['batch_size'], path='./data', features=features)
                feature_size = p_data.feature_size
            elif data == 'ghg':
                p_data, train_loader, valid_loader, test_loader = load_ghg_data(
                    configs['batch_size'], features=features)
                feature_size = p_data.feature_size
                print(feature_size)
            elif data == 'simulation_spike':
                p_data, train_loader, valid_loader, test_loader = load_simulated_data(
                    batch_size=configs['batch_size'],
                    path='./data_generator/data/simulated_data',
                    data_type='spike',
                    features=features)
                feature_size = p_data.shape[1]
            elif data == 'simulation':
                p_data, train_loader, valid_loader, test_loader = load_simulated_data(
                    batch_size=configs['batch_size'],
                    path='./data/simulated_data',
                    features=features)
                feature_size = p_data.shape[1]

            if data == 'simulation_spike':
                data = 'simulation'
                spike_data = True
            else:
                spike_data = False

            print('training on ', feature_size, ' features!')
            exp = EncoderPredictor(train_loader, valid_loader, test_loader, feature_size,
                                   configs['encoding_size'],
                                   rnn_type=configs['rnn_type'],
                                   data=data)
            exp.run(train=train, n_epochs=configs['n_epochs'])

    n_features_to_remove = 10  # add/remove the same number for now
    # Remove each method's top-ranked features, then retrain and evaluate.
    for m in methods:
        print('Experiment for removing features using method: ', m)
        feature_rank = ranked_features[m]
        # for ff in range(min(n_features - 1, n_features_to_remove)):
        for ff in [n_features_to_remove]:
            features = [elem for elem in list(range(n_features))
                        if elem not in feature_rank[:ff]]
            # print('using features:', features)
            if data == 'mimic':
                p_data, train_loader, valid_loader, test_loader = load_data(
                    batch_size=configs['batch_size'], path='./data', features=features)
                feature_size = p_data.feature_size
            elif data == 'ghg':
                p_data, train_loader, valid_loader, test_loader = load_ghg_data(
                    configs['batch_size'], features=features)
                feature_size = p_data.feature_size
                print(feature_size)
            elif data == 'simulation_spike':
                p_data, train_loader, valid_loader, test_loader = load_simulated_data(
                    batch_size=configs['batch_size'],
                    path='./data_generator/data/simulated_data',
                    data_type='spike',
                    features=features)
                feature_size = p_data.shape[1]
            elif data == 'simulation':
                p_data, train_loader, valid_loader, test_loader = load_simulated_data(
                    batch_size=configs['batch_size'],
                    path='./data/simulated_data',
                    features=features)
                feature_size = p_data.shape[1]

            if data == 'simulation_spike':
                data = 'simulation'
                spike_data = True
            else:
                spike_data = False

            print('training on ', feature_size, ' features!')
            exp = EncoderPredictor(train_loader, valid_loader, test_loader, feature_size,
                                   configs['encoding_size'],
                                   rnn_type=configs['rnn_type'],
                                   data=data)
            exp.run(train=train, n_epochs=configs['n_epochs'])
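# The ranking step above reduces each method's importance map (features x time) to a
# single score per feature by summing over time, then orders features from most to
# least important. A small self-contained sketch of that aggregation on toy data
# (array names and values are illustrative only):
def _demo_global_feature_ranking():
    # Two instances, three features, four time steps of importance scores.
    imp_maps = np.array([
        [[0.1, 0.2, 0.1, 0.0],   # feature 0
         [0.9, 0.8, 0.7, 0.9],   # feature 1
         [0.0, 0.1, 0.0, 0.1]],  # feature 2
        [[0.2, 0.1, 0.2, 0.1],
         [0.8, 0.9, 0.9, 0.8],
         [0.1, 0.0, 0.1, 0.0]],
    ])
    per_instance = imp_maps.sum(axis=2)  # sum over time -> (instances, features)
    global_rank = np.flip(np.argsort(per_instance.sum(axis=0)))  # most relevant first
    print('global ranking (most to least relevant):', global_rank)  # e.g. [1 0 2]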
def main(experiment, train, uncertainty_score, data, generator_type):
    print('********** Experiment with the %s data **********' % (experiment))
    with open('config.json') as config_file:
        configs = json.load(config_file)[data][experiment]

    if data == 'mimic':
        p_data, train_loader, valid_loader, test_loader = load_data(
            batch_size=configs['batch_size'], path='./data')
        feature_size = p_data.feature_size
    elif data == 'ghg':
        p_data, train_loader, valid_loader, test_loader = load_ghg_data(
            configs['batch_size'])
        feature_size = p_data.feature_size
    elif data == 'simulation_spike':
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=configs['batch_size'],
            path='./data_generator/data/simulated_data',
            data_type='spike')
        feature_size = p_data.shape[1]
    elif data == 'simulation':
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=configs['batch_size'], path='./data/simulated_data')
        feature_size = p_data.shape[1]

    if data == 'simulation_spike':
        data = 'simulation'
        spike_data = True
    else:
        spike_data = False

    if experiment == 'baseline':
        exp = Baseline(train_loader, valid_loader, test_loader, p_data.feature_size)
    elif experiment == 'risk_predictor':
        exp = EncoderPredictor(train_loader, valid_loader, test_loader, feature_size,
                               configs['encoding_size'],
                               rnn_type=configs['rnn_type'],
                               data=data)
    elif experiment == 'feature_generator_explainer':
        # print(spike_data)
        exp = FeatureGeneratorExplainer(
            train_loader, valid_loader, test_loader, feature_size,
            patient_data=p_data,
            generator_hidden_size=configs['encoding_size'],
            prediction_size=1,
            historical=(configs['historical'] == 1),
            generator_type=generator_type,
            data=data,
            experiment=experiment + '_' + generator_type,
            spike_data=spike_data)
    elif experiment == 'lime_explainer':
        exp = BaselineExplainer(train_loader, valid_loader, test_loader, feature_size,
                                data_class=p_data, data=data, baseline_method='lime')

    exp.run(train=train, n_epochs=configs['n_epochs'],
            samples_to_analyze=samples_to_analyze[data])
    # exp.final_reported_plots(samples_to_analyze=samples_to_analyze[data])

    # For the MIMIC experiment, extract population level importance for interventions
    # print('********** Extracting population level intervention statistics **********')
    # if data == 'mimic' and experiment == 'feature_generator_explainer':
    #     for id in range(len(intervention_list)):
    #         if not os.path.exists("./interventions/int_%d.pkl" % (id)):
    #             exp.summary_stat(id)
    #         exp.plot_summary_stat(id)

    if uncertainty_score:
        # Evaluate output uncertainty using the deep KNN method
        print('\n********** Uncertainty Evaluation: **********')
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        sample_ind = 1
        n_nearest_neighbors = 10
        dknn = DeepKnn(exp.model,
                       p_data.train_data[0:int(0.8 * p_data.n_train), :, :],
                       p_data.train_label[0:int(0.8 * p_data.n_train)],
                       device)
        knn_labels = dknn.evaluate_confidence(
            sample=p_data.test_data[sample_ind, :, :].reshape((1, -1, 48)),
            sample_label=p_data.test_label[sample_ind],
            _nearest_neighbors=n_nearest_neighbors,
            verbose=True)
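# The deep-KNN confidence check above follows the general idea of scoring a prediction
# by how consistently the labels of its nearest neighbours in the model's representation
# space agree with the predicted label. A minimal conceptual sketch of that idea in
# plain numpy (this is not the DeepKnn class's implementation, just an illustration of
# the principle):
def _demo_knn_confidence(train_embeddings, train_labels, test_embedding, predicted_label, k=10):
    # Euclidean distance from the test representation to every training representation.
    dists = np.linalg.norm(train_embeddings - test_embedding[None, :], axis=1)
    neighbor_labels = train_labels[np.argsort(dists)[:k]]
    # Fraction of the k nearest neighbours that share the predicted label.
    return float(np.mean(neighbor_labels == predicted_label))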
def main(args):
    if args.data == 'simulation':
        feature_size = 3
        data_path = './data/simulated_data'
        data_type = 'state'
    elif args.data == 'simulation_l2x':
        feature_size = 3
        data_path = './data/simulated_data_l2x'
        data_type = 'state'
    elif args.data == 'simulation_spike':
        feature_size = 3
        data_path = './data/simulated_spike_data'
        data_type = 'spike'
    elif args.data == 'mimic':
        data_type = 'mimic'
        timeseries_feature_size = len(feature_map_mimic)

    # Load data
    if args.data == 'mimic':
        p_data, train_loader, valid_loader, test_loader = load_data(
            batch_size=100, path='./data', cv=args.cv)
        feature_size = p_data.feature_size
    else:
        _, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=100, datapath=data_path, percentage=0.8, data_type=data_type)

    model = StateClassifier(feature_size=feature_size, n_state=2, hidden_size=200)
    if args.explainer == 'fit':
        generator = JointFeatureGenerator(feature_size, hidden_size=feature_size * 3, data=args.data)
        generator.load_state_dict(
            torch.load(os.path.join('./ckpt/%s/%s.pt' % (args.data, 'joint_generator'))))

    testset = [smpl[0] for smpl in test_loader.dataset]
    samples = torch.stack([testset[sample] for sample in samples_to_analyze[args.data]])

    model.load_state_dict(torch.load(os.path.join('./ckpt/%s/%s.pt' % (args.data, 'model'))))

    # Attribution scores of the fully trained model serve as the reference.
    if args.explainer == 'fit':
        explainer = FITExplainer(model, generator)
    elif args.explainer == 'integrated_gradient':
        explainer = IGExplainer(model)
    elif args.explainer == 'deep_lift':
        explainer = DeepLiftExplainer(model)
    elif args.explainer == 'fo':
        explainer = FOExplainer(model)
    elif args.explainer == 'afo':
        explainer = AFOExplainer(model, train_loader)
    elif args.explainer == 'gradient_shap':
        explainer = GradientShapExplainer(model)
    elif args.explainer == 'retain':
        model = RETAIN(dim_input=feature_size, dim_emb=128, dropout_emb=0.4,
                       dim_alpha=8, dim_beta=8, dropout_context=0.4, dim_output=2)
        explainer = RETAINexplainer(model, args.data)
        model.load_state_dict(torch.load(os.path.join('./ckpt/%s/%s.pt' % (args.data, 'retain'))))
    gt_importance = explainer.attribute(samples, torch.zeros(samples.shape))

    # Randomize an increasing fraction of every parameter tensor and measure how
    # strongly the resulting attributions still correlate with the reference.
    for r_ind, ratio in enumerate([.2, .4, .6, .8, 1.]):
        for param in model.parameters():
            params = param.data.cpu().numpy().reshape(-1)
            params[int(r_ind * 0.2):int(ratio * len(params))] = \
                torch.randn(int(ratio * len(params))).numpy()
            param.data = torch.Tensor(params.reshape(param.data.shape))

        if args.explainer == 'fit':
            explainer = FITExplainer(model, generator)
        elif args.explainer == 'integrated_gradient':
            explainer = IGExplainer(model)
        elif args.explainer == 'deep_lift':
            explainer = DeepLiftExplainer(model)
        elif args.explainer == 'fo':
            explainer = FOExplainer(model)
        elif args.explainer == 'afo':
            explainer = AFOExplainer(model, train_loader)
        elif args.explainer == 'gradient_shap':
            explainer = GradientShapExplainer(model)
        elif args.explainer == 'retain':
            model = RETAIN(dim_input=feature_size, dim_emb=128, dropout_emb=0.4,
                           dim_alpha=8, dim_beta=8, dropout_context=0.4, dim_output=2)
            explainer = RETAINexplainer(model, args.data)
            model.load_state_dict(torch.load(os.path.join('./ckpt/%s/%s.pt' % (args.data, 'retain'))))

        score = explainer.attribute(samples, torch.zeros(samples.shape))
        corr = []
        for sig in range(len(score)):
            corr.append(abs(spearmanr(score[sig].reshape(-1, ),
                                      gt_importance[sig].reshape(-1, ),
                                      nan_policy='omit')[0]))
        print("correlation for %d percent randomization: %.3f +- %.3f" %
              (100 * ratio, np.mean(corr), np.std(corr)))
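# The loop above is a model-randomization sanity check: if an explainer is faithful to
# the model, its attributions should decorrelate from the original attributions as more
# of the model's weights are replaced by noise. A toy illustration of the metric itself,
# the absolute Spearman rank correlation between two flattened attribution maps (the
# values here are made up for illustration):
def _demo_randomization_metric():
    reference = np.array([0.9, 0.1, 0.05, 0.7, 0.3])   # attributions of the trained model
    randomized = np.array([0.2, 0.8, 0.6, 0.1, 0.9])   # attributions after randomization
    rho = abs(spearmanr(reference, randomized, nan_policy='omit')[0])
    print('|Spearman rho| = %.3f (a low value means the explainer passes the check)' % rho)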
if not os.path.exists(plot_path):
    os.mkdir(plot_path)

# Load data
if args.data == 'mimic' or args.data == 'mimic_int':
    if args.mimic_path is None:
        raise ValueError('Specify the data directory containing processed mimic data')
    p_data, train_loader, valid_loader, test_loader = load_data(
        batch_size=batch_size, path=args.mimic_path, task=task, cv=args.cv)
    feature_size = p_data.feature_size
    class_weight = p_data.pos_weight
else:
    _, train_loader, valid_loader, test_loader = load_simulated_data(
        batch_size=batch_size, datapath=data_path, percentage=0.8,
        data_type=data_type, cv=args.cv)

# Prepare model to explain
if args.explainer == 'retain':
    if args.data == 'mimic' or args.data == 'simulation' or args.data == 'simulation_l2x':
        model = RETAIN(dim_input=feature_size, dim_emb=128, dropout_emb=0.4,
                       dim_alpha=8, dim_beta=8, dropout_context=0.4, dim_output=2)
    elif args.data == 'mimic_int':
        model = RETAIN(dim_input=feature_size,