def main(data, generator_type, output_path, predictor_model):
    print('********** Running Generator Baseline Experiment **********')
    with open('config.json') as config_file:
        configs = json.load(config_file)[data]['feature_generator_explainer']

    experiment = 'feature_generator_explainer'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if data == 'mimic':
        p_data, train_loader, valid_loader, test_loader = load_data(
            batch_size=configs['batch_size'], path='./data')
        feature_size = p_data.feature_size
    elif data == 'ghg':
        p_data, train_loader, valid_loader, test_loader = load_ghg_data(
            configs['batch_size'])
        feature_size = p_data.feature_size
    elif data == 'simulation_spike':
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=configs['batch_size'],
            path='./data_generator/data/simulated_data',
            data_type='spike')
        feature_size = p_data.shape[1]

    elif data == 'simulation':
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=configs['batch_size'], path='./data/simulated_data')
        feature_size = p_data.shape[1]

    # Build the explainer whose trained generator will be compared against the true generator
    exp = FeatureGeneratorExplainer(train_loader, valid_loader, test_loader, feature_size,
                                    patient_data=p_data, output_path=output_path,
                                    predictor_model=predictor_model,
                                    generator_hidden_size=configs['encoding_size'],
                                    prediction_size=1, generator_type=generator_type,
                                    data=data, experiment=experiment + '_' + generator_type)

    testset = list(exp.test_loader.dataset)
    test_signals = torch.stack([x[0] for x in testset]).to(device)

    true_generator = TrueFeatureGenerator()

    S = 100
    # Allocate once so the samples from all S test signals are kept for the KS test below
    ffc_sample = np.zeros(
        (test_signals.shape[1], test_signals.shape[-1] * S))
    true_sample = np.zeros(
        (test_signals.shape[1], test_signals.shape[-1] * S))
    for s in range(S):
        print('generating sample: ', s)
        signal = test_signals[s]
        for t in range(1, test_signals.shape[-1]):
            if t % 3 == 0:
                print('t: ', t)
            ffc_sample_t = exp.generator.forward_joint(
                signal[:, 0:t].unsqueeze(0))
            ffc_sample[:, s * test_signals.shape[-1] +
                       t] = ffc_sample_t.cpu().detach().numpy()[0]
            true_sample[:, s * test_signals.shape[-1] +
                        t] = true_generator.sample(signal[:, 0:t], t)

    for f in range(test_signals.shape[1]):
        ks_stat_f, p_value = stats.ks_2samp(ffc_sample[f, :],
                                            true_sample[f, :])
        print('feature: ', f, 'KS_stat: ', ks_stat_f, 'p_value: ', p_value)
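A minimal command-line wrapper for this baseline could look like the sketch below; the flag names and defaults are assumptions for illustration and may not match the repository's actual interface.

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Generator baseline experiment (illustrative wrapper)')
    parser.add_argument('--data', type=str, default='simulation',
                        help='one of: mimic, ghg, simulation, simulation_spike')
    parser.add_argument('--generator', type=str, default='joint_RNN_generator')  # hypothetical default
    parser.add_argument('--output_path', type=str, default='./output')           # hypothetical default
    parser.add_argument('--predictor_model', type=str, default='RNN')            # hypothetical default
    args = parser.parse_args()
    main(args.data, args.generator, args.output_path, args.predictor_model)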
Example #2
def main(data, generator_type, all_samples, cv=0):
    print('********** Experiment with the %s data **********' % ("feature_generator_explainer"))
    with open('config.json') as config_file:
        configs = json.load(config_file)[data]["feature_generator_explainer"]

    if data == 'mimic':
        p_data, train_loader, valid_loader, test_loader = load_data(batch_size=configs['batch_size'],
                                                                    path='./data', cv=cv)
        feature_size = p_data.feature_size
        # samples_to_analyze = {'mimic':MIMIC_TEST_SAMPLES, 'simulation':SIMULATION_SAMPLES, 'ghg':[], 'simulation_spike':[]}
    elif data == 'ghg':
        p_data, train_loader, valid_loader, test_loader = load_ghg_data(configs['batch_size'], cv=cv)
        feature_size = p_data.feature_size
    elif data == 'simulation_spike':
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(batch_size=configs['batch_size'],
                                                                              path='./data/simulated_spike_data',
                                                                              data_type='spike', cv=cv)
        feature_size = p_data.shape[1]

    elif data == 'simulation':
        percentage = 100.
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(batch_size=configs['batch_size'],
                                                                              path='./data/simulated_data',
                                                                              percentage=percentage / 100, cv=cv)
        # generator_type = generator_type+'_%d'%percentage
        feature_size = p_data.shape[1]

    exp = FeatureGeneratorExplainer(train_loader, valid_loader, test_loader, feature_size, patient_data=p_data,
                                    generator_hidden_size=configs['encoding_size'], prediction_size=1,
                                    historical=(configs['historical'] == 1),
                                    generator_type=generator_type, data=data,
                                    experiment='feature_generator_explainer_' + generator_type)

    if all_samples:
        print('Experiment on all test data')
        print('Number of test samples: ', len(exp.test_loader.dataset))
        exp.select_top_features(
            samples_to_analyze=range(0, len(exp.test_loader.dataset) // 2),
            sub_features=[[0], [1], [2], [0, 1], [0, 2], [1, 2], [0, 1, 2]])
    else:
        imp = exp.select_top_features(
            samples_to_analyze[data],
            sub_features=[[0], [1], [2], [0, 1], [0, 2], [1, 2], [0, 1, 2]])
        print(imp[1])
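Several of these snippets index a module-level samples_to_analyze dictionary that is defined outside the functions shown here; the commented-out hint in the mimic branch above suggests it maps each dataset name to the list of test-set indices to explain. A plausible sketch with placeholder indices (the real values live in the repository, not here):

MIMIC_TEST_SAMPLES = [10, 25, 47]        # placeholder patient indices, for illustration only
SIMULATION_SAMPLES = list(range(10))     # placeholder simulation indices
samples_to_analyze = {
    'mimic': MIMIC_TEST_SAMPLES,
    'simulation': SIMULATION_SAMPLES,
    'ghg': [],
    'simulation_spike': [],
}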
def main(experiment, train, data, generator_type, predictor_model, all_samples,
         cv, output_path):
    print('********** Experiment with the %s data **********' % experiment)
    with open('config.json') as config_file:
        configs = json.load(config_file)[data][experiment]

    if not os.path.exists('./data'):
        os.mkdir('./data')
    ## Load the data
    if data == 'mimic':
        p_data, train_loader, valid_loader, test_loader = load_data(
            batch_size=configs['batch_size'], path='./data', cv=cv)
        feature_size = p_data.feature_size
    elif data == 'ghg':
        p_data, train_loader, valid_loader, test_loader = load_ghg_data(
            configs['batch_size'], cv=cv)
        feature_size = p_data.feature_size
    elif data == 'simulation_spike':
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=configs['batch_size'],
            path='./data/simulated_spike_data',
            data_type='spike',
            cv=cv)
        feature_size = p_data.shape[1]

    elif data == 'simulation':
        percentage = 100.
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=configs['batch_size'],
            path='./data/simulated_data',
            percentage=percentage / 100,
            cv=cv)
        feature_size = p_data.shape[1]

    ## Create the experiment class
    if experiment == 'baseline':
        exp = Baseline(train_loader, valid_loader, test_loader, feature_size)
    elif experiment == 'risk_predictor':
        exp = EncoderPredictor(train_loader,
                               valid_loader,
                               test_loader,
                               feature_size,
                               configs['encoding_size'],
                               rnn_type=configs['rnn_type'],
                               data=data,
                               model=predictor_model)
    elif experiment == 'feature_generator_explainer':
        exp = FeatureGeneratorExplainer(
            train_loader,
            valid_loader,
            test_loader,
            feature_size,
            patient_data=p_data,
            output_path=output_path,
            predictor_model=predictor_model,
            generator_hidden_size=configs['encoding_size'],
            prediction_size=1,
            generator_type=generator_type,
            data=data,
            experiment=experiment + '_' + generator_type)
    elif experiment == 'lime_explainer':
        exp = BaselineExplainer(train_loader,
                                valid_loader,
                                test_loader,
                                feature_size,
                                data_class=p_data,
                                data=data,
                                baseline_method='lime')

    if all_samples:
        print('Experiment on all test data')
        print('Number of test samples: ', len(exp.test_loader.dataset))
        exp.run(train=False,
                n_epochs=configs['n_epochs'],
                samples_to_analyze=list(range(0,
                                              len(exp.test_loader.dataset))),
                plot=False,
                cv=cv)
    else:
        exp.run(train=train,
                n_epochs=configs['n_epochs'],
                samples_to_analyze=samples_to_analyze[data])
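All of these entry points pull hyperparameters from a per-dataset, per-experiment section of config.json. A minimal sketch of the expected layout, using placeholder values (only the key names are implied by the code above):

import json

example_config = {
    'simulation': {
        'risk_predictor': {
            'batch_size': 100, 'encoding_size': 50, 'rnn_type': 'GRU', 'n_epochs': 80},
        'feature_generator_explainer': {
            'batch_size': 100, 'encoding_size': 50, 'historical': 1, 'n_epochs': 30},
    }
}
with open('config.json', 'w') as f:
    json.dump(example_config, f, indent=2)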
def main(experiment, train, user, data, n_features_to_use=3):
    #sys.stdout = open('/scratch/gobi1/shalmali/global_importance_'+data+'.txt', 'w')
    filelist = glob.glob(
        os.path.join('/scratch/gobi1/%s/TSX_results' % user, data,
                     'results_*.pkl'))

    N = len(filelist)
    with open(filelist[0], 'rb') as f:
        arr = pkl.load(f)

    n_features = arr['FFC']['imp'].shape[0]
    Tt = arr['FFC']['imp'].shape[1]

    y_ffc = np.zeros((N, n_features))
    y_afo = np.zeros((N, n_features))
    y_suresh = np.zeros((N, n_features))
    y_sens = np.zeros((N, n_features))
    y_lime = np.zeros((N, n_features))

    for n, file in enumerate(filelist):
        with open(file, 'rb') as f:
            arr = pkl.load(f)

        y_ffc[n, :] = arr['FFC']['imp'].sum(1)
        y_afo[n, :] = arr['AFO']['imp'].sum(1)
        y_suresh[n, :] = arr['Suresh_et_al']['imp'].sum(1)
        y_sens[n, :] = arr['Sens']['imp'][:len(arr['FFC']['imp']), 1:].sum(1)
        y_lime[n, :] = parse_lime_results(arr, Tt, n_features,
                                          data=data).sum(1)

    y_rank_ffc = np.flip(np.argsort(
        y_ffc.sum(0)).flatten())  # sorted in order of relevance
    y_rank_afo = np.flip(np.argsort(
        y_afo.sum(0)).flatten())  # sorted in order of relevance
    y_rank_suresh = np.flip(np.argsort(
        y_suresh.sum(0)).flatten())  # sorted in order of relevance
    y_rank_sens = np.flip(np.argsort(
        y_sens.sum(0)).flatten())  # sorted in order of relevance
    y_rank_lime = np.flip(np.argsort(
        y_lime.sum(0)).flatten())  # sorted in order of relevance
    ranked_features = {
        'ffc': y_rank_ffc,
        'afo': y_rank_afo,
        'suresh': y_rank_suresh,
        'sens': y_rank_sens,
        'lime': y_rank_lime
    }

    with open('config.json') as config_file:
        configs = json.load(config_file)[data][experiment]

    methods = ranked_features.keys()

    for m in methods:
        print('Experiment with the %d most relevant features: %s' % (n_features_to_use, m))
        feature_rank = ranked_features[m]

        for ff in [n_features_to_use]:
            features = feature_rank[:ff]
            print('using features', features)

            if data == 'mimic':
                p_data, train_loader, valid_loader, test_loader = load_data(
                    batch_size=configs['batch_size'],
                    path='./data',
                    features=features)
                feature_size = p_data.feature_size
            elif data == 'ghg':
                p_data, train_loader, valid_loader, test_loader = load_ghg_data(
                    configs['batch_size'], features=features)
                feature_size = p_data.feature_size
                print(feature_size)
            elif data == 'simulation_spike':
                p_data, train_loader, valid_loader, test_loader = load_simulated_data(
                    batch_size=configs['batch_size'],
                    path='./data_generator/data/simulated_data',
                    data_type='spike',
                    features=features)
                feature_size = p_data.shape[1]

            elif data == 'simulation':
                p_data, train_loader, valid_loader, test_loader = load_simulated_data(
                    batch_size=configs['batch_size'],
                    path='./data/simulated_data',
                    features=features)
                feature_size = p_data.shape[1]

            if data == 'simulation_spike':
                data = 'simulation'
                spike_data = True
            else:
                spike_data = False

            print('training on ', feature_size, ' features!')

            exp = EncoderPredictor(train_loader,
                                   valid_loader,
                                   test_loader,
                                   feature_size,
                                   configs['encoding_size'],
                                   rnn_type=configs['rnn_type'],
                                   data=data)
            exp.run(train=train, n_epochs=configs['n_epochs'])

    n_features_to_remove = 10  # add/remove the same number of features for now
    #Exp 1 remove and evaluate
    for m in methods:
        print('Experiment for removing features using method: ', m)
        feature_rank = ranked_features[m]

        #for ff in range(min(n_features-1,n_features_to_remove)):
        for ff in [n_features_to_remove]:
            features = [
                elem for elem in list(range(n_features))
                if elem not in feature_rank[:ff]
            ]
            #print('using features:', features)

            if data == 'mimic':
                p_data, train_loader, valid_loader, test_loader = load_data(
                    batch_size=configs['batch_size'],
                    path='./data',
                    features=features)
                feature_size = p_data.feature_size
            elif data == 'ghg':
                p_data, train_loader, valid_loader, test_loader = load_ghg_data(
                    configs['batch_size'], features=features)
                feature_size = p_data.feature_size
                print(feature_size)
            elif data == 'simulation_spike':
                p_data, train_loader, valid_loader, test_loader = load_simulated_data(
                    batch_size=configs['batch_size'],
                    path='./data_generator/data/simulated_data',
                    data_type='spike',
                    features=features)
                feature_size = p_data.shape[1]

            elif data == 'simulation':
                p_data, train_loader, valid_loader, test_loader = load_simulated_data(
                    batch_size=configs['batch_size'],
                    path='./data/simulated_data',
                    features=features)
                feature_size = p_data.shape[1]

            if data == 'simulation_spike':
                data = 'simulation'
                spike_data = True
            else:
                spike_data = False

            print('training on ', feature_size, ' features!')

            exp = EncoderPredictor(train_loader,
                                   valid_loader,
                                   test_loader,
                                   feature_size,
                                   configs['encoding_size'],
                                   rnn_type=configs['rnn_type'],
                                   data=data)
            exp.run(train=train, n_epochs=configs['n_epochs'])
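The global-importance loop above expects each results_*.pkl file to hold a dictionary of per-method importance matrices of shape (n_features, T). A small sketch writing a compatible dummy file with synthetic values; the key names mirror what the loop reads, and the LIME entries consumed by parse_lime_results are not shown here:

import pickle as pkl
import numpy as np

n_features, Tt = 3, 48                      # synthetic sizes, for illustration
imp = np.abs(np.random.randn(n_features, Tt))
results = {
    'FFC': {'imp': imp},
    'AFO': {'imp': imp},
    'Suresh_et_al': {'imp': imp},
    # 'Sens' carries one extra leading column, which the script drops with [:, 1:]
    'Sens': {'imp': np.abs(np.random.randn(n_features, Tt + 1))},
}
with open('results_0.pkl', 'wb') as f:
    pkl.dump(results, f)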
Example #5
def main(experiment, train, uncertainty_score, data, generator_type):
    print('********** Experiment with the %s data **********' % (experiment))
    with open('config.json') as config_file:
        configs = json.load(config_file)[data][experiment]

    if data == 'mimic':
        p_data, train_loader, valid_loader, test_loader = load_data(
            batch_size=configs['batch_size'], path='./data')
        feature_size = p_data.feature_size
    elif data == 'ghg':
        p_data, train_loader, valid_loader, test_loader = load_ghg_data(
            configs['batch_size'])
        feature_size = p_data.feature_size
    elif data == 'simulation_spike':
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=configs['batch_size'],
            path='./data_generator/data/simulated_data',
            data_type='spike')
        feature_size = p_data.shape[1]

    elif data == 'simulation':
        p_data, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=configs['batch_size'], path='./data/simulated_data')
        feature_size = p_data.shape[1]

    if data == 'simulation_spike':
        data = 'simulation'
        spike_data = True
    else:
        spike_data = False

    if experiment == 'baseline':
        exp = Baseline(train_loader, valid_loader, test_loader, feature_size)
    elif experiment == 'risk_predictor':
        exp = EncoderPredictor(train_loader,
                               valid_loader,
                               test_loader,
                               feature_size,
                               configs['encoding_size'],
                               rnn_type=configs['rnn_type'],
                               data=data)
    elif experiment == 'feature_generator_explainer':
        #print(spike_data)
        exp = FeatureGeneratorExplainer(
            train_loader,
            valid_loader,
            test_loader,
            feature_size,
            patient_data=p_data,
            generator_hidden_size=configs['encoding_size'],
            prediction_size=1,
            historical=(configs['historical'] == 1),
            generator_type=generator_type,
            data=data,
            experiment=experiment + '_' + generator_type,
            spike_data=spike_data)
    elif experiment == 'lime_explainer':
        exp = BaselineExplainer(train_loader,
                                valid_loader,
                                test_loader,
                                feature_size,
                                data_class=p_data,
                                data=data,
                                baseline_method='lime')

    exp.run(train=train,
            n_epochs=configs['n_epochs'],
            samples_to_analyze=samples_to_analyze[data])
    #exp.final_reported_plots(samples_to_analyze=samples_to_analyze[data])

    # For MIMIC experiment, extract population level importance for interventions
    # print('********** Extracting population level intervention statistics **********')
    # if data == 'mimic' and experiment == 'feature_generator_explainer':
    #     for id in range(len(intervention_list)):
    #         if not os.path.exists("./interventions/int_%d.pkl" % (id)):
    #             exp.summary_stat(id)
    #         exp.plot_summary_stat(id)

    if uncertainty_score:
        # Evaluate output uncertainty using deep KNN method
        print('\n********** Uncertainty Evaluation: **********')
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        sample_ind = 1
        n_nearest_neighbors = 10
        dknn = DeepKnn(exp.model,
                       p_data.train_data[0:int(0.8 * p_data.n_train), :, :],
                       p_data.train_label[0:int(0.8 * p_data.n_train)], device)
        knn_labels = dknn.evaluate_confidence(
            sample=p_data.test_data[sample_ind, :, :].reshape((1, -1, 48)),
            sample_label=p_data.test_label[sample_ind],
            _nearest_neighbors=n_nearest_neighbors,
            verbose=True)
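The DeepKnn step above scores a prediction by checking how the nearest training points, measured in the model's embedding space, are labelled. A simplified, self-contained sketch of that idea follows; it illustrates deep-KNN confidence in general and is not the repository's DeepKnn class, and embed is an assumed embedding function:

import torch

def knn_confidence(embed, train_x, train_y, sample, predicted_label, k=10):
    # Fraction of the k nearest training points (in embedding space)
    # whose label agrees with the model's prediction for `sample`.
    with torch.no_grad():
        train_emb = embed(train_x)                  # (n_train, d)
        sample_emb = embed(sample)                  # (1, d)
        dists = torch.cdist(sample_emb, train_emb).squeeze(0)
        knn_idx = torch.topk(dists, k, largest=False).indices
        return (train_y[knn_idx] == predicted_label).float().mean().item()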
Example #6
def main(args):
    if args.data == 'simulation':
        feature_size = 3
        data_path = './data/simulated_data'
        data_type = 'state'
    elif args.data == 'simulation_l2x':
        feature_size = 3
        data_path = './data/simulated_data_l2x'
        data_type = 'state'
    elif args.data == 'simulation_spike':
        feature_size = 3
        data_path = './data/simulated_spike_data'
        data_type = 'spike'
    elif args.data == 'mimic':
        data_type = 'mimic'
        timeseries_feature_size = len(feature_map_mimic)

    # Load data
    if args.data == 'mimic':
        p_data, train_loader, valid_loader, test_loader = load_data(
            batch_size=100, path='./data', cv=args.cv)
        feature_size = p_data.feature_size
    else:
        _, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=100,
            datapath=data_path,
            percentage=0.8,
            data_type=data_type)

    model = StateClassifier(feature_size=feature_size,
                            n_state=2,
                            hidden_size=200)

    if args.explainer == 'fit':
        generator = JointFeatureGenerator(feature_size,
                                          hidden_size=feature_size * 3,
                                          data=args.data)
        generator.load_state_dict(
            torch.load(
                os.path.join('./ckpt/%s/%s.pt' %
                             (args.data, 'joint_generator'))))

    testset = [smpl[0] for smpl in test_loader.dataset]
    samples = torch.stack(
        [testset[sample] for sample in samples_to_analyze[args.data]])

    model.load_state_dict(
        torch.load(os.path.join('./ckpt/%s/%s.pt' % (args.data, 'model'))))
    if args.explainer == 'fit':
        explainer = FITExplainer(model, generator)
    elif args.explainer == 'integrated_gradient':
        explainer = IGExplainer(model)
    elif args.explainer == 'deep_lift':
        explainer = DeepLiftExplainer(model)
    elif args.explainer == 'fo':
        explainer = FOExplainer(model)
    elif args.explainer == 'afo':
        explainer = AFOExplainer(model, train_loader)
    elif args.explainer == 'gradient_shap':
        explainer = GradientShapExplainer(model)
    elif args.explainer == 'retain':
        model = RETAIN(dim_input=feature_size,
                       dim_emb=128,
                       dropout_emb=0.4,
                       dim_alpha=8,
                       dim_beta=8,
                       dropout_context=0.4,
                       dim_output=2)
        explainer = RETAINexplainer(model, args.data)
        model.load_state_dict(
            torch.load(os.path.join('./ckpt/%s/%s.pt' %
                                    (args.data, 'retain'))))
    gt_importance = explainer.attribute(samples, torch.zeros(samples.shape))

    for ratio in [.2, .4, .6, .8, 1.]:
        # Randomize the first `ratio` fraction of every parameter tensor
        for param in model.parameters():
            params = param.data.cpu().numpy().reshape(-1)
            n_rand = int(ratio * len(params))
            params[:n_rand] = np.random.randn(n_rand)
            param.data = torch.Tensor(params.reshape(param.data.shape))
        if args.explainer == 'fit':
            explainer = FITExplainer(model, generator)
        elif args.explainer == 'integrated_gradient':
            explainer = IGExplainer(model)
        elif args.explainer == 'deep_lift':
            explainer = DeepLiftExplainer(model)
        elif args.explainer == 'fo':
            explainer = FOExplainer(model)
        elif args.explainer == 'afo':
            explainer = AFOExplainer(model, train_loader)
        elif args.explainer == 'gradient_shap':
            explainer = GradientShapExplainer(model)
        elif args.explainer == 'retain':
            model = RETAIN(dim_input=feature_size,
                           dim_emb=128,
                           dropout_emb=0.4,
                           dim_alpha=8,
                           dim_beta=8,
                           dropout_context=0.4,
                           dim_output=2)
            explainer = RETAINexplainer(model, args.data)
            model.load_state_dict(
                torch.load(
                    os.path.join('./ckpt/%s/%s.pt' % (args.data, 'retain'))))

        score = explainer.attribute(samples, torch.zeros(samples.shape))
        corr = []
        for sig in range(len(score)):
            corr.append(
                abs(
                    spearmanr(score[sig].reshape(-1, ),
                              gt_importance[sig].reshape(-1, ),
                              nan_policy='omit')[0]))
        print("correlation for %d percent randomization: %.3f +- %.3f" %
              (100 * ratio, np.mean(corr), np.std(corr)))
    if not os.path.exists(plot_path):
        os.mkdir(plot_path)

    # Load data
    if args.data == 'mimic' or args.data == 'mimic_int':
        if args.mimic_path is None:
            raise ValueError(
                'Specify the data directory containing processed mimic data')
        p_data, train_loader, valid_loader, test_loader = load_data(
            batch_size=batch_size, path=args.mimic_path, task=task, cv=args.cv)
        feature_size = p_data.feature_size
        class_weight = p_data.pos_weight
    else:
        _, train_loader, valid_loader, test_loader = load_simulated_data(
            batch_size=batch_size,
            datapath=data_path,
            percentage=0.8,
            data_type=data_type,
            cv=args.cv)

    # Prepare model to explain
    if args.explainer == 'retain':
        if args.data == 'mimic' or args.data == 'simulation' or args.data == 'simulation_l2x':
            model = RETAIN(dim_input=feature_size,
                           dim_emb=128,
                           dropout_emb=0.4,
                           dim_alpha=8,
                           dim_beta=8,
                           dropout_context=0.4,
                           dim_output=2)
        elif args.data == 'mimic_int':
            model = RETAIN(dim_input=feature_size,