Example #1
    def __init__(self,
                 trainset,
                 testset,
                 net,
                 _id=None,
                 # TODO: multiple GPU
                 ):

        self.trainset = trainset
        if self.trainset is not None:
            self.trainloader = torch.utils.data.DataLoader(dataset=self.trainset, batch_size=4,
                                                           shuffle=True, num_workers=2)
        self.testset = testset
        if self.testset is not None:
            self.testloader = torch.utils.data.DataLoader(dataset=self.testset, batch_size=4,
                                                          shuffle=False, num_workers=2)

        self.net = net
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.SGD(self.net.parameters(), lr=0.001, momentum=0.9)

        assert _id is not None
        self._id = _id  # TODO: assert error, global_id

        self.PATH = "clients/" + str(self._id) + "/"  # TODO: the other PATH for log
        recursive_mkdir(self.PATH)
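Every example in this section calls a recursive_mkdir helper that is not shown. A minimal sketch of what such a helper likely does, assuming it is just a thin wrapper around os.makedirs that creates any missing parent directories and tolerates an already-existing path:

import os

def recursive_mkdir(path):
    # Create the directory and any missing parents; an existing directory is not an error.
    os.makedirs(path, exist_ok=True)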
Example #2
def calc_and_plot(data, optimizer, norm, lr, epochs, iteration, batch_size):
    data_dir = f'{batch_size}-weights-{optimizer}-{lr}-{norm}'
    calc(data, optimizer, norm, lr, epochs, iteration, batch_size)
    plot_parent_dir = './plots'
    for i in range(4):  # one stored pickle and one plot per layer index
        data_path = f'{data_dir}/store-{i}.pkl'
        plot_path = f'{plot_parent_dir}/{data_dir}/layer{i}'
        recursive_mkdir(plot_path)
        plot(plot_path, data_path)
Example #3
def main():
    iteration = 0
    for data in configs['data']:
        for norm in configs['norm']:
            for optimizer in configs['optimizer']:
                for lr in configs['lr']:
                    iteration += 1
                    run_dir = f'{data}-weights-{optimizer}-{lr}-{norm}'
                    data_dir = f'{run_dir}/store.pkl'
                    plot_dir = f'./plots/{run_dir}'
                    calc(data, optimizer, norm, lr, epochs, iteration)
                    recursive_mkdir(plot_dir)
                    plot(plot_dir, data_dir)
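main() above iterates over module-level configs and epochs objects that are defined elsewhere. A hypothetical configuration matching the keys the loops expect (the specific values here are illustrative, not from the source):

# Hypothetical module-level settings assumed by main() above.
epochs = 10
configs = {
    'data': ['mnist', 'cifar10'],
    'norm': ['batch', 'layer'],
    'optimizer': ['sgd', 'adam'],
    'lr': [0.1, 0.01],
}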
Example #4
def main(args):
    print('available GPUs:', K.tensorflow_backend._get_available_gpus())
    # create checkpoint
    ck_path = './checkpoints/' + args.exp_nm
    if not os.path.exists(ck_path):
        recursive_mkdir(ck_path)
    all_res_path = os.path.join(ck_path, 'result_summary.csv')

    split_index_dict = get_split_index_dict()

    for fold in range(num_folds):
        log_dir = os.path.join(ck_path,
                               'fold_' + str(fold) + '/')  #skip if exists
        if not os.path.exists(log_dir):
            os.mkdir(log_dir)
        else:
            continue
        train_patient_indexes = split_index_dict[str(
            fold)]['train_patient_indexes']
        val_patient_indexes = split_index_dict[str(
            fold)]['val_patient_indexes']
        # for each fold, train & validate the model; train() returns the fold's
        # mean scores as a dictionary
        fold_mean_score = train(
            log_dir=log_dir,
            fold=fold,
            train_patient_indexes=train_patient_indexes,  # TODO
            val_patient_indexes=val_patient_indexes,
            data_file_path=args.data_file_path)

        fold_mean_score['fold'] = fold
        res_df = pd.DataFrame.from_dict(fold_mean_score)
        write_header = not os.path.exists(all_res_path)  # write header only on first write
        res_df.to_csv(all_res_path, mode='a', index=False, header=write_header)

    # calculate average score
    print('Final score from', num_folds, 'folds of cross-validation saved to',
          all_res_path)
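main(args) above reads only args.exp_nm and args.data_file_path. A minimal argparse entry point consistent with that usage (the argument names are taken from the code; everything else is an assumption):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_nm', type=str, required=True,
                        help='experiment name; used to name the checkpoint directory')
    parser.add_argument('--data_file_path', type=str, required=True,
                        help='path to the data file passed through to train()')
    main(parser.parse_args())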
Example #5
def run(data_fn,
        prop_missing=0.,
        max_num_feature=-1,
        feature_selection='random',
        k=10,
        data_dir='_data',
        out_dir='_out'):
    """Run RIDDLE classification interpretation pipeline.

    Arguments:
        data_fn: string
            data file filename
        prop_missing: float
            proportion of feature observations which should be randomly masked;
            values in [0, 1)
        max_num_feature: int
            maximum number of features to use
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k: int
            number of partitions for k-fold cross-validation
        data_dir: string
            directory where data files are located
        out_dir: string
            outer directory where outputs (e.g., results) should be saved
    """
    from keras.models import load_model
    from riddle import emr, feature_importance
    from riddle.models import MLP

    start = time.time()

    base_out_dir = get_base_out_dir(out_dir, 'riddle', data_fn, prop_missing,
                                    max_num_feature, feature_selection)
    recursive_mkdir(base_out_dir)

    # get common data
    x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict, perm_indices = (
        get_preprocessed_data(data_dir, data_fn, prop_missing=prop_missing))
    num_feature = len(idx_feat_dict)
    num_class = len(idx_class_dict)

    list_sums_D, list_sums_D2, list_sums_contribs = [], [], []

    for k_idx in range(k):
        full_out_dir = '{}/k_idx={}'.format(base_out_dir, k_idx)
        print('\nPartition k = {}'.format(k_idx))
        x_train_unvec, y_train, _, _, x_test_unvec, y_test = emr.get_k_fold_partition(
            x_unvec, y, k_idx=k_idx, k=k, perm_indices=perm_indices)

        if max_num_feature > 0:  # select features and re-encode
            feat_encoding_dict, idx_feat_dict = select_features(
                x_train_unvec,
                y_train,
                idx_feat_dict,
                method=feature_selection,
                num_feature=num_feature,
                max_num_feature=max_num_feature)
            x_test_unvec = subset_reencode_features(x_test_unvec,
                                                    feat_encoding_dict)
            num_feature = max_num_feature

        # interpret the trained model for this partition

        temp_mlp = MLP(num_feature=num_feature, num_class=num_class)
        hdf5_path = full_out_dir + '/model.h5'
        sums_D, sums_D2, sums_contribs, pairs = \
            feature_importance.get_diff_sums(
                hdf5_path,
                x_test_unvec,
                process_x_func=temp_mlp.process_x,
                num_feature=num_feature,
                num_class=num_class)

        with open(full_out_dir + '/sums_D.pkl', 'wb') as f:
            pickle.dump(sums_D, f)
        with open(full_out_dir + '/sums_D2.pkl', 'wb') as f:
            pickle.dump(sums_D2, f)
        with open(full_out_dir + '/sums_contribs.pkl', 'wb') as f:
            pickle.dump(sums_contribs, f)

        list_sums_D.append(sums_D)
        list_sums_D2.append(sums_D2)
        list_sums_contribs.append(sums_contribs)

    def compute_total_sums(list_sums):
        total_sums = list_sums[0]

        for i in range(1, len(list_sums)):
            for j in range(len(total_sums)):
                total_sums[j] = np.add(total_sums[j], list_sums[i][j])

        return total_sums

    total_sums_D = compute_total_sums(list_sums_D)
    total_sums_D2 = compute_total_sums(list_sums_D2)
    total_sums_contribs = compute_total_sums(list_sums_contribs)

    num_sample = len(x_unvec)
    run_interpretation_summary(x_unvec,
                               y,
                               total_sums_D,
                               total_sums_D2,
                               total_sums_contribs,
                               idx_feat_dict=idx_feat_dict,
                               idx_class_dict=idx_class_dict,
                               icd9_descript_dict=icd9_descript_dict,
                               pairs=pairs,
                               num_sample=num_sample,
                               full_out_dir=base_out_dir)

    print('Computed DeepLIFT scores and analysis in {:.4f} seconds'.format(
        time.time() - start))
    print('-' * 72)
    print()
Example #6
def run_kfold(data_fn,
              method='logit',
              prop_missing=0.,
              max_num_feature=-1,
              feature_selection='random',
              k=10,
              max_num_sample=10000,
              num_search=30,
              data_dir='_data',
              cache_dir='_cache',
              force_run=False):
    """Run several parameter searches a la k-fold cross-validation.

    Arguments:
        data_fn: string
            data file filename
        method: string
            name of classification method; values = {'logit', 'random_forest',
            'linear_svm', 'poly_svm', 'rbf_svm', 'gbdt', 'riddle'}
        prop_missing: float
            proportion of feature observations which should be randomly masked;
            values in [0, 1)
        max_num_feature: int
            maximum number of features to use
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k: int
            number of partitions for k-fold cross-validation
        max_num_sample: int
            maximum number of samples to use
        num_search: int
            number of searches (parameter configurations) to try for each
            partition
        data_dir: string
            directory where data files are located
        cache_dir: string
            directory where cached files (e.g., saved parameters) are located
        force_run: bool
            whether to redo the parameter search even if cached parameters
            already exist
    """
    if 'debug' in data_fn:
        num_search = 3

    # check if already did param search, if so, skip
    param_path = get_param_path(cache_dir, method, data_fn, prop_missing,
                                max_num_feature, feature_selection)
    if not force_run and os.path.isfile(param_path):
        warnings.warn(
            'Already did search for {}, skipping the search'.format(method))
        return

    x_unvec, y, idx_feat_dict, idx_class_dict, _, perm_indices = (
        get_preprocessed_data(data_dir, data_fn, prop_missing=prop_missing))
    num_feature = len(idx_feat_dict)
    num_class = len(idx_class_dict)
    params = {}
    for k_idx in range(0, k):
        params[k_idx] = run(method,
                            x_unvec,
                            y,
                            idx_feat_dict,
                            num_feature=num_feature,
                            max_num_feature=max_num_feature,
                            num_class=num_class,
                            max_num_sample=max_num_sample,
                            feature_selection=feature_selection,
                            k_idx=k_idx,
                            k=k,
                            num_search=num_search,
                            perm_indices=perm_indices)

    recursive_mkdir(cache_dir)  # make sure the cache directory exists before saving
    with open(param_path, 'wb') as f:  # save
        pickle.dump(params, f)

    print('Finished parameter search for method: {}'.format(method))
Example #7
def run_kfold(data_fn,
              method='logit',
              prop_missing=0.,
              max_num_feature=-1,
              feature_selection='random',
              k=10,
              which_half='both',
              data_dir='_data',
              cache_dir='_cache',
              out_dir='_out'):
    """Run several classification pipelines a la k-fold cross-validation.

    Arguments:
        data_fn: string
            data file filename
        method: string
            name of classification method; values = {'logit', 'random_forest',
            'linear_svm', 'poly_svm', 'rbf_svm', 'gbdt'}
        prop_missing: float
            proportion of feature observations which should be randomly masked;
            values in [0, 1)
        max_num_feature: int
            maximum number of features to use
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k: int
            number of partitions for k-fold cross-validation
        which_half: str
            which half of experiments to do; values = {'first', 'last', 'both'}
        data_dir: string
            directory where data files are located
        cache_dir: string
            directory where cached files (e.g., saved parameters) are located
        out_dir: string
            directory where outputs (e.g., results) should be saved
    """
    start = time.time()

    # load saved parameters
    param_path = get_param_path(cache_dir, method, data_fn, prop_missing,
                                max_num_feature, feature_selection)
    try:
        with open(param_path, 'rb') as f:
            params = pickle.load(f)
    except Exception:
        warnings.warn('Cannot load parameters from: {}\n'.format(param_path) +
                      'Need to do parameter search; run parameter_search.py')
        raise

    # TODO(jisungkim) handle binary and multiclass separately, don't assume
    # multiclass!
    if method == 'logit':
        from sklearn.linear_model import LogisticRegression as ModelClass
        init_args = {'multi_class': 'multinomial', 'solver': 'lbfgs'}
    elif method == 'random_forest':
        from sklearn.ensemble import RandomForestClassifier as ModelClass
        init_args = {}
    elif method == 'linear_svm':
        from sklearn.svm import SVC as ModelClass
        # remark: due to a bug in scikit-learn / libsvm, the sparse 'linear'
        # kernel is much slower than the sparse 'poly' kernel, so we use
        # the 'poly' kernel with degree=1 over the 'linear' kernel
        init_args = {
            'kernel': 'poly',
            'degree': 1,
            'coef0': 0.,
            'gamma': 1.,
            'probability': True,
            'cache_size': 1000
        }
    elif method == 'poly_svm':
        from sklearn.svm import SVC as ModelClass
        init_args = {'kernel': 'poly', 'probability': True, 'cache_size': 1000}
    elif method == 'rbf_svm':
        from sklearn.svm import SVC as ModelClass
        init_args = {'kernel': 'rbf', 'probability': True, 'cache_size': 1000}
    elif method == 'gbdt':
        from xgboost import XGBClassifier as ModelClass
        init_args = {'objective': 'multi:softprob'}
    else:
        raise ValueError('unknown method: {}'.format(method))

    x_unvec, y, idx_feat_dict, idx_class_dict, _, perm_indices = (
        get_preprocessed_data(data_dir, data_fn, prop_missing=prop_missing))
    num_feature = len(idx_feat_dict)
    num_class = len(idx_class_dict)

    base_out_dir = get_base_out_dir(out_dir, method, data_fn, prop_missing,
                                    max_num_feature, feature_selection)
    recursive_mkdir(base_out_dir)

    if which_half == 'both':
        loop = range(0, k)
    elif which_half == 'first':
        loop = range(0, k // 2)
    elif which_half == 'last':
        loop = range(k // 2, k)
    else:
        raise ValueError('Unknown which_half: {}'.format(which_half))

    for k_idx in loop:
        sub_out_dir = '{}/k_idx={}'.format(base_out_dir, k_idx)
        recursive_mkdir(sub_out_dir)

        run(ModelClass,
            x_unvec,
            y,
            idx_feat_dict,
            num_feature=num_feature,
            max_num_feature=max_num_feature,
            num_class=num_class,
            feature_selection=feature_selection,
            k_idx=k_idx,
            k=k,
            params=params,
            perm_indices=perm_indices,
            init_args=init_args,
            full_out_dir=sub_out_dir)

    print('This k-fold {} multipipeline run script took {:.4f} seconds'.format(
        method,
        time.time() - start))
Example #8
def run_kfold(data_fn, prop_missing=0., max_num_feature=-1,
              feature_selection='random', k=10, which_half='both',
              data_dir='_data', cache_dir='_cache', out_dir='_out'):
    """Run several RIDDLE classification pipelines a la k-fold cross-validation.

    Arguments:
        data_fn: string
            data file filename
        prop_missing: float
            proportion of feature observations which should be randomly masked;
            values in [0, 1)
        max_num_feature: int
            maximum number of features to use
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k: int
            number of partitions for k-fold cross-validation
        which_half: str
            which half of experiments to do; values = {'first', 'last', 'both'}
        data_dir: string
            directory where data files are located
        cache_dir: string
            directory where cached files (e.g., saved parameters) are located
        out_dir: string
            outer directory where outputs (e.g., results) should be saved
    """
    start = time.time()

    base_out_dir = get_base_out_dir(out_dir, 'riddle', data_fn, prop_missing,
                                    max_num_feature, feature_selection)
    recursive_mkdir(base_out_dir)

    # get common data
    x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict, perm_indices = (
        get_preprocessed_data(data_dir, data_fn, prop_missing=prop_missing))
    num_feature = len(idx_feat_dict)
    num_class = len(idx_class_dict)

    # print/save index-sorted mappings of classes and features
    class_mapping = sorted(idx_class_dict.items(), key=lambda item: item[0])
    with open(base_out_dir + '/class_mapping.txt', 'w') as f:
        print(class_mapping, file=f)
    with open(base_out_dir + '/feature_mapping.txt', 'w') as f:
        for idx, feat in idx_feat_dict.items():
            f.write('{}\t{}\n'.format(idx, feat))

    # load saved parameters
    param_path = get_param_path(cache_dir, 'riddle', data_fn, prop_missing,
                                max_num_feature, feature_selection)
    try:
        with open(param_path, 'rb') as f:
            params = pickle.load(f)

        # for legacy compatibility, rename old-style hyperparameter keys
        new_params = {}
        for k_idx, param in params.items():
            if 'nb_hidden_layers' in param:
                param['num_hidden_layer'] = param.pop('nb_hidden_layers')
            if 'nb_hidden_nodes' in param:
                param['num_hidden_node'] = param.pop('nb_hidden_nodes')
            new_params[k_idx] = param
        params = new_params

    except Exception:
        warnings.warn('Cannot load parameters from: {}\n'.format(param_path) +
                      'Need to do parameter search; run parameter_search.py')
        raise

    if which_half == 'both':
        loop = range(0, k)
    elif which_half == 'first':
        loop = range(0, k // 2)
    elif which_half == 'last':
        loop = range(k // 2, k)
    else:
        raise ValueError('Unknown which_half: {}'.format(which_half))

    for k_idx in loop:
        sub_out_dir = '{}/k_idx={}'.format(base_out_dir, k_idx)
        recursive_mkdir(sub_out_dir)

        run(x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict,
            num_feature=num_feature, max_num_feature=max_num_feature,
            num_class=num_class, feature_selection=feature_selection,
            k_idx=k_idx, k=k, params=params, perm_indices=perm_indices,
            full_out_dir=sub_out_dir)

    print('This k-fold riddle multipipeline run script took {:.4f} seconds'
          .format(time.time() - start))