コード例 #1
0
    def test_get_k_fold_partition(self):
        """Every sample appears in exactly one test fold across k partitions."""
        num_sample = 100
        x, y = [], []
        for _ in range(num_sample):
            # three random features plus a random class label per sample
            x.append([np.random.randint(11, 50) for _ in range(3)])
            y.append(np.random.randint(0, 10))

        assert len(x) == num_sample
        assert len(y) == num_sample

        k = 10
        perm_indices = np.random.permutation(num_sample)
        seen_test_samples = []
        for k_idx in range(k):
            partition = emr.get_k_fold_partition(x, y, k_idx, k, perm_indices)
            x_train, y_train, x_val, y_val, x_test, y_test = partition

            # the three splits together must account for every sample
            assert len(x_train) + len(x_val) + len(x_test) == num_sample
            assert len(y_train) + len(y_val) + len(y_test) == num_sample

            # and contain exactly the original values (order-insensitive)
            assert sorted(x_train + x_val + x_test) == sorted(x)
            assert sorted(y_train + y_val + y_test) == sorted(y)

            seen_test_samples.extend(x_test)

        # the k test folds jointly cover the whole dataset exactly once
        assert len(seen_test_samples) == num_sample
        assert sorted(seen_test_samples) == sorted(x)
コード例 #2
0
ファイル: other_clf.py プロジェクト: mydp2017/RIDDLE
def run(data_fn,
        method='lrfc',
        which_half='both',
        prop_missing=0.0,
        k=10,
        skip_nonlinear_svm=False,
        nb_searches=20):
    """Evaluate baseline classifiers with k-fold cross-validation.

    Loads best parameters previously found by `parameter_search.py`, trains
    the selected classifier family on each k-fold partition, saves per-
    partition test results, and prints summary metrics.

    Arguments:
        data_fn: string
            data file filename
        method: string
            classifier family; values = {'lrfc', 'svm'}
        which_half: string
            which partitions to run; values = {'first', 'last', 'both'}
        prop_missing: float
            proportion of feature observations randomly masked; in [0, 1)
        k: int
            number of partitions for k-fold cross-validation
        skip_nonlinear_svm: bool
            whether to skip the polynomial and RBF SVMs
        nb_searches: int
            number of parameter searches (unused here; kept for interface
            compatibility with the parameter-search script)
    """
    data_path = '{}/{}'.format(DATA_DIR, data_fn)

    def get_results_dir(method, k_idx):
        # Output folder for one (method, partition) pair; created on demand.
        base_folder = 'out/more/{}_{}_{}'.format(method, data_fn, prop_missing)
        folder = '{}/{}_idx_partition'.format(base_folder, k_idx)

        # makedirs creates all missing intermediate directories, so one call
        # covers 'out', 'out/more', base_folder and folder. (The original
        # code created 'out/models' when 'out/more' was missing -- a bug.)
        if not os.path.exists(folder):
            os.makedirs(folder)

        return folder

    try:  # load saved parameters
        get_param_fn = lambda x: '{}/{}_{}_{}_param.pkl'.format(
            CACHE_DIR, x, data_fn, prop_missing)

        # pickle files must be opened in binary mode under Python 3
        if method == 'lrfc':
            with open(get_param_fn('logit'), 'rb') as f:
                logit_params = pickle.load(f)
            with open(get_param_fn('rfc'), 'rb') as f:
                rfc_params = pickle.load(f)
        elif method == 'svm':
            with open(get_param_fn('linear-svm'), 'rb') as f:
                linear_svm_params = pickle.load(f)
            if not skip_nonlinear_svm:
                with open(get_param_fn('poly-svm'), 'rb') as f:
                    poly_svm_params = pickle.load(f)
                with open(get_param_fn('rbf-svm'), 'rb') as f:
                    rbf_svm_params = pickle.load(f)
        else:
            raise ValueError('unknown method: {}'.format(method))
    except Exception:
        eprint('Need to do parameter search!')
        eprint('Please run `parameter_search.py` with the relevant '
               'command line arguments')
        raise

    X, y, perm_indices, nb_features, nb_classes = get_base_data(
        data_path, prop_missing)

    # per-classifier metric accumulators; one entry per evaluated partition
    clf_names = ['logit', 'rfc', 'linear-svm', 'poly-svm', 'rbf-svm']
    losses = {name: [] for name in clf_names}
    accs = {name: [] for name in clf_names}
    runtimes = {name: [] for name in clf_names}

    # integer division: `k / 2` yields a float and breaks range() in Python 3
    if which_half == 'first':
        loop_seq = range(0, k // 2)
    elif which_half == 'last':
        loop_seq = range(k // 2, k)
    elif which_half == 'both':
        loop_seq = range(0, k)
    else:
        raise ValueError(
            '`which_half` must be \'first\', \'last\' or \'both\'')

    for k_idx in loop_seq:
        print('-' * 72)
        print('Partition k = {}'.format(k_idx))

        data_partition_dict = emr.get_k_fold_partition(
            X, y, k_idx=k_idx, k=k, perm_indices=perm_indices)
        X_train = data_partition_dict['X_train']
        y_train = data_partition_dict['y_train']
        X_val = data_partition_dict['X_val']
        y_val = data_partition_dict['y_val']
        X_test = data_partition_dict['X_test']
        y_test = data_partition_dict['y_test']

        # feature selection on train + validation data only (not test)
        selected_feat_indices = select_feats(X_train + X_val,
                                             y_train + y_val,
                                             nb_features=nb_features)

        X_train, y_train = preproc_for_sklearn(X_train, y_train, nb_features)
        X_test, y_test = preproc_for_sklearn(X_test, y_test, nb_features)

        old_nb_features = len(X_train[0])
        X_train = X_train[:, selected_feat_indices]
        X_test = X_test[:, selected_feat_indices]

        # NOTE(review): nb_features is shrunk here and then reused by
        # select_feats/preproc_for_sklearn in later iterations -- confirm
        # this carry-over is intended.
        nb_features = len(X_train[0])  # extraneous but for future utility
        print('Reduced features from {} to {}'.format(old_nb_features,
                                                      nb_features))

        if method == 'lrfc':
            from sklearn.linear_model import LogisticRegression
            from sklearn.ensemble import RandomForestClassifier

            # logistic regression
            start = time.time()
            logit = LogisticRegression(multi_class='multinomial',
                                       solver='lbfgs',
                                       **logit_params[k_idx])
            logit.fit(X_train, y_train)
            logit_acc = accuracy_score(y_test, logit.predict(X_test))
            logit_y_test_proba = logit.predict_proba(X_test)
            logit_loss = log_loss(y_test, logit_y_test_proba)
            logit_time = time.time() - start
            print(
                'Logistic regression / loss: {:.3f} / accuracy: {:.3f} / time: {:.3f} s'
                .format(logit_loss, logit_acc, logit_time))

            # random forest classifier
            start = time.time()
            rfc = RandomForestClassifier(**rfc_params[k_idx])
            rfc.fit(X_train, y_train)
            rfc_acc = accuracy_score(y_test, rfc.predict(X_test))
            rfc_y_test_proba = rfc.predict_proba(X_test)
            rfc_loss = log_loss(y_test, rfc_y_test_proba)
            rfc_time = time.time() - start
            print(
                'Random forest / loss: {:.3f} / accuracy: {:.3f} / time: {:.3f} s'
                .format(rfc_loss, rfc_acc, rfc_time))

            save_test_results(
                logit_y_test_proba, y_test,
                '{}/test_results.txt'.format(get_results_dir('logit', k_idx)))
            save_test_results(
                rfc_y_test_proba, y_test,
                '{}/test_results.txt'.format(get_results_dir('rfc', k_idx)))

            losses['logit'].append(logit_loss)
            accs['logit'].append(logit_acc)
            runtimes['logit'].append(logit_time)

            losses['rfc'].append(rfc_loss)
            accs['rfc'].append(rfc_acc)
            runtimes['rfc'].append(rfc_time)

        elif method == 'svm':
            from sklearn.svm import SVC

            # linear SVM
            start = time.time()
            linear_svm = SVC(kernel='linear',
                             probability=True,
                             **linear_svm_params[k_idx])
            linear_svm.fit(X_train, y_train)
            linear_svm_acc = accuracy_score(y_test, linear_svm.predict(X_test))
            linear_svm_y_test_proba = linear_svm.predict_proba(X_test)
            linear_svm_loss = log_loss(y_test, linear_svm_y_test_proba)
            linear_svm_time = time.time() - start
            print(
                'Linear SVM / accuracy: {:.3f} / loss: {:.3f} / time: {:.3f} s'
                .format(linear_svm_acc, linear_svm_loss, linear_svm_time))

            save_test_results(
                linear_svm_y_test_proba, y_test, '{}/test_results.txt'.format(
                    get_results_dir('linear-svm', k_idx)))

            losses['linear-svm'].append(linear_svm_loss)
            accs['linear-svm'].append(linear_svm_acc)
            runtimes['linear-svm'].append(linear_svm_time)

            if skip_nonlinear_svm: continue  # skip

            # polynomial SVM
            start = time.time()
            poly_svm = SVC(kernel='poly',
                           probability=True,
                           **poly_svm_params[k_idx])
            poly_svm.fit(X_train, y_train)
            poly_svm_acc = accuracy_score(y_test, poly_svm.predict(X_test))
            poly_svm_y_test_proba = poly_svm.predict_proba(X_test)
            poly_svm_loss = log_loss(y_test, poly_svm_y_test_proba)
            poly_svm_time = time.time() - start
            print(
                'Polynomial SVM / accuracy: {:.3f} / loss: {:.3f} / time: {:.3f} s'
                .format(poly_svm_acc, poly_svm_loss, poly_svm_time))

            # RBF SVM
            start = time.time()
            rbf_svm = SVC(kernel='rbf',
                          probability=True,
                          **rbf_svm_params[k_idx])
            rbf_svm.fit(X_train, y_train)
            rbf_svm_acc = accuracy_score(y_test, rbf_svm.predict(X_test))
            rbf_svm_y_test_proba = rbf_svm.predict_proba(X_test)
            rbf_svm_loss = log_loss(y_test, rbf_svm_y_test_proba)
            rbf_svm_time = time.time() - start
            print('RBF SVM / accuracy: {:.3f} / loss: {:.3f} / time: {:.3f} s'.
                  format(rbf_svm_acc, rbf_svm_loss, rbf_svm_time))

            save_test_results(
                poly_svm_y_test_proba, y_test, '{}/test_results.txt'.format(
                    get_results_dir('poly-svm', k_idx)))
            save_test_results(
                rbf_svm_y_test_proba, y_test,
                '{}/test_results.txt'.format(get_results_dir('rbf-svm',
                                                             k_idx)))

            losses['poly-svm'].append(poly_svm_loss)
            accs['poly-svm'].append(poly_svm_acc)
            runtimes['poly-svm'].append(poly_svm_time)

            losses['rbf-svm'].append(rbf_svm_loss)
            accs['rbf-svm'].append(rbf_svm_acc)
            runtimes['rbf-svm'].append(rbf_svm_time)

        else:
            raise ValueError('unknown method: {}'.format(method))

    print()
    print('#' * 72)
    if method == 'lrfc':
        print_metrics(losses['logit'], accs['logit'], runtimes['logit'],
                      'Logistic regression')
        print_metrics(losses['rfc'], accs['rfc'], runtimes['rfc'],
                      'Random forest')
    elif method == 'svm':
        print_metrics(losses['linear-svm'], accs['linear-svm'],
                      runtimes['linear-svm'], 'Linear SVM')
        if not skip_nonlinear_svm:
            print_metrics(losses['poly-svm'], accs['poly-svm'],
                          runtimes['poly-svm'], 'Polynomial SVM')
            print_metrics(losses['rbf-svm'], accs['rbf-svm'],
                          runtimes['rbf-svm'], 'RBF SVM')
    else:
        raise ValueError('unknown method: {}'.format(method))
    print('#' * 72)
コード例 #3
0
def run(data_fn,
        prop_missing=0.,
        max_num_feature=-1,
        feature_selection='random',
        k=10,
        data_dir='_data',
        out_dir='_out'):
    """Run RIDDLE classification interpretation pipeline.

    Computes DeepLIFT-style feature-importance sums for the saved model of
    each k-fold partition, accumulates them across partitions, and writes an
    interpretation summary.

    Arguments:
        data_fn: string
            data file filename
        prop_missing: float
            proportion of feature observations which should be randomly masked;
            values in [0, 1)
        max_num_feature: int
            maximum number of features to use; non-positive means all features
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k: int
            number of partitions for k-fold cross-validation
        data_dir: string
            directory where data files are located
        out_dir: string
            outer directory where outputs (e.g., results) should be saved
    """
    from keras.models import load_model  # noqa: F401 -- kept; removal unverified
    from riddle import emr, feature_importance
    from riddle.models import MLP

    # overall pipeline timer; the original reset `start` inside the loop,
    # which made the final timing report cover only the last partition
    total_start = time.time()

    base_out_dir = get_base_out_dir(out_dir, 'riddle', data_fn, prop_missing,
                                    max_num_feature, feature_selection)
    recursive_mkdir(base_out_dir)

    # get common data
    x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict, perm_indices = (
        get_preprocessed_data(data_dir, data_fn, prop_missing=prop_missing))
    num_feature = len(idx_feat_dict)
    num_class = len(idx_class_dict)

    list_sums_D, list_sums_D2, list_sums_contribs = [], [], []

    for k_idx in range(k):
        full_out_dir = '{}/k_idx={}'.format(base_out_dir, k_idx)
        print('\nPartition k = {}'.format(k_idx))
        x_train_unvec, y_train, _, _, x_test_unvec, y_test = emr.get_k_fold_partition(
            x_unvec, y, k_idx=k_idx, k=k, perm_indices=perm_indices)

        if max_num_feature > 0:  # select features and re-encode
            # NOTE(review): only x_test_unvec is re-encoded; presumably the
            # saved model was trained on the same selection elsewhere --
            # confirm against the training pipeline.
            feat_encoding_dict, idx_feat_dict = select_features(
                x_train_unvec,
                y_train,
                idx_feat_dict,
                method=feature_selection,
                num_feature=num_feature,
                max_num_feature=max_num_feature)
            x_test_unvec = subset_reencode_features(x_test_unvec,
                                                    feat_encoding_dict)
            num_feature = max_num_feature

        # interpret the saved model for this partition
        temp_mlp = MLP(num_feature=num_feature, num_class=num_class)
        hdf5_path = full_out_dir + '/model.h5'
        sums_D, sums_D2, sums_contribs, pairs = \
            feature_importance.get_diff_sums(
                hdf5_path,
                x_test_unvec,
                process_x_func=temp_mlp.process_x,
                num_feature=num_feature,
                num_class=num_class)

        # persist per-partition sums for later inspection
        with open(full_out_dir + '/sums_D.pkl', 'wb') as f:
            pickle.dump(sums_D, f)
        with open(full_out_dir + '/sums_D2.pkl', 'wb') as f:
            pickle.dump(sums_D2, f)
        with open(full_out_dir + '/sums_contribs.pkl', 'wb') as f:
            pickle.dump(sums_contribs, f)

        list_sums_D.append(sums_D)
        list_sums_D2.append(sums_D2)
        list_sums_contribs.append(sums_contribs)

    def compute_total_sums(list_sums):
        # Element-wise accumulation across partitions; reuses (mutates) the
        # first partition's entry as the accumulator.
        total_sums = list_sums[0]

        for i in range(1, len(list_sums)):
            for j in range(len(total_sums)):
                total_sums[j] = np.add(total_sums[j], list_sums[i][j])

        return total_sums

    total_sums_D = compute_total_sums(list_sums_D)
    total_sums_D2 = compute_total_sums(list_sums_D2)
    total_sums_contribs = compute_total_sums(list_sums_contribs)

    # `pairs` is taken from the last partition; assumed identical across
    # partitions -- TODO confirm
    num_sample = len(x_unvec)
    run_interpretation_summary(x_unvec,
                               y,
                               total_sums_D,
                               total_sums_D2,
                               total_sums_contribs,
                               idx_feat_dict=idx_feat_dict,
                               idx_class_dict=idx_class_dict,
                               icd9_descript_dict=icd9_descript_dict,
                               pairs=pairs,
                               num_sample=num_sample,
                               full_out_dir=base_out_dir)

    print('Computed DeepLIFT scores and analysis in {:.4f} seconds'.format(
        time.time() - total_start))
    print('-' * 72)
    print()
コード例 #4
0
def run(method, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
        num_class, max_num_sample, feature_selection, k_idx, k, num_search,
        perm_indices):
    """Run a parameter search for a single k-fold partition.

    Arguments:
        method: string
            name of classification method; values = {'logit', 'random_forest',
            'linear_svm', 'poly_svm', 'rbf_svm', 'gbdt', 'riddle'}
        x_unvec: [[int]]
            feature indices that have not been vectorized; each inner list
            collects the indices of features that are present (binary on)
            for a sample
        y: [int]
            list of class labels as integer indices
        idx_feat_dict: {int: string}
            dictionary mapping feature indices to features
        num_feature: int
            number of features present in the dataset
        max_num_feature: int
            maximum number of features to use
        num_class: int
            number of classes present
        max_num_sample: int or None
            maximum number of validation samples to use; None means no cap
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k_idx: int
            index of the k-fold partition to use
        k: int
            number of partitions for k-fold cross-validation
        num_search: int
            number of searches (parameter configurations) to try
        perm_indices: np.ndarray, int
            array of indices representing a permutation of the samples with
            shape (num_sample, )

    Returns:
        best_param: {string: ?}
            dictionary mapping parameter names to the best values found
    """
    print('-' * 72)
    print('Partition k = {}'.format(k_idx))

    # only the train fold (for feature selection) and validation fold (for
    # the search itself) are needed; the test fold is discarded
    x_train_unvec, y_train, x_val_unvec, y_val, _, _ = (
        emr.get_k_fold_partition(x_unvec,
                                 y,
                                 k_idx=k_idx,
                                 k=k,
                                 perm_indices=perm_indices))

    if max_num_feature > 0:  # select features and re-encode
        feat_encoding_dict, _ = select_features(
            x_train_unvec,
            y_train,
            idx_feat_dict,
            method=feature_selection,
            num_feature=num_feature,
            max_num_feature=max_num_feature)
        x_val_unvec = subset_reencode_features(x_val_unvec, feat_encoding_dict)
        num_feature = max_num_feature

    # cap number of validation samples (identity check, not `!= None`)
    if max_num_sample is not None and len(x_val_unvec) > max_num_sample:
        x_val_unvec = x_val_unvec[0:max_num_sample]
        y_val = y_val[0:max_num_sample]

    start = time.time()
    if method == 'riddle':
        model_class = MLP
        init_args = {'num_feature': num_feature, 'num_class': num_class}
        param_dist = {
            'num_hidden_layer': 2,  # [1, 2]
            'num_hidden_node': 512,  # [128, 256, 512]
            'activation': ['prelu', 'relu'],
            'dropout': tuning.Uniform(lo=0.2, hi=0.8),
            'learning_rate': tuning.UniformLogSpace(10, lo=-6, hi=-1),
        }
        best_param = tuning.random_search(model_class,
                                          init_args,
                                          param_dist,
                                          x_val_unvec,
                                          y_val,
                                          num_class=num_class,
                                          k=TUNING_K,
                                          num_search=num_search)
    else:  # scikit-learn methods
        x_val = vectorize_features(x_val_unvec, num_feature)

        if method == 'logit':  # logistic regression
            from sklearn.linear_model import LogisticRegression
            estimator = LogisticRegression(multi_class='multinomial',
                                           solver='lbfgs')
            param_dist = {'C': tuning.UniformLogSpace(base=10, lo=-3, hi=3)}
        elif method == 'random_forest':
            from sklearn.ensemble import RandomForestClassifier
            estimator = RandomForestClassifier()
            param_dist = {
                'max_features': ['sqrt', 'log2', None],
                'max_depth': tuning.UniformIntegerLogSpace(base=2, lo=0, hi=7),
                'n_estimators': tuning.UniformIntegerLogSpace(base=2,
                                                              lo=4,
                                                              hi=8)
            }
        elif method == 'linear_svm':
            from sklearn.svm import SVC
            # remark: due to a bug in scikit-learn / libsvm, the sparse 'linear'
            # kernel is much slower than the sparse 'poly' kernel, so we use
            # the 'poly' kernel with degree=1 over the 'linear' kernel
            estimator = SVC(kernel='poly',
                            degree=1,
                            coef0=0.,
                            gamma=1.,
                            probability=True,
                            cache_size=1000)
            param_dist = {'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1)}
        elif method == 'poly_svm':
            from sklearn.svm import SVC
            estimator = SVC(kernel='poly', probability=True, cache_size=1000)
            param_dist = {
                'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1),
                'degree': [2, 3, 4],
                'gamma': tuning.UniformLogSpace(base=10, lo=-5, hi=1)
            }
        elif method == 'rbf_svm':
            from sklearn.svm import SVC
            estimator = SVC(kernel='rbf', probability=True, cache_size=1000)
            param_dist = {
                'C': tuning.UniformLogSpace(base=10, lo=-2, hi=1),
                'gamma': tuning.UniformLogSpace(base=10, lo=-5, hi=1)
            }
        elif method == 'gbdt':
            from xgboost import XGBClassifier
            estimator = XGBClassifier(objective='multi:softprob')
            param_dist = {
                'max_depth': tuning.UniformIntegerLogSpace(base=2, lo=0, hi=5),
                'n_estimators': tuning.UniformIntegerLogSpace(base=2,
                                                              lo=4,
                                                              hi=8),
                'learning_rate': tuning.UniformLogSpace(base=10, lo=-3, hi=0)
            }
        else:
            raise ValueError('unknown method: {}'.format(method))

        # refit=False: we only need the best parameters, not a fitted model
        param_search = RandomizedSearchCV(estimator,
                                          param_dist,
                                          refit=False,
                                          n_iter=num_search,
                                          scoring=loss_scorer)
        param_search.fit(x_val, y_val)

        best_param = param_search.best_params_

    print('Best parameters for {} for k_idx={}: {} found in {:.3f} s'.format(
        method, k_idx, best_param,
        time.time() - start))

    return best_param
コード例 #5
0
ファイル: other_clf.py プロジェクト: agoila/RIDDLE
def run(data_fn, method='lrfc', which_half='both', prop_missing=0.0, k=10,
        skip_nonlinear_svm=False, nb_searches=20):
    """Evaluate baseline classifiers with k-fold cross-validation.

    Loads best parameters previously found by `parameter_search.py`, trains
    the selected classifier family on each k-fold partition, saves per-
    partition test results, and prints summary metrics.

    Arguments:
        data_fn: string
            data file filename
        method: string
            classifier family; values = {'lrfc', 'svm'}
        which_half: string
            which partitions to run; values = {'first', 'last', 'both'}
        prop_missing: float
            proportion of feature observations randomly masked; in [0, 1)
        k: int
            number of partitions for k-fold cross-validation
        skip_nonlinear_svm: bool
            whether to skip the polynomial and RBF SVMs
        nb_searches: int
            number of parameter searches (unused here; kept for interface
            compatibility with the parameter-search script)
    """
    data_path = '{}/{}'.format(DATA_DIR, data_fn)

    def get_results_dir(method, k_idx):
        # Output folder for one (method, partition) pair; created on demand.
        base_folder = 'out/more/{}_{}_{}'.format(method, data_fn, prop_missing)
        folder = '{}/{}_idx_partition'.format(base_folder, k_idx)

        # makedirs creates all missing intermediate directories, so one call
        # covers 'out', 'out/more', base_folder and folder. (The original
        # code created 'out/models' when 'out/more' was missing -- a bug.)
        if not os.path.exists(folder):
            os.makedirs(folder)

        return folder

    try:  # load saved parameters
        get_param_fn = lambda x: '{}/{}_{}_{}_param.pkl'.format(
            CACHE_DIR, x, data_fn, prop_missing)

        # pickle files must be opened in binary mode under Python 3
        if method == 'lrfc':
            with open(get_param_fn('logit'), 'rb') as f:
                logit_params = pickle.load(f)
            with open(get_param_fn('rfc'), 'rb') as f:
                rfc_params = pickle.load(f)
        elif method == 'svm':
            with open(get_param_fn('linear-svm'), 'rb') as f:
                linear_svm_params = pickle.load(f)
            if not skip_nonlinear_svm:
                with open(get_param_fn('poly-svm'), 'rb') as f:
                    poly_svm_params = pickle.load(f)
                with open(get_param_fn('rbf-svm'), 'rb') as f:
                    rbf_svm_params = pickle.load(f)
        else:
            raise ValueError('unknown method: {}'.format(method))
    except Exception:
        eprint('Need to do parameter search!')
        eprint('Please run `parameter_search.py` with the relevant '
               'command line arguments')
        raise

    X, y, perm_indices, nb_features, nb_classes = get_base_data(
        data_path, prop_missing)

    # per-classifier metric accumulators; one entry per evaluated partition
    clf_names = ['logit', 'rfc', 'linear-svm', 'poly-svm', 'rbf-svm']
    losses = {name: [] for name in clf_names}
    accs = {name: [] for name in clf_names}
    runtimes = {name: [] for name in clf_names}

    # integer division: `k / 2` yields a float and breaks range() in Python 3
    if which_half == 'first':
        loop_seq = range(0, k // 2)
    elif which_half == 'last':
        loop_seq = range(k // 2, k)
    elif which_half == 'both':
        loop_seq = range(0, k)
    else:
        raise ValueError('`which_half` must be \'first\', \'last\' or \'both\'')

    for k_idx in loop_seq:
        print('-' * 72)
        print('Partition k = {}'.format(k_idx))

        data_partition_dict = emr.get_k_fold_partition(
            X, y, k_idx=k_idx, k=k, perm_indices=perm_indices)
        X_train = data_partition_dict['X_train']
        y_train = data_partition_dict['y_train']
        X_val = data_partition_dict['X_val']
        y_val = data_partition_dict['y_val']
        X_test = data_partition_dict['X_test']
        y_test = data_partition_dict['y_test']

        # feature selection on train + validation data only (not test)
        selected_feat_indices = select_feats(X_train + X_val, y_train + y_val,
                                             nb_features=nb_features)

        X_train, y_train = preproc_for_sklearn(X_train, y_train, nb_features)
        X_test, y_test = preproc_for_sklearn(X_test, y_test, nb_features)

        old_nb_features = len(X_train[0])
        X_train = X_train[:, selected_feat_indices]
        X_test = X_test[:, selected_feat_indices]

        # NOTE(review): nb_features is shrunk here and then reused by
        # select_feats/preproc_for_sklearn in later iterations -- confirm
        # this carry-over is intended.
        nb_features = len(X_train[0])  # extraneous but for future utility
        print('Reduced features from {} to {}'.format(old_nb_features,
                                                      nb_features))

        if method == 'lrfc':
            from sklearn.linear_model import LogisticRegression
            from sklearn.ensemble import RandomForestClassifier

            # logistic regression
            start = time.time()
            logit = LogisticRegression(multi_class='multinomial',
                                       solver='lbfgs',
                                       **logit_params[k_idx])
            logit.fit(X_train, y_train)
            logit_acc = accuracy_score(y_test, logit.predict(X_test))
            logit_y_test_probas = logit.predict_proba(X_test)
            logit_loss = log_loss(y_test, logit_y_test_probas)
            logit_time = time.time() - start
            print('Logistic regression / loss: {:.3f} / accuracy: {:.3f} / time: {:.3f} s'
                  .format(logit_loss, logit_acc, logit_time))

            # random forest classifier
            start = time.time()
            rfc = RandomForestClassifier(**rfc_params[k_idx])
            rfc.fit(X_train, y_train)
            rfc_acc = accuracy_score(y_test, rfc.predict(X_test))
            rfc_y_test_probas = rfc.predict_proba(X_test)
            rfc_loss = log_loss(y_test, rfc_y_test_probas)
            rfc_time = time.time() - start
            print('Random forest / loss: {:.3f} / accuracy: {:.3f} / time: {:.3f} s'
                  .format(rfc_loss, rfc_acc, rfc_time))

            save_test_results(
                logit_y_test_probas, y_test,
                '{}/test_results.txt'.format(get_results_dir('logit', k_idx)))
            save_test_results(
                rfc_y_test_probas, y_test,
                '{}/test_results.txt'.format(get_results_dir('rfc', k_idx)))

            losses['logit'].append(logit_loss)
            accs['logit'].append(logit_acc)
            runtimes['logit'].append(logit_time)

            losses['rfc'].append(rfc_loss)
            accs['rfc'].append(rfc_acc)
            runtimes['rfc'].append(rfc_time)

        elif method == 'svm':
            from sklearn.svm import SVC

            # linear SVM
            start = time.time()
            linear_svm = SVC(kernel='linear', probability=True,
                             **linear_svm_params[k_idx])
            linear_svm.fit(X_train, y_train)
            linear_svm_acc = accuracy_score(y_test, linear_svm.predict(X_test))
            linear_svm_y_test_probas = linear_svm.predict_proba(X_test)
            linear_svm_loss = log_loss(y_test, linear_svm_y_test_probas)
            linear_svm_time = time.time() - start
            print('Linear SVM / accuracy: {:.3f} / loss: {:.3f} / time: {:.3f} s'
                  .format(linear_svm_acc, linear_svm_loss, linear_svm_time))

            save_test_results(
                linear_svm_y_test_probas, y_test,
                '{}/test_results.txt'.format(get_results_dir('linear-svm', k_idx)))

            losses['linear-svm'].append(linear_svm_loss)
            accs['linear-svm'].append(linear_svm_acc)
            runtimes['linear-svm'].append(linear_svm_time)

            if skip_nonlinear_svm: continue  # skip

            # polynomial SVM
            start = time.time()
            poly_svm = SVC(kernel='poly', probability=True,
                           **poly_svm_params[k_idx])
            poly_svm.fit(X_train, y_train)
            poly_svm_acc = accuracy_score(y_test, poly_svm.predict(X_test))
            poly_svm_y_test_probas = poly_svm.predict_proba(X_test)
            poly_svm_loss = log_loss(y_test, poly_svm_y_test_probas)
            poly_svm_time = time.time() - start
            print('Polynomial SVM / accuracy: {:.3f} / loss: {:.3f} / time: {:.3f} s'
                  .format(poly_svm_acc, poly_svm_loss, poly_svm_time))

            # RBF SVM
            start = time.time()
            rbf_svm = SVC(kernel='rbf', probability=True,
                          **rbf_svm_params[k_idx])
            rbf_svm.fit(X_train, y_train)
            rbf_svm_acc = accuracy_score(y_test, rbf_svm.predict(X_test))
            rbf_svm_y_test_probas = rbf_svm.predict_proba(X_test)
            rbf_svm_loss = log_loss(y_test, rbf_svm_y_test_probas)
            rbf_svm_time = time.time() - start
            print('RBF SVM / accuracy: {:.3f} / loss: {:.3f} / time: {:.3f} s'
                  .format(rbf_svm_acc, rbf_svm_loss, rbf_svm_time))

            save_test_results(
                poly_svm_y_test_probas, y_test,
                '{}/test_results.txt'.format(get_results_dir('poly-svm', k_idx)))
            save_test_results(
                rbf_svm_y_test_probas, y_test,
                '{}/test_results.txt'.format(get_results_dir('rbf-svm', k_idx)))

            losses['poly-svm'].append(poly_svm_loss)
            accs['poly-svm'].append(poly_svm_acc)
            runtimes['poly-svm'].append(poly_svm_time)

            losses['rbf-svm'].append(rbf_svm_loss)
            accs['rbf-svm'].append(rbf_svm_acc)
            runtimes['rbf-svm'].append(rbf_svm_time)

        else:
            raise ValueError('unknown method: {}'.format(method))

    print()
    print('#' * 72)
    if method == 'lrfc':
        print_metrics(losses['logit'], accs['logit'], runtimes['logit'],
                      'Logistic regression')
        print_metrics(losses['rfc'], accs['rfc'], runtimes['rfc'],
                      'Random forest')
    elif method == 'svm':
        print_metrics(losses['linear-svm'], accs['linear-svm'],
                      runtimes['linear-svm'], 'Linear SVM')
        if not skip_nonlinear_svm:
            print_metrics(losses['poly-svm'], accs['poly-svm'],
                          runtimes['poly-svm'], 'Polynomial SVM')
            print_metrics(losses['rbf-svm'], accs['rbf-svm'],
                          runtimes['rbf-svm'], 'RBF SVM')
    else:
        raise ValueError('unknown method: {}'.format(method))
    print('#' * 72)
コード例 #6
0
ファイル: parameter_search.py プロジェクト: mydp2017/RIDDLE
def run(data_fn,
        method='lrfc',
        prop_missing=0.0,
        k=10,
        skip_nonlinear_svm=False,
        nb_searches=20,
        max_nb_samples=10000):
    """Run a randomized hyperparameter search for the requested method family.

    For each of the k cross-validation partitions, tunes hyperparameters on
    that partition's validation split and records the best setting found;
    per-partition results are pickled to CACHE_DIR so training runs can
    reuse them.

    Arguments:
        data_fn: string
            filename of the dataset, resolved relative to DATA_DIR
        method: string
            model family to tune; values = {'riddle', 'lrfc', 'svm'}
        prop_missing: float
            proportion of data simulated as missing (forwarded to
            get_base_data)
        k: int
            number of partitions for k-fold cross-validation
        skip_nonlinear_svm: bool
            if True, skip the (slow) polynomial and RBF SVM searches
        nb_searches: int
            number of random hyperparameter settings to evaluate per model
        max_nb_samples: int or None
            cap on the number of validation samples used for the search;
            None disables the cap

    Raises:
        ValueError: if `method` is not one of the supported values
    """
    # use a token search budget when smoke-testing on toy datasets
    if 'dummy' in data_fn or 'debug' in data_fn:
        nb_searches = 3
    data_path = '{}/{}'.format(DATA_DIR, data_fn)

    if not FORCE_RUN:  # check if already did param search, if so, skip
        def did(method_names):
            # helper: True when cached results already exist for every
            # listed method (PEP 8: def instead of lambda assignment)
            return already_done(method_names, data_fn, prop_missing)

        if method == 'riddle' and did(['riddle']):
            eprint('Already did parameter search for riddle')
            return
        elif method == 'lrfc' and did(['logit', 'rfc']):
            eprint('Already did parameter search for lrfc')
            return
        elif method == 'svm' and did(['linear-svm', 'poly-svm', 'rbf-svm']):
            eprint('Already did parameter search for svm')
            return

    # best parameters per model, keyed by partition index
    params = {
        'riddle': {},
        'logit': {},
        'rfc': {},
        'linear-svm': {},
        'poly-svm': {},
        'rbf-svm': {}
    }
    X, y, perm_indices, nb_features, nb_classes = get_base_data(
        data_path, prop_missing)

    for k_idx in range(0, k):
        print('-' * 72)
        print('Partition k = {}'.format(k_idx))

        data_partition_dict = emr.get_k_fold_partition(
            X, y, k_idx=k_idx, k=k, perm_indices=perm_indices)

        X_train = data_partition_dict['X_train']
        y_train = data_partition_dict['y_train']

        X_val = data_partition_dict['X_val']
        y_val = data_partition_dict['y_val']

        # cap number of validation samples
        # (fix: compare with `is not None`, not `!= None`)
        if max_nb_samples is not None and len(X_val) > max_nb_samples:
            X_val = X_val[0:max_nb_samples]
            y_val = y_val[0:max_nb_samples]

        if method != 'riddle':
            # non-RIDDLE models go through the sklearn path: select features
            # on train + validation data, then re-encode the validation split
            selected_feat_indices = select_feats(X_train + X_val,
                                                 y_train + y_val,
                                                 nb_features=nb_features)
            X_val, y_val = preproc_for_sklearn(X_val,
                                               y_val,
                                               nb_features=nb_features)

            X_val = X_val[:, selected_feat_indices]

        if method == 'riddle':
            start = time.time()
            model_module = models.deep_mlp
            riddle_param_dist = {
                'learning_rate': UniformLogSpace(10, lo=-6, hi=-1)
            }
            params['riddle'][k_idx] = parameter_tuning.random_search(
                model_module,
                riddle_param_dist,
                X_val,
                y_val,
                nb_features=nb_features,
                nb_classes=nb_classes,
                k=3,
                process_X_data_func_args={'nb_features': nb_features},
                process_y_data_func_args={'nb_classes': nb_classes},
                nb_searches=nb_searches)
            print('Best parameters for RIDDLE: {} found in {:.3f} s'.format(
                params['riddle'][k_idx],
                time.time() - start))

        elif method == 'lrfc':
            # logistic regression
            start = time.time()
            logit_param_dist = {'C': UniformLogSpace()}
            logit_estimator = LogisticRegression(multi_class='multinomial',
                                                 solver='lbfgs')
            params['logit'][k_idx] = parameter_search(
                X_val,
                y_val,
                estimator=logit_estimator,
                search=RandomizedSearchCV,
                dist_or_grid=logit_param_dist,
                n_iter=nb_searches,
                scoring=loss_scorer)
            print(
                'Best parameters for logistic regression: {} found in {:.3f} s'
                .format(params['logit'][k_idx],
                        time.time() - start))

            # random forest classifier
            start = time.time()
            rfc_param_dist = {
                'max_features': ['sqrt', 'log2'],
                'max_depth': UniformLogSpace(base=2, lo=2, hi=9)
            }
            rfc_estimator = RandomForestClassifier()
            params['rfc'][k_idx] = parameter_search(
                X_val,
                y_val,
                estimator=rfc_estimator,
                search=RandomizedSearchCV,
                dist_or_grid=rfc_param_dist,
                n_iter=nb_searches,
                scoring=loss_scorer)
            print('Best parameters for random forest: {} found in {:.3f} s'.
                  format(params['rfc'][k_idx],
                         time.time() - start))

        elif method == 'svm':
            # linear SVM
            start = time.time()
            linear_svm_param_dist = {'C': UniformLogSpace()}
            linear_svm_estimator = SVC(kernel='linear', probability=True)
            params['linear-svm'][k_idx] = parameter_search(
                X_val,
                y_val,
                estimator=linear_svm_estimator,
                search=RandomizedSearchCV,
                dist_or_grid=linear_svm_param_dist,
                n_iter=nb_searches,
                scoring=loss_scorer)
            print(
                'Best parameters for linear SVM: {} found in {:.3f} s'.format(
                    params['linear-svm'][k_idx],
                    time.time() - start))

            if skip_nonlinear_svm: continue  # skip

            # C and gamma are shared between the two nonlinear kernels
            nonlinear_svm_param_dist = {
                'C': UniformLogSpace(),
                'gamma': UniformLogSpace(base=10, lo=-5, hi=1)
            }

            # polynomial SVM
            start = time.time()
            poly_svm_estimator = SVC(kernel='poly', probability=True)
            params['poly-svm'][k_idx] = parameter_search(
                X_val,
                y_val,
                estimator=poly_svm_estimator,
                search=RandomizedSearchCV,
                dist_or_grid=nonlinear_svm_param_dist,
                n_iter=nb_searches,
                scoring=loss_scorer)
            print('Best parameters for polynomial SVM: {} found in {:.3f} s'.
                  format(params['poly-svm'][k_idx],
                         time.time() - start))

            # RBF SVM
            start = time.time()
            rbf_svm_estimator = SVC(kernel='rbf', probability=True)
            params['rbf-svm'][k_idx] = parameter_search(
                X_val,
                y_val,
                estimator=rbf_svm_estimator,
                search=RandomizedSearchCV,
                dist_or_grid=nonlinear_svm_param_dist,
                n_iter=nb_searches,
                scoring=loss_scorer)
            print('Best parameters for RBF SVM: {} found in {:.3f} s'.format(
                params['rbf-svm'][k_idx],
                time.time() - start))

        else:
            raise ValueError('unknown method: {}'.format(method))

    # save best parameters for every model that was actually searched
    for method_name, sub_param_dict in params.items():
        if len(sub_param_dict) > 0:
            pickle_object(
                sub_param_dict,
                '{}/{}_{}_{}_param.pkl'.format(CACHE_DIR, method_name, data_fn,
                                               prop_missing))

    print('Finished parameter search for method: {}'.format(method))
コード例 #7
0
def run(ModelClass, x_unvec, y, idx_feat_dict, num_feature, max_num_feature,
        num_class, feature_selection, k_idx, k, params, perm_indices,
        init_args, full_out_dir):
    """Run a classification pipeline for a single k-fold partition.

    Arguments:
        ModelClass: Python class
            classification model
        x_unvec: [[int]]
            feature indices that have not been vectorized; each inner list
            collects the indices of features that are present (binary on)
            for a sample
        y: [int]
            list of class labels as integer indices
        idx_feat_dict: {int: string}
            dictionary mapping feature indices to features
        num_feature: int
            number of features present in the dataset
        max_num_feature: int
            maximum number of features to use; values < 1 disable feature
            selection
        num_class: int
            number of classes present
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k_idx: int
            index of the k-fold partition to use
        k: int
            number of partitions for k-fold cross-validation
        params: [{string: ?}]
            list of dictionary mapping parameter names to values for each
            k-fold partition
        perm_indices: np.ndarray, int
            array of indices representing a permutation of the samples with
            shape (num_sample, )
        init_args: {string: ?}
            dictionary mapping initialization argument names to values
        full_out_dir: string
            directory where outputs (e.g., results) should be saved
    """
    print('-' * 72)
    print('Partition k = {}'.format(k_idx))
    print(params[k_idx])

    # validation split is unused here: the model trains on the train split
    # and is evaluated on the held-out test split only
    x_train_unvec, y_train, _, _, x_test_unvec, y_test = (
        emr.get_k_fold_partition(x_unvec,
                                 y,
                                 k_idx=k_idx,
                                 k=k,
                                 perm_indices=perm_indices))

    if max_num_feature > 0:  # select features and re-encode
        feat_encoding_dict, _ = select_features(
            x_train_unvec,
            y_train,
            idx_feat_dict,
            method=feature_selection,
            num_feature=num_feature,
            max_num_feature=max_num_feature)
        x_train_unvec = subset_reencode_features(x_train_unvec,
                                                 feat_encoding_dict)
        x_test_unvec = subset_reencode_features(x_test_unvec,
                                                feat_encoding_dict)
        # vectorization below must use the reduced feature width
        num_feature = max_num_feature

    x_train = vectorize_features(x_train_unvec, num_feature)
    x_test = vectorize_features(x_test_unvec, num_feature)

    # copy so that per-partition params don't mutate the caller's init_args
    args = dict(init_args)
    args.update(params[k_idx])

    start = time.time()
    model = ModelClass(**args)
    model.fit(x_train, y_train)
    y_test_probas = model.predict_proba(x_test)
    runtime = time.time() - start

    evaluate(y_test,
             y_test_probas,
             runtime,
             num_class=num_class,
             out_dir=full_out_dir)
コード例 #8
0
ファイル: riddle.py プロジェクト: jisungk/RIDDLE
def run(x_unvec, y, idx_feat_dict, idx_class_dict, icd9_descript_dict,
        num_feature, max_num_feature, num_class, feature_selection, k_idx, k,
        params, perm_indices, full_out_dir):
    """Run a RIDDLE classification pipeline for a single k-fold partition.

    Trains an MLP on the partition's training split (with early validation
    on the validation split), scores the held-out test split, writes the
    evaluation results and trained model to `full_out_dir`, and clears the
    Keras session when done.

    Arguments:
        x_unvec: [[int]]
            feature indices that have not been vectorized; each inner list
            collects the indices of features that are present (binary on)
            for a sample
        y: [int]
            list of class labels as integer indices
        idx_feat_dict: {int: string}
            dictionary mapping feature indices to features
        idx_class_dict: {int: string}
            dictionary mapping class indices to classes
        icd9_descript_dict: {string: string}
            dictionary mapping ICD9 codes to description text
        num_feature: int
            number of features present in the dataset
        max_num_feature: int
            maximum number of features to use; values < 1 disable feature
            selection
        num_class: int
            number of classes
        feature_selection: string
            feature selection method; values = {'random', 'frequency', 'chi2'}
        k_idx: int
            index of the k-fold partition to use
        k: int
            number of partitions for k-fold cross-validation
        params: [{string: ?}]
            list of dictionary mapping parameter names to values for each
            k-fold partition
        perm_indices: np.ndarray, int
            array of indices representing a permutation of the samples with
            shape (num_sample, )
        full_out_dir: string
            directory where outputs (e.g., results) should be saved
    """
    from keras import backend as K
    from riddle import emr, feature_importance
    from riddle.models import MLP

    print('Partition k = {}'.format(k_idx))
    print()
    train_x_unvec, train_y, val_x_unvec, val_y, test_x_unvec, test_y = (
        emr.get_k_fold_partition(x_unvec, y, k_idx=k_idx, k=k,
                                 perm_indices=perm_indices))

    if max_num_feature > 0:
        # select features on the training split, then re-encode every split
        # with the resulting (reduced) feature index mapping
        encoding_map, idx_feat_dict = select_features(
            train_x_unvec, train_y, idx_feat_dict,
            method=feature_selection, num_feature=num_feature,
            max_num_feature=max_num_feature)
        train_x_unvec = subset_reencode_features(train_x_unvec, encoding_map)
        val_x_unvec = subset_reencode_features(val_x_unvec, encoding_map)
        test_x_unvec = subset_reencode_features(test_x_unvec, encoding_map)
        num_feature = max_num_feature

    # cap the number of training epochs when smoke-testing; -1 = no cap
    epoch_cap = 3 if 'debug' in full_out_dir else -1
    clf = MLP(num_feature=num_feature, num_class=num_class,
              max_num_epoch=epoch_cap, **params[k_idx])

    # train and test, timing the whole fit + predict cycle
    t0 = time.time()

    clf.train(train_x_unvec, train_y, val_x_unvec, val_y)
    test_probas = clf.predict_proba(test_x_unvec)

    elapsed = time.time() - t0
    print('Completed training and testing in {:.4f} seconds'.format(elapsed))
    print('-' * 72)
    print()

    # evaluate model performance on the held-out test split
    evaluate(test_y, test_probas, elapsed, num_class=num_class,
             out_dir=full_out_dir)

    clf.save_model(path='{}/model.h5'.format(full_out_dir))
    K.clear_session()

    print('Finished with partition k = {}'.format(k_idx))
    print('=' * 72)
    print()
コード例 #9
0
ファイル: parameter_search.py プロジェクト: agoila/RIDDLE
def run(data_fn, method='lrfc', prop_missing=0.0, k=10,
        skip_nonlinear_svm=False, nb_searches=20, max_nb_samples=10000):
    """Run a randomized hyperparameter search for the requested method family.

    For each of the k cross-validation partitions, tunes hyperparameters on
    that partition's validation split and records the best setting found;
    per-partition results are pickled to CACHE_DIR so training runs can
    reuse them.

    Arguments:
        data_fn: string
            filename of the dataset, resolved relative to DATA_DIR
        method: string
            model family to tune; values = {'riddle', 'lrfc', 'svm'}
        prop_missing: float
            proportion of data simulated as missing (forwarded to
            get_base_data)
        k: int
            number of partitions for k-fold cross-validation
        skip_nonlinear_svm: bool
            if True, skip the (slow) polynomial and RBF SVM searches
        nb_searches: int
            number of random hyperparameter settings to evaluate per model
        max_nb_samples: int or None
            cap on the number of validation samples used for the search;
            None disables the cap

    Raises:
        ValueError: if `method` is not one of the supported values
    """
    # use a token search budget when smoke-testing on toy datasets
    if 'dummy' in data_fn or 'debug' in data_fn:
        nb_searches = 3
    data_path = '{}/{}'.format(DATA_DIR, data_fn)

    if not FORCE_RUN:  # check if already did param search, if so, skip
        def did(method_names):
            # helper: True when cached results already exist for every
            # listed method (PEP 8: def instead of lambda assignment)
            return already_done(method_names, data_fn, prop_missing)

        if method == 'riddle' and did(['riddle']):
            eprint('Already did parameter search for riddle')
            return
        elif method == 'lrfc' and did(['logit', 'rfc']):
            eprint('Already did parameter search for lrfc')
            return
        elif method == 'svm' and did(['linear-svm', 'poly-svm', 'rbf-svm']):
            eprint('Already did parameter search for svm')
            return

    # best parameters per model, keyed by partition index
    params = {'riddle': {}, 'logit': {}, 'rfc': {}, 'linear-svm': {},
        'poly-svm': {}, 'rbf-svm': {}}
    X, y, perm_indices, nb_features, nb_classes = get_base_data(data_path,
        prop_missing)

    for k_idx in range(0, k):
        print('-' * 72)
        print('Partition k = {}'.format(k_idx))

        data_partition_dict = emr.get_k_fold_partition(X, y, k_idx=k_idx, k=k,
            perm_indices=perm_indices)

        X_train = data_partition_dict['X_train']
        y_train = data_partition_dict['y_train']

        X_val = data_partition_dict['X_val']
        y_val = data_partition_dict['y_val']

        # cap number of validation samples
        # (fix: compare with `is not None`, not `!= None`)
        if max_nb_samples is not None and len(X_val) > max_nb_samples:
            X_val = X_val[0:max_nb_samples]
            y_val = y_val[0:max_nb_samples]

        if method != 'riddle':
            # non-RIDDLE models go through the sklearn path: select features
            # on train + validation data, then re-encode the validation split
            selected_feat_indices = select_feats(X_train + X_val,
                y_train + y_val, nb_features=nb_features)
            X_val, y_val = preproc_for_sklearn(X_val, y_val,
                nb_features=nb_features)

            X_val = X_val[:, selected_feat_indices]

        if method == 'riddle':
            start = time.time()
            model_module = models.deep_mlp
            riddle_param_dist = {
                'learning_rate': UniformLogSpace(10, lo=-6, hi=-1)}
            params['riddle'][k_idx] = parameter_tuning.random_search(
                model_module, riddle_param_dist, X_val, y_val,
                nb_features=nb_features, nb_classes=nb_classes, k=3,
                process_X_data_func_args={'nb_features': nb_features},
                process_y_data_func_args={'nb_classes': nb_classes},
                nb_searches=nb_searches)
            print('Best parameters for RIDDLE: {} found in {:.3f} s'
                .format(params['riddle'][k_idx], time.time() - start))

        elif method == 'lrfc':
            # logistic regression
            start = time.time()
            logit_param_dist = {'C': UniformLogSpace()}
            logit_estimator = LogisticRegression(multi_class='multinomial',
                solver='lbfgs')
            params['logit'][k_idx] = parameter_search(X_val, y_val,
                estimator=logit_estimator, search=RandomizedSearchCV,
                dist_or_grid=logit_param_dist, n_iter=nb_searches,
                scoring=loss_scorer)
            print('Best parameters for logistic regression: {} found in {:.3f} s'
                .format(params['logit'][k_idx], time.time() - start))

            # random forest classifier
            start = time.time()
            rfc_param_dist = {'max_features': ['sqrt', 'log2'],
                'max_depth': UniformLogSpace(base=2, lo=2, hi=9)}
            rfc_estimator = RandomForestClassifier()
            params['rfc'][k_idx] = parameter_search(X_val, y_val,
                estimator=rfc_estimator, search=RandomizedSearchCV,
                dist_or_grid=rfc_param_dist, n_iter=nb_searches,
                scoring=loss_scorer)
            print('Best parameters for random forest: {} found in {:.3f} s'
                .format(params['rfc'][k_idx], time.time() - start))

        elif method == 'svm':
            # linear SVM
            start = time.time()
            linear_svm_param_dist = {'C': UniformLogSpace()}
            linear_svm_estimator = SVC(kernel='linear', probability=True)
            params['linear-svm'][k_idx] = parameter_search(X_val, y_val,
                estimator=linear_svm_estimator, search=RandomizedSearchCV,
                dist_or_grid=linear_svm_param_dist, n_iter=nb_searches,
                scoring=loss_scorer)
            print('Best parameters for linear SVM: {} found in {:.3f} s'
                .format(params['linear-svm'][k_idx], time.time() - start))

            if skip_nonlinear_svm: continue  # skip

            # C and gamma are shared between the two nonlinear kernels
            nonlinear_svm_param_dist = {'C': UniformLogSpace(),
                'gamma': UniformLogSpace(base=10, lo=-5, hi=1)}

            # polynomial SVM
            start = time.time()
            poly_svm_estimator = SVC(kernel='poly', probability=True)
            params['poly-svm'][k_idx] = parameter_search(X_val, y_val,
                estimator=poly_svm_estimator, search=RandomizedSearchCV,
                dist_or_grid=nonlinear_svm_param_dist, n_iter=nb_searches,
                scoring=loss_scorer)
            print('Best parameters for polynomial SVM: {} found in {:.3f} s'
                .format(params['poly-svm'][k_idx], time.time() - start))

            # RBF SVM
            start = time.time()
            rbf_svm_estimator = SVC(kernel='rbf', probability=True)
            params['rbf-svm'][k_idx] = parameter_search(X_val, y_val,
                estimator=rbf_svm_estimator, search=RandomizedSearchCV,
                dist_or_grid=nonlinear_svm_param_dist, n_iter=nb_searches,
                scoring=loss_scorer)
            print('Best parameters for RBF SVM: {} found in {:.3f} s'
                .format(params['rbf-svm'][k_idx], time.time() - start))

        else:
            raise ValueError('unknown method: {}'.format(method))

    # save best parameters for every model that was actually searched
    for method_name, sub_param_dict in params.items():
        if len(sub_param_dict) > 0:
            pickle_object(sub_param_dict, '{}/{}_{}_{}_param.pkl'.format(
                CACHE_DIR, method_name, data_fn, prop_missing))

    print('Finished parameter search for method: {}'.format(method))