Example #1
0
def main(args):
    """Read run options from `args`, falling back to defaults, and call `run`.

    Arguments:
        args: sequence of strings indexed like `sys.argv` (index 0 is not
            read): [1] method, [2] data file name, [3] which_half,
            [4] prop_missing (parsed as a float)
    """
    # Only a missing argument (IndexError) or an unparsable number
    # (ValueError) triggers a default; anything else, e.g. Ctrl-C,
    # propagates instead of being silently swallowed.
    try:
        method = args[1].lower()
    except IndexError:
        method = 'lrfc'
        eprint('Using default method = \'{}\''.format(method))

    try:
        data_fn = args[2]
    except IndexError:
        data_fn = 'dummy.txt'
        eprint('Using default data_fn = \'{}\''.format(data_fn))

    try:
        which_half = args[3].lower()
    except IndexError:
        which_half = 'both'
        eprint('Using default which_half = {}'.format(which_half))

    try:
        prop_missing = float(args[4])
    except (IndexError, ValueError):
        prop_missing = 0.0
        eprint('Using default prop_missing = {}'.format(prop_missing))

    # not going to finish in time, so skip nonlinear svm if using full dataset
    skip_nonlinear_svm = 'final-100.txt' in data_fn
    if skip_nonlinear_svm:
        eprint('Skipping SVMs with non-linear kernels')

    run(data_fn, method=method, which_half=which_half,
        prop_missing=prop_missing, skip_nonlinear_svm=skip_nonlinear_svm)
Example #2
0
def main(args):
    """Read run options from `args`, falling back to defaults, and call `run`.

    Arguments:
        args: sequence of strings indexed like `sys.argv` (index 0 is not
            read): [1] method, [2] data file name, [3] which_half,
            [4] prop_missing (parsed as a float)
    """
    # Defaults apply only when an argument is absent (IndexError) or, for
    # the numeric one, cannot be parsed (ValueError). Unrelated exceptions
    # are no longer hidden by a bare `except`.
    try:
        method = args[1].lower()
    except IndexError:
        method = 'lrfc'
        eprint('Using default method = \'{}\''.format(method))

    try:
        data_fn = args[2]
    except IndexError:
        data_fn = 'dummy.txt'
        eprint('Using default data_fn = \'{}\''.format(data_fn))

    try:
        which_half = args[3].lower()
    except IndexError:
        which_half = 'both'
        eprint('Using default which_half = {}'.format(which_half))

    try:
        prop_missing = float(args[4])
    except (IndexError, ValueError):
        prop_missing = 0.0
        eprint('Using default prop_missing = {}'.format(prop_missing))

    # not going to finish in time, so skip nonlinear svm if using full dataset
    skip_nonlinear_svm = 'final-100.txt' in data_fn
    if skip_nonlinear_svm:
        eprint('Skipping SVMs with non-linear kernels')

    run(data_fn,
        method=method,
        which_half=which_half,
        prop_missing=prop_missing,
        skip_nonlinear_svm=skip_nonlinear_svm)
Example #3
0
def get_base_data(data_path, prop_missing):
    """Load the dataset and draw a random permutation of its cases.

    Arguments:
        data_path: path of the data file, passed to `emr.get_data`
        prop_missing: passed through to `emr.get_data`

    Returns:
        tuple `(X, y, perm_indices, nb_features, nb_classes)` where
        `perm_indices` is a random permutation of `range(len(X))`
    """
    icd9_descript_path = '{}/{}'.format(DATA_DIR, 'phewas_codes.txt')

    # load data
    print('Loading data...')
    start = time.time()

    # get common data
    icd9_descript_dict = emr.get_icd9_descript_dict(icd9_descript_path)
    X, y, idx_feat_dict, idx_class_dict = emr.get_data(
        path=data_path,
        icd9_descript_dict=icd9_descript_dict,
        prop_missing=prop_missing)

    nb_features = len(idx_feat_dict)
    nb_classes = len(idx_class_dict)
    nb_cases = len(X)

    print('Data loaded in {:.5f} s'.format(time.time() - start))
    print()

    # shuffle indices
    perm_indices = np.random.permutation(nb_cases)

    # Best-effort check against previously saved indices; it never aborts
    # the run, it only reports what happened.
    perm_path = data_path + '_perm_indices.pkl'
    try:
        with open(perm_path, 'rb') as f:  # pickle data must be read as bytes
            exp_perm_indices = pickle.load(f)
    except (IOError, OSError):
        eprint('file not found ' + perm_path)
        eprint('not doing perm_indices check')
    except Exception as e:
        eprint('could not read {}: {}'.format(perm_path, e))
        eprint('not doing perm_indices check')
    else:
        # a mismatch used to be reported as "file not found"
        if not np.array_equal(perm_indices, exp_perm_indices):
            eprint('perm_indices differ from those saved in ' + perm_path)

    return X, y, perm_indices, nb_features, nb_classes
Example #4
0
def get_base_data(data_path, prop_missing):
    """Load the dataset and draw a random permutation of its cases.

    Arguments:
        data_path: path of the data file, passed to `emr.get_data`
        prop_missing: passed through to `emr.get_data`

    Returns:
        tuple `(X, y, perm_indices, nb_features, nb_classes)` where
        `perm_indices` is a random permutation of `range(len(X))`
    """
    icd9_descript_path = '{}/{}'.format(DATA_DIR, 'phewas_codes.txt')

    # load data
    print('Loading data...')
    start = time.time()

    # get common data
    icd9_descript_dict = emr.get_icd9_descript_dict(icd9_descript_path)
    X, y, idx_feat_dict, idx_class_dict = emr.get_data(path=data_path,
        icd9_descript_dict=icd9_descript_dict, prop_missing=prop_missing)

    nb_features = len(idx_feat_dict)
    nb_classes = len(idx_class_dict)
    nb_cases = len(X)

    print('Data loaded in {:.5f} s'.format(time.time() - start))
    print()

    # shuffle indices
    perm_indices = np.random.permutation(nb_cases)

    # Best-effort check against previously saved indices: problems are
    # reported on stderr but never stop the run.
    perm_path = data_path + '_perm_indices.pkl'
    try:
        with open(perm_path, 'rb') as f: # pickle data must be read as bytes
            exp_perm_indices = pickle.load(f)
    except (IOError, OSError):
        eprint('file not found ' + perm_path)
        eprint('not doing perm_indices check')
    except Exception as e:
        eprint('could not read {}: {}'.format(perm_path, e))
        eprint('not doing perm_indices check')
    else:
        # a mismatch used to be reported as "file not found"
        if not np.array_equal(perm_indices, exp_perm_indices):
            eprint('perm_indices differ from those saved in ' + perm_path)

    return X, y, perm_indices, nb_features, nb_classes
Example #5
0
def main(args):
    """Read run options from `args`, falling back to defaults, and call `run`.

    Arguments:
        args: sequence of strings indexed like `sys.argv` (index 0 is not
            read): [1] method, [2] data file name, [3] prop_missing
            (parsed as a float)
    """
    # Fall back to a default only for a missing argument (IndexError) or an
    # unparsable number (ValueError); other exceptions propagate.
    try:
        method = args[1].lower()
    except IndexError:
        method = 'lrfc'
        eprint('Using default method = \'{}\''.format(method))

    try:
        data_fn = args[2]
    except IndexError:
        data_fn = 'dummy.txt'
        eprint('Using default data_fn = \'{}\''.format(data_fn))

    try:
        prop_missing = float(args[3])
    except (IndexError, ValueError):
        prop_missing = 0.0
        eprint('Using default prop_missing = {}'.format(prop_missing))

    # not going to finish in time, so skip nonlinear svm if using full dataset
    skip_nonlinear_svm = 'final-100.txt' in data_fn

    run(data_fn,
        method=method,
        prop_missing=prop_missing,
        skip_nonlinear_svm=skip_nonlinear_svm)
Example #6
0
def main(args):
    """Read run options from `args`, falling back to defaults, and call `run`.

    Arguments:
        args: sequence of strings indexed like `sys.argv` (index 0 is not
            read): [1] method, [2] data file name, [3] prop_missing
            (parsed as a float)
    """
    # A default is used only when the argument is missing (IndexError) or,
    # for the number, malformed (ValueError); a bare `except` would also
    # have hidden unrelated errors and Ctrl-C.
    try:
        method = args[1].lower()
    except IndexError:
        method = 'lrfc'
        eprint('Using default method = \'{}\''.format(method))

    try:
        data_fn = args[2]
    except IndexError:
        data_fn = 'dummy.txt'
        eprint('Using default data_fn = \'{}\''.format(data_fn))

    try:
        prop_missing = float(args[3])
    except (IndexError, ValueError):
        prop_missing = 0.0
        eprint('Using default prop_missing = {}'.format(prop_missing))

    # not going to finish in time, so skip nonlinear svm if using full dataset
    skip_nonlinear_svm = 'final-100.txt' in data_fn

    run(data_fn, method=method, prop_missing=prop_missing,
        skip_nonlinear_svm=skip_nonlinear_svm)
Example #7
0
def run(data_fn,
        method='lrfc',
        which_half='both',
        prop_missing=0.0,
        k=10,
        skip_nonlinear_svm=False,
        nb_searches=20):
    """Evaluate baseline classifiers over k-fold partitions of the data.

    For every processed partition, features are selected on the training
    plus validation cases, each classifier of `method` is fit on the
    training cases using previously saved parameters, and loss, accuracy
    and runtime are measured on the test cases. Test probabilities are
    saved below `out/more/` and summary metrics are printed at the end.

    Arguments:
        data_fn: name of the data file inside `DATA_DIR`
        method: 'lrfc' (logistic regression and random forest) or 'svm'
        which_half: 'first', 'last' or 'both'; which part of the `k`
            partitions to process
        prop_missing: passed to data loading; also part of the names of the
            parameter files and result folders
        k: number of partitions
        skip_nonlinear_svm: if True, 'svm' only runs the linear kernel
        nb_searches: not used in this function

    Raises:
        ValueError: if `method` or `which_half` is not recognized
    """
    data_path = '{}/{}'.format(DATA_DIR, data_fn)

    # line printed after each fit, keyed by model name
    report_formats = {
        'logit': 'Logistic regression / loss: {loss:.3f} / '
                 'accuracy: {acc:.3f} / time: {time:.3f} s',
        'rfc': 'Random forest / loss: {loss:.3f} / '
               'accuracy: {acc:.3f} / time: {time:.3f} s',
        'linear-svm': 'Linear SVM / accuracy: {acc:.3f} / '
                      'loss: {loss:.3f} / time: {time:.3f} s',
        'poly-svm': 'Polynomial SVM / accuracy: {acc:.3f} / '
                    'loss: {loss:.3f} / time: {time:.3f} s',
        'rbf-svm': 'RBF SVM / accuracy: {acc:.3f} / '
                   'loss: {loss:.3f} / time: {time:.3f} s',
    }
    # heading passed to `print_metrics`, keyed by model name
    summary_labels = {
        'logit': 'Logistic regression',
        'rfc': 'Random forest',
        'linear-svm': 'Linear SVM',
        'poly-svm': 'Polynomial SVM',
        'rbf-svm': 'RBF SVM',
    }

    # Validate `method` up front so an unknown method is not reported as a
    # missing parameter search.
    if method == 'lrfc':
        from sklearn.linear_model import LogisticRegression
        from sklearn.ensemble import RandomForestClassifier

        model_names = ['logit', 'rfc']
        builders = {
            'logit': lambda p: LogisticRegression(
                multi_class='multinomial', solver='lbfgs', **p),
            'rfc': lambda p: RandomForestClassifier(**p),
        }
    elif method == 'svm':
        from sklearn.svm import SVC

        model_names = ['linear-svm']
        if not skip_nonlinear_svm:
            model_names += ['poly-svm', 'rbf-svm']
        builders = {
            'linear-svm': lambda p: SVC(
                kernel='linear', probability=True, **p),
            'poly-svm': lambda p: SVC(kernel='poly', probability=True, **p),
            'rbf-svm': lambda p: SVC(kernel='rbf', probability=True, **p),
        }
    else:
        raise ValueError('unknown method: {}'.format(method))

    def get_results_dir(model_name, k_idx):
        base_folder = 'out/more/{}_{}_{}'.format(model_name, data_fn,
                                                 prop_missing)
        folder = '{}/{}_idx_partition'.format(base_folder, k_idx)

        # makedirs creates 'out', 'out/more' and base_folder as needed
        if not os.path.exists(folder):
            os.makedirs(folder)

        return folder

    def load_params(model_name):
        path = '{}/{}_{}_{}_param.pkl'.format(CACHE_DIR, model_name, data_fn,
                                              prop_missing)
        with open(path, 'rb') as f:  # pickle data must be read as bytes
            return pickle.load(f)

    try:  # load saved parameters (indexed by partition below)
        params = {name: load_params(name) for name in model_names}
    except Exception:
        eprint('Need to do parameter search!')
        eprint('Please run `parameter_search.py` with the relevant '
               'command line arguments')
        raise

    X, y, perm_indices, nb_features, nb_classes = get_base_data(
        data_path, prop_missing)

    losses = {name: [] for name in model_names}
    accs = {name: [] for name in model_names}
    runtimes = {name: [] for name in model_names}

    half = k // 2  # integer division: `range` rejects floats on Python 3
    if which_half == 'first':
        loop_seq = range(0, half)
    elif which_half == 'last':
        loop_seq = range(half, k)
    elif which_half == 'both':
        loop_seq = range(0, k)
    else:
        raise ValueError(
            '`which_half` must be \'first\', \'last\' or \'both\'')

    for k_idx in loop_seq:
        print('-' * 72)
        print('Partition k = {}'.format(k_idx))

        data_partition_dict = emr.get_k_fold_partition(
            X, y, k_idx=k_idx, k=k, perm_indices=perm_indices)
        X_train = data_partition_dict['X_train']
        y_train = data_partition_dict['y_train']
        X_val = data_partition_dict['X_val']
        y_val = data_partition_dict['y_val']
        X_test = data_partition_dict['X_test']
        y_test = data_partition_dict['y_test']

        selected_feat_indices = select_feats(X_train + X_val,
                                             y_train + y_val,
                                             nb_features=nb_features)

        X_train, y_train = preproc_for_sklearn(X_train, y_train, nb_features)
        X_test, y_test = preproc_for_sklearn(X_test, y_test, nb_features)

        old_nb_features = len(X_train[0])
        X_train = X_train[:, selected_feat_indices]
        X_test = X_test[:, selected_feat_indices]

        # Keep `nb_features` (the full count) untouched: it is passed to
        # `select_feats` and `preproc_for_sklearn` again on the next
        # partition, so the reduced count gets its own name.
        nb_selected_features = len(X_train[0])
        print('Reduced features from {} to {}'.format(old_nb_features,
                                                      nb_selected_features))

        for name in model_names:
            start = time.time()
            clf = builders[name](params[name][k_idx])
            clf.fit(X_train, y_train)
            acc = accuracy_score(y_test, clf.predict(X_test))
            y_test_proba = clf.predict_proba(X_test)
            loss = log_loss(y_test, y_test_proba)
            runtime = time.time() - start
            print(report_formats[name].format(loss=loss, acc=acc,
                                              time=runtime))

            # results are written as soon as each model has been scored
            save_test_results(
                y_test_proba, y_test,
                '{}/test_results.txt'.format(get_results_dir(name, k_idx)))

            losses[name].append(loss)
            accs[name].append(acc)
            runtimes[name].append(runtime)

    print()
    print('#' * 72)
    for name in model_names:
        print_metrics(losses[name], accs[name], runtimes[name],
                      summary_labels[name])
    print('#' * 72)
Example #8
0
def main(args):
    """Run the k-fold pipeline for `models.deep_mlp` and report metrics.

    Loads the data, saves the class/feature mappings and the random case
    permutation to the output directory, runs `kfold_run_pipeline` with
    previously saved model parameters and, when requested, saves the
    ordering summaries.

    Arguments:
        args: sequence of strings indexed like `sys.argv` (index 0 is not
            read): [1] id_string, [2] data file name, [3] interpret_model
            flag (true when it starts with 't' or 'T'), [4] prop_missing
            (parsed as a float)
    """
    k = 10 # ten partitions for k-fold cross-validation

    # Defaults apply only to a missing argument (IndexError) or an
    # unparsable number (ValueError); other exceptions propagate.
    try:
        id_string = args[1]
    except IndexError:
        id_string = 'dummy'
        eprint('Using default id_string = \'{}\''.format(id_string))

    try:
        data_fn = args[2]
    except IndexError:
        data_fn = 'dummy.txt'
        eprint('Using default data_fn = \'{}\''.format(data_fn))

    try:
        # an empty string also raises IndexError and so uses the default
        interpret_model = args[3].lower()[0] == 't'
    except IndexError:
        interpret_model = True
        eprint('Using default interpret_model = {}'.format(interpret_model))

    try:
        prop_missing = float(args[4])
    except (IndexError, ValueError):
        prop_missing = 0.0
        eprint('Using default prop_missing = {}'.format(prop_missing))

    data_path = '{}/{}'.format(DATA_DIR, data_fn)
    icd9_descript_path = '{}/{}'.format(DATA_DIR, 'phewas_codes.txt')

    model_module = models.deep_mlp

    out_directory = 'out/more/{}_{}_{}'.format('riddle', data_fn, prop_missing)
    # makedirs also creates 'out' and 'out/more' when they are missing
    if not os.path.exists(out_directory):
        os.makedirs(out_directory)

    start = time.time()

    # get common data
    icd9_descript_dict = emr.get_icd9_descript_dict(icd9_descript_path)
    X, y, idx_feat_dict, idx_class_dict = emr.get_data(path=data_path,
        icd9_descript_dict=icd9_descript_dict, prop_missing=prop_missing)

    # print/save the class mapping (sorted by index) and the feature mapping
    class_mapping = sorted(idx_class_dict.items(), key=lambda item: item[0])
    print('Class mapping:')
    print(class_mapping)
    print()
    with open(out_directory + '/class_mapping.txt', 'w+') as f:
        print(class_mapping, file=f)
    with open(out_directory + '/feature_mapping.txt', 'w+') as f:
        for idx, feat in idx_feat_dict.items():
            f.write('{}\t{}\n'.format(idx, feat))

    nb_features = len(idx_feat_dict)
    nb_classes = len(idx_class_dict)
    nb_cases = len(X)

    print('Data loaded in {:.5f} seconds'.format(time.time() - start))

    # shuffle indices and save them
    perm_indices = np.random.permutation(nb_cases)
    pickle_object(perm_indices, out_directory + '/perm_indices.pkl')

    # Best-effort check against previously saved indices: problems are
    # reported on stderr but never stop the run.
    perm_path = data_path + '_perm_indices.pkl'
    try:
        with open(perm_path, 'rb') as f: # pickle data must be read as bytes
            exp_perm_indices = pickle.load(f)
    except (IOError, OSError):
        eprint('file not found ' + perm_path)
        eprint('not doing perm_indices check')
    except Exception as e:
        eprint('could not read {}: {}'.format(perm_path, e))
        eprint('not doing perm_indices check')
    else:
        # a mismatch used to be reported as "file not found"
        if not np.array_equal(perm_indices, exp_perm_indices):
            eprint('perm_indices differ from those saved in ' + perm_path)

    # load saved model parameters
    model_params_fn = '{}/{}_{}_{}_param.pkl'.format(CACHE_DIR, 'riddle',
        data_fn, prop_missing)
    try:
        with open(model_params_fn, 'rb') as f:
            model_params = pickle.load(f)
    except Exception:
        eprint('Need to do parameter search!')
        eprint('Please run `parameter_search.py` with the relevant '
               'command line arguments')
        raise

    # run pipeline and get metric results
    (losses, accs, runtimes), (list_contrib_sums_D,
        list_contrib_sums_D2, list_contrib_sums), pairs = \
        kfold_run_pipeline(model_module,
            model_params, X, y, nb_features=nb_features, nb_classes=nb_classes,
            k=k, perm_indices=perm_indices, interpret_model=interpret_model,
            out_directory=out_directory, id_string=id_string)

    if interpret_model:
        total_contrib_sums_D = compute_total_sums(list_contrib_sums_D)
        total_contrib_sums_D2 = compute_total_sums(list_contrib_sums_D2)
        total_contrib_sums = compute_total_sums(list_contrib_sums)

        nb_pairs = len(pairs)

        # get descriptions of feature importance
        # NOTE(review): the summary is computed but not used or saved in
        # this function -- confirm whether that is intended
        feat_importance_summary = feature_importance.summarize_feature_importance(
            total_contrib_sums_D, total_contrib_sums_D2, idx_feat_dict=idx_feat_dict,
            idx_class_dict=idx_class_dict, icd9_descript_dict=icd9_descript_dict,
            pairs=pairs, nb_cases=nb_cases)

        # get frequencies of features per class
        feat_class_freq_table = frequency.get_frequency_table(X, y,
            idx_feat_dict=idx_feat_dict, idx_class_dict=idx_class_dict)

        # get orderings
        ordering_summary = ordering.summarize_orderings(total_contrib_sums,
            feat_class_freq_table, idx_feat_dict=idx_feat_dict,
            idx_class_dict=idx_class_dict, icd9_descript_dict=icd9_descript_dict,
            nb_pairs=nb_pairs)
        ordering_summary.save_individual_tables(idx_class_dict, out_directory)
        ordering_summary.save(out_directory)

    # print metrics in a pretty fashion
    print_metrics(losses, accs, runtimes, id_string=id_string)

    print('This k-fold multipipeline run script took {:.4f} seconds'
        .format(time.time() - start))
Example #9
0
def run(data_fn, method='lrfc', which_half='both', prop_missing=0.0, k=10,
    skip_nonlinear_svm=False, nb_searches=20):
    """Evaluate baseline classifiers over k-fold partitions of the data.

    For every processed partition, features are selected on the training
    plus validation cases, each classifier of `method` is fit on the
    training cases using previously saved parameters, and loss, accuracy
    and runtime are measured on the test cases. Test probabilities are
    saved below `out/more/` and summary metrics are printed at the end.

    Arguments:
        data_fn: name of the data file inside `DATA_DIR`
        method: 'lrfc' (logistic regression and random forest) or 'svm'
        which_half: 'first', 'last' or 'both'; which part of the `k`
            partitions to process
        prop_missing: passed to data loading; also part of the names of the
            parameter files and result folders
        k: number of partitions
        skip_nonlinear_svm: if True, 'svm' only runs the linear kernel
        nb_searches: not used in this function

    Raises:
        ValueError: if `method` or `which_half` is not recognized
    """
    data_path = '{}/{}'.format(DATA_DIR, data_fn)

    # line printed after each fit, keyed by model name
    report_formats = {
        'logit': 'Logistic regression / loss: {loss:.3f} / '
                 'accuracy: {acc:.3f} / time: {time:.3f} s',
        'rfc': 'Random forest / loss: {loss:.3f} / '
               'accuracy: {acc:.3f} / time: {time:.3f} s',
        'linear-svm': 'Linear SVM / accuracy: {acc:.3f} / '
                      'loss: {loss:.3f} / time: {time:.3f} s',
        'poly-svm': 'Polynomial SVM / accuracy: {acc:.3f} / '
                    'loss: {loss:.3f} / time: {time:.3f} s',
        'rbf-svm': 'RBF SVM / accuracy: {acc:.3f} / '
                   'loss: {loss:.3f} / time: {time:.3f} s',
    }
    # heading passed to `print_metrics`, keyed by model name
    summary_labels = {
        'logit': 'Logistic regression',
        'rfc': 'Random forest',
        'linear-svm': 'Linear SVM',
        'poly-svm': 'Polynomial SVM',
        'rbf-svm': 'RBF SVM',
    }

    # Validate `method` up front so an unknown method is not reported as a
    # missing parameter search.
    if method == 'lrfc':
        from sklearn.linear_model import LogisticRegression
        from sklearn.ensemble import RandomForestClassifier

        model_names = ['logit', 'rfc']
        builders = {
            'logit': lambda p: LogisticRegression(
                multi_class='multinomial', solver='lbfgs', **p),
            'rfc': lambda p: RandomForestClassifier(**p),
        }
    elif method == 'svm':
        from sklearn.svm import SVC

        model_names = ['linear-svm']
        if not skip_nonlinear_svm:
            model_names += ['poly-svm', 'rbf-svm']
        builders = {
            'linear-svm': lambda p: SVC(
                kernel='linear', probability=True, **p),
            'poly-svm': lambda p: SVC(kernel='poly', probability=True, **p),
            'rbf-svm': lambda p: SVC(kernel='rbf', probability=True, **p),
        }
    else:
        raise ValueError('unknown method: {}'.format(method))

    def get_results_dir(model_name, k_idx):
        base_folder = 'out/more/{}_{}_{}'.format(model_name, data_fn,
            prop_missing)
        folder = '{}/{}_idx_partition'.format(base_folder, k_idx)

        # makedirs creates 'out', 'out/more' and base_folder as needed
        if not os.path.exists(folder):
            os.makedirs(folder)

        return folder

    def load_params(model_name):
        path = '{}/{}_{}_{}_param.pkl'.format(CACHE_DIR, model_name, data_fn,
            prop_missing)
        with open(path, 'rb') as f: # pickle data must be read as bytes
            return pickle.load(f)

    try: # load saved parameters (indexed by partition below)
        params = {name: load_params(name) for name in model_names}
    except Exception:
        eprint('Need to do parameter search!')
        eprint('Please run `parameter_search.py` with the relevant '
               'command line arguments')
        raise

    X, y, perm_indices, nb_features, nb_classes = get_base_data(data_path,
        prop_missing)

    losses = {name: [] for name in model_names}
    accs = {name: [] for name in model_names}
    runtimes = {name: [] for name in model_names}

    half = k // 2 # integer division: `range` rejects floats on Python 3
    if which_half == 'first':
        loop_seq = range(0, half)
    elif which_half == 'last':
        loop_seq = range(half, k)
    elif which_half == 'both':
        loop_seq = range(0, k)
    else:
        raise ValueError('`which_half` must be \'first\', \'last\' or \'both\'')

    for k_idx in loop_seq:
        print('-' * 72)
        print('Partition k = {}'.format(k_idx))

        data_partition_dict = emr.get_k_fold_partition(X, y, k_idx=k_idx, k=k,
            perm_indices=perm_indices)
        X_train = data_partition_dict['X_train']
        y_train = data_partition_dict['y_train']
        X_val   = data_partition_dict['X_val']
        y_val   = data_partition_dict['y_val']
        X_test  = data_partition_dict['X_test']
        y_test  = data_partition_dict['y_test']

        selected_feat_indices = select_feats(X_train + X_val, y_train + y_val,
            nb_features=nb_features)

        X_train, y_train = preproc_for_sklearn(X_train, y_train, nb_features)
        X_test, y_test = preproc_for_sklearn(X_test, y_test, nb_features)

        old_nb_features = len(X_train[0])
        X_train = X_train[:, selected_feat_indices]
        X_test = X_test[:, selected_feat_indices]

        # Keep `nb_features` (the full count) untouched: it is passed to
        # `select_feats` and `preproc_for_sklearn` again on the next
        # partition, so the reduced count gets its own name.
        nb_selected_features = len(X_train[0])
        print('Reduced features from {} to {}'.format(old_nb_features,
            nb_selected_features))

        for name in model_names:
            start = time.time()
            clf = builders[name](params[name][k_idx])
            clf.fit(X_train, y_train)
            acc = accuracy_score(y_test, clf.predict(X_test))
            y_test_probas = clf.predict_proba(X_test)
            loss = log_loss(y_test, y_test_probas)
            runtime = time.time() - start
            print(report_formats[name].format(loss=loss, acc=acc,
                time=runtime))

            # results are written as soon as each model has been scored
            save_test_results(y_test_probas, y_test,
                '{}/test_results.txt'.format(get_results_dir(name, k_idx)))

            losses[name].append(loss)
            accs[name].append(acc)
            runtimes[name].append(runtime)

    print()
    print('#' * 72)
    for name in model_names:
        print_metrics(losses[name], accs[name], runtimes[name],
            summary_labels[name])
    print('#' * 72)
Example #10
0
def main(args):
    """Run the k-fold pipeline for the deep MLP model and report its metrics.

    Optional positional arguments are read from ``args`` (index 0 is not
    used); each one falls back to a default that is announced via ``eprint``:

        args[1]  id_string        -- label forwarded to the pipeline and to
                                     ``print_metrics`` (default 'dummy')
        args[2]  data_fn          -- data file name inside ``DATA_DIR``
                                     (default 'dummy.txt')
        args[3]  interpret_model  -- true when the value starts with 't' or
                                     'T' (default True)
        args[4]  prop_missing     -- float forwarded to ``emr.get_data``
                                     (default 0.0)

    Outputs (class/feature mappings, the shuffled indices and, when
    ``interpret_model`` is set, the ordering summaries) are written under
    ``out/more/riddle_<data_fn>_<prop_missing>``.

    Raises:
        Exception: whatever error prevented loading the cached model
            parameters is re-raised after a hint has been printed.
    """
    k = 10  # ten partitions for k-fold cross-validation

    try:
        id_string = args[1]
    except IndexError:
        id_string = 'dummy'
        eprint('Using default id_string = \'{}\''.format(id_string))

    try:
        data_fn = args[2]
    except IndexError:
        data_fn = 'dummy.txt'
        eprint('Using default data_fn = \'{}\''.format(data_fn))

    try:
        # an empty string raises IndexError and therefore uses the default
        interpret_model = args[3].lower()[0] == 't'
    except IndexError:
        interpret_model = True
        eprint('Using default interpret_model = {}'.format(interpret_model))

    try:
        prop_missing = float(args[4])
    except (IndexError, ValueError):
        prop_missing = 0.0
        eprint('Using default prop_missing = {}'.format(prop_missing))

    data_path = '{}/{}'.format(DATA_DIR, data_fn)
    icd9_descript_path = '{}/{}'.format(DATA_DIR, 'phewas_codes.txt')

    model_module = models.deep_mlp

    # makedirs also creates the intermediate 'out' and 'out/more' directories;
    # tolerate the directory appearing between a check and the creation
    out_directory = 'out/more/{}_{}_{}'.format('riddle', data_fn, prop_missing)
    try:
        os.makedirs(out_directory)
    except OSError:
        if not os.path.isdir(out_directory):
            raise

    start = time.time()

    # get common data
    icd9_descript_dict = emr.get_icd9_descript_dict(icd9_descript_path)
    X, y, idx_feat_dict, idx_class_dict = emr.get_data(
        path=data_path,
        icd9_descript_dict=icd9_descript_dict,
        prop_missing=prop_missing)

    # print/save the class mapping sorted by index, and the feature mapping
    class_mapping = sorted(idx_class_dict.items(), key=lambda item: item[0])
    print('Class mapping:')
    print(class_mapping)
    print()
    with open(out_directory + '/class_mapping.txt', 'w') as f:
        print(class_mapping, file=f)
    with open(out_directory + '/feature_mapping.txt', 'w') as f:
        for idx, feat in idx_feat_dict.items():
            f.write('{}\t{}\n'.format(idx, feat))

    nb_features = len(idx_feat_dict)
    nb_classes = len(idx_class_dict)
    nb_cases = len(X)

    print('Data loaded in {:.5f} seconds'.format(time.time() - start))

    # shuffle indices and save them
    perm_indices = np.random.permutation(nb_cases)
    pickle_object(perm_indices, out_directory + '/perm_indices.pkl')

    # Best-effort validation of the shuffled indices against a saved copy.
    # Pickles must be opened in binary mode; a mismatch is reported as such
    # instead of being mistaken for a missing file.
    exp_perm_indices_path = data_path + '_perm_indices.pkl'
    try:
        with open(exp_perm_indices_path, 'rb') as f:
            exp_perm_indices = pickle.load(f)
    except (IOError, OSError):
        eprint('file not found ' + exp_perm_indices_path)
        eprint('not doing perm_indices check')
    except Exception as err:  # unreadable pickle: the check stays optional
        eprint('could not load {}: {}'.format(exp_perm_indices_path, err))
        eprint('not doing perm_indices check')
    else:
        if not np.array_equal(perm_indices, exp_perm_indices):
            eprint('WARNING: perm_indices differ from those saved in ' +
                   exp_perm_indices_path)

    # load saved model parameters
    model_params_fn = '{}/{}_{}_{}_param.pkl'.format(CACHE_DIR, 'riddle',
                                                     data_fn, prop_missing)
    try:
        with open(model_params_fn, 'rb') as f:
            model_params = pickle.load(f)
    except Exception:
        eprint('Need to do parameter search!')
        eprint('Please run `parameter_search.py` with the relevant ' +
               'command line arguments')
        raise

    # run pipeline and get metric results
    (losses, accs, runtimes), (list_contrib_sums_D,
        list_contrib_sums_D2, list_contrib_sums), pairs = \
        kfold_run_pipeline(model_module,
            model_params, X, y, nb_features=nb_features, nb_classes=nb_classes,
            k=k, perm_indices=perm_indices, interpret_model=interpret_model,
            out_directory=out_directory, id_string=id_string)

    if interpret_model:
        total_contrib_sums_D = compute_total_sums(list_contrib_sums_D)
        total_contrib_sums_D2 = compute_total_sums(list_contrib_sums_D2)
        total_contrib_sums = compute_total_sums(list_contrib_sums)

        nb_pairs = len(pairs)

        # get descriptions of feature importance
        feat_importance_summary = feature_importance.summarize_feature_importance(
            total_contrib_sums_D,
            total_contrib_sums_D2,
            idx_feat_dict=idx_feat_dict,
            idx_class_dict=idx_class_dict,
            icd9_descript_dict=icd9_descript_dict,
            pairs=pairs,
            nb_cases=nb_cases)

        # get frequencies of features per class
        feat_class_freq_table = frequency.get_frequency_table(
            X, y, idx_feat_dict=idx_feat_dict, idx_class_dict=idx_class_dict)

        # get orderings
        ordering_summary = ordering.summarize_orderings(
            total_contrib_sums,
            feat_class_freq_table,
            idx_feat_dict=idx_feat_dict,
            idx_class_dict=idx_class_dict,
            icd9_descript_dict=icd9_descript_dict,
            nb_pairs=nb_pairs)
        ordering_summary.save_individual_tables(idx_class_dict, out_directory)
        ordering_summary.save(out_directory)

    # print metrics in a pretty fashion
    print_metrics(losses, accs, runtimes, id_string=id_string)

    print('This k-fold multipipeline run script took {:.4f} seconds'.format(
        time.time() - start))
Example #11
0
def run(data_fn,
        method='lrfc',
        prop_missing=0.0,
        k=10,
        skip_nonlinear_svm=False,
        nb_searches=20,
        max_nb_samples=10000):
    """Search hyperparameters for one family of models on every fold.

    For each of the ``k`` partitions the validation split returned by
    ``emr.get_k_fold_partition`` is capped at ``max_nb_samples`` and used for
    a randomized parameter search. The best parameters are stored per fold
    index and pickled to
    ``CACHE_DIR/<method_name>_<data_fn>_<prop_missing>_param.pkl``.

    Args:
        data_fn: data file name inside ``DATA_DIR``; when it contains
            'dummy' or 'debug' the number of searches is reduced to 3.
        method: 'riddle', 'lrfc' (logistic regression + random forest) or
            'svm' (linear, then polynomial and RBF kernels).
        prop_missing: forwarded to ``get_base_data`` and used in the cache
            file names.
        k: number of partitions.
        skip_nonlinear_svm: when true, only the linear SVM is searched.
        nb_searches: number of random-search iterations per estimator.
        max_nb_samples: cap on validation samples per fold; ``None``
            disables the cap.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    # Fail fast: reject a bad method before any data is loaded.
    if method not in ('riddle', 'lrfc', 'svm'):
        raise ValueError('unknown method: {}'.format(method))

    if 'dummy' in data_fn or 'debug' in data_fn:
        nb_searches = 3
    data_path = '{}/{}'.format(DATA_DIR, data_fn)

    if not FORCE_RUN:  # check if already did param search, if so, skip
        # The non-linear kernels are never searched (and so never saved) when
        # skip_nonlinear_svm is set, so only linear-svm is required then.
        # NOTE(review): assumes already_done() looks for the saved results of
        # every listed method -- confirm against its definition.
        svm_names = ['linear-svm']
        if not skip_nonlinear_svm:
            svm_names += ['poly-svm', 'rbf-svm']
        names_by_method = {
            'riddle': ['riddle'],
            'lrfc': ['logit', 'rfc'],
            'svm': svm_names,
        }
        if already_done(names_by_method[method], data_fn, prop_missing):
            eprint('Already did parameter search for {}'.format(method))
            return

    params = {
        'riddle': {},
        'logit': {},
        'rfc': {},
        'linear-svm': {},
        'poly-svm': {},
        'rbf-svm': {}
    }
    X, y, perm_indices, nb_features, nb_classes = get_base_data(
        data_path, prop_missing)

    def search_sklearn(name, label, estimator, param_dist, X_val, y_val,
                       k_idx):
        # Randomized search for one sklearn estimator on one fold; stores
        # the result under params[name][k_idx] and reports the elapsed time.
        start = time.time()
        params[name][k_idx] = parameter_search(
            X_val,
            y_val,
            estimator=estimator,
            search=RandomizedSearchCV,
            dist_or_grid=param_dist,
            n_iter=nb_searches,
            scoring=loss_scorer)
        print('Best parameters for {}: {} found in {:.3f} s'.format(
            label, params[name][k_idx], time.time() - start))

    for k_idx in range(k):
        print('-' * 72)
        print('Partition k = {}'.format(k_idx))

        data_partition_dict = emr.get_k_fold_partition(
            X, y, k_idx=k_idx, k=k, perm_indices=perm_indices)

        X_train = data_partition_dict['X_train']
        y_train = data_partition_dict['y_train']

        X_val = data_partition_dict['X_val']
        y_val = data_partition_dict['y_val']

        # cap number of validation samples
        if max_nb_samples is not None and len(X_val) > max_nb_samples:
            X_val = X_val[:max_nb_samples]
            y_val = y_val[:max_nb_samples]

        if method == 'riddle':
            start = time.time()
            model_module = models.deep_mlp
            riddle_param_dist = {
                'learning_rate': UniformLogSpace(10, lo=-6, hi=-1)
            }
            params['riddle'][k_idx] = parameter_tuning.random_search(
                model_module,
                riddle_param_dist,
                X_val,
                y_val,
                nb_features=nb_features,
                nb_classes=nb_classes,
                k=3,
                process_X_data_func_args={'nb_features': nb_features},
                process_y_data_func_args={'nb_classes': nb_classes},
                nb_searches=nb_searches)
            print('Best parameters for RIDDLE: {} found in {:.3f} s'.format(
                params['riddle'][k_idx],
                time.time() - start))
            continue

        # sklearn-based methods: select features on train + (capped)
        # validation data, then convert the validation split
        selected_feat_indices = select_feats(X_train + X_val,
                                             y_train + y_val,
                                             nb_features=nb_features)
        X_val, y_val = preproc_for_sklearn(X_val,
                                           y_val,
                                           nb_features=nb_features)
        X_val = X_val[:, selected_feat_indices]

        if method == 'lrfc':
            search_sklearn(
                'logit', 'logistic regression',
                LogisticRegression(multi_class='multinomial', solver='lbfgs'),
                {'C': UniformLogSpace()},
                X_val, y_val, k_idx)

            search_sklearn(
                'rfc', 'random forest',
                RandomForestClassifier(),
                {
                    'max_features': ['sqrt', 'log2'],
                    'max_depth': UniformLogSpace(base=2, lo=2, hi=9)
                },
                X_val, y_val, k_idx)

        else:  # method == 'svm' (validated above)
            search_sklearn(
                'linear-svm', 'linear SVM',
                SVC(kernel='linear', probability=True),
                {'C': UniformLogSpace()},
                X_val, y_val, k_idx)

            if skip_nonlinear_svm:
                continue  # skip

            nonlinear_svm_param_dist = {
                'C': UniformLogSpace(),
                'gamma': UniformLogSpace(base=10, lo=-5, hi=1)
            }

            search_sklearn(
                'poly-svm', 'polynomial SVM',
                SVC(kernel='poly', probability=True),
                nonlinear_svm_param_dist,
                X_val, y_val, k_idx)

            search_sklearn(
                'rbf-svm', 'RBF SVM',
                SVC(kernel='rbf', probability=True),
                nonlinear_svm_param_dist,
                X_val, y_val, k_idx)

    # save only the methods that were actually searched
    for method_name, sub_param_dict in params.items():
        if len(sub_param_dict) > 0:
            pickle_object(
                sub_param_dict,
                '{}/{}_{}_{}_param.pkl'.format(CACHE_DIR, method_name, data_fn,
                                               prop_missing))

    print('Finished parameter search for method: {}'.format(method))
Example #12
0
def run(data_fn, method='lrfc', prop_missing=0.0, k=10, 
    skip_nonlinear_svm=False, nb_searches=20, max_nb_samples=10000):
    """Run a per-fold randomized hyperparameter search and cache the results.

    Every one of the ``k`` partitions contributes its validation split
    (from ``emr.get_k_fold_partition``, truncated to ``max_nb_samples``) to a
    random search for the estimators selected by ``method``. Best parameters
    are collected per fold index and written with ``pickle_object`` to
    ``CACHE_DIR/<method_name>_<data_fn>_<prop_missing>_param.pkl``.

    Args:
        data_fn: data file name inside ``DATA_DIR``; names containing
            'dummy' or 'debug' force ``nb_searches`` down to 3.
        method: one of 'riddle', 'lrfc' or 'svm'.
        prop_missing: passed to ``get_base_data``; part of the cache name.
        k: number of partitions.
        skip_nonlinear_svm: restrict 'svm' to the linear kernel.
        nb_searches: random-search iterations per estimator.
        max_nb_samples: maximum validation samples per fold, or ``None``
            for no limit.

    Raises:
        ValueError: if ``method`` is not a supported name.
    """
    # Validate up front so a typo does not cost a full data load first.
    supported_methods = ('riddle', 'lrfc', 'svm')
    if method not in supported_methods:
        raise ValueError('unknown method: {}'.format(method))

    if 'dummy' in data_fn or 'debug' in data_fn: nb_searches = 3
    data_path = '{}/{}'.format(DATA_DIR, data_fn)

    if not FORCE_RUN: # check if already did param search, if so, skip 
        if method == 'riddle':
            needed = ['riddle']
        elif method == 'lrfc':
            needed = ['logit', 'rfc']
        elif skip_nonlinear_svm:
            # poly/rbf results are never produced in this mode, so they must
            # not be required. NOTE(review): assumes already_done() checks
            # for saved results of each listed name -- confirm.
            needed = ['linear-svm']
        else:
            needed = ['linear-svm', 'poly-svm', 'rbf-svm']

        if already_done(needed, data_fn, prop_missing):
            eprint('Already did parameter search for {}'.format(method))
            return

    params = {'riddle': {}, 'logit': {}, 'rfc': {}, 'linear-svm': {},
        'poly-svm': {}, 'rbf-svm': {}}
    X, y, perm_indices, nb_features, nb_classes = get_base_data(data_path, 
        prop_missing)

    def tune(key, description, estimator, dist, X_fold, y_fold, fold):
        # One randomized search for a sklearn estimator on a single fold;
        # records the best parameters and prints how long the search took.
        began = time.time()
        params[key][fold] = parameter_search(X_fold, y_fold,
            estimator=estimator, search=RandomizedSearchCV,
            dist_or_grid=dist, n_iter=nb_searches, scoring=loss_scorer)
        print('Best parameters for {}: {} found in {:.3f} s'
            .format(description, params[key][fold], time.time() - began))

    for k_idx in range(0, k):
        print('-' * 72)
        print('Partition k = {}'.format(k_idx))

        data_partition_dict = emr.get_k_fold_partition(X, y, k_idx=k_idx, k=k, 
            perm_indices=perm_indices)

        X_train = data_partition_dict['X_train']
        y_train = data_partition_dict['y_train']

        X_val = data_partition_dict['X_val']
        y_val = data_partition_dict['y_val']

        # cap number of validation samples
        if max_nb_samples is not None and len(X_val) > max_nb_samples:
            X_val = X_val[0:max_nb_samples]
            y_val = y_val[0:max_nb_samples]

        if method != 'riddle':
            # feature selection sees train + (capped) validation data
            selected_feat_indices = select_feats(X_train + X_val, y_train + y_val,
                nb_features=nb_features)
            X_val, y_val = preproc_for_sklearn(X_val, y_val, 
                nb_features=nb_features)

            X_val = X_val[:, selected_feat_indices]

        if method == 'riddle':
            start = time.time()
            model_module = models.deep_mlp
            riddle_param_dist = {'learning_rate': UniformLogSpace(10, lo=-6, hi=-1)}
            params['riddle'][k_idx] = parameter_tuning.random_search(model_module, 
                riddle_param_dist, X_val, y_val, nb_features=nb_features, 
                nb_classes=nb_classes, k=3, 
                process_X_data_func_args={'nb_features': nb_features}, 
                process_y_data_func_args={'nb_classes': nb_classes},
                nb_searches=nb_searches)
            print('Best parameters for RIDDLE: {} found in {:.3f} s'
                .format(params['riddle'][k_idx], time.time() - start))

        elif method == 'lrfc':
            # logistic regression
            tune('logit', 'logistic regression',
                LogisticRegression(multi_class='multinomial', solver='lbfgs'),
                {'C': UniformLogSpace()}, X_val, y_val, k_idx)

            # random forest classifier
            tune('rfc', 'random forest', RandomForestClassifier(),
                {'max_features': ['sqrt', 'log2'],
                 'max_depth': UniformLogSpace(base=2, lo=2, hi=9)},
                X_val, y_val, k_idx)

        else: # method == 'svm', guaranteed by the validation above
            # linear SVM
            tune('linear-svm', 'linear SVM',
                SVC(kernel='linear', probability=True),
                {'C': UniformLogSpace()}, X_val, y_val, k_idx)

            if skip_nonlinear_svm: continue # skip

            # the same distributions are shared by both non-linear kernels
            nonlinear_svm_param_dist = {'C': UniformLogSpace(), 
                'gamma': UniformLogSpace(base=10, lo=-5, hi=1)}

            # polynomial SVM
            tune('poly-svm', 'polynomial SVM',
                SVC(kernel='poly', probability=True),
                nonlinear_svm_param_dist, X_val, y_val, k_idx)

            # RBF SVM
            tune('rbf-svm', 'RBF SVM',
                SVC(kernel='rbf', probability=True),
                nonlinear_svm_param_dist, X_val, y_val, k_idx)

    # save (only methods that produced results)
    for method_name, sub_param_dict in params.items():
        if len(sub_param_dict) > 0:
            pickle_object(sub_param_dict, '{}/{}_{}_{}_param.pkl'.format(
                CACHE_DIR, method_name, data_fn, prop_missing))

    print('Finished parameter search for method: {}'.format(method))