def main():
    """
    Build the guard-usability table for the top 368 client ASes.

    Reads the IP-to-AS map and the guard-to-bandwidth table, maps every
    guard fingerprint to the AS of its address, and then asks
    ``denasa.make_guard_usability_dict`` for a usability table per client
    AS.  The result, keyed by client AS, is written to
    ``client_to_guard_usability.json`` in the current directory.
    """
    paths_filename = "as_paths.txt"
    index_filename = "as_paths_index.bin"
    libspookyhash_filename = "./libspookyhash.so"

    # Use context managers so every input file is closed deterministically.
    with open("../guard_info/ip_to_as.json") as ip_to_as_file:
        ip_to_as = json.load(ip_to_as_file)

    # NOTE(review): pickle.load can execute arbitrary code; this file must
    # come from a trusted source.
    with open("../guard_info/guard_to_bw.pickle", "rb") as guard_bw_file:
        guard_to_bw = pickle.load(guard_bw_file)

    # Only the guard objects (the keys) are needed here, not the bandwidths.
    fp_to_as = {g.fingerprint: ip_to_as[g.address] for g in guard_to_bw}

    pfi_instance = pfi.PFI(libspookyhash_filename, paths_filename,
                           index_filename)

    pfi_instance.load()
    pfi_instance.verify()

    # One AS per line; surrounding whitespace/newlines are stripped.
    with open("../data/top368client.txt", 'r') as client_file:
        client_as_lst = [line.strip() for line in client_file]

    client_to_guard_usability = {}
    for client_as in client_as_lst:
        # Progress output: one line per client AS being processed.
        print(f'{client_as}')
        client_to_guard_usability[client_as] = (
            denasa.make_guard_usability_dict(client_as, fp_to_as,
                                             pfi_instance))

    with open("client_to_guard_usability.json", "w") as out_file:
        json.dump(client_to_guard_usability, out_file)
import numpy as np
import argparse
import pfi
import json
import pickle
import copy
import relays
import matplotlib.pyplot as plt

# Initialize the file locations used to build the PFI instance.
paths_filename = "as_paths.txt"
index_filename = "as_paths_index.bin"
libspookyhash_filename = "../denasa/libspookyhash.so"

pfi_instance = pfi.PFI(libspookyhash_filename, paths_filename, index_filename)

pfi_instance.load()
pfi_instance.verify()

# Context managers make sure each input file is closed once it is read.
with open("../guard_info/ip_to_as.json") as _ip_to_as_file:
    ip_to_as = json.load(_ip_to_as_file)

# One AS per line; surrounding whitespace/newlines are stripped.
with open("../data/relay_ases.txt", 'r') as _relay_ases_file:
    all_ases = [line.strip() for line in _relay_ases_file]

# NOTE(review): pickle.load can execute arbitrary code; this file must come
# from a trusted source.
with open("../guard_info/guard_to_bw.pickle", "rb") as _guard_bw_file:
    guard_to_bw = pickle.load(_guard_bw_file)

# Guard fingerprint -> bandwidth.
fp_to_bw = {g.fingerprint: bw for (g, bw) in guard_to_bw.items()}
fp_to_as = {
    g.fingerprint: ip_to_as[g.address]
    for (g, bw) in guard_to_bw.items()
Ejemplo n.º 3
0
def run(args):
    """Train an RF and an NN classifier and compute feature importance.

    Loads the train/validation tables from ``DATAPATH_TR`` / ``DATAPATH_VL``
    (comma-separated; column 0 is used as the target, the remaining columns
    as features), optionally keeps a random subset of feature columns, then:

    1. fits a random forest, logs accuracy and f1 scores, saves its built-in
       feature importance (csv + png) and its permutation feature importance
       (PFI) plots and dump;
    2. fits a Keras classifier on one-hot targets, logs the validation score
       and f1 scores, and saves its PFI plots and dump.

    All outputs go to ``results_aacr_{APP}_ff_cor{corr_th}`` under
    ``file_path``.

    Args:
        args: namespace providing ``n_shuffles``, ``corr_th``, ``epoch``,
            ``batch``, ``max_cols`` and ``bootstrap_cols``
            (``bootstrap_cols <= -1`` keeps all feature columns).
    """
    # TODO: log out the args
    print(args)
    n_shuffles = args.n_shuffles
    corr_th = args.corr_th
    epoch = args.epoch
    batch = args.batch
    max_cols = args.max_cols

    # Create necessary dirs
    OUTDIR = os.path.join(file_path, f'results_aacr_{APP}_ff_cor{corr_th}')
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    logger = set_logger(
        filename=os.path.join(OUTDIR, f'{APP}_main_logfile.log'))

    # ==========  Load data  ==========
    print('\nLoad TC data ...')

    # ---------- Load data ----------
    data_train = pd.read_csv(DATAPATH_TR, sep=',')
    data_val = pd.read_csv(DATAPATH_VL, sep=',')
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape   {data_val.shape}')

    # NOTE: an earlier revision subset undefined `train` / `test` frames to
    # LINCS genes at this point, which raised UnboundLocalError on every call
    # and whose result was never used downstream; it has been removed.

    if args.bootstrap_cols > -1:
        # Keep the target column and a reproducible random subset of the
        # feature columns; the validation set is aligned to the same columns.
        y_tmp = data_train.iloc[:, 0]
        x_tmp = data_train.iloc[:, 1:].sample(n=args.bootstrap_cols,
                                              axis=1,
                                              random_state=SEED)
        data_train = pd.concat([y_tmp, x_tmp], axis=1)
        data_val = data_val[data_train.columns]
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape   {data_val.shape}')

    # ==========  RF classifier  ==========
    logger.info('RF classifier ...')
    logger.info('-----------------')

    # ---------- Get the data ----------
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    logger.info(f'xtr.shape {xtr.shape}')
    logger.info(f'xvl.shape {xvl.shape}')
    logger.info(f'ytr.shape {ytr.shape}')
    logger.info(f'yvl.shape {yvl.shape}')

    # ---------- Train RF classifier ----------
    logger.info(f'Train RF Classifier ...')
    rf_model = RandomForestClassifier(n_estimators=200,
                                      max_features='sqrt',
                                      random_state=SEED)
    rf_model.fit(xtr, ytr)
    logger.info(
        f'Prediction score (mean accuracy): {rf_model.score(xvl, yvl):.4f}')

    yvl_preds = rf_model.predict(xvl)
    print('true', yvl[:10].values)
    print('pred', yvl_preds[:10])
    logger.info('f1_score micro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')))
    logger.info('f1_score macro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')))

    # TODO: finish this ...
    # df_conf = utils.plot_confusion_matrix(y_true=yvl, y_pred=yvl_preds, labels=y_enc['type'].values,
    #                                       title=f'{APP}_confusion', savefig=True, img_name=f'{APP}_confusion')

    # ---------- Feature importance ----------
    # Plot RF FI
    indices, fig = utils.plot_rf_fi(rf_model,
                                    columns=features,
                                    max_cols=max_cols,
                                    title='RF Classifier (FI using MDI)')
    rf_fi = utils.get_rf_fi(rf_model, columns=features)
    rf_fi.to_csv(os.path.join(OUTDIR, f'{APP}_rf_fi.csv'), index=False)
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_fi.png'), bbox_inches='tight')

    # PFI
    logger.info('Compute PFI ...')
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=True)
    print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols=max_cols,
                             title='RF Classifier (PFI var)',
                             ylabel='Importance (relative)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_var.png'),
                bbox_inches='tight')
    fig = fi_obj.plot_score_fi(max_cols=max_cols,
                               title='RF Classifier (PFI MDA: f1-score)',
                               ylabel='Importance (score decrease)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_score.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}_rf')

    # ==========  NN classifier  ==========
    logger.info('                 ')
    logger.info('NN classifier ...')
    logger.info('-----------------')

    # ---------- Get the data ----------
    # Re-extract fresh copies so the NN section starts from the raw targets.
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    logger.info(f'xtr.shape {xtr.shape}')
    logger.info(f'xvl.shape {xvl.shape}')
    logger.info(f'ytr.shape {ytr.shape}')
    logger.info(f'yvl.shape {yvl.shape}')

    # One-hot encode the targets; the class count comes from the training set.
    n_classes = len(np.unique(ytr))
    ytr = keras.utils.to_categorical(ytr, num_classes=n_classes)
    yvl = keras.utils.to_categorical(yvl, num_classes=n_classes)

    # ---------- Train NN classifier ----------
    logger.info('Training NN Classifier...')
    keras_model = create_nn_classifier(n_features=xtr.shape[1],
                                       n_classes=n_classes)
    history = keras_model.fit(xtr,
                              ytr,
                              epochs=epoch,
                              batch_size=batch,
                              verbose=0)
    # utils.plot_keras_learning(history, figsize = (10, 8), savefig=True,
    #                           img_name=os.path.join(OUTDIR, 'learning_with_lr'))
    # Last element of evaluate()'s return value is logged as the score.
    score = keras_model.evaluate(xvl, yvl,
                                 verbose=False)[-1]  # compute the val loss
    logger.info('Prediction score (val loss): {:.4f}'.format(score))

    yvl_preds = keras_model.predict(xvl)
    print('true', np.argmax(yvl[:10], axis=1))
    print('pred', np.argmax(yvl_preds[:10, :], axis=1))
    logger.info('f1_score micro: {:.3f}'.format(
        f1_score(y_true=np.argmax(yvl, axis=1),
                 y_pred=np.argmax(yvl_preds, axis=1),
                 average='micro')))
    logger.info('f1_score macro: {:.3f}'.format(
        f1_score(y_true=np.argmax(yvl, axis=1),
                 y_pred=np.argmax(yvl_preds, axis=1),
                 average='macro')))

    # ---------- Feature importance ----------
    # PFI
    # NOTE(review): ydata is passed one-hot encoded here — confirm that
    # pfi.PFI handles that for Keras models.
    logger.info('Compute PFI ...')
    t0 = time.time()
    fi_obj = pfi.PFI(model=keras_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=True)
    print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols=max_cols,
                             title='NN Classifier (PFI var)',
                             ylabel='Importance (relative)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_var.png'),
                bbox_inches='tight')
    fig = fi_obj.plot_score_fi(max_cols=max_cols,
                               title='NN Classifier (PFI MDA: f1-score)',
                               ylabel='Importance (score decrease)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_score.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}_nn')
Ejemplo n.º 4
0
def run(args):
    """Train an RF and an NN classifier and compute feature importance.

    Loads the label encoding (``YENC_PATH``) and the train/validation tables
    (``DATAPATH_TR`` / ``DATAPATH_VL``, tab-separated; column 0 is used as
    the target and the remaining columns as features), optionally keeps a
    random subset of feature columns, and then for each of a random forest
    and a Keras classifier: fits the model, logs accuracy / f1 scores, saves
    a confusion matrix, computes permutation feature importance (PFI) on the
    validation set, and saves the PFI plots and dump.

    All outputs go to ``results_{APP}_cor{corr_th}`` under ``file_path``.

    Args:
        args: namespace providing ``n_shuffles``, ``corr_th``, ``epoch``,
            ``batch``, ``max_cols_plot`` and ``bootstrap_cols``
            (``bootstrap_cols <= -1`` keeps all feature columns).
    """
    # TODO: log out the args
    print(f'\n{args}')
    n_shuffles = args.n_shuffles
    corr_th = args.corr_th
    epoch = args.epoch
    batch = args.batch
    max_cols_plot = args.max_cols_plot

    # Create necessary dirs
    # dataset = DATAPATH_TR.split('_')[-1]  # TODO: clean/fix
    OUTDIR = os.path.join(file_path, f'results_{APP}_cor{corr_th}')
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    logger = set_logger(
        filename=os.path.join(OUTDIR, f'{APP}_main_logfile.log'))

    # ==========  Load data  ==========
    print('\n======= Load TC data =======')
    y_enc = pd.read_csv(YENC_PATH, sep='\t')
    data_train = pd.read_csv(DATAPATH_TR, sep='\t')
    data_val = pd.read_csv(DATAPATH_VL, sep='\t')
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape   {data_val.shape}')

    if args.bootstrap_cols > -1:
        # Keep the target column plus a reproducible random subset of the
        # feature columns; align the validation set to the same columns.
        y_tmp = data_train.iloc[:, 0]
        x_tmp = data_train.iloc[:, 1:].sample(n=args.bootstrap_cols,
                                              axis=1,
                                              random_state=SEED)
        data_train = pd.concat([y_tmp, x_tmp], axis=1)
        data_val = data_val[data_train.columns]
    print(f'\ndata_train.shape {data_train.shape}')
    print(f'data_val.shape   {data_val.shape}')

    # Compute corr matrix
    # cor = utils.compute_cor_mat(xvl, zero_diag=True, decimals=5)
    # fig = utils.plot_cor_heatmap(cor)
    # fig.savefig(os.path.join(OUTDIR, f'{APP}_feature_corr.png'), bbox_inches='tight')

    # Disabled k-fold cross-validation variant (kept for reference).
    # # k-fold scheme
    # kfolds = 5
    # if kfolds == 1:
    #     skf = StratifiedShuffleSplit(n_splits=kfolds, test_size=0.2, random_state=SEED)
    # else:
    #     skf = StratifiedKFold(n_splits=kfolds, shuffle=False, random_state=SEED)

    # # Run k-fold CV
    # best_model = None
    # best_model_id = 0
    # best_score = 0
    # df_scores = pd.DataFrame(index=range(kfolds), columns=['kfold', 'f1_micro', 'f1_macro'])

    # for f, (train_idx, val_idx) in enumerate(skf.split(xdata, ydata)):
    #     print(f'\nFold {f + 1}/{kfolds} ...\n')

    #     print('train_idx', train_idx)
    #     print('val_idx', val_idx)

    #     # Split data
    #     xtr, xvl = xdata[train_idx], xdata[val_idx]
    #     ytr, yvl = ydata[train_idx], ydata[val_idx]

    #     rf_model = RandomForestClassifier(n_estimators=150, max_features='sqrt', random_state=SEED)  # min_samples_split=3,
    #     rf_model.fit(xtr, ytr)
    #     score = rf_model.score(xvl, yvl)
    #     print(f'Prediction score (mean accuracy): {score:.4f}')

    #     yvl_preds = rf_model.predict(xvl)
    #     print('true', yvl[:7])
    #     print('pred', yvl_preds[:7])
    #     print(f'f1_score micro: {f1_score(y_true=yvl, y_pred=yvl_preds, average='micro'):.3f}')
    #     print(f'f1_score macro: {f1_score(y_true=yvl, y_pred=yvl_preds, average='macro'):.3f}')
    #     tmp_df = pd.DataFrame({'yvl': yvl, 'yvl_preds': yvl_preds})
    #     tmp_df.to_csv(os.path.join(OUTDIR, f'preds_cv_{f}.csv'), index=False)

    #     # Plot feature importance
    #     indices, fig = utils.plot_rf_fi(rf_model, n_features_toplot=15, title='FI RF Classifier')
    #     fi = utils.get_rf_fi(rf_model)
    #     fi.to_csv(os.path.join(OUTDIR, 'rf_classifier_fi.csv'), index=False)
    #     fig.savefig(os.path.join(OUTDIR, 'rf_classifier_fi.png'), bbox_inches='tight')

    #     # Compute scores
    #     df_scores.loc[f, 'kfold'] = f + 1
    #     df_scores.loc[f, 'f1_micro'] = f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')
    #     df_scores.loc[f, 'f1_macro'] = f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')

    #     # Save best model
    #     ## if val_scores.iloc[f, 0] < best_score:
    #     if best_score < df_scores.loc[f, 'f1_micro']:
    #         best_score = df_scores.loc[f, 'f1_micro']
    #         best_model = rf_model
    #         best_model_id = f

    # print(df_scores)
    # model = best_model

    # ==========  RF classifier  ==========
    logger.info('------- Data for RF Classifier -------')
    # Targets are converted to plain arrays (.values) for the RF section.
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy().values
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy().values
    features = xtr.columns
    print(f'\nnp.unique(ytr): {np.unique(ytr)}')
    logger.info(f'xtr.shape {xtr.shape}')
    logger.info(f'xvl.shape {xvl.shape}')
    logger.info(f'ytr.shape {ytr.shape}')
    logger.info(f'yvl.shape {yvl.shape}')

    # ---------- Train RF classifier ----------
    logger.info('------- Train RF Classifier -------')
    rf_model = RandomForestClassifier(n_estimators=200,
                                      min_samples_leaf=5,
                                      max_features='sqrt',
                                      random_state=SEED)
    rf_model.fit(xtr, ytr)
    logger.info(
        f'Prediction score (mean accuracy): {rf_model.score(xvl, yvl):.4f}')

    yvl_preds = rf_model.predict(xvl)
    #print('true', yvl[:10].values)
    print('true', yvl[:10])
    print('pred', yvl_preds[:10])
    logger.info('f1_score micro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')))
    logger.info('f1_score macro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')))

    # Confusion matrix, labelled with the 'label' column of the encoding table.
    utils.plot_confusion_matrix(y_true=yvl,
                                y_pred=yvl_preds,
                                labels=y_enc['label'].values,
                                title=f'{APP}_confusion_rf',
                                savefig=True,
                                img_name=os.path.join(
                                    OUTDIR, f'{APP}_confusion_rf.png'))

    # ---------- MDI and PFI from RF ----------
    print('\n------- MDI and PFI from RF classifier -------')
    # Plot RF FI
    indices, fig = utils.plot_rf_fi(rf_model,
                                    columns=features,
                                    max_cols_plot=max_cols_plot,
                                    title='RF Classifier (FI using MDI)',
                                    errorbars=False,
                                    plot_direction='v',
                                    color='darkorange')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_fi.png'), bbox_inches='tight')
    rf_fi = utils.get_rf_fi(rf_model, columns=features)
    rf_fi.to_csv(os.path.join(OUTDIR, f'{APP}_rf_fi.csv'), index=False)

    # PFI (computed on the validation set; wall-clock time is printed)
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     y_enc=y_enc,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=True)
    print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols_plot=max_cols_plot,
                             title='RF Classifier (PFI var)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_var.png'),
                bbox_inches='tight')

    fig = fi_obj.plot_score_fi(max_cols_plot=max_cols_plot,
                               title='RF Classifier (PFI MDA: f1-score)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_score.png'),
                bbox_inches='tight')

    fig = fi_obj.plot_fimap(figsize=(20, 7),
                            n_top_cols=10,
                            title='RF PFI Map',
                            drop_correlated=True)
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_map.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}_rf')

    # ==========  NN classifier  ==========
    logger.info('                 ')
    logger.info('------- Data for NN Classifier -------')
    # Re-extract fresh copies; yvl was reduced to an array in the RF section.
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    print(f'\nnp.unique(ytr): {np.unique(ytr)}')
    logger.info(f'xtr.shape {xtr.shape}')
    logger.info(f'xvl.shape {xvl.shape}')
    logger.info(f'ytr.shape {ytr.shape}')
    logger.info(f'yvl.shape {yvl.shape}')

    # One-hot encode the targets; the class count comes from the training set.
    n_classes = len(np.unique(ytr))
    ytr = keras.utils.to_categorical(ytr, num_classes=n_classes)
    yvl = keras.utils.to_categorical(yvl, num_classes=n_classes)

    # ---------- Train NN classifier ----------
    logger.info('------- Train NN Classifier -------')
    keras_model = create_nn_classifier(n_features=xtr.shape[1],
                                       n_classes=n_classes)

    # callback_list = [EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0,
    #                                mode='auto', baseline=None, restore_best_weights=True)]
    # Both callbacks monitor 'val_loss': one reduces the learning rate on a
    # plateau, the other checkpoints the best model into OUTDIR.
    callback_list = [
        ReduceLROnPlateau(monitor='val_loss',
                          factor=0.1,
                          patience=10,
                          verbose=1,
                          mode='auto',
                          min_delta=0.0001,
                          cooldown=0,
                          min_lr=0),
        ModelCheckpoint(filepath=os.path.join(OUTDIR, f'{APP}_nn_model'),
                        monitor='val_loss',
                        verbose=0,
                        save_best_only=True,
                        save_weights_only=False,
                        mode='auto',
                        period=1)
    ]

    history = keras_model.fit(xtr,
                              ytr,
                              validation_data=(xvl, yvl),
                              epochs=epoch,
                              batch_size=batch,
                              verbose=1,
                              callbacks=callback_list)
    # utils.plot_keras_learning(history, figsize = (10, 8), savefig=True,
    #                           img_name=os.path.join(OUTDIR, 'learning_with_lr'))

    # Last element of evaluate()'s return value is logged as the score.
    score = keras_model.evaluate(xvl, yvl,
                                 verbose=False)[-1]  # compute the val loss
    logger.info('Prediction score (val loss): {:.4f}'.format(score))

    yvl_preds = keras_model.predict(xvl)
    print('true', np.argmax(yvl[:10], axis=1))
    print('pred', np.argmax(yvl_preds[:10, :], axis=1))
    logger.info('f1_score micro: {:.3f}'.format(
        f1_score(y_true=np.argmax(yvl, axis=1),
                 y_pred=np.argmax(yvl_preds, axis=1),
                 average='micro')))
    logger.info('f1_score macro: {:.3f}'.format(
        f1_score(y_true=np.argmax(yvl, axis=1),
                 y_pred=np.argmax(yvl_preds, axis=1),
                 average='macro')))

    # Reshape target (required for confusion matrix and PFI)
    if yvl_preds.ndim > 1 and yvl_preds.shape[
            1] > 1:  # if classification, get the class label
        yvl_preds = np.argmax(yvl_preds, axis=1)
    if yvl.ndim > 1 and yvl.shape[
            1] > 1:  # if classification, get the class label
        yvl = np.argmax(yvl, axis=1)

    utils.plot_confusion_matrix(y_true=yvl,
                                y_pred=yvl_preds,
                                labels=y_enc['label'].values,
                                title=f'{APP}_confusion_nn',
                                savefig=True,
                                img_name=os.path.join(
                                    OUTDIR, f'{APP}_confusion_nn.png'))

    # PFI (yvl holds class indices at this point, not one-hot rows)
    t0 = time.time()
    fi_obj = pfi.PFI(model=keras_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     y_enc=y_enc,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=True)
    print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols_plot=max_cols_plot,
                             title='NN Classifier (PFI var)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_var.png'),
                bbox_inches='tight')

    fig = fi_obj.plot_score_fi(max_cols_plot=max_cols_plot,
                               title='NN Classifier (PFI MDA: f1-score)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_score.png'),
                bbox_inches='tight')

    fig = fi_obj.plot_fimap(figsize=(20, 7),
                            n_top_cols=10,
                            title='NN PFI Map',
                            drop_correlated=True)
    fig.savefig(os.path.join(OUTDIR, f'{APP}_nn_pfi_map.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}_nn')
Ejemplo n.º 5
0
def run():
    """Benchmark PFI run time over a grid of sample counts and column counts.

    Loads the table at ``DATAPATH`` (tab-separated; column 0 is the target),
    standardizes the features, makes a stratified 80/20 split, and then, for
    every combination of four validation-row counts and four column counts,
    fits a random forest and times ``compute_pfi`` on the reduced validation
    set.  The timings are written to ``tt.csv`` in the output directory.
    """
    shuffle_count = 20
    corr_threshold = 1

    # Output directory for the timing table
    out_dir = os.path.join(
        file_path, f'results_aacr_{APP}_cor{corr_threshold}_runtime')
    utils.make_dir(out_dir)

    print('\nLoad TC data ...')

    # Split the raw table into target (first column) and features (the rest)
    frame = pd.read_csv(DATAPATH, sep='\t')
    raw_features = frame.iloc[:, 1:].copy()
    ydata = frame.iloc[:, 0].copy()
    column_names = raw_features.columns

    # Standardize, then restore the column names lost by fit_transform
    xdata = pd.DataFrame(StandardScaler().fit_transform(raw_features),
                         columns=column_names)

    xtr, xvl, ytr, yvl = train_test_split(xdata,
                                          ydata,
                                          test_size=0.2,
                                          random_state=SEED,
                                          shuffle=True,
                                          stratify=ydata)
    print('\nxtr.shape', xtr.shape)
    print('xvl.shape', xvl.shape)

    print('\nCompute PFI ...')
    # Four evenly spaced sizes from a quarter of the validation set up to all
    val_rows, val_cols = xvl.shape[0], xvl.shape[1]
    n_samples = np.linspace(start=int(val_rows/4), stop=val_rows,
                            num=4, dtype=int)
    n_cols = np.linspace(start=int(val_cols/4), stop=val_cols,
                         num=4, dtype=int)
    print(n_samples)
    print(n_cols)

    timing_columns = ['n_samples', 'n_cols', 'time (sec)', 'time (min)']
    tt = pd.DataFrame(index=range(len(n_samples) * len(n_cols)),
                      columns=timing_columns)

    run_started = time.time()
    # Flatten the (samples x columns) grid; row order matches nested loops
    grid = [(s, c) for s in n_samples for c in n_cols]
    for row, (s, c) in enumerate(grid):
        print(f'(n_samples, n_cols): ({s}, {c})')
        train_subset = xtr.iloc[:, :c]
        val_subset = xvl.iloc[:s, :c]
        val_target = yvl[:s]

        rf_model = RandomForestClassifier(n_estimators=150,
                                          max_features='sqrt',
                                          random_state=SEED)
        rf_model.fit(train_subset, ytr)

        fi_obj = pfi.PFI(model=rf_model,
                         xdata=val_subset,
                         ydata=val_target,
                         n_shuffles=shuffle_count,
                         outdir=out_dir)
        fi_obj.gen_col_sets(th=corr_threshold, toplot=False, verbose=False)

        # Only the PFI computation itself is timed
        pfi_started = time.time()
        fi_obj.compute_pfi(ml_type='c', verbose=False)
        elapsed = time.time()-pfi_started
        tt.loc[row, timing_columns] = np.array([s, c, elapsed, elapsed/60])

    tt.to_csv(os.path.join(out_dir, 'tt.csv'), index=False)
    print(f'\nTotal run time:  {(time.time()-run_started)/60} mins')
Ejemplo n.º 6
0
def run(args):
    """Train an RF classifier and compute its feature importance.

    Loads the table at ``DATAPATH`` (tab-separated; column 0 is used as the
    target and the remaining columns as features), optionally keeps a random
    subset of feature columns, standardizes the features, makes a stratified
    80/20 split, fits a random forest, prints accuracy / f1 scores, and saves
    the forest's built-in feature importance plot plus the permutation
    feature importance (PFI) plots and dump.

    All outputs go to ``results_aacr_{APP}_cor{corr_th}`` under
    ``file_path``.

    Args:
        args: namespace providing ``n_shuffles``, ``corr_th``, ``epoch``,
            ``batch``, ``max_cols`` and ``bootstrap_cols``
            (``bootstrap_cols <= -1`` keeps all feature columns).
            ``epoch`` and ``batch`` are read but not used in this function.
    """
    print(args)
    n_shuffles = args.n_shuffles
    corr_th = args.corr_th
    epoch = args.epoch
    batch = args.batch
    max_cols = args.max_cols

    # Create necessary dirs
    OUTDIR = os.path.join(file_path, f'results_aacr_{APP}_cor{corr_th}')
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    # ==========  RF classifier  ==========
    print('\nLoad NT data ...')

    # ---------- Load data ----------
    data = pd.read_csv(DATAPATH, sep='\t')
    xdata = data.iloc[:, 1:].copy()
    ydata = data.iloc[:, 0].copy()

    if args.bootstrap_cols > -1:
        xdata = xdata.sample(n=args.bootstrap_cols, axis=1,
                             random_state=SEED)  # Take a subset of cols
    features = xdata.columns

    print('data.shape', data.shape)
    print(data.iloc[:3, :4])

    print('\nxdata.shape', xdata.shape)
    print('np.unique(ydata)', np.unique(ydata))

    # NOTE(review): the scaler is fitted on the full data set before the
    # train/validation split below.
    scaler = StandardScaler()
    xdata = scaler.fit_transform(xdata)
    # fit_transform returns an array; restore the feature names.
    xdata = pd.DataFrame(xdata, columns=features)

    xtr, xvl, ytr, yvl = train_test_split(xdata,
                                          ydata,
                                          test_size=0.2,
                                          random_state=SEED,
                                          shuffle=True,
                                          stratify=ydata)

    # Disabled k-fold cross-validation variant (kept for reference).
    # # k-fold scheme
    # kfolds = 5
    # if kfolds == 1:
    #     skf = StratifiedShuffleSplit(n_splits=kfolds, test_size=0.2, random_state=SEED)
    # else:
    #     skf = StratifiedKFold(n_splits=kfolds, shuffle=False, random_state=SEED)

    # # Run k-fold CV
    # best_model = None
    # best_model_id = 0
    # best_score = 0
    # df_scores = pd.DataFrame(index=range(kfolds), columns=['kfold', 'f1_micro', 'f1_macro'])

    # for f, (train_idx, val_idx) in enumerate(skf.split(xdata, ydata)):
    #     print(f'\nFold {f + 1}/{kfolds} ...\n')

    #     print('train_idx', train_idx)
    #     print('val_idx', val_idx)

    #     # Split data
    #     xtr, xvl = xdata[train_idx], xdata[val_idx]
    #     ytr, yvl = ydata[train_idx], ydata[val_idx]

    #     rf_model = RandomForestClassifier(n_estimators=150, max_features='sqrt', random_state=SEED)  # min_samples_split=3,
    #     rf_model.fit(xtr, ytr)
    #     score = rf_model.score(xvl, yvl)
    #     print(f'Prediction score (mean accuracy): {score:.4f}')

    #     yvl_preds = rf_model.predict(xvl)
    #     print('true', yvl[:7])
    #     print('pred', yvl_preds[:7])
    #     print(f'f1_score micro: {f1_score(y_true=yvl, y_pred=yvl_preds, average='micro'):.3f}')
    #     print(f'f1_score macro: {f1_score(y_true=yvl, y_pred=yvl_preds, average='macro'):.3f}')
    #     tmp_df = pd.DataFrame({'yvl': yvl, 'yvl_preds': yvl_preds})
    #     tmp_df.to_csv(os.path.join(OUTDIR, f'preds_cv_{f}.csv'), index=False)

    #     # Plot feature importance
    #     indices, fig = utils.plot_rf_fi(rf_model, n_features_toplot=15, title='FI RF Classifier')
    #     fi = utils.get_rf_fi(rf_model)
    #     fi.to_csv(os.path.join(OUTDIR, 'rf_classifier_fi.csv'), index=False)
    #     fig.savefig(os.path.join(OUTDIR, 'rf_classifier_fi.png'), bbox_inches='tight')

    #     # Compute scores
    #     df_scores.loc[f, 'kfold'] = f + 1
    #     df_scores.loc[f, 'f1_micro'] = f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')
    #     df_scores.loc[f, 'f1_macro'] = f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')

    #     # Save best model
    #     ## if val_scores.iloc[f, 0] < best_score:
    #     if best_score < df_scores.loc[f, 'f1_micro']:
    #         best_score = df_scores.loc[f, 'f1_micro']
    #         best_model = rf_model
    #         best_model_id = f

    # print(df_scores)
    # model = best_model

    # ---------- Train classifier ----------
    print('\nTrain RF Classifier ...')
    rf_model = RandomForestClassifier(n_estimators=200,
                                      max_features='sqrt',
                                      random_state=SEED)
    rf_model.fit(xtr, ytr)
    print(f'Prediction score (mean accuracy): {rf_model.score(xvl, yvl):.4f}')

    yvl_preds = rf_model.predict(xvl)
    print('true', yvl[:10].values)
    print('pred', yvl_preds[:10])
    print('f1_score micro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')))
    print('f1_score macro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')))

    # TODO: finish this ...
    # df_conf = utils.plot_confusion_matrix(y_true=yvl, y_pred=yvl_preds, labels=y_enc['type'].values,
    #                                       title=f'{APP}_confusion', savefig=True, img_name=f'{APP}_confusion')

    # Compute corr matrix
    # cor = utils.compute_cor_mat(xvl, zero_diag=True, decimals=5)
    # fig = utils.plot_cor_heatmap(cor)
    # fig.savefig(os.path.join(OUTDIR, f'{APP}_feature_corr.png'), bbox_inches='tight')

    # ---------- Feature importance from RF and PFI ----------
    # Plot RF FI
    indices, fig = utils.plot_rf_fi(rf_model,
                                    columns=features,
                                    max_cols=max_cols,
                                    title='RF Classifier (FI using MDI)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_fi.png'), bbox_inches='tight')

    # PFI (computed on the validation split; wall-clock time is printed)
    print('\nCompute PFI ...')
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c')
    # logger.info(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')
    print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(max_cols=max_cols,
                             title='RF Classifier (PFI var)',
                             ylabel='Importance (relative)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_var.png'),
                bbox_inches='tight')
    fig = fi_obj.plot_score_fi(max_cols=max_cols,
                               title='RF Classifier (PFI MDA: f1-score)',
                               ylabel='Importance (score decrease)')
    fig.savefig(os.path.join(OUTDIR, f'{APP}_rf_pfi_score.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name=f'{APP}')
Ejemplo n.º 7
0
def run(args):
    """Train RF models on the classification and regression data and compute
    feature importances (RF built-in "MDI" and PFI) for each.

    The function runs two near-identical pipelines back to back, first on the
    classification train/val tables and then on the regression train/val
    tables. For each pipeline it:

    1. reads the tab-separated train and validation files,
    2. treats column 0 as the target and the remaining columns as features,
    3. saves a feature-correlation heatmap computed on the validation features,
    4. fits a random forest on the training split and prints validation scores,
    5. saves the RF feature-importance plot,
    6. builds a ``pfi.PFI`` object on the validation split, computes PFI, and
       saves its plots and dump.

    All output files are written under the module-level ``OUTDIR``.

    Args:
        args: Object exposing the attributes ``n_shuffles``, ``corr_th``,
            ``epoch``, ``batch`` and ``max_cols`` (presumably an
            ``argparse.Namespace`` -- confirm against the caller).
            ``epoch`` and ``batch`` are only referenced by the commented-out
            NN classifier section below; ``max_cols`` is read but not used by
            any live statement in this part of the function.

    Returns:
        None. Results are communicated through printed output and files.
    """
    print(args)
    n_shuffles = args.n_shuffles
    corr_th = args.corr_th
    # NOTE(review): epoch/batch are consumed only by the disabled NN section,
    # and max_cols by no visible live code; kept so re-enabling that code works.
    epoch = args.epoch
    batch = args.batch
    max_cols = args.max_cols

    # Create necessary dirs
    utils.make_dir(OUTDIR)  # os.makedirs(OUTDIR, exist_ok=True)

    # ==========  Load classification data  ==========
    print('\n======== Load classification data ========')
    data_train = pd.read_csv(DATAPATH_CLASSIFICATION_TRAIN, sep='\t')
    data_val = pd.read_csv(DATAPATH_CLASSIFICATION_VAL, sep='\t')
    print('data_train.shape', data_train.shape)
    print('data_val.shape  ', data_val.shape)
    # Preview only the first 3 rows / 4 columns to keep the log short.
    print(f'\ndata_train:\n{data_train.iloc[:3, :4]}')
    print(f'\ndata_val:\n{data_val.iloc[:3, :4]}')

    # ==========  RF classifier  ==========
    # Column 0 is the target; every other column is a feature.
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    print(f'\nnp.unique(ytr): {np.unique(ytr)}')

    # Compute corr matrix
    # Note: computed on the validation features (xvl), not the training set.
    cor = utils.compute_cor_mat(xvl, zero_diag=True, decimals=5)
    fig = utils.plot_cor_heatmap(cor)
    fig.savefig(os.path.join(OUTDIR, 'feature_corr_classification.png'),
                bbox_inches='tight')

    # ---------- Train classifier ----------
    print('\n------- Train RF Classifier -------')
    rf_model = RandomForestClassifier(n_estimators=200,
                                      max_features='sqrt',
                                      random_state=SEED)
    rf_model.fit(xtr, ytr)
    print(f'Prediction score (mean accuracy): {rf_model.score(xvl, yvl):.4f}')

    # Report hard-label predictions and micro/macro F1 on the validation split.
    yvl_preds = rf_model.predict(xvl)
    print('true', yvl[:5].values)
    print('pred', yvl_preds[:5])
    print('f1_score micro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='micro')))
    print('f1_score macro: {:.3f}'.format(
        f1_score(y_true=yvl, y_pred=yvl_preds, average='macro')))

    # Class probabilities are only printed here; they are not used further
    # in the visible code.
    yvl_preds_p = rf_model.predict_proba(xvl)
    print(f'yvl_preds_p:\n{yvl_preds_p[:5]}')

    # Series.unique() keeps order of first appearance, so the label order
    # follows the validation data rather than being sorted.
    # NOTE(review): the title below is an f-string with no placeholders.
    utils.plot_confusion_matrix(y_true=yvl,
                                y_pred=yvl_preds,
                                labels=yvl.unique(),
                                title=f'RF Classifier (Confusion)',
                                savefig=True,
                                img_name=os.path.join(
                                    OUTDIR, 'rf_classifier_confusion.png'))

    # ---------- MDI and PFI from RF ----------
    print('\n------- MDI and PFI from RF classifier -------')
    # Plot RF FI
    # `indices` is returned by the helper but not used in the visible code.
    indices, fig = utils.plot_rf_fi(rf_model,
                                    columns=features,
                                    title='RF Classifier (FI using MDI)')
    fig.savefig(os.path.join(OUTDIR, 'rf_classifier_fi.png'),
                bbox_inches='tight')

    # PFI
    # PFI is evaluated on the validation split; t0 times the whole PFI step.
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     outdir=OUTDIR)
    # presumably builds column sets from feature correlation using corr_th as
    # the threshold -- confirm in pfi.PFI.gen_col_sets
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='c', verbose=False)  # 'c' = classification run
    print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(title='RF Classifier (PFI var)')
    fig.savefig(os.path.join(OUTDIR, 'rf_classifier_pfi_var.png'),
                bbox_inches='tight')

    fig = fi_obj.plot_score_fi(title='RF Classifier (PFI MDA: f1-score)')
    fig.savefig(os.path.join(OUTDIR, 'rf_classifier_pfi_score.png'),
                bbox_inches='tight')

    # The p-score plot is produced for the classifier only; the regressor
    # section below has no counterpart.
    fig = fi_obj.plot_score_fi_p(title='RF Classifier (PFI MDA: p-score)')
    fig.savefig(os.path.join(OUTDIR, 'rf_classifier_pfi_score_p.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name='rf_classifier')

    # ==========  NN classifier  ==========
    # NOTE: this whole section is disabled (commented out). It is the only
    # place in the visible code that uses `epoch` and `batch`.
    # print('\nLoad classification data ...')

    # ---------- Load data ----------
    # data = pd.read_csv(DATAPATH_CLASSIFICATION, sep='\t')
    # xdata = data.iloc[:, 1:].copy()
    # ydata = data.iloc[:, 0].copy()
    # features = xdata.columns

    # print('data.shape', data.shape)
    # print(data.iloc[:3, :4])

    # print('\nxdata.shape', xdata.shape)
    # print('np.unique(ydata)', np.unique(ydata))

    # n_classes = len(np.unique(ydata))
    # ydata = keras.utils.to_categorical(ydata, num_classes=n_classes)

    # scaler = StandardScaler()
    # xdata = scaler.fit_transform(xdata)
    # xdata = pd.DataFrame(xdata, columns=features)

    # xtr, xvl, ytr, yvl = train_test_split(xdata, ydata, test_size=0.2, random_state=SEED, shuffle=True, stratify=ydata)

    # n_classes = len(np.unique(ydata))
    # ytr = keras.utils.to_categorical(ytr, num_classes=n_classes)
    # yvl = keras.utils.to_categorical(yvl, num_classes=n_classes)

    # print('\nTrain NN Classifier ...')
    # keras_model = create_nn_classifier(n_features=xtr.shape[1], n_classes=n_classes)
    # history = keras_model.fit(xtr, ytr, epochs=epoch, batch_size=batch, verbose=0)
    # score = keras_model.evaluate(xvl, yvl, verbose=False)[-1]  # compute the val loss
    # print(f'Prediction score (val loss): {score:.4f}')

    # yvl_preds = keras_model.predict(xvl)
    # print('true', np.argmax(yvl[:10], axis=1))
    # print('pred', np.argmax(yvl_preds[:10, :], axis=1))
    # print('f1_score micro: {:.3f}'.format(f1_score(y_true=np.argmax(yvl, axis=1), y_pred=np.argmax(yvl_preds, axis=1), average='micro')))
    # print('f1_score macro: {:.3f}'.format(f1_score(y_true=np.argmax(yvl, axis=1), y_pred=np.argmax(yvl_preds, axis=1), average='macro')))

    # # ---------- Feature importance from RF and PFI ----------
    # # PFI
    # print('\nCompute PFI (NN classifier) ...')
    # t0 = time.time()
    # fi_obj = pfi.PFI(model=keras_model, xdata=xvl, ydata=yvl, n_shuffles=n_shuffles)
    # fi_obj.gen_col_sets(th=corr_th, toplot=False)
    # fi_obj.compute_pfi(ml_type='c', verbose=False)
    # print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # # Plot and save PFI
    # fig = fi_obj.plot_var_fi(title='NN Classifier (PFI var)', ylabel='Importance (relative)')
    # fig.savefig(os.path.join(OUTDIR, 'nn_classifier_pfi_var.png'), bbox_inches='tight')
    # fig = fi_obj.plot_score_fi(title='NN Classifier (PFI MDA: f1-score)', ylabel='Importance (score decrease)')
    # fig.savefig(os.path.join(OUTDIR, 'nn_classifier_pfi_score.png'), bbox_inches='tight')

    # # Dump results
    # fi_obj.dump(path=OUTDIR, name='nn_classifier')

    # ==========  Load regression data  ==========
    # From here on data_train/data_val, xtr/ytr/xvl/yvl, features, rf_model
    # and fi_obj are rebound to their regression counterparts.
    print('\n======== Load regression data ========')

    data_train = pd.read_csv(DATAPATH_REGRESSION_TRAIN, sep='\t')
    data_val = pd.read_csv(DATAPATH_REGRESSION_VAL, sep='\t')
    print('data_train.shape', data_train.shape)
    print('data_val.shape  ', data_val.shape)
    print(f'\ndata_train:\n{data_train.iloc[:3, :4]}')
    print(f'\ndata_val:\n{data_val.iloc[:3, :4]}')

    # ==========  RF regressor  ==========
    # Same layout as the classification data: column 0 is the target.
    xtr = data_train.iloc[:, 1:].copy()
    ytr = data_train.iloc[:, 0].copy()
    xvl = data_val.iloc[:, 1:].copy()
    yvl = data_val.iloc[:, 0].copy()
    features = xtr.columns
    print(f'\nnp.unique(ytr): {np.unique(ytr)}')

    # Compute corr matrix
    # As above, the correlation is computed on the validation features.
    cor = utils.compute_cor_mat(xvl, zero_diag=True, decimals=5)
    fig = utils.plot_cor_heatmap(cor)
    fig.savefig(os.path.join(OUTDIR, 'feature_corr_regression.png'),
                bbox_inches='tight')

    # ---------- Train regressor ----------
    print('\n------- Train RF Regressor -------')
    rf_model = RandomForestRegressor(n_estimators=150,
                                     min_samples_leaf=5,
                                     max_features='sqrt',
                                     random_state=SEED)
    rf_model.fit(xtr, ytr)
    score = rf_model.score(xvl, yvl)
    print(f'Prediction score (r_square): {score:.4f}')

    # ---------- Feature importance from RF and PFI ----------
    print('\n------- MDI and PFI from RF regressor -------')
    # Plot RF FI
    indices, fig = utils.plot_rf_fi(rf_model,
                                    columns=features,
                                    title='RF Regressor (FI using MDI)')
    fig.savefig(os.path.join(OUTDIR, 'rf_regressor_fi.png'),
                bbox_inches='tight')

    # PFI
    t0 = time.time()
    fi_obj = pfi.PFI(model=rf_model,
                     xdata=xvl,
                     ydata=yvl,
                     n_shuffles=n_shuffles,
                     outdir=OUTDIR)
    fi_obj.gen_col_sets(th=corr_th, toplot=False)
    fi_obj.compute_pfi(ml_type='r', verbose=False)  # 'r' = regression run
    print(f'Total PFI time:  {(time.time()-t0)/60:.3f} mins')

    # Plot and save PFI
    fig = fi_obj.plot_var_fi(title='RF Regressor (PFI var)')
    fig.savefig(os.path.join(OUTDIR, 'rf_regressor_pfi_var.png'),
                bbox_inches='tight')
    # NOTE(review): the title says "f1-score" although this is a regressor;
    # looks copied from the classifier section -- confirm which score
    # pfi.PFI uses for ml_type='r' before changing the label.
    fig = fi_obj.plot_score_fi(title='RF Regressor (PFI MDA: f1-score)')
    fig.savefig(os.path.join(OUTDIR, 'rf_regressor_pfi_score.png'),
                bbox_inches='tight')

    # Dump results
    fi_obj.dump(path=OUTDIR, name='rf_regressor')