コード例 #1
0
def import_dataset_METABRIC(norm_mode='standard'):
    """Load the METABRIC features and labels and build the loss masks.

    Reads ``metabric/cleaned_features_final.csv`` (covariates) and
    ``metabric/label.csv`` (columns ``event_time`` and ``label``) from the
    directory returned by ``utilmlab.get_data_dir()``.

    Args:
        norm_mode: passed through unchanged to ``f_get_Normalization``.

    Returns:
        A ``(DIM, DATA, MASK)`` triple: ``DIM`` is the number of feature
        columns, ``DATA`` is ``(data, time, label)`` and ``MASK`` is
        ``(mask1, mask2)`` as returned by ``f_get_fc_mask2`` and
        ``f_get_fc_mask3``.
    """
    root = utilmlab.get_data_dir()
    features_df = pd.read_csv(
        '{}/metabric/cleaned_features_final.csv'.format(root), sep=',')
    outcome_df = pd.read_csv('{}/metabric/label.csv'.format(root), sep=',')

    # Every column of the feature file is used as a covariate.
    data = f_get_Normalization(np.asarray(features_df), norm_mode)

    # Double brackets select a one-column DataFrame, so both arrays are 2-D.
    time = np.asarray(outcome_df[['event_time']])
    label = np.asarray(outcome_df[['label']])

    # Stretch the largest observed time by 20% to have enough time-horizon.
    num_Category = int(np.max(time) * 1.2)
    # Only events are counted: one distinct label value (censoring) is
    # excluded.
    num_Event = len(np.unique(label)) - 1

    mask1 = f_get_fc_mask2(time, label, num_Event, num_Category)
    mask2 = f_get_fc_mask3(time, -1, num_Category)

    x_dim = np.shape(data)[1]
    return x_dim, (data, time, label), (mask1, mask2)
コード例 #2
0
def import_dataset_SYNTHETIC(norm_mode='standard'):
    """Load the synthetic competing-risks CSV and build the loss masks.

    Reads ``synthetic/synthetic_comprisk.csv`` from the directory returned
    by ``utilmlab.get_data_dir()``. The ``label`` and ``time`` columns are
    the outcomes; the columns from position 4 onwards are the covariates.

    Args:
        norm_mode: passed through unchanged to ``f_get_Normalization``.

    Returns:
        A ``(DIM, DATA, MASK)`` triple: ``DIM`` is the number of feature
        columns, ``DATA`` is ``(data, time, label)`` and ``MASK`` is
        ``(mask1, mask2)`` as returned by ``f_get_fc_mask2`` and
        ``f_get_fc_mask3``.
    """
    csv_path = '{}/synthetic/synthetic_comprisk.csv'.format(
        utilmlab.get_data_dir())
    frame = pd.read_csv(csv_path, sep=',')

    # Double brackets select a one-column DataFrame, so both arrays are 2-D.
    label = np.asarray(frame[['label']])
    time = np.asarray(frame[['time']])
    data = f_get_Normalization(np.asarray(frame.iloc[:, 4:]), norm_mode)

    # Stretch the largest observed time by 20% to have enough time-horizon.
    num_Category = int(np.max(time) * 1.2)
    # Only events are counted: one distinct label value (censoring) is
    # excluded.
    num_Event = len(np.unique(label)) - 1

    mask1 = f_get_fc_mask2(time, label, num_Event, num_Category)
    mask2 = f_get_fc_mask3(time, -1, num_Category)

    x_dim = np.shape(data)[1]
    return x_dim, (data, time, label), (mask1, mask2)
コード例 #3
0
def get_dataset(dataset, nsample=0):
    """Load one of the named datasets into a DataFrame.

    Args:
        dataset: name of the dataset to load, or ``'show'`` to print the
            available dataset names and exit the process with status 0.
        nsample: when truthy, only the first ``nsample`` rows are kept.

    Returns:
        ``(0, info)`` on success, where ``info`` is a dict with the keys
        ``'df'`` (the DataFrame), ``'targets'`` (label column names),
        ``'features'`` (all remaining columns except dropped ones) and
        ``'features_not_scalable'`` (always an empty list here).
        ``(1, None)`` when the name is listed as available but no branch
        below knows how to load it.

    Raises:
        AssertionError: if a branch sets neither a file name nor a
            DataFrame (an internal invariant, not expected in practice).

    The process exits with status 1 when ``dataset`` is neither an
    available dataset nor ``'show'``.
    """
    fn = None
    labels = []
    features_not_scalable = []
    features_drop = []
    df = None
    # Query the available datasets once; the result is reused for the
    # membership test and for both messages below.
    dataset_lst = get_available_datasets()

    if dataset != 'show' and dataset not in dataset_lst:
        print('error: {} not available: ({})'.format(dataset, dataset_lst))
        sys.exit(1)

    if dataset == 'spam':
        fn = '{}/spam.csv'.format(utilmlab.get_data_dir())
    elif dataset == 'spambase':
        fn = '{}/spambase.csv.gz'.format(utilmlab.get_data_dir())
        labels = ['label']
    elif dataset == 'breastcancer' or dataset == 'bc':
        data = load_breast_cancer()  # get Breast Cancer Dataset
        df = pd.DataFrame(data.data, columns=data.feature_names)
        df['target'] = data.target
        labels = ['target']
    elif dataset == 'mnist':
        # Only the training images are used; each 28x28 image becomes one
        # row of 784 float columns. No label column is added.
        (data, _), _ = mnist.load_data()
        data = np.reshape(np.asarray(data), [60000, 28 * 28]).astype(float)
        df = pd.DataFrame(data, columns=range(28**2))
    elif dataset == 'cover':
        data = fetch_covtype()
        df = pd.DataFrame(data.data)
        df['target'] = data.target
        labels = ['target']
    elif dataset == 'news':
        fn = '{}/OnlineNewsPopularity.csv'.format(utilmlab.get_data_dir())
        # The leading space is part of the column name used here.
        labels = [' shares']
        features_drop = ['url']
    elif dataset == 'newsbin':
        fn = '{}/OnlineNewsPopularity.csv'.format(utilmlab.get_data_dir())
        labels = [' shares']
        features_drop = ['url']
        response_var = labels[0]
        df = pd.read_csv(fn)
        # Binarize the response: 0 up to and including 5000, 1 above.
        df[response_var] = [0 if el <= 5000 else 1 for el in df[response_var]]
    elif dataset == 'letter':
        fn = '{}/letter.csv'.format(utilmlab.get_data_dir())
    elif dataset == 'letter-recognition':
        # NOTE(review): ds2fn_d maps this name to letter.csv.gz, so the
        # availability check may look at a different file than the one
        # read here - confirm which path is intended.
        fn = '{}/letter-recognition.csv.gz'.format(utilmlab.get_data_dir())
        labels = ['lettr']
    elif dataset == 'creditcardfraud':
        fn = ds2fn_d[dataset]
        labels = ['Amount', 'Class']
    elif dataset == 'show':
        print('availabel datasets: {}'.format(dataset_lst))
        sys.exit(0)
    else:
        print('warning: unsupported dataset:{}'.format(dataset))
        return (1, None)

    if df is None:
        if fn is None:
            # Raised explicitly (instead of ``assert 0``) so the invariant
            # check is not stripped when Python runs with -O.
            raise AssertionError(
                'no file name or DataFrame for dataset {}'.format(dataset))
        df = pd.read_csv(fn)

    features = [
        el for el in list(df.columns)
        if el not in labels and el not in features_drop
    ]

    if nsample:
        df = df[:nsample]

    return (0, {
        'df': df,
        'targets': labels,
        'features': features,
        'features_not_scalable': features_not_scalable
    })
コード例 #4
0
import pandas as pd
import utilmlab
import sys
from sklearn.datasets import load_breast_cancer, fetch_covtype
from tensorflow.keras.datasets import mnist
import logging
import argparse
import os
import numpy as np

# Dataset name -> path of the file backing it. A value of None makes
# is_available() report the dataset as available without checking for a
# file. Insertion order matches the original literal.
ds2fn_d = {name: None for name in ('bc', 'cover', 'breastcancer', 'mnist')}
# NOTE(review): 'letter-recognition' points at letter.csv.gz here, while
# get_dataset() reads letter-recognition.csv.gz - confirm which is intended.
ds2fn_d.update(
    (name, '{}/{}'.format(utilmlab.get_data_dir(), rel_path))
    for name, rel_path in (
        ('spam', 'spam.csv'),
        ('spambase', 'spambase.csv.gz'),
        ('news', 'OnlineNewsPopularity.csv'),
        ('newsbin', 'OnlineNewsPopularity.csv'),
        ('letter', 'letter.csv'),
        ('letter-recognition', 'letter.csv.gz'),
        ('creditcardfraud', 'creditcard.csv'),
    )
)


def is_available(ds):
    """Report whether dataset ``ds`` can be used.

    Args:
        ds: dataset name, looked up in ``ds2fn_d``.

    Returns:
        bool: ``False`` when ``ds`` is not a key of ``ds2fn_d``; ``True``
        when its entry is ``None`` (no file to check); otherwise whether
        the mapped path is an existing regular file.
    """
    if ds not in ds2fn_d:
        # Return an explicit False rather than falling off the end with None.
        return False
    fn = ds2fn_d[ds]
    return fn is None or os.path.isfile(fn)


def get_available_datasets():
コード例 #5
0
import pandas as pd
import utilmlab
import sys
from sklearn.datasets import load_breast_cancer, fetch_covtype
import logging
import argparse
import os


# Dataset name -> path of the file backing it. A value of None makes
# is_available() report the dataset as available without checking for a
# file. Insertion order matches the original literal.
ds2fn_d = {name: None for name in ('bc', 'cover', 'breastcancer')}
ds2fn_d.update(
    (name, '{}/{}'.format(utilmlab.get_data_dir(), rel_path))
    for name, rel_path in (
        ('spam', 'spam.csv.gz'),
        ('spambase', 'spambase.csv.gz'),
        ('news', 'OnlineNewsPopularity.csv.gz'),
        ('newsbin', 'OnlineNewsPopularity.csv.gz'),
        ('letter', 'letter.csv.gz'),
        ('letter-recognition', 'letter.csv.gz'),
        ('creditcardfraud',
         'kaggle_creditcardfraud/creditcard_modified.csv'),
    )
)


def is_available(ds):
    """Report whether dataset ``ds`` can be used.

    Args:
        ds: dataset name, looked up in ``ds2fn_d``.

    Returns:
        bool: ``False`` when ``ds`` is not a key of ``ds2fn_d``; ``True``
        when its entry is ``None`` (no file to check); otherwise whether
        the mapped path is an existing regular file.
    """
    if ds not in ds2fn_d:
        # Return an explicit False rather than falling off the end with None.
        return False
    fn = ds2fn_d[ds]
    return fn is None or os.path.isfile(fn)


def get_available_datasets():
コード例 #6
0
           mean_confidence_interval(PEHE_train_np)[1],
           mean_confidence_interval(PEHE_test_np)[0],
           mean_confidence_interval(PEHE_test_np)[1]))
    return results_d


# Script entry point: parse the command-line options, run main() on the IHDP
# covariates file and, when -o is given, write the returned results as JSON.
if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description="Causal Multi-task Gaussian Processes")

    # NOTE(review): these options are only declared here; how they are
    # interpreted is decided inside main(), which is not visible in this
    # chunk.
    parser.add_argument("-n", "--num-exp", default=10, type=int)
    parser.add_argument("-m", "--mode", default="CMGP", type=str)
    parser.add_argument("-t", "--test-frac", default=0.1, type=float)
    # Optional output path; when omitted, args.o is None and nothing is
    # written.
    parser.add_argument("-o")

    args = parser.parse_args()

    fn_data = '{}/ihdp/ihdp_covariates.csv'.format(utilmlab.get_data_dir())

    if not os.path.isfile(fn_data):
        print('Error: this implementation requires the IHDP dataset'
              ', please refer to the README.md for more details.')
        # NOTE(review): the exit status is 0 although the dataset is
        # missing - presumably so automated runs are skipped rather than
        # failed; confirm before changing.
        sys.exit(0)

    results_d = main(args, fn_data)

    if args.o is not None:
        with open(args.o, 'w') as fp:
            json.dump(results_d, fp)
コード例 #7
0
    script_ana = Path('{}/alg/invase/invase_ana.py'.format(proj_dir))
    script_plot = Path('{}/alg/invase/invase_plot.py'.format(proj_dir))

    for dataset in ['csv', 'bc', 'spambase']:

        odir = '{}/dataset_{}'.format(resdir, dataset)

        utilmlab.ensure_dir(odir)

        fn_feature_score = '{}/feature_score.csv.gz'.format(odir)
        fn_json = '{}/feature_score.csv.json'.format(odir)
        fn_plot_sample = '{}/sample.png'.format(odir)
        fn_plot_global = '{}/global.png'.format(odir)

        if dataset == 'csv':
            fn_csv = '{}/spambase.csv.gz'.format(utilmlab.get_data_dir())
            utilmlab.exe_cmd(
                logger, '{} {} -i {} --target label --it {} -o {}'.format(
                    python_exe, script, fn_csv, nepoch, fn_feature_score))
        else:
            if not data_loader_mlab.is_available(dataset):
                continue
            utilmlab.exe_cmd(
                logger, '{} {} --dataset {} --it {} -o {}'.format(
                    python_exe, script, dataset, nepoch, fn_feature_score))

        utilmlab.exe_cmd(
            logger, '{} {} -i {} -o {}'.format(python_exe, script_ana,
                                               fn_feature_score, fn_json))

        utilmlab.exe_cmd(
コード例 #8
0
    parser.add_argument("--testy", default="testy.csv")
    parser.add_argument("--testt", default="testt.csv")
    return parser.parse_args()


# Script entry point: load the dataset selected on the command line and write
# its train/test arrays to the CSV files named by the arguments.
# NOTE(review): this block may continue past the lines visible here.
if __name__ == '__main__':

    args = init_arg()
    dataset = args.dataset
    fn_trainx, fn_trainy, fn_traint = args.trainx, args.trainy, args.traint
    fn_testx, fn_testy, fn_testt = args.testx, args.testy, args.testt

    # Stays None for 'twins': the Data_Twins result is unpacked without a
    # Test_T element.
    Test_T = None
    if dataset == 'twins':
        # Passed to Data_Twins; presumably the training fraction - confirm.
        train_rate = 0.8
        fn_twins_csv = utilmlab.get_data_dir() + "/twins/Twin_Data.csv.gz"
        [Train_X, Train_T, Train_Y, Opt_Train_Y, Test_X, Test_Y] \
            = Data_Twins(fn_twins_csv, train_rate)
    elif dataset == 'jobs':
        fn_jobs_csv = utilmlab.get_data_dir(
        ) + "/jobs/Jobs_Lalonde_Data.csv.gz"
        [Train_X, Train_T, Train_Y, Test_X, Test_T, Test_Y, Train_X_Test,
         Train_T_Test, Train_Y_Test, Train_No, Test_No, Train_Test_No] \
         = Data_Jobs(fn_jobs_csv)
    else:
        # Any other dataset name is rejected with an AssertionError.
        assert 0

    # Each array is wrapped in a DataFrame and written without the index
    # column.
    pd.DataFrame(Train_X).to_csv(fn_trainx, index=False)
    pd.DataFrame(Train_Y).to_csv(fn_trainy, index=False)
    pd.DataFrame(Train_T).to_csv(fn_traint, index=False)
    pd.DataFrame(Test_X).to_csv(fn_testx, index=False)
コード例 #9
0
import utilmlab
import sys
from sklearn.datasets import load_breast_cancer, fetch_covtype
import logging
import argparse
import os

# Dataset name -> path of the file backing it. The first three names map to
# None (no file path); the remaining ones map to a path below
# utilmlab.get_data_dir(). Insertion order matches the original literal.
ds2fn_d = dict.fromkeys(('bc', 'cover', 'breastcancer'))
ds2fn_d.update(
    (name, '{}/{}'.format(utilmlab.get_data_dir(), rel_path))
    for name, rel_path in (
        ('spam', 'spam.csv.gz'),
        ('spambase', 'spambase.csv.gz'),
        ('news', 'OnlineNewsPopularity.csv.gz'),
        ('newsbin', 'OnlineNewsPopularity.csv.gz'),
        ('letter', 'letter.csv.gz'),
        ('letter-recognition', 'letter.csv.gz'),
        ('creditcardfraud',
         'kaggle_creditcardfraud/creditcard_modified.csv'),
    )
)