def import_dataset_METABRIC(norm_mode='standard'):
    """Load the METABRIC dataset and build the structures used for training.

    Reads the cleaned feature matrix and the label file from the data
    directory, normalizes the features, and derives the time horizon,
    event count and the two fc-mask matrices.

    Args:
        norm_mode: normalization scheme forwarded to f_get_Normalization
            (default 'standard').

    Returns:
        (DIM, DATA, MASK) where DIM is the feature dimension, DATA is the
        tuple (data, time, label) and MASK is (mask1, mask2).
    """
    data_dir = utilmlab.get_data_dir()
    features_df = pd.read_csv(
        '{}/metabric/cleaned_features_final.csv'.format(data_dir), sep=',')
    labels_df = pd.read_csv(
        '{}/metabric/label.csv'.format(data_dir), sep=',')

    data = f_get_Normalization(np.asarray(features_df), norm_mode)

    time = np.asarray(labels_df[['event_time']])
    # time = np.round(time/12.)  # unit time = month
    label = np.asarray(labels_df[['label']])

    # Pad the observed horizon by 20% so the model has enough time bins.
    num_Category = int(np.max(time) * 1.2)
    # Censoring is one of the unique label values but is not an event.
    num_Event = int(len(np.unique(label)) - 1)
    x_dim = np.shape(data)[1]

    mask1 = f_get_fc_mask2(time, label, num_Event, num_Category)
    mask2 = f_get_fc_mask3(time, -1, num_Category)

    DIM = (x_dim)
    DATA = (data, time, label)
    MASK = (mask1, mask2)
    return DIM, DATA, MASK
def import_dataset_SYNTHETIC(norm_mode='standard'):
    """Load the synthetic competing-risks dataset.

    Reads the CSV from the data directory; the label and time columns are
    taken by name, the covariates are every column from index 4 onward.
    Features are normalized, then the time horizon, event count and the
    two fc-mask matrices are derived.

    Args:
        norm_mode: normalization scheme forwarded to f_get_Normalization
            (default 'standard').

    Returns:
        (DIM, DATA, MASK) where DIM is the feature dimension, DATA is the
        tuple (data, time, label) and MASK is (mask1, mask2).
    """
    csv_path = '{}/synthetic/synthetic_comprisk.csv'.format(
        utilmlab.get_data_dir())
    frame = pd.read_csv(csv_path, sep=',')

    label = np.asarray(frame[['label']])
    time = np.asarray(frame[['time']])
    # Covariates start at the 5th column of the CSV.
    data = f_get_Normalization(np.asarray(frame.iloc[:, 4:]), norm_mode)

    # Pad the observed horizon by 20% so the model has enough time bins.
    num_Category = int(np.max(time) * 1.2)
    # Censoring is one of the unique label values but is not an event.
    num_Event = int(len(np.unique(label)) - 1)
    x_dim = np.shape(data)[1]

    mask1 = f_get_fc_mask2(time, label, num_Event, num_Category)
    mask2 = f_get_fc_mask3(time, -1, num_Category)

    DIM = (x_dim)
    DATA = (data, time, label)
    MASK = (mask1, mask2)
    return DIM, DATA, MASK
def get_dataset(dataset, nsample=0):
    """Load a named dataset and return it with its feature/target metadata.

    Args:
        dataset: name of the dataset; must be one of get_available_datasets()
            or the special value 'show' (which prints the list and exits).
        nsample: if non-zero, truncate the frame to the first nsample rows.

    Returns:
        (rval, info) where rval is 0 on success and info is a dict with keys
        'df', 'targets', 'features', 'features_not_scalable'; on an
        unsupported dataset rval is 1 and info is None.

    Side effects:
        Calls sys.exit() for an unknown dataset name or for 'show'.
    """
    fn = None
    labels = []
    features_not_scalable = []
    features_drop = []
    rval = 0
    df = None
    dataset_lst = get_available_datasets()
    # NOTE: reuse dataset_lst instead of calling get_available_datasets()
    # a second time (the original called it twice).
    if dataset not in dataset_lst + ['show']:
        print('error: {} not available: ({})'.format(dataset, dataset_lst))
        sys.exit(1)
    if dataset == 'spam':
        fn = '{}/spam.csv'.format(utilmlab.get_data_dir())
    elif dataset == 'spambase':
        fn = '{}/spambase.csv.gz'.format(utilmlab.get_data_dir())
        labels = ['label']
    elif dataset == 'breastcancer' or dataset == 'bc':
        data = load_breast_cancer()  # get Breast Cancer Dataset
        df = pd.DataFrame(data.data, columns=data.feature_names)
        target = 'target'
        df[target] = data.target
        labels = [target]
    elif dataset == 'mnist':
        (data, _), _ = mnist.load_data()
        # Flatten the 60000 28x28 images into rows of 784 pixel features.
        data = np.reshape(np.asarray(data), [60000, 28 * 28]).astype(float)
        df = pd.DataFrame(data, columns=range(28**2))
    elif dataset == 'cover':
        data = fetch_covtype()
        df = pd.DataFrame(data.data)
        target = 'target'
        df[target] = data.target
        labels = [target]
    elif dataset == 'news':
        fn = '{}/OnlineNewsPopularity.csv'.format(utilmlab.get_data_dir())
        labels = [' shares']  # column name carries a leading space in the CSV
        features_drop = ['url']
    elif dataset == 'newsbin':
        fn = '{}/OnlineNewsPopularity.csv'.format(utilmlab.get_data_dir())
        labels = [' shares']
        features_drop = ['url']
        response_var = labels[0]
        df = pd.read_csv(fn)
        # Binarize the share count: popular (> 5000 shares) vs not.
        df[response_var] = [
            0 if el <= 5000 else 1 for el in df[response_var]]
    elif dataset == 'letter':
        fn = '{}/letter.csv'.format(utilmlab.get_data_dir())
    elif dataset == 'letter-recognition':
        fn = '{}/letter-recognition.csv.gz'.format(utilmlab.get_data_dir())
        labels = ['lettr']
    elif dataset == 'creditcardfraud':
        fn = ds2fn_d[dataset]
        labels = ['Amount', 'Class']
    elif dataset == 'show':
        # Fixed typo: 'availabel' -> 'available'.
        print('available datasets: {}'.format(dataset_lst))
        sys.exit(0)
    else:
        print('warning: unsupported dataset:{}'.format(dataset))
        rval = 1
        return (rval, None)
    if df is None:
        # Every branch that did not build df directly must have set fn.
        assert fn is not None
        df = pd.read_csv(fn)
    features = [
        el for el in list(df.columns)
        if el not in labels and el not in features_drop
    ]
    if nsample:
        df = df[:nsample]
    return (rval, {
        'df': df,
        'targets': labels,
        'features': features,
        'features_not_scalable': features_not_scalable
    } if not rval else None)
import pandas as pd import utilmlab import sys from sklearn.datasets import load_breast_cancer, fetch_covtype from tensorflow.keras.datasets import mnist import logging import argparse import os import numpy as np ds2fn_d = { 'bc': None, 'cover': None, 'breastcancer': None, 'mnist': None, 'spam': '{}/spam.csv'.format(utilmlab.get_data_dir()), 'spambase': '{}/spambase.csv.gz'.format(utilmlab.get_data_dir()), 'news': '{}/OnlineNewsPopularity.csv'.format(utilmlab.get_data_dir()), 'newsbin': '{}/OnlineNewsPopularity.csv'.format(utilmlab.get_data_dir()), 'letter': '{}/letter.csv'.format(utilmlab.get_data_dir()), 'letter-recognition': '{}/letter.csv.gz'.format(utilmlab.get_data_dir()), 'creditcardfraud': '{}/creditcard.csv'.format(utilmlab.get_data_dir()) } def is_available(ds): if ds in ds2fn_d.keys(): return True if ds2fn_d[ds] is None else os.path.isfile(ds2fn_d[ds]) def get_available_datasets():
import pandas as pd import utilmlab import sys from sklearn.datasets import load_breast_cancer, fetch_covtype import logging import argparse import os ds2fn_d = { 'bc': None, 'cover': None, 'breastcancer': None, 'spam': '{}/spam.csv.gz'.format(utilmlab.get_data_dir()), 'spambase': '{}/spambase.csv.gz'.format(utilmlab.get_data_dir()), 'news': '{}/OnlineNewsPopularity.csv.gz'.format(utilmlab.get_data_dir()), 'newsbin': '{}/OnlineNewsPopularity.csv.gz'.format( utilmlab.get_data_dir()), 'letter': '{}/letter.csv.gz'.format(utilmlab.get_data_dir()), 'letter-recognition': '{}/letter.csv.gz'.format(utilmlab.get_data_dir()), 'creditcardfraud': '{}/kaggle_creditcardfraud/creditcard_modified.csv'.format( utilmlab.get_data_dir()) } def is_available(ds): if ds in ds2fn_d.keys(): return True if ds2fn_d[ds] is None else os.path.isfile(ds2fn_d[ds]) def get_available_datasets():
mean_confidence_interval(PEHE_train_np)[1], mean_confidence_interval(PEHE_test_np)[0], mean_confidence_interval(PEHE_test_np)[1])) return results_d if __name__ == '__main__': parser = argparse.ArgumentParser( description="Causal Multi-task Gaussian Processes") parser.add_argument("-n", "--num-exp", default=10, type=int) parser.add_argument("-m", "--mode", default="CMGP", type=str) parser.add_argument("-t", "--test-frac", default=0.1, type=float) parser.add_argument("-o") args = parser.parse_args() fn_data = '{}/ihdp/ihdp_covariates.csv'.format(utilmlab.get_data_dir()) if not os.path.isfile(fn_data): print('Error: this implementation requires the IHDP dataset' ', please refer to the README.md for more details.') sys.exit(0) results_d = main(args, fn_data) if args.o is not None: with open(args.o, 'w') as fp: json.dump(results_d, fp)
script_ana = Path('{}/alg/invase/invase_ana.py'.format(proj_dir)) script_plot = Path('{}/alg/invase/invase_plot.py'.format(proj_dir)) for dataset in ['csv', 'bc', 'spambase']: odir = '{}/dataset_{}'.format(resdir, dataset) utilmlab.ensure_dir(odir) fn_feature_score = '{}/feature_score.csv.gz'.format(odir) fn_json = '{}/feature_score.csv.json'.format(odir) fn_plot_sample = '{}/sample.png'.format(odir) fn_plot_global = '{}/global.png'.format(odir) if dataset == 'csv': fn_csv = '{}/spambase.csv.gz'.format(utilmlab.get_data_dir()) utilmlab.exe_cmd( logger, '{} {} -i {} --target label --it {} -o {}'.format( python_exe, script, fn_csv, nepoch, fn_feature_score)) else: if not data_loader_mlab.is_available(dataset): continue utilmlab.exe_cmd( logger, '{} {} --dataset {} --it {} -o {}'.format( python_exe, script, dataset, nepoch, fn_feature_score)) utilmlab.exe_cmd( logger, '{} {} -i {} -o {}'.format(python_exe, script_ana, fn_feature_score, fn_json)) utilmlab.exe_cmd(
parser.add_argument("--testy", default="testy.csv") parser.add_argument("--testt", default="testt.csv") return parser.parse_args() if __name__ == '__main__': args = init_arg() dataset = args.dataset fn_trainx, fn_trainy, fn_traint = args.trainx, args.trainy, args.traint fn_testx, fn_testy, fn_testt = args.testx, args.testy, args.testt Test_T = None if dataset == 'twins': train_rate = 0.8 fn_twins_csv = utilmlab.get_data_dir() + "/twins/Twin_Data.csv.gz" [Train_X, Train_T, Train_Y, Opt_Train_Y, Test_X, Test_Y] \ = Data_Twins(fn_twins_csv, train_rate) elif dataset == 'jobs': fn_jobs_csv = utilmlab.get_data_dir( ) + "/jobs/Jobs_Lalonde_Data.csv.gz" [Train_X, Train_T, Train_Y, Test_X, Test_T, Test_Y, Train_X_Test, Train_T_Test, Train_Y_Test, Train_No, Test_No, Train_Test_No] \ = Data_Jobs(fn_jobs_csv) else: assert 0 pd.DataFrame(Train_X).to_csv(fn_trainx, index=False) pd.DataFrame(Train_Y).to_csv(fn_trainy, index=False) pd.DataFrame(Train_T).to_csv(fn_traint, index=False) pd.DataFrame(Test_X).to_csv(fn_testx, index=False)
import utilmlab import sys from sklearn.datasets import load_breast_cancer, fetch_covtype import logging import argparse import os ds2fn_d = { 'bc': None, 'cover': None, 'breastcancer': None, 'spam': '{}/spam.csv.gz'.format(utilmlab.get_data_dir()), 'spambase': '{}/spambase.csv.gz'.format(utilmlab.get_data_dir()), 'news': '{}/OnlineNewsPopularity.csv.gz'.format(utilmlab.get_data_dir()), 'newsbin': '{}/OnlineNewsPopularity.csv.gz'.format(utilmlab.get_data_dir()), 'letter': '{}/letter.csv.gz'.format(utilmlab.get_data_dir()), 'letter-recognition': '{}/letter.csv.gz'.format(utilmlab.get_data_dir()), 'creditcardfraud': '{}/kaggle_creditcardfraud/creditcard_modified.csv'.format( utilmlab.get_data_dir()) }