def data_preprocess(params):
    ### Record Concatenation
    dataio = DataIO(params['input_path'], params['result_path'])
    dataio.read_data()
    ctn = Concatenation(dataio)
    patient_info, n_feature = ctn.get_concatenation()
    # patient id: Patient
    # static feature and dynamic feature
    # dynamic feature {time: feature_value}

    ### Data Imputation
    imp_method = 'simple'
    imp = Imputation(patient_info, n_feature)
    patient_array = imp.get_imputation(imp_method)

    return (dataio, patient_info, patient_array)
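A minimal usage sketch (assumed, not from the source): the `params` dict only needs the two path keys this version of `data_preprocess()` reads, and the path values shown are placeholders.

# Hypothetical invocation; paths are placeholders.
params = {
    'input_path': 'data/raw/',         # placeholder input directory
    'result_path': 'data/processed/',  # placeholder output directory
}
dataio, patient_info, patient_array = data_preprocess(params)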
def preprocessorMain(self):
    self.removeTargetColumn()
    while True:
        print("\nTasks (Preprocessing)\n")
        for task in self.tasks:
            print(task)

        while True:
            try:
                choice = int(input("\nWhat do you want to do? [enter -1 to exit]: "))
            except ValueError:
                print("Integer value required. Try again.....")
                continue
            break

        if choice == -1:
            exit()
        elif choice == 1:
            DataDescription(self.data).describe()
        elif choice == 2:
            self.data = Imputation(self.data).imputer()
        elif choice == 3:
            self.data = Categorical(self.data).categoricalMain()
        elif choice == 4:
            self.data = FeatureScaling(self.data).scaling()
        elif choice == 5:
            Download(self.data).download()
        else:
            print("\nWrong choice!! Try again...")
def data_preprocess(params):
    ### Record Concatenation
    dataio = DataIO(params['input_path'], params['map_path'], params['domain'])
    dataio.read_data()
    dataio.read_label()
    ctn = Concatenation(dataio, params['domain'])
    patient_info, n_feature, feature_list, feature_range = ctn.get_concatenation()
    # patient id: Patient
    # static feature and dynamic feature
    # dynamic feature {time: feature_value}

    ### Data Imputation
    imp_method = 'simple'
    imp = Imputation(patient_info, n_feature)
    patient_array, patient_time = imp.get_imputation(imp_method)

    ### Clinical Data with DTI Generation
    cli = CliGen(feature_list, feature_range, ctn.dti_time)
    subject_array = cli.get_data(patient_array, patient_time, params['time'])
    if params['binary']:
        # only works for discrete clinical features
        subject_array = cli.get_binarization()
    subject_label = cli.get_label(patient_info, params['labels'], params['time'])

    return subject_array, subject_label
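A similar hedged sketch for the extended variant above: the keys mirror exactly what this function reads ('input_path', 'map_path', 'domain', 'time', 'binary', 'labels'); every value is a placeholder.

# Hypothetical invocation; all values are placeholders.
params = {
    'input_path': 'data/raw/',   # placeholder path
    'map_path': 'data/maps/',    # placeholder path
    'domain': 'lab',             # placeholder domain name
    'time': 6,                   # placeholder time window passed to CliGen
    'binary': True,              # binarize discrete clinical features
    'labels': ['DX'],            # placeholder label column list
}
subject_array, subject_label = data_preprocess(params)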
def training(self):
    # Prepare the data
    self.imp = Imputation(self.data)

    # Select the features
    self.features = FeatureSelection(self.imp.imputed_data)
    data_selected = self.features.data_selected
    self.selected_features = self.features.selected_features

    # Find the missing-value patterns
    self.missing_patterns = MissingPatterns(self.data, self.selected_features).missing_patterns

    # Train one classifier per missing pattern
    for mpi in self.missing_patterns:
        # Features that remain observed under this missing pattern
        cpi = set(self.selected_features) - set(mpi)
        data_temp = Instances.copy_instances(data_selected, from_row=0,
                                             num_rows=data_selected.num_instances)
        data_temp.class_is_last()

        # Restrict the training data to those features
        data_temp = self.reduceData(data_temp, cpi, self.data)

        # Train the classifier on the imputed data
        classifier = Classifier(classname=self.learn_class, options=self.options)
        classifier.build_classifier(data_temp)

        # Estimate the weight of each classifier (its classification accuracy)
        evl = Evaluation(data_temp)
        evl.crossvalidate_model(classifier, data_temp, 15, Random(1))

        # Add the trained classifier to the ensemble
        my_classifier = MyClassifier(classifier, cpi, 1 - evl.mean_absolute_error)
        self.classifiers.add(my_classifier)
parser.add_argument('--chromosomes',
                    help='comma separated values of chromosomes (if not set, '
                         'imputation for all chromosomes will be performed)')
parser.add_argument('--additional_shapeit_parameters',
                    help='Extra command line arguments to pass to the SHAPEIT tool',
                    default=' ')
parser.add_argument('--additional_impute2_parameters',
                    help='Extra command line arguments to pass to the impute2 tool',
                    default=' ')
parser.add_argument('--position_batch_size',
                    help='Chromosomal size of each imputation batch',
                    default=5000000, type=int)
parser.add_argument('--sample_batch_size',
                    help='Minimum number of samples in imputation batches',
                    default=500, type=int)
parser.add_argument('--reference', help='Name of the imputation reference panel')
parser.add_argument('--action',
                    help='Action to do: liftover, phase, impute',
                    choices=['liftover', 'phase', 'impute', 'phase_impute',
                             'liftover_phase_impute'])
parser.add_argument('--add_reference', help='Add a new reference panel',
                    action='store_true')
parser.add_argument('--backend', help='Execution environment. Default: local',
                    choices=['pbs', 'grid', 'local'], default='local')
parser.add_argument('--chain_file', help='Genomic assembly for the liftover step',
                    default='hg18ToHg19')
parser.add_argument('--nosubmit',
                    help="Create scripts but don't submit them for execution",
                    action='store_true')
parser.add_argument('--java_executable',
                    help='java executable. Default: java. This is useful when '
                         'java is not in the PATH',
                    default='java')

args = parser.parse_args()

imp = Imputation(installation_dir=args.installation_dir,
                 reference_dir=args.reference_dir)

# Check for absolute paths:
check_for_absolute_path('--study', args.study)
check_for_absolute_path('--output', args.output)
check_for_absolute_path('--installation_dir', args.installation_dir)
check_for_absolute_path('--reference_dir', args.reference_dir)

if args.results:
    if args.output:
        if args.results != args.output:
            raise Exception('--results and --output are the same parameter, '
                            'but they have different values')
    else:
        args.output = args.results

if args.dl_tools:
def main(args):
    '''Main function for active sensing.

    Args:
        - data loading parameters:
            - data_names: mimic, ward, cf

        - preprocess parameters:
            - normalization: minmax, standard, None
            - one_hot_encoding: input features that need to be one-hot encoded
            - problem: 'one-shot' or 'online'
                - 'one-shot': one-time prediction at the end of the time-series
                - 'online': prediction at every time stamp of the time-series
            - max_seq_len: maximum sequence length after padding
            - label_name: the column name for the label(s)
            - treatment: the column name for treatments

        - imputation parameters:
            - static_imputation_model: mean, median, mice, missforest, knn, gain
            - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain

        - feature selection parameters:
            - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
            - feature_number: selected feature number

        - active_sensing_model_parameters:
            - active_sensing_model_name: asac, deepsensing
            - model_name: rnn, lstm, gru
            - model_parameters: network parameters such as number of layers
                - h_dim: hidden dimensions
                - n_layer: layer number
                - n_head: head number (only for transformer model)
                - batch_size: number of samples in mini-batch
                - epochs: number of epochs
                - learning_rate: learning rate
            - static_mode: how to utilize static features (concatenate or None)
            - time_mode: how to utilize time information (concatenate or None)
            - task: classification or regression
    '''
    #%% Step 0: Set basic parameters
    metric_parameters = {'problem': args.problem, 'label_name': [args.label_name]}

    #%% Step 1: Upload Dataset
    # File names
    data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_'

    data_loader_training = CSVLoader(
        static_file=data_directory + 'static_train_data.csv.gz',
        temporal_file=data_directory + 'temporal_train_data_eav.csv.gz')
    data_loader_testing = CSVLoader(
        static_file=data_directory + 'static_test_data.csv.gz',
        temporal_file=data_directory + 'temporal_test_data_eav.csv.gz')

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()
    print('Finish data loading.')

    #%% Step 2: Preprocess Dataset
    # (0) filter out negative values (automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder, normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)
    print('Finish preprocessing.')

    #%% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem, label=[args.label_name],
                                 max_seq_len=args.max_seq_len, treatment=args.treatment)

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)
    print('Finish defining problem.')

    #%% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type='static')
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model, data_type='temporal')

    imputation_pipeline = PipelineComposer(static_imputation, temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)
    print('Finish imputation.')

    #%% Step 5: Feature selection (4 options)
    static_feature_selection = FeatureSelection(
        feature_selection_model_name=args.static_feature_selection_model,
        feature_type='static',
        feature_number=args.static_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters)
    temporal_feature_selection = FeatureSelection(
        feature_selection_model_name=args.temporal_feature_selection_model,
        feature_type='temporal',
        feature_number=args.temporal_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters)

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)
    print('Finish feature selection.')

    #%% Step 6: Fit and Predict (6 options)
    # Set predictor model parameters
    model_parameters = {
        'h_dim': args.h_dim,
        'n_layer': args.n_layer,
        'batch_size': args.batch_size,
        'epoch': args.epochs,
        'model_type': args.model_name,
        'learning_rate': args.learning_rate,
        'static_mode': args.static_mode,
        'time_mode': args.time_mode,
        'verbose': True
    }

    # Set the validation data for best model saving
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.0)

    active_sensing_class = active_sensing(args.active_sensing_model_name,
                                          model_parameters, args.task)
    active_sensing_class.fit(dataset_training)
    test_s_hat = active_sensing_class.predict(dataset_testing)
    print('Finish original predictor model training and testing.')

    #%% Step 7: Visualize Results
    idx = np.random.permutation(len(test_s_hat))[:2]

    # Visualize the output
    print('Future Measurements Recommendation')
    print_interpretation(test_s_hat[idx], dataset_testing.feature_name,
                         metric_parameters, model_parameters)

    return
def main(args):
    '''Main function for AutoML in time-series predictions.

    Args:
        - data loading parameters:
            - data_names: mimic, ward, cf

        - preprocess parameters:
            - normalization: minmax, standard, None
            - one_hot_encoding: input features that need to be one-hot encoded
            - problem: 'one-shot' or 'online'
                - 'one-shot': one-time prediction at the end of the time-series
                - 'online': prediction at every time stamp of the time-series
            - max_seq_len: maximum sequence length after padding
            - label_name: the column name for the label(s)
            - treatment: the column name for treatments

        - imputation parameters:
            - static_imputation_model: mean, median, mice, missforest, knn, gain
            - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain

        - feature selection parameters:
            - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
            - feature_number: selected feature number

        - predictor_parameters:
            - epochs: number of epochs
            - bo_itr: Bayesian optimization iterations
            - static_mode: how to utilize static features (concatenate or None)
            - time_mode: how to utilize time information (concatenate or None)
            - task: classification or regression
            - metric_name: auc, apr, mae, mse
    '''
    #%% Step 0: Set basic parameters
    metric_sets = [args.metric_name]
    metric_parameters = {'problem': args.problem, 'label_name': [args.label_name]}

    #%% Step 1: Upload Dataset
    # File names
    data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_'

    data_loader_training = CSVLoader(
        static_file=data_directory + 'static_train_data.csv.gz',
        temporal_file=data_directory + 'temporal_train_data_eav.csv.gz')
    data_loader_testing = CSVLoader(
        static_file=data_directory + 'static_test_data.csv.gz',
        temporal_file=data_directory + 'temporal_test_data_eav.csv.gz')

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()
    print('Finish data loading.')

    #%% Step 2: Preprocess Dataset
    # (0) filter out negative values (automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder, normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)
    print('Finish preprocessing.')

    #%% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem, label=[args.label_name],
                                 max_seq_len=args.max_seq_len, treatment=[args.treatment])

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)
    print('Finish defining problem.')

    #%% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type='static')
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model, data_type='temporal')

    imputation_pipeline = PipelineComposer(static_imputation, temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)
    print('Finish imputation.')

    #%% Step 5: Feature selection (4 options)
    static_feature_selection = FeatureSelection(
        feature_selection_model_name=args.static_feature_selection_model,
        feature_type='static',
        feature_number=args.static_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters)
    temporal_feature_selection = FeatureSelection(
        feature_selection_model_name=args.temporal_feature_selection_model,
        feature_type='temporal',
        feature_number=args.temporal_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters)

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)
    print('Finish feature selection.')

    #%% Step 6: Bayesian Optimization
    ## Model define
    model_parameters = {
        'projection_horizon': 5,
        'static_mode': 'concatenate',
        'time_mode': 'concatenate'
    }
    crn_model = CRN_Model(task=args.task)
    crn_model.set_params(**model_parameters)
    model_class = crn_model

    # train_validate split
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.2)

    # Bayesian Optimization Start
    metric = BOMetric(metric='auc', fold=0, split='test')

    # Run BO for selected model class
    BO_model = AutoTS(dataset_training, model_class, metric)
    models, bo_score = BO_model.training_loop(num_iter=2)
    auto_ens_model = AutoEnsemble(models, bo_score)

    # Prediction
    assert not dataset_testing.is_validation_defined
    test_y_hat = auto_ens_model.predict(dataset_testing, test_split='test')
    test_y = dataset_testing.label
    print('Finish AutoML model training and testing.')

    #%% Step 7: Visualize Results
    idx = np.random.permutation(len(test_y_hat))[:2]

    # Evaluate predictor model
    result = Metrics(metric_sets, metric_parameters).evaluate(test_y, test_y_hat)
    print('Finish predictor model evaluation.')

    # Visualize the output
    # (1) Performance
    print('Overall performance')
    print_performance(result, metric_sets, metric_parameters)
    # (2) Predictions
    print('Each prediction')
    print_prediction(test_y_hat[idx], metric_parameters)

    return
def read_split(self, Lpath, target_name):
    """Creates the train dataset.

    Parameters
    ----------
    Lpath : list, default = None
        List of str paths to load the data.

    target_name : str, default = None
        The name of the target. Works for both classification
        (multiclass or not) and regression.

    Returns
    -------
    dict
        Dictionary containing:
        - 'train' : pandas DataFrame for the train dataset
        - 'target' : encoded pandas Series for the target on the train set
          (with dtype='float' for a regression or dtype='int' for a classification)
    """
    col = []
    col_train = []
    df_train = dict()
    y_train = dict()

    if type(Lpath) != list:
        raise ValueError("You must specify a list of paths "
                         "to load all the data")
    elif self.to_path is None:
        raise ValueError("You must specify a path to save your data "
                         "and make sure your files are not already saved")
    else:
        ##############################################################
        # Reading the files
        ##############################################################
        for path in Lpath:
            # Reading each file
            df = self.pre_clean(path, drop_duplicate=False)

            # Checking if the target exists
            if target_name in df.columns:
                is_null = df[target_name].isnull()
                train = df
                df_train[path] = train[~is_null].drop(target_name, axis=1)
                y_train[path] = train[target_name][~is_null]

            del df

        # Exceptions
        if sum([df_train[path].shape[0] for path in df_train.keys()]) == 0:
            raise ValueError("You have no train dataset. "
                             "Please check that the target name is correct.")

        # Finding the common subset of features
        for i, df in enumerate(df_train.values()):
            if i == 0:
                col_train = df.columns
            else:
                col_train = list(set(col_train) & set(df.columns))

        col = sorted(list(set(col_train)))

        if self.verbose:
            print("")
            print("> Number of common features : " + str(len(col)))

        ##############################################################
        # Creating train and target dataframes
        ##############################################################
        print("")
        print("gathering and crunching for train datasets ...")

        # TODO: Optimize
        df_train = pd.concat([df[col] for df in df_train.values()])
        y_train = pd.concat([y for y in y_train.values()])  # TODO: optimize

        # Checking shape of the target
        if type(y_train) == pd.core.frame.DataFrame:
            raise ValueError("Your train target contains more than two columns !"
                             " Please check that only one column "
                             "is named " + target_name)
        else:
            pass

        # Handling indices
        if self.verbose:
            print("reindexing for train datasets ...")

        if df_train.index.nunique() < df_train.shape[0]:
            df_train.index = range(df_train.shape[0])
        if y_train.index.nunique() < y_train.shape[0]:
            y_train.index = range(y_train.shape[0])

        # Dropping duplicates
        if self.verbose:
            print("dropping training duplicates ...")

        # Temporarily adding the target to check (x, y) duplicates...
        df_train[target_name] = y_train.values
        df_train = df_train.drop_duplicates()
        del df_train[target_name]
        y_train = y_train.loc[df_train.index]  # TODO: Need to reindex ?

        # Deleting constant variables
        if self.verbose:
            print("dropping constant variables on training set ...")
        for var in col:
            if df_train[var].nunique(dropna=False) == 1:
                del df_train[var]

        # Missing values
        sparse_features = (df_train.isnull().sum() /
                           df_train.shape[0]).sort_values(ascending=False)
        sparse = True
        if sparse_features.max() == 0.0:
            sparse = False

        # Print information
        if self.verbose:
            if sparse:
                print("")
                print("> % missing values on train set:")
                print(np.round(sparse_features[sparse_features > 0.0][:5], 1))
            else:
                print("")
                print("> You have no missing values on train set...")

        high_missing_features = sparse_features[
            sparse_features > self.missing_threshold].index
        if len(high_missing_features) > 0:
            if self.verbose:
                print("")
                print(f'dropping training set columns with high missing rate: '
                      f'{self.missing_threshold}...')
                print(f'drop {len(high_missing_features)} columns')
                print(high_missing_features)
            df_train.drop(high_missing_features, axis=1, inplace=True)
        else:
            print("")
            print(f'dropping columns with high missing rate '
                  f'>{self.missing_threshold}...')
            print('> No need to drop!')

        if self.verbose:
            print("")
            print("> Number of categorical features:"
                  " " + str(len(df_train.dtypes[df_train.dtypes == 'object'].index)))  # noqa
            print("> Number of numerical features:"
                  " " + str(len(df_train.dtypes[df_train.dtypes != 'object'].index)))  # noqa
            print("> Number of training samples : " + str(df_train.shape[0]))

        ##############################################################
        # Encoding target
        ##############################################################
        task = "classification"

        if y_train.nunique() <= 2:
            task = "classification"
        else:
            if y_train.dtype == object:
                task = "classification"
            else:
                # no need to convert into float
                pass

        if self.verbose:
            print("")
            print("> Task : " + task)

        if task == "classification":
            if self.verbose:
                print('Train Target')
                print(y_train.value_counts())
                print("")
                print("encoding target ...")
            enc = LabelEncoder()
            y_train = pd.Series(enc.fit_transform(y_train.values),
                                index=y_train.index,
                                name=target_name,
                                dtype='int')
            print("training set encoding finished")
        else:
            if self.verbose:
                print(y_train.describe())

        ##############################################################
        # Dumping
        ##############################################################
        # Creating a folder to save the files and target encoder
        try:
            os.mkdir(self.to_path)
        except OSError:
            pass

        if self.to_hdf5:
            start_time = time.time()

            if self.verbose:
                print("")
                print("dumping files into directory : " + self.to_path)

            # Temporarily adding the target to dump the train file...
            df_train[target_name] = y_train.values
            df_train.to_hdf(self.to_path + '/df_train.h5', 'train')
            del df_train[target_name]

            if self.verbose:
                print("train dumped")
        else:
            pass

        if task == "classification":
            fhand = open(self.to_path + '/target_encoder.obj', 'wb')
            pickle.dump(enc, fhand)
            fhand.close()
        else:
            pass

        if self.verbose:
            print("")
            print("Impute the missing values...")

        imp = Imputation()
        df_train = imp.fit_transform(df_train)
        print("")

        return {
            "train": df_train,
            "target": y_train,
        }
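A hedged usage sketch for read_split(): the constructor shown is hypothetical (only the attributes used above — to_path, to_hdf5, verbose, missing_threshold — are known from the code), and the CSV file names are placeholders.

# Hypothetical reader object and placeholder file names.
reader = Reader(to_path='save', to_hdf5=False, verbose=True)  # hypothetical constructor
data = reader.read_split(Lpath=['train_part1.csv', 'train_part2.csv'],
                         target_name='target')
df_train, y_train = data['train'], data['target']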
parser.add_argument('--study', help='Absolute path of the directory of the study panel')
parser.add_argument('--output', help='Absolute path of the output (results) directory')
parser.add_argument('--chromosomes',
                    help='comma separated values of chromosomes (if not set, '
                         'imputation for all chromosomes will be performed)')
parser.add_argument('--additional_shapeit_parameters',
                    help='Extra command line arguments to pass to the SHAPEIT tool',
                    default=' ')
parser.add_argument('--additional_impute2_parameters',
                    help='Extra command line arguments to pass to the impute2 tool',
                    default=' ')
parser.add_argument('--position_batch_size',
                    help='Chromosomal size of each imputation batch',
                    default=5000000, type=int)
parser.add_argument('--sample_batch_size',
                    help='Minimum number of samples in imputation batches',
                    default=500, type=int)
parser.add_argument('--reference', help='Name of the imputation reference panel')
parser.add_argument('--action', help='Action to do: liftover, phase, impute',
                    choices=['liftover', 'phase', 'impute'])
parser.add_argument('--add_reference', help='Add a new reference panel',
                    action='store_true')
parser.add_argument('--backend', help='Execution environment. Default: local',
                    choices=['pbs', 'grid', 'local'], default='local')
parser.add_argument('--nosubmit',
                    help="Create scripts but don't submit them for execution",
                    action='store_true')

args = parser.parse_args()

imp = Imputation(tools_dir=args.tools_dir, reference_dir=args.reference_dir)

if args.dl_tools:
    imp.install_imputation_tools()
elif args.list:
    imp.list_reference_panels()
elif args.dl_reference:
    imp.install_reference_panel(args.dl_reference)
elif args.add_reference:
    imp.add_custom_reference_panels()
elif args.action:
    if not args.study:
def main(args): """Main function for AutoML in time-series predictions. Args: - data loading parameters: - data_names: mimic, ward, cf - preprocess parameters: - normalization: minmax, standard, None - one_hot_encoding: input features that need to be one-hot encoded - problem: 'one-shot' or 'online' - 'one-shot': one time prediction at the end of the time-series - 'online': preditcion at every time stamps of the time-series - max_seq_len: maximum sequence length after padding - label_name: the column name for the label(s) - treatment: the column name for treatments - imputation parameters: - static_imputation_model: mean, median, mice, missforest, knn, gain - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain - feature selection parameters: - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None - feature_number: selected featuer number - predictor_parameters: - epochs: number of epochs - bo_itr: bayesian optimization iterations - static_mode: how to utilize static features (concatenate or None) - time_mode: how to utilize time information (concatenate or None) - task: classification or regression - metric_name: auc, apr, mae, mse """ #%% Step 0: Set basic parameters metric_sets = [args.metric_name] metric_parameters = { "problem": args.problem, "label_name": [args.label_name] } #%% Step 1: Upload Dataset # File names data_directory = "../datasets/data/" + args.data_name + "/" + args.data_name + "_" data_loader_training = CSVLoader( static_file=data_directory + "static_train_data.csv.gz", temporal_file=data_directory + "temporal_train_data_eav.csv.gz", ) data_loader_testing = CSVLoader( static_file=data_directory + "static_test_data.csv.gz", temporal_file=data_directory + "temporal_test_data_eav.csv.gz", ) dataset_training = data_loader_training.load() dataset_testing = data_loader_testing.load() print("Finish data loading.") #%% Step 2: Preprocess Dataset # (0) filter out negative values (Automatically) negative_filter = FilterNegative() # (1) one-hot encode categorical features onehot_encoder = OneHotEncoder( one_hot_encoding_features=[args.one_hot_encoding]) # (2) Normalize features: 3 options (minmax, standard, none) normalizer = Normalizer(args.normalization) filter_pipeline = PipelineComposer(negative_filter, onehot_encoder, normalizer) dataset_training = filter_pipeline.fit_transform(dataset_training) dataset_testing = filter_pipeline.transform(dataset_testing) print("Finish preprocessing.") #%% Step 3: Define Problem problem_maker = ProblemMaker(problem=args.problem, label=[args.label_name], max_seq_len=args.max_seq_len, treatment=args.treatment) dataset_training = problem_maker.fit_transform(dataset_training) dataset_testing = problem_maker.fit_transform(dataset_testing) print("Finish defining problem.") #%% Step 4: Impute Dataset static_imputation = Imputation( imputation_model_name=args.static_imputation_model, data_type="static") temporal_imputation = Imputation( imputation_model_name=args.temporal_imputation_model, data_type="temporal") imputation_pipeline = PipelineComposer(static_imputation, temporal_imputation) dataset_training = imputation_pipeline.fit_transform(dataset_training) dataset_testing = imputation_pipeline.transform(dataset_testing) print("Finish imputation.") #%% Step 5: Feature selection (4 options) static_feature_selection = FeatureSelection( feature_selection_model_name=args.static_feature_selection_model, feature_type="static", 
feature_number=args.static_feature_selection_number, task=args.task, metric_name=args.metric_name, metric_parameters=metric_parameters, ) temporal_feature_selection = FeatureSelection( feature_selection_model_name=args.temporal_feature_selection_model, feature_type="temporal", feature_number=args.temporal_feature_selection_number, task=args.task, metric_name=args.metric_name, metric_parameters=metric_parameters, ) feature_selection_pipeline = PipelineComposer(static_feature_selection, temporal_feature_selection) dataset_training = feature_selection_pipeline.fit_transform( dataset_training) dataset_testing = feature_selection_pipeline.transform(dataset_testing) print("Finish feature selection.") #%% Step 6: Bayesian Optimization ## Model define # RNN model rnn_parameters = { "model_type": "lstm", "epoch": args.epochs, "static_mode": args.static_mode, "time_mode": args.time_mode, "verbose": False, } general_rnn = GeneralRNN(task=args.task) general_rnn.set_params(**rnn_parameters) # CNN model cnn_parameters = { "epoch": args.epochs, "static_mode": args.static_mode, "time_mode": args.time_mode, "verbose": False, } temp_cnn = TemporalCNN(task=args.task) temp_cnn.set_params(**cnn_parameters) # Transformer transformer = TransformerPredictor(task=args.task, epoch=args.epochs, static_mode=args.static_mode, time_mode=args.time_mode) # Attention model attn_parameters = { "model_type": "lstm", "epoch": args.epochs, "static_mode": args.static_mode, "time_mode": args.time_mode, "verbose": False, } attn = Attention(task=args.task) attn.set_params(**attn_parameters) # model_class_list = [general_rnn, attn, temp_cnn, transformer] model_class_list = [general_rnn, attn] # train_validate split dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.1) # Bayesian Optimization Start metric = BOMetric(metric="auc", fold=0, split="test") ens_model_list = [] # Run BO for each model class for m in model_class_list: BO_model = automl.model.AutoTS(dataset_training, m, metric, model_path="tmp/") models, bo_score = BO_model.training_loop(num_iter=args.bo_itr) auto_ens_model = AutoEnsemble(models, bo_score) ens_model_list.append(auto_ens_model) # Load all ensemble models for ens in ens_model_list: for m in ens.models: m.load_model(BO_model.model_path + "/" + m.model_id + ".h5") # Stacking algorithm stacking_ens_model = StackingEnsemble(ens_model_list) stacking_ens_model.fit(dataset_training, fold=0, train_split="val") # Prediction assert not dataset_testing.is_validation_defined test_y_hat = stacking_ens_model.predict(dataset_testing, test_split="test") test_y = dataset_testing.label print("Finish AutoML model training and testing.") #%% Step 7: Visualize Results idx = np.random.permutation(len(test_y_hat))[:2] # Evaluate predictor model result = Metrics(metric_sets, metric_parameters).evaluate(test_y, test_y_hat) print("Finish predictor model evaluation.") # Visualize the output # (1) Performance print("Overall performance") print_performance(result, metric_sets, metric_parameters) # (2) Predictions print("Each prediction") print_prediction(test_y_hat[idx], metric_parameters) return
def main():
    print('-' * 10 + 'Welcome to ML Preprocessor CLI' + '-' * 10 + '\n\n')
    try:
        file_path = sys.argv[1]
        if not file_path.endswith('.csv'):
            raise IncorrectFileFormatError("file is not in CSV format")
        revised_df = readCSV(file_path)
        print('\nScreenshot of independent dataframe:\n')
        print(revised_df.head())
        print('\n' + '-' * 30 + '\n')

        while True:
            print('\nTasks(Preprocessing)')
            print('1.Data Description')
            print('2.Handling NULL values')
            print('3.Encoding Categorical Data')
            print('4.Feature Scaling of the Dataset')
            print('5.Download the modified Dataset\n')

            option = int(input('What do you want to do?(Press -1 to exit):'))
            if option == -1:
                raise ExitError
            elif option == 1:
                data_desc = DataDescription(revised_df)
                while True:
                    option = data_desc.getOption()
                    if option == -1:
                        break
                    elif option == 1:
                        data_desc.showProperty()
                    elif option == 2:
                        data_desc.showStats()
                    elif option == 3:
                        data_desc.showDF()
                    else:
                        print('Incorrect option! Try again.')
            elif option == 2:
                impute = Imputation(revised_df)
                while True:
                    option = impute.getOption()
                    if option == -1:
                        break
                    elif option == 1:
                        impute.countNULL()
                    elif option == 2:
                        revised_df = impute.dropColumn()
                    elif option == 3:
                        revised_df = impute.fillUtil()
                    elif option == 4:
                        impute.showDF()
                    else:
                        print('Incorrect option! Try again.')
            elif option == 3:
                encode = EncodeCategorical(revised_df)
                while True:
                    option = encode.getOption()
                    if option == -1:
                        break
                    elif option == 1:
                        encode.showCategorical()
                    elif option == 2:
                        revised_df = encode.performOneHotEncodingUtil()
                    elif option == 3:
                        encode.showDF()
                    else:
                        print('Incorrect option! Try again.')
            elif option == 4:
                while True:
                    scale = FeatureScaling(revised_df)
                    option = scale.getOption()
                    if option == -1:
                        break
                    elif option == 1:
                        scale.normalizeUtil()
                    elif option == 2:
                        scale.standardizeUtil()
                    elif option == 3:
                        scale.showDF()
                    else:
                        print('Incorrect option! Try again.')
            elif option == 5:
                download = Download(revised_df)
                download.downloadDataframe()
            else:
                print('Incorrect option! Try again.')
    except IndexError as e:
        print('File path missing.', e)
    except IncorrectFileFormatError as e:
        print('Incorrect file format.', e)
    except FileNotFoundError as e:
        print('File Not Found!', e)
    except ExitError as e:
        print(e)
    except Exception as e:
        print('Incorrect option chosen.', e)
def main(args):
    '''Main function for individual treatment effect estimation.

    Args:
        - data loading parameters:
            - data_names: mimic, ward, cf, mimic_antibiotics

        - preprocess parameters:
            - normalization: minmax, standard, None
            - one_hot_encoding: input features that need to be one-hot encoded
            - problem: 'online'
                - 'online': prediction at every time stamp of the time-series
            - max_seq_len: maximum sequence length after padding
            - label_name: the column name for the label(s)
            - treatment: the column name for treatments

        - imputation parameters:
            - static_imputation_model: mean, median, mice, missforest, knn, gain
            - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain

        - feature selection parameters:
            - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
            - feature_number: selected feature number

        - treatment effects model parameters:
            - model_name: CRN, RMSN, GANITE

            Each model has different types of hyperparameters that need to be set.

            - Parameters needed for the Counterfactual Recurrent Network (CRN):
                - hyperparameters for encoder:
                    - rnn_hidden_units: hidden dimensions in the LSTM unit
                    - rnn_keep_prob: keep probability used for variational dropout in the LSTM unit
                    - br_size: size of the balancing representation
                    - fc_hidden_units: hidden dimensions of the fully connected layers used for treatment classifier and predictor
                    - batch_size: number of samples in mini-batch
                    - num_epochs: number of epochs
                    - learning_rate: learning rate
                    - max_alpha: alpha controls the trade-off between building treatment invariant representations
                      (domain discrimination) and being able to predict outcomes (outcome prediction); during training,
                      CRN uses an exponentially increasing schedule for alpha from 0 to max_alpha.
                - hyperparameters for decoder:
                    - the decoder requires the same hyperparameters as the encoder, with the exception of the
                      rnn_hidden_units, which is set to be equal to the br_size of the encoder

            - Parameters for Recurrent Marginal Structural Networks (RMSN):
                - hyperparameters for encoder:
                    - dropout_rate: dropout probability used for variational dropout in the LSTM unit
                    - rnn_hidden_units: hidden dimensions in the LSTM unit
                    - batch_size: number of samples in mini-batch
                    - num_epochs: number of epochs
                    - learning_rate: learning rate
                    - max_norm: max gradient norm used for gradient clipping during training
                - hyperparameters for decoder:
                    - the decoder requires the same hyperparameters as the encoder.
                - model_dir: directory where the model is saved
                - model_name: name of the saved model

            - Parameters for GANITE:
                - batch_size: number of samples in mini-batch
                - alpha: parameter trading off between discriminator loss and supervised loss for the generator training
                - learning_rate: learning rate
                - hidden_units: hidden dimensions of the fully connected layers used in the networks
                - stack_dim: number of timesteps to stack

            All models have the following common parameters:
                - static_mode: how to utilize static features (concatenate or None)
                - time_mode: how to utilize time information (concatenate or None)
                - task: 'classification' or 'regression'

        - metric_name: auc, apr, mae, mse (used for factual prediction)
        - patient_id: patient for which counterfactual trajectories are computed
        - timestep: timestep in patient trajectory for estimating counterfactuals
    '''
    # %% Step 0: Set basic parameters
    metric_sets = [args.metric_name]
    metric_parameters = {'problem': args.problem, 'label_name': [args.label_name]}

    # %% Step 1: Upload Dataset
    # File names
    data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_'

    data_loader_training = CSVLoader(
        static_file=data_directory + 'static_train_data.csv.gz',
        temporal_file=data_directory + 'temporal_train_data_eav.csv.gz')
    data_loader_testing = CSVLoader(
        static_file=data_directory + 'static_test_data.csv.gz',
        temporal_file=data_directory + 'temporal_test_data_eav.csv.gz')

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()
    print('Finish data loading.')

    # %% Step 2: Preprocess Dataset
    # (0) filter out negative values (automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder, normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)
    print('Finish preprocessing.')

    # %% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem, label=[args.label_name],
                                 max_seq_len=args.max_seq_len, treatment=[args.treatment])

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)
    print('Finish defining problem.')

    # %% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type='static')
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model, data_type='temporal')

    imputation_pipeline = PipelineComposer(static_imputation, temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)
    print('Finish imputation.')

    # %% Step 5: Feature selection (4 options)
    static_feature_selection = FeatureSelection(
        feature_selection_model_name=args.static_feature_selection_model,
        feature_type='static',
        feature_number=args.static_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters)
    temporal_feature_selection = FeatureSelection(
        feature_selection_model_name=args.temporal_feature_selection_model,
        feature_type='temporal',
        feature_number=args.temporal_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters)

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)
    print('Finish feature selection.')

    # %% Step 6: Fit treatment effects (3 options)
    # Set the validation data for best model saving
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.0)

    # Set the treatment effects model
    model_name = args.model_name

    # Set treatment effects model parameters
    if model_name == 'CRN':
        model_parameters = {
            'encoder_rnn_hidden_units': args.crn_encoder_rnn_hidden_units,
            'encoder_br_size': args.crn_encoder_br_size,
            'encoder_fc_hidden_units': args.crn_encoder_fc_hidden_units,
            'encoder_learning_rate': args.crn_encoder_learning_rate,
            'encoder_batch_size': args.crn_encoder_batch_size,
            'encoder_keep_prob': args.crn_encoder_keep_prob,
            'encoder_num_epochs': args.crn_encoder_num_epochs,
            'encoder_max_alpha': args.crn_encoder_max_alpha,
            'decoder_br_size': args.crn_decoder_br_size,
            'decoder_fc_hidden_units': args.crn_decoder_fc_hidden_units,
            'decoder_learning_rate': args.crn_decoder_learning_rate,
            'decoder_batch_size': args.crn_decoder_batch_size,
            'decoder_keep_prob': args.crn_decoder_keep_prob,
            'decoder_num_epochs': args.crn_decoder_num_epochs,
            'decoder_max_alpha': args.crn_decoder_max_alpha,
            'projection_horizon': args.projection_horizon,
            'static_mode': args.static_mode,
            'time_mode': args.time_mode
        }

        treatment_model = treatment_effects_model(model_name, model_parameters,
                                                  task='classification')
        treatment_model.fit(dataset_training)

    elif model_name == 'RMSN':
        hyperparams_encoder_iptw = {
            'dropout_rate': args.rmsn_encoder_dropout_rate,
            'memory_multiplier': args.rmsn_encoder_memory_multiplier,
            'num_epochs': args.rmsn_encoder_num_epochs,
            'batch_size': args.rmsn_encoder_batch_size,
            'learning_rate': args.rmsn_encoder_learning_rate,
            'max_norm': args.rmsn_encoder_max_norm
        }
        hyperparams_decoder_iptw = {
            'dropout_rate': args.rmsn_decoder_dropout_rate,
            'memory_multiplier': args.rmsn_decoder_memory_multiplier,
            'num_epochs': args.rmsn_decoder_num_epochs,
            'batch_size': args.rmsn_decoder_batch_size,
            'learning_rate': args.rmsn_decoder_learning_rate,
            'max_norm': args.rmsn_decoder_max_norm
        }
        model_parameters = {
            'hyperparams_encoder_iptw': hyperparams_encoder_iptw,
            'hyperparams_decoder_iptw': hyperparams_decoder_iptw,
            'model_dir': args.rmsn_model_dir,
            'model_name': args.rmsn_model_name,
            'static_mode': args.static_mode,
            'time_mode': args.time_mode
        }

        treatment_model = treatment_effects_model(model_name, model_parameters,
                                                  task='classification')
        treatment_model.fit(dataset_training,
                            projection_horizon=args.projection_horizon)

    elif model_name == 'GANITE':
        hyperparams = {
            'batch_size': args.ganite_batch_size,
            'alpha': args.ganite_alpha,
            'hidden_dims': args.ganite_hidden_dims,
            'learning_rate': args.ganite_learning_rate
        }
        model_parameters = {
            'hyperparams': hyperparams,
            'stack_dim': args.ganite_stack_dim,
            'static_mode': args.static_mode,
            'time_mode': args.time_mode
        }

        treatment_model = treatment_effects_model(model_name, model_parameters,
                                                  task='classification')
        treatment_model.fit(dataset_training)

    test_y_hat = treatment_model.predict(dataset_testing)
    print('Finish treatment effects model training and testing.')

    # %% Step 9: Visualize Results
    # Evaluate predictor model
    result = Metrics(metric_sets, metric_parameters).evaluate(dataset_testing.label,
                                                              test_y_hat)
    print('Finish predictor model evaluation.')

    # Visualize the output
    # (1) Performance on estimating factual outcomes
    print('Overall performance on estimating factual outcomes')
    print_performance(result, metric_sets, metric_parameters)

    # (2) Counterfactual trajectories
    print('Counterfactual trajectories')
    if model_name in ['CRN', 'RMSN']:
        # Predict and visualize counterfactuals for the sequence of treatments indicated by the user
        # through treatment_options. The length of each sequence of treatments needs to be
        # projection_horizon + 1.
        treatment_options = np.array([[[1], [1], [1], [1], [1], [0]],
                                      [[0], [0], [0], [0], [1], [1]]])
        history, counterfactual_traj = treatment_model.predict_counterfactual_trajectories(
            dataset=dataset_testing,
            patient_id=args.patient_id,
            timestep=args.timestep,
            treatment_options=treatment_options)

        print_counterfactual_predictions(
            patient_history=history,
            treatment_options=treatment_options,
            counterfactual_predictions=counterfactual_traj)

    return
def main(args): """Main function for time-series prediction. Args: - data loading parameters: - data_names: mimic, ward, cf - preprocess parameters: - normalization: minmax, standard, None - one_hot_encoding: input features that need to be one-hot encoded - problem: 'one-shot' or 'online' - 'one-shot': one time prediction at the end of the time-series - 'online': prediction at every time stamps of the time-series - max_seq_len: maximum sequence length after padding - label_name: the column name for the label(s) - treatment: the column name for treatments - imputation parameters: - static_imputation_model: mean, median, mice, missforest, knn, gain - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain - feature selection parameters: - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None - feature_number: selected feature number - predictor_parameters: - model_name: rnn, gru, lstm, attention, tcn, transformer - model_parameters: network parameters such as number of layers - h_dim: hidden dimensions - n_layer: layer number - n_head: head number (only for transformer model) - batch_size: number of samples in mini-batch - epochs: number of epochs - learning_rate: learning rate - static_mode: how to utilize static features (concatenate or None) - time_mode: how to utilize time information (concatenate or None) - task: classification or regression - uncertainty_model_name: uncertainty estimation model name (ensemble) - interpretation_model_name: interpretation model name (tinvase) - metric_name: auc, apr, mae, mse """ #%% Step 0: Set basic parameters metric_sets = [args.metric_name] metric_parameters = { "problem": args.problem, "label_name": [args.label_name] } #%% Step 1: Upload Dataset # File names data_directory = "../datasets/data/" + args.data_name + "/" + args.data_name + "_" data_loader_training = CSVLoader( static_file=data_directory + "static_train_data.csv.gz", temporal_file=data_directory + "temporal_train_data_eav.csv.gz", ) data_loader_testing = CSVLoader( static_file=data_directory + "static_test_data.csv.gz", temporal_file=data_directory + "temporal_test_data_eav.csv.gz", ) dataset_training = data_loader_training.load() dataset_testing = data_loader_testing.load() print("Finish data loading.") #%% Step 2: Preprocess Dataset # (0) filter out negative values (Automatically) negative_filter = FilterNegative() # (1) one-hot encode categorical features onehot_encoder = OneHotEncoder( one_hot_encoding_features=[args.one_hot_encoding]) # (2) Normalize features: 3 options (minmax, standard, none) normalizer = Normalizer(args.normalization) filter_pipeline = PipelineComposer(negative_filter, onehot_encoder, normalizer) dataset_training = filter_pipeline.fit_transform(dataset_training) dataset_testing = filter_pipeline.transform(dataset_testing) print("Finish preprocessing.") #%% Step 3: Define Problem problem_maker = ProblemMaker(problem=args.problem, label=[args.label_name], max_seq_len=args.max_seq_len, treatment=args.treatment) dataset_training = problem_maker.fit_transform(dataset_training) dataset_testing = problem_maker.fit_transform(dataset_testing) print("Finish defining problem.") #%% Step 4: Impute Dataset static_imputation = Imputation( imputation_model_name=args.static_imputation_model, data_type="static") temporal_imputation = Imputation( imputation_model_name=args.temporal_imputation_model, data_type="temporal") imputation_pipeline = PipelineComposer(static_imputation, temporal_imputation) 
dataset_training = imputation_pipeline.fit_transform(dataset_training) dataset_testing = imputation_pipeline.transform(dataset_testing) print("Finish imputation.") #%% Step 5: Feature selection (4 options) static_feature_selection = FeatureSelection( feature_selection_model_name=args.static_feature_selection_model, feature_type="static", feature_number=args.static_feature_selection_number, task=args.task, metric_name=args.metric_name, metric_parameters=metric_parameters, ) temporal_feature_selection = FeatureSelection( feature_selection_model_name=args.temporal_feature_selection_model, feature_type="temporal", feature_number=args.temporal_feature_selection_number, task=args.task, metric_name=args.metric_name, metric_parameters=metric_parameters, ) feature_selection_pipeline = PipelineComposer(static_feature_selection, temporal_feature_selection) dataset_training = feature_selection_pipeline.fit_transform( dataset_training) dataset_testing = feature_selection_pipeline.transform(dataset_testing) print("Finish feature selection.") #%% Step 6: Fit and Predict (6 options) # Set predictor model parameters model_parameters = { "h_dim": args.h_dim, "n_layer": args.n_layer, "n_head": args.n_head, "batch_size": args.batch_size, "epoch": args.epochs, "model_type": args.model_name, "learning_rate": args.learning_rate, "static_mode": args.static_mode, "time_mode": args.time_mode, "verbose": True, } # Set the validation data for best model saving dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.0) pred_class = prediction(args.model_name, model_parameters, args.task) pred_class.fit(dataset_training) test_y_hat = pred_class.predict(dataset_testing) print("Finish predictor model training and testing.") #%% Step 7: Estimate Uncertainty (1 option) uncertainty_model = uncertainty(args.uncertainty_model_name, model_parameters, pred_class, args.task) uncertainty_model.fit(dataset_training) test_ci_hat = uncertainty_model.predict(dataset_testing) print("Finish uncertainty estimation") #%% Step 8: Interpret Predictions (1 option) interpretor = interpretation(args.interpretation_model_name, model_parameters, pred_class, args.task) interpretor.fit(dataset_training) test_s_hat = interpretor.predict(dataset_testing) print("Finish model interpretation") #%% Step 9: Visualize Results idx = np.random.permutation(len(test_y_hat))[:2] # Evaluate predictor model result = Metrics(metric_sets, metric_parameters).evaluate(dataset_testing.label, test_y_hat) print("Finish predictor model evaluation.") # Visualize the output # (1) Performance print("Overall performance") print_performance(result, metric_sets, metric_parameters) # (2) Predictions print("Each prediction") print_prediction(test_y_hat[idx], metric_parameters) # (3) Uncertainty print("Uncertainty estimations") print_uncertainty(test_y_hat[idx], test_ci_hat[idx], metric_parameters) # (4) Model interpretation print("Model interpretation") print_interpretation(test_s_hat[idx], dataset_training.feature_name, metric_parameters, model_parameters) return
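A minimal sketch (assumed, not part of the source) of driving the prediction pipeline's main() without a command line: the attribute names mirror exactly what the function reads, and every value is a placeholder drawn from the options listed in the docstring.

# Hypothetical invocation; column names and hyperparameter values are placeholders.
from argparse import Namespace

args = Namespace(
    data_name='mimic', normalization='minmax', one_hot_encoding='admission_type',
    problem='one-shot', max_seq_len=24, label_name='death', treatment=None,
    static_imputation_model='median', temporal_imputation_model='linear',
    static_feature_selection_model=None, static_feature_selection_number=None,
    temporal_feature_selection_model=None, temporal_feature_selection_number=None,
    model_name='gru', h_dim=100, n_layer=2, n_head=2, batch_size=128, epochs=20,
    learning_rate=0.001, static_mode='concatenate', time_mode='concatenate',
    task='classification', uncertainty_model_name='ensemble',
    interpretation_model_name='tinvase', metric_name='auc',
)
main(args)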