Code example #1
def data_preprocess(params):
    ### Record Concatenation
    dataio = DataIO(params['input_path'], params['result_path'])
    dataio.read_data()
    ctn = Concatenation(dataio)
    # patient_info maps patient id -> Patient, holding static and dynamic
    # features; dynamic features are stored as {time: feature_value}
    patient_info, n_feature = ctn.get_concatenation()
    ### Data Imputation 
    imp_method = 'simple' 
    imp = Imputation(patient_info, n_feature)
    patient_array = imp.get_imputation(imp_method)
    return (dataio, patient_info, patient_array)
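
A minimal sketch of how data_preprocess might be driven. The two keys shown are the only ones this snippet reads; the paths are placeholders, and DataIO, Concatenation and Imputation must be importable from the project's own modules (assumed here, not shown in the excerpt).

# Hypothetical driver for the function above; paths are placeholders.
params = {
    'input_path': 'data/raw/',       # directory DataIO reads the records from
    'result_path': 'data/results/',  # directory DataIO writes results to
}

dataio, patient_info, patient_array = data_preprocess(params)
print(len(patient_info), 'patients preprocessed')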
Code example #2
 def preprocessorMain(self):
     self.removeTargetColumn()
     while True:
         print("\nTasks (Preprocessing)\n")
         for task in self.tasks:
             print(task)
         while True:
             try:
                 choice = int(
                     input(
                         "\nWhat do you want to do? [enter -1 to exit]:  "))
             except ValueError:
                 print("Integer Value required. Try again.....")
                 continue
             break
         if choice == -1:
             exit()
         elif choice == 1:
             DataDescription(self.data).describe()
         elif choice == 2:
             self.data = Imputation(self.data).imputer()
         elif choice == 3:
             self.data = Categorical(self.data).categoricalMain()
         elif choice == 4:
             self.data = FeatureScaling(self.data).scaling()
         elif choice == 5:
             Download(self.data).download()
         else:
             print("\nWrong choice!! Try again...")
Code example #3
def data_preprocess(params):
    ### Record Concatenation
    dataio = DataIO(params['input_path'], params['map_path'], params['domain'])
    dataio.read_data()
    dataio.read_label()
    ctn = Concatenation(dataio, params['domain'])
    # patient_info maps patient id -> Patient, holding static and dynamic
    # features; dynamic features are stored as {time: feature_value}
    patient_info, n_feature, feature_list, feature_range = ctn.get_concatenation()
    ### Data Imputation
    imp_method = 'simple'
    imp = Imputation(patient_info, n_feature)
    patient_array, patient_time = imp.get_imputation(imp_method)

    ### Clinical Data with DTI Generation
    cli = CliGen(feature_list, feature_range, ctn.dti_time)
    subject_array = cli.get_data(patient_array, patient_time, params['time'])
    if params['binary']:  # only works for discrete clinical features
        subject_array = cli.get_binarization()
    subject_label = cli.get_label(patient_info, params['labels'], params['time'])
    return subject_array, subject_label
Code example #4
	def training(self):
		# Prepare the data
		self.imp = Imputation(self.data)

		# Select the features
		self.features = FeatureSelection(self.imp.imputed_data)
		data_selected = self.features.data_selected
		self.selected_features = self.features.selected_features

		# Find the missing patterns
		self.missing_patterns = MissingPatterns(self.data, self.selected_features).missing_patterns

		# Train the classifiers
		#print('test train')
		for mpi in self.missing_patterns:

			# Select the features available under this missing pattern
			cpi = set(self.selected_features) - set(mpi)
			data_temp = Instances.copy_instances(data_selected, from_row=0, num_rows=data_selected.num_instances)
			data_temp.class_is_last()

			# Separate the training data
			data_temp = self.reduceData(data_temp, cpi, self.data)

			
			# Train the classifier with the imputed data
			classifier = Classifier(classname=self.learn_class, options=self.options)
			classifier.build_classifier(data_temp)
			
			#print(classifier.distribution_for_instance(data_selected.get_instance(30)))
			

			# Check the weight of each classifier (its classification accuracy)
			evl = Evaluation(data_temp)
			evl.crossvalidate_model(classifier, data_temp, 15, Random(1))

			# Add the trained classifier to the set of classifiers
			my_classifier = MyClassifier(classifier, cpi, 1 - evl.mean_absolute_error)
			self.classifiers.add(my_classifier)
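
This method appears to build on python-weka-wrapper3 (Classifier, Evaluation, Instances, Random), which only works while a JVM is running. A hedged sketch of the surrounding lifecycle, assuming the rest of the setup happens inside the try block:

import weka.core.jvm as jvm

jvm.start(packages=True)   # the wrapper needs a running JVM before any Weka class is touched
try:
    # ... construct the object that owns training() and call it here ...
    pass
finally:
    jvm.stop()             # always shut the JVM down, even on errors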
Code example #5
	parser.add_argument('--chromosomes', help='comma separated values of chromosomes (If not set, imputation for all chromosomes will be performed)')
	parser.add_argument('--additional_shapeit_parameters', help='Extra command line arguments to pass to SHAPEIT tool', default=' ')
	parser.add_argument('--additional_impute2_parameters', help='Extra command line arguments to pass to impute2 tool', default=' ')
	parser.add_argument('--position_batch_size', help='Chromosomal size of each imputation batch', default=5000000, type=int)
	parser.add_argument('--sample_batch_size', help='Minimum number of samples in imputation batches', default=500, type=int)
	parser.add_argument('--reference', help='name of the imputation reference panel')
	parser.add_argument('--action', help='Action to do: liftover, phase, impute', choices=['liftover', 'phase', 'impute', 'phase_impute', 'liftover_phase_impute'])
	parser.add_argument('--add_reference', help='Add a new reference panel', action='store_true')
	parser.add_argument('--backend', help='Execution environment. Default: local', choices=['pbs',  'grid', 'local'], default='local')
	parser.add_argument('--chain_file', help='Genomic assembly for the liftover step', default='hg18ToHg19')
	parser.add_argument('--nosubmit', help='Create scripts but don\'t submit them for execution', action='store_true')
	parser.add_argument('--java_executable', help='java executable. Default: java. This is useful when java is not in the PATH', default='java')
	
	args = parser.parse_args()

	imp = Imputation(installation_dir=args.installation_dir, reference_dir=args.reference_dir)

	#Check for absolute paths:
	check_for_absolute_path('--study', args.study)
	check_for_absolute_path('--output', args.output)
	check_for_absolute_path('--installation_dir', args.installation_dir)
	check_for_absolute_path('--reference_dir', args.reference_dir)

	if args.results:
		if args.output:
			if args.results != args.output:
				raise Exception('--results and --output refer to the same parameter but were given different values')
		else:
			args.output = args.results

	if args.dl_tools:
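
The excerpt stops here, and check_for_absolute_path is not shown. A plausible minimal implementation (a guess for illustration, not the project's actual helper) would be:

import os

def check_for_absolute_path(option_name, value):
    """Raise if a user-supplied path option is set but not absolute."""
    if value is not None and not os.path.isabs(value):
        raise ValueError('%s must be an absolute path, got: %s' % (option_name, value))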
Code example #6
def main(args):
    '''Main function for active sensing.
  
  Args:
    - data loading parameters:
      - data_names: mimic, ward, cf    
      
    - preprocess parameters: 
      - normalization: minmax, standard, None
      - one_hot_encoding: input features that need to be one-hot encoded
      - problem: 'one-shot' or 'online'
        - 'one-shot': one time prediction at the end of the time-series 
        - 'online': prediction at every time stamp of the time-series
      - max_seq_len: maximum sequence length after padding
      - label_name: the column name for the label(s)
      - treatment: the column name for treatments
      
    - imputation parameters: 
      - static_imputation_model: mean, median, mice, missforest, knn, gain
      - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain
            
    - feature selection parameters:
      - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
      - feature_number: selected feature number
      
    - active_sensing_model_parameters:
      - active_sensing_model_name: asac, deepsensing
      - model_name: rnn, lstm, gru
      - model_parameters: network parameters such as number of layers
        - h_dim: hidden dimensions
        - n_layer: layer number
        - n_head: head number (only for transformer model)
        - batch_size: number of samples in mini-batch
        - epochs: number of epochs
        - learning_rate: learning rate
      - static_mode: how to utilize static features (concatenate or None)
      - time_mode: how to utilize time information (concatenate or None)
      - task: classification or regression
  '''
    #%% Step 0: Set basic parameters
    metric_parameters = {
        'problem': args.problem,
        'label_name': [args.label_name]
    }

    #%% Step 1: Upload Dataset
    # File names
    data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_'

    data_loader_training = CSVLoader(
        static_file=data_directory + 'static_train_data.csv.gz',
        temporal_file=data_directory + 'temporal_train_data_eav.csv.gz')

    data_loader_testing = CSVLoader(
        static_file=data_directory + 'static_test_data.csv.gz',
        temporal_file=data_directory + 'temporal_test_data_eav.csv.gz')

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()

    print('Finish data loading.')

    #%% Step 2: Preprocess Dataset
    # (0) filter out negative values (Automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(
        one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder,
                                       normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)

    print('Finish preprocessing.')

    #%% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem,
                                 label=[args.label_name],
                                 max_seq_len=args.max_seq_len,
                                 treatment=args.treatment)

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)

    print('Finish defining problem.')

    #%% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type='static')
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model,
        data_type='temporal')

    imputation_pipeline = PipelineComposer(static_imputation,
                                           temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)

    print('Finish imputation.')

    #%% Step 5: Feature selection (4 options)
    static_feature_selection = \
    FeatureSelection(feature_selection_model_name = args.static_feature_selection_model,
                     feature_type = 'static', feature_number = args.static_feature_selection_number,
                     task = args.task, metric_name = args.metric_name,
                     metric_parameters = metric_parameters)

    temporal_feature_selection = \
    FeatureSelection(feature_selection_model_name = args.temporal_feature_selection_model,
                     feature_type = 'temporal', feature_number = args.temporal_feature_selection_number,
                     task = args.task, metric_name = args.metric_name,
                     metric_parameters = metric_parameters)

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(
        dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)

    print('Finish feature selection.')

    #%% Step 6: Fit and Predict (6 options)
    # Set predictor model parameters
    model_parameters = {
        'h_dim': args.h_dim,
        'n_layer': args.n_layer,
        'batch_size': args.batch_size,
        'epoch': args.epochs,
        'model_type': args.model_name,
        'learning_rate': args.learning_rate,
        'static_mode': args.static_mode,
        'time_mode': args.time_mode,
        'verbose': True
    }

    # Set the validation data for best model saving
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.0)

    active_sensing_class = active_sensing(args.active_sensing_model_name,
                                          model_parameters, args.task)
    active_sensing_class.fit(dataset_training)
    test_s_hat = active_sensing_class.predict(dataset_testing)

    print('Finish original predictor model training and testing.')

    #%% Step 7: Visualize Results
    idx = np.random.permutation(len(test_s_hat))[:2]

    # Visualize the output
    print('Future Measurements Recommendation')
    print_interpretation(test_s_hat[idx], dataset_testing.feature_name,
                         metric_parameters, model_parameters)

    return
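
Throughout this example, fit_transform is called on the training split and plain transform on the test split, so that statistics such as normalization ranges are learned from training data only. A generic sketch of that composer pattern (an illustration of the idea, not the library's actual PipelineComposer):

class SimpleComposer:
    """Chain preprocessing stages that expose fit_transform/transform."""

    def __init__(self, *stages):
        self.stages = stages

    def fit_transform(self, dataset):
        # Fit each stage on the training data and pass the result along.
        for stage in self.stages:
            dataset = stage.fit_transform(dataset)
        return dataset

    def transform(self, dataset):
        # Reuse the fitted state of each stage on unseen data.
        for stage in self.stages:
            dataset = stage.transform(dataset)
        return dataset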
Code example #7
def main(args):
    '''Main function for AutoML in time-series predictions.
  
  Args:
    - data loading parameters:
      - data_names: mimic, ward, cf    
      
    - preprocess parameters: 
      - normalization: minmax, standard, None
      - one_hot_encoding: input features that need to be one-hot encoded
      - problem: 'one-shot' or 'online'
        - 'one-shot': one time prediction at the end of the time-series 
        - 'online': prediction at every time stamp of the time-series
      - max_seq_len: maximum sequence length after padding
      - label_name: the column name for the label(s)
      - treatment: the column name for treatments
      
    - imputation parameters: 
      - static_imputation_model: mean, median, mice, missforest, knn, gain
      - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain
            
    - feature selection parameters:
      - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
      - feature_number: selected feature number
      
    - predictor_parameters:
      - epochs: number of epochs
      - bo_itr: bayesian optimization iterations
      - static_mode: how to utilize static features (concatenate or None)
      - time_mode: how to utilize time information (concatenate or None)
      - task: classification or regression
      
    - metric_name: auc, apr, mae, mse
  '''
    #%% Step 0: Set basic parameters
    metric_sets = [args.metric_name]
    metric_parameters = {
        'problem': args.problem,
        'label_name': [args.label_name]
    }

    #%% Step 1: Upload Dataset
    # File names
    data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_'

    data_loader_training = CSVLoader(
        static_file=data_directory + 'static_train_data.csv.gz',
        temporal_file=data_directory + 'temporal_train_data_eav.csv.gz')

    data_loader_testing = CSVLoader(
        static_file=data_directory + 'static_test_data.csv.gz',
        temporal_file=data_directory + 'temporal_test_data_eav.csv.gz')

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()

    print('Finish data loading.')

    #%% Step 2: Preprocess Dataset
    # (0) filter out negative values (Automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(
        one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder,
                                       normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)

    print('Finish preprocessing.')

    #%% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem,
                                 label=[args.label_name],
                                 max_seq_len=args.max_seq_len,
                                 treatment=[args.treatment])

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)

    print('Finish defining problem.')

    #%% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type='static')
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model,
        data_type='temporal')

    imputation_pipeline = PipelineComposer(static_imputation,
                                           temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)

    print('Finish imputation.')

    #%% Step 5: Feature selection (4 options)
    static_feature_selection = \
    FeatureSelection(feature_selection_model_name = args.static_feature_selection_model,
                     feature_type = 'static', feature_number = args.static_feature_selection_number,
                     task = args.task, metric_name = args.metric_name,
                     metric_parameters = metric_parameters)

    temporal_feature_selection = \
    FeatureSelection(feature_selection_model_name = args.temporal_feature_selection_model,
                     feature_type = 'temporal', feature_number = args.temporal_feature_selection_number,
                     task = args.task, metric_name = args.metric_name,
                     metric_parameters = metric_parameters)

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(
        dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)

    print('Finish feature selection.')

    #%% Step 6: Bayesian Optimization
    ## Model define

    model_parameters = {
        'projection_horizon': 5,
        'static_mode': 'concatenate',
        'time_mode': 'concatenate'
    }

    crn_model = CRN_Model(task=args.task)
    crn_model.set_params(**model_parameters)

    model_class = crn_model

    # train_validate split
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.2)

    # Bayesian Optimization Start
    metric = BOMetric(metric='auc', fold=0, split='test')

    # Run BO for selected model class
    BO_model = AutoTS(dataset_training, model_class, metric)
    models, bo_score = BO_model.training_loop(num_iter=2)
    auto_ens_model = AutoEnsemble(models, bo_score)

    # Prediction
    assert not dataset_testing.is_validation_defined
    test_y_hat = auto_ens_model.predict(dataset_testing, test_split='test')
    test_y = dataset_testing.label

    print('Finish AutoML model training and testing.')

    #%% Step 7: Visualize Results
    idx = np.random.permutation(len(test_y_hat))[:2]

    # Evaluate predictor model
    result = Metrics(metric_sets,
                     metric_parameters).evaluate(test_y, test_y_hat)
    print('Finish predictor model evaluation.')

    # Visualize the output
    # (1) Performance
    print('Overall performance')
    print_performance(result, metric_sets, metric_parameters)
    # (2) Predictions
    print('Each prediction')
    print_prediction(test_y_hat[idx], metric_parameters)

    return
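
The args namespace consumed by this main() is built elsewhere in the project. A hypothetical argparse stub covering a few of the documented options (option names and choices follow the docstring; the defaults are illustrative guesses):

import argparse

def build_parser():
    parser = argparse.ArgumentParser(description='AutoML for time-series prediction')
    parser.add_argument('--data_name', choices=['mimic', 'ward', 'cf'], default='cf')
    parser.add_argument('--normalization', choices=['minmax', 'standard', 'none'], default='minmax')
    parser.add_argument('--problem', choices=['one-shot', 'online'], default='one-shot')
    parser.add_argument('--max_seq_len', type=int, default=24)
    parser.add_argument('--label_name', default='label')
    parser.add_argument('--metric_name', choices=['auc', 'apr', 'mae', 'mse'], default='auc')
    parser.add_argument('--task', choices=['classification', 'regression'], default='classification')
    return parser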
Code example #8
    def read_split(self, Lpath, target_name):
        """Creates train dataset
         Parameters
        ----------
        Lpath : list, defaut = None
            List of str paths to load the data
        target_name : str, default = None
            The name of the target. Works for both classification
            (multiclass or not) and regression.
        Returns
        -------
        dict
            Dictionnary containing :
            - 'train' : pandas dataframe for train dataset
            - 'target' : encoded pandas Serie for the target on train set (with dtype='float' for a regression or dtype='int' for a classification)
        """

        col = []
        col_train = []
        df_train = dict()
        y_train = dict()

        if (type(Lpath) != list):

            raise ValueError("You must specify a list of paths "
                             "to load all the data")

        elif (self.to_path is None):

            raise ValueError("You must specify a path to save your data "
                             "and make sure your files are not already saved")

        else:

            ##############################################################
            #                    Reading the files
            ##############################################################

            for path in Lpath:

                # Reading each file
                df = self.pre_clean(path, drop_duplicate=False)
                # Checking if the target exists
                if (target_name in df.columns):

                    is_null = df[target_name].isnull()
                    train = df

                    df_train[path] = train[~is_null].drop(target_name, axis=1)
                    y_train[path] = train[target_name][~is_null]

            del df

            # Exceptions

            if (sum([df_train[path].shape[0]
                     for path in df_train.keys()]) == 0):
                raise ValueError("You have no train dataset. "
                                 "Please check that the "
                                 "target name is correct.")
            # Finding the common subset of features

            for i, df in enumerate(df_train.values()):

                if (i == 0):
                    col_train = df.columns
                else:
                    col_train = list(set(col_train) & set(df.columns))

            col = sorted(list(set(col_train)))

            if (self.verbose):
                print("")
                print("> Number of common features : " + str(len(col)))

                ##############################################################
                #          Creating train and target dataframes
                ##############################################################

                print("")
                print("gathering and crunching for train datasets ...")

            # TODO: Optimize
            df_train = pd.concat([df[col] for df in df_train.values()])
            y_train = pd.concat([y for y in y_train.values()])  # TODO: optimize
            # Checking shape of the target

            if (type(y_train) == pd.core.frame.DataFrame):
                raise ValueError(
                    "Your train target contains more than two columns !"
                    " Please check that only one column "
                    "is named " + target_name)
            else:
                pass

            # Handling indices

            if (self.verbose):
                print("reindexing for train datasets ...")

            if (df_train.index.nunique() < df_train.shape[0]):
                df_train.index = range(df_train.shape[0])

            if (y_train.index.nunique() < y_train.shape[0]):
                y_train.index = range(y_train.shape[0])

        #    Dropping duplicates

            if (self.verbose):
                print("dropping training duplicates ...")

            # Temp adding target to check (x,y) duplicates...
            df_train[target_name] = y_train.values
            df_train = df_train.drop_duplicates()
            del df_train[target_name]
            y_train = y_train.loc[df_train.index]  # TODO: Need to reindex ?

            #   Deleting constant variables

            if (self.verbose):
                print("dropping constant variables on training set ...")
            for var in col:
                if (df_train[var].nunique(dropna=False) == 1):
                    del df_train[var]

            # Missing values
            sparse_features = (df_train.isnull().sum() /
                               df_train.shape[0]).sort_values(ascending=False)
            sparse = True
            if (sparse_features.max() == 0.0):
                sparse = False

            # Print information

            if (self.verbose):
                if (sparse):
                    print("")
                    print("> " "% missing values on train set:")
                    print(
                        np.round(sparse_features[sparse_features > 0.0][:5],
                                 1))

                else:
                    print("")
                    print("> You have no missing values on train set...")

            high_missing_features = sparse_features[
                sparse_features > self.missing_threshold].index

            if len(high_missing_features) > 0:
                if (self.verbose):
                    print("")
                    print(
                        f'dropping training set columns with high missing rate (> {self.missing_threshold}) ...'
                    )
                    print(f'drop {len(high_missing_features)} columns')
                    print(high_missing_features)
                df_train.drop(high_missing_features, axis=1, inplace=True)

            else:
                if (self.verbose):
                    print("")
                    print(
                        f'dropping columns with high missing rate (> {self.missing_threshold}) ...'
                    )
                    print('> No need to drop any columns!')

            if (self.verbose):
                print("")
                print(
                    "> Number of categorical features:"
                    " " +
                    str(len(df_train.dtypes[df_train.dtypes ==
                                            'object'].index)))  # noqa
                print(
                    "> Number of numerical features:"
                    " " +
                    str(len(df_train.dtypes[
                        df_train.dtypes != 'object'].index)))  # noqa
                print("> Number of training samples : " +
                      str(df_train.shape[0]))
            ##############################################################
            #                    Encoding target
            ##############################################################

            task = "classification"

            if (y_train.nunique() <= 2):
                task = "classification"

            else:
                if (y_train.dtype == object):
                    task = "classification"
                else:
                    # no needs to convert into float
                    pass

            if (self.verbose):
                print("")
                print("> Task : " + task)

            if (task == "classification"):
                if (self.verbose):
                    print('Train Target')
                    print(y_train.value_counts())
                    print("")
                    print("encoding target ...")
                enc = LabelEncoder()
                y_train = pd.Series(enc.fit_transform(y_train.values),
                                    index=y_train.index,
                                    name=target_name,
                                    dtype='int')
                print("training set encoding finished")

            else:
                if (self.verbose):
                    print(y_train.describe())

            ##############################################################
            #                         Dumping
            ##############################################################

            # Creating a folder to save the files and target encoder

            try:
                os.mkdir(self.to_path)
            except OSError:
                pass

            if (self.to_hdf5):

                start_time = time.time()

                if (self.verbose):
                    print("")
                    print("dumping files into directory : " + self.to_path)

                # Temp adding target to dump train file...
                df_train[target_name] = y_train.values
                df_train.to_hdf(self.to_path + '/df_train.h5', 'train')
                del df_train[target_name]

                if (self.verbose):
                    print("train dumped")

            else:
                pass

            if (task == "classification"):
                fhand = open(self.to_path + '/target_encoder.obj', 'wb')
                pickle.dump(enc, fhand)
                fhand.close()
            else:
                pass

            if (self.verbose):
                print("")
                print("Impute the Missing Values...")
            imp = Imputation()
            df_train = imp.fit_transform(df_train)
            if (self.verbose):
                print("")

            return {
                "train": df_train,
                "target": y_train,
            }
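
Assuming this method lives on a reader-style object that exposes to_path, to_hdf5, verbose, missing_threshold and pre_clean (all referenced in the snippet itself), a call might look like the following; the object and file names are hypothetical.

# 'reader' stands in for whatever object defines read_split().
data = reader.read_split(
    Lpath=['train_part1.csv', 'train_part2.csv'],  # parts are concatenated on their common columns
    target_name='target')

X_train = data['train']    # cleaned feature dataframe
y_train = data['target']   # label-encoded target Series (int for classification)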
Code example #9
	parser.add_argument('--study', help='Absolute path of the directory off the study panel')
	parser.add_argument('--output', help='Absolute path of the output (results) directory')
	parser.add_argument('--chromosomes', help='comma separated values of chromosomes (If not set, imputation for all chromosomes will be performed)')
	parser.add_argument('--additional_shapeit_parameters', help='Extra command line arguments to pass to SHAPEIT tool', default=' ')
	parser.add_argument('--additional_impute2_parameters', help='Extra command line arguments to pass to impute2 tool', default=' ')
	parser.add_argument('--position_batch_size', help='Chromosomal size of each imputation batch', default=5000000, type=int)
	parser.add_argument('--sample_batch_size', help='Minimum number of samples in imputation batches', default=500, type=int)
	parser.add_argument('--reference', help='name of the imputation reference panel')
	parser.add_argument('--action', help='Action to do: liftover, phase, impute', choices=['liftover', 'phase', 'impute'])
	parser.add_argument('--add_reference', help='Add a new reference panel', action='store_true')
	parser.add_argument('--backend', help='Execution environment. Default: local', choices=['pbs',  'grid', 'local'], default='local')
	parser.add_argument('--nosubmit', help='Create scripts but don\'t submit them for execution', action='store_true')
	
	args = parser.parse_args()

	imp = Imputation(tools_dir=args.tools_dir, reference_dir=args.reference_dir)

	if args.dl_tools:
		imp.install_imputation_tools()

	elif args.list:
		imp.list_reference_panels()

	elif args.dl_reference:
		imp.install_reference_panel(args.dl_reference)

	elif args.add_reference:
		imp.add_custom_reference_panels()

	elif args.action:
		if not args.study:
Code example #10
def main(args):
    """Main function for AutoML in time-series predictions.
  
  Args:
    - data loading parameters:
      - data_names: mimic, ward, cf    
      
    - preprocess parameters: 
      - normalization: minmax, standard, None
      - one_hot_encoding: input features that need to be one-hot encoded
      - problem: 'one-shot' or 'online'
        - 'one-shot': one time prediction at the end of the time-series 
        - 'online': prediction at every time stamp of the time-series
      - max_seq_len: maximum sequence length after padding
      - label_name: the column name for the label(s)
      - treatment: the column name for treatments
      
    - imputation parameters: 
      - static_imputation_model: mean, median, mice, missforest, knn, gain
      - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain
            
    - feature selection parameters:
      - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
      - feature_number: selected feature number
      
    - predictor_parameters:
      - epochs: number of epochs
      - bo_itr: bayesian optimization iterations
      - static_mode: how to utilize static features (concatenate or None)
      - time_mode: how to utilize time information (concatenate or None)
      - task: classification or regression
      
    - metric_name: auc, apr, mae, mse
  """
    #%% Step 0: Set basic parameters
    metric_sets = [args.metric_name]
    metric_parameters = {
        "problem": args.problem,
        "label_name": [args.label_name]
    }

    #%% Step 1: Upload Dataset
    # File names
    data_directory = "../datasets/data/" + args.data_name + "/" + args.data_name + "_"

    data_loader_training = CSVLoader(
        static_file=data_directory + "static_train_data.csv.gz",
        temporal_file=data_directory + "temporal_train_data_eav.csv.gz",
    )

    data_loader_testing = CSVLoader(
        static_file=data_directory + "static_test_data.csv.gz",
        temporal_file=data_directory + "temporal_test_data_eav.csv.gz",
    )

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()

    print("Finish data loading.")

    #%% Step 2: Preprocess Dataset
    # (0) filter out negative values (Automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(
        one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder,
                                       normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)

    print("Finish preprocessing.")

    #%% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem,
                                 label=[args.label_name],
                                 max_seq_len=args.max_seq_len,
                                 treatment=args.treatment)

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)

    print("Finish defining problem.")

    #%% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type="static")
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model,
        data_type="temporal")

    imputation_pipeline = PipelineComposer(static_imputation,
                                           temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)

    print("Finish imputation.")

    #%% Step 5: Feature selection (4 options)
    static_feature_selection = FeatureSelection(
        feature_selection_model_name=args.static_feature_selection_model,
        feature_type="static",
        feature_number=args.static_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters,
    )

    temporal_feature_selection = FeatureSelection(
        feature_selection_model_name=args.temporal_feature_selection_model,
        feature_type="temporal",
        feature_number=args.temporal_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters,
    )

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(
        dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)

    print("Finish feature selection.")

    #%% Step 6: Bayesian Optimization
    ## Model define
    # RNN model
    rnn_parameters = {
        "model_type": "lstm",
        "epoch": args.epochs,
        "static_mode": args.static_mode,
        "time_mode": args.time_mode,
        "verbose": False,
    }

    general_rnn = GeneralRNN(task=args.task)
    general_rnn.set_params(**rnn_parameters)

    # CNN model
    cnn_parameters = {
        "epoch": args.epochs,
        "static_mode": args.static_mode,
        "time_mode": args.time_mode,
        "verbose": False,
    }
    temp_cnn = TemporalCNN(task=args.task)
    temp_cnn.set_params(**cnn_parameters)

    # Transformer
    transformer = TransformerPredictor(task=args.task,
                                       epoch=args.epochs,
                                       static_mode=args.static_mode,
                                       time_mode=args.time_mode)

    # Attention model
    attn_parameters = {
        "model_type": "lstm",
        "epoch": args.epochs,
        "static_mode": args.static_mode,
        "time_mode": args.time_mode,
        "verbose": False,
    }
    attn = Attention(task=args.task)
    attn.set_params(**attn_parameters)

    # model_class_list = [general_rnn, attn, temp_cnn, transformer]
    model_class_list = [general_rnn, attn]

    # train_validate split
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.1)

    # Bayesian Optimization Start
    metric = BOMetric(metric="auc", fold=0, split="test")

    ens_model_list = []

    # Run BO for each model class
    for m in model_class_list:
        BO_model = automl.model.AutoTS(dataset_training,
                                       m,
                                       metric,
                                       model_path="tmp/")
        models, bo_score = BO_model.training_loop(num_iter=args.bo_itr)
        auto_ens_model = AutoEnsemble(models, bo_score)
        ens_model_list.append(auto_ens_model)

    # Load all ensemble models
    for ens in ens_model_list:
        for m in ens.models:
            m.load_model(BO_model.model_path + "/" + m.model_id + ".h5")

    # Stacking algorithm
    stacking_ens_model = StackingEnsemble(ens_model_list)
    stacking_ens_model.fit(dataset_training, fold=0, train_split="val")

    # Prediction
    assert not dataset_testing.is_validation_defined
    test_y_hat = stacking_ens_model.predict(dataset_testing, test_split="test")
    test_y = dataset_testing.label

    print("Finish AutoML model training and testing.")

    #%% Step 7: Visualize Results
    idx = np.random.permutation(len(test_y_hat))[:2]

    # Evaluate predictor model
    result = Metrics(metric_sets,
                     metric_parameters).evaluate(test_y, test_y_hat)
    print("Finish predictor model evaluation.")

    # Visualize the output
    # (1) Performance
    print("Overall performance")
    print_performance(result, metric_sets, metric_parameters)
    # (2) Predictions
    print("Each prediction")
    print_prediction(test_y_hat[idx], metric_parameters)

    return
Code example #11
File: main.py  Project: ksharma75/MLPreprocessingTool
def main():
    print('-'*10+'Welcome to ML Preprocessor CLI'+'-'*10+'\n\n')
    try:
        file_path=sys.argv[1]
        if not file_path.endswith('.csv'):
            raise IncorrectFileFormatError("file is not in CSV format")

        revised_df=readCSV(file_path)
        print('\nScreenshot of independent dataframe:\n')
        print(revised_df.head())
        print('\n'+'-'*30+'\n')
        while True:
            print('\nTasks (Preprocessing)')
            print('1. Data Description')
            print('2. Handling NULL values')
            print('3. Encoding Categorical Data')
            print('4. Feature Scaling of the Dataset')
            print('5. Download the modified Dataset\n')
            option=int(input('What do you want to do? (Press -1 to exit): '))
            if option==-1:
                raise ExitError
            elif option==1:
                data_desc=DataDescription(revised_df)
                while True:
                    option=data_desc.getOption()
                    if option==-1:
                        break
                    elif option==1:
                        data_desc.showProperty()
                    elif option==2:
                        data_desc.showStats()
                    elif option==3:
                        data_desc.showDF()
                    else:
                        print('Incorrect option! Try again.')
                
            elif option==2:
                impute=Imputation(revised_df)
                while True:
                    option=impute.getOption()
                    if option==-1:
                        break
                    elif option==1:
                        impute.countNULL()
                    elif option==2:
                        revised_df=impute.dropColumn()
                    elif option==3:
                        revised_df=impute.fillUtil()
                    elif option==4:
                        impute.showDF()
                    else:
                        print('Incorrect option! Try again.')
            elif option==3:
                encode=EncodeCategorical(revised_df)
                while True:
                    option=encode.getOption()
                    if option==-1:
                        break
                    elif option==1:
                        encode.showCategorical()
                    elif option==2:
                        revised_df=encode.performOneHotEncodingUtil()
                    elif option==3:
                        encode.showDF()
                    else:
                        print('Incorrect option! Try again.')
            elif option==4:
                while True:
                    scale=FeatureScaling(revised_df)
                    option=scale.getOption()
                    if option==-1:
                        break
                    elif option==1:
                        scale.normalizeUtil()
                    elif option==2:
                        scale.standardizeUtil()
                    elif option==3:
                        scale.showDF()
                    else:
                        print('Incorrect option! Try again.')
            elif option==5:
                download=Download(revised_df)
                download.downloadDataframe()
            else:
                print('Incorrect option! Try again.')
    except IndexError as e:
        print('File path missing.',e)
    except IncorrectFileFormatError as e:
        print('Incorrect file format.',e)
    except FileNotFoundError as e:
        print('File Not Found!',e)
    except ExitError as e:
        print(e)
    except Exception as e:
        print('Incorrect option chosen.',e)       
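
IncorrectFileFormatError and ExitError are project-specific exceptions that are not shown in this excerpt. Minimal stand-in definitions (assumptions for illustration, not the project's code) would be:

class IncorrectFileFormatError(Exception):
    """Raised when the supplied file is not a CSV."""


class ExitError(Exception):
    """Raised to leave the CLI loop when the user enters -1."""

    def __str__(self):
        return 'Exiting the ML Preprocessor CLI.'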
Code example #12
def main(args):
    '''Main function for individual treatment effect estimation.

  Args:
    - data loading parameters:
      - data_names: mimic, ward, cf, mimic_antibiotics

    - preprocess parameters:
      - normalization: minmax, standard, None
      - one_hot_encoding: input features that need to be one-hot encoded
      - problem: 'online'
        - 'online': prediction at every time stamp of the time-series
      - max_seq_len: maximum sequence length after padding
      - label_name: the column name for the label(s)
      - treatment: the column name for treatments

    - imputation parameters:
      - static_imputation_model: mean, median, mice, missforest, knn, gain
      - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain

    - feature selection parameters:
      - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
      - feature_number: selected feature number

    - treatment effects model parameters:
      - model_name: CRN, RMSN, GANITE
      Each model has different types of hyperparameters that need to be set.

        - Parameters needed for the Counterfactual Recurrent Network (CRN):
          - hyperparameters for encoder:
              - rnn_hidden_units: hidden dimensions in the LSTM unit
              - rnn_keep_prob: keep probability used for variational dropout in the LSTM unit
              - br_size: size of the balancing representation
              - fc_hidden_units: hidden dimensions of the fully connected layers used for treatment classifier and predictor
              - batch_size: number of samples in mini-batch
              - num_epochs: number of epochs
              - learning_rate: learning rate
              - max_alpha: alpha controls the trade-off between building treatment invariant representations (domain
                discrimination) and being able to predict outcomes (outcome prediction); during training, CRN uses an
                exponentially increasing schedule for alpha from 0 to max_alpha.
          - hyperparameters for decoder:
              - the decoder requires the same hyperparameters as the encoder with the exception of the rnn_hidden_units
                which is set to be equal to the br_size of the encoder

        - Parameters for Recurrent Marginal Structural Networks (RMSN):
            - hyperparameters for encoder:
                - dropout_rate: dropout probability used for variational dropout in the LSTM unit
                - rnn_hidden_units: hidden dimensions in the LSTM unit
                - batch_size: number of samples in mini-batch
                - num_epochs: number of epochs
                - learning_rate: learning rate
                - max_norm: max gradient norm used for gradient clipping during training
            - hyperparameters for decoder:
                - the decoder requires the same hyperparameters as the encoder.
            - model_dir: directory where the model is saved
            - model_name: name of the saved model

        - Parameters for GANITE:
          - batch size: number of samples in mini-batch
          - alpha: parameter trading off between discriminator loss and supervised loss for the generator training
          - learning_rate: learning rate
          - hidden_units: hidden dimensions of the fully connected layers used in the networks
          - stack_dim: number of timesteps to stack

        All models have the following common parameters:
          - static_mode: how to utilize static features (concatenate or None)
          - time_mode: how to utilize time information (concatenate or None)
          - task: 'classification' or 'regression'


    - metric_name: auc, apr, mae, mse (used for factual prediction)
    - patient id: patient for which counterfactual trajectories are computed
    - timestep: timestep in patient trajectory for estimating counterfactuals
  '''
    # %% Step 0: Set basic parameters
    metric_sets = [args.metric_name]
    metric_parameters = {
        'problem': args.problem,
        'label_name': [args.label_name]
    }

    # %% Step 1: Upload Dataset
    # File names
    data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_'

    data_loader_training = CSVLoader(
        static_file=data_directory + 'static_train_data.csv.gz',
        temporal_file=data_directory + 'temporal_train_data_eav.csv.gz')

    data_loader_testing = CSVLoader(
        static_file=data_directory + 'static_test_data.csv.gz',
        temporal_file=data_directory + 'temporal_test_data_eav.csv.gz')

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()

    print('Finish data loading.')

    # %% Step 2: Preprocess Dataset
    # (0) filter out negative values (Automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(
        one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder,
                                       normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)

    print('Finish preprocessing.')

    # %% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem,
                                 label=[args.label_name],
                                 max_seq_len=args.max_seq_len,
                                 treatment=[args.treatment])

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)

    print('Finish defining problem.')

    # %% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type='static')
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model,
        data_type='temporal')

    imputation_pipeline = PipelineComposer(static_imputation,
                                           temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)

    print('Finish imputation.')

    # %% Step 5: Feature selection (4 options)
    static_feature_selection = \
      FeatureSelection(feature_selection_model_name=args.static_feature_selection_model,
                       feature_type='static', feature_number=args.static_feature_selection_number,
                       task=args.task, metric_name=args.metric_name,
                       metric_parameters=metric_parameters)

    temporal_feature_selection = \
      FeatureSelection(feature_selection_model_name=args.temporal_feature_selection_model,
                       feature_type='temporal', feature_number=args.temporal_feature_selection_number,
                       task=args.task, metric_name=args.metric_name,
                       metric_parameters=metric_parameters)

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(
        dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)

    print('Finish feature selection.')

    # %% Step 6: Fit treatment effects (3 options)
    # Set the validation data for best model saving
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.0)

    # Set the treatment effects model
    model_name = args.model_name

    # Set treatment effects model parameters
    if model_name == 'CRN':
        model_parameters = {
            'encoder_rnn_hidden_units': args.crn_encoder_rnn_hidden_units,
            'encoder_br_size': args.crn_encoder_br_size,
            'encoder_fc_hidden_units': args.crn_encoder_fc_hidden_units,
            'encoder_learning_rate': args.crn_encoder_learning_rate,
            'encoder_batch_size': args.crn_encoder_batch_size,
            'encoder_keep_prob': args.crn_encoder_keep_prob,
            'encoder_num_epochs': args.crn_encoder_num_epochs,
            'encoder_max_alpha': args.crn_encoder_max_alpha,
            'decoder_br_size': args.crn_decoder_br_size,
            'decoder_fc_hidden_units': args.crn_decoder_fc_hidden_units,
            'decoder_learning_rate': args.crn_decoder_learning_rate,
            'decoder_batch_size': args.crn_decoder_batch_size,
            'decoder_keep_prob': args.crn_decoder_keep_prob,
            'decoder_num_epochs': args.crn_decoder_num_epochs,
            'decoder_max_alpha': args.crn_decoder_max_alpha,
            'projection_horizon': args.projection_horizon,
            'static_mode': args.static_mode,
            'time_mode': args.time_mode
        }
        treatment_model = treatment_effects_model(model_name,
                                                  model_parameters,
                                                  task='classification')
        treatment_model.fit(dataset_training)

    elif model_name == 'RMSN':
        hyperparams_encoder_iptw = {
            'dropout_rate': args.rmsn_encoder_dropout_rate,
            'memory_multiplier': args.rmsn_encoder_memory_multiplier,
            'num_epochs': args.rmsn_encoder_num_epochs,
            'batch_size': args.rmsn_encoder_batch_size,
            'learning_rate': args.rmsn_encoder_learning_rate,
            'max_norm': args.rmsn_encoder_max_norm
        }

        hyperparams_decoder_iptw = {
            'dropout_rate': args.rmsn_decoder_dropout_rate,
            'memory_multiplier': args.rmsn_decoder_memory_multiplier,
            'num_epochs': args.rmsn_decoder_num_epochs,
            'batch_size': args.rmsn_decoder_batch_size,
            'learning_rate': args.rmsn_decoder_learning_rate,
            'max_norm': args.rmsn_decoder_max_norm
        }

        model_parameters = {
            'hyperparams_encoder_iptw': hyperparams_encoder_iptw,
            'hyperparams_decoder_iptw': hyperparams_decoder_iptw,
            'model_dir': args.rmsn_model_dir,
            'model_name': args.rmsn_model_name,
            'static_mode': args.static_mode,
            'time_mode': args.time_mode
        }

        treatment_model = treatment_effects_model(model_name,
                                                  model_parameters,
                                                  task='classification')
        treatment_model.fit(dataset_training,
                            projection_horizon=args.projection_horizon)

    elif model_name == 'GANITE':
        hyperparams = {
            'batch_size': args.ganite_batch_size,
            'alpha': args.ganite_alpha,
            'hidden_dims': args.ganite_hidden_dims,
            'learning_rate': args.ganite_learning_rate
        }

        model_parameters = {
            'hyperparams': hyperparams,
            'stack_dim': args.ganite_stack_dim,
            'static_mode': args.static_mode,
            'time_mode': args.time_mode
        }

        treatment_model = treatment_effects_model(model_name,
                                                  model_parameters,
                                                  task='classification')
        treatment_model.fit(dataset_training)

    test_y_hat = treatment_model.predict(dataset_testing)

    print('Finish treatment effects model training and testing.')

    # %% Step 9: Visualize Results

    # Evaluate predictor model
    result = Metrics(metric_sets,
                     metric_parameters).evaluate(dataset_testing.label,
                                                 test_y_hat)
    print('Finish predictor model evaluation.')

    # Visualize the output
    # (1) Performance on estimating factual outcomes
    print('Overall performance on estimating factual outcomes')
    print_performance(result, metric_sets, metric_parameters)

    # (2) Counterfactual trajectories
    print('Counterfactual trajectories')
    if model_name in ['CRN', 'RMSN']:
        # Predict and visualize counterfactuals for the sequence of treatments indicated by the user
        # through the treatment_options. The lengths of each sequence of treatments needs to be projection_horizon + 1.
        treatment_options = np.array([[[1], [1], [1], [1], [1], [0]],
                                      [[0], [0], [0], [0], [1], [1]]])
        history, counterfactual_traj = treatment_model.predict_counterfactual_trajectories(
            dataset=dataset_testing,
            patient_id=args.patient_id,
            timestep=args.timestep,
            treatment_options=treatment_options)

        print_counterfactual_predictions(
            patient_history=history,
            treatment_options=treatment_options,
            counterfactual_predictions=counterfactual_traj)

    return
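
The hard-coded treatment_options above holds candidate treatment sequences, and the comment notes each sequence must have length projection_horizon + 1. A small sketch of building such an array for a one-dimensional binary treatment; the horizon value is an example only:

import numpy as np

projection_horizon = 5  # example value; must match the fitted model's horizon

# Two candidate plans: treat for the first five steps, or only start treating at the last step.
treatment_options = np.array([
    [[1]] * projection_horizon + [[0]],
    [[0]] * projection_horizon + [[1]],
])
assert treatment_options.shape == (2, projection_horizon + 1, 1)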
Code example #13
def main(args):
    """Main function for time-series prediction.
    
    Args:
        - data loading parameters:
            - data_names: mimic, ward, cf        
            
        - preprocess parameters: 
            - normalization: minmax, standard, None
            - one_hot_encoding: input features that need to be one-hot encoded
            - problem: 'one-shot' or 'online'
                - 'one-shot': one time prediction at the end of the time-series 
                - 'online': prediction at every time stamp of the time-series
            - max_seq_len: maximum sequence length after padding
            - label_name: the column name for the label(s)
            - treatment: the column name for treatments
            
        - imputation parameters: 
            - static_imputation_model: mean, median, mice, missforest, knn, gain
            - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain
                        
        - feature selection parameters:
            - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
            - feature_number: selected feature number
            
        - predictor_parameters:
            - model_name: rnn, gru, lstm, attention, tcn, transformer
            - model_parameters: network parameters such as number of layers
                - h_dim: hidden dimensions
                - n_layer: layer number
                - n_head: head number (only for transformer model)
                - batch_size: number of samples in mini-batch
                - epochs: number of epochs
                - learning_rate: learning rate
            - static_mode: how to utilize static features (concatenate or None)
            - time_mode: how to utilize time information (concatenate or None)
            - task: classification or regression
            
        - uncertainty_model_name: uncertainty estimation model name (ensemble)
        - interpretation_model_name: interpretation model name (tinvase)
        - metric_name: auc, apr, mae, mse
    """
    #%% Step 0: Set basic parameters
    metric_sets = [args.metric_name]
    metric_parameters = {
        "problem": args.problem,
        "label_name": [args.label_name]
    }

    #%% Step 1: Upload Dataset
    # File names
    data_directory = "../datasets/data/" + args.data_name + "/" + args.data_name + "_"

    data_loader_training = CSVLoader(
        static_file=data_directory + "static_train_data.csv.gz",
        temporal_file=data_directory + "temporal_train_data_eav.csv.gz",
    )

    data_loader_testing = CSVLoader(
        static_file=data_directory + "static_test_data.csv.gz",
        temporal_file=data_directory + "temporal_test_data_eav.csv.gz",
    )

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()

    print("Finish data loading.")

    #%% Step 2: Preprocess Dataset
    # (0) filter out negative values (Automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(
        one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder,
                                       normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)

    print("Finish preprocessing.")

    #%% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem,
                                 label=[args.label_name],
                                 max_seq_len=args.max_seq_len,
                                 treatment=args.treatment)

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)

    print("Finish defining problem.")

    #%% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type="static")
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model,
        data_type="temporal")

    imputation_pipeline = PipelineComposer(static_imputation,
                                           temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)

    print("Finish imputation.")

    #%% Step 5: Feature selection (4 options)
    static_feature_selection = FeatureSelection(
        feature_selection_model_name=args.static_feature_selection_model,
        feature_type="static",
        feature_number=args.static_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters,
    )

    temporal_feature_selection = FeatureSelection(
        feature_selection_model_name=args.temporal_feature_selection_model,
        feature_type="temporal",
        feature_number=args.temporal_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters,
    )

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(
        dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)

    print("Finish feature selection.")

    #%% Step 6: Fit and Predict (6 options)
    # Set predictor model parameters
    model_parameters = {
        "h_dim": args.h_dim,
        "n_layer": args.n_layer,
        "n_head": args.n_head,
        "batch_size": args.batch_size,
        "epoch": args.epochs,
        "model_type": args.model_name,
        "learning_rate": args.learning_rate,
        "static_mode": args.static_mode,
        "time_mode": args.time_mode,
        "verbose": True,
    }

    # Set the validation data for best model saving
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.0)

    pred_class = prediction(args.model_name, model_parameters, args.task)
    pred_class.fit(dataset_training)
    test_y_hat = pred_class.predict(dataset_testing)

    print("Finish predictor model training and testing.")

    #%% Step 7: Estimate Uncertainty (1 option)
    uncertainty_model = uncertainty(args.uncertainty_model_name,
                                    model_parameters, pred_class, args.task)
    uncertainty_model.fit(dataset_training)
    test_ci_hat = uncertainty_model.predict(dataset_testing)
    print("Finish uncertainty estimation")

    #%% Step 8: Interpret Predictions (1 option)
    interpretor = interpretation(args.interpretation_model_name,
                                 model_parameters, pred_class, args.task)
    interpretor.fit(dataset_training)
    test_s_hat = interpretor.predict(dataset_testing)
    print("Finish model interpretation")

    #%% Step 9: Visualize Results
    idx = np.random.permutation(len(test_y_hat))[:2]

    # Evaluate predictor model
    result = Metrics(metric_sets,
                     metric_parameters).evaluate(dataset_testing.label,
                                                 test_y_hat)
    print("Finish predictor model evaluation.")

    # Visualize the output
    # (1) Performance
    print("Overall performance")
    print_performance(result, metric_sets, metric_parameters)
    # (2) Predictions
    print("Each prediction")
    print_prediction(test_y_hat[idx], metric_parameters)
    # (3) Uncertainty
    print("Uncertainty estimations")
    print_uncertainty(test_y_hat[idx], test_ci_hat[idx], metric_parameters)
    # (4) Model interpretation
    print("Model interpretation")
    print_interpretation(test_s_hat[idx], dataset_training.feature_name,
                         metric_parameters, model_parameters)

    return
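
The visualization step samples two test indices with np.random.permutation, which changes on every run. For reproducible reports a seeded generator can be used instead; a small sketch (the seed and the test-set size are arbitrary examples):

import numpy as np

rng = np.random.default_rng(seed=42)   # fixed seed -> same indices every run
n_test = 100                           # stands in for len(test_y_hat)
idx = rng.permutation(n_test)[:2]
print(idx)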