Example 1
    def __init__(self, data_init_params):

        for key in data_init_params:
            setattr(self, key, data_init_params[key])
        alm_fun.show_msg(
            self.log, self.verbose, 'Class: [alm_data] [__init__] ' +
            self.name + ' ......done @' + str(datetime.now()))
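
Example 1 (like Examples 2 and 3 below) uses the same pattern: __init__ receives a single dict and copies every key onto the instance with setattr. A minimal, self-contained sketch of how such a class might be constructed; the class and parameter names here are illustrative, not taken from the repository:

from datetime import datetime

class demo_data:
    """Stand-in class showing the dict-driven __init__ pattern."""

    def __init__(self, data_init_params):
        # copy every key/value pair of the dict onto the instance as attributes
        for key in data_init_params:
            setattr(self, key, data_init_params[key])

demo = demo_data({'name': 'my_dataset', 'log': None, 'verbose': 1})
print(demo.name, demo.verbose, str(datetime.now()))
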
Example 2
    def __init__(self, predictor_init_params):
        for key in predictor_init_params:
            setattr(self, key, predictor_init_params[key])

        #parameters that are not open for configuration yet
        self.rfp_cutoff = 0.9
        self.pfr_cutoff = 0.9
        self.init_weights = None
        self.target_as_source = None
        self.fill_na_type = None
        self.tune_tree_nums_before_test = 0
        self.tune_tree_nums_during_cv = 0
        self.shuffle_features = []
        self.eval_obj = 'auc'
        self.trials_mv_step = 0
        self.use_extra_train_data = 2
        self.nofit = 0
        self.if_feature_engineer = 0

        alm_fun.show_msg(
            self.log, self.verbose, 'Class: [alm_predictor] [__init__] ' +
            self.name + ' ...... @' + str(datetime.now()))

        if self.type == 1:  # VARITY predictors
            #             if self.hp_tune_type == 'hyperopt':
            #                 self.create_hyperopt_hps() # create hp_config_dict and hyperopt_hps
            #             if self.hp_tune_type == 'hyperopt_logistic':
            self.create_hyperopt_logistic_hps()
Example 3
    def __init__(self, data_init_params):

        for key in data_init_params:
            setattr(self, key, data_init_params[key])

        #parameters that are not open for configuration yet
        self.filter_train = 0
        self.filter_test = 0
        self.filter_target = 0
        self.filter_validation = 0
        self.if_gradient = 0
        self.verbose = 1
        self.dependent_variable = 'label'
        self.independent_testset = 0
        self.validation_from_testset = 0
        self.if_engineer = 0
        self.use_extra_data = 1
        self.extra_data_index = 0

        alm_fun.show_msg(
            self.log, self.verbose, 'Class: [alm_data] [__init__] ' +
            self.name + ' ...... @' + str(datetime.now()))
Example 4
    def create_hyperopt_hps(self):
        hyperopt_hps = {}
        extra_data = self.data_instance.extra_train_data_df_lst[0]
        self.hp_parameters = {}
        self.hp_parameters['all'] = []
        self.hp_parameters['hyperopt'] = []
        self.hp_parameters['sp'] = []
        self.hp_default = {}
        self.hp_values = {}
        self.hp_range_start = {}
        self.hp_range_end = {}
        self.hp_indices = {}
        self.hp_rest_indices = {}
        self.hp_directions = {}

        self.hp_mv_values = {}
        self.hp_mv_range_start = {}
        self.hp_mv_range_end = {}
        self.hp_mv_indices = {}
        self.hp_mv_rest_indices = {}

        self.hps = {}

        create_new_hp_config = 0
        hp_config_file = self.project_path + '/output/npy/' + self.session_id + '_' + self.name + '_hp_config_dict.npy'
        if os.path.isfile(hp_config_file):
            if self.init_hp_config == 1:
                create_new_hp_config = 1
        else:
            create_new_hp_config = 1

        if create_new_hp_config == 0:
            hp_config_dict = np.load(hp_config_file, allow_pickle=True).item()  # allow_pickle is required on NumPy >= 1.16.3 to load a pickled dict
            self.hp_directions = hp_config_dict['hp_directions']
            self.hp_parameters = hp_config_dict['hp_parameters']
            self.hp_values = hp_config_dict['hp_values']
            self.hp_range_start = hp_config_dict['hp_range_start']
            self.hp_range_end = hp_config_dict['hp_range_end']
            self.hp_indices = hp_config_dict['hp_indices']
            self.hp_rest_indices = hp_config_dict['hp_rest_indices']

            if self.old_system == 1:
                #                 self.hp_directions['extra_gnomad_af'] = 1
                # the old hp_config_file did not include hp_default or the moving-window analysis config, so we add them manually
                # hp evaluation config
                for cur_hp in self.hyperparameter.keys():
                    hp = self.hyperparameter[cur_hp]
                    #parameters that are not open for configuration yet
                    hp['filter_type'] = 3  # filtering method
                    hp['mv_type'] = 0  # moving analysis method
                    hp['mv_size'] = 0
                    if self.name not in hp['predictor']:
                        continue
                    else:
                        self.hps[cur_hp] = hp

                    if hp['hp_type'] == 1:  #filtering parameters
                        extra_data_df = extra_data.loc[
                            extra_data['set_name'].isin(hp['source']), :]
                        self.hp_parameters['sp'].append(cur_hp)
                        [
                            pivots, pivot_indices, pivot_rest_indices,
                            pivot_values_range_start, pivot_values_range_end
                        ] = self.pivot_points(extra_data_df, hp, 1)
                        self.hp_mv_values[cur_hp] = range(len(pivots))
                        self.hp_mv_indices[cur_hp] = pivot_indices
                        self.hp_mv_rest_indices[cur_hp] = pivot_rest_indices
                        self.hp_mv_range_start[
                            cur_hp] = pivot_values_range_start
                        self.hp_mv_range_end[cur_hp] = pivot_values_range_end

                #for hp_default
                self.hp_default = hp_config_dict['hp_default']
                for cur_hp in self.hyperparameter.keys():
                    hp = self.hyperparameter[cur_hp]
                    self.hp_default[cur_hp] = hp['default']

                hp_config_dict['hp_mv_values'] = self.hp_mv_values
                hp_config_dict['hp_mv_range_start'] = self.hp_mv_range_start
                hp_config_dict['hp_mv_range_end'] = self.hp_mv_range_end
                hp_config_dict['hp_mv_indices'] = self.hp_mv_indices
                hp_config_dict['hp_mv_rest_indices'] = self.hp_mv_rest_indices

                hp_config_dict['hp_default'] = self.hp_default
                hp_config_dict['hps'] = self.hps

                np.save(hp_config_file, hp_config_dict)
                print('old system hp_config_dict converted.....')
            else:
                self.hp_mv_values = hp_config_dict['hp_mv_values']
                self.hp_mv_indices = hp_config_dict['hp_mv_indices']
                self.hp_mv_rest_indices = hp_config_dict['hp_mv_rest_indices']
                self.hp_mv_range_start = hp_config_dict['hp_mv_range_start']
                self.hp_mv_range_end = hp_config_dict['hp_mv_range_end']
                self.hp_default = hp_config_dict['hp_default']
                self.hps = hp_config_dict['hps']

            alm_fun.show_msg(self.log, self.verbose,
                             'Saved hp config dict loaded.')
        else:
            for cur_hp in self.hyperparameter.keys():
                hp = self.hyperparameter[cur_hp]
                #parameters that are not open for configuration yet
                hp['filter_type'] = 3  # filtering method
                hp['mv_type'] = 0  # moving analysis method
                hp['mv_size'] = 0

                if self.name not in hp['predictor']:
                    continue
                else:
                    self.hps[cur_hp] = hp

                if hp['hp_type'] == 1:  #filtering parameters
                    self.hp_default[cur_hp] = hp['default']
                    self.hp_directions[cur_hp] = hp['direction']
                    if hp['enable'] == 1:
                        self.hp_parameters['hyperopt'].append(cur_hp)
                    extra_data_df = extra_data.loc[
                        extra_data['set_name'].isin(hp['source']), :]
                    [
                        pivots, pivot_indices, pivot_rest_indices,
                        pivot_values_range_start, pivot_values_range_end
                    ] = self.pivot_points(extra_data_df, hp, 0)
                    self.hp_values[cur_hp] = range(len(pivots))
                    self.hp_indices[cur_hp] = pivot_indices
                    self.hp_rest_indices[cur_hp] = pivot_rest_indices
                    self.hp_range_start[cur_hp] = pivot_values_range_start
                    self.hp_range_end[cur_hp] = pivot_values_range_end

                    # for moving window analysis
                    self.hp_parameters['sp'].append(cur_hp)
                    [
                        pivots, pivot_indices, pivot_rest_indices,
                        pivot_values_range_start, pivot_values_range_end
                    ] = self.pivot_points(extra_data_df, hp, 1)
                    self.hp_mv_values[cur_hp] = range(len(pivots))
                    self.hp_mv_indices[cur_hp] = pivot_indices
                    self.hp_mv_rest_indices[cur_hp] = pivot_rest_indices
                    self.hp_mv_range_start[cur_hp] = pivot_values_range_start
                    self.hp_mv_range_end[cur_hp] = pivot_values_range_end

                if hp['hp_type'] == 2:  # weight parameters
                    # note: extra_data_df below is whatever was assigned in a preceding hp_type == 1 iteration
                    self.hp_default[cur_hp] = hp['default']
                    self.hp_parameters['hyperopt'].append(cur_hp)
                    #                     cur_weights = np.linspace(start = hp['from'], stop = hp['to'], num = 101)
                    cur_weights = np.round(
                        np.arange(hp['from'], hp['to'] + hp['step'],
                                  hp['step']), 2)
                    self.hp_values[cur_hp] = cur_weights
                    self.hp_indices[cur_hp] = {}
                    self.hp_rest_indices[cur_hp] = {}
                    self.hp_range_start[cur_hp] = {}
                    self.hp_range_end[cur_hp] = {}
                    for weight in cur_weights:
                        self.hp_indices[cur_hp][weight] = extra_data_df.index
                        self.hp_rest_indices[cur_hp][weight] = []
                        self.hp_range_start[cur_hp][weight] = np.nan
                        self.hp_range_end[cur_hp][weight] = np.nan

            #save current hyper parameter configurations
            hp_config_dict = {}
            hp_config_dict['hp_parameters'] = self.hp_parameters
            hp_config_dict['hp_values'] = self.hp_values
            hp_config_dict['hp_range_start'] = self.hp_range_start
            hp_config_dict['hp_range_end'] = self.hp_range_end
            hp_config_dict['hp_indices'] = self.hp_indices
            hp_config_dict['hp_rest_indices'] = self.hp_rest_indices
            hp_config_dict['hp_directions'] = self.hp_directions

            hp_config_dict['hp_mv_values'] = self.hp_mv_values
            hp_config_dict['hp_mv_range_start'] = self.hp_mv_range_start
            hp_config_dict['hp_mv_range_end'] = self.hp_mv_range_end
            hp_config_dict['hp_mv_indices'] = self.hp_mv_indices
            hp_config_dict['hp_mv_rest_indices'] = self.hp_mv_rest_indices

            hp_config_dict['hp_default'] = self.hp_default
            hp_config_dict['hps'] = self.hps

            np.save(hp_config_file, hp_config_dict)
            alm_fun.show_msg(
                self.log, self.verbose,
                'Hyperparameter config dictionary for ' + self.name +
                ' saved.')

        for hp_parameter in self.hp_parameters['hyperopt']:
            hyperopt_hps[hp_parameter] = hyperopt.hp.choice(
                hp_parameter, self.hp_values[hp_parameter])

        for cur_hp in self.hyperparameter.keys():
            hp = self.hyperparameter[cur_hp]
            if self.name not in hp['predictor']:
                continue

            if hp['hp_type'] == 3:  #algorithm level hyper-parameters
                self.hp_default[cur_hp] = hp['default']
                if hp['type'] == 'real':
                    hyperopt_hps[cur_hp] = hyperopt.hp.quniform(
                        cur_hp, hp['from'], hp['to'], hp['step'])
                if (hp['type'] == 'int') | (hp['type'] == 'category'):
                    hyperopt_hps[cur_hp] = hyperopt.hp.choice(
                        cur_hp, np.arange(hp['from'], hp['to'], dtype=int))

#         np.save(self.project_path + '/output/npy/' + self.session_id + '_hyperopt_hps.npy',hyperopt_hps)
        return (hyperopt_hps)
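
create_hyperopt_hps returns a dict of hyperopt search-space expressions: hp.choice over pivot indices or weight grids, and hp.quniform or hp.choice for algorithm-level hyper-parameters. A hedged sketch of how a space of that shape is typically handed to hyperopt.fmin; the parameter names and the objective function below are placeholders and are not part of the original code:

import hyperopt
from hyperopt import fmin, tpe, Trials

# hypothetical space in the same shape as the returned hyperopt_hps dict
space = {
    'extra_weight': hyperopt.hp.choice('extra_weight', [0.0, 0.25, 0.5, 0.75, 1.0]),
    'max_depth': hyperopt.hp.choice('max_depth', list(range(3, 10))),
    'learning_rate': hyperopt.hp.quniform('learning_rate', 0.01, 0.3, 0.01),
}

def objective(params):
    # placeholder: train a model with `params` and return a loss to minimize
    return 1.0 - params['learning_rate']

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=20, trials=trials)
print(best)
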
Example 5
    def refresh_data(self):

        #         self.verbose = verbose

        if (self.load_from_disk == 0) | (not os.path.isfile(
                self.project_path + '/output/npy/' + self.session_id + '_' +
                self.name + '_savedata.npy')):
            # load data (set initial features, handle one-hot features, remove samples without a valid dependent variable)
            self.load_data()
            msg = "Data loading......\n" + self.data_msg(split=0)
            alm_fun.show_msg(self.log, self.verbose, msg)
            # slice data
            self.preprocess_data()
            msg = "Data preprocessing......\n" + self.data_msg(split=0)
            alm_fun.show_msg(self.log, self.verbose, msg)
            # filter data
            self.filter_data()
            msg = "Data filtering......\n" + self.data_msg(split=0)
            alm_fun.show_msg(self.log, self.verbose, msg)

            #split data
            self.split_data()
            msg = "Data spliting.....\n" + self.data_msg()
            alm_fun.show_msg(self.log, self.verbose, msg)

            # gradient reshape
            if self.if_gradient == 1:
                self.gradient_data()
                msg = "[gradient_data]\n" + self.data_msg()
                alm_fun.show_msg(self.log, self.verbose, msg)

            # engineer data
            if self.if_engineer == 1:
                self.engineer_data()
                msg = "[egineer_data]\n" + self.data_msg()
                alm_fun.show_msg(self.log, self.verbose, msg)

            if self.save_to_disk == 1:
                self.save_data()

        else:
            self.dict_savedata = np.load(self.project_path + '/output/npy/' +
                                         self.session_id + '_' + self.name +
                                         '_savedata.npy',
                                         allow_pickle=True)
            self.train_data_original_df = self.dict_savedata.get(
                'train_data_original_df', None)
            self.extra_train_data_df_lst = self.dict_savedata[
                'extra_train_data_df_lst']
            self.train_data_df = self.dict_savedata['train_data_df']
            self.test_data_df = self.dict_savedata['test_data_df']
            self.target_data_df = self.dict_savedata['target_data_df']

            self.train_data_index_df = self.dict_savedata[
                'train_data_index_df']
            self.validation_data_index_df = self.dict_savedata[
                'validation_data_index_df']
            self.test_data_index_df = self.dict_savedata['test_data_index_df']
            self.target_data_index_df = self.dict_savedata[
                'target_data_index_df']

            self.train_data_for_target_df = self.dict_savedata[
                'train_data_for_target_df']
            self.target_data_for_target_df = self.dict_savedata[
                'target_data_for_target_df']
            self.validation_data_for_target_df = self.dict_savedata[
                'validation_data_for_target_df']

            self.train_splits_df = self.dict_savedata['train_splits_df']
            self.test_splits_df = self.dict_savedata['test_splits_df']

            self.train_cv_splits_df = self.dict_savedata['train_cv_splits_df']
            self.validation_cv_splits_df = self.dict_savedata[
                'validation_cv_splits_df']

            self.train_data_index_df['weight'] = 1
            self.extra_train_data_df_lst[0]['weight'] = 1

            #             print (str(self.train_data_index_df.loc[self.train_data_index_df['mpc_obs_exp'].notnull(),:].shape))

            alm_fun.show_msg(
                self.log, self.verbose,
                str(self.train_data_index_df['set_name'].value_counts().
                    sort_index()))
            alm_fun.show_msg(
                self.log, self.verbose,
                str(self.extra_train_data_df_lst[0]
                    ['set_name'].value_counts().sort_index()))

            if self.if_gradient:
                self.gradients = self.dict_savedata['gradients']

            if self.if_engineer == 1:
                self.dict_savedata_engineered = np.load(
                    self.project_path + '/output/npy/' + self.session_id +
                    '_' + self.name + '_savedata_engineered.npy',
                    allow_pickle=True)
                self.train_data_for_target_engineered_df = self.dict_savedata_engineered[
                    'train_data_for_target_engineered_df']
                self.target_data_for_target_engineered_df = self.dict_savedata_engineered[
                    'target_data_for_target_engineered_df']
                self.validation_data_for_target_engineered_df = self.dict_savedata_engineered[
                    'validation_data_for_target_engineered_df']

                self.train_splits_engineered_df = self.dict_savedata_engineered[
                    'train_splits_engineered_df']
                self.test_splits_engineered_df = self.dict_savedata_engineered[
                    'test_splits_engineered_df']

                self.train_cv_splits_engineered_df = self.dict_savedata_engineered[
                    'train_cv_splits_engineered_df']
                self.validation_cv_splits_engineered_df = self.dict_savedata_engineered[
                    'validation_cv_splits_engineered_df']
Example 6
    def refresh_data(self):

        #         self.verbose = verbose

        if self.load_from_disk == 0:
            # load data (set initial features, handle one-hot features, remove samples without a valid dependent variable)
            self.load_data()
            msg = "[load_data]\n" + self.data_msg()
            alm_fun.show_msg(self.log, self.verbose, msg)
            # slice data
            self.slice_data()
            msg = "[slice_data]\n" + self.data_msg()
            alm_fun.show_msg(self.log, self.verbose, msg)
            # filter data
            self.filter_data()
            msg = "[filter_data]\n" + self.data_msg()
            alm_fun.show_msg(self.log, self.verbose, msg)

            #split data
            self.split_data()
            msg = "[split_data]\n" + self.data_msg()
            alm_fun.show_msg(self.log, self.verbose, msg)

            # gradient reshape
            if self.if_gradient == 1:
                self.gradient_data()
                msg = "[gradient_data]\n" + self.data_msg()
                alm_fun.show_msg(self.log, self.verbose, msg)

            # engineer data

            if self.if_engineer == 1:
                self.engineer_data()
                msg = "[egineer_data]\n" + self.data_msg()
                alm_fun.show_msg(self.log, self.verbose, msg)

            if self.save_to_disk == 1:
                self.save_data()

        else:
            self.dict_savedata = np.load(self.path + 'output/npy/' +
                                         self.name + '_savedata.npy',
                                         allow_pickle=True)

            self.extra_train_data_df = self.dict_savedata[
                'extra_train_data_df']
            self.train_data_df = self.dict_savedata['train_data_df']
            self.test_data_df = self.dict_savedata['test_data_df']
            self.target_data_df = self.dict_savedata['target_data_df']

            self.train_data_index_df = self.dict_savedata[
                'train_data_index_df']
            self.validation_data_index_df = self.dict_savedata[
                'validation_data_index_df']
            self.test_data_index_df = self.dict_savedata['test_data_index_df']
            self.target_data_index_df = self.dict_savedata[
                'target_data_index_df']

            self.train_data_for_target_df = self.dict_savedata[
                'train_data_for_target_df']
            self.target_data_for_target_df = self.dict_savedata[
                'target_data_for_target_df']
            self.validation_data_for_target_df = self.dict_savedata[
                'validation_data_for_target_df']

            self.train_splits_df = self.dict_savedata['train_splits_df']
            self.test_splits_df = self.dict_savedata['test_splits_df']

            self.train_cv_splits_df = self.dict_savedata['train_cv_splits_df']
            self.validation_cv_splits_df = self.dict_savedata[
                'validation_cv_splits_df']

            if self.if_gradient:
                self.gradients = self.dict_savedata['gradients']

            if self.if_engineer == 1:
                self.dict_savedata_engineered = np.load(
                    self.path + 'output/npy/' + self.name +
                    '_savedata_engineered.npy',
                    allow_pickle=True)
                self.train_data_for_target_engineered_df = self.dict_savedata_engineered[
                    'train_data_for_target_engineered_df']
                self.target_data_for_target_engineered_df = self.dict_savedata_engineered[
                    'target_data_for_target_engineered_df']
                self.validation_data_for_target_engineered_df = self.dict_savedata_engineered[
                    'validation_data_for_target_engineered_df']

                self.train_splits_engineered_df = self.dict_savedata_engineered[
                    'train_splits_engineered_df']
                self.test_splits_engineered_df = self.dict_savedata_engineered[
                    'test_splits_engineered_df']

                self.train_cv_splits_engineered_df = self.dict_savedata_engineered[
                    'train_cv_splits_engineered_df']
                self.validation_cv_splits_engineered_df = self.dict_savedata_engineered[
                    'validation_cv_splits_engineered_df']

            msg = "[refresh_data] -- load from disk --" + self.data_msg()

            #             if self.save_to_disk == 1:
            #
            #                 self.dict_savedata = {}
            #                 self.dict_savedata['extra_train_data_df'] = self.extra_train_data_df
            #                 self.dict_savedata['train_data_df'] = self.train_data_df
            #                 self.dict_savedata['test_data_df'] = self.test_data_df
            #                 self.dict_savedata['target_data_df'] = self.target_data_df
            #
            #                 self.dict_savedata['train_data_index_df'] = self.train_data_index_df
            #                 self.dict_savedata['validation_data_index_df'] = self.validation_data_index_df
            #                 self.dict_savedata['test_data_index_df'] = self.test_data_index_df
            #                 self.dict_savedata['target_data_index_df'] = self.target_data_index_df
            #
            #                 self.dict_savedata['train_data_for_target_df'] = self.train_data_for_target_df
            #                 self.dict_savedata['target_data_for_target_df'] = self.target_data_for_target_df
            #                 self.dict_savedata['validation_data_for_target_df'] = self.validation_data_for_target_df
            #
            #                 self.dict_savedata['train_splits_df'] = self.train_splits_df
            #                 self.dict_savedata['test_splits_df'] = self.test_splits_df
            #
            #                 self.dict_savedata['train_cv_splits_df'] = self.train_cv_splits_df
            #                 self.dict_savedata['validation_cv_splits_df'] = self.validation_cv_splits_df
            #
            #                 if self.if_gradient:
            #                     self.dict_savedata['gradients'] = self.gradients
            #
            #                 pickle_out = open(self.path + 'output/npy/' + self.name + '_savedata.npy', 'wb')
            #                 pickle.dump(self.dict_savedata, pickle_out)
            #                 pickle_out.close()
            #
            #                 self.engineer_data()
            #
            #                 self.dict_savedata_engineered = {}
            #                 self.dict_savedata_engineered['train_data_for_target_engineered_df'] = self.train_data_for_target_engineered_df
            #                 self.dict_savedata_engineered['target_data_for_target_engineered_df'] = self.target_data_for_target_engineered_df
            #                 self.dict_savedata_engineered['validation_data_for_target_engineered_df'] = self.validation_data_for_target_engineered_df
            #
            #                 self.dict_savedata_engineered['train_splits_engineered_df'] = self.train_splits_engineered_df
            #                 self.dict_savedata_engineered['test_splits_engineered_df'] = self.test_splits_engineered_df
            #
            #                 self.dict_savedata_engineered['train_cv_splits_engineered_df'] = self.train_cv_splits_engineered_df
            #                 self.dict_savedata_engineered['validation_cv_splits_engineered_df'] = self.validation_cv_splits_engineered_df
            #
            #                 if self.if_gradient:
            #                     self.dict_savedata_engineered['gradients'] = self.gradients
            #
            #                 pickle_out = open(self.path + 'output/npy/' + self.name + '_savedata_engineered.npy','wb')
            #                 pickle.dump(self.dict_savedata_engineered, pickle_out)
            #                 pickle_out.close()

            alm_fun.show_msg(self.log, self.verbose, msg)
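
Both refresh_data variants cache the prepared DataFrames in a single .npy file and reload them when load_from_disk is set. A minimal sketch of that save/reload round trip for a dict of DataFrames (file name and contents are hypothetical); on NumPy 1.16.3+ reloading pickled objects requires allow_pickle=True:

import numpy as np
import pandas as pd

savedata = {
    'train_data_df': pd.DataFrame({'label': [0, 1], 'feature_1': [0.1, 0.9]}),
    'test_data_df': pd.DataFrame({'label': [1], 'feature_1': [0.5]}),
}

# np.save pickles the dict (allow_pickle defaults to True when saving)
np.save('demo_savedata.npy', savedata)

# np.load returns a 0-d object array; .item() recovers the original dict
loaded = np.load('demo_savedata.npy', allow_pickle=True).item()
print(loaded['train_data_df'].shape, loaded['test_data_df'].shape)
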
Example 7
    def run(self,
            features,
            dependent_variable,
            ml_type,
            core_train,
            test,
            extra_train=None,
            validation=None,
            alm_predictor=None,
            model_file=None):

        if alm_predictor is None:
            use_extra_train_data = 0
            nofit = 0
            tune_tree_num = 0
            shap_test_interaction = 0
            shap_train_interaction = 0
            shuffle_features = []
            if_feature_engineer = 0
            load_existing_model = 0
        else:
            use_extra_train_data = alm_predictor.use_extra_train_data
            nofit = alm_predictor.nofit
            tune_tree_num = alm_predictor.tune_tree_nums_before_test
            shap_test_interaction = alm_predictor.shap_test_interaction
            shap_train_interaction = alm_predictor.shap_train_interaction
            shuffle_features = alm_predictor.shuffle_features
            if_feature_engineer = alm_predictor.if_feature_engineer
            load_existing_model = alm_predictor.load_existing_model
            eval_obj = alm_predictor.eval_obj

        #####********************************************************************************************
        # Run the feature engineering function if necessary
        #####********************************************************************************************
        if if_feature_engineer:
            [core_train, test] = self.feature_engineer(core_train, test)

        #####********************************************************************************************
        # If features is a nested list, flatten it
        #####********************************************************************************************
        if any(isinstance(i, list) for i in features):
            features = list(itertools.chain(*features))

        #####********************************************************************************************
        # Shuffle features if necessary, for feature interaction analysis
        #####********************************************************************************************
        if len(shuffle_features) > 0:
            for f in shuffle_features:
                if f != '':
                    core_train[f] = np.random.permutation(core_train[f])

        #####********************************************************************************************
        # Copy the core_train, extra_train, test and validation datasets
        #####********************************************************************************************
        core_train = core_train.copy()
        test = test.copy()
        if (tune_tree_num == 1) & (validation is not None):
            validation = validation.copy()

        if extra_train is not None:
            if extra_train.shape[0] != 0:
                extra_train = extra_train.copy()

        #####********************************************************************************************
        # Combine core_train and extra_train to make the final training dataset
        #####********************************************************************************************
        if use_extra_train_data == 0:  # do not use extra training data
            all_train = core_train

        if use_extra_train_data == 1:  # only use extra training data
            all_train = extra_train

        if use_extra_train_data == 2:  # use extra training data directly + training data, no prediction
            all_train = pd.concat([extra_train, core_train])
            # all_train = all_train.sort_index()

        #####********************************************************************************************
        # Reorder the training data and form groups for the "ranking" loss function (currently disabled)
        #####********************************************************************************************
        # all_train = all_train.sort_values('p_vid')
        # group_counts = all_train['p_vid'].value_counts().sort_index()
        # group_counts = [5000,5000,len(all_train)-10000]
        # group_counts = [len(all_train)]
        # query_group = np.array(group_counts)
        # group_weights = np.ones(len(query_group))

        #####********************************************************************************************
        # Separate the features, labels and weights of the test set and the final training set
        #####********************************************************************************************
        # first check whether core_train and test have all the features; if not, add feature columns with np.nan

        core_train_x = core_train[features]
        core_train_y = core_train[dependent_variable]
        extra_train_x = extra_train[features]
        extra_train_y = extra_train[dependent_variable]
        test_x = test[features]
        test_y = test[dependent_variable]
        test_index = test.index
        if validation is not None:
            validation_x = validation[features]
            validation_y = validation[dependent_variable]

        #####********************************************************************************************
        # Remove training examples whose weight == 0
        #####********************************************************************************************
        if self.weighted_example == 1:


            print('Core examples for training: ' + str(len(core_train_y)) + '[P:' + str(sum(core_train_y==1)) + ' N:' + str(sum(core_train_y==0)) + ']' + \
                  ',  weights:' + str(core_train['weight'].sum()) + '[P:' + str(core_train['weight'][core_train_y==1].sum()) + ' N:' + str(core_train['weight'][core_train_y==0].sum()) + ']')

            print('Extra examples for training: ' + str(len(extra_train_y)) + '[P:' + str(sum(extra_train_y==1)) + ' N:' + str(sum(extra_train_y==0))  + ']' + \
                  ',  weights:' + str(extra_train['weight'].sum()) +  '[P:' + str(extra_train['weight'][extra_train_y==1].sum()) + ' N:' + str(extra_train['weight'][extra_train_y==0].sum()) + ']')

            print('All examples for training: ' + str(len(all_train[dependent_variable])) + '[P:' + str(sum(all_train[dependent_variable]==1)) + ' N:' + str(sum(all_train[dependent_variable]==0))  + ']' + \
                  ',  weights:' + str(all_train['weight'].sum()) + '[P:' + str(all_train['weight'][all_train[dependent_variable]==1].sum()) + ' N:' + str(all_train['weight'][all_train[dependent_variable]==0].sum()) + ']')

            all_train = all_train.loc[(all_train['weight'] != 0), :]

            print('All examples for training after removing examples with ZERO weight: ' + str(len(all_train[dependent_variable])) + '[P:' + str(sum(all_train[dependent_variable]==1)) + ' N:' + str(sum(all_train[dependent_variable]==0))  + ']' + \
                  ',  weights:' + str(all_train['weight'].sum()) + '[P:' + str(all_train['weight'][all_train[dependent_variable]==1].sum()) + ' N:' + str(all_train['weight'][all_train[dependent_variable]==0].sum()) + ']')

        #####********************************************************************************************
        # Features and labels for all training examples
        #####********************************************************************************************
        all_train_x = all_train[features]
        all_train_y = all_train[dependent_variable]

        #####********************************************************************************************
        # Determine the final weights (after balancing positive and negative weights)
        #####********************************************************************************************
        if self.weighted_example == 1:
            weights = all_train['weight']
            negative_idx = all_train_y == 0
            positive_idx = all_train_y == 1
            negative_weights = weights[negative_idx].sum()
            positive_weights = weights[positive_idx].sum()
            weight_ratio = negative_weights / positive_weights
            print('Total negative weights: ' + str(negative_weights))
            print('Total positive weights: ' + str(positive_weights))
            weights[positive_idx] = weights[
                positive_idx] * weight_ratio  #balance negative and positive weights
            print('weights ratio negative/positive: ' + str(weight_ratio))
            print('Total weights after balancing: ' + str(weights.sum()))
        else:
            weights = [1] * all_train.shape[0]

        #####********************************************************************************************
        # Flip labels in the training and test sets for contamination analysis if necessary
        #####********************************************************************************************
        if self.flip_contamination_test == 1:
            test_contamination = test['contamination']
            test_y = [
                list(test_y)[i] if list(test_contamination)[i] != 1 else
                abs(list(test_y)[i] - 1) for i in range(len(test_y))
            ]
            print("Test contamination " +
                  str((test_contamination == 1).sum()) + " flipped!")

        if self.flip_contamination_train == 1:
            train_contamination = all_train['contamination']
            all_train_y = [
                list(all_train_y)[i] if list(train_contamination)[i] != 1 else
                abs(list(all_train_y)[i] - 1) for i in range(len(all_train_y))
            ]
            print("Train contamination " +
                  str((train_contamination == 1).sum()) + " flipped!")

        load_model = 0
        if load_existing_model == 1:
            if os.path.isfile(model_file):
                self.estimator.load_model(model_file)
                load_model = 1

        if load_model == 0:
            #####********************************************************************************************
            # Reset the estimator for every run
            #####********************************************************************************************
            if (self.estimator != None):
                n_estimators = int(self.estimator.n_estimators)
                max_depth = self.estimator.max_depth
                learning_rate = self.estimator.learning_rate
                gamma = self.estimator.gamma
                min_child_weight = self.estimator.min_child_weight
                subsample = self.estimator.subsample
                colsample_bytree = self.estimator.colsample_bytree

                if 'regression' in ml_type:
                    self.estimator = xgb.XGBRegressor(
                        **{
                            'max_depth': max_depth,
                            'n_estimators': n_estimators,
                            'learning_rate': learning_rate,
                            'gamma': gamma,
                            'min_child_weight': min_child_weight,
                            'subsample': subsample,
                            'colsample_bytree': colsample_bytree,
                            'n_jobs': -1
                        })
                if 'classification' in ml_type:
                    self.estimator = xgb.XGBClassifier(
                        **{
                            'max_depth': max_depth,
                            'n_estimators': n_estimators,
                            'learning_rate': learning_rate,
                            'gamma': gamma,
                            'min_child_weight': min_child_weight,
                            'subsample': subsample,
                            'colsample_bytree': colsample_bytree,
                            'n_jobs': -1
                        })

            #####********************************************************************************************
            # Fit the model
            #####********************************************************************************************
            if (self.estimator == None) | (
                (self.single_feature_as_prediction == 1) & (len(features) == 1)
            ):  # if estimator is None, there is no need to train the model
                feature_importance = pd.DataFrame(np.zeros(len(features)),
                                                  index=features).transpose()
            else:
                if nofit == 0:
                    if self.weighted_example == 1:
                        if tune_tree_num == 1:
                            self.estimator.n_estimators = 1000
                            if 'rank' in self.estimator.objective:
                                # note: query_group and group_weights are only defined by the ranking-group code that is commented out earlier in this method
                                self.estimator.fit(all_train_x,
                                                   all_train_y,
                                                   group=query_group,
                                                   sample_weight=group_weights,
                                                   verbose=False,
                                                   eval_set=[
                                                       (validation_x[features],
                                                        validation_y)
                                                   ],
                                                   early_stopping_rounds=50,
                                                   eval_metric=eval_obj)
                            else:
                                self.estimator.fit(all_train_x,
                                                   all_train_y,
                                                   sample_weight=weights,
                                                   verbose=False,
                                                   eval_set=[
                                                       (validation_x[features],
                                                        validation_y)
                                                   ],
                                                   early_stopping_rounds=50,
                                                   eval_metric=eval_obj)
                        else:
                            if 'rank' in self.estimator.objective:
                                self.estimator.fit(all_train_x,
                                                   all_train_y,
                                                   group=query_group,
                                                   sample_weight=group_weights)
                            else:
                                print("Start fit the model : " +
                                      str(datetime.now()))
                                print("Training examples: " +
                                      str(all_train_x.shape[0]) +
                                      " Training weights: " +
                                      str(weights.sum()) + " # of Trees: " +
                                      str(self.estimator.n_estimators))
                                self.estimator.fit(all_train_x,
                                                   all_train_y,
                                                   sample_weight=weights)
                                print("End fit the model : " +
                                      str(datetime.now()))
                    else:
                        if 'rank' in self.estimator.objective:
                            self.estimator.fit(all_train_x,
                                               all_train_y,
                                               group=query_group)
                        else:
                            self.estimator.fit(all_train_x, all_train_y)
        else:
            alm_fun.show_msg(self.log, self.verbose,
                             'Existing model ' + model_file + ' loaded.')

        #####********************************************************************************************
        # Record the feature importance
        #####********************************************************************************************
        if self.feature_importance_name == 'coef_':
            feature_importance = np.squeeze(self.estimator.coef_)
        if self.feature_importance_name == 'feature_importances_':
            feature_importance = np.squeeze(
                self.estimator.feature_importances_)
        if self.feature_importance_name == 'booster':
            if len(features) == 1:
                feature_importance = np.zeros(len(features))
            else:
                if load_existing_model == 0:
                    feature_importance = []
                    im_dict = self.estimator.get_booster().get_score(
                        importance_type='gain')
                    for feature in features:
                        feature_importance.append(im_dict.get(feature, 0))
                else:
                    feature_importance = []
                    im_dict = self.estimator.get_booster().get_score(
                        importance_type='gain')
                    for i in range(len(features)):
                        feature_importance.append(im_dict.get('f' + str(i), 0))
        if self.feature_importance_name == 'none':
            feature_importance = np.zeros(len(features))

        feature_importance = pd.DataFrame(feature_importance,
                                          index=features).transpose()

        #####********************************************************************************************
        # Predict the train and test data
        #####********************************************************************************************
        if ml_type == "regression":
            if (self.estimator
                    == None) | ((self.single_feature_as_prediction == 1) &
                                (len(features) == 1)):
                test_y_predicted = np.array(list(np.squeeze(test_x[features])))
            else:
                try:
                    test_y_predicted = self.estimator.predict_proba(
                        test_x[features])[:, 1]
                except:
                    test_y_predicted = self.estimator.predict(test_x[features])

                if self.prediction_transformation is not None:
                    test_y_predicted = self.prediction_transformation(
                        test_y_predicted)

            test_score_df = pd.DataFrame(np.zeros(2),
                                         index=['pcc', 'rmse']).transpose()
            rmse = alm_fun.rmse_cal(test_y, test_y_predicted)
            pcc = alm_fun.pcc_cal(test_y, test_y_predicted)
            spc = alm_fun.spc_cal(test_y, test_y_predicted)
            test_score_df['rmse'] = rmse
            test_score_df['pcc'] = pcc
            test_score_df['spc'] = spc

            if (self.estimator
                    == None) | ((self.single_feature_as_prediction == 1) &
                                (len(features) == 1)):
                core_train_y_predicted = np.array(
                    list(np.squeeze(all_train_x[features])))
            else:
                try:
                    core_train_y_predicted = self.estimator.predict_proba(
                        all_train_x[features])[:, 1]
                except:
                    core_train_y_predicted = self.estimator.predict(
                        all_train_x[features])

                if self.prediction_transformation is not None:
                    core_train_y_predicted = self.prediction_transformation(
                        core_train_y_predicted)

            core_train_score_df = pd.DataFrame(np.zeros(2),
                                               index=['pcc',
                                                      'rmse']).transpose()
            rmse = alm_fun.rmse_cal(core_train_y, core_train_y_predicted)
            pcc = alm_fun.pcc_cal(core_train_y, core_train_y_predicted)
            spc = alm_fun.spc_cal(core_train_y, core_train_y_predicted)
            core_train_score_df['rmse'] = rmse
            core_train_score_df['pcc'] = pcc
            core_train_score_df['spc'] = spc

        if ml_type == "classification_binary":
            if shap_test_interaction == 1:
                X = xgb.DMatrix(test_x)
                shap_output_test_interaction = self.estimator.get_booster(
                ).predict(X, ntree_limit=-1, pred_interactions=True)
            else:
                shap_output_test_interaction = None

            if (self.estimator
                    == None) | ((self.single_feature_as_prediction == 1) &
                                (len(features) == 1)):
                test_y_predicted = np.array(list(np.squeeze(test_x[features])))
            else:
                try:
                    test_y_predicted = self.estimator.predict_proba(
                        test_x[features])[:, 1]
                except:
                    test_y_predicted = self.estimator.predict(test_x[features])
                if self.prediction_transformation is not None:
                    test_y_predicted = self.prediction_transformation(
                        test_y_predicted)

            test_score_df = pd.DataFrame(np.zeros(10),
                                         index=[
                                             'size', 'prior', 'auroc', 'auprc',
                                             'aubprc', 'up_auprc', 'pfr',
                                             'bpfr', 'rfp', 'brfp'
                                         ]).transpose()
            if len(np.unique(test_y)) == 1:
                test_score_df['size'] = len(test_y)
                test_score_df['auroc'] = np.nan
                test_score_df['auprc'] = np.nan
                test_score_df['aubprc'] = np.nan
                test_score_df['up_auprc'] = np.nan
                test_score_df['prior'] = np.nan
                test_score_df['pfr'] = np.nan
                test_score_df['rfp'] = np.nan
                test_score_df['bpfr'] = np.nan
                test_score_df['brfp'] = np.nan
                test_score_df['logloss'] = np.nan
            else:
                [best_y_predicted, metric, multiclass_metrics
                 ] = alm_fun.classification_metrics(test_y, test_y_predicted)
                test_score_df['size'] = len(test_y)
                test_score_df['auroc'] = metric['auroc']
                test_score_df['auprc'] = metric['auprc']
                test_score_df['aubprc'] = metric['aubprc']
                test_score_df['up_auprc'] = metric['up_auprc']
                test_score_df['prior'] = metric['prior']
                test_score_df['pfr'] = metric['pfr']
                test_score_df['rfp'] = metric['rfp']
                test_score_df['bpfr'] = metric['bpfr']
                test_score_df['brfp'] = metric['brfp']
                test_score_df['logloss'] = metric['logloss']

            # get the SHAP values for all training data
            if shap_train_interaction == 1:
                X = xgb.DMatrix(all_train_x)
                shap_output_train_interaction = self.estimator.get_booster(
                ).predict(X, ntree_limit=-1, pred_interactions=True)
            else:
                shap_output_train_interaction = None

            if (self.estimator
                    == None) | ((self.single_feature_as_prediction == 1) &
                                (len(features) == 1)):
                core_train_y_predicted = np.array(
                    list(np.squeeze(core_train_x[features])))
            else:
                try:
                    core_train_y_predicted = self.estimator.predict_proba(
                        core_train_x[features])[:, 1]
                except:
                    core_train_y_predicted = self.estimator.predict(
                        core_train_x[features])

                if self.prediction_transformation is not None:
                    core_train_y_predicted = self.prediction_transformation(
                        core_train_y_predicted)

            core_train_score_df = pd.DataFrame(np.zeros(10),
                                               index=[
                                                   'size', 'prior', 'auroc',
                                                   'auprc', 'aubprc',
                                                   'up_auprc', 'pfr', 'bpfr',
                                                   'rfp', 'brfp'
                                               ]).transpose()
            if len(np.unique(core_train_y)) == 1:
                core_train_score_df['size'] = len(core_train_y)
                core_train_score_df['auroc'] = np.nan
                core_train_score_df['auprc'] = np.nan
                core_train_score_df['aubprc'] = np.nan
                core_train_score_df['up_auprc'] = np.nan
                core_train_score_df['prior'] = np.nan
                core_train_score_df['pfr'] = np.nan
                core_train_score_df['rfp'] = np.nan
                core_train_score_df['bpfr'] = np.nan
                core_train_score_df['brfp'] = np.nan
                core_train_score_df['logloss'] = np.nan
            else:
                [best_y_predicted, metric, multiclass_metrics
                 ] = alm_fun.classification_metrics(core_train_y,
                                                    core_train_y_predicted)
                core_train_score_df['size'] = len(core_train_y)
                core_train_score_df['auroc'] = metric['auroc']
                core_train_score_df['auprc'] = metric['auprc']
                core_train_score_df['aubprc'] = metric['aubprc']
                core_train_score_df['up_auprc'] = metric['up_auprc']
                core_train_score_df['prior'] = metric['prior']
                core_train_score_df['pfr'] = metric['pfr']
                core_train_score_df['rfp'] = metric['rfp']
                core_train_score_df['bpfr'] = metric['bpfr']
                core_train_score_df['brfp'] = metric['brfp']
                core_train_score_df['logloss'] = metric['logloss']

        if ml_type == "classification_multiclass":
            test_y_predicted_probs = self.estimator.predict_proba(
                test_x[features])
            test_y_predicted = self.estimator.predict(test_x[features])
            core_train_y_predicted_probs = self.estimator.predict_proba(
                core_train_x[features])
            core_train_y_predicted = self.estimator.predict(
                core_train_x[features])

            if self.prediction_transformation is not None:
                test_y_predicted = self.prediction_transformation(
                    test_y_predicted)

            if self.prediction_transformation is not None:
                core_train_y_predicted = self.prediction_transformation(
                    core_train_y_predicted)

            core_train_score_df = pd.DataFrame(np.zeros(1),
                                               index=['neg_log_loss'
                                                      ]).transpose()
            core_train_score_df[
                'neg_log_loss'] = alm_fun.get_classification_metrics(
                    'neg_log_loss', 4, all_train_y,
                    core_train_y_predicted_probs)

            test_score_df = pd.DataFrame(np.zeros(1),
                                         index=['neg_log_loss']).transpose()
            test_score_df['neg_log_loss'] = alm_fun.get_classification_metrics(
                'neg_log_loss', 4, test_y, test_y_predicted_probs)

        core_train_score_df = round(core_train_score_df, self.round_digits)
        test_score_df = round(test_score_df, self.round_digits)
        test_y_predicted = pd.Series(test_y_predicted, index=test_x.index)
        core_train_y_predicted = pd.Series(core_train_y_predicted,
                                           index=core_train_x.index)

        #####********************************************************************************************
        # Return the result dictionary
        #####********************************************************************************************
        return_dict = {}
        return_dict['train_y_predicted'] = core_train_y_predicted
        return_dict['train_y_truth'] = core_train_y
        return_dict['train_score_df'] = core_train_score_df
        return_dict['test_y_predicted'] = test_y_predicted
        return_dict['test_y_truth'] = test_y
        return_dict['test_y_index'] = test_index
        return_dict['test_score_df'] = test_score_df
        return_dict['feature_importance'] = feature_importance.transpose(
        ).sort_values([0])
        return_dict[
            'shap_output_test_interaction'] = shap_output_test_interaction
        return_dict[
            'shap_output_train_interaction'] = shap_output_train_interaction
        return_dict['all_train_indices'] = all_train.index
        return_dict['model'] = self.estimator

        if (self.estimator == None) | (
            (self.single_feature_as_prediction == 1) & (len(features) == 1)):
            return_dict['tuned_tree_num'] = 0
        else:
            if tune_tree_num == 1:
                return_dict['tuned_tree_num'] = len(
                    self.estimator.evals_result()['validation_0']
                    [eval_obj]) - 50
            else:
                return_dict['tuned_tree_num'] = self.estimator.n_estimators

        # return the test dataframe in case some features were engineered
        if if_feature_engineer:
            predicted_test = test.copy()
            predicted_test[dependent_variable] = test_y_predicted
        else:
            predicted_test = None

        return_dict['predicted_df'] = predicted_test
        return (return_dict)
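
When weighted_example is enabled, run() balances the classes by scaling the positive example weights by the ratio of total negative to total positive weight, after dropping zero-weight rows. A standalone sketch of that balancing step with made-up data; the column names mirror the code above but the values are illustrative:

import numpy as np
import pandas as pd

all_train = pd.DataFrame({
    'label': [1, 1, 0, 0, 0, 0],
    'weight': [1.0, 0.5, 1.0, 1.0, 1.0, 0.0],
})

# drop zero-weight examples, as the run() method does
all_train = all_train.loc[all_train['weight'] != 0, :]

weights = all_train['weight'].copy()
positive_idx = all_train['label'] == 1
negative_idx = all_train['label'] == 0

weight_ratio = weights[negative_idx].sum() / weights[positive_idx].sum()
weights[positive_idx] = weights[positive_idx] * weight_ratio

# after balancing, total positive weight equals total negative weight
print(weights[positive_idx].sum(), weights[negative_idx].sum())
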