def __init__(self, data_init_params):
    """Initialize an alm_data instance from a configuration dictionary.

    Every key/value pair in *data_init_params* is copied onto the
    instance as an attribute; the params are expected to supply at least
    ``log``, ``verbose`` and ``name``, which are used for logging below.
    """
    for param_name, param_value in data_init_params.items():
        setattr(self, param_name, param_value)
    alm_fun.show_msg(
        self.log, self.verbose,
        'Class: [alm_data] [__init__] ' + self.name + ' ......done @' + str(datetime.now()))
def __init__(self, predictor_init_params):
    """Initialize an alm_predictor instance from a configuration dictionary.

    Copies every entry of *predictor_init_params* onto the instance,
    then applies a set of hard-coded defaults that are not yet exposed
    through configuration.  For VARITY predictors (``self.type == 1``)
    the logistic hyperopt search space is built as the final step.
    """
    for param_name, param_value in predictor_init_params.items():
        setattr(self, param_name, param_value)
    # Parameters that are not open for configuration yet.
    hard_coded_defaults = {
        'rfp_cutoff': 0.9,
        'pfr_cutoff': 0.9,
        'init_weights': None,
        'target_as_source': None,
        'fill_na_type': None,
        'tune_tree_nums_before_test': 0,
        'tune_tree_nums_during_cv': 0,
        'eval_obj': 'auc',
        'trials_mv_step': 0,
        'use_extra_train_data': 2,
        'nofit': 0,
        'if_feature_engineer': 0,
    }
    for attr_name, attr_value in hard_coded_defaults.items():
        setattr(self, attr_name, attr_value)
    # Assigned separately so each instance gets its own fresh list.
    self.shuffle_features = []
    alm_fun.show_msg(
        self.log, self.verbose,
        'Class: [alm_predictor] [__init__] ' + self.name + ' ...... @' + str(datetime.now()))
    if self.type == 1:  # VARITY predictors
        # Alternative tuning paths kept for reference:
        # if self.hp_tune_type == 'hyperopt':
        #     self.create_hyperopt_hps()  # create hp_config_dict and hyperopt_hps
        # if self.hp_tune_type == 'hyperopt_logistic':
        self.create_hyperopt_logistic_hps()
def __init__(self, data_init_params):
    """Initialize an alm_data instance from a configuration dictionary.

    Copies every entry of *data_init_params* onto the instance, then
    applies hard-coded defaults for the settings that are not yet open
    for configuration (filtering switches, gradient/engineer flags,
    dependent variable name, extra-data usage).
    """
    for param_name, param_value in data_init_params.items():
        setattr(self, param_name, param_value)
    # Parameters that are not open for configuration yet.
    hard_coded_defaults = {
        'filter_train': 0,
        'filter_test': 0,
        'filter_target': 0,
        'filter_validation': 0,
        'if_gradient': 0,
        'verbose': 1,
        'dependent_variable': 'label',
        'independent_testset': 0,
        'validation_from_testset': 0,
        'if_engineer': 0,
        'use_extra_data': 1,
        'extra_data_index': 0,
    }
    for attr_name, attr_value in hard_coded_defaults.items():
        setattr(self, attr_name, attr_value)
    alm_fun.show_msg(
        self.log, self.verbose,
        'Class: [alm_data] [__init__] ' + self.name + ' ...... @' + str(datetime.now()))
def create_hyperopt_hps(self):
    """Build (or reload) the hyperopt search space for this predictor.

    Either loads a previously saved hyperparameter configuration
    dictionary from ``<project>/output/npy/<session>_<name>_hp_config_dict.npy``
    (upgrading "old system" configs in place), or builds it from
    ``self.hyperparameter`` and saves it, then constructs and returns the
    ``hyperopt`` search-space dict for all enabled hyperparameters.

    NOTE(review): the original file's indentation was lost; block
    attachments below were reconstructed from the semantics and should be
    confirmed against version control.
    """
    hyperopt_hps = {}
    # First extra-training-data frame; pivot points for filtering hps are
    # computed over this frame.
    extra_data = self.data_instance.extra_train_data_df_lst[0]
    self.hp_parameters = {}
    self.hp_parameters['all'] = []
    self.hp_parameters['hyperopt'] = []  # hps tuned by hyperopt
    self.hp_parameters['sp'] = []        # hps used for moving-window ("sp") analysis
    self.hp_default = {}
    self.hp_values = {}
    self.hp_range_start = {}
    self.hp_range_end = {}
    self.hp_indices = {}
    self.hp_rest_indices = {}
    self.hp_directions = {}
    self.hp_mv_values = {}
    self.hp_mv_range_start = {}
    self.hp_mv_range_end = {}
    self.hp_mv_indices = {}
    self.hp_mv_rest_indices = {}
    self.hps = {}
    create_new_hp_config = 0
    hp_config_file = self.project_path + '/output/npy/' + self.session_id + '_' + self.name + '_hp_config_dict.npy'
    # Rebuild the config when the file is missing, or when a rebuild is
    # explicitly requested via init_hp_config.
    if os.path.isfile(hp_config_file):
        if self.init_hp_config == 1:
            create_new_hp_config = 1
    else:
        create_new_hp_config = 1
    if create_new_hp_config == 0:
        # Reload the saved configuration (np.save stores the dict as a
        # 0-d object array, hence .item()).
        hp_config_dict = np.load(hp_config_file).item()
        self.hp_directions = hp_config_dict['hp_directions']
        self.hp_parameters = hp_config_dict['hp_parameters']
        self.hp_values = hp_config_dict['hp_values']
        self.hp_range_start = hp_config_dict['hp_range_start']
        self.hp_range_end = hp_config_dict['hp_range_end']
        self.hp_indices = hp_config_dict['hp_indices']
        self.hp_rest_indices = hp_config_dict['hp_rest_indices']
        if self.old_system == 1:
            # self.hp_directions['extra_gnomad_af'] = 1
            # The old hp_config_file didn't include hp_default and the
            # moving-window analysis config, so we add them manually.
            # -- hp evaluation (moving-window) config --
            for cur_hp in self.hyperparameter.keys():
                hp = self.hyperparameter[cur_hp]
                # Parameters that are not open for configuration yet.
                hp['filter_type'] = 3  # filtering method
                hp['mv_type'] = 0  # moving analysis method
                hp['mv_size'] = 0
                if self.name not in hp['predictor']:
                    continue
                else:
                    self.hps[cur_hp] = hp
                if hp['hp_type'] == 1:  # filtering parameters
                    extra_data_df = extra_data.loc[
                        extra_data['set_name'].isin(hp['source']), :]
                    self.hp_parameters['sp'].append(cur_hp)
                    # pivot_points(..., 1) == moving-window variant.
                    [pivots, pivot_indices, pivot_rest_indices,
                     pivot_values_range_start, pivot_values_range_end] = self.pivot_points(extra_data_df, hp, 1)
                    self.hp_mv_values[cur_hp] = range(len(pivots))
                    self.hp_mv_indices[cur_hp] = pivot_indices
                    self.hp_mv_rest_indices[cur_hp] = pivot_rest_indices
                    self.hp_mv_range_start[cur_hp] = pivot_values_range_start
                    self.hp_mv_range_end[cur_hp] = pivot_values_range_end
            # -- hp_default --
            # NOTE(review): truly old configs were said not to contain
            # 'hp_default'; this lookup may KeyError on them — confirm.
            self.hp_default = hp_config_dict['hp_default']
            for cur_hp in self.hyperparameter.keys():
                hp = self.hyperparameter[cur_hp]
                self.hp_default[cur_hp] = hp['default']
            # Persist the upgraded config back to disk.
            hp_config_dict['hp_mv_values'] = self.hp_mv_values
            hp_config_dict['hp_mv_range_start'] = self.hp_mv_range_start
            hp_config_dict['hp_mv_range_end'] = self.hp_mv_range_end
            hp_config_dict['hp_mv_indices'] = self.hp_mv_indices
            hp_config_dict['hp_mv_rest_indices'] = self.hp_mv_rest_indices
            hp_config_dict['hp_default'] = self.hp_default
            hp_config_dict['hps'] = self.hps
            np.save(hp_config_file, hp_config_dict)
            print('old system hp_config_dict converted.....')
        else:
            # New-system config already contains everything.
            self.hp_mv_values = hp_config_dict['hp_mv_values']
            self.hp_mv_indices = hp_config_dict['hp_mv_indices']
            self.hp_mv_rest_indices = hp_config_dict['hp_mv_rest_indices']
            self.hp_mv_range_start = hp_config_dict['hp_mv_range_start']
            self.hp_mv_range_end = hp_config_dict['hp_mv_range_end']
            self.hp_default = hp_config_dict['hp_default']
            self.hps = hp_config_dict['hps']
            alm_fun.show_msg(self.log, self.verbose, 'Saved hp config dict loaded.')
    else:
        # Build a fresh configuration from self.hyperparameter.
        for cur_hp in self.hyperparameter.keys():
            hp = self.hyperparameter[cur_hp]
            # Parameters that are not open for configuration yet.
            hp['filter_type'] = 3  # filtering method
            hp['mv_type'] = 0  # moving analysis method
            hp['mv_size'] = 0
            if self.name not in hp['predictor']:
                continue
            else:
                self.hps[cur_hp] = hp
            if hp['hp_type'] == 1:  # filtering parameters
                self.hp_default[cur_hp] = hp['default']
                self.hp_directions[cur_hp] = hp['direction']
                if hp['enable'] == 1:
                    self.hp_parameters['hyperopt'].append(cur_hp)
                    extra_data_df = extra_data.loc[
                        extra_data['set_name'].isin(hp['source']), :]
                    # pivot_points(..., 0) == hyperopt (tuning) variant.
                    [pivots, pivot_indices, pivot_rest_indices,
                     pivot_values_range_start, pivot_values_range_end] = self.pivot_points(extra_data_df, hp, 0)
                    self.hp_values[cur_hp] = range(len(pivots))
                    self.hp_indices[cur_hp] = pivot_indices
                    self.hp_rest_indices[cur_hp] = pivot_rest_indices
                    self.hp_range_start[cur_hp] = pivot_values_range_start
                    self.hp_range_end[cur_hp] = pivot_values_range_end
                    # For moving-window analysis.
                    # NOTE(review): reconstructed as nested under
                    # enable == 1 — confirm whether disabled hps should
                    # also get moving-window config (the old-system
                    # branch above does not check 'enable').
                    self.hp_parameters['sp'].append(cur_hp)
                    [pivots, pivot_indices, pivot_rest_indices,
                     pivot_values_range_start, pivot_values_range_end] = self.pivot_points(extra_data_df, hp, 1)
                    self.hp_mv_values[cur_hp] = range(len(pivots))
                    self.hp_mv_indices[cur_hp] = pivot_indices
                    self.hp_mv_rest_indices[cur_hp] = pivot_rest_indices
                    self.hp_mv_range_start[cur_hp] = pivot_values_range_start
                    self.hp_mv_range_end[cur_hp] = pivot_values_range_end
            if hp['hp_type'] == 2:  # weight parameters
                self.hp_default[cur_hp] = hp['default']
                self.hp_parameters['hyperopt'].append(cur_hp)
                # cur_weights = np.linspace(start = hp['from'], stop = hp['to'], num = 101)
                cur_weights = np.round(
                    np.arange(hp['from'], hp['to'] + hp['step'], hp['step']), 2)
                self.hp_values[cur_hp] = cur_weights
                self.hp_indices[cur_hp] = {}
                self.hp_rest_indices[cur_hp] = {}
                self.hp_range_start[cur_hp] = {}
                self.hp_range_end[cur_hp] = {}
                for weight in cur_weights:
                    # NOTE(review): extra_data_df here is whatever the
                    # last hp_type==1 iteration assigned (or undefined if
                    # none ran) — looks fragile; confirm intent.
                    self.hp_indices[cur_hp][weight] = extra_data_df.index
                    self.hp_rest_indices[cur_hp][weight] = []
                    self.hp_range_start[cur_hp][weight] = np.nan
                    self.hp_range_end[cur_hp][weight] = np.nan
        # Save the current hyperparameter configuration.
        hp_config_dict = {}
        hp_config_dict['hp_parameters'] = self.hp_parameters
        hp_config_dict['hp_values'] = self.hp_values
        hp_config_dict['hp_range_start'] = self.hp_range_start
        hp_config_dict['hp_range_end'] = self.hp_range_end
        hp_config_dict['hp_indices'] = self.hp_indices
        hp_config_dict['hp_rest_indices'] = self.hp_rest_indices
        hp_config_dict['hp_directions'] = self.hp_directions
        hp_config_dict['hp_mv_values'] = self.hp_mv_values
        hp_config_dict['hp_mv_range_start'] = self.hp_mv_range_start
        hp_config_dict['hp_mv_range_end'] = self.hp_mv_range_end
        hp_config_dict['hp_mv_indices'] = self.hp_mv_indices
        hp_config_dict['hp_mv_rest_indices'] = self.hp_mv_rest_indices
        hp_config_dict['hp_default'] = self.hp_default
        hp_config_dict['hps'] = self.hps
        np.save(hp_config_file, hp_config_dict)
        alm_fun.show_msg(
            self.log, self.verbose,
            'Hyperparameter config dictionary for ' + self.name + ' saved.')
    # Build the hyperopt search space from the (loaded or freshly built)
    # config: discrete choice over pivot/weight values for tuned hps.
    for hp_parameter in self.hp_parameters['hyperopt']:
        hyperopt_hps[hp_parameter] = hyperopt.hp.choice(
            hp_parameter, self.hp_values[hp_parameter])
    pass
    # Algorithm-level hyperparameters (hp_type == 3) go straight into the
    # search space: quantized-uniform for reals, integer choice otherwise.
    for cur_hp in self.hyperparameter.keys():
        hp = self.hyperparameter[cur_hp]
        if self.name not in hp['predictor']:
            continue
        if hp['hp_type'] == 3:  # algorithm level hyper-parameters
            self.hp_default[cur_hp] = hp['default']
            if hp['type'] == 'real':
                hyperopt_hps[cur_hp] = hyperopt.hp.quniform(
                    cur_hp, hp['from'], hp['to'], hp['step'])
            if (hp['type'] == 'int') | (hp['type'] == 'category'):
                hyperopt_hps[cur_hp] = hyperopt.hp.choice(
                    cur_hp, np.arange(hp['from'], hp['to'], dtype=int))
    # np.save(self.project_path + '/output/npy/' + self.session_id + '_hyperopt_hps.npy',hyperopt_hps)
    return (hyperopt_hps)
def refresh_data(self):
    """Prepare all datasets, either by running the pipeline or reloading from disk.

    When ``load_from_disk`` is off or no saved snapshot exists, runs the
    full pipeline (load -> preprocess -> filter -> split, plus optional
    gradient reshaping and feature engineering) and optionally saves the
    result.  Otherwise restores every dataset attribute from
    ``<project>/output/npy/<session>_<name>_savedata.npy`` (and the
    ``_savedata_engineered.npy`` companion when ``if_engineer`` is on).

    Fix: ``np.load`` of a dict written by ``np.save`` returns a 0-d
    object ndarray, so the loaded object is normalized with ``.item()``
    before dict-style access (consistent with ``create_hyperopt_hps``);
    files written directly with ``pickle.dump`` already load as dicts
    and pass through unchanged.
    """

    def _as_dict(loaded):
        # np.save wraps a dict in a 0-d object ndarray; pickle.dump stores
        # the dict itself. Accept both on-disk formats.
        return loaded.item() if isinstance(loaded, np.ndarray) else loaded

    # self.verbose = verbose
    savedata_file = (self.project_path + '/output/npy/' + self.session_id
                     + '_' + self.name + '_savedata.npy')
    if (self.load_from_disk == 0) or (not os.path.isfile(savedata_file)):
        # load data (set initial features, handel onehot features,
        # remove samples without valid dependent variable)
        self.load_data()
        msg = "Data loading......\n" + self.data_msg(split=0)
        alm_fun.show_msg(self.log, self.verbose, msg)
        # slice data
        self.preprocess_data()
        msg = "Data preprocessing......\n" + self.data_msg(split=0)
        alm_fun.show_msg(self.log, self.verbose, msg)
        # filter data
        self.filter_data()
        msg = "Data filtering......\n" + self.data_msg(split=0)
        alm_fun.show_msg(self.log, self.verbose, msg)
        # split data
        self.split_data()
        msg = "Data spliting.....\n" + self.data_msg()
        alm_fun.show_msg(self.log, self.verbose, msg)
        # gradient reshape
        if self.if_gradient == 1:
            self.gradient_data()
            msg = "[gradient_data]\n" + self.data_msg()
            alm_fun.show_msg(self.log, self.verbose, msg)
        # engineer data
        if self.if_engineer == 1:
            self.engineer_data()
            msg = "[egineer_data]\n" + self.data_msg()
            alm_fun.show_msg(self.log, self.verbose, msg)
        if self.save_to_disk == 1:
            self.save_data()
    else:
        self.dict_savedata = _as_dict(np.load(savedata_file))
        # Optional key: older snapshots may not carry the original frame.
        self.train_data_original_df = self.dict_savedata.get(
            'train_data_original_df', None)
        self.extra_train_data_df_lst = self.dict_savedata[
            'extra_train_data_df_lst']
        self.train_data_df = self.dict_savedata['train_data_df']
        self.test_data_df = self.dict_savedata['test_data_df']
        self.target_data_df = self.dict_savedata['target_data_df']
        self.train_data_index_df = self.dict_savedata[
            'train_data_index_df']
        self.validation_data_index_df = self.dict_savedata[
            'validation_data_index_df']
        self.test_data_index_df = self.dict_savedata['test_data_index_df']
        self.target_data_index_df = self.dict_savedata[
            'target_data_index_df']
        self.train_data_for_target_df = self.dict_savedata[
            'train_data_for_target_df']
        self.target_data_for_target_df = self.dict_savedata[
            'target_data_for_target_df']
        self.validation_data_for_target_df = self.dict_savedata[
            'validation_data_for_target_df']
        self.train_splits_df = self.dict_savedata['train_splits_df']
        self.test_splits_df = self.dict_savedata['test_splits_df']
        self.train_cv_splits_df = self.dict_savedata['train_cv_splits_df']
        self.validation_cv_splits_df = self.dict_savedata[
            'validation_cv_splits_df']
        # Reset example weights to 1 on reload; weights are recomputed
        # downstream.
        self.train_data_index_df['weight'] = 1
        self.extra_train_data_df_lst[0]['weight'] = 1
        # print (str(self.train_data_index_df.loc[self.train_data_index_df['mpc_obs_exp'].notnull(),:].shape))
        alm_fun.show_msg(
            self.log, self.verbose,
            str(self.train_data_index_df['set_name'].value_counts().sort_index()))
        alm_fun.show_msg(
            self.log, self.verbose,
            str(self.extra_train_data_df_lst[0]['set_name'].value_counts().sort_index()))
        if self.if_gradient:
            self.gradients = self.dict_savedata['gradients']
        if self.if_engineer == 1:
            self.dict_savedata_engineered = _as_dict(np.load(
                self.project_path + '/output/npy/' + self.session_id
                + '_' + self.name + '_savedata_engineered.npy'))
            self.train_data_for_target_engineered_df = self.dict_savedata_engineered[
                'train_data_for_target_engineered_df']
            self.target_data_for_target_engineered_df = self.dict_savedata_engineered[
                'target_data_for_target_engineered_df']
            self.validation_data_for_target_engineered_df = self.dict_savedata_engineered[
                'validation_data_for_target_engineered_df']
            self.train_splits_engineered_df = self.dict_savedata_engineered[
                'train_splits_engineered_df']
            self.test_splits_engineered_df = self.dict_savedata_engineered[
                'test_splits_engineered_df']
            self.train_cv_splits_engineered_df = self.dict_savedata_engineered[
                'train_cv_splits_engineered_df']
            self.validation_cv_splits_engineered_df = self.dict_savedata_engineered[
                'validation_cv_splits_engineered_df']
def refresh_data(self):
    """Prepare all datasets, either by running the pipeline or reloading from disk.

    Legacy variant (path-based filenames, no session_id).  When
    ``load_from_disk`` is off, runs load -> slice -> filter -> split plus
    optional gradient/engineer steps and optionally saves; otherwise
    restores every dataset attribute from
    ``<path>output/npy/<name>_savedata.npy`` (and the
    ``_savedata_engineered.npy`` companion when ``if_engineer`` is on).

    NOTE(review): the original file's indentation was lost; block
    attachments were reconstructed from the semantics — confirm against
    version control.
    """
    # self.verbose = verbose
    if self.load_from_disk == 0:
        # load data (set initial features, handel onehot features,
        # remove samples without valid dependent variable)
        self.load_data()
        msg = "[load_data]\n" + self.data_msg()
        alm_fun.show_msg(self.log, self.verbose, msg)
        # slice data
        self.slice_data()
        msg = "[slice_data]\n" + self.data_msg()
        alm_fun.show_msg(self.log, self.verbose, msg)
        # filter data
        self.filter_data()
        msg = "[filter_data]\n" + self.data_msg()
        alm_fun.show_msg(self.log, self.verbose, msg)
        # split data
        self.split_data()
        msg = "[split_data]\n" + self.data_msg()
        alm_fun.show_msg(self.log, self.verbose, msg)
        # gradient reshape
        if self.if_gradient == 1:
            self.gradient_data()
            msg = "[gradient_data]\n" + self.data_msg()
            alm_fun.show_msg(self.log, self.verbose, msg)
        # engineer data
        if self.if_engineer == 1:
            self.engineer_data()
            msg = "[egineer_data]\n" + self.data_msg()
            alm_fun.show_msg(self.log, self.verbose, msg)
        if self.save_to_disk == 1:
            self.save_data()
    else:
        # NOTE(review): if this file was written by np.save (rather than
        # pickle.dump as in the commented-out code below), np.load
        # returns a 0-d object ndarray and the dict-style accesses below
        # fail — would need .item() as in create_hyperopt_hps; confirm
        # how save_data writes it.
        self.dict_savedata = np.load(self.path + 'output/npy/' + self.name + '_savedata.npy')
        self.extra_train_data_df = self.dict_savedata[
            'extra_train_data_df']
        self.train_data_df = self.dict_savedata['train_data_df']
        self.test_data_df = self.dict_savedata['test_data_df']
        self.target_data_df = self.dict_savedata['target_data_df']
        self.train_data_index_df = self.dict_savedata[
            'train_data_index_df']
        self.validation_data_index_df = self.dict_savedata[
            'validation_data_index_df']
        self.test_data_index_df = self.dict_savedata['test_data_index_df']
        self.target_data_index_df = self.dict_savedata[
            'target_data_index_df']
        self.train_data_for_target_df = self.dict_savedata[
            'train_data_for_target_df']
        self.target_data_for_target_df = self.dict_savedata[
            'target_data_for_target_df']
        self.validation_data_for_target_df = self.dict_savedata[
            'validation_data_for_target_df']
        self.train_splits_df = self.dict_savedata['train_splits_df']
        self.test_splits_df = self.dict_savedata['test_splits_df']
        self.train_cv_splits_df = self.dict_savedata['train_cv_splits_df']
        self.validation_cv_splits_df = self.dict_savedata[
            'validation_cv_splits_df']
        if self.if_gradient:
            self.gradients = self.dict_savedata['gradients']
        if self.if_engineer == 1:
            self.dict_savedata_engineered = np.load(
                self.path + 'output/npy/' + self.name + '_savedata_engineered.npy')
            self.train_data_for_target_engineered_df = self.dict_savedata_engineered[
                'train_data_for_target_engineered_df']
            self.target_data_for_target_engineered_df = self.dict_savedata_engineered[
                'target_data_for_target_engineered_df']
            self.validation_data_for_target_engineered_df = self.dict_savedata_engineered[
                'validation_data_for_target_engineered_df']
            self.train_splits_engineered_df = self.dict_savedata_engineered[
                'train_splits_engineered_df']
            self.test_splits_engineered_df = self.dict_savedata_engineered[
                'test_splits_engineered_df']
            self.train_cv_splits_engineered_df = self.dict_savedata_engineered[
                'train_cv_splits_engineered_df']
            self.validation_cv_splits_engineered_df = self.dict_savedata_engineered[
                'validation_cv_splits_engineered_df']
        msg = "[refresh_data] -- load from disk --" + self.data_msg()
        # Commented-out legacy save path (pickle-based), kept for reference:
        # if self.save_to_disk == 1:
        #
        #     self.dict_savedata = {}
        #     self.dict_savedata['extra_train_data_df'] = self.extra_train_data_df
        #     self.dict_savedata['train_data_df'] = self.train_data_df
        #     self.dict_savedata['test_data_df'] = self.test_data_df
        #     self.dict_savedata['target_data_df'] = self.target_data_df
        #
        #     self.dict_savedata['train_data_index_df'] = self.train_data_index_df
        #     self.dict_savedata['validation_data_index_df'] = self.validation_data_index_df
        #     self.dict_savedata['test_data_index_df'] = self.test_data_index_df
        #     self.dict_savedata['target_data_index_df'] = self.target_data_index_df
        #
        #     self.dict_savedata['train_data_for_target_df'] = self.train_data_for_target_df
        #     self.dict_savedata['target_data_for_target_df'] = self.target_data_for_target_df
        #     self.dict_savedata['validation_data_for_target_df'] = self.validation_data_for_target_df
        #
        #     self.dict_savedata['train_splits_df'] = self.train_splits_df
        #     self.dict_savedata['test_splits_df'] = self.test_splits_df
        #
        #     self.dict_savedata['train_cv_splits_df'] = self.train_cv_splits_df
        #     self.dict_savedata['validation_cv_splits_df'] = self.validation_cv_splits_df
        #
        #     if self.if_gradient:
        #         self.dict_savedata['gradients'] = self.gradients
        #
        #     pickle_out = open(self.path + 'output/npy/' + self.name + '_savedata.npy', 'wb')
        #     pickle.dump(self.dict_savedata, pickle_out)
        #     pickle_out.close()
        #
        #     self.engineer_data()
        #
        #     self.dict_savedata_engineered = {}
        #     self.dict_savedata_engineered['train_data_for_target_engineered_df'] = self.train_data_for_target_engineered_df
        #     self.dict_savedata_engineered['target_data_for_target_engineered_df'] = self.target_data_for_target_engineered_df
        #     self.dict_savedata_engineered['validation_data_for_target_engineered_df'] = self.validation_data_for_target_engineered_df
        #
        #     self.dict_savedata_engineered['train_splits_engineered_df'] = self.train_splits_engineered_df
        #     self.dict_savedata_engineered['test_splits_engineered_df'] = self.test_splits_engineered_df
        #
        #     self.dict_savedata_engineered['train_cv_splits_engineered_df'] = self.train_cv_splits_engineered_df
        #     self.dict_savedata_engineered['validation_cv_splits_engineered_df'] = self.validation_cv_splits_engineered_df
        #
        #     if self.if_gradient:
        #         self.dict_savedata_engineered['gradients'] = self.gradients
        #
        #     pickle_out = open(self.path + 'output/npy/' + self.name + '_savedata_engineered.npy','wb')
        #     pickle.dump(self.dict_savedata_engineered, pickle_out)
        #     pickle_out.close()
        # NOTE(review): placement reconstructed — if this log call is at
        # function level instead, msg from the pipeline branch would be
        # logged twice; confirm.
        alm_fun.show_msg(self.log, self.verbose, msg)
def run(self, features, dependent_variable, ml_type, core_train, test, extra_train=None, validation=None, alm_predictor=None, model_file=None): if alm_predictor is None: use_extra_train_data = 0 nofit = 0 tune_tree_num = 0 shap_test_interaction = 0 shap_train_interaction = 0 shuffle_features = [] if_feature_engineer = 0 load_existing_model = 0 else: use_extra_train_data = alm_predictor.use_extra_train_data nofit = alm_predictor.nofit tune_tree_num = alm_predictor.tune_tree_nums_before_test shap_test_interaction = alm_predictor.shap_test_interaction shap_train_interaction = alm_predictor.shap_train_interaction shuffle_features = alm_predictor.shuffle_features if_feature_engineer = alm_predictor.if_feature_engineer load_existing_model = alm_predictor.load_existing_model eval_obj = alm_predictor.eval_obj #####******************************************************************************************** # Run feature engineer fucntion if necessary #####******************************************************************************************** if if_feature_engineer: [core_train, test] = self.feature_engineer(core_train, test) #####******************************************************************************************** # If features are nested list, flat the list of list if necessary #####******************************************************************************************** if any(isinstance(i, list) for i in features): features = list(itertools.chain(*features)) #####******************************************************************************************** # Shuffle features if necessary , for feature interaction analysis #####******************************************************************************************** if len(shuffle_features) > 0: for f in shuffle_features: if f != '': core_train[f] = np.random.permutation(core_train[f]) #####******************************************************************************************** # copy the 
core_train,extra_train,test and validation dataset #####******************************************************************************************** core_train = core_train.copy() test = test.copy() if (tune_tree_num == 1) & (validation is not None): validation = validation.copy() if extra_train is not None: if extra_train.shape[0] != 0: extra_train = extra_train.copy() #####******************************************************************************************** # Combine train and extra_train dataset to make the final training dataset #####******************************************************************************************** if use_extra_train_data == 0: # do not use extra training data all_train = core_train if use_extra_train_data == 1: # only use extra training data all_train = extra_train if use_extra_train_data == 2: # use extra training data directly + training data, no prediction all_train = pd.concat([extra_train, core_train]) # all_train = all_train.sort_index() #####******************************************************************************************** # Reorder the traning data and form groups for "ranking" loss function #####******************************************************************************************** # all_train = all_train.sort_values('p_vid') # group_counts = all_train['p_vid'].value_counts().sort_index() # group_counts = [5000,5000,len(all_train)-10000] # group_counts = [len(all_train)] # query_group = np.array(group_counts) # group_weights = np.ones(len(query_group)) #####******************************************************************************************** # Separate features , labels or weight of the test and final training set #####******************************************************************************************** #### first check if core_train and test have all the features, if not, add feature columns with np.nan core_train_x = core_train[features] core_train_y = core_train[dependent_variable] 
extra_train_x = extra_train[features] extra_train_y = extra_train[dependent_variable] test_x = test[features] test_y = test[dependent_variable] test_index = test.index if validation is not None: validation_x = validation[features] validation_y = validation[dependent_variable] #####******************************************************************************************** # Remove extra training examples that weight == 0 #####******************************************************************************************** if self.weighted_example == 1: print('Core examples for training: ' + str(len(core_train_y)) + '[P:' + str(sum(core_train_y==1)) + ' N:' + str(sum(core_train_y==0)) + ']' + \ ', weights:' + str(core_train['weight'].sum()) + '[P:' + str(core_train['weight'][core_train_y==1].sum()) + ' N:' + str(core_train['weight'][core_train_y==0].sum()) + ']') print('Extra examples for training: ' + str(len(extra_train_y)) + '[P:' + str(sum(extra_train_y==1)) + ' N:' + str(sum(extra_train_y==0)) + ']' + \ ', weights:' + str(extra_train['weight'].sum()) + '[P:' + str(extra_train['weight'][extra_train_y==1].sum()) + ' N:' + str(extra_train['weight'][extra_train_y==0].sum()) + ']') print('All examples for training: ' + str(len(all_train[dependent_variable])) + '[P:' + str(sum(all_train[dependent_variable]==1)) + ' N:' + str(sum(all_train[dependent_variable]==0)) + ']' + \ ', weights:' + str(all_train['weight'].sum()) + '[P:' + str(all_train['weight'][all_train[dependent_variable]==1].sum()) + ' N:' + str(all_train['weight'][all_train[dependent_variable]==0].sum()) + ']') all_train = all_train.loc[(all_train['weight'] != 0), :] print('All examples for training after removing examples with ZERO weight: ' + str(len(all_train[dependent_variable])) + '[P:' + str(sum(all_train[dependent_variable]==1)) + ' N:' + str(sum(all_train[dependent_variable]==0)) + ']' + \ ', weights:' + str(all_train['weight'].sum()) + '[P:' + 
str(all_train['weight'][all_train[dependent_variable]==1].sum()) + ' N:' + str(all_train['weight'][all_train[dependent_variable]==0].sum()) + ']') #####******************************************************************************************** # Feature and labels for all training examples #####******************************************************************************************** all_train_x = all_train[features] all_train_y = all_train[dependent_variable] #####******************************************************************************************** # Determine the final weights (after balancing positive and negative weights) #####******************************************************************************************** if self.weighted_example == 1: weights = all_train['weight'] negative_idx = all_train_y == 0 positive_idx = all_train_y == 1 negative_weights = weights[negative_idx].sum() positive_weights = weights[positive_idx].sum() weight_ratio = negative_weights / positive_weights print('Total negative weights: ' + str(negative_weights)) print('Total positive weights: ' + str(positive_weights)) weights[positive_idx] = weights[ positive_idx] * weight_ratio #balance negative and positive weights print('weights ratio negative/postive :' + str(weight_ratio)) print('Total weights after balancing: ' + str(weights.sum())) else: weights = [1] * all_train.shape[0] #####******************************************************************************************** # Flip the label for training and test set for contamination analysis if necessary #####******************************************************************************************** if self.flip_contamination_test == 1: test_contamination = test['contamination'] test_y = [ list(test_y)[i] if list(test_contamination)[i] != 1 else abs(list(test_y)[i] - 1) for i in range(len(test_y)) ] print("Test contamination " + str((test_contamination == 1).sum()) + " flipped!") if self.flip_contamination_train == 1: 
train_contamination = train['contamination'] all_train_y = [ list(all_train_y)[i] if list(train_contamination)[i] != 1 else abs(list(all_train_y)[i] - 1) for i in range(len(all_train_y)) ] print("Train contamination " + str((train_contamination == 1).sum()) + " flipped!") load_model = 0 if load_existing_model == 1: if os.path.isfile(model_file): self.estimator.load_model(model_file) load_model = 1 if load_model == 0: #####******************************************************************************************** # Reset the estimator for every run #####******************************************************************************************** if (self.estimator != None): n_estimators = int(self.estimator.n_estimators) max_depth = self.estimator.max_depth learning_rate = self.estimator.learning_rate gamma = self.estimator.gamma min_child_weight = self.estimator.min_child_weight subsample = self.estimator.subsample colsample_bytree = self.estimator.colsample_bytree if 'regression' in ml_type: self.estimator = xgb.XGBRegressor( **{ 'max_depth': max_depth, 'n_estimators': n_estimators, 'learning_rate': learning_rate, 'gamma': gamma, 'min_child_weight': min_child_weight, 'subsample': subsample, 'colsample_bytree': colsample_bytree, 'n_jobs': -1 }) if 'classification' in ml_type: self.estimator = xgb.XGBClassifier( **{ 'max_depth': max_depth, 'n_estimators': n_estimators, 'learning_rate': learning_rate, 'gamma': gamma, 'min_child_weight': min_child_weight, 'subsample': subsample, 'colsample_bytree': colsample_bytree, 'n_jobs': -1 }) #####******************************************************************************************** # Fit the model #####******************************************************************************************** if (self.estimator == None) | ( (self.single_feature_as_prediction == 1) & (len(features) == 1) ): # if estimator is None, there is no need to train the model feature_importance = pd.DataFrame(np.zeros(len(features)), 
index=features).transpose() else: if nofit == 0: if self.weighted_example == 1: if tune_tree_num == 1: self.estimator.n_estimators = 1000 if 'rank' in self.estimator.objective: self.estimator.fit(all_train_x, all_train_y, group=query_group, sample_weight=group_weights, verbose=False, eval_set=[ (validation_x[features], validation_y) ], early_stopping_rounds=50, eval_metric=eval_obj) else: self.estimator.fit(all_train_x, all_train_y, sample_weight=weights, verbose=False, eval_set=[ (validation_x[features], validation_y) ], early_stopping_rounds=50, eval_metric=eval_obj) else: if 'rank' in self.estimator.objective: self.estimator.fit(all_train_x, all_train_y, group=query_group, sample_weight=group_weights) else: print("Start fit the model : " + str(datetime.now())) print("Training examples: " + str(all_train_x.shape[0]) + " Training weights: " + str(weights.sum()) + " # of Trees: " + str(self.estimator.n_estimators)) self.estimator.fit(all_train_x, all_train_y, sample_weight=weights) print("End fit the model : " + str(datetime.now())) else: if 'rank' in self.estimator.objective: self.estimator.fit(all_train_x, all_train_y, group=query_group) else: self.estimator.fit(all_train_x, all_train_y) else: alm_fun.show_msg(self.log, self.verbose, 'Existing model ' + model_file + ' loaded.') #####******************************************************************************************** # Record the feature importance #####******************************************************************************************** if self.feature_importance_name == 'coef_': feature_importance = np.squeeze(self.estimator.coef_) if self.feature_importance_name == 'feature_importances_': feature_importance = np.squeeze( self.estimator.feature_importances_) if self.feature_importance_name == 'booster': if len(features) == 1: feature_importance = np.zeros(len(features)) else: if load_existing_model == 0: feature_importance = [] im_dict = self.estimator.get_booster().get_score( 
importance_type='gain') for feature in features: feature_importance.append(im_dict.get(feature, 0)) else: feature_importance = [] im_dict = self.estimator.get_booster().get_score( importance_type='gain') for i in range(len(features)): feature_importance.append(im_dict.get('f' + str(i), 0)) if self.feature_importance_name == 'none': feature_importance = np.zeros(len(features)) feature_importance = pd.DataFrame(feature_importance, index=features).transpose() #####******************************************************************************************** # Predict the train and test data #####******************************************************************************************** if ml_type == "regression": if (self.estimator == None) | ((self.single_feature_as_prediction == 1) & (len(features) == 1)): test_y_predicted = np.array(list(np.squeeze(test_x[features]))) else: try: test_y_predicted = self.estimator.predict_proba( test_x[features])[:, 1] except: test_y_predicted = self.estimator.predict(test_x[features]) if self.prediction_transformation is not None: test_y_predicted = self.prediction_transformation( test_y_predicted) test_score_df = pd.DataFrame(np.zeros(2), index=['pcc', 'rmse']).transpose() rmse = alm_fun.rmse_cal(test_y, test_y_predicted) pcc = alm_fun.pcc_cal(test_y, test_y_predicted) spc = alm_fun.spc_cal(test_y, test_y_predicted) test_score_df['rmse'] = rmse test_score_df['pcc'] = pcc test_score_df['spc'] = spc if (self.estimator == None) | ((self.single_feature_as_prediction == 1) & (len(features) == 1)): core_train_y_predicted = np.array( list(np.squeeze(all_train_x[features]))) else: try: core_train_y_predicted = self.estimator.predict_proba( all_train_x[features])[:, 1] except: core_train_y_predicted = self.estimator.predict( all_train_x[features]) if self.prediction_transformation is not None: core_train_y_predicted = self.prediction_transformation( core_train_y_predicted) core_train_score_df = pd.DataFrame(np.zeros(2), index=['pcc', 
'rmse']).transpose() rmse = alm_fun.rmse_cal(core_train_y, core_train_y_predicted) pcc = alm_fun.pcc_cal(core_train_y, core_train_y_predicted) spc = alm_fun.spc_cal(core_train_y, core_train_y_predicted) core_train_score_df['rmse'] = rmse core_train_score_df['pcc'] = pcc core_train_score_df['spc'] = spc if ml_type == "classification_binary": if shap_test_interaction == 1: X = xgb.DMatrix(test_x) shap_output_test_interaction = self.estimator.get_booster( ).predict(X, ntree_limit=-1, pred_interactions=True) else: shap_output_test_interaction = None if (self.estimator == None) | ((self.single_feature_as_prediction == 1) & (len(features) == 1)): test_y_predicted = np.array(list(np.squeeze(test_x[features]))) else: try: test_y_predicted = self.estimator.predict_proba( test_x[features])[:, 1] except: test_y_predicted = self.estimator.predict(test_x[features]) if self.prediction_transformation is not None: test_y_predicted = self.prediction_transformation( test_y_predicted) test_score_df = pd.DataFrame(np.zeros(10), index=[ 'size', 'prior', 'auroc', 'auprc', 'aubprc', 'up_auprc', 'pfr', 'bpfr', 'rfp', 'brfp' ]).transpose() if len(np.unique(test_y)) == 1: test_score_df['size'] = len(test_y) test_score_df['auroc'] = np.nan test_score_df['auprc'] = np.nan test_score_df['aubprc'] = np.nan test_score_df['up_auprc'] = np.nan test_score_df['prior'] = np.nan test_score_df['pfr'] = np.nan test_score_df['rfp'] = np.nan test_score_df['bpfr'] = np.nan test_score_df['brfp'] = np.nan test_score_df['logloss'] = np.nan else: [best_y_predicted, metric, multiclass_metrics ] = alm_fun.classification_metrics(test_y, test_y_predicted) test_score_df['size'] = len(test_y) test_score_df['auroc'] = metric['auroc'] test_score_df['auprc'] = metric['auprc'] test_score_df['aubprc'] = metric['aubprc'] test_score_df['up_auprc'] = metric['up_auprc'] test_score_df['prior'] = metric['prior'] test_score_df['pfr'] = metric['pfr'] test_score_df['rfp'] = metric['rfp'] test_score_df['bpfr'] = metric['bpfr'] 
test_score_df['brfp'] = metric['brfp'] test_score_df['logloss'] = metric['logloss'] #get the shap value for all training data if shap_train_interaction == 1: X = xgb.DMatrix(all_train_x) shap_output_train_interaction = self.estimator.get_booster( ).predict(X, ntree_limit=-1, pred_interactions=True) else: shap_output_train_interaction = None if (self.estimator == None) | ((self.single_feature_as_prediction == 1) & (len(features) == 1)): core_train_y_predicted = np.array( list(np.squeeze(core_train_x[features]))) else: try: core_train_y_predicted = self.estimator.predict_proba( core_train_x[features])[:, 1] except: core_train_y_predicted = self.estimator.predict( core_train_x[features]) if self.prediction_transformation is not None: core_train_y_predicted = self.prediction_transformation( core_train_y_predicted) core_train_score_df = pd.DataFrame(np.zeros(10), index=[ 'size', 'prior', 'auroc', 'auprc', 'aubprc', 'up_auprc', 'pfr', 'bpfr', 'rfp', 'brfp' ]).transpose() if len(np.unique(core_train_y)) == 1: core_train_score_df['size'] = len(core_train_y) core_train_score_df['auroc'] = np.nan core_train_score_df['auprc'] = np.nan core_train_score_df['aubprc'] = np.nan core_train_score_df['up_auprc'] = np.nan core_train_score_df['prior'] = np.nan core_train_score_df['pfr'] = np.nan core_train_score_df['rfp'] = np.nan core_train_score_df['bpfr'] = np.nan core_train_score_df['brfp'] = np.nan core_train_score_df['logloss'] = np.nan else: [best_y_predicted, metric, multiclass_metrics ] = alm_fun.classification_metrics(core_train_y, core_train_y_predicted) core_train_score_df['size'] = len(core_train_y) core_train_score_df['auroc'] = metric['auroc'] core_train_score_df['auprc'] = metric['auprc'] core_train_score_df['aubprc'] = metric['aubprc'] core_train_score_df['up_auprc'] = metric['up_auprc'] core_train_score_df['prior'] = metric['prior'] core_train_score_df['pfr'] = metric['pfr'] core_train_score_df['rfp'] = metric['rfp'] core_train_score_df['bpfr'] = metric['bpfr'] 
core_train_score_df['brfp'] = metric['brfp'] core_train_score_df['logloss'] = metric['logloss'] if ml_type == "classification_multiclass": test_y_predicted_probs = self.estimator.predict_proba( test_x[features]) test_y_predicted = self.estimator.predict(test_x[features]) core_train_y_predicted_probs = self.estimator.predict_proba( core_train_x[features]) core_train_y_predicted = self.estimator.predict( core_train_x[features]) if self.prediction_transformation is not None: test_y_predicted = self.prediction_transformation( test_y_predicted) if self.prediction_transformation is not None: core_train_y_predicted = self.prediction_transformation( core_train_y_predicted) core_train_score_df = pd.DataFrame(np.zeros(1), index=['neg_log_loss' ]).transpose() core_train_score_df[ 'neg_log_loss'] = alm_fun.get_classification_metrics( 'neg_log_loss', 4, all_train_y, core_train_y_predicted_probs) test_score_df = pd.DataFrame(np.zeros(1), index=['neg_log_loss']).transpose() test_score_df['neg_log_loss'] = alm_fun.get_classification_metrics( 'neg_log_loss', 4, test_y, test_y_predicted_probs) core_train_score_df = round(core_train_score_df, self.round_digits) test_score_df = round(test_score_df, self.round_digits) test_y_predicted = pd.Series(test_y_predicted, index=test_x.index) core_train_y_predicted = pd.Series(core_train_y_predicted, index=core_train_x.index) #####******************************************************************************************** # Return the result dictionary #####******************************************************************************************** return_dict = {} return_dict['train_y_predicted'] = core_train_y_predicted return_dict['train_y_truth'] = core_train_y return_dict['train_score_df'] = core_train_score_df return_dict['test_y_predicted'] = test_y_predicted return_dict['test_y_truth'] = test_y return_dict['test_y_index'] = test_index return_dict['test_score_df'] = test_score_df return_dict['feature_importance'] = 
feature_importance.transpose( ).sort_values([0]) return_dict[ 'shap_output_test_interaction'] = shap_output_test_interaction return_dict[ 'shap_output_train_interaction'] = shap_output_train_interaction return_dict['all_train_indices'] = all_train.index return_dict['model'] = self.estimator if (self.estimator == None) | ( (self.single_feature_as_prediction == 1) & (len(features) == 1)): return_dict['tuned_tree_num'] = 0 else: if tune_tree_num == 1: return_dict['tuned_tree_num'] = len( self.estimator.evals_result()['validation_0'] [eval_obj]) - 50 else: return_dict['tuned_tree_num'] = self.estimator.n_estimators #return the test dataframe in the case some features were engineered if if_feature_engineer: predicted_test = test.copy() predicted_test[dependent_variable] = test_y_predicted else: predicted_test = None return_dict['predicted_df'] = predicted_test return (return_dict)