def test_evalAEF5():
    # The "_0" variant of the metric reads a continuous label; its result should
    # match the plain variant evaluated against the binarized label.
    evaluation = definedEvaluation()
    e = evaluation.eval_function("AEF5_0")
    value0 = e(bin_pred, dtrain_cont)
    evaluation = definedEvaluation()
    e = evaluation.eval_function("AEF5")
    value1 = e(bin_pred, dtrain_bin)
    assert value0 == value1
    assert np.round([value0[1]], 2) == 0.99
def test_evalefr015():
    evaluation = definedEvaluation()
    e = evaluation.eval_function("EFR015_0")
    value0 = e(bin_pred, dtrain_cont)
    evaluation = definedEvaluation()
    e = evaluation.eval_function("EFR015")
    value1 = e(bin_pred, dtrain_bin)
    assert value0 == value1
    assert np.round([value0[1]], 2) == 1.34
def test_evalNEFauc25():
    evaluation = definedEvaluation()
    e = evaluation.eval_function("NEFAUC25_0")
    value0 = e(bin_pred, dtrain_cont)
    evaluation = definedEvaluation()
    e = evaluation.eval_function("NEFAUC25")
    value1 = e(bin_pred, dtrain_bin)
    assert value0 == value1
    assert np.round([value0[1]], 2) == 0.55
def test_evalprauc():
    evaluation = definedEvaluation()
    e = evaluation.eval_function("PRAUC_0")
    value0 = e(bin_pred, dtrain_cont)
    evaluation = definedEvaluation()
    e = evaluation.eval_function("PRAUC")
    value1 = e(bin_pred, dtrain_bin)
    assert value0 == value1
    assert np.round([value0[1]], 2) == 0.51
def test_evalReliabilityScore():
    evaluation = definedEvaluation()
    e = evaluation.eval_function("ReliabilityScore_0")
    value0 = e(bin_pred, dtrain_cont)
    evaluation = definedEvaluation()
    e = evaluation.eval_function("ReliabilityScore")
    value1 = e(bin_pred, dtrain_bin)
    assert value0 == value1
    assert np.round([value0[1]], 2) == 0.25
def test_evalLogloss():
    evaluation = definedEvaluation()
    e = evaluation.eval_function("Logloss_0")
    value0 = e(bin_pred, dtrain_cont)
    evaluation = definedEvaluation()
    e = evaluation.eval_function("Logloss")
    value1 = e(bin_pred, dtrain_bin)
    assert value0 == value1
    assert np.round([value0[1]], 2) == 1504.69
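# The fixtures used by the metric tests above (bin_pred, dtrain_bin, dtrain_cont)
# are defined elsewhere in the test module. Below is a minimal sketch of how such
# fixtures might look; the feature matrix, labels, and predictions are placeholders
# rather than the data the real tests use, and the import path is an assumption.
import numpy as np
import xgboost as xgb
from lightchem.eval.defined_eval import definedEvaluation  # assumed import path

rng = np.random.RandomState(0)
X = rng.rand(100, 5)                            # dummy feature matrix
cont_label = rng.rand(100)                      # continuous label
bin_label = (cont_label > 0.5).astype(int)      # binarized version of the same label

dtrain_cont = xgb.DMatrix(X, label=cont_label)  # "_0" metrics read the continuous label
dtrain_bin = xgb.DMatrix(X, label=bin_label)    # plain metrics read the binary label
bin_pred = rng.rand(100)                        # predicted scores to evaluate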
def __init__(self, xgbData, eval_name, model_type, model_name):
    """
    Parameters:
    -----------
    xgbData: object
      Default data object that contains training data, testing data,
      cv-fold info, and label.
    eval_name: str
      Name of the evaluation metric used to monitor the training process.
      Must be in the pre-defined evaluation list.
    model_type: str
      Name of the model type to use. Must be in the pre-defined model type list.
    model_name: str
      Unique name for this model.
    """
    self.name = model_name
    self.__preDefined_model = defined_model.definedModel()
    self.__DEFINED_MODEL_TYPE = self.__preDefined_model.model_type()
    self.__preDefined_eval = defined_eval.definedEvaluation()
    self.__DEFINED_EVAL = self.__preDefined_eval.eval_list()
    self.__xgbData = xgbData
    self.__preDefined_eval.validate_eval_name(eval_name)
    self.__eval_name = eval_name
    self.__preDefined_model.validate_model_type(model_type)
    self.__model_type_writeout = model_type
    self.__collect_model = None
    self.__track_best_ntree = pd.DataFrame(columns=['model_name', 'best_ntree'])
    self.__best_score = list()
    self.__param = self.__preDefined_model.model_param(model_type)
    self.__eval_function = self.__preDefined_eval.eval_function(self.__eval_name)
    self.__MAXIMIZE = self.__preDefined_eval.is_maximize(self.__eval_name)
    self.__STOPPING_ROUND = self.__preDefined_eval.stopping_round(self.__eval_name)
    self.__holdout = None
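# Hedged usage sketch for the constructor above (it matches how firstLayerModel is
# instantiated in train() later in this section). `my_xgb_data` is a placeholder for
# a prepared xgbData object, and the module path is an assumption.
from lightchem.model import first_layer_model  # assumed import path

model = first_layer_model.firstLayerModel(
    my_xgb_data,                              # xgbData: training/test data, cv folds, label
    'ROCAUC',                                 # eval_name: must be a pre-defined evaluation
    'GbtreeLogistic',                         # model_type: must be a pre-defined model type
    'layer1_myData_GbtreeLogistic_ROCAUC')    # model_name: unique name for this model
model.xgb_cv()                                # cross-validated training, as in train() below
model.generate_holdout_pred()                 # out-of-fold predictions for stacking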
def test_defined_eval():
    eval = defined_eval.definedEvaluation()
    assert eval.is_maximize('ROCAUC') == True
    assert eval.stopping_round('ROCAUC') == 100
    mark = 0
    try:
        eval.is_maximize('not_exist_eval_name')
        assert mark == 1
    except ValueError:
        mark = 1
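# The error-path check above can also be written with pytest.raises; a minimal
# equivalent sketch, assuming pytest is the test runner and the import path below.
import pytest
from lightchem.eval import defined_eval  # assumed import path

def test_defined_eval_invalid_name():
    # An unknown evaluation name is expected to raise ValueError, mirroring the
    # mark-variable pattern used in test_defined_eval above.
    evaluation = defined_eval.definedEvaluation()
    with pytest.raises(ValueError):
        evaluation.is_maximize('not_exist_eval_name')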
def __init__(self, xgbData, list_firstLayerModel, eval_name, model_type, model_name):
    """Use holdout (out-of-fold) predictions from several firstLayerModels as
    training features to train a secondLayerModel (a so-called stacking model).

    Parameters:
    -----------
    xgbData: object
      Contains the label you want to use in the second layer model.
    list_firstLayerModel: list
      List containing firstLayerModel objects.
    eval_name: str
      Name of the evaluation metric used to monitor the training process.
      Must be in the pre-defined evaluation list.
    model_type: str
      Name of the model type to use. Must be in the pre-defined model type list.
    model_name: str
      Unique name for this model.
    """
    self.name = model_name
    self.__preDefined_model = defined_model.definedModel()
    self.__preDefined_eval = defined_eval.definedEvaluation()
    self.__DEFINED_EVAL = self.__preDefined_eval.eval_list()
    self.__xgbData = xgbData
    assert all([isinstance(item, first_layer_model.firstLayerModel) for item in list_firstLayerModel])
    self.__list_firstLayerModel = list_firstLayerModel
    self.__preDefined_eval.validate_eval_name(eval_name)
    self.__eval_name = eval_name
    self.__preDefined_model.validate_model_type(model_type)
    self.__model_type_writeout = model_type
    self.__collect_model = None
    self.__track_best_ntree = pd.DataFrame(columns=['model_name', 'best_ntree'])
    self.__best_score = list()
    self.__firstLayerModel_prediction = None
    self.__param = self.__preDefined_model.model_param(model_type)
    self.__eval_function = self.__preDefined_eval.eval_function(self.__eval_name)
    self.__MAXIMIZE = self.__preDefined_eval.is_maximize(self.__eval_name)
    self.__STOPPING_ROUND = self.__preDefined_eval.stopping_round(self.__eval_name)
    self.__holdout = None
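# Hedged stacking sketch for the constructor above. `label_data` and the two trained
# first-layer models are placeholders, and the module path is an assumption; the calls
# mirror how secondLayerModel is used in train() later in this section.
from lightchem.model import second_layer_model  # assumed import path

stack = second_layer_model.secondLayerModel(
    label_data,                        # xgbData supplying the second-layer label
    [morgan_model, maccs_model],       # trained firstLayerModel instances
    'ROCAUC',                          # evaluation metric
    'GblinearLogistic',                # second-layer model type
    'layer2_GblinearLogistic_ROCAUC')  # unique model name
stack.second_layer_data()              # build features from first-layer holdout predictions
stack.xgb_cv()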
# Read the command line input and match its fields.
with open(sys.argv[1], 'r') as f:
    info = f.read()
info = pd.read_json(info)
target_name = np.str(info.loc['target_name'][0])
dir_train = np.str(info.loc['full_directory_to_training_data'][0])
dir_test = np.str(info.loc['full_directory_to_dataToPredict_if_exit'][0])
smile_colname = np.str(info.loc['smile_column_name'][0])
label_name_list = info.loc['label_name_list'][0]
label_name_list = [np.str(item) for item in label_name_list]
eval_name = np.str(info.loc['evaluation_name'][0])
dir_to_store = np.str(info.loc['full_directory_to_store_prediction'][0])
maccKeys_column_name = np.str(info.loc['maccKeys_column_name'][0])
ecfp1024_column_name = np.str(info.loc['ecfp1024_column_name'][0])

preDefined_eval = defined_eval.definedEvaluation()
preDefined_eval.validate_eval_name(eval_name)

df = pd.read_csv(dir_train)
# Identify and drop rows with a missing label.
missing_row = pd.isnull(df.loc[:, label_name_list[0]])
df = df.loc[~missing_row]
df = df.reset_index(drop=True)

print 'Preparing training data fingerprints'
# Morgan (ECFP) fingerprint
morgan_fp = df.copy()
morgan_fp = morgan_fp.rename(columns={'ecfp1024': 'fingerprint'})
# MACCS keys fingerprint
maccs_fp = df.copy()
maccs_fp = maccs_fp.rename(columns={'maccKeys': 'fingerprint'})
comb1 = (morgan_fp, label_name_list)
comb2 = (maccs_fp, label_name_list)
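# The JSON config consumed above is not shown here. The sketch below writes one
# plausible shape: a single outer key so that pd.read_json yields one column and each
# field is reachable via info.loc[<field>][0]. The outer layout is an assumption, and
# every path, target name, and label name is a placeholder; the field names themselves
# come from the reader code above.
import json

config = {
    '0': {
        'target_name': 'my_target',
        'full_directory_to_training_data': '/path/to/train.csv',
        'full_directory_to_dataToPredict_if_exit': '/path/to/to_predict.csv',
        'smile_column_name': 'smiles',
        'label_name_list': ['binary_label'],
        'evaluation_name': 'ROCAUC',
        'full_directory_to_store_prediction': '/path/to/output/',
        'maccKeys_column_name': 'maccKeys',
        'ecfp1024_column_name': 'ecfp1024',
    }
}

with open('config.json', 'w') as f:
    json.dump(config, f, indent=2)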
def train(self):
    """Train the model.

    Train and check potential first and second layer models.
    """
    evaluation_metric_name = self.__eval_name
    print 'Building first layer models'
    # ---------------------------------- first layer models ----------------------------------
    for data_dict in self.__setting_list:
        for model_type in data_dict['model_type']:
            unique_name = 'layer1_' + data_dict['data_name'] + '_' + model_type + '_' + evaluation_metric_name
            model = first_layer_model.firstLayerModel(data_dict['data'],
                                                      evaluation_metric_name, model_type, unique_name)
            # Retrieve default parameters and change the default seed.
            default_param, default_MAXIMIZE, default_STOPPING_ROUND = model.get_param()
            default_param['seed'] = self.seed
            if self.__verbose == True:
                default_param['silent'] = 1
            elif self.__verbose == False:
                default_param['verbose_eval'] = False
            model.update_param(default_param, default_MAXIMIZE, default_STOPPING_ROUND)
            model.xgb_cv()
            model.generate_holdout_pred()
            self.__layer1_model_list.append(model)

    # ---------------------------------- second layer models ----------------------------------
    layer2_label_data = self.__setting_list[0]['data']  # layer1 data object containing the label for the layer2 model
    layer2_modeltype = ['GbtreeLogistic', 'GblinearLogistic']
    layer2_evaluation_metric_name = [self.__eval_name]
    print 'Building second layer models'
    for evaluation_metric_name in layer2_evaluation_metric_name:
        for model_type in layer2_modeltype:
            unique_name = 'layer2' + '_' + model_type + '_' + evaluation_metric_name
            l2model = second_layer_model.secondLayerModel(layer2_label_data, self.__layer1_model_list,
                                                          evaluation_metric_name, model_type, unique_name)
            l2model.second_layer_data()
            # Retrieve default parameters and change the default seed.
            default_param, default_MAXIMIZE, default_STOPPING_ROUND = l2model.get_param()
            default_param['seed'] = self.seed
            if self.__verbose == True:
                default_param['silent'] = 0
            elif self.__verbose == False:
                default_param['verbose_eval'] = False
            l2model.update_param(default_param, default_MAXIMIZE, default_STOPPING_ROUND)
            l2model.xgb_cv()
            self.__layer2_model_list.append(l2model)

    # ---------------------------------- evaluate model performance on test data ----------------------------------
    # Prepare test data, retrieved from the layer1 data.
    list_TestData = []
    for data_dict in self.__setting_list:
        for model_type in data_dict['model_type']:
            list_TestData.append(data_dict['data'].get_dtest())
    test_label = layer2_label_data.get_testLabel()
    test_result_list = []
    i = 0
    for evaluation_metric_name in layer2_evaluation_metric_name:
        for model_type in layer2_modeltype:
            test_result = eval_testset.eval_testset(self.__layer2_model_list[i],
                                                    list_TestData, test_label,
                                                    evaluation_metric_name)
            test_result_list.append(test_result)
            i += 1

    # Merge cv and test results together. Calculate the weighted average of cv and
    # test results for each model (layer1 and layer2), then use the best model to predict.
    all_model = self.__layer1_model_list + self.__layer2_model_list
    result = []
    for model in all_model:
        result = result + [item for item in np.array(model.cv_score_df())[0]]
    # Retrieve the corresponding name of each cv result.
    result_index = []
    for model in all_model:
        result_index.append(model.name)
    # Create a dataframe of cv results.
    cv_result = pd.DataFrame({'cv_result': result}, index=result_index)
    test_result = pd.concat(test_result_list, axis=0, ignore_index=False)
    test_result = test_result.rename(columns={self.__eval_name: 'test_result'})
    # Select distinct rows.
    test_result['temp_name'] = test_result.index
    test_result = test_result.drop_duplicates(['temp_name'])
    test_result = test_result.drop('temp_name', 1)
    cv_test = pd.merge(cv_result, test_result, how='left', left_index=True, right_index=True)
    self.__num_folds = np.float64(self.__num_folds)
    cv_test['weighted_score'] = (cv_test.cv_result * (self.__num_folds - 1) / self.__num_folds
                                 + cv_test.test_result * (1 / self.__num_folds))

    # Determine whether the current evaluation metric should be maximized or minimized.
    eval_info = defined_eval.definedEvaluation()
    is_max = eval_info.is_maximize(self.__eval_name)
    if is_max:
        position = np.where(cv_test.weighted_score == cv_test.weighted_score.max())
        best_model_name = cv_test.weighted_score.iloc[position].index[0]
    else:
        position = np.where(cv_test.weighted_score == cv_test.weighted_score.min())
        best_model_name = cv_test.weighted_score.iloc[position].index[0]

    # Find the best model.
    all_model_name = [model.name for model in all_model]
    model_position = all_model_name.index(best_model_name)
    self.__best_model = all_model[model_position]
    self.__best_model_result = pd.DataFrame(cv_test.loc[self.__best_model.name])
    self.__all_model_result = cv_test
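# Standalone illustration of the weighted_score blend computed above: the cv score is
# weighted by (k-1)/k and the held-out test score by 1/k, where k is the number of
# folds. The fold count, model names, and scores below are made up.
import pandas as pd

num_folds = 4.0
cv_test = pd.DataFrame({'cv_result': [0.82, 0.79], 'test_result': [0.78, 0.81]},
                       index=['layer1_morgan_GbtreeLogistic_ROCAUC',
                              'layer2_GblinearLogistic_ROCAUC'])
cv_test['weighted_score'] = (cv_test.cv_result * (num_folds - 1) / num_folds
                             + cv_test.test_result * (1 / num_folds))
print cv_test
# First row: 0.82 * 0.75 + 0.78 * 0.25 = 0.81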
def __prepare_result(self):
    # Merge cv and test results together. Calculate the weighted average of cv and
    # test results for each model (layer1 and layer2), then use the best model to predict.
    all_model = self.__layer1_model_list + self.__layer2_model_list
    result = []
    for model in all_model:
        result = result + [item for item in np.array(model.cv_score_df())[0]]
    # Retrieve the corresponding name of each cv result.
    result_index = []
    for model in all_model:
        result_index.append(model.name)
    # Create a dataframe of cv results.
    cv_result = pd.DataFrame({'cv_result': result}, index=result_index)

    # ---------------------------------- evaluate model performance on test data ----------------------------------
    # Prepare test data, retrieved from the layer1 data.
    if self.__createTestset:
        list_TestData = []
        layer2_modeltype = self.__layer2_modeltype
        # Label data and evaluation metric for the layer2 models (assumed to mirror train()).
        layer2_label_data = self.__setting_list[0]['data']
        layer2_evaluation_metric_name = [self.__eval_name]
        for data_dict in self.__setting_list:
            for model_type in data_dict['model_type']:
                list_TestData.append(data_dict['data'].get_dtest())
        test_label = layer2_label_data.get_testLabel()
        test_result_list = []
        i = 0
        for evaluation_metric_name in layer2_evaluation_metric_name:
            for model_type in layer2_modeltype:
                test_result = eval_testset.eval_testset(self.__layer2_model_list[i],
                                                        list_TestData, test_label,
                                                        evaluation_metric_name)
                test_result_list.append(test_result)
                i += 1
        test_result = pd.concat(test_result_list, axis=0, ignore_index=False)
        test_result = test_result.rename(columns={self.__eval_name: 'test_result'})
        # Select distinct rows.
        test_result['temp_name'] = test_result.index
        test_result = test_result.drop_duplicates(['temp_name'])
        test_result = test_result.drop('temp_name', 1)
        cv_test = pd.merge(cv_result, test_result, how='left', left_index=True, right_index=True)
        self.__num_folds = np.float64(self.__num_folds)
        cv_test['weighted_score'] = (cv_test.cv_result * (self.__num_folds - 1) / self.__num_folds
                                     + cv_test.test_result * (1 / self.__num_folds))
    else:
        cv_test = cv_result
        cv_test['weighted_score'] = cv_result.cv_result

    # Restrict to the user-specified finalModel, if any.
    if self.__finalModel is None:
        final_cv_test = cv_test
    else:
        finalModel_names = [item for item in list(cv_test.index) if self.__finalModel in item]
        final_cv_test = cv_test.loc[finalModel_names]

    # Determine whether the current evaluation metric should be maximized or minimized.
    eval_info = defined_eval.definedEvaluation()
    is_max = eval_info.is_maximize(self.__eval_name)
    if is_max:
        position = np.where(final_cv_test.weighted_score == final_cv_test.weighted_score.max())
        best_model_name = final_cv_test.weighted_score.iloc[position].index[0]
    else:
        position = np.where(final_cv_test.weighted_score == final_cv_test.weighted_score.min())
        best_model_name = final_cv_test.weighted_score.iloc[position].index[0]

    # Find the best model.
    all_model_name = [model.name for model in all_model]
    model_position = all_model_name.index(best_model_name)
    self.__best_model = all_model[model_position]
    self.__best_model_result = pd.DataFrame(cv_test.loc[self.__best_model.name])
    self.__all_model_result = cv_test

    # Find a model that contains the final label.
    if self.__final_labelType == 'binary':
        model_has_finalLabel = [item for item in list(cv_test.index) if 'Logistic' in item]
        model_position = all_model_name.index(model_has_finalLabel[0])
        self.__model_has_finalLabel = all_model[model_position]
    elif self.__final_labelType == 'continuous':
        model_has_finalLabel = [item for item in list(cv_test.index) if 'Regression' in item]
        model_position = all_model_name.index(model_has_finalLabel[0])
        self.__model_has_finalLabel = all_model[model_position]
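# The maximize/minimize branch above selects the row with the extreme weighted_score
# via np.where plus iloc. For reference, the same lookup can be written with pandas
# idxmax/idxmin; a minimal equivalent sketch on a made-up frame.
import pandas as pd

cv_test = pd.DataFrame({'weighted_score': [0.81, 0.80]},
                       index=['layer1_morgan_GbtreeLogistic_ROCAUC',
                              'layer2_GblinearLogistic_ROCAUC'])
is_max = True  # True for score-like metrics such as ROCAUC, False for loss-like metrics
if is_max:
    best_model_name = cv_test.weighted_score.idxmax()
else:
    best_model_name = cv_test.weighted_score.idxmin()
print best_model_name  # -> 'layer1_morgan_GbtreeLogistic_ROCAUC'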