Example #1
0
    def __prepare_xgbdata_train(self):
        """
        Internal method to build required data(xgbData) objects.

        Iterates over ``self.__training_info``. Each item is indexed as
        ``item[0]`` (the data handed to ``load.readData``; presumably a
        DataFrame -- TODO confirm) and ``item[1]`` (an iterable of label
        column names). For every label column one xgbData object is built
        and a dict with keys ``'data_name'``, ``'model_type'`` and ``'data'``
        is appended to ``self.__setting_list``.

        Side effects: on the first label column processed while
        ``self.__has_fold`` is false, ``self.my_fold`` is generated and
        ``self.__has_fold`` is set to True.
        """
        # Single-argument call form prints the same text on Python 2 and 3.
        print('Preparing data')
        num_xgbData = 0
        for item in self.__training_info:
            temp_df = item[0]
            for column_name in item[1]:
                # Based on how many unique values the label column holds,
                # automatically detect whether it is binary or continuous
                # and pick the matching model types.
                if len(np.unique(temp_df[column_name])) == 2:
                    model_type_to_use = ['GbtreeLogistic', 'GblinearLogistic']
                    temp_labelType = 'binary'
                else:
                    model_type_to_use = ['GbtreeRegression',
                                         'GblinearRegression']
                    temp_labelType = 'continuous'

                temp_data = load.readData(temp_df, column_name)
                temp_data.read()
                X_data = temp_data.features()
                y_data = temp_data.label()
                # Folds are generated only once, from the first label column
                # processed, and then shared by every later xgbData object.
                # NOTE(review): the previous comment said "based on binary
                # label", but the label type is not checked here -- confirm
                # callers put a binary label first if that is required.
                if not self.__has_fold:
                    self.my_fold = fold.fold(X_data, y_data,
                                             self.__num_folds, self.seed)
                    self.my_fold = self.my_fold.generate_skfolds()
                    self.__has_fold = True
                data = xgb_data.xgbData(self.my_fold, X_data, y_data)
                data.build()
                temp_dataName = 'Number:{} xgbData, labelType: {}'.format(
                    num_xgbData, temp_labelType)
                self.__setting_list.append({'data_name': temp_dataName,
                                            'model_type': model_type_to_use,
                                            'data': data})
                num_xgbData += 1
 def second_layer_data(self):
     """
     Prepare the training data used by the second layer model.

     The holdout predictions of every model in
     ``self.__list_firstLayerModel`` are gathered into a DataFrame and
     transposed, then ``self.__xgbData`` is replaced by a new xgbData object
     built from those predictions, the current holdout label and the
     current train folds.
     """
     # Retrieve each first layer model's holdout prediction; after the
     # transpose every model contributes one column.
     first_layer_holdouts = [
         m.get_holdout() for m in self.__list_firstLayerModel
     ]
     holdout_df = pd.DataFrame(first_layer_holdouts).transpose()
     #Remove later: sort the column so that column index is always the same
     #holdout_df = holdout_df[np.sort(holdout_df.columns)]
     previous_data = self.__xgbData
     label = previous_data.get_holdoutLabel()
     train_fold = previous_data.get_train_fold()
     features = np.array(holdout_df)
     targets = np.array(label)
     # The replacement is stored before build() runs, as in the original.
     self.__xgbData = xgb_data.xgbData(train_fold, features, targets, False)
     self.__xgbData.build()
    setting_list = []
    # binary ecfp1024
    file_dir = os.path.join(
        current_dir,
        "../../datasets/muv/classification/muv_BinaryLabel_ecfp1024.csv.zip")
    data_name = 'binaryECFP'
    label_colname = target_name  # label column name of one target
    model_name_to_use = ['GbtreeLogistic',
                         'GblinearLogistic']  # Define model to use
    temp_data = load.readData(file_dir, label_colname)
    temp_data.read()
    X_data = temp_data.features()
    y_data = temp_data.label()
    myfold = fold.fold(X_data, y_data, 4, SEED)
    myfold = myfold.generate_skfolds()
    data = xgb_data.xgbData(myfold, X_data, y_data)
    data.build()
    setting_list.append({
        'data_name': data_name,
        'model_type': model_name_to_use,
        'data': data
    })

    # binary MACCSkeys
    file_dir = os.path.join(
        current_dir,
        "../../datasets/muv/classification/muv_BinaryLabel_MACCSkey167.csv.zip"
    )
    data_name = 'binaryMACCSkeys'
    label_colname = target_name
    model_name_to_use = ['GbtreeLogistic', 'GblinearLogistic']
def test_muv_function():
    """
    Regression test of the two-layer stacking workflow on the muv466 sample.

    Every stage is compared against reference files stored under
    ``./test_datasets/muv_sample`` (relative to this file):

    1. Load the ecfp and macckey feature files, keep every positive row plus
       the first 1000 rows, then build the stratified folds and the xgbData
       objects.
    2. Train one first layer model per (dataset, model type) pair and compare
       the CV scores and holdout predictions with the stored ones.
    3. Train second layer models for the ROCAUC and EFR1 metrics and compare
       their CV scores with the stored ones.
    4. Evaluate the second layer models on the test split and compare the
       ROCAUC with the stored one.

    NOTE(review): ``target_name`` and ``rmse`` are not defined inside this
    function; they are assumed to be available at module level -- confirm.
    """
    SEED = 2016
    # Early-stopping rounds applied to every model trained in this test.
    STOPPING_ROUND = 10

    def _apply_conservative_params(model, model_type):
        """Seed ``model`` and override the defaults for ``model_type``."""
        # Retrieve default parameter and change default seed. The default
        # stopping round is discarded in favour of STOPPING_ROUND.
        param, maximize, _ = model.get_param()
        param['seed'] = SEED
        # Default parameters overfit muv dataset, use more conservative param.
        if model_type == 'GbtreeLogistic':
            param['eta'] = 0.3
            param['max_depth'] = 5
            param['colsample_bytree'] = 0.5
            param['min_child_weight'] = 2
        elif model_type == 'GblinearLogistic':
            param['eta'] = 0.3
        model.update_param(param, maximize, STOPPING_ROUND)

    def _assert_rmse_below(old_values, new_values, tolerance):
        """Print the RMSE of ``new - old`` and assert it is under tolerance."""
        combined = pd.DataFrame({'old': old_values, 'new': new_values})
        error = rmse(combined.new - combined.old)
        print(error)
        assert error < tolerance

    #----------------------------------- Build first layer data
    print('{}: loading data'.format(target_name))
    current_dir = os.path.dirname(os.path.realpath(__file__))
    setting_list = []
    # binary ecfp1024
    file_dir = os.path.join(current_dir,
                            "./test_datasets/muv_sample/muv466_ecfp.csv.zip")
    data_name = 'ecfp'
    label_colname = target_name  # label column name of one target
    model_name_to_use = ['GbtreeLogistic',
                         'GblinearLogistic']  # Define model to use
    temp_data = load.readData(file_dir, label_colname)
    temp_data.read()
    X_data = temp_data.features()
    # check training feature dimension
    assert X_data.shape == (93127, 1024)
    # check label dimension
    y_data = temp_data.label()
    assert y_data.shape == (93127, )
    # check label positive number
    assert y_data.sum() == 27
    # Only use portion of data to build model: all positives plus the first
    # 1000 rows. range() must be turned into a list explicitly because
    # list + range raises TypeError on Python 3.
    index = list(np.where(y_data == 1)[0]) + list(range(1000))
    X_data = X_data[index]
    y_data = y_data[index]

    myfold = fold.fold(X_data, y_data, 4, SEED)
    myfold = myfold.generate_skfolds()
    result_dir = tempfile.mkdtemp()
    myfold.to_csv(os.path.join(result_dir, 'fold_all.csv'))
    # check whether stratified 4 folds are the same
    assert filecmp.cmp(
        os.path.join(result_dir, 'fold_all.csv'),
        os.path.join(current_dir,
                     "./test_datasets/muv_sample/muv466_folds_all.csv"))
    data = xgb_data.xgbData(myfold, X_data, y_data)
    data.build()
    # check whether training and test data objects are xgboost.core.DMatrix
    assert isinstance(data.get_dtest(), xgboost.core.DMatrix)
    assert all(
        isinstance(data.get_dtrain(i)[0], xgboost.core.DMatrix)
        for i in range(3))
    # check positive number of training data and test data.
    assert data.get_holdoutLabel().sum() == 21
    assert data.get_testLabel().sum() == 6
    # check whether train folds are the same
    data.get_train_fold().to_csv(os.path.join(result_dir, 'fold_train.csv'))
    assert filecmp.cmp(
        os.path.join(result_dir, 'fold_train.csv'),
        os.path.join(current_dir,
                     "./test_datasets/muv_sample/muv466_folds_train.csv"))
    # check number of training folds
    assert data.numberOfTrainFold() == 3
    setting_list.append({
        'data_name': data_name,
        'model_type': model_name_to_use,
        'data': data
    })

    # binary MACCSkeys, restricted to the same rows and reusing the same
    # folds as the ecfp data.
    file_dir = os.path.join(
        current_dir, "./test_datasets/muv_sample/muv466_macckey.csv.zip")
    data_name = 'macckey'
    label_colname = target_name  # label column name of one target
    model_name_to_use = ['GbtreeLogistic', 'GblinearLogistic']
    temp_data = load.readData(file_dir, label_colname)
    temp_data.read()
    X_data = temp_data.features()[index]
    y_data = temp_data.label()[index]
    data = xgb_data.xgbData(myfold, X_data, y_data)
    data.build()
    setting_list.append({
        'data_name': data_name,
        'model_type': model_name_to_use,
        'data': data
    })

    #---------------------------------first layer models ----------
    # 4 layer1 models based on ecfp,MACCSkeys data
    # gbtree
    print('{}: building first layer models'.format(target_name))
    layer1_model_list = []
    evaluation_metric_name = 'ROCAUC'
    for data_dict in setting_list:
        for model_type in data_dict['model_type']:
            unique_name = ('layer1_' + data_dict['data_name'] + '_' +
                           model_type + '_' + evaluation_metric_name)
            model = first_layer_model.firstLayerModel(data_dict['data'],
                                                      evaluation_metric_name,
                                                      model_type, unique_name)
            _apply_conservative_params(model, model_type)
            model.xgb_cv()
            model.generate_holdout_pred()
            layer1_model_list.append(model)

    # check whether second decimal of cv scores are the same.
    cv_result = pd.concat(
        [layer1.cv_score_df() for layer1 in layer1_model_list])
    cv_result = np.round(cv_result, 2)
    cv_result.to_csv(os.path.join(result_dir, 'firstlayerModel_cvScore.csv'))
    # read previous saved result and compare it with the new one
    old = pd.read_csv(
        os.path.join(
            current_dir,
            "./test_datasets/muv_sample/muv466_firstlayerModel_cvScore.csv"))
    _assert_rmse_below(old.ROCAUC, cv_result.reset_index().ROCAUC, 0.05)
    # check whether holdout results of first layer model are same, round to THIRD decimal.
    holdout_result = pd.DataFrame({
        layer1.name: layer1.get_holdout()
        for layer1 in layer1_model_list
    })
    holdout_result = np.round(holdout_result, 3)
    old = pd.read_csv(
        os.path.join(
            current_dir,
            "./test_datasets/muv_sample/muv466_firstlayerModel_holdout.csv"))
    # check each model's holdout prediction.
    for colname in holdout_result.columns:
        print(colname)
        holdout_error = rmse(old[colname] - holdout_result[colname])
        print(holdout_error)
        assert holdout_error < 0.15

    #------------------------------------second layer models
    # use label from binary data to train layer2 models
    print('{}: building second layer models'.format(target_name))
    # layer1 data object containing the label for layer2 model
    layer2_label_data = setting_list[0]['data']
    layer2_model_list = []
    layer2_modeltype = ['GbtreeLogistic', 'GblinearLogistic']
    layer2_evaluation_metric_name = ['ROCAUC', 'EFR1']
    for evaluation_metric_name in layer2_evaluation_metric_name:
        for model_type in layer2_modeltype:
            unique_name = 'layer2_' + model_type + '_' + evaluation_metric_name
            l2model = second_layer_model.secondLayerModel(
                layer2_label_data, layer1_model_list, evaluation_metric_name,
                model_type, unique_name)
            l2model.second_layer_data()
            _apply_conservative_params(l2model, model_type)
            l2model.xgb_cv()
            layer2_model_list.append(l2model)

    # check whether second decimal of cv scores are the same.
    cv_result = pd.concat(
        [layer2.cv_score_df() for layer2 in layer2_model_list])
    cv_result = np.round(cv_result, 2)
    old = pd.read_csv(
        os.path.join(
            current_dir,
            "./test_datasets/muv_sample/muv466_secondlayerModel_cvScore.csv"))
    # Test second layer model cv score
    # rocauc
    _assert_rmse_below(old.ROCAUC, cv_result.reset_index().ROCAUC, 0.05)
    # EFR1
    _assert_rmse_below(old.EFR1, cv_result.reset_index().EFR1, 5)

    #------------------------------------ evaluate model performance on test data
    # prepare test data, retrieved from layer1 data: one entry per first
    # layer model, in the order the models were built.
    print('{}: evaluating test set'.format(target_name))
    list_TestData = [
        data_dict['data'].get_dtest()
        for data_dict in setting_list
        for _ in data_dict['model_type']
    ]

    test_label = layer2_label_data.get_testLabel()
    # Metric used for each second layer model, in the order they were built
    # (metric is the outer loop, model type the inner loop).
    layer2_metric_per_model = [
        metric_name
        for metric_name in layer2_evaluation_metric_name
        for _ in layer2_modeltype
    ]
    test_result_list = [
        eval_testset.eval_testset(layer2, list_TestData, test_label,
                                  metric_name)
        for layer2, metric_name in zip(layer2_model_list,
                                       layer2_metric_per_model)
    ]
    # collect test result
    result = pd.concat(test_result_list, axis=0, ignore_index=False)
    result = np.round(result, 3)
    old = pd.read_csv(
        os.path.join(current_dir,
                     "./test_datasets/muv_sample/muv466_testResult_all.csv"))

    # Compare final test result.
    # rocauc
    _assert_rmse_below(old.ROCAUC, result.reset_index().ROCAUC, 0.05)