Example #1
def test_lightgbm(tmp_path):
    import lightgbm as lgb
    X, y = simulate_data(500, 10, random_state=43210, classification=True)
    train_data = lgb.Dataset(X, label=y)
    param = {'objective': 'binary', 'metric': 'binary_logloss'}
    num_round = 5
    bst = lgb.train(param, train_data, num_round)
    gbm_preds = bst.predict(X)
    model_path = str(os.path.join(tmp_path, 'lgb.model'))
    bst.save_model(model_path)
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=True,
                              model_type="lightgbm")

    fil_preds = np.asarray(fm.predict(X))
    fil_preds = np.reshape(fil_preds, np.shape(gbm_preds))

    assert array_equal(np.round(gbm_preds), fil_preds)

    lcls = lgb.LGBMClassifier().set_params(objective='binary',
                                           metric='binary_logloss')
    lcls.fit(X, y)
    gbm_proba = lcls.predict_proba(X)

    lcls.booster_.save_model(model_path)
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=True,
                              model_type="lightgbm")

    fil_proba = np.asarray(fm.predict_proba(X))
    fil_proba = np.reshape(fil_proba, np.shape(gbm_proba))

    assert np.allclose(gbm_proba, fil_proba, 1e-2)
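These tests call a simulate_data helper defined elsewhere in the cuML test
module and not shown on this page. A minimal sketch of what it might look
like, assuming it simply wraps scikit-learn's generators (the signature and
defaults are guesses based on how the examples call it):

# Hypothetical stand-in for the simulate_data helper used throughout
# these examples; the real cuML test utility may differ.
import numpy as np
from sklearn.datasets import make_classification, make_regression

def simulate_data(n_rows, n_columns, n_classes=2, random_state=None,
                  classification=True, bias=0.0, n_informative='auto'):
    if n_informative == 'auto':
        n_informative = max(n_columns // 3, 2)
    if classification:
        X, y = make_classification(n_samples=n_rows,
                                   n_features=n_columns,
                                   n_informative=n_informative,
                                   n_classes=n_classes,
                                   random_state=random_state)
    else:
        X, y = make_regression(n_samples=n_rows,
                               n_features=n_columns,
                               n_informative=n_informative,
                               bias=bias,
                               random_state=random_state)
    return X.astype(np.float32), y.astype(np.float32)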
Example #2
def _obtain_fil_model(treelite_handle,
                      depth,
                      output_class=True,
                      threshold=0.5,
                      algo='auto',
                      fil_sparse_format='auto'):
    """
    Creates a Forest Inference (FIL) model using the treelite
    handle obtained from the cuML Random Forest model.

    Returns
    ----------
    fil_model :
        A Forest Inference model which can be used to perform
        inferencing on the random forest model.
    """

    storage_format = \
        _check_fil_parameter_validity(depth=depth,
                                      fil_sparse_format=fil_sparse_format,
                                      algo=algo)

    fil_model = ForestInference()
    tl_to_fil_model = \
        fil_model.load_from_randomforest(treelite_handle,
                                         output_class=output_class,
                                         threshold=threshold,
                                         algo=algo,
                                         storage_type=storage_format)

    return tl_to_fil_model
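A usage sketch for the helper above, assuming a trained cuML random forest.
The convert_to_treelite_model() accessor and its handle attribute are
assumptions for illustration; the exact way to obtain a treelite handle
depends on the cuML version:

# Sketch only: wiring _obtain_fil_model to a trained cuML random forest.
import numpy as np
from cuml.ensemble import RandomForestClassifier as cuRF

X = np.random.rand(200, 8).astype(np.float32)
y = (X[:, 0] > 0.5).astype(np.float32)

rf = cuRF(n_estimators=40, max_depth=8)
rf.fit(X, y)

tl_model = rf.convert_to_treelite_model()      # assumed accessor
fil_model = _obtain_fil_model(tl_model.handle,  # assumed attribute
                              depth=8,
                              output_class=True,
                              threshold=0.5,
                              algo='auto',
                              fil_sparse_format='auto')
preds = fil_model.predict(X)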
Example #3
def test_lightgbm(tmp_path, num_classes):
    import lightgbm as lgb
    X, y = simulate_data(500,
                         10 if num_classes == 2 else 50,
                         num_classes,
                         random_state=43210,
                         classification=True)
    train_data = lgb.Dataset(X, label=y)

    if num_classes == 2:
        param = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'num_class': 1
        }
    else:
        param = {
            'objective': 'ova',  # 'multiclass' would use softmax instead
            'metric': 'multi_logloss',
            'num_class': num_classes
        }
    num_round = 5
    model_path = str(os.path.join(tmp_path, 'lgb.model'))

    bst = lgb.train(param, train_data, num_round)
    bst.save_model(model_path)
    if num_classes == 2:
        # binary classification
        gbm_proba = bst.predict(X)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        fil_proba = fm.predict_proba(X)
        assert np.allclose(gbm_proba, fil_proba[:, 1], 1e-2)
        gbm_preds = (gbm_proba > 0.5)
        fil_preds = fm.predict(X)
        assert array_equal(gbm_preds, fil_preds)
    else:
        # multi-class classification
        # FIL doesn't yet support predict_proba() for multi-class
        # TODO: Add a test for predict_proba() when it's supported
        gbm_preds = bst.predict(X)
        gbm_preds = gbm_preds.argmax(axis=1)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        fil_preds = fm.predict(X)
        assert array_equal(np.round(gbm_preds), fil_preds)
Example #4
def test_lightgbm(tmp_path, num_classes):
    import lightgbm as lgb
    X, y = simulate_data(500,
                         10 if num_classes == 2 else 50,
                         num_classes,
                         random_state=43210,
                         classification=True)
    train_data = lgb.Dataset(X, label=y)
    num_round = 5
    model_path = str(os.path.join(tmp_path, 'lgb.model'))

    if num_classes == 2:
        param = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'num_class': 1
        }
        bst = lgb.train(param, train_data, num_round)
        bst.save_model(model_path)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        # binary classification
        gbm_proba = bst.predict(X)
        fil_proba = fm.predict_proba(X)[:, 1]
        gbm_preds = (gbm_proba > 0.5)
        fil_preds = fm.predict(X)
        assert array_equal(gbm_preds, fil_preds)
        np.testing.assert_allclose(gbm_proba,
                                   fil_proba,
                                   atol=proba_atol[num_classes > 2])
    else:
        # multi-class classification
        lgm = lgb.LGBMClassifier(objective='multiclass',
                                 boosting_type='gbdt',
                                 n_estimators=num_round)
        lgm.fit(X, y)
        lgm.booster_.save_model(model_path)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        lgm_preds = lgm.predict(X)
        assert array_equal(lgm.booster_.predict(X).argmax(axis=1), lgm_preds)
        assert array_equal(lgm_preds, fm.predict(X))
        # lightgbm uses float64 thresholds, while FIL uses float32
        np.testing.assert_allclose(lgm.predict_proba(X),
                                   fm.predict_proba(X),
                                   atol=proba_atol[num_classes > 2])
Example #5
def test_cpp_exception(tmp_path):
    import lightgbm as lgb
    num_class = 3
    X, y = simulate_data(50,
                         5 * num_class,
                         num_class,
                         random_state=2020,
                         classification=True)
    train_data = lgb.Dataset(X, label=y)

    num_round = 1
    param = {
        'objective': 'ova',  # 'multiclass' would use softmax instead
        'metric': 'multi_logloss',
        'num_class': num_class
    }
    bst = lgb.train(param, train_data, num_round)
    model_path = str(os.path.join(tmp_path, 'lgb.model'))
    bst.save_model(model_path)

    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=True,
                              model_type='lightgbm')

    with pytest.raises(RuntimeError) as excinfo:
        _ = fm.predict_proba(X)

    assert ('predict_proba not supported for multi-class gradient boosted ' +
            'decision trees' in str(excinfo.value))
Example #6
    def process(self, inputs):
        input_meta = self.get_input_meta()
        predict_col = self.conf.get('prediction', 'predict')
        data_df = inputs[self.INPUT_PORT_NAME]

        if self.INPUT_PORT_MODEL_NAME in input_meta:
            # use external information instead of conf
            filename = get_file_path(inputs[self.INPUT_PORT_MODEL_NAME])
            train_cols = input_meta[self.INPUT_PORT_MODEL_NAME]['train']
            train_cols = list(train_cols.keys())
        else:
            # use the conf information
            filename = get_file_path(self.conf['file'])
            if 'columns' in self.conf:
                if self.conf.get('include', True):
                    train_cols = self.conf['columns']
                else:
                    train_cols = [
                        col for col in data_df.columns
                        if col not in self.conf['columns']
                    ]
            else:
                # without a 'columns' entry, fall back to all input columns
                # so train_cols is always defined
                train_cols = list(data_df.columns)
        fm = ForestInference.load(filename,
                                  model_type=self.conf.get(
                                      "model_type", "xgboost"))
        prediction = fm.predict(data_df[train_cols])
        prediction.index = data_df.index
        data_df[predict_col] = prediction
        return {self.OUTPUT_PORT_NAME: data_df}
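For reference, a hedged example of the conf dictionary this node might read
when no model input port is wired; the keys mirror the lookups in the code
above, while the path and column names are placeholders:

# Illustrative conf for the node above (placeholder values).
conf = {
    'file': 'models/xgb.model',        # saved booster to load
    'model_type': 'xgboost',           # passed to ForestInference.load
    'prediction': 'predict',           # name of the output column
    'columns': ['feat0', 'feat1'],     # feature columns to use...
    'include': True,                   # ...or to exclude when False
}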
Example #7
def test_print_forest_shape(small_classifier_and_preds):
    model_path, model_type, X, xgb_preds = small_classifier_and_preds
    m = ForestInference.load(model_path, model_type=model_type,
                             output_class=True, compute_shape_str=True)
    for substr in ['model size', ' MB', 'Depth histogram:', 'Leaf depth',
                   'Depth histogram fingerprint', 'Avg nodes per tree']:
        assert substr in m.shape_str
Example #8
def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth,
                                storage_type, model_class):
    # skip depth 20 for dense tests
    if max_depth == 20 and not storage_type:
        return

    # settings
    classification = True  # change this to False to use regression
    n_categories = 2
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows,
                         n_columns,
                         n_categories,
                         random_state=random_state,
                         classification=classification)
    # identify shape and indices
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    init_kwargs = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
    }
    if model_class == RandomForestClassifier:
        init_kwargs['max_features'] = 0.3
        init_kwargs['n_jobs'] = -1
    else:
        # model_class == GradientBoostingClassifier
        init_kwargs['init'] = 'zero'

    skl_model = model_class(**init_kwargs)
    skl_model.fit(X_train, y_train)

    skl_preds = skl_model.predict(X_validation)
    skl_preds_int = np.around(skl_preds)
    skl_proba = skl_model.predict_proba(X_validation)

    skl_acc = accuracy_score(y_validation, skl_preds > 0.5)

    algo = 'NAIVE' if storage_type else 'BATCH_TREE_REORG'

    fm = ForestInference.load_from_sklearn(skl_model,
                                           algo=algo,
                                           output_class=True,
                                           threshold=0.50,
                                           storage_type=storage_type)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(skl_preds_int))

    fil_proba = np.asarray(fm.predict_proba(X_validation))
    fil_proba = np.reshape(fil_proba, np.shape(skl_proba))

    fil_acc = accuracy_score(y_validation, fil_preds)

    assert fil_acc == pytest.approx(skl_acc, abs=1e-5)
    assert array_equal(fil_preds, skl_preds_int)
    assert np.allclose(fil_proba, skl_proba, 1e-3)
Example #9
def test_lightgbm(tmp_path, num_classes):
    import lightgbm as lgb
    X, y = simulate_data(500,
                         10 if num_classes == 2 else 50,
                         num_classes,
                         random_state=43210,
                         classification=True)
    train_data = lgb.Dataset(X, label=y)

    if num_classes == 2:
        param = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'num_class': 1
        }
    else:
        param = {
            'objective': 'ova',  # 'multiclass' would use softmax instead
            'metric': 'multi_logloss',
            'num_class': num_classes
        }
    num_round = 5
    bst = lgb.train(param, train_data, num_round)
    gbm_preds = bst.predict(X)
    if num_classes > 2:
        gbm_preds = gbm_preds.argmax(axis=1)
    model_path = str(os.path.join(tmp_path, 'lgb.model'))
    bst.save_model(model_path)
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=True,
                              model_type="lightgbm")
    fil_preds = fm.predict(X)
    assert array_equal(np.round(gbm_preds), fil_preds)

    if num_classes == 2:
        lcls = lgb.LGBMClassifier().set_params(**param)
        lcls.fit(X, y)
        gbm_proba = lcls.predict_proba(X)

        lcls.booster_.save_model(model_path)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        fil_proba = fm.predict_proba(X)
        assert np.allclose(gbm_proba, fil_proba, 1e-2)
Example #10
def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth,
                                n_classes, storage_type, model_class):
    # settings
    classification = True  # change this to False to use regression
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows,
                         n_columns,
                         n_classes,
                         random_state=random_state,
                         classification=classification)
    # identify shape and indices
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    init_kwargs = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
    }
    if model_class == RandomForestClassifier:
        init_kwargs['max_features'] = 0.3
        init_kwargs['n_jobs'] = -1
    else:
        # model_class == GradientBoostingClassifier
        init_kwargs['init'] = 'zero'

    skl_model = model_class(**init_kwargs, random_state=random_state)
    skl_model.fit(X_train, y_train)

    skl_preds = skl_model.predict(X_validation)
    skl_preds_int = np.around(skl_preds)
    skl_proba = skl_model.predict_proba(X_validation)

    skl_acc = accuracy_score(y_validation, skl_preds_int)

    algo = 'NAIVE' if storage_type else 'BATCH_TREE_REORG'

    fm = ForestInference.load_from_sklearn(skl_model,
                                           algo=algo,
                                           output_class=True,
                                           threshold=0.50,
                                           storage_type=storage_type)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(skl_preds_int))
    fil_acc = accuracy_score(y_validation, fil_preds)
    # fil_acc is within p99 error bars of skl_acc (diff == 0.017 +- 0.012)
    # however, some tests have a delta as big as 0.04.
    # sklearn uses float64 thresholds, while FIL uses float32
    # TODO(levsnv): once FIL supports float64 accuracy, revisit thresholds
    threshold = 1e-5 if n_classes == 2 else 0.1
    assert fil_acc == pytest.approx(skl_acc, abs=threshold)

    if n_classes == 2:
        assert array_equal(fil_preds, skl_preds_int)
        fil_proba = np.asarray(fm.predict_proba(X_validation))
        fil_proba = np.reshape(fil_proba, np.shape(skl_proba))
        assert np.allclose(fil_proba, skl_proba, 1e-3)
Example #11
def test_output_args(small_classifier_and_preds):
    model_path, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=False,
                              threshold=0.50)
    X = np.asarray(X)
    fil_preds = fm.predict(X)
    assert np.allclose(fil_preds, xgb_preds, 1e-3)
Example #12
def test_output_storage_type(storage_type, small_classifier_and_preds):
    model_path, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              output_class=True,
                              storage_type=storage_type,
                              threshold=0.50)

    xgb_preds_int = np.around(xgb_preds)
    fil_preds = np.asarray(fm.predict(X))
    assert np.allclose(fil_preds, xgb_preds_int, 1e-3)
Example #13
def test_fil_skl_regression(n_rows, n_columns, n_estimators, max_depth,
                            storage_type, model_class):

    # skip depth 20 for dense tests
    if max_depth == 20 and storage_type == 'DENSE':
        return

    # settings
    n_categories = 1
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows,
                         n_columns,
                         n_categories,
                         random_state=random_state,
                         classification=False)
    # identify shape and indices
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    init_kwargs = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
    }
    if model_class == RandomForestRegressor:
        init_kwargs['max_features'] = 0.3
        init_kwargs['n_jobs'] = -1
    else:
        # model_class == GradientBoostingRegressor
        init_kwargs['init'] = 'zero'

    skl_model = model_class(**init_kwargs)
    skl_model.fit(X_train, y_train)

    skl_preds = skl_model.predict(X_validation)

    skl_mse = mean_squared_error(y_validation, skl_preds)

    algo = 'NAIVE' if storage_type == 'SPARSE' else 'BATCH_TREE_REORG'

    fm = ForestInference.load_from_sklearn(skl_model,
                                           algo=algo,
                                           output_class=False,
                                           storage_type=storage_type)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(skl_preds))

    fil_mse = mean_squared_error(y_validation, fil_preds)

    # if fil is better than skl, no need to fail the test
    assert fil_mse <= skl_mse * (1. + 1e-7) + 1e-4
    assert array_equal(fil_preds, skl_preds)
Example #14
def test_thresholding(output_class, small_classifier_and_preds):
    model_path, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=output_class,
                              threshold=0.50)
    fil_preds = np.asarray(fm.predict(X))
    if output_class:
        assert ((fil_preds != 0.0) & (fil_preds != 1.0)).sum() == 0
    else:
        assert ((fil_preds != 0.0) & (fil_preds != 1.0)).sum() > 0
Example #15
def test_output_algos(algo, small_classifier_and_preds):
    model_path, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              algo=algo,
                              output_class=True,
                              threshold=0.50)

    xgb_preds_int = np.around(xgb_preds)
    fil_preds = np.asarray(fm.predict(X))
    fil_preds = np.reshape(fil_preds, np.shape(xgb_preds_int))

    assert np.allclose(fil_preds, xgb_preds_int, 1e-3)
Example #16
def test_output_args(small_classifier_and_preds):
    model_path, model_type, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              model_type=model_type,
                              algo='TREE_REORG',
                              output_class=False,
                              threshold=0.50)
    X = np.asarray(X)
    fil_preds = fm.predict(X)
    fil_preds = np.reshape(fil_preds, np.shape(xgb_preds))

    assert array_equal(fil_preds, xgb_preds, 1e-3)
Example #17
def test_fil_classification(n_rows, n_columns, num_rounds, n_classes,
                            tmp_path):
    # settings
    classification = True  # change this to False to use regression
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows,
                         n_columns,
                         n_classes,
                         random_state=random_state,
                         classification=classification)
    # identify shape and indices
    n_rows, n_columns = X.shape
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    model_path = os.path.join(tmp_path, 'xgb_class.model')

    bst = _build_and_save_xgboost(model_path,
                                  X_train,
                                  y_train,
                                  num_rounds=num_rounds,
                                  classification=classification,
                                  n_classes=n_classes)

    dvalidation = xgb.DMatrix(X_validation, label=y_validation)

    if n_classes == 2:
        xgb_preds = bst.predict(dvalidation)
        xgb_preds_int = np.around(xgb_preds)
        xgb_proba = np.stack([1 - xgb_preds, xgb_preds], axis=1)
    else:
        xgb_proba = bst.predict(dvalidation)
        xgb_preds_int = xgb_proba.argmax(axis=1)
    xgb_acc = accuracy_score(y_validation, xgb_preds_int)

    fm = ForestInference.load(model_path,
                              algo='auto',
                              output_class=True,
                              threshold=0.50)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_proba = np.asarray(fm.predict_proba(X_validation))
    fil_acc = accuracy_score(y_validation, fil_preds)

    assert fil_acc == pytest.approx(xgb_acc, abs=0.01)
    assert array_equal(fil_preds, xgb_preds_int)
    np.testing.assert_allclose(fil_proba,
                               xgb_proba,
                               atol=proba_atol[n_classes > 2])
Example #18
def test_fil_classification(n_rows, n_columns, num_rounds, tmp_path):
    # settings
    classification = True  # change this to False to use regression
    n_categories = 2
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows,
                         n_columns,
                         n_categories,
                         random_state=random_state,
                         classification=classification)
    # identify shape and indices
    n_rows, n_columns = X.shape
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    model_path = os.path.join(tmp_path, 'xgb_class.model')

    bst = _build_and_save_xgboost(model_path,
                                  X_train,
                                  y_train,
                                  num_rounds=num_rounds,
                                  classification=classification)

    dvalidation = xgb.DMatrix(X_validation, label=y_validation)
    xgb_preds = bst.predict(dvalidation)
    xgb_preds_int = np.around(xgb_preds)
    xgb_proba = np.stack([1 - xgb_preds, xgb_preds], axis=1)

    xgb_acc = accuracy_score(y_validation, xgb_preds > 0.5)

    fm = ForestInference.load(model_path,
                              algo='BATCH_TREE_REORG',
                              output_class=True,
                              threshold=0.50)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(xgb_preds_int))
    fil_proba = np.asarray(fm.predict_proba(X_validation))

    fil_proba = np.reshape(fil_proba, np.shape(xgb_proba))
    fil_acc = accuracy_score(y_validation, fil_preds)

    assert fil_acc == pytest.approx(xgb_acc, 0.01)
    assert array_equal(fil_preds, xgb_preds_int)
    assert array_equal(fil_proba, xgb_proba)
Example #19
def test_output_blocks_per_sm(storage_type, blocks_per_sm,
                              small_classifier_and_preds):
    model_path, model_type, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              model_type=model_type,
                              output_class=True,
                              storage_type=storage_type,
                              threshold=0.50,
                              blocks_per_sm=blocks_per_sm)

    xgb_preds_int = np.around(xgb_preds)
    fil_preds = np.asarray(fm.predict(X))
    fil_preds = np.reshape(fil_preds, np.shape(xgb_preds_int))

    assert np.allclose(fil_preds, xgb_preds_int, 1e-3)
Example #20
def test_output_args(format, small_classifier_and_preds):
    model_path, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=False,
                              threshold=0.50)
    if format == 'numpy':
        X = np.asarray(X)
    elif format == 'cudf':
        # from_gpu_matrix is an older cudf API; newer cudf versions
        # accept cudf.DataFrame(X) directly
        X = cudf.DataFrame.from_gpu_matrix(
            cuda.to_device(np.ascontiguousarray(X)))
    elif format == 'gpuarray':
        X = cuda.to_device(np.ascontiguousarray(X))
    else:
        assert False

    fil_preds = fm.predict(X)
    assert np.allclose(fil_preds, xgb_preds, 1e-3)
Example #21
def test_lightgbm(tmp_path):
    import lightgbm as lgb
    X, y = simulate_data(500, 10, random_state=43210, classification=True)
    train_data = lgb.Dataset(X, label=y)
    param = {'objective': 'binary', 'metric': 'binary_logloss'}
    num_round = 5
    bst = lgb.train(param, train_data, num_round)
    gbm_preds = bst.predict(X)

    model_path = str(os.path.join(tmp_path, 'lgb.model'))
    bst.save_model(model_path)
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=False,
                              model_type="lightgbm")

    fil_preds = np.asarray(fm.predict(X))
    assert np.allclose(gbm_preds, fil_preds, 1e-3)
Example #22
    def load_trained_model():
        """
        Cached loading of trained [ XGBoost or RandomForest ] model into memory
        Note: Models selected via filename parsing, edit if necessary
        """
        xgb_models = glob.glob('/opt/ml/model/*_xgb')
        rf_models = glob.glob('/opt/ml/model/*_rf')
        kmeans_models = glob.glob('/opt/ml/model/*_kmeans')
        app.logger.info(f'detected xgboost models : {xgb_models}')
        app.logger.info(f'detected randomforest models : {rf_models}')
        app.logger.info(f'detected kmeans models : {kmeans_models}\n\n')
        model_type = None

        start_time = time.perf_counter()

        if len(xgb_models):
            model_type = 'XGBoost'
            model_filename = xgb_models[0]
            if GPU_INFERENCE_FLAG:
                # FIL
                reloaded_model = ForestInference.load(model_filename)
            else:
                # native XGBoost
                reloaded_model = xgboost.Booster()
                reloaded_model.load_model(fname=model_filename)

        elif len(rf_models):
            model_type = 'RandomForest'
            model_filename = rf_models[0]
            reloaded_model = joblib.load(model_filename)
            
        elif len(kmeans_models):
            model_type = 'KMeans'
            model_filename = kmeans_models[0]
            reloaded_model = joblib.load(model_filename)
        else:
            raise Exception('! No trained models detected')

        exec_time = time.perf_counter() - start_time
        app.logger.info(f'> model {model_filename} '
                        f'loaded in {exec_time:.5f} s \n')

        return reloaded_model, model_type, model_filename
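A possible companion to the loader above, dispatching predictions on the
returned triple. The run_inference function is an illustrative assumption,
not part of the original app; the branches mirror how the models were loaded:

# Sketch: route inference based on load_trained_model()'s return values.
def run_inference(model, model_type, data):
    if model_type == 'XGBoost' and not GPU_INFERENCE_FLAG:
        # native XGBoost booster expects a DMatrix
        return model.predict(xgboost.DMatrix(data))
    # FIL models and joblib-restored RandomForest/KMeans objects
    # all expose predict() on array-like input
    return model.predict(data)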
Example #23
def test_threads_per_tree(threads_per_tree,
                          small_classifier_and_preds):
    model_path, model_type, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              model_type=model_type,
                              output_class=True,
                              storage_type='auto',
                              threshold=0.50,
                              threads_per_tree=threads_per_tree,
                              n_items=1)

    fil_preds = np.asarray(fm.predict(X))
    fil_proba = np.asarray(fm.predict_proba(X))

    xgb_proba = np.stack([1-xgb_preds, xgb_preds], axis=1)
    np.testing.assert_allclose(fil_proba, xgb_proba,
                               atol=proba_atol[False])

    xgb_preds_int = np.around(xgb_preds)
    fil_preds = np.reshape(fil_preds, np.shape(xgb_preds_int))
    assert np.allclose(fil_preds, xgb_preds_int, 1e-3)
Example #24
def test_fil_regression(n_rows, n_columns, num_rounds, tmp_path, max_depth):
    # settings
    classification = False  # False selects regression here
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows,
                         n_columns,
                         random_state=random_state,
                         classification=classification,
                         bias=10.0)
    # identify shape and indices
    n_rows, n_columns = X.shape
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    model_path = os.path.join(tmp_path, 'xgb_reg.model')
    bst = _build_and_save_xgboost(model_path,
                                  X_train,
                                  y_train,
                                  classification=classification,
                                  num_rounds=num_rounds,
                                  xgboost_params={'max_depth': max_depth})

    dvalidation = xgb.DMatrix(X_validation, label=y_validation)
    xgb_preds = bst.predict(dvalidation)

    xgb_mse = mean_squared_error(y_validation, xgb_preds)
    fm = ForestInference.load(model_path,
                              algo='BATCH_TREE_REORG',
                              output_class=False)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(xgb_preds))
    fil_mse = mean_squared_error(y_validation, fil_preds)

    assert fil_mse == pytest.approx(xgb_mse, 0.01)
    assert array_equal(fil_preds, xgb_preds)
Example #25
    def load_trained_model():
        """
        Cached loading of trained [ XGBoost or RandomForest ] model into memory
        Note: Models selected via filename parsing, edit if necessary
        """
        xgb_models = glob.glob("/opt/ml/model/*_xgb")
        rf_models = glob.glob("/opt/ml/model/*_rf")
        app.logger.info(f"detected xgboost models : {xgb_models}")
        app.logger.info(f"detected randomforest models : {rf_models}\n\n")
        model_type = None

        start_time = time.perf_counter()

        if len(xgb_models):
            model_type = "XGBoost"
            model_filename = xgb_models[0]
            if GPU_INFERENCE_FLAG:
                # FIL
                reloaded_model = ForestInference.load(model_filename)
            else:
                # native XGBoost
                reloaded_model = xgboost.Booster()
                reloaded_model.load_model(fname=model_filename)

        elif len(rf_models):
            model_type = "RandomForest"
            model_filename = rf_models[0]
            reloaded_model = joblib.load(model_filename)
        else:
            raise Exception("! No trained models detected")

        exec_time = time.perf_counter() - start_time
        app.logger.info(f"> model {model_filename} "
                        f"loaded in {exec_time:.5f} s \n")

        return reloaded_model, model_type, model_filename
Example #26
def test_lightgbm(tmp_path, num_classes, n_categorical):
    import lightgbm as lgb

    if n_categorical > 0:
        n_features = 10
        n_rows = 1000
        n_informative = n_features
    else:
        n_features = 10 if num_classes == 2 else 50
        n_rows = 500
        n_informative = 'auto'

    X, y = simulate_data(n_rows,
                         n_features,
                         num_classes,
                         n_informative=n_informative,
                         random_state=43210,
                         classification=True)
    if n_categorical > 0:
        X_fit, X_predict = to_categorical(X,
                                          n_categorical=n_categorical,
                                          invalid_frac=0.1,
                                          random_state=43210)
    else:
        X_fit, X_predict = X, X

    train_data = lgb.Dataset(X_fit, label=y)
    num_round = 5
    model_path = str(os.path.join(tmp_path, 'lgb.model'))

    if num_classes == 2:
        param = {'objective': 'binary',
                 'metric': 'binary_logloss',
                 'num_class': 1}
        bst = lgb.train(param, train_data, num_round)
        bst.save_model(model_path)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        # binary classification
        gbm_proba = bst.predict(X_predict)
        fil_proba = fm.predict_proba(X_predict)[:, 1]
        gbm_preds = (gbm_proba > 0.5).astype(float)
        fil_preds = fm.predict(X_predict)
        assert array_equal(gbm_preds, fil_preds)
        np.testing.assert_allclose(gbm_proba, fil_proba,
                                   atol=proba_atol[num_classes > 2])
    else:
        # multi-class classification
        lgm = lgb.LGBMClassifier(objective='multiclass',
                                 boosting_type='gbdt',
                                 n_estimators=num_round)
        lgm.fit(X_fit, y)
        lgm.booster_.save_model(model_path)
        lgm_preds = lgm.predict(X_predict).astype(int)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        assert array_equal(lgm.booster_.predict(X_predict).argmax(axis=1),
                           lgm_preds)
        assert array_equal(lgm_preds, fm.predict(X_predict))
        # lightgbm uses float64 thresholds, while FIL uses float32
        np.testing.assert_allclose(lgm.predict_proba(X_predict),
                                   fm.predict_proba(X_predict),
                                   atol=proba_atol[num_classes > 2])