def test_lightgbm(tmp_path):
    import lightgbm as lgb

    X, y = simulate_data(500, 10,
                         random_state=43210,
                         classification=True)
    train_data = lgb.Dataset(X, label=y)
    param = {'objective': 'binary', 'metric': 'binary_logloss'}
    num_round = 5
    bst = lgb.train(param, train_data, num_round)
    gbm_preds = bst.predict(X)
    model_path = str(os.path.join(tmp_path, 'lgb.model'))
    bst.save_model(model_path)
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=True,
                              model_type="lightgbm")
    fil_preds = np.asarray(fm.predict(X))
    fil_preds = np.reshape(fil_preds, np.shape(gbm_preds))
    assert array_equal(np.round(gbm_preds), fil_preds)

    lcls = lgb.LGBMClassifier().set_params(objective='binary',
                                           metric='binary_logloss')
    lcls.fit(X, y)
    gbm_proba = lcls.predict_proba(X)
    lcls.booster_.save_model(model_path)
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=True,
                              model_type="lightgbm")
    fil_proba = np.asarray(fm.predict_proba(X))
    fil_proba = np.reshape(fil_proba, np.shape(gbm_proba))
    assert np.allclose(gbm_proba, fil_proba, 1e-2)
def _obtain_fil_model(treelite_handle, depth,
                      output_class=True,
                      threshold=0.5,
                      algo='auto',
                      fil_sparse_format='auto'):
    """
    Create a Forest Inference (FIL) model using the treelite handle
    obtained from the cuML Random Forest model.

    Returns
    -------
    fil_model
        A Forest Inference model which can be used to perform
        inference on the random forest model.
    """
    storage_format = \
        _check_fil_parameter_validity(depth=depth,
                                      fil_sparse_format=fil_sparse_format,
                                      algo=algo)

    fil_model = ForestInference()
    tl_to_fil_model = \
        fil_model.load_from_randomforest(treelite_handle,
                                         output_class=output_class,
                                         threshold=threshold,
                                         algo=algo,
                                         storage_type=storage_format)
    return tl_to_fil_model
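# A minimal usage sketch for the conversion path above, assuming the public
# cuML API: RandomForestClassifier.convert_to_fil_model() is the documented
# entry point that reaches _obtain_fil_model() with the fitted model's
# treelite handle. Data shapes and hyperparameters are illustrative only.
def _example_convert_rf_to_fil():
    import cupy as cp
    from cuml.ensemble import RandomForestClassifier as cuRF

    X = cp.random.rand(1000, 8, dtype=cp.float32)
    y = (X[:, 0] > 0.5).astype(cp.int32)

    rf = cuRF(n_estimators=40, max_depth=8)
    rf.fit(X, y)

    # returns a ForestInference model built from the RF's treelite handle
    fil_model = rf.convert_to_fil_model(output_class=True,
                                        threshold=0.5,
                                        algo='auto',
                                        fil_sparse_format='auto')
    return fil_model.predict(X)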
def test_lightgbm(tmp_path, num_classes):
    import lightgbm as lgb

    X, y = simulate_data(500,
                         10 if num_classes == 2 else 50,
                         num_classes,
                         random_state=43210,
                         classification=True)
    train_data = lgb.Dataset(X, label=y)
    if num_classes == 2:
        param = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'num_class': 1
        }
    else:
        param = {
            'objective': 'ova',  # 'multiclass' would use softmax
            'metric': 'multi_logloss',
            'num_class': num_classes
        }
    num_round = 5
    model_path = str(os.path.join(tmp_path, 'lgb.model'))
    bst = lgb.train(param, train_data, num_round)
    bst.save_model(model_path)
    if num_classes == 2:
        # binary classification
        gbm_proba = bst.predict(X)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        fil_proba = fm.predict_proba(X)
        assert np.allclose(gbm_proba, fil_proba[:, 1], 1e-2)
        gbm_preds = (gbm_proba > 0.5)
        fil_preds = fm.predict(X)
        assert array_equal(gbm_preds, fil_preds)
    else:
        # multi-class classification
        # FIL doesn't yet support predict_proba() for multi-class
        # TODO: Add a test for predict_proba() when it's supported
        gbm_preds = bst.predict(X)
        gbm_preds = gbm_preds.argmax(axis=1)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        fil_preds = fm.predict(X)
        assert array_equal(np.round(gbm_preds), fil_preds)
def test_lightgbm(tmp_path, num_classes):
    import lightgbm as lgb

    X, y = simulate_data(500,
                         10 if num_classes == 2 else 50,
                         num_classes,
                         random_state=43210,
                         classification=True)
    train_data = lgb.Dataset(X, label=y)
    num_round = 5
    model_path = str(os.path.join(tmp_path, 'lgb.model'))
    if num_classes == 2:
        param = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'num_class': 1
        }
        bst = lgb.train(param, train_data, num_round)
        bst.save_model(model_path)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        # binary classification
        gbm_proba = bst.predict(X)
        fil_proba = fm.predict_proba(X)[:, 1]
        gbm_preds = (gbm_proba > 0.5)
        fil_preds = fm.predict(X)
        assert array_equal(gbm_preds, fil_preds)
        np.testing.assert_allclose(gbm_proba, fil_proba,
                                   atol=proba_atol[num_classes > 2])
    else:
        # multi-class classification
        lgm = lgb.LGBMClassifier(objective='multiclass',
                                 boosting_type='gbdt',
                                 n_estimators=num_round)
        lgm.fit(X, y)
        lgm.booster_.save_model(model_path)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        lgm_preds = lgm.predict(X)
        assert array_equal(lgm.booster_.predict(X).argmax(axis=1),
                           lgm_preds)
        assert array_equal(lgm_preds, fm.predict(X))
        # lightgbm uses float64 thresholds, while FIL uses float32
        np.testing.assert_allclose(lgm.predict_proba(X),
                                   fm.predict_proba(X),
                                   atol=proba_atol[num_classes > 2])
def test_cpp_exception(tmp_path):
    import lightgbm as lgb

    num_class = 3
    X, y = simulate_data(50,
                         5 * num_class,
                         num_class,
                         random_state=2020,
                         classification=True)
    train_data = lgb.Dataset(X, label=y)
    num_round = 1
    param = {
        'objective': 'ova',  # 'multiclass' would use softmax
        'metric': 'multi_logloss',
        'num_class': num_class
    }
    bst = lgb.train(param, train_data, num_round)
    model_path = str(os.path.join(tmp_path, 'lgb.model'))
    bst.save_model(model_path)
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=True,
                              model_type='lightgbm')
    with pytest.raises(RuntimeError) as excinfo:
        _ = fm.predict_proba(X)
    assert ('predict_proba not supported for multi-class gradient boosted '
            'decision trees' in str(excinfo.value))
def process(self, inputs):
    input_meta = self.get_input_meta()
    predict_col = self.conf.get('prediction', 'predict')
    data_df = inputs[self.INPUT_PORT_NAME]
    if self.INPUT_PORT_MODEL_NAME in input_meta:
        # use external information instead of conf
        filename = get_file_path(inputs[self.INPUT_PORT_MODEL_NAME])
        train_cols = input_meta[self.INPUT_PORT_MODEL_NAME]['train']
        train_cols = list(train_cols.keys())
    else:
        # use the conf information
        filename = get_file_path(self.conf['file'])
        if 'columns' in self.conf:
            if self.conf.get('include', True):
                train_cols = self.conf['columns']
            else:
                train_cols = [
                    col for col in data_df.columns
                    if col not in self.conf['columns']
                ]
    # train_cols.sort()
    fm = ForestInference.load(filename,
                              model_type=self.conf.get("model_type",
                                                       "xgboost"))
    prediction = fm.predict(data_df[train_cols])
    prediction.index = data_df.index
    data_df[predict_col] = prediction
    return {self.OUTPUT_PORT_NAME: data_df}
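# An illustrative node configuration, inferred from the keys process() reads
# above; the file path and column names are placeholders, not real values.
example_conf = {
    'file': 'notebooks/data/xgb.model',   # resolved via get_file_path()
    'model_type': 'xgboost',              # forwarded to ForestInference.load
    'columns': ['feature0', 'feature1'],  # training columns
    'include': True,                      # True: use 'columns' as given;
                                          # False: use all other columns
    'prediction': 'predict'               # name of the appended output column
}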
def test_print_forest_shape(small_classifier_and_preds):
    model_path, model_type, X, xgb_preds = small_classifier_and_preds
    m = ForestInference.load(model_path, model_type=model_type,
                             output_class=True, compute_shape_str=True)
    for substr in ['model size', ' MB', 'Depth histogram:', 'Leaf depth',
                   'Depth histogram fingerprint', 'Avg nodes per tree']:
        assert substr in m.shape_str
def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth,
                                storage_type, model_class):
    # skip depth 20 for dense tests
    if max_depth == 20 and not storage_type:
        return

    # settings
    classification = True  # change this to false to use regression
    n_categories = 2
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows, n_columns, n_categories,
                         random_state=random_state,
                         classification=classification)
    # identify shape and indices
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    init_kwargs = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
    }
    if model_class == RandomForestClassifier:
        init_kwargs['max_features'] = 0.3
        init_kwargs['n_jobs'] = -1
    else:
        # model_class == GradientBoostingClassifier
        init_kwargs['init'] = 'zero'

    skl_model = model_class(**init_kwargs)
    skl_model.fit(X_train, y_train)

    skl_preds = skl_model.predict(X_validation)
    skl_preds_int = np.around(skl_preds)
    skl_proba = skl_model.predict_proba(X_validation)

    skl_acc = accuracy_score(y_validation, skl_preds > 0.5)

    algo = 'NAIVE' if storage_type else 'BATCH_TREE_REORG'

    fm = ForestInference.load_from_sklearn(skl_model,
                                           algo=algo,
                                           output_class=True,
                                           threshold=0.50,
                                           storage_type=storage_type)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(skl_preds_int))

    fil_proba = np.asarray(fm.predict_proba(X_validation))
    fil_proba = np.reshape(fil_proba, np.shape(skl_proba))

    fil_acc = accuracy_score(y_validation, fil_preds)

    assert fil_acc == pytest.approx(skl_acc, abs=1e-5)
    assert array_equal(fil_preds, skl_preds_int)
    assert np.allclose(fil_proba, skl_proba, 1e-3)
def test_lightgbm(tmp_path, num_classes):
    import lightgbm as lgb

    X, y = simulate_data(500,
                         10 if num_classes == 2 else 50,
                         num_classes,
                         random_state=43210,
                         classification=True)
    train_data = lgb.Dataset(X, label=y)
    if num_classes == 2:
        param = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'num_class': 1
        }
    else:
        param = {
            'objective': 'ova',  # 'multiclass' would use softmax
            'metric': 'multi_logloss',
            'num_class': num_classes
        }
    num_round = 5
    bst = lgb.train(param, train_data, num_round)
    gbm_preds = bst.predict(X)
    if num_classes > 2:
        gbm_preds = gbm_preds.argmax(axis=1)
    model_path = str(os.path.join(tmp_path, 'lgb.model'))
    bst.save_model(model_path)
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=True,
                              model_type="lightgbm")
    fil_preds = fm.predict(X)
    assert array_equal(np.round(gbm_preds), fil_preds)

    if num_classes == 2:
        lcls = lgb.LGBMClassifier().set_params(**param)
        lcls.fit(X, y)
        gbm_proba = lcls.predict_proba(X)
        lcls.booster_.save_model(model_path)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        fil_proba = fm.predict_proba(X)
        assert np.allclose(gbm_proba, fil_proba, 1e-2)
def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth,
                                n_classes, storage_type, model_class):
    # settings
    classification = True  # change this to false to use regression
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows, n_columns, n_classes,
                         random_state=random_state,
                         classification=classification)
    # identify shape and indices
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    init_kwargs = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
    }
    if model_class == RandomForestClassifier:
        init_kwargs['max_features'] = 0.3
        init_kwargs['n_jobs'] = -1
    else:
        # model_class == GradientBoostingClassifier
        init_kwargs['init'] = 'zero'

    skl_model = model_class(**init_kwargs, random_state=random_state)
    skl_model.fit(X_train, y_train)

    skl_preds = skl_model.predict(X_validation)
    skl_preds_int = np.around(skl_preds)
    skl_proba = skl_model.predict_proba(X_validation)

    skl_acc = accuracy_score(y_validation, skl_preds_int)

    algo = 'NAIVE' if storage_type else 'BATCH_TREE_REORG'

    fm = ForestInference.load_from_sklearn(skl_model,
                                           algo=algo,
                                           output_class=True,
                                           threshold=0.50,
                                           storage_type=storage_type)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(skl_preds_int))

    fil_acc = accuracy_score(y_validation, fil_preds)
    # fil_acc is within p99 error bars of skl_acc (diff == 0.017 +- 0.012)
    # however, some tests have a delta as big as 0.04.
    # sklearn uses float64 thresholds, while FIL uses float32
    # TODO(levsnv): once FIL supports float64 accuracy, revisit thresholds
    threshold = 1e-5 if n_classes == 2 else 0.1
    assert fil_acc == pytest.approx(skl_acc, abs=threshold)

    if n_classes == 2:
        assert array_equal(fil_preds, skl_preds_int)
        fil_proba = np.asarray(fm.predict_proba(X_validation))
        fil_proba = np.reshape(fil_proba, np.shape(skl_proba))
        assert np.allclose(fil_proba, skl_proba, 1e-3)
def test_output_args(small_classifier_and_preds):
    model_path, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=False,
                              threshold=0.50)
    X = np.asarray(X)
    fil_preds = fm.predict(X)
    assert np.allclose(fil_preds, xgb_preds, 1e-3)
def test_output_storage_type(storage_type, small_classifier_and_preds):
    model_path, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              output_class=True,
                              storage_type=storage_type,
                              threshold=0.50)
    xgb_preds_int = np.around(xgb_preds)
    fil_preds = np.asarray(fm.predict(X))
    assert np.allclose(fil_preds, xgb_preds_int, 1e-3)
def test_fil_skl_regression(n_rows, n_columns, n_estimators, max_depth,
                            storage_type, model_class):
    # skip depth 20 for dense tests
    if max_depth == 20 and storage_type == 'DENSE':
        return

    # settings
    n_categories = 1
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows, n_columns, n_categories,
                         random_state=random_state,
                         classification=False)
    # identify shape and indices
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    init_kwargs = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
    }
    if model_class == RandomForestRegressor:
        init_kwargs['max_features'] = 0.3
        init_kwargs['n_jobs'] = -1
    else:
        # model_class == GradientBoostingRegressor
        init_kwargs['init'] = 'zero'

    skl_model = model_class(**init_kwargs)
    skl_model.fit(X_train, y_train)

    skl_preds = skl_model.predict(X_validation)
    skl_mse = mean_squared_error(y_validation, skl_preds)

    algo = 'NAIVE' if storage_type == 'SPARSE' else 'BATCH_TREE_REORG'

    fm = ForestInference.load_from_sklearn(skl_model,
                                           algo=algo,
                                           output_class=False,
                                           storage_type=storage_type)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(skl_preds))

    fil_mse = mean_squared_error(y_validation, fil_preds)

    # if fil is better than skl, no need to fail the test
    assert fil_mse <= skl_mse * (1. + 1e-7) + 1e-4
    assert array_equal(fil_preds, skl_preds)
def test_thresholding(output_class, small_classifier_and_preds):
    model_path, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=output_class,
                              threshold=0.50)
    fil_preds = np.asarray(fm.predict(X))
    if output_class:
        assert ((fil_preds != 0.0) & (fil_preds != 1.0)).sum() == 0
    else:
        assert ((fil_preds != 0.0) & (fil_preds != 1.0)).sum() > 0
def test_output_algos(algo, small_classifier_and_preds):
    model_path, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              algo=algo,
                              output_class=True,
                              threshold=0.50)
    xgb_preds_int = np.around(xgb_preds)
    fil_preds = np.asarray(fm.predict(X))
    fil_preds = np.reshape(fil_preds, np.shape(xgb_preds_int))
    assert np.allclose(fil_preds, xgb_preds_int, 1e-3)
def test_output_args(small_classifier_and_preds):
    model_path, model_type, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              model_type=model_type,
                              algo='TREE_REORG',
                              output_class=False,
                              threshold=0.50)
    X = np.asarray(X)
    fil_preds = fm.predict(X)
    fil_preds = np.reshape(fil_preds, np.shape(xgb_preds))
    assert array_equal(fil_preds, xgb_preds, 1e-3)
def test_fil_classification(n_rows, n_columns, num_rounds, n_classes,
                            tmp_path):
    # settings
    classification = True  # change this to false to use regression
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows, n_columns, n_classes,
                         random_state=random_state,
                         classification=classification)
    # identify shape and indices
    n_rows, n_columns = X.shape
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    model_path = os.path.join(tmp_path, 'xgb_class.model')

    bst = _build_and_save_xgboost(model_path, X_train, y_train,
                                  num_rounds=num_rounds,
                                  classification=classification,
                                  n_classes=n_classes)

    dvalidation = xgb.DMatrix(X_validation, label=y_validation)

    if n_classes == 2:
        xgb_preds = bst.predict(dvalidation)
        xgb_preds_int = np.around(xgb_preds)
        xgb_proba = np.stack([1 - xgb_preds, xgb_preds], axis=1)
    else:
        xgb_proba = bst.predict(dvalidation)
        xgb_preds_int = xgb_proba.argmax(axis=1)
    xgb_acc = accuracy_score(y_validation, xgb_preds_int)

    fm = ForestInference.load(model_path,
                              algo='auto',
                              output_class=True,
                              threshold=0.50)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_proba = np.asarray(fm.predict_proba(X_validation))

    fil_acc = accuracy_score(y_validation, fil_preds)

    assert fil_acc == pytest.approx(xgb_acc, abs=0.01)
    assert array_equal(fil_preds, xgb_preds_int)
    np.testing.assert_allclose(fil_proba, xgb_proba,
                               atol=proba_atol[n_classes > 2])
def test_fil_classification(n_rows, n_columns, num_rounds, tmp_path):
    # settings
    classification = True  # change this to false to use regression
    n_rows = n_rows  # we'll use 1 million rows
    n_columns = n_columns
    n_categories = 2
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows, n_columns, n_categories,
                         random_state=random_state,
                         classification=classification)
    # identify shape and indices
    n_rows, n_columns = X.shape
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    model_path = os.path.join(tmp_path, 'xgb_class.model')

    bst = _build_and_save_xgboost(model_path, X_train, y_train,
                                  num_rounds=num_rounds,
                                  classification=classification)

    dvalidation = xgb.DMatrix(X_validation, label=y_validation)
    xgb_preds = bst.predict(dvalidation)

    xgb_preds_int = np.around(xgb_preds)
    xgb_proba = np.stack([1 - xgb_preds, xgb_preds], axis=1)
    xgb_acc = accuracy_score(y_validation, xgb_preds > 0.5)

    fm = ForestInference.load(model_path,
                              algo='BATCH_TREE_REORG',
                              output_class=True,
                              threshold=0.50)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(xgb_preds_int))
    fil_proba = np.asarray(fm.predict_proba(X_validation))
    fil_proba = np.reshape(fil_proba, np.shape(xgb_proba))

    fil_acc = accuracy_score(y_validation, fil_preds)

    assert fil_acc == pytest.approx(xgb_acc, 0.01)
    assert array_equal(fil_preds, xgb_preds_int)
    assert array_equal(fil_proba, xgb_proba)
def test_output_blocks_per_sm(storage_type, blocks_per_sm,
                              small_classifier_and_preds):
    model_path, model_type, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              model_type=model_type,
                              output_class=True,
                              storage_type=storage_type,
                              threshold=0.50,
                              blocks_per_sm=blocks_per_sm)
    xgb_preds_int = np.around(xgb_preds)
    fil_preds = np.asarray(fm.predict(X))
    fil_preds = np.reshape(fil_preds, np.shape(xgb_preds_int))
    assert np.allclose(fil_preds, xgb_preds_int, 1e-3)
def test_output_args(format, small_classifier_and_preds):
    model_path, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=False,
                              threshold=0.50)
    if format == 'numpy':
        X = np.asarray(X)
    elif format == 'cudf':
        X = cudf.DataFrame.from_gpu_matrix(
            cuda.to_device(np.ascontiguousarray(X)))
    elif format == 'gpuarray':
        X = cuda.to_device(np.ascontiguousarray(X))
    else:
        assert False
    fil_preds = fm.predict(X)
    assert np.allclose(fil_preds, xgb_preds, 1e-3)
def test_lightgbm(tmp_path):
    import lightgbm as lgb

    X, y = simulate_data(500, 10,
                         random_state=43210,
                         classification=True)
    train_data = lgb.Dataset(X, label=y)
    param = {'objective': 'binary', 'metric': 'binary_logloss'}
    num_round = 5
    bst = lgb.train(param, train_data, num_round)
    gbm_preds = bst.predict(X)
    model_path = str(os.path.join(tmp_path, 'lgb.model'))
    bst.save_model(model_path)
    fm = ForestInference.load(model_path,
                              algo='TREE_REORG',
                              output_class=False,
                              model_type="lightgbm")
    fil_preds = np.asarray(fm.predict(X))
    assert np.allclose(gbm_preds, fil_preds, 1e-3)
def load_trained_model(): """ Cached loading of trained [ XGBoost or RandomForest ] model into memory Note: Models selected via filename parsing, edit if necessary """ xgb_models = glob.glob('/opt/ml/model/*_xgb') rf_models = glob.glob('/opt/ml/model/*_rf') kmeans_models = glob.glob('/opt/ml/model/*_kmeans') app.logger.info(f'detected xgboost models : {xgb_models}') app.logger.info(f'detected randomforest models : {rf_models}') app.logger.info(f'detected kmeans models : {kmeans_models}\n\n') model_type = None start_time = time.perf_counter() if len(xgb_models): model_type = 'XGBoost' model_filename = xgb_models[0] if GPU_INFERENCE_FLAG: # FIL reloaded_model = ForestInference.load(model_filename) else: # native XGBoost reloaded_model = xgboost.Booster() reloaded_model.load_model(fname=model_filename) elif len(rf_models): model_type = 'RandomForest' model_filename = rf_models[0] reloaded_model = joblib.load(model_filename) elif len(kmeans_models): model_type = 'KMeans' model_filename = kmeans_models[0] reloaded_model = joblib.load(model_filename) else: raise Exception('! No trained models detected') exec_time = time.perf_counter() - start_time app.logger.info(f'> model {model_filename} ' f'loaded in {exec_time:.5f} s \n') return reloaded_model, model_type, model_filename
def test_threads_per_tree(threads_per_tree, small_classifier_and_preds):
    model_path, model_type, X, xgb_preds = small_classifier_and_preds
    fm = ForestInference.load(model_path,
                              model_type=model_type,
                              output_class=True,
                              storage_type='auto',
                              threshold=0.50,
                              threads_per_tree=threads_per_tree,
                              n_items=1)
    fil_preds = np.asarray(fm.predict(X))
    fil_proba = np.asarray(fm.predict_proba(X))

    xgb_proba = np.stack([1 - xgb_preds, xgb_preds], axis=1)
    np.testing.assert_allclose(fil_proba, xgb_proba,
                               atol=proba_atol[False])

    xgb_preds_int = np.around(xgb_preds)
    fil_preds = np.reshape(fil_preds, np.shape(xgb_preds_int))
    assert np.allclose(fil_preds, xgb_preds_int, 1e-3)
def test_fil_regression(n_rows, n_columns, num_rounds, tmp_path, max_depth):
    # settings
    classification = False  # change this to true to use classification
    n_rows = n_rows  # we'll use 1 million rows
    n_columns = n_columns
    random_state = np.random.RandomState(43210)

    X, y = simulate_data(n_rows, n_columns,
                         random_state=random_state,
                         classification=classification,
                         bias=10.0)
    # identify shape and indices
    n_rows, n_columns = X.shape
    train_size = 0.80

    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=train_size, random_state=0)

    model_path = os.path.join(tmp_path, 'xgb_reg.model')
    bst = _build_and_save_xgboost(model_path, X_train, y_train,
                                  classification=classification,
                                  num_rounds=num_rounds,
                                  xgboost_params={'max_depth': max_depth})

    dvalidation = xgb.DMatrix(X_validation, label=y_validation)
    xgb_preds = bst.predict(dvalidation)

    xgb_mse = mean_squared_error(y_validation, xgb_preds)
    fm = ForestInference.load(model_path,
                              algo='BATCH_TREE_REORG',
                              output_class=False)
    fil_preds = np.asarray(fm.predict(X_validation))
    fil_preds = np.reshape(fil_preds, np.shape(xgb_preds))
    fil_mse = mean_squared_error(y_validation, fil_preds)

    assert fil_mse == pytest.approx(xgb_mse, 0.01)
    assert array_equal(fil_preds, xgb_preds)
def load_trained_model(): """ Cached loading of trained [ XGBoost or RandomForest ] model into memory Note: Models selected via filename parsing, edit if necessary """ xgb_models = glob.glob("/opt/ml/model/*_xgb") rf_models = glob.glob("/opt/ml/model/*_rf") app.logger.info(f"detected xgboost models : {xgb_models}") app.logger.info(f"detected randomforest models : {rf_models}\n\n") model_type = None start_time = time.perf_counter() if len(xgb_models): model_type = "XGBoost" model_filename = xgb_models[0] if GPU_INFERENCE_FLAG: # FIL reloaded_model = ForestInference.load(model_filename) else: # native XGBoost reloaded_model = xgboost.Booster() reloaded_model.load_model(fname=model_filename) elif len(rf_models): model_type = "RandomForest" model_filename = rf_models[0] reloaded_model = joblib.load(model_filename) else: raise Exception("! No trained models detected") exec_time = time.perf_counter() - start_time app.logger.info(f"> model {model_filename} " f"loaded in {exec_time:.5f} s \n") return reloaded_model, model_type, model_filename
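# A minimal sketch of how a request handler might dispatch on the tuple
# returned above; the handler name and the NumPy-array input format are
# assumptions for illustration, not part of the application code.
def _example_handle_request(data):
    reloaded_model, model_type, _ = load_trained_model()
    if model_type == "XGBoost":
        if GPU_INFERENCE_FLAG:
            # FIL model accepts the array directly
            preds = reloaded_model.predict(data)
        else:
            # native XGBoost expects a DMatrix
            preds = reloaded_model.predict(xgboost.DMatrix(data))
    else:
        # RandomForest model loaded via joblib
        preds = reloaded_model.predict(data)
    return preds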
def test_lightgbm(tmp_path, num_classes, n_categorical):
    import lightgbm as lgb

    if n_categorical > 0:
        n_features = 10
        n_rows = 1000
        n_informative = n_features
    else:
        n_features = 10 if num_classes == 2 else 50
        n_rows = 500
        n_informative = 'auto'

    X, y = simulate_data(n_rows,
                         n_features,
                         num_classes,
                         n_informative=n_informative,
                         random_state=43210,
                         classification=True)
    if n_categorical > 0:
        X_fit, X_predict = to_categorical(X,
                                          n_categorical=n_categorical,
                                          invalid_frac=0.1,
                                          random_state=43210)
    else:
        X_fit, X_predict = X, X

    train_data = lgb.Dataset(X_fit, label=y)
    num_round = 5
    model_path = str(os.path.join(tmp_path, 'lgb.model'))

    if num_classes == 2:
        param = {'objective': 'binary',
                 'metric': 'binary_logloss',
                 'num_class': 1}
        bst = lgb.train(param, train_data, num_round)
        bst.save_model(model_path)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        # binary classification
        gbm_proba = bst.predict(X_predict)
        fil_proba = fm.predict_proba(X_predict)[:, 1]
        gbm_preds = (gbm_proba > 0.5).astype(float)
        fil_preds = fm.predict(X_predict)
        assert array_equal(gbm_preds, fil_preds)
        np.testing.assert_allclose(gbm_proba, fil_proba,
                                   atol=proba_atol[num_classes > 2])
    else:
        # multi-class classification
        lgm = lgb.LGBMClassifier(objective='multiclass',
                                 boosting_type='gbdt',
                                 n_estimators=num_round)
        lgm.fit(X_fit, y)
        lgm.booster_.save_model(model_path)
        lgm_preds = lgm.predict(X_predict).astype(int)
        fm = ForestInference.load(model_path,
                                  algo='TREE_REORG',
                                  output_class=True,
                                  model_type="lightgbm")
        assert array_equal(lgm.booster_.predict(X_predict).argmax(axis=1),
                           lgm_preds)
        assert array_equal(lgm_preds, fm.predict(X_predict))
        # lightgbm uses float64 thresholds, while FIL uses float32
        np.testing.assert_allclose(lgm.predict_proba(X_predict),
                                   fm.predict_proba(X_predict),
                                   atol=proba_atol[num_classes > 2])