def make_annotation(model, dtrain_path, annotation_path):
    """Annotate the branches of ``model`` and save the annotation to disk.

    ``dtrain_path`` is resolved relative to ``tests/examples/`` under the
    current working directory. ``annotation_path`` is passed to the
    annotator's ``save`` unchanged.
    """
    examples_dir = os.path.abspath(
        os.path.join(os.getcwd(), 'tests/examples/'))
    training_matrix = treelite.DMatrix(
        os.path.join(examples_dir, dtrain_path))

    branch_annotator = treelite.Annotator()
    branch_annotator.annotate_branch(
        model=model, dmat=training_matrix, verbose=True)
    branch_annotator.save(path=annotation_path)
def run_pipeline_test(model, dtest_path, libname_fmt, expected_prob_path,
                      expected_margin_path, multiclass, use_annotation,
                      use_quantize):
    """Compile ``model`` with every compatible toolchain and check its output.

    All data paths are resolved relative to ``tests/examples/`` under the
    current working directory. For each toolchain the model is exported to a
    shared library, loaded through the runtime predictor, and both the
    default predictions and the margin predictions are compared against the
    expected values with ``np.allclose``.

    ``use_annotation`` (when not None) is forwarded as the ``annotate_in``
    parameter; a truthy ``use_quantize`` sets ``quantize`` to 1. When
    ``multiclass`` is truthy the expected arrays are reshaped to one row per
    test instance.
    """
    examples_dir = os.path.abspath(
        os.path.join(os.getcwd(), 'tests/examples/'))
    test_file = os.path.join(examples_dir, dtest_path)
    libpath = libname(libname_fmt)
    test_matrix = treelite.DMatrix(test_file)
    batch = treelite.runtime.Batch.from_csr(test_matrix)

    expected_prob = load_txt(os.path.join(examples_dir, expected_prob_path))
    expected_margin = load_txt(
        os.path.join(examples_dir, expected_margin_path))
    if multiclass:
        # One row per test instance; the column count is inferred.
        per_row = (test_matrix.shape[0], -1)
        expected_prob = expected_prob.reshape(per_row)
        expected_margin = expected_margin.reshape(per_row)

    params = {}
    if use_annotation is not None:
        params['annotate_in'] = use_annotation
    if use_quantize:
        params['quantize'] = 1

    tolerance = {'atol': 1e-11, 'rtol': 1e-6}
    for toolchain in os_compatible_toolchains():
        model.export_lib(toolchain=toolchain, libpath=libpath,
                         params=params, verbose=True)
        predictor = treelite.runtime.Predictor(libpath=libpath, verbose=True)

        actual_prob = predictor.predict(batch)
        assert np.allclose(actual_prob, expected_prob, **tolerance)

        actual_margin = predictor.predict(batch, pred_margin=True)
        assert np.allclose(actual_margin, expected_margin, **tolerance)
def check_predictor(predictor, dataset):
    """Verify that ``predictor`` yields the expected output for ``dataset``.

    The test matrix of the dataset is loaded from ``dataset_db``; margin and
    default predictions are computed and handed, together with the matrix
    shape, to ``check_predictor_output``.
    """
    test_matrix = treelite.DMatrix(dataset_db[dataset].dtest)
    csr_batch = treelite_runtime.Batch.from_csr(test_matrix)
    margin_scores = predictor.predict(csr_batch, pred_margin=True)
    probabilities = predictor.predict(csr_batch)
    check_predictor_output(
        dataset, test_matrix.shape, margin_scores, probabilities)
def test_srcpkg(self):
    """Test feature to export a source package

    The mushroom model is exported as a zipped source package, unpacked into
    the current directory, built with ``make``, and the resulting library is
    loaded to compare its predictions with the stored expected
    probabilities.
    """
    model_path = os.path.join(dpath, 'mushroom/mushroom.model')
    dmat_path = os.path.join(dpath, 'mushroom/agaricus.test')
    libpath = libname('./mushroom/mushroom{}')
    model = treelite.Model.load(model_path, model_format='xgboost')

    # Only the first compatible toolchain is exercised here.
    toolchain = os_compatible_toolchains()[0]
    model.export_srcpkg(platform=os_platform(), toolchain=toolchain,
                        pkgpath='./srcpkg.zip', libname=libpath,
                        params={}, verbose=True)
    with ZipFile('./srcpkg.zip', 'r') as zip_ref:
        zip_ref.extractall('.')

    # check_call (not call) so that a failed build stops the test right here
    # with the make exit status, instead of surfacing later as an unrelated
    # library-loading error.
    subprocess.check_call(['make', '-C', 'mushroom'])

    predictor = treelite.runtime.Predictor(libpath='./mushroom', verbose=True)

    X, _ = load_svmlight_file(dmat_path, zero_based=True)
    dmat = treelite.DMatrix(X)
    batch = treelite.runtime.Batch.from_csr(dmat)

    expected_prob_path = os.path.join(dpath, 'mushroom/agaricus.test.prob')
    expected_prob = load_txt(expected_prob_path)
    out_prob = predictor.predict(batch)
    assert_almost_equal(out_prob, expected_prob)
def test_skl_converter_regressor(tmpdir, clazz, toolchain):
    # pylint: disable=too-many-locals
    """Convert scikit-learn regressor

    Fits ``clazz`` on the Boston data, imports it into treelite, annotates
    and compiles it with ``toolchain``, and checks both the metadata and the
    predictions of the compiled predictor against the scikit-learn model.
    """
    X, y = load_boston(return_X_y=True)
    kwargs = {}
    if clazz == GradientBoostingRegressor:
        kwargs['init'] = 'zero'
    clf = clazz(max_depth=3, random_state=0, n_estimators=10, **kwargs)
    clf.fit(X, y)
    expected_pred = clf.predict(X)

    # Properties of the imported (not yet compiled) model
    model = treelite.sklearn.import_model(clf)
    assert model.num_feature == clf.n_features_
    assert model.num_output_group == 1
    assert model.num_tree == clf.n_estimators

    dtrain = treelite.DMatrix(X)
    annotation_path = os.path.join(tmpdir, 'annotation.json')
    annotator = treelite.Annotator()
    annotator.annotate_branch(model=model, dmat=dtrain, verbose=True)
    annotator.save(path=annotation_path)

    libpath = os.path.join(tmpdir, 'skl' + _libext())
    model.export_lib(toolchain=toolchain, libpath=libpath,
                     params={'annotate_in': annotation_path}, verbose=True)

    # Properties of the compiled predictor. The output-group count is read
    # from the predictor here; the model's own count was asserted above.
    predictor = treelite_runtime.Predictor(libpath=libpath)
    assert predictor.num_feature == clf.n_features_
    assert predictor.num_output_group == 1
    assert predictor.pred_transform == 'identity'
    assert predictor.global_bias == 0.0
    assert predictor.sigmoid_alpha == 1.0

    batch = treelite_runtime.Batch.from_npy2d(X)
    out_pred = predictor.predict(batch)
    np.testing.assert_almost_equal(out_pred, expected_pred, decimal=5)
def run_pipeline_test(model, dtest_path, libname_fmt, expected_prob_path,
                      expected_margin_path, multiclass, use_annotation=None,
                      use_quantize=None, use_parallel_comp=None,
                      use_code_folding=None, use_toolchains=None,
                      use_compiler=None):
    """Compile ``model`` with the selected toolchains and check its output.

    Data paths are resolved relative to ``tests/examples/`` under the current
    working directory. ``expected_prob_path`` may be None; the probability
    comparison runs only when the loaded expected probabilities are not None
    (this presumes ``load_txt`` accepts a None path -- confirm against its
    definition).

    Optional compiler parameters are forwarded only when given:
    ``annotate_in``, ``quantize`` (set to 1 when ``use_quantize`` is truthy),
    ``parallel_comp`` and ``code_folding_req``. When ``use_compiler`` is None
    the default of ``export_lib``'s ``compiler`` parameter is used. When
    ``use_toolchains`` is None all OS-compatible toolchains are tried;
    otherwise every ``'gcc'`` entry is replaced by the ``GCC_PATH``
    environment variable (falling back to ``'gcc'``).
    """
    examples_dir = os.path.abspath(
        os.path.join(os.getcwd(), 'tests/examples/'))
    dtest_path = os.path.join(examples_dir, dtest_path)
    libpath = libname(libname_fmt)
    test_matrix = treelite.DMatrix(dtest_path)
    batch = treelite.runtime.Batch.from_csr(test_matrix)

    if expected_prob_path is not None:
        expected_prob_path = os.path.join(examples_dir, expected_prob_path)
    expected_margin_path = os.path.join(examples_dir, expected_margin_path)
    expected_prob = load_txt(expected_prob_path)
    expected_margin = load_txt(expected_margin_path)
    if multiclass:
        # One row per test instance; the column count is inferred.
        per_row = (test_matrix.shape[0], -1)
        if expected_prob is not None:
            expected_prob = expected_prob.reshape(per_row)
        expected_margin = expected_margin.reshape(per_row)

    params = {}
    if use_annotation is not None:
        params['annotate_in'] = use_annotation
    if use_quantize:
        params['quantize'] = 1
    if use_parallel_comp is not None:
        params['parallel_comp'] = use_parallel_comp
    if use_code_folding is not None:
        params['code_folding_req'] = use_code_folding

    if use_compiler is None:
        # Fall back to whatever export_lib declares as its default compiler.
        import inspect
        export_signature = inspect.signature(model.export_lib)
        use_compiler = export_signature.parameters['compiler'].default

    if use_toolchains is not None:
        gcc = os.environ.get('GCC_PATH', 'gcc')
        toolchains = [gcc if name == 'gcc' else name
                      for name in use_toolchains]
    else:
        toolchains = os_compatible_toolchains()

    for toolchain in toolchains:
        model.export_lib(toolchain=toolchain, libpath=libpath,
                         compiler=use_compiler, params=params, verbose=True)
        predictor = treelite.runtime.Predictor(libpath=libpath, verbose=True)

        actual_prob = predictor.predict(batch)
        if expected_prob is not None:
            assert_almost_equal(actual_prob, expected_prob)

        actual_margin = predictor.predict(batch, pred_margin=True)
        assert_almost_equal(actual_margin, expected_margin)
def compute_annotation(dataset):
    """Annotate the model of ``dataset`` and return the annotation as text.

    The model is loaded from the ``dataset_db`` entry. If the entry has no
    training data (``dtrain`` is None) the function returns None. Otherwise
    the branch annotation is written to ``<dataset>.json`` inside ``tmpdir``
    and the contents of that file are returned.

    ``tmpdir`` is not a parameter; it is taken from the surrounding scope.
    """
    entry = dataset_db[dataset]
    model = treelite.Model.load(entry.model, model_format=entry.format)
    if entry.dtrain is None:
        return None

    training_matrix = treelite.DMatrix(entry.dtrain)
    branch_annotator = treelite.Annotator()
    branch_annotator.annotate_branch(
        model=model, dmat=training_matrix, verbose=True)

    annotation_path = os.path.join(tmpdir, f'{dataset}.json')
    branch_annotator.save(annotation_path)
    with open(annotation_path, 'r') as annotation_file:
        return annotation_file.read()
def run_pipeline_test(model, dtest_path, libname_fmt, expected_prob_path,
                      expected_margin_path, multiclass, use_annotation,
                      use_quantize, use_parallel_comp, use_code_folding=None,
                      use_all_toolchains=True):
    """Compile ``model`` and compare its predictions with expected values.

    Data paths are resolved relative to ``tests/examples/`` under the current
    working directory. ``expected_prob_path`` may be None; the probability
    comparison runs only when the loaded expected probabilities are not None
    (this presumes ``load_txt`` accepts a None path -- confirm against its
    definition).

    Optional compiler parameters are forwarded only when given:
    ``annotate_in``, ``quantize`` (set to 1 when ``use_quantize`` is truthy),
    ``parallel_comp`` and ``code_folding_req``. With ``use_all_toolchains``
    falsy only the first OS-compatible toolchain is used.
    """
    examples_dir = os.path.abspath(
        os.path.join(os.getcwd(), 'tests/examples/'))
    dtest_path = os.path.join(examples_dir, dtest_path)
    libpath = libname(libname_fmt)
    test_matrix = treelite.DMatrix(dtest_path)
    batch = treelite.runtime.Batch.from_csr(test_matrix)

    if expected_prob_path is not None:
        expected_prob_path = os.path.join(examples_dir, expected_prob_path)
    expected_margin_path = os.path.join(examples_dir, expected_margin_path)
    expected_prob = load_txt(expected_prob_path)
    expected_margin = load_txt(expected_margin_path)
    if multiclass:
        # One row per test instance; the column count is inferred.
        per_row = (test_matrix.shape[0], -1)
        if expected_prob is not None:
            expected_prob = expected_prob.reshape(per_row)
        expected_margin = expected_margin.reshape(per_row)

    params = {}
    if use_annotation is not None:
        params['annotate_in'] = use_annotation
    if use_quantize:
        params['quantize'] = 1
    if use_parallel_comp is not None:
        params['parallel_comp'] = use_parallel_comp
    if use_code_folding is not None:
        params['code_folding_req'] = use_code_folding

    available = os_compatible_toolchains()
    toolchains = available if use_all_toolchains else [available[0]]

    for toolchain in toolchains:
        model.export_lib(toolchain=toolchain, libpath=libpath,
                         params=params, verbose=True)
        predictor = treelite.runtime.Predictor(libpath=libpath, verbose=True)

        actual_prob = predictor.predict(batch)
        if expected_prob is not None:
            assert_almost_equal(actual_prob, expected_prob)

        actual_margin = predictor.predict(batch, pred_margin=True)
        assert_almost_equal(actual_margin, expected_margin)
def test_lightgbm_binary_classification(self):
    """Train LightGBM binary classifiers and check the compiled predictions.

    For each of the objectives ``binary``, ``xentlambda`` and ``xentropy`` a
    booster is trained on the mushroom data, saved, loaded into treelite and
    compiled with every compatible toolchain. Probabilities and margins from
    the compiled library must match LightGBM's own predictions.
    """
    try:
        import lightgbm
    except ImportError:
        # skip this test if LightGBM is not installed
        raise nose.SkipTest()

    train_file = os.path.join(dpath, 'mushroom/agaricus.train')
    test_file = os.path.join(dpath, 'mushroom/agaricus.test')
    train_set = lightgbm.Dataset(train_file)
    test_set = lightgbm.Dataset(test_file, reference=train_set)
    valid_sets = [train_set, test_set]
    valid_names = ['train', 'test']
    param = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'num_leaves': 7,
        'learning_rate': 0.1,
    }
    num_round = 10

    for objective in ('binary', 'xentlambda', 'xentropy'):
        param['objective'] = objective
        booster = lightgbm.train(param, train_set, num_round,
                                 valid_sets, valid_names)
        booster.save_model('./mushroom_lightgbm.txt')

        expected_prob = booster.predict(test_file)
        expected_margin = booster.predict(test_file, raw_score=True)

        model = treelite.Model.load('./mushroom_lightgbm.txt',
                                    model_format='lightgbm')
        libpath = libname('./agaricus_{}{{}}'.format(objective))
        batch = treelite.runtime.Batch.from_csr(treelite.DMatrix(test_file))

        for toolchain in os_compatible_toolchains():
            model.export_lib(toolchain=toolchain, libpath=libpath,
                             params={}, verbose=True)
            predictor = treelite.runtime.Predictor(libpath, verbose=True)

            actual_prob = predictor.predict(batch)
            assert_almost_equal(actual_prob, expected_prob)

            actual_margin = predictor.predict(batch, pred_margin=True)
            assert_almost_equal(actual_margin, expected_margin)
def test_skl_converter_multiclass_classifier(tmpdir, clazz, toolchain):
    # pylint: disable=too-many-locals
    """Convert scikit-learn multi-class classifier

    Fits ``clazz`` on the iris data, imports it into treelite, annotates and
    compiles it with ``toolchain``, then compares the compiled predictor's
    metadata and class probabilities with the scikit-learn model.
    """
    if clazz == RandomForestClassifier:
        pytest.xfail(
            reason='Need to use float64 thresholds to obtain correct result')

    is_boosting = clazz == GradientBoostingClassifier

    X, y = load_iris(return_X_y=True)
    extra_args = {}
    if is_boosting:
        extra_args['init'] = 'zero'
    clf = clazz(max_depth=3, random_state=0, n_estimators=10, **extra_args)
    clf.fit(X, y)
    expected_prob = clf.predict_proba(X)

    # Properties of the imported (not yet compiled) model
    model = treelite.sklearn.import_model(clf)
    assert model.num_feature == clf.n_features_
    assert model.num_output_group == clf.n_classes_
    trees_per_round = clf.n_classes_ if is_boosting else 1
    assert model.num_tree == clf.n_estimators * trees_per_round

    annotation_path = os.path.join(tmpdir, 'annotation.json')
    training_matrix = treelite.DMatrix(X)
    branch_annotator = treelite.Annotator()
    branch_annotator.annotate_branch(
        model=model, dmat=training_matrix, verbose=True)
    branch_annotator.save(path=annotation_path)

    libpath = os.path.join(tmpdir, 'skl' + _libext())
    model.export_lib(toolchain=toolchain, libpath=libpath,
                     params={'annotate_in': annotation_path}, verbose=True)

    # Properties of the compiled predictor
    predictor = treelite_runtime.Predictor(libpath=libpath)
    assert predictor.num_feature == clf.n_features_
    assert predictor.num_output_group == clf.n_classes_
    expected_transform = 'softmax' if is_boosting else 'identity_multiclass'
    assert predictor.pred_transform == expected_transform
    assert predictor.global_bias == 0.0
    assert predictor.sigmoid_alpha == 1.0

    batch = treelite_runtime.Batch.from_npy2d(X)
    actual_prob = predictor.predict(batch)
    np.testing.assert_almost_equal(actual_prob, expected_prob)
def test_lightgbm_binary_classification(tmpdir, objective, toolchain):
    # pylint: disable=too-many-locals
    """Test a binary classifier

    Trains a LightGBM booster with the given ``objective`` on the mushroom
    data, compiles the saved model with ``toolchain`` and checks that the
    compiled probabilities and margins match LightGBM's to 5 decimals.
    """
    mushroom = dataset_db['mushroom']
    model_path = os.path.join(tmpdir, 'mushroom_lightgbm.txt')
    test_file = mushroom.dtest

    train_set = lightgbm.Dataset(mushroom.dtrain)
    test_set = lightgbm.Dataset(test_file, reference=train_set)
    param = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': 'auc',
        'num_leaves': 7,
        'learning_rate': 0.1,
    }
    booster = lightgbm.train(param, train_set, num_boost_round=10,
                             valid_sets=[train_set, test_set],
                             valid_names=['train', 'test'])
    booster.save_model(model_path)

    expected_prob = booster.predict(test_file)
    expected_margin = booster.predict(test_file, raw_score=True)

    model = treelite.Model.load(model_path, model_format='lightgbm')
    libpath = os.path.join(tmpdir, f'agaricus_{objective}' + _libext())
    batch = treelite_runtime.Batch.from_csr(treelite.DMatrix(test_file))
    model.export_lib(toolchain=toolchain, libpath=libpath,
                     params={}, verbose=True)
    predictor = treelite_runtime.Predictor(libpath, verbose=True)

    actual_prob = predictor.predict(batch)
    np.testing.assert_almost_equal(actual_prob, expected_prob, decimal=5)

    actual_margin = predictor.predict(batch, pred_margin=True)
    np.testing.assert_almost_equal(actual_margin, expected_margin, decimal=5)
import treelite.runtime
import xgboost
from fast_svmlight_loader import load_svmlight
import sys
import os
import numpy as np
import time

# Each entry is (training data file, annotation output file,
# compiled library output file, XGBoost model file).
files = [('allstate/allstate0.train.libsvm', 'allstate/allstate.json',
          'allstate/allstate.so', 'allstate/allstate-1600-0.010000.model'),
         ('yahoo/yahoo.train', 'yahoo/yahoo.json', 'yahoo/yahoo.so',
          'yahoo/yahoo-1600-0.01.model')]

# Pass 1: for every entry, load the XGBoost model, annotate its branches
# with the training data, save the annotation, and compile the model with
# gcc into a shared library using quantization and that annotation.
for datafile, annotfile, libfile, modfile in files:
    model = treelite.Model.load(modfile, 'xgboost')
    dmat = treelite.DMatrix(datafile)
    annotator = treelite.Annotator()
    annotator.annotate_branch(model, dmat, verbose=True)
    annotator.save(annotfile)
    model.export_lib(toolchain='gcc', libpath=libfile,
                     params={
                         'quantize': 1,
                         'annotate_in': annotfile
                     },
                     verbose=True)

# Pass 2: reload every data file with the svmlight loader.
# NOTE(review): only the start of this loop is visible here; `start`,
# `libfile` and `modfile` are presumably used by code that follows -- confirm.
for datafile, _, libfile, modfile in files:
    print('Loading data file {}'.format(datafile))
    # Timestamp taken immediately before the data file is loaded.
    start = time.time()
    dmat = load_svmlight(filename=datafile, verbose=False)
def test_model_builder(tmpdir, use_annotation, quantize, toolchain):
    """A simple model

    Builds a two-tree mushroom model by hand with the model builder,
    optionally annotates it, compiles it with ``toolchain`` and checks the
    compiled predictor against the mushroom dataset.
    """
    num_feature = 127
    pred_transform = 'sigmoid'
    builder = treelite.ModelBuilder(num_feature=num_feature,
                                    random_forest=False,
                                    pred_transform=pred_transform)

    # Every test node in this model uses '<' with this same threshold.
    split_threshold = -9.53674316e-07

    def build_tree(node_specs):
        """Create a tree from (key, spec) pairs, applied in the given order.

        A tuple spec ``(feature_id, left_key, right_key)`` makes a numerical
        test node; any other spec is taken as a leaf value.
        """
        tree = treelite.ModelBuilder.Tree()
        for key, spec in node_specs:
            if isinstance(spec, tuple):
                feature_id, left_key, right_key = spec
                tree[key].set_numerical_test_node(
                    feature_id=feature_id, opname='<',
                    threshold=split_threshold, default_left=True,
                    left_child_key=left_key, right_child_key=right_key)
            else:
                tree[key].set_leaf_node(leaf_value=spec)
        tree[0].set_root()
        return tree

    # Build mushroom model
    first_tree = [
        (0, (29, 1, 2)),
        (1, (56, 3, 4)),
        (3, (60, 7, 8)),
        (7, 1.89899647),
        (8, -1.94736838),
        (4, (21, 9, 10)),
        (9, 1.78378379),
        (10, -1.98135197),
        (2, (109, 5, 6)),
        (5, (67, 11, 12)),
        (11, -1.9854598),
        (12, 0.938775539),
        (6, 1.87096775),
    ]
    second_tree = [
        (0, (29, 1, 2)),
        (1, (21, 3, 4)),
        (3, 1.14607906),
        (4, (36, 7, 8)),
        (7, -6.87994671),
        (8, -0.10659159),
        (2, (109, 5, 6)),
        (5, (39, 9, 10)),
        (9, -0.0930657759),
        (10, -1.15261209),
        (6, 1.00423074),
    ]
    builder.append(build_tree(first_tree))
    builder.append(build_tree(second_tree))

    model = builder.commit()
    assert model.num_feature == num_feature
    assert model.num_output_group == 1
    assert model.num_tree == 2

    annotation_path = os.path.join(tmpdir, 'annotation.json')
    if use_annotation:
        training_matrix = treelite.DMatrix(dataset_db['mushroom'].dtrain)
        branch_annotator = treelite.Annotator()
        branch_annotator.annotate_branch(
            model=model, dmat=training_matrix, verbose=True)
        branch_annotator.save(path=annotation_path)

    params = {
        'annotate_in': (annotation_path if use_annotation else 'NULL'),
        'quantize': (1 if quantize else 0),
        'parallel_comp': model.num_tree,
    }

    libpath = os.path.join(tmpdir, 'mushroom' + _libext())
    model.export_lib(toolchain=toolchain, libpath=libpath,
                     params=params, verbose=True)
    predictor = treelite_runtime.Predictor(libpath=libpath, verbose=True)
    check_predictor(predictor, 'mushroom')