def main(readcsv=read_csv, method='defaultDense'):
    """Train a decision forest regressor on the sample CSV data and predict.

    Parameters
    ----------
    readcsv : callable
        Reader invoked as ``readcsv(path, columns, t=dtype)``; the tables it
        returns must expose ``.shape``.
    method : str
        Computation method forwarded to the training algorithm.

    Returns
    -------
    tuple
        ``(train_result, predict_result, ptdata)`` where ``ptdata`` is the
        dependent-variable column read from the test file.
    """
    infile = "./data/batch/df_regression_train.csv"
    testfile = "./data/batch/df_regression_test.csv"

    # Configure a decision forest regression training object
    train_algo = d4p.decision_forest_regression_training(
        nTrees=100,
        varImportance='MDA_Raw',
        bootstrap=True,
        engine=d4p.engines_mt2203(seed=777),
        resultsToCompute=
        'computeOutOfBagError|computeOutOfBagErrorPerObservation',
        # Honour the caller's choice instead of silently ignoring it.
        method=method)

    # Read data. Let's have 13 independent, and 1 dependent variables (for each observation)
    indep_data = readcsv(infile, range(13), t=np.float32)
    dep_data = readcsv(infile, range(13, 14), t=np.float32)
    # Now train/compute, the result provides the model for prediction
    train_result = train_algo.compute(indep_data, dep_data)
    # Training result provides (depending on parameters) model, outOfBagError,
    # outOfBagErrorPerObservation and/or variableImportance

    # Now let's do some prediction
    predict_algo = d4p.decision_forest_regression_prediction()
    # read test data (with same #features)
    pdata = readcsv(testfile, range(13), t=np.float32)
    ptdata = readcsv(testfile, range(13, 14), t=np.float32)
    # now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # Sanity check: one prediction row per test observation, one column per
    # dependent variable.
    assert predict_result.prediction.shape == (pdata.shape[0],
                                               dep_data.shape[1])

    return (train_result, predict_result, ptdata)
def run_inference(num_observations: int = 1000):
    """Benchmark daal4py decision forest regression training.

    Samples data through ``common.get_test_data_df``, trains a 100-tree
    forest ``NUM_LOOPS`` times and summarises the per-row timings with
    ``common.calculate_stats``.

    Args:
        num_observations: Sample size passed to ``common.get_test_data_df``.

    Returns:
        The value returned by ``common.calculate_stats`` for the collected
        per-row timings.
    """
    # Load data
    train_x_df = common.get_test_data_df(X=common.X_df, size=num_observations)
    train_y_df = common.get_test_data_df(X=common.y_df, size=num_observations)
    num_rows = len(train_x_df)
    ######################
    print("_______________________________________")
    print("Total Number of Rows", num_rows)
    per_row_times = []
    for _ in range(NUM_LOOPS):
        # Time algorithm construction together with the training itself.
        start_time = timer()
        forest_trainer = d4p.decision_forest_regression_training(nTrees=100)
        forest_trainer.compute(train_x_df, train_y_df)
        total_time = timer() - start_time

        # NOTE(review): 10e6 is 1e7, not 1e6. The factor is kept unchanged so
        # results stay comparable with earlier runs -- confirm intended unit.
        per_row_times.append(total_time * (10e6) / num_rows)

    return_elem = common.calculate_stats(per_row_times)
    print(num_observations, ", ", return_elem)
    return return_elem
Beispiel #3
0
def df_regr_fit(X,
                y,
                n_trees=100,
                seed=12345,
                n_features_per_node=0,
                max_depth=0,
                min_impurity=0,
                bootstrap=True):
    """Train a decision forest regression model on ``X`` and ``y``.

    ``n_features_per_node`` is used only when it lies in ``1..X.shape[1]``;
    any other value falls back to considering every feature at each node.
    Returns the result of the training algorithm's ``compute`` call.
    """
    fptype = getFPType(X)

    total_features = X.shape[1]
    if 0 < n_features_per_node <= total_features:
        per_node = n_features_per_node
    else:
        per_node = total_features

    # Random number engine seeded for reproducible forests.
    rng_engine = engines_mt2203(seed=seed, fptype=fptype)

    training_params = dict(
        fptype=fptype,
        method='defaultDense',
        nTrees=n_trees,
        observationsPerTreeFraction=1.,
        featuresPerNode=per_node,
        maxTreeDepth=max_depth,
        minObservationsInLeafNode=1,
        engine=rng_engine,
        impurityThreshold=min_impurity,
        varImportance='MDI',
        resultsToCompute='',
        memorySavingMode=False,
        bootstrap=bootstrap,
    )
    trainer = decision_forest_regression_training(**training_params)
    return trainer.compute(X, y)
Beispiel #4
0
def _daal_fit_regressor(self, X, y, sample_weight=None):
    """Fit the regressor with the daal4py decision forest trainer.

    Parameters
    ----------
    self : estimator
        Supplies the hyper-parameters read below (``n_estimators``,
        ``max_features``, ``max_depth``, ``bootstrap``, ``oob_score``, ...).
    X : array-like exposing ``.shape``
        Training data; ``X.shape[0]`` samples by ``X.shape[1]`` features.
    y : array-like
        Target values, handed to the trainer unchanged.
    sample_weight : array-like or None
        Optional per-sample weights.

    Returns
    -------
    self
        With ``n_features_`` and ``daal_model_`` set.

    Raises
    ------
    ValueError
        If ``oob_score`` is requested while ``bootstrap`` is disabled.
    """
    self.n_features_ = X.shape[1]
    rs_ = check_random_state(self.random_state)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    # Computed once and shared by the engine and the training algorithm.
    X_fptype = getFPType(X)
    seed_ = rs_.randint(0, np.iinfo('i').max)
    daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)

    _featuresPerNode = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=False)

    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples=X.shape[0],
        max_samples=self.max_samples
    )

    if sample_weight is not None:
        # The trainer receives the weights wrapped in a one-element list.
        sample_weight = [sample_weight]

    # create algorithm
    # Integral min_samples_* values are used as absolute counts; any other
    # value is treated as a fraction of the number of samples.
    dfr_algorithm = daal4py.decision_forest_regression_training(
        fptype=X_fptype,
        method='defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=(
            n_samples_bootstrap if self.bootstrap is True else 1.),
        featuresPerNode=int(_featuresPerNode),
        # 0 is passed when max_depth is None.
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=(
            self.min_samples_leaf
            if isinstance(self.min_samples_leaf, numbers.Integral)
            else int(ceil(self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine,
        impurityThreshold=float(
            0.0 if self.min_impurity_split is None
            else self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(
            self.min_samples_split
            if isinstance(self.min_samples_split, numbers.Integral)
            else int(ceil(self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None else self.max_leaf_nodes
    )

    # Drop any estimators cached from a previous fit.
    self._cached_estimators_ = None

    dfr_trainingResult = dfr_algorithm.compute(X, y, sample_weight)

    # get resulting model
    model = dfr_trainingResult.model
    self.daal_model_ = model

    # compute oob_score_
    if self.oob_score:
        # NOTE(review): presumably `_estimators_` builds sklearn trees from
        # `daal_model_` -- confirm against the class definition.
        self.estimators_ = self._estimators_
        self._set_oob_score(X, y)

    return self
Beispiel #5
0
def compute(train_data, train_labels, predict_data, method='defaultDense'):
    """Train a decision forest regressor and predict with the fitted model.

    Returns ``(train_result, predict_result)``. Depending on the configured
    parameters the training result provides model, outOfBagError,
    outOfBagErrorPerObservation and/or variableImportance.
    """
    oob_outputs = 'computeOutOfBagError|computeOutOfBagErrorPerObservation'

    # Training stage: 100 bootstrapped trees driven by a fixed-seed engine.
    trainer = d4p.decision_forest_regression_training(
        nTrees=100,
        engine=d4p.engines_mt2203(seed=777),
        varImportance='MDA_Raw',
        bootstrap=True,
        resultsToCompute=oob_outputs,
        method=method,
    )
    train_result = trainer.compute(train_data, train_labels)

    # Prediction stage: reuse the model produced by the training above.
    predictor = d4p.decision_forest_regression_prediction()
    predict_result = predictor.compute(predict_data, train_result.model)

    return train_result, predict_result
Beispiel #6
0
def compute(train_data, train_labels, predict_data):
    """Train a decision forest regressor with ``method='hist'`` and predict.

    Returns ``(train_result, predict_result)``. Depending on the configured
    parameters the training result provides model, outOfBagError,
    outOfBagErrorPerObservation and/or variableImportance.
    """
    training_options = {
        'method': 'hist',
        'maxBins': 256,
        'minBinSize': 1,
        'nTrees': 100,
        'fptype': 'float',
        'varImportance': 'MDA_Raw',
        'bootstrap': True,
        'engine': d4p.engines_mt2203(seed=777),
        'resultsToCompute':
        'computeOutOfBagError|computeOutOfBagErrorPerObservation',
    }
    trainer = d4p.decision_forest_regression_training(**training_options)
    fit_result = trainer.compute(train_data, train_labels)

    # Predict with the model produced by the training step above.
    predictor = d4p.decision_forest_regression_prediction(fptype='float')
    prediction = predictor.compute(predict_data, fit_result.model)

    return fit_result, prediction
Beispiel #7
0
    def _daal_fit(self, X, y):
        """Validate ``X``/``y`` and train the forest with daal4py.

        Parameters
        ----------
        X : array-like
            Training data, passed through ``check_array`` with double and
            single precision as the accepted dtypes.
        y : array-like
            Target values; converted to ``X``'s dtype and, when 1-D,
            reshaped into a single column.

        Returns
        -------
        self
            With ``n_outputs_``, ``n_features_`` and ``daal_model_`` set.

        Raises
        ------
        ValueError
            If ``oob_score`` is requested while ``bootstrap`` is disabled.
        """
        self._check_daal_supported_parameters()
        _supported_dtypes_ = [np.double, np.single]
        X = check_array(X, dtype=_supported_dtypes_)
        y = np.asarray(y)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn("A column-vector y was passed when a 1d array was"
                          " expected. Please change the shape of y to "
                          "(n_samples,), for example using ravel().",
                          DataConversionWarning, stacklevel=2)

        y = check_array(y, ensure_2d=False, dtype=X.dtype)
        check_consistent_length(X, y)

        if y.ndim == 1:
            # np.reshape is used rather than y[:, np.newaxis] because the
            # latter does not preserve data contiguity.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        self.n_features_ = X.shape[1]
        rs_ = check_random_state(self.random_state)

        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        # Computed once and shared by the engine and the training algorithm.
        X_fptype = getFPType(X)
        seed_ = rs_.randint(0, np.iinfo('i').max)
        daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)

        _featuresPerNode = _to_absolute_max_features(
            self.max_features, X.shape[1], is_classification=False)

        # create algorithm
        dfr_algorithm = daal4py.decision_forest_regression_training(
            fptype=X_fptype,
            method='defaultDense',
            nTrees=int(self.n_estimators),
            observationsPerTreeFraction=1,
            featuresPerNode=int(_featuresPerNode),
            # 0 is passed when max_depth is None.
            maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
            minObservationsInLeafNode=1,
            engine=daal_engine,
            impurityThreshold=float(
                0.0 if self.min_impurity_split is None
                else self.min_impurity_split),
            varImportance="MDI",
            resultsToCompute="",
            memorySavingMode=False,
            bootstrap=bool(self.bootstrap)
        )

        # Drop any estimators cached from a previous fit.
        self._cached_estimators_ = None
        dfr_trainingResult = dfr_algorithm.compute(X, y)

        # get resulting model
        model = dfr_trainingResult.model
        self.daal_model_ = model

        # compute oob_score_
        if self.oob_score:
            self._set_oob_score(X, y)

        return self
Beispiel #8
0
def _daal_fit_regressor(self, X, y, sample_weight=None):
    """Fit the regressor with the daal4py decision forest trainer.

    Parameters
    ----------
    self : estimator
        Supplies the hyper-parameters read below (``n_estimators``,
        ``max_features``, ``max_depth``, ``bootstrap``, ``maxBins``,
        ``minBinSize``, ...).
    X : array-like exposing ``.shape``
        Training data; ``X.shape[0]`` samples by ``X.shape[1]`` features.
    y : array-like
        Target values, handed to the trainer unchanged.
    sample_weight : array-like or None
        Optional per-sample weights.

    Returns
    -------
    self
        With ``n_features_in_`` and ``daal_model_`` set.

    Raises
    ------
    ValueError
        If ``oob_score`` is requested while ``bootstrap`` is disabled.
    """
    self.n_features_in_ = X.shape[1]
    if not sklearn_check_version('1.0'):
        # Also set the older attribute name when the 1.0 version check fails.
        self.n_features_ = self.n_features_in_

    rs_ = check_random_state(self.random_state)

    if not self.bootstrap and self.oob_score:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    # Computed once and shared by the engine and the training algorithm.
    X_fptype = getFPType(X)
    seed_ = rs_.randint(0, np.iinfo('i').max)

    # limitation on the number of stream for mt2203 is 6024
    # more details here:
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    max_stream_count = 6024
    if self.n_estimators <= max_stream_count:
        daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
    else:
        daal_engine = daal4py.engines_mt19937(seed=seed_, fptype=X_fptype)

    _featuresPerNode = _to_absolute_max_features(self.max_features,
                                                 X.shape[1],
                                                 is_classification=False)

    n_samples_bootstrap = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    if sample_weight is not None:
        # The trainer receives the weights wrapped in a one-element list.
        sample_weight = [sample_weight]

    # create algorithm
    # Integral min_samples_* values are used as absolute counts; any other
    # value is treated as a fraction of the number of samples.
    dfr_algorithm = daal4py.decision_forest_regression_training(
        fptype=X_fptype,
        method='hist' if daal_check_version(
            (2021, 'P', 200)) else 'defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=n_samples_bootstrap
        if self.bootstrap is True else 1.,
        featuresPerNode=int(_featuresPerNode),
        # 0 is passed when max_depth is None.
        maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
        minObservationsInLeafNode=(self.min_samples_leaf if isinstance(
            self.min_samples_leaf, numbers.Integral) else int(
                ceil(self.min_samples_leaf * X.shape[0]))),
        engine=daal_engine,
        impurityThreshold=float(0.0 if self.min_impurity_split is None else
                                self.min_impurity_split),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=(self.min_samples_split if isinstance(
            self.min_samples_split, numbers.Integral) else int(
                ceil(self.min_samples_split * X.shape[0]))),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=0 if self.max_leaf_nodes is None else self.max_leaf_nodes,
        maxBins=self.maxBins,
        minBinSize=self.minBinSize)

    # Drop any estimators cached from a previous fit.
    self._cached_estimators_ = None

    dfr_trainingResult = dfr_algorithm.compute(X, y, sample_weight)

    # get resulting model
    model = dfr_trainingResult.model
    self.daal_model_ = model

    # NOTE(review): the OOB score computation below is disabled, so
    # oob_score=True currently produces no oob_score_ here -- confirm whether
    # it is computed elsewhere or should be re-enabled.
    # compute oob_score_
    #if self.oob_score:
    #    self.estimators_ = self._estimators_
    #    self._set_oob_score(X, y)

    return self
Beispiel #9
0
    def daal_fit(self, X, y):
        """Train the forest with daal4py and rebuild sklearn tree estimators.

        ``X`` and ``y`` are validated, a decision forest is trained through
        ``daal4py.decision_forest_regression_training``, and every trained
        tree is converted into a ``DecisionTreeRegressor`` stored in
        ``self.estimators_``.

        Returns ``self``. Raises ``ValueError`` when ``oob_score`` is
        requested while ``bootstrap`` is disabled.
        """
        self._check_daal_supported_parameters()
        _supported_dtypes_ = [np.double, np.single]
        X = check_array(X, dtype=_supported_dtypes_)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn(
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples,), for example using ravel().",
                DataConversionWarning,
                stacklevel=2)

        # Bring y to the same dtype as X.
        y = check_array(y, ensure_2d=False, dtype=X.dtype)
        check_consistent_length(X, y)

        if y.ndim == 1:
            # np.reshape is used rather than y[:, np.newaxis] because the
            # latter does not preserve data contiguity.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        self.n_features_ = X.shape[1]
        rs_ = check_random_state(self.random_state)

        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        X_fptype = getFPType(X)
        # Draw the engine seed from the estimator's random state.
        seed_ = rs_.randint(0, np.iinfo('i').max)
        daal_engine = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)

        _featuresPerNode = _to_absolute_max_features(self.max_features,
                                                     X.shape[1],
                                                     is_classification=False)

        # create algorithm
        dfr_algorithm = daal4py.decision_forest_regression_training(
            fptype=getFPType(X),
            method='defaultDense',
            nTrees=int(self.n_estimators),
            observationsPerTreeFraction=1,
            featuresPerNode=int(_featuresPerNode),
            maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
            minObservationsInLeafNode=1,
            engine=daal_engine,
            impurityThreshold=float(0.0 if self.min_impurity_split is None else
                                    self.min_impurity_split),
            varImportance="MDI",
            resultsToCompute="",
            memorySavingMode=False,
            bootstrap=bool(self.bootstrap))

        dfr_trainingResult = dfr_algorithm.compute(X, y)

        # get resulting model
        model = dfr_trainingResult.model
        self.daal_model_ = model

        # convert model to estimators
        # Template estimator carrying this forest's hyper-parameters; it is
        # cloned once per tree below.
        est = DecisionTreeRegressor(
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            min_impurity_split=self.min_impurity_split,
            random_state=None)

        # we need to set est.tree_ field with Trees constructed from Intel(R) DAAL solution
        estimators_ = []
        for i in range(self.n_estimators):
            est_i = clone(est)
            est_i.n_features_ = self.n_features_
            est_i.n_outputs_ = self.n_outputs_

            # Extract the i-th tree of the DAAL model and repackage it as the
            # state dict handed to Tree.__setstate__ below.
            tree_i_state_class = daal4py.getTreeState(model, i)
            tree_i_state_dict = {
                'max_depth': tree_i_state_class.max_depth,
                'node_count': tree_i_state_class.node_count,
                'nodes': tree_i_state_class.node_ar,
                'values': tree_i_state_class.value_ar
            }

            # Create an empty Tree, then load the DAAL-derived state into it.
            est_i.tree_ = Tree(self.n_features_, np.array([1], dtype=np.intp),
                               self.n_outputs_)
            est_i.tree_.__setstate__(tree_i_state_dict)
            estimators_.append(est_i)

        self.estimators_ = estimators_
        # compute oob_score_
        if self.oob_score:
            self._set_oob_score(X, y)

        return self
from timeit import default_timer as timer

#import xgboost as xgb
from sklearn.metrics import mean_squared_error
import daal4py as d4p
import numpy as np
import pandas as pd

import common
# daal4py initialisation call; it runs at import time, before any algorithm
# object is created.
d4p.daalinit()
# Number of timed repetitions performed per benchmark run.
NUM_LOOPS = 100

print("Computing for Random Forest")
# Train a 100-tree decision forest regressor once at import time; the
# resulting `train_result.model` is what the timed prediction calls use.
MODEL =  d4p.decision_forest_regression_training(nTrees=100)
train_result = MODEL.compute(common.X_df, common.y_df)

def run_inference(num_observations:int = 1000):
    """Run xgboost for specified number of observations"""
    # Load data
    test_df = common.get_test_data_df(X=common.X_df,size = num_observations)
    num_rows = len(test_df)
    ######################
    print("_______________________________________")
    print("Total Number of Rows", num_rows)
    run_times = []
    inference_times = []
    for _ in range(NUM_LOOPS):
        
        start_time = timer()
        predict_algo = d4p.decision_forest_regression_prediction(fptype='float')
        predict_result = predict_algo.compute(test_df, train_result.model)