def main(readcsv=read_csv, method='defaultDense'):
    """Train a decision forest regression model on sample CSV data and predict.

    Parameters
    ----------
    readcsv : callable
        Reader invoked as ``readcsv(path, columns, t=np.float32)``. The
        objects it returns must expose a ``shape`` attribute.
    method : str
        Computation method forwarded to
        ``d4p.decision_forest_regression_training``.

    Returns
    -------
    tuple
        ``(train_result, predict_result, ptdata)`` where ``ptdata`` is
        column 13 of the test file (the dependent variable).
    """
    infile = "./data/batch/df_regression_train.csv"
    testfile = "./data/batch/df_regression_test.csv"

    # Configure a decision forest regression training object
    train_algo = d4p.decision_forest_regression_training(
        nTrees=100,
        varImportance='MDA_Raw',
        bootstrap=True,
        engine=d4p.engines_mt2203(seed=777),
        resultsToCompute=
        'computeOutOfBagError|computeOutOfBagErrorPerObservation',
        # Forward the caller's choice; it used to be accepted but ignored.
        method=method)

    # Read data. Let's have 13 independent, and 1 dependent variables
    # (for each observation)
    indep_data = readcsv(infile, range(13), t=np.float32)
    dep_data = readcsv(infile, range(13, 14), t=np.float32)
    # Now train/compute, the result provides the model for prediction
    train_result = train_algo.compute(indep_data, dep_data)
    # Training result provides (depending on parameters) model,
    # outOfBagError, outOfBagErrorPerObservation and/or variableImportance

    # Now let's do some prediction
    predict_algo = d4p.decision_forest_regression_prediction()
    # read test data (with same #features)
    pdata = readcsv(testfile, range(13), t=np.float32)
    ptdata = readcsv(testfile, range(13, 14), t=np.float32)
    # now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # Sanity check: one prediction row per test observation and one column
    # per dependent variable.
    assert predict_result.prediction.shape == (pdata.shape[0],
                                               dep_data.shape[1])

    return (train_result, predict_result, ptdata)
Ejemplo n.º 2
0
def df_clsf_fit(X, y, n_classes, n_trees=100, seed=12345,
                n_features_per_node=0, max_depth=0, min_impurity=0,
                bootstrap=True, verbose=False):
    """Train a decision forest classifier and return the training result.

    Parameters
    ----------
    X, y
        Training data and labels, handed to the algorithm's ``compute``.
        ``X`` must expose ``shape``.
    n_classes
        Passed as ``nClasses``.
    n_trees
        Passed as ``nTrees``.
    seed
        Seed for the mt2203 engine.
    n_features_per_node
        Used as ``featuresPerNode`` only when it is positive and smaller
        than ``X.shape[1]``; otherwise ``X.shape[1]`` is used.
    max_depth
        Passed as ``maxTreeDepth``.
    min_impurity
        Passed as ``impurityThreshold``.
    bootstrap
        Passed as ``bootstrap``.
    verbose
        Accepted but not used by this function.

    Returns
    -------
    The object returned by the training algorithm's ``compute(X, y)``.
    """
    data_fptype = getFPType(X)

    total_features = X.shape[1]
    if 0 < n_features_per_node < total_features:
        node_features = n_features_per_node
    else:
        node_features = total_features

    rng_engine = engines_mt2203(seed=seed, fptype=data_fptype)

    trainer = decision_forest_classification_training(
        nClasses=n_classes,
        fptype=data_fptype,
        method='defaultDense',
        nTrees=n_trees,
        observationsPerTreeFraction=1.,
        featuresPerNode=node_features,
        maxTreeDepth=max_depth,
        minObservationsInLeafNode=1,
        engine=rng_engine,
        impurityThreshold=min_impurity,
        varImportance='MDI',
        resultsToCompute='',
        memorySavingMode=False,
        bootstrap=bootstrap,
    )

    return trainer.compute(X, y)
Ejemplo n.º 3
0
def _daal_fit_regressor(self, X, y, sample_weight=None):
    """Train a daal4py decision forest regressor from this estimator's settings.

    Hyper-parameters are read from attributes of ``self``. The trained model
    is stored in ``self.daal_model_`` and ``self._cached_estimators_`` is
    reset to ``None``. When ``self.oob_score`` is truthy,
    ``self.estimators_`` is set from ``self._estimators_`` and
    ``self._set_oob_score(X, y)`` is called.

    Parameters
    ----------
    X
        Training data; must expose a two-element ``shape``.
    y
        Training targets.
    sample_weight : optional
        When not ``None`` it is handed to the algorithm wrapped in a
        one-element list.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``self.oob_score`` is requested while ``self.bootstrap`` is falsy.
    """
    feature_count = X.shape[1]
    self.n_features_ = feature_count
    random_state = check_random_state(self.random_state)

    if self.oob_score and not self.bootstrap:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    engine_fptype = getFPType(X)
    engine_seed = random_state.randint(0, np.iinfo('i').max)
    rng_engine = daal4py.engines_mt2203(seed=engine_seed,
                                        fptype=engine_fptype)

    features_per_node = _to_absolute_max_features(
        self.max_features, feature_count, is_classification=False)

    bootstrap_samples = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    weights = None if sample_weight is None else [sample_weight]

    def _absolute_count(setting):
        # Integral settings are used unchanged; anything else is scaled by
        # the number of rows in X and rounded up.
        if isinstance(setting, numbers.Integral):
            return setting
        return int(ceil(setting * X.shape[0]))

    if self.bootstrap is True:
        tree_fraction = bootstrap_samples
    else:
        tree_fraction = 1.

    # create algorithm
    trainer = daal4py.decision_forest_regression_training(
        fptype=getFPType(X),
        method='defaultDense',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=tree_fraction,
        featuresPerNode=int(features_per_node),
        maxTreeDepth=int(self.max_depth) if self.max_depth is not None else 0,
        minObservationsInLeafNode=_absolute_count(self.min_samples_leaf),
        engine=rng_engine,
        impurityThreshold=(float(self.min_impurity_split)
                           if self.min_impurity_split is not None
                           else 0.0),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=_absolute_count(self.min_samples_split),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=(self.max_leaf_nodes
                      if self.max_leaf_nodes is not None else 0),
    )

    self._cached_estimators_ = None

    training_result = trainer.compute(X, y, weights)

    # keep the resulting model on the estimator
    self.daal_model_ = training_result.model

    # compute oob_score_
    if self.oob_score:
        self.estimators_ = self._estimators_
        self._set_oob_score(X, y)

    return self
Ejemplo n.º 4
0
def compute(train_data, train_labels, predict_data, method='defaultDense'):
    """Train a 100-tree decision forest regressor and predict with it.

    Parameters
    ----------
    train_data, train_labels
        Inputs handed to the training algorithm's ``compute``.
    predict_data
        Input handed to the prediction algorithm's ``compute``.
    method : str
        Passed as ``method`` to the training algorithm.

    Returns
    -------
    tuple
        ``(train_result, predict_result)``.
    """
    # Configure a training object with a fixed-seed mt2203 engine
    rng_engine = d4p.engines_mt2203(seed=777)
    trainer = d4p.decision_forest_regression_training(
        nTrees=100,
        engine=rng_engine,
        varImportance='MDA_Raw',
        bootstrap=True,
        resultsToCompute='computeOutOfBagError|computeOutOfBagErrorPerObservation',
        method=method,
    )

    # Training result provides (depending on parameters) model,
    # outOfBagError, outOfBagErrorPerObservation and/or variableImportance
    training_result = trainer.compute(train_data, train_labels)

    # Predict using the model produced by the training step above
    predictor = d4p.decision_forest_regression_prediction()
    prediction_result = predictor.compute(predict_data, training_result.model)

    return training_result, prediction_result
Ejemplo n.º 5
0
def compute(train_data, train_labels, predict_data):
    """Train a decision forest regressor with the 'hist' method and predict.

    Parameters
    ----------
    train_data, train_labels
        Inputs handed to the training algorithm's ``compute``.
    predict_data
        Input handed to the prediction algorithm's ``compute``.

    Returns
    -------
    tuple
        ``(train_result, predict_result)``.
    """
    # Configure a training object
    training_options = dict(
        method='hist',
        maxBins=256,
        minBinSize=1,
        nTrees=100,
        fptype='float',
        varImportance='MDA_Raw',
        bootstrap=True,
        engine=d4p.engines_mt2203(seed=777),
        resultsToCompute=
        'computeOutOfBagError|computeOutOfBagErrorPerObservation',
    )
    trainer = d4p.decision_forest_regression_training(**training_options)

    # Training result provides (depending on parameters) model,
    # outOfBagError, outOfBagErrorPerObservation and/or variableImportance
    training_result = trainer.compute(train_data, train_labels)

    # Predict using the model produced by the training step above
    predictor = d4p.decision_forest_regression_prediction(fptype='float')
    prediction_result = predictor.compute(predict_data, training_result.model)

    return training_result, prediction_result
Ejemplo n.º 6
0
    def _daal_fit(self, X, y):
        """Validate the inputs and train a daal4py decision forest regressor.

        Sets ``self.n_outputs_``, ``self.n_features_`` and
        ``self.daal_model_``, resets ``self._cached_estimators_`` to
        ``None`` and, when ``self.oob_score`` is truthy, calls
        ``self._set_oob_score(X, y)`` with the validated arrays.

        Parameters
        ----------
        X
            Training data; validated with ``check_array`` for double or
            single precision.
        y
            Targets; a 1-d ``y`` is reshaped to a single column.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``self.oob_score`` is requested while ``self.bootstrap`` is
            falsy.
        """
        self._check_daal_supported_parameters()
        X = check_array(X, dtype=[np.double, np.single])
        y = np.atleast_1d(np.asarray(y))

        is_column_vector = y.ndim == 2 and y.shape[1] == 1
        if is_column_vector:
            warnings.warn(
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples,), for example using ravel().",
                DataConversionWarning, stacklevel=2)

        y = check_array(y, ensure_2d=False, dtype=X.dtype)
        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape (rather than y[:, np.newaxis]) is used here to keep
            # the data contiguous.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        feature_count = X.shape[1]
        self.n_features_ = feature_count
        random_state = check_random_state(self.random_state)

        if self.oob_score and not self.bootstrap:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        engine_fptype = getFPType(X)
        engine_seed = random_state.randint(0, np.iinfo('i').max)
        rng_engine = daal4py.engines_mt2203(seed=engine_seed,
                                            fptype=engine_fptype)

        features_per_node = _to_absolute_max_features(
            self.max_features, feature_count, is_classification=False)

        # create algorithm
        trainer = daal4py.decision_forest_regression_training(
            fptype=getFPType(X),
            method='defaultDense',
            nTrees=int(self.n_estimators),
            observationsPerTreeFraction=1,
            featuresPerNode=int(features_per_node),
            maxTreeDepth=(int(self.max_depth)
                          if self.max_depth is not None else 0),
            minObservationsInLeafNode=1,
            engine=rng_engine,
            impurityThreshold=(float(self.min_impurity_split)
                               if self.min_impurity_split is not None
                               else 0.0),
            varImportance="MDI",
            resultsToCompute="",
            memorySavingMode=False,
            bootstrap=bool(self.bootstrap),
        )

        self._cached_estimators_ = None
        training_result = trainer.compute(X, y)

        # keep the resulting model on the estimator
        self.daal_model_ = training_result.model

        # compute oob_score_
        if self.oob_score:
            self._set_oob_score(X, y)

        return self
Ejemplo n.º 7
0
    def _daal_fit(self, X, y):
        """Validate the inputs and train a daal4py decision forest classifier.

        Sets ``self.n_outputs_``, ``self.n_classes_``, ``self.classes_``,
        ``self.n_features_`` and ``self.daal_model_``, resets
        ``self._cached_estimators_`` to ``None`` and, when
        ``self.oob_score`` is truthy, calls ``self._set_oob_score(X, y)``.

        Parameters
        ----------
        X
            Training data; validated with ``check_array`` for single or
            double precision.
        y
            Class labels; a 1-d ``y`` is reshaped to a single column.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` has more than one output column, or if fewer than two
            classes are found.
        """
        self._check_daal_supported_parameters()
        X = check_array(X, dtype=[np.single, np.double])
        y = np.atleast_1d(np.asarray(y))

        is_column_vector = y.ndim == 2 and y.shape[1] == 1
        if is_column_vector:
            warnings.warn(
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples,), for example using ravel().",
                DataConversionWarning, stacklevel=2)

        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape (rather than y[:, np.newaxis]) is used here to keep
            # the data contiguous.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if self.n_outputs_ != 1:
            message = (self.__class__.__name__ +
                       " does not currently support multi-output data. Consider using OneHotEncoder")
            raise ValueError(message)

        y = check_array(y, ensure_2d=False, dtype=None)
        y, _ = self._validate_y_class_weight(y)
        # Only one output is supported (checked above), so keep the first
        # entry of the per-output containers.
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

        feature_count = X.shape[1]
        self.n_features_ = feature_count

        random_state = check_random_state(self.random_state)
        engine_seed = random_state.randint(0, np.iinfo('i').max)

        if self.n_classes_ < 2:
            raise ValueError("Training data only contain information about one class.")

        # create algorithm
        data_fptype = getFPType(X)
        rng_engine = daal4py.engines_mt2203(seed=engine_seed,
                                            fptype=data_fptype)
        features_per_node = _to_absolute_max_features(
            self.max_features, feature_count, is_classification=True)

        trainer = daal4py.decision_forest_classification_training(
            nClasses=int(self.n_classes_),
            fptype=data_fptype,
            method='defaultDense',
            nTrees=int(self.n_estimators),
            observationsPerTreeFraction=1,
            featuresPerNode=int(features_per_node),
            maxTreeDepth=(int(self.max_depth)
                          if self.max_depth is not None else 0),
            minObservationsInLeafNode=int(self.min_samples_leaf),
            engine=rng_engine,
            impurityThreshold=(float(self.min_impurity_split)
                               if self.min_impurity_split is not None
                               else 0.0),
            varImportance="MDI",
            resultsToCompute="",
            memorySavingMode=False,
            bootstrap=bool(self.bootstrap),
        )
        self._cached_estimators_ = None
        # compute
        training_result = trainer.compute(X, y)

        # keep the resulting model on the estimator
        self.daal_model_ = training_result.model

        # compute oob_score_
        if self.oob_score:
            self._set_oob_score(X, y)

        return self
Ejemplo n.º 8
0
def _daal_fit_regressor(self, X, y, sample_weight=None):
    """Train a daal4py decision forest regressor from this estimator's settings.

    Sets ``self.n_features_in_`` (and ``self.n_features_`` when
    ``sklearn_check_version('1.0')`` is falsy), stores the trained model in
    ``self.daal_model_`` and resets ``self._cached_estimators_`` to
    ``None``. The out-of-bag score is not computed here (that code is
    commented out below).

    Parameters
    ----------
    X
        Training data; must expose a two-element ``shape``.
    y
        Training targets.
    sample_weight : optional
        When not ``None`` it is handed to the algorithm wrapped in a
        one-element list.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``self.oob_score`` is requested while ``self.bootstrap`` is falsy.
    """
    self.n_features_in_ = X.shape[1]
    if not sklearn_check_version('1.0'):
        self.n_features_ = self.n_features_in_

    random_state = check_random_state(self.random_state)

    if self.oob_score and not self.bootstrap:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    engine_fptype = getFPType(X)
    engine_seed = random_state.randint(0, np.iinfo('i').max)

    # limitation on the number of stream for mt2203 is 6024
    # more details here:
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    # Larger forests fall back to the mt19937 engine.
    max_stream_count = 6024
    engine_factory = (daal4py.engines_mt2203
                      if self.n_estimators <= max_stream_count
                      else daal4py.engines_mt19937)
    rng_engine = engine_factory(seed=engine_seed, fptype=engine_fptype)

    features_per_node = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=False)

    bootstrap_samples = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    weights = None if sample_weight is None else [sample_weight]

    def _absolute_count(setting):
        # Integral settings are used unchanged; anything else is scaled by
        # the number of rows in X and rounded up.
        if isinstance(setting, numbers.Integral):
            return setting
        return int(ceil(setting * X.shape[0]))

    if self.bootstrap is True:
        tree_fraction = bootstrap_samples
    else:
        tree_fraction = 1.

    # create algorithm
    trainer = daal4py.decision_forest_regression_training(
        fptype=getFPType(X),
        method=('hist' if daal_check_version((2021, 'P', 200))
                else 'defaultDense'),
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=tree_fraction,
        featuresPerNode=int(features_per_node),
        maxTreeDepth=int(self.max_depth) if self.max_depth is not None else 0,
        minObservationsInLeafNode=_absolute_count(self.min_samples_leaf),
        engine=rng_engine,
        impurityThreshold=(float(self.min_impurity_split)
                           if self.min_impurity_split is not None
                           else 0.0),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=_absolute_count(self.min_samples_split),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=(self.max_leaf_nodes
                      if self.max_leaf_nodes is not None else 0),
        maxBins=self.maxBins,
        minBinSize=self.minBinSize,
    )

    self._cached_estimators_ = None

    training_result = trainer.compute(X, y, weights)

    # keep the resulting model on the estimator
    self.daal_model_ = training_result.model

    # compute oob_score_
    #if self.oob_score:
    #    self.estimators_ = self._estimators_
    #    self._set_oob_score(X, y)

    return self
Ejemplo n.º 9
0
def _daal_fit_classifier(self, X, y, sample_weight=None):
    """Train a daal4py decision forest classifier from this estimator's settings.

    Sets ``self.n_features_``, stores the trained model in
    ``self.daal_model_`` and resets ``self._cached_estimators_`` to
    ``None``. When ``self.oob_score`` is truthy, ``self.estimators_`` is set
    from ``self._estimators_`` and ``self._set_oob_score(X, y)`` is called.

    Parameters
    ----------
    X
        Training data; must expose a two-element ``shape``.
    y
        Class labels; validated with ``check_array`` and
        ``self._validate_y_class_weight``.
    sample_weight : optional
        Multiplied by the expanded class weights when both are present.
        The resulting weights, if any, are handed to the algorithm wrapped
        in a one-element list.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If fewer than two classes are found, or if ``self.oob_score`` is
        requested while ``self.bootstrap`` is falsy.
    """
    y = check_array(y, ensure_2d=False, dtype=None)
    y, expanded_class_weight = self._validate_y_class_weight(y)
    class_count = self.n_classes_[0]
    self.n_features_ = X.shape[1]

    # Combine user-provided sample weights with the class weights.
    if expanded_class_weight is not None:
        sample_weight = (expanded_class_weight if sample_weight is None
                         else sample_weight * expanded_class_weight)
    weights = None if sample_weight is None else [sample_weight]

    random_state = check_random_state(self.random_state)
    engine_seed = random_state.randint(0, np.iinfo('i').max)

    if class_count < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    data_fptype = getFPType(X)
    rng_engine = daal4py.engines_mt2203(seed=engine_seed, fptype=data_fptype)
    features_per_node = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=True)

    bootstrap_samples = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    if self.oob_score and not self.bootstrap:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    def _absolute_count(setting):
        # Integral settings are used unchanged; anything else is scaled by
        # the number of rows in X and rounded up.
        if isinstance(setting, numbers.Integral):
            return setting
        return int(ceil(setting * X.shape[0]))

    if self.bootstrap is True:
        tree_fraction = bootstrap_samples
    else:
        tree_fraction = 1.

    trainer = daal4py.decision_forest_classification_training(
        nClasses=int(class_count),
        fptype=data_fptype,
        method=('hist' if daal_check_version((2021, 'P', 200))
                else 'defaultDense'),
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=tree_fraction,
        featuresPerNode=int(features_per_node),
        maxTreeDepth=int(self.max_depth) if self.max_depth is not None else 0,
        minObservationsInLeafNode=_absolute_count(self.min_samples_leaf),
        engine=rng_engine,
        impurityThreshold=(float(self.min_impurity_split)
                           if self.min_impurity_split is not None
                           else 0.0),
        varImportance="MDI",
        resultsToCompute="",
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=_absolute_count(self.min_samples_split),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=(self.max_leaf_nodes
                      if self.max_leaf_nodes is not None else 0),
        maxBins=self.maxBins,
        minBinSize=self.minBinSize,
    )
    self._cached_estimators_ = None
    # compute
    training_result = trainer.compute(X, y, weights)

    # keep the resulting model on the estimator
    self.daal_model_ = training_result.model

    # compute oob_score_
    if self.oob_score:
        self.estimators_ = self._estimators_
        self._set_oob_score(X, y)

    return self
Ejemplo n.º 10
0
def _daal_fit_classifier(self, X, y, sample_weight=None):
    """Train a daal4py decision forest classifier with the 'hist' method.

    Sets ``self.n_features_in_`` (and ``self.n_features_`` when
    ``sklearn_check_version('1.0')`` is falsy), stores the trained model in
    ``self.daal_model_`` and resets ``self._cached_estimators_`` to
    ``None``. When ``self.oob_score`` is truthy, the out-of-bag results are
    requested from the training algorithm and stored in
    ``self.oob_score_`` and ``self.oob_decision_function_``.

    Parameters
    ----------
    X
        Training data; must expose a two-element ``shape``.
    y
        Class labels; validated with ``check_array`` and
        ``self._validate_y_class_weight``.
    sample_weight : optional
        Multiplied by the expanded class weights when both are present.
        The resulting weights, if any, are handed to the algorithm wrapped
        in a one-element list.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If fewer than two classes are found, or if ``self.oob_score`` is
        requested while ``self.bootstrap`` is falsy.
    """
    y = check_array(y, ensure_2d=False, dtype=None)
    y, expanded_class_weight = self._validate_y_class_weight(y)
    class_count = self.n_classes_[0]
    self.n_features_in_ = X.shape[1]
    if not sklearn_check_version('1.0'):
        self.n_features_ = self.n_features_in_

    # Combine user-provided sample weights with the class weights.
    if expanded_class_weight is not None:
        sample_weight = (expanded_class_weight if sample_weight is None
                         else sample_weight * expanded_class_weight)
    weights = None if sample_weight is None else [sample_weight]

    random_state = check_random_state(self.random_state)
    engine_seed = random_state.randint(0, np.iinfo('i').max)

    if class_count < 2:
        raise ValueError(
            "Training data only contain information about one class.")

    # create algorithm
    data_fptype = getFPType(X)

    # limitation on the number of stream for mt2203 is 6024
    # more details here:
    # https://oneapi-src.github.io/oneDAL/daal/algorithms/engines/mt2203.html
    # Larger forests fall back to the mt19937 engine.
    max_stream_count = 6024
    engine_factory = (daal4py.engines_mt2203
                      if self.n_estimators <= max_stream_count
                      else daal4py.engines_mt19937)
    rng_engine = engine_factory(seed=engine_seed, fptype=data_fptype)

    features_per_node = _to_absolute_max_features(
        self.max_features, X.shape[1], is_classification=True)

    bootstrap_samples = _get_n_samples_bootstrap(
        n_samples=X.shape[0], max_samples=self.max_samples)

    if self.oob_score and not self.bootstrap:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")

    def _absolute_count(setting):
        # Integral settings are used unchanged; anything else is scaled by
        # the number of rows in X and rounded up.
        if isinstance(setting, numbers.Integral):
            return setting
        return int(ceil(setting * X.shape[0]))

    if self.bootstrap is True:
        tree_fraction = bootstrap_samples
    else:
        tree_fraction = 1.

    trainer = daal4py.decision_forest_classification_training(
        nClasses=int(class_count),
        fptype=data_fptype,
        method='hist',
        nTrees=int(self.n_estimators),
        observationsPerTreeFraction=tree_fraction,
        featuresPerNode=int(features_per_node),
        maxTreeDepth=int(self.max_depth) if self.max_depth is not None else 0,
        minObservationsInLeafNode=_absolute_count(self.min_samples_leaf),
        engine=rng_engine,
        impurityThreshold=(float(self.min_impurity_split)
                           if self.min_impurity_split is not None
                           else 0.0),
        varImportance="MDI",
        # Out-of-bag results are only requested when they will be read back.
        resultsToCompute=(
            "computeOutOfBagErrorAccuracy|computeOutOfBagErrorDecisionFunction"
            if self.oob_score
            else ""),
        memorySavingMode=False,
        bootstrap=bool(self.bootstrap),
        minObservationsInSplitNode=_absolute_count(self.min_samples_split),
        minWeightFractionInLeafNode=self.min_weight_fraction_leaf,
        minImpurityDecreaseInSplitNode=self.min_impurity_decrease,
        maxLeafNodes=(self.max_leaf_nodes
                      if self.max_leaf_nodes is not None else 0),
        maxBins=self.maxBins,
        minBinSize=self.minBinSize,
    )
    self._cached_estimators_ = None
    # compute
    training_result = trainer.compute(X, y, weights)

    # keep the resulting model on the estimator
    self.daal_model_ = training_result.model

    if self.oob_score:
        self.oob_score_ = training_result.outOfBagErrorAccuracy[0][0]
        oob_decision = training_result.outOfBagErrorDecisionFunction
        self.oob_decision_function_ = oob_decision
        # Drop a trailing axis of length one.
        if oob_decision.shape[-1] == 1:
            self.oob_decision_function_ = oob_decision.squeeze(axis=-1)

    return self
Ejemplo n.º 11
0
    def daal_fit(self, X, y):
        """Train a daal4py decision forest regressor and build sklearn trees.

        After training, each tree of the daal4py model is converted into a
        ``DecisionTreeRegressor`` whose ``tree_`` is populated from
        ``daal4py.getTreeState``; the list is stored in
        ``self.estimators_``. Also sets ``self.n_outputs_``,
        ``self.n_features_`` and ``self.daal_model_``. When
        ``self.oob_score`` is truthy, ``self._set_oob_score(X, y)`` is
        called.

        Parameters
        ----------
        X
            Training data; validated with ``check_array`` for double or
            single precision.
        y
            Targets; a 1-d ``y`` is reshaped to a single column.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``self.oob_score`` is requested while ``self.bootstrap`` is
            falsy.
        """
        self._check_daal_supported_parameters()
        X = check_array(X, dtype=[np.double, np.single])
        y = np.atleast_1d(y)

        is_column_vector = y.ndim == 2 and y.shape[1] == 1
        if is_column_vector:
            warnings.warn(
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples,), for example using ravel().",
                DataConversionWarning,
                stacklevel=2)

        y = check_array(y, ensure_2d=False, dtype=X.dtype)
        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape (rather than y[:, np.newaxis]) is used here to keep
            # the data contiguous.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]
        feature_count = X.shape[1]
        self.n_features_ = feature_count
        random_state = check_random_state(self.random_state)

        if self.oob_score and not self.bootstrap:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        engine_fptype = getFPType(X)
        engine_seed = random_state.randint(0, np.iinfo('i').max)
        rng_engine = daal4py.engines_mt2203(seed=engine_seed,
                                            fptype=engine_fptype)

        features_per_node = _to_absolute_max_features(
            self.max_features, feature_count, is_classification=False)

        # create algorithm
        trainer = daal4py.decision_forest_regression_training(
            fptype=getFPType(X),
            method='defaultDense',
            nTrees=int(self.n_estimators),
            observationsPerTreeFraction=1,
            featuresPerNode=int(features_per_node),
            maxTreeDepth=(int(self.max_depth)
                          if self.max_depth is not None else 0),
            minObservationsInLeafNode=1,
            engine=rng_engine,
            impurityThreshold=(float(self.min_impurity_split)
                               if self.min_impurity_split is not None
                               else 0.0),
            varImportance="MDI",
            resultsToCompute="",
            memorySavingMode=False,
            bootstrap=bool(self.bootstrap),
        )

        training_result = trainer.compute(X, y)

        # keep the resulting model on the estimator
        model = training_result.model
        self.daal_model_ = model

        # Template estimator; one clone is made per tree of the forest.
        template = DecisionTreeRegressor(
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            min_impurity_split=self.min_impurity_split,
            random_state=None)

        # Each clone's tree_ field is filled with a Tree built from the
        # Intel(R) DAAL solution.
        forest = []
        for tree_index in range(self.n_estimators):
            tree_estimator = clone(template)
            tree_estimator.n_features_ = self.n_features_
            tree_estimator.n_outputs_ = self.n_outputs_

            daal_state = daal4py.getTreeState(model, tree_index)
            tree_state = {
                'max_depth': daal_state.max_depth,
                'node_count': daal_state.node_count,
                'nodes': daal_state.node_ar,
                'values': daal_state.value_ar,
            }

            sk_tree = Tree(self.n_features_, np.array([1], dtype=np.intp),
                           self.n_outputs_)
            tree_estimator.tree_ = sk_tree
            sk_tree.__setstate__(tree_state)
            forest.append(tree_estimator)

        self.estimators_ = forest
        # compute oob_score_
        if self.oob_score:
            self._set_oob_score(X, y)

        return self
Ejemplo n.º 12
0
    def daal_fit(self, X, y):
        """Train a daal4py decision forest classifier and build sklearn trees.

        After training, each tree of the daal4py model is converted into a
        ``DecisionTreeClassifier`` whose ``tree_`` is populated from
        ``daal4py.getTreeState``; the list is stored in
        ``self.estimators_``. Also sets ``self.n_outputs_``,
        ``self.n_classes_``, ``self.classes_``, ``self.n_features_`` and
        ``self.daal_model_``. When ``self.oob_score`` is truthy,
        ``self._set_oob_score(X, y)`` is called.

        Parameters
        ----------
        X
            Training data; validated with ``check_array`` for single or
            double precision.
        y
            Class labels; a 1-d ``y`` is reshaped to a single column.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` has more than one output column, or if fewer than two
            classes are found.
        """
        self._check_daal_supported_parameters()
        _supported_dtypes_ = [np.single, np.double]
        X = check_array(X, dtype=_supported_dtypes_)
        y = np.atleast_1d(y)

        if y.ndim == 2 and y.shape[1] == 1:
            warnings.warn(
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples,), for example using ravel().",
                DataConversionWarning,
                stacklevel=2)

        check_consistent_length(X, y)

        if y.ndim == 1:
            # reshape (rather than y[:, np.newaxis]) is used here to keep
            # the data contiguous.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if self.n_outputs_ != 1:
            _class_name = self.__class__.__name__
            raise ValueError(
                _class_name +
                " does not currently support multi-output data. Consider using OneHotEncoder"
            )

        y = check_array(y, ensure_2d=False, dtype=None)
        y, _ = self._validate_y_class_weight(y)
        # Only one output is supported (checked above), so keep the first
        # entry of the per-output containers.
        self.n_classes_ = self.n_classes_[0]
        self.classes_ = self.classes_[0]

        self.n_features_ = X.shape[1]

        rs_ = check_random_state(self.random_state)
        seed_ = rs_.randint(0, np.iinfo('i').max)

        if self.n_classes_ < 2:
            raise ValueError(
                "Training data only contain information about one class.")

        # create algorithm
        X_fptype = getFPType(X)
        daal_engine_ = daal4py.engines_mt2203(seed=seed_, fptype=X_fptype)
        # This method trains a classifier, so max_features must be resolved
        # with is_classification=True (False was passed here previously).
        _featuresPerNode = _to_absolute_max_features(self.max_features,
                                                     X.shape[1],
                                                     is_classification=True)

        dfc_algorithm = daal4py.decision_forest_classification_training(
            nClasses=int(self.n_classes_),
            fptype=X_fptype,
            method='defaultDense',
            nTrees=int(self.n_estimators),
            observationsPerTreeFraction=1,
            featuresPerNode=int(_featuresPerNode),
            maxTreeDepth=int(0 if self.max_depth is None else self.max_depth),
            minObservationsInLeafNode=1,
            engine=daal_engine_,
            impurityThreshold=float(0.0 if self.min_impurity_split is None else
                                    self.min_impurity_split),
            varImportance="MDI",
            resultsToCompute="",
            memorySavingMode=False,
            bootstrap=bool(self.bootstrap))
        # compute
        dfc_trainingResult = dfc_algorithm.compute(X, y)

        # get resulting model
        model = dfc_trainingResult.model
        self.daal_model_ = model

        # Template estimator; one clone is made per tree of the forest.
        est = DecisionTreeClassifier(
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            min_impurity_split=self.min_impurity_split,
            random_state=None)

        # we need to set est.tree_ field with Trees constructed from
        # Intel(R) DAAL solution
        estimators_ = []
        for i in range(self.n_estimators):
            est_i = clone(est)
            est_i.n_features_ = self.n_features_
            est_i.n_outputs_ = self.n_outputs_
            est_i.classes_ = self.classes_
            est_i.n_classes_ = self.n_classes_
            # treeState members: 'class_count', 'leaf_count', 'max_depth',
            # 'node_ar', 'node_count', 'value_ar'
            tree_i_state_class = daal4py.getTreeState(model, i,
                                                      self.n_classes_)

            tree_i_state_dict = {
                'max_depth': tree_i_state_class.max_depth,
                'node_count': tree_i_state_class.node_count,
                'nodes': tree_i_state_class.node_ar,
                'values': tree_i_state_class.value_ar
            }

            est_i.tree_ = Tree(self.n_features_,
                               np.array([self.n_classes_], dtype=np.intp),
                               self.n_outputs_)
            est_i.tree_.__setstate__(tree_i_state_dict)
            estimators_.append(est_i)

        self.estimators_ = estimators_

        # compute oob_score_
        if self.oob_score:
            self._set_oob_score(X, y)

        return self