Example 1
def convert(model, features, target):
    """Convert a LinearSVR model to the protobuf spec.
    Parameters
    ----------
    model: LinearSVR
        A trained LinearSVR model.

    features: [str]
        Name of the input columns.

    target: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not (_HAS_SKLEARN):
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    # Check the scikit learn model
    _sklearn_util.check_expected_type(model, _LinearSVR)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'coef_'))

    return _MLModel(_linear_regression._convert(model, features, target))
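In normal use these converters are reached through coremltools' public scikit-learn entry point rather than called directly. A minimal sketch of that flow, assuming coremltools and scikit-learn are installed; the toy data and feature names are illustrative:

import numpy as np
from sklearn.svm import LinearSVR

import coremltools

# Fit a toy model on random data (illustrative only).
X = np.random.rand(20, 2)
y = np.random.rand(20)
model = LinearSVR().fit(X, y)

# The public API routes a fitted LinearSVR through the converter above.
mlmodel = coremltools.converters.sklearn.convert(model, ['x1', 'x2'], 'y')

The same entry point dispatches to the tree, ensemble, and preprocessing converters shown in the examples that follow.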
Example 2
def convert(model, feature_names, target):
    """Convert a decision tree model to protobuf format.

    Parameters
    ----------
    model: DecisionTreeRegressor
        A trained scikit-learn tree model.

    feature_names: [str]
        Name of the input columns.

    target: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not (_HAS_SKLEARN):
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _tree.DecisionTreeRegressor)
    _sklearn_util.check_fitted(
        model, lambda m: hasattr(m, 'tree_') and model.tree_ is not None)
    return _MLModel(_convert_tree_ensemble(model, feature_names, target))
Example 3
def convert(model, input_name, output_features):
    """Convert a decision tree model to protobuf format.

    Parameters
    ----------
    model: DecisionTreeClassifier
        A trained scikit-learn tree model.

    input_name: str
        Name of the input column.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not (_HAS_SKLEARN):
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _tree.DecisionTreeClassifier)
    _sklearn_util.check_fitted(
        model, lambda m: hasattr(m, 'tree_') and model.tree_ is not None)

    return _MLModel(
        _convert_tree_ensemble(model,
                               input_name,
                               output_features,
                               mode='classifier',
                               class_labels=model.classes_))
def get_input_dimension(model):
    if not (_HAS_SKLEARN):
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'statistics_'))
    return len(model.statistics_)
def update_dimension(model, input_dimension):
    if not (_HAS_SKLEARN):
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'mean_'))
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'scale_'))
    # Nothing to do for this model
    return input_dimension
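Taken together, these helpers implement the dimension-tracking contract shared by the converter modules in this section: get_input_dimension reports the input width a fitted transform expects, and update_dimension maps an input width to the resulting output width. A scaler is width-preserving, so it simply echoes input_dimension; the one-hot encoder's update_dimension in Example 11 returns an expanded width instead.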
def convert(model, input_features, output_features):
    """Convert a DictVectorizer model to the protobuf spec.

    Parameters
    ----------
    model: DictVectorizer
        A fitted DictVectorizer model.

    input_features: list of (str, datatypes.Array)
        A single (name, array) input feature.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not (_HAS_SKLEARN):
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    # Set the interface params.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION

    assert len(input_features) == 1
    assert isinstance(input_features[0][1], datatypes.Array)

    # feature name in and out are the same here
    spec = set_transform_interface_params(spec, input_features,
                                          output_features)

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, Imputer)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'statistics_'))

    if model.axis != 0:
        raise ValueError("Imputation is only supported along axis = 0.")

    # The imputer in our framework only works on single columns, so we
    # need to translate that over.  The easiest way to do that is to put
    # it in a nested pipeline with a feature extractor.

    tr_spec = spec.imputer

    for v in model.statistics_:
        tr_spec.imputedDoubleArray.vector.append(v)

    try:
        tr_spec.replaceDoubleValue = float(model.missing_values)
    except ValueError:
        raise ValueError("Only scalar values or NAN as missing_values "
                         "in _imputer are supported.")

    return _MLModel(spec)
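To make the copied fields concrete, here is a rough numpy mirror of what the resulting spec computes: each entry matching the missing value (NaN here, standing in for replaceDoubleValue) is replaced by the per-column fill value from statistics_. All values are illustrative:

import numpy as np

statistics = np.array([0.5, 1.2, 3.3])  # stand-in for model.statistics_
x = np.array([np.nan, 2.0, np.nan])

mask = np.isnan(x)          # replaceDoubleValue == NaN in this sketch
x[mask] = statistics[mask]  # x is now [0.5, 2.0, 3.3]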
Example 7
def convert(model, feature_names, target):
    """Convert a boosted tree model to protobuf format.

    Parameters
    ----------
    model: GradientBoostingClassifier
        A trained scikit-learn tree model.

    feature_names: [str]
        Name of the input columns.

    target: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not (_HAS_SKLEARN):
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model,
                                      _ensemble.GradientBoostingClassifier)

    def is_gbr_model(m):
        if not hasattr(m, 'estimators_') or m.estimators_ is None:
            return False
        if len(m.estimators_) == 0:
            return False
        for t in m.estimators_.flatten():
            if not hasattr(t, 'tree_') or t.tree_ is None:
                return False
        return True

    _sklearn_util.check_fitted(model, is_gbr_model)
    post_evaluation_transform = None
    if model.n_classes_ == 2:
        base_prediction = [model.init_.prior]
        post_evaluation_transform = 'Regression_Logistic'
    else:
        base_prediction = list(model.init_.priors)
        post_evaluation_transform = 'Classification_SoftMax'
    return _MLModel(
        _convert_tree_ensemble(
            model,
            feature_names,
            target,
            mode='classifier',
            base_prediction=base_prediction,
            class_labels=model.classes_,
            post_evaluation_transform=post_evaluation_transform))
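The two post-evaluation transforms name how the raw tree-ensemble margins become class probabilities. A rough numpy illustration of the distinction, not the converter's own code:

import numpy as np

def regression_logistic(margin):
    # Binary case ('Regression_Logistic'): one margin -> P(positive class).
    return 1.0 / (1.0 + np.exp(-margin))

def classification_softmax(margins):
    # Multiclass case ('Classification_SoftMax'): one margin per class.
    e = np.exp(margins - np.max(margins))
    return e / e.sum()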
Example 8
def get_input_dimension(model):
    if not (_HAS_SKLEARN):
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'active_features_'))
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'n_values_'))

    if model.categorical_features == 'all':
        return len(model.feature_indices_) - 1
    else:
        # This can't be determined from the model alone: the encoder does not
        # track the indices of the non-categorical (pass-through) columns.
        return None
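The 'all' branch works because feature_indices_ stores cumulative one-hot offsets, one entry per categorical column plus a terminator: for example, feature_indices_ == [0, 3, 7] describes two columns occupying one-hot positions [0, 3) and [3, 7), so len(model.feature_indices_) - 1 == 2 recovers the input width.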
def convert(model, input_features, output_features):
    """Convert a boosted tree model to protobuf format.

    Parameters
    ----------
    model: GradientBoostingRegressor
        A trained scikit-learn tree model.

    input_features: [str]
        Name of the input columns.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not (_HAS_SKLEARN):
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model,
                                      _ensemble.GradientBoostingRegressor)

    def is_gbr_model(m):
        if not hasattr(m, 'estimators_') or m.estimators_ is None:
            return False
        if len(m.estimators_) == 0:
            return False
        for t in m.estimators_.flatten():
            if not hasattr(t, 'tree_') or t.tree_ is None:
                return False
        return True

    _sklearn_util.check_fitted(model, is_gbr_model)

    base_prediction = model.init_.mean

    return _MLModel(
        _convert_tree_ensemble(model,
                               input_features,
                               output_features,
                               base_prediction=base_prediction))
Example 10
def convert(model, input_features, output_features):
    """Convert a normalizer model to the protobuf spec.

    Parameters
    ----------
    model: Normalizer
        A Normalizer.

    input_features: str
        Name of the input column.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """

    if not (_HAS_SKLEARN):
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, Normalizer)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'norm'))

    # Set the interface params.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION
    spec = _set_transform_interface_params(spec, input_features,
                                           output_features)

    # Set the normalizer parameters
    _normalizer_spec = spec.normalizer
    if model.norm == 'l1':
        _normalizer_spec.normType = _proto__normalizer.L1
    elif model.norm == 'l2':
        _normalizer_spec.normType = _proto__normalizer.L2
    elif model.norm == 'max':
        _normalizer_spec.normType = _proto__normalizer.LMax
    return _MLModel(spec)
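For reference, the three norm types correspond to the usual per-row vector norms. A quick numpy sketch of what the converted model computes on a toy vector:

import numpy as np

x = np.array([1.0, -2.0, 2.0])
x_l1 = x / np.abs(x).sum()          # norm='l1'  -> L1
x_l2 = x / np.sqrt((x ** 2).sum())  # norm='l2'  -> L2
x_max = x / np.abs(x).max()         # norm='max' -> LMax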
Example 11
def update_dimension(model, input_dimension):
    """
    Given a model that takes an array of dimension input_dimension, returns
    the output dimension.
    """
    if not (_HAS_SKLEARN):
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'active_features_'))
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'n_values_'))

    if model.categorical_features == 'all':
        return len(model.active_features_)

    return (len(model.active_features_) +
            (input_dimension - len(model.n_values_)))
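As a worked example of that arithmetic: if the encoder was fitted on 5 input columns of which 2 are categorical (len(model.n_values_) == 2), and those 2 columns expose 7 active one-hot positions in total (len(model.active_features_) == 7), the 3 pass-through columns are appended unchanged, so the output width is 7 + (5 - 2) = 10.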
def convert(model, feature_names, target):
    """Convert a boosted tree model to protobuf format.

    Parameters
    ----------
    decision_tree : RandomForestClassifier
        A trained scikit-learn tree model.

    feature_names: [str]
        Name of the input columns.

    target: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not (_HAS_SKLEARN):
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    _sklearn_util.check_expected_type(model, _ensemble.RandomForestClassifier)

    def is_rf_model(m):
        if not hasattr(m, 'estimators_') or m.estimators_ is None:
            return False
        if len(m.estimators_) == 0:
            return False
        for t in m.estimators_:
            if not hasattr(t, 'tree_') or t.tree_ is None:
                return False
        return True

    _sklearn_util.check_fitted(model, is_rf_model)
    return _MLModel(
        _convert_tree_ensemble(model,
                               feature_names,
                               target,
                               mode='classifier',
                               class_labels=model.classes_))
def convert(model, input_features, output_features):
    """Convert a _imputer model to the protobuf spec.

    Parameters
    ----------
    model: Imputer
        A trained Imputer model.

    input_features: str
        Name of the input column.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not (_HAS_SKLEARN):
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, StandardScaler)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'mean_'))
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'scale_'))

    # Set the interface params.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION
    spec = _set_transform_interface_params(spec, input_features, output_features)

    # Set the parameters
    tr_spec = spec.scaler
    for x in model.mean_:
        tr_spec.shiftValue.append(-x)

    for x in model.scale_:
        tr_spec.scaleValue.append(1.0 / x)

    return _MLModel(spec)
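The negated mean and reciprocal scale encode sklearn's standardization in the spec's shift-then-scale form, i.e. (x + shift) * scale == (x - mean_) / scale_. A quick numpy check of that identity with toy values:

import numpy as np

mean_, scale_ = np.array([2.0, -1.0]), np.array([4.0, 0.5])
shift, scale = -mean_, 1.0 / scale_  # what the converter stores

x = np.array([10.0, 3.0])
assert np.allclose((x + shift) * scale, (x - mean_) / scale_)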
Example 14
def convert(model, input_features, output_features):
    """Convert a one-hot-encoder model to the protobuf spec.

    Parameters
    ----------
    model: OneHotEncoder
        A trained one-hot encoder model.

    input_features: list of (str, datatypes.Array), optional
        A single (name, array) input feature.

    output_features: list of (str, datatypes.Array), optional
        A single (name, array) output feature.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not (_HAS_SKLEARN):
        raise RuntimeError(
            'scikit-learn not found. scikit-learn conversion API is disabled.')

    # Make sure the model is fitted.
    _sklearn_util.check_expected_type(model, OneHotEncoder)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'active_features_'))
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'n_values_'))

    input_dimension = get_input_dimension(model)

    if input_dimension is not None:
        # Make sure that our starting dimensions are correctly managed.
        assert len(input_features) == 1
        assert input_features[0][1] == datatypes.Array(input_dimension)

    input_dimension = input_features[0][1].num_elements

    expected_output_dimension = update_dimension(model, input_dimension)
    assert output_features[0][1] == datatypes.Array(expected_output_dimension)

    # Create a pipeline that can do all of the subsequent feature extraction.
    feature_vectorizer_input_features = []
    feature_vectorizer_size_map = {}

    if model.categorical_features == 'all':
        _categorical_features = set(range(input_dimension))
        _cat_feature_idx_mapping = dict(
            (i, i) for i in range(input_dimension))
    else:
        _categorical_features = set(model.categorical_features)
        _cat_feature_idx_mapping = dict(
            (_idx, i)
            for i, _idx in enumerate(sorted(model.categorical_features)))

    pline = Pipeline(input_features, output_features)

    # Track the overall packing index, which determines the output ordering.
    pack_idx = 0

    # First, go through all the columns that are encoded. The sklearn OHE puts
    # all of these first, regardless of their original ordering.
    for idx in range(input_dimension):
        f_name = "__OHE_%d__" % pack_idx

        if idx in _categorical_features:

            # This input column is one hot encoded
            feature_extractor_spec = create_array_feature_extractor(
                input_features, f_name, idx, output_type='Int64')

            pline.add_model(feature_extractor_spec)

            _cat_feature_idx = _cat_feature_idx_mapping[idx]

            ohe_input_features = [(f_name, datatypes.Int64())]
            ohe_output_features = [(f_name, datatypes.Dictionary('Int64'))]

            # Create a one hot encoder per column
            o_spec = _Model_pb2.Model()
            o_spec.specificationVersion = SPECIFICATION_VERSION
            o_spec = set_transform_interface_params(o_spec, ohe_input_features,
                                                    ohe_output_features)

            ohe_spec = o_spec.oneHotEncoder
            ohe_spec.outputSparse = True

            if model.handle_unknown == 'error':
                ohe_spec.handleUnknown = _OHE_pb2.OneHotEncoder.HandleUnknown.Value(
                    'ErrorOnUnknown')
            else:
                ohe_spec.handleUnknown = _OHE_pb2.OneHotEncoder.HandleUnknown.Value(
                    'IgnoreUnknown')

            # Search the active_features_ mask for the span that represents
            # this column's categorical values.  bs_find below is a simple
            # lower-bound binary search over the sorted mask.
            def bs_find(a, i):
                lb, k = 0, len(a)
                while k > 0:
                    _idx = lb + (k // 2)
                    if a[_idx] < i:
                        lb = _idx + 1
                        k -= 1
                    k = (k // 2)

                return lb

            # Here are the indices we are looking for.
            f_idx_bottom = model.feature_indices_[_cat_feature_idx]
            f_idx_top = model.feature_indices_[_cat_feature_idx + 1]

            # Now find where in the active features list we should look.
            cat_feat_idx_bottom = bs_find(model.active_features_, f_idx_bottom)
            cat_feat_idx_top = bs_find(model.active_features_, f_idx_top)
            n_cat_values = cat_feat_idx_top - cat_feat_idx_bottom

            for i in range(cat_feat_idx_bottom, cat_feat_idx_top):
                # The actual categorical value is stored as an offset in the active_features list.
                cat_idx = model.active_features_[i] - f_idx_bottom
                ohe_spec.int64Categories.vector.append(cat_idx)

            # Add the ohe to the pipeline
            pline.add_model(o_spec)

            # Add the result to the feature_vectorizer at the end.
            feature_vectorizer_input_features.append(
                (f_name, datatypes.Dictionary('Int64')))
            feature_vectorizer_size_map[f_name] = n_cat_values

            pack_idx += 1

    # Now go through all the columns that are not encoded as the sklearn OHE puts
    # these after the encoded ones.  For speed, we can put these all in a single
    # ArrayFeatureExtractor
    #
    pass_through_features = [
        idx for idx in range(input_dimension)
        if idx not in _categorical_features
    ]

    if pass_through_features:

        f_name = "__OHE_pass_through__"

        # This input column is not one hot encoded
        feature_extractor_spec = create_array_feature_extractor(
            input_features, f_name, pass_through_features)

        pline.add_model(feature_extractor_spec)
        feature_vectorizer_input_features.append(
            (f_name, datatypes.Array(len(pass_through_features))))

    # Finally, add the feature vectorizer to the pipeline.
    output_feature_name = output_features[0][0]
    output_feature_dimension = output_features[0][1].num_elements

    fvec, _num_out_dim = create_feature_vectorizer(
        feature_vectorizer_input_features, output_features[0][0],
        feature_vectorizer_size_map)

    # Make sure that the feature vectorizer's output dimension actually
    # matches the declared output feature.
    assert _num_out_dim == output_features[0][1].num_elements

    pline.add_model(fvec)

    return _MLModel(pline.spec)
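As an aside, the inline bs_find above is a lower-bound binary search: on a sorted array it returns the first index whose value is >= the query, so it should agree with numpy.searchsorted(..., side='left'). A small standalone check, illustrative and not part of the converter:

import numpy as np

def bs_find(a, i):
    # Lower bound: first index idx with a[idx] >= i.
    lb, k = 0, len(a)
    while k > 0:
        _idx = lb + (k // 2)
        if a[_idx] < i:
            lb = _idx + 1
            k -= 1
        k = (k // 2)
    return lb

a = [0, 2, 3, 7, 11]
for i in range(13):
    assert bs_find(a, i) == np.searchsorted(a, i, side='left')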