Example 1
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~turicreate.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        --------
        turicreate.load_model

        Examples
        --------
        >>> model.save('my_model_file')
        >>> loaded_model = tc.load_model('my_model_file')

        """
        import copy

        state = copy.copy(self._get_native_state())
        state["model_version"] = self._get_version()
        return glconnect.get_unity().save_model2(self.__class__._native_name(),
                                                 _make_internal_url(location),
                                                 state)
Example 2
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~turicreate.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        --------
        turicreate.load_model

        Examples
        --------
        >>> model.save('my_model_file')
        >>> loaded_model = turicreate.load_model('my_model_file')
        """
        return glconnect.get_unity().save_model(self, _make_internal_url(location))
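Both variants persist the model as a directory that can be restored with
turicreate.load_model. A minimal round-trip sketch, assuming turicreate is
installed; the toy data, model type, and path below are illustrative:

import turicreate as tc

# Any trainable model works for the round trip; a tiny linear regression
# keeps the example fast.
sf = tc.SFrame({'x': [1.0, 2.0, 3.0, 4.0], 'y': [2.0, 4.0, 6.0, 8.0]})
model = tc.linear_regression.create(sf, target='y', features=['x'], verbose=False)

model.save('/tmp/my_model')              # written as a directory, not a single file
loaded = tc.load_model('/tmp/my_model')  # restores the same model class
print(loaded.predict(sf))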
Example 3
def create(dataset,
           target,
           features=None,
           max_iterations=10,
           validation_set="auto",
           max_depth=6,
           step_size=0.3,
           min_loss_reduction=0.0,
           min_child_weight=0.1,
           row_subsample=1.0,
           column_subsample=1.0,
           verbose=True,
           random_seed=None,
           metric="auto",
           **kwargs):
    """
    Create a :class:`~turicreate.boosted_trees_regression.BoostedTreesRegression` to predict
    a scalar target variable using one or more features. In addition to standard
    numeric and categorical types, features can also be extracted automatically
    from list- or dictionary-type SFrame columns.


    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.
        The target column must be of numeric type (int or float).

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, using all columns.

    max_iterations : int, optional
        The number of iterations for boosting. It is also the number of trees
        in the model.

    validation_set : SFrame, optional
        The validation set used to monitor validation metrics as boosting
        progresses.

    max_depth : float, optional
        Maximum depth of a tree. Must be at least 1.

    step_size : float, [0,1],  optional
        Step size (shrinkage) used in each update to prevent overfitting. It
        shrinks the prediction of each weak learner to make the boosting
        process more conservative. The smaller the step size, the more
        conservative the algorithm will be. A smaller step_size is usually
        used together with a larger max_iterations.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, [0,1], optional
        The fraction of rows sampled from the training set in each iteration
        of tree construction. This is known as the bagging trick and can help
        prevent overfitting. Setting it to 0.5 means the model randomly
        samples half of the examples (rows) to grow each tree.

    column_subsample : float, [0,1], optional
        Subsample ratio of the columns in each iteration of tree
        construction. Like row_subsample, this can also help prevent
        overfitting. Setting it to 0.5 means the model randomly samples
        half of the columns to grow each tree.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed: int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'rmse', 'max_error'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``early_stopping_rounds`` : int, default None
            If the validation metric does not improve after <early_stopping_rounds>,
            stop training and return the best model.
            If multiple metrics are being tracked, the last one is used.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must be
            trained on exactly the same training data as the checkpointed model.

    Returns
    -------
      out : BoostedTreesRegression
          A trained gradient boosted trees model.

    References
    ----------
    - `Wikipedia - Gradient tree boosting
      <http://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting>`_
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    BoostedTreesRegression, turicreate.linear_regression.LinearRegression, turicreate.regression.create

    Examples
    --------

    Setup the data:

    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = turicreate.SFrame.read_csv(url)
    >>> data['label'] = data['label'] == 'p'

    Split the data into training and test data:

    >>> train, test = data.random_split(0.8)

    Create the model:

    >>> model = turicreate.boosted_trees_regression.create(train, target='label')

    Make predictions and evaluate the model:

    >>> predictions = model.predict(test)
    >>> results = model.evaluate(test)

    """
    if random_seed is not None:
        kwargs["random_seed"] = random_seed
    if "model_checkpoint_path" in kwargs:
        kwargs["model_checkpoint_path"] = _make_internal_url(
            kwargs["model_checkpoint_path"])
    if "resume_from_checkpoint" in kwargs:
        kwargs["resume_from_checkpoint"] = _make_internal_url(
            kwargs["resume_from_checkpoint"])

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name="boosted_trees_regression",
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       max_depth=max_depth,
                       step_size=step_size,
                       min_loss_reduction=min_loss_reduction,
                       min_child_weight=min_child_weight,
                       row_subsample=row_subsample,
                       column_subsample=column_subsample,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return BoostedTreesRegression(model.__proxy__)
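The early stopping and checkpointing kwargs documented above can be combined.
A hedged sketch, reusing the mushroom dataset from the docstring example; the
checkpoint path and iteration counts are illustrative:

import turicreate as tc

data = tc.SFrame.read_csv('https://static.turi.com/datasets/xgboost/mushroom.csv')
data['label'] = data['label'] == 'p'
train, valid = data.random_split(0.8)

# Stop once validation RMSE has not improved for 5 rounds, and checkpoint
# to /tmp/model_tmp every 5 iterations.
model = tc.boosted_trees_regression.create(
    train, target='label',
    validation_set=valid,
    max_iterations=50,
    metric='rmse',
    early_stopping_rounds=5,
    model_checkpoint_path='/tmp/model_tmp',
    model_checkpoint_interval=5)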
Example 4
def load_model(location):
    """
    Load any Turi Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    Turi Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = tc.load_model('my_model_file')
    """

    # Check whether the location is a dir_archive; if not, use GLUnpickler to
    # load it as a pure Python model.
    # If the location is an http location, skip the check and proceed directly
    # to loading the model as a dir_archive. This is because
    # 1) exists() does not work with the http protocol, and
    # 2) GLUnpickler does not support http
    protocol = file_util.get_protocol(location)
    dir_archive_exists = False
    if protocol == '':
        model_path = file_util.expand_full_path(location)
        dir_archive_exists = file_util.exists(
            os.path.join(model_path, 'dir_archive.ini'))
    else:
        model_path = location
        if protocol in ['http', 'https']:
            dir_archive_exists = True
        else:
            import posixpath
            dir_archive_exists = file_util.exists(
                posixpath.join(model_path, 'dir_archive.ini'))
    if not dir_archive_exists:
        raise IOError("Directory %s does not exist" % location)

    _internal_url = _make_internal_url(location)
    saved_state = glconnect.get_unity().load_model(_internal_url)
    # The archive version key could be either bytes or unicode
    key = u'archive_version'
    archive_version = saved_state[key] if key in saved_state else saved_state[
        key.encode()]
    if archive_version < 0:
        raise ToolkitError("File does not appear to be a Turi Create model.")
    elif archive_version > 1:
        raise ToolkitError(
            "Unable to load model.\n\n"
            "This model looks to have been saved with a future version of Turi Create.\n"
            "Please upgrade Turi Create before attempting to load this model file."
        )
    elif archive_version == 1:
        cls = MODEL_NAME_MAP[saved_state['model_name']]
        if 'model' in saved_state:
            # this is a native model
            return cls(saved_state['model'])
        else:
            # this is a CustomModel
            model_data = saved_state['side_data']
            model_version = model_data['model_version']
            del model_data['model_version']
            return cls._load_version(model_data, model_version)
    else:
        # Very old legacy model format; attempt pickle loading.
        import sys
        sys.stderr.write(
            "This model was saved in a legacy model format. Compatibility cannot be guaranteed in future versions.\n"
        )
        if _six.PY3:
            raise ToolkitError(
                "Unable to load legacy model in Python 3.\n\n"
                "To migrate a model, try loading it using Turi Create 4.0 or\n"
                "later in Python 2 and then re-save it. The re-saved model should\n"
                "work in Python 3.")

        if 'graphlab' not in sys.modules:
            sys.modules['graphlab'] = sys.modules['turicreate']
            # backward compatibility. Otherwise old pickles will not load
            sys.modules["turicreate_util"] = sys.modules['turicreate.util']
            sys.modules["graphlab_util"] = sys.modules['turicreate.util']

            # More backwards compatibility with the turicreate namespace code.
            for k, v in list(sys.modules.items()):
                if 'turicreate' in k:
                    sys.modules[k.replace('turicreate', 'graphlab')] = v
        # legacy loader
        import pickle
        model_wrapper = pickle.loads(saved_state[b'model_wrapper'])
        return model_wrapper(saved_state[b'model_base'])
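The protocol check above determines how the dir_archive.ini path is built:
local paths go through os.path.join, while remote URLs must always use '/'.
A standalone sketch of that idea (the helper name is hypothetical):

import os
import posixpath

def archive_ini_path(model_path, protocol):
    # Local paths use the OS separator; remote URLs always use '/'.
    join = os.path.join if protocol == '' else posixpath.join
    return join(model_path, 'dir_archive.ini')

print(archive_ini_path('/tmp/my_model', ''))           # /tmp/my_model/dir_archive.ini
print(archive_ini_path('s3://bucket/my_model', 's3'))  # s3://bucket/my_model/dir_archive.ini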
Example 5
def create(dataset,
           target,
           features=None,
           max_iterations=10,
           validation_set='auto',
           verbose=True,
           class_weights=None,
           random_seed=None,
           metric='auto',
           **kwargs):
    """
    Create a (binary or multi-class) classifier model of type
    :class:`~turicreate.random_forest_classifier.RandomForestClassifier` using
    an ensemble of decision trees trained on subsets of the data.

    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.  String target variables are
        automatically mapped to integers in alphabetical order of the variable values.
        For example, a target variable with 'cat', 'dog', and 'foosa' as possible
        values is mapped to 0, 1, and 2, respectively.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, which uses all columns in the SFrame ``dataset``
        except the target column.

    max_iterations : int, optional
        The maximum number of iterations to perform. For multi-class
        classification with K classes, each iteration will create K-1 trees.

    max_depth : float, optional
        Maximum depth of a tree.

    class_weights : {dict, `auto`}, optional
        Weights the examples in the training data according to the given class
        weights. If set to `None`, all classes are assumed to have weight one.
        The `auto` mode sets the class weight to be inversely proportional to
        the number of examples in the training data with the given class.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition on a
        leaf node of the tree. The larger it is, the more conservative the
        algorithm will be. Must be non-negative.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, optional
        The fraction of rows sampled from the training set in each iteration
        of tree construction. This is known as the bagging trick and can help
        prevent overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the examples (rows) to grow each tree.

    column_subsample : float, optional
        Subsample ratio of the columns in each iteration of tree
        construction.  Like row_subsample, this can also help prevent
        model overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the columns to grow each tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. This is computed once per full iteration. Large
        differences in model accuracy between the training data and validation
        data is indicative of overfitting. The default value is 'auto'.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, so that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'accuracy', 'auc', 'log_loss'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must be
            trained on exactly the same training data as the checkpointed model.


    Returns
    -------
      out : RandomForestClassifier
          A trained random forest model for classification tasks.

    References
    ----------
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    RandomForestClassifier, turicreate.logistic_classifier.LogisticClassifier, turicreate.svm_classifier.SVMClassifier


    Examples
    --------

    .. sourcecode:: python

      >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
      >>> data = turicreate.SFrame.read_csv(url)

      >>> train, test = data.random_split(0.8)
      >>> model = turicreate.random_forest_classifier.create(train, target='label')

      >>> predictions = model.classify(test)
      >>> results = model.evaluate(test)
    """

    if random_seed is not None:
        kwargs['random_seed'] = random_seed
    if 'model_checkpoint_path' in kwargs:
        kwargs['model_checkpoint_path'] = _make_internal_url(
            kwargs['model_checkpoint_path'])
    if 'resume_from_checkpoint' in kwargs:
        kwargs['resume_from_checkpoint'] = _make_internal_url(
            kwargs['resume_from_checkpoint'])
    if 'num_trees' in kwargs:
        logger = _logging.getLogger(__name__)
        logger.warning(
            "The `num_trees` keyword argument is deprecated. Please "
            "use the `max_iterations` argument instead. Any value provided "
            "for `num_trees` will be used in place of `max_iterations`.")
        max_iterations = kwargs['num_trees']
        del kwargs['num_trees']

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name='random_forest_classifier',
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       class_weights=class_weights,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return RandomForestClassifier(model.__proxy__)
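Class imbalance is the main reason to reach for class_weights. A hedged sketch
using the mushroom data from the docstring example; the parameter values are
illustrative, not recommendations:

import turicreate as tc

data = tc.SFrame.read_csv('https://static.turi.com/datasets/xgboost/mushroom.csv')
train, test = data.random_split(0.8)

# 'auto' weights each class inversely to its frequency in the training data,
# which can help when one class is much rarer than the other.
model = tc.random_forest_classifier.create(
    train, target='label',
    class_weights='auto',
    max_iterations=20,
    metric=['accuracy', 'auc'])
print(model.evaluate(test))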
Example 6
def load_model(location):
    """
    Load any Turi Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    Turi Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = tc.load_model('my_model_file')
    """

    # Check whether the location is a dir_archive; if not, use GLUnpickler to
    # load it as a pure Python model.
    # If the location is an http location, skip the check and proceed directly
    # to loading the model as a dir_archive. This is because
    # 1) exists() does not work with the http protocol, and
    # 2) GLUnpickler does not support http
    protocol = file_util.get_protocol(location)
    dir_archive_exists = False
    if protocol == "":
        model_path = file_util.expand_full_path(location)
        dir_archive_exists = file_util.exists(
            os.path.join(model_path, "dir_archive.ini"))
    else:
        model_path = location
        if protocol in ["http", "https", "s3"]:
            dir_archive_exists = True
        else:
            import posixpath

            dir_archive_exists = file_util.exists(
                posixpath.join(model_path, "dir_archive.ini"))
    if not dir_archive_exists:
        raise IOError("Directory %s does not exist" % location)

    _internal_url = _make_internal_url(location)
    saved_state = glconnect.get_unity().load_model(_internal_url)
    saved_state = _wrap_function_return(saved_state)
    # The archive version key could be either bytes or unicode
    key = u"archive_version"
    archive_version = (saved_state[key]
                       if key in saved_state else saved_state[key.encode()])
    if archive_version < 0:
        raise ToolkitError("File does not appear to be a Turi Create model.")
    elif archive_version > 1:
        raise ToolkitError(
            "Unable to load model.\n\n"
            "This model looks to have been saved with a future version of Turi Create.\n"
            "Please upgrade Turi Create before attempting to load this model file."
        )
    elif archive_version == 1:
        name = saved_state["model_name"]
        if name in MODEL_NAME_MAP:
            cls = MODEL_NAME_MAP[name]
            if "model" in saved_state:
                if name in [
                        "activity_classifier",
                        "object_detector",
                        "style_transfer",
                        "drawing_classifier",
                ]:
                    import turicreate.toolkits.libtctensorflow
                # this is a native model
                return cls(saved_state["model"])
            else:
                # this is a CustomModel
                model_data = saved_state["side_data"]
                model_version = model_data["model_version"]
                del model_data["model_version"]

                if name == "activity_classifier":
                    import turicreate.toolkits.libtctensorflow

                    model = _extensions.activity_classifier()
                    model.import_from_custom_model(model_data, model_version)
                    return cls(model)

                if name == "object_detector":
                    import turicreate.toolkits.libtctensorflow

                    model = _extensions.object_detector()
                    model.import_from_custom_model(model_data, model_version)
                    return cls(model)

                if name == "style_transfer":
                    import turicreate.toolkits.libtctensorflow

                    model = _extensions.style_transfer()
                    model.import_from_custom_model(model_data, model_version)
                    return cls(model)

                if name == "drawing_classifier":
                    import turicreate.toolkits.libtctensorflow

                    model = _extensions.drawing_classifier()
                    model.import_from_custom_model(model_data, model_version)
                    return cls(model)

                if name == "one_shot_object_detector":
                    import turicreate.toolkits.libtctensorflow

                    od_cls = MODEL_NAME_MAP["object_detector"]
                    if "detector_model" in model_data["detector"]:
                        model_data["detector"] = od_cls(
                            model_data["detector"]["detector_model"])
                    else:
                        model = _extensions.object_detector()
                        model.import_from_custom_model(
                            model_data["detector"],
                            model_data["_detector_version"])
                        model_data["detector"] = od_cls(model)
                    return cls(model_data)

                return cls._load_version(model_data, model_version)

        elif hasattr(_extensions, name):
            return saved_state["model"]
        else:
            raise ToolkitError(
                "Unable to load model of name '%s'; model name not registered."
                % name)
    else:
        # Very old legacy model format; attempt pickle loading.
        import sys

        sys.stderr.write(
            "This model was saved in a legacy model format. Compatibility cannot be guaranteed in future versions.\n"
        )
        if _six.PY3:
            raise ToolkitError(
                "Unable to load legacy model in Python 3.\n\n"
                "To migrate a model, try loading it using Turi Create 4.0 or\n"
                "later in Python 2 and then re-save it. The re-saved model should\n"
                "work in Python 3.")

        if "graphlab" not in sys.modules:
            sys.modules["graphlab"] = sys.modules["turicreate"]
            # backward compatibility. Otherwise old pickles will not load
            sys.modules["turicreate_util"] = sys.modules["turicreate.util"]
            sys.modules["graphlab_util"] = sys.modules["turicreate.util"]

            # More backwards compatibility with the turicreate namespace code.
            for k, v in list(sys.modules.items()):
                if "turicreate" in k:
                    sys.modules[k.replace("turicreate", "graphlab")] = v
        # legacy loader
        import pickle

        model_wrapper = pickle.loads(saved_state[b"model_wrapper"])
        return model_wrapper(saved_state[b"model_base"])
Example 7
def create(dataset,
           target,
           features=None,
           max_iterations=10,
           validation_set='auto',
           verbose=True,
           random_seed=None,
           metric='auto',
           **kwargs):
    """
    Create a :class:`~turicreate.random_forest_regression.RandomForestRegression` to predict
    a scalar target variable using one or more features. In addition to standard
    numeric and categorical types, features can also be extracted automatically
    from list- or dictionary-type SFrame columns.


    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.
        The target column must be of numeric type (int or float).

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, using all columns.

    max_iterations : int, optional
        The number of iterations to perform.

    max_depth : float, optional
        Maximum depth of a tree.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, optional
        The fraction of rows sampled from the training set in each iteration
        of tree construction. This is known as the bagging trick and can help
        prevent overfitting. Setting it to 0.5 means the model randomly
        samples half of the examples (rows) to grow each tree.

    column_subsample : float, optional
        Subsample ratio of the columns in each iteration of tree
        construction. Like row_subsample, this can also help prevent
        overfitting. Setting it to 0.5 means the model randomly samples
        half of the columns to grow each tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. This is computed once per full iteration. Large
        differences in model accuracy between the training data and validation
        data is indicative of overfitting. The default value is 'auto'.


    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, so that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'rmse', 'max_error'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must be
            trained on exactly the same training data as the checkpointed model.

    Returns
    -------
      out : RandomForestRegression
          A trained random forest model for regression tasks.

    References
    ----------
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    RandomForestRegression, turicreate.linear_regression.LinearRegression, turicreate.regression.create

    Examples
    --------

    Setup the data:

    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = turicreate.SFrame.read_csv(url)
    >>> data['label'] = data['label'] == 'p'

    Split the data into training and test data:

    >>> train, test = data.random_split(0.8)

    Create the model:

    >>> model = turicreate.random_forest_regression.create(train, target='label')

    Make predictions and evaluate the model:

    >>> predictions = model.predict(test)
    >>> results = model.evaluate(test)

    """
    if random_seed is not None:
        kwargs['random_seed'] = random_seed
    if 'model_checkpoint_path' in kwargs:
        kwargs['model_checkpoint_path'] = _make_internal_url(
            kwargs['model_checkpoint_path'])
    if 'resume_from_checkpoint' in kwargs:
        kwargs['resume_from_checkpoint'] = _make_internal_url(
            kwargs['resume_from_checkpoint'])
    if 'num_trees' in kwargs:
        logger = _logging.getLogger(__name__)
        logger.warning(
            "The `num_trees` keyword argument is deprecated. Please "
            "use the `max_iterations` argument instead. Any value provided "
            "for `num_trees` will be used in place of `max_iterations`.")
        max_iterations = kwargs['num_trees']
        del kwargs['num_trees']

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name='random_forest_regression',
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return RandomForestRegression(model.__proxy__)
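Checkpointed training can be picked up again with resume_from_checkpoint. A
hedged sketch; the checkpoint path is illustrative, and per the docstring the
resumed run must see exactly the same training data:

import turicreate as tc

data = tc.SFrame.read_csv('https://static.turi.com/datasets/xgboost/mushroom.csv')
data['label'] = data['label'] == 'p'
# A fixed seed keeps the split reproducible so the resumed run sees the same rows.
train, _ = data.random_split(0.8, seed=1)

model = tc.random_forest_regression.create(
    train, target='label',
    max_iterations=20,
    resume_from_checkpoint='/tmp/model_tmp/model_checkpoint_5')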
Example 8
def create(dataset,
           target,
           features=None,
           max_iterations=10,
           validation_set="auto",
           class_weights=None,
           max_depth=6,
           step_size=0.3,
           min_loss_reduction=0.0,
           min_child_weight=0.1,
           row_subsample=1.0,
           column_subsample=1.0,
           verbose=True,
           random_seed=None,
           metric="auto",
           **kwargs):
    """
    Create a (binary or multi-class) classifier model of type
    :class:`~turicreate.boosted_trees_classifier.BoostedTreesClassifier` using
    gradient boosted trees (sometimes known as GBMs).

    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.  String target variables are
        automatically mapped to integers in alphabetical order of the variable values.
        For example, a target variable with 'cat', 'dog', and 'foosa' as possible
        values is mapped to 0, 1, and 2, respectively.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, which uses all columns in the SFrame ``dataset``
        except the target column.

    max_iterations : int, optional
        The maximum number of iterations for boosting. Each iteration results
        in the creation of an extra tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. This is computed once per full iteration. Large
        differences in model accuracy between the training data and validation
        data is indicative of overfitting. The default value is 'auto'.

    class_weights : {dict, `auto`}, optional
        Weights the examples in the training data according to the given class
        weights. If provided, the dictionary must contain a key for each class
        label. The value can be any positive number greater than 1e-20. Weights
        are interpreted as relative to each other. So setting the weights to be
        2.0 for the positive class and 1.0 for the negative class has the same
        effect as setting them to be 20.0 and 10.0, respectively. If set to
        `None`, all classes are taken to have weight 1.0. The `auto` mode sets
        the class weight to be inversely proportional to the number of examples
        in the training data with the given class.

    max_depth : float, optional
        Maximum depth of a tree. Must be at least 1.

    step_size : float, [0,1], optional
        Step size (shrinkage) used in each update to prevent overfitting. It
        shrinks the prediction of each weak learner to make the boosting
        process more conservative. The smaller the step size, the more
        conservative the algorithm will be. A smaller step_size works well
        when `max_iterations` is large.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, [0,1], optional
        Subsample the ratio of the training set in each iteration of tree
        construction.  This is called the bagging trick and can usually help
        prevent overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the examples (rows) to grow each tree.

    column_subsample : float, [0,1], optional
        Subsample ratio of the columns in each iteration of tree
        construction.  Like row_subsample, this can also help prevent
        model overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the columns to grow each tree.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, so that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'accuracy', 'auc', 'log_loss'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``early_stopping_rounds`` : int, default None
            If the validation metric does not improve after <early_stopping_rounds>,
            stop training and return the best model.
            If multiple metrics are being tracked, the last one is used.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must be
            trained on exactly the same training data as the checkpointed model.

    Returns
    -------
      out : BoostedTreesClassifier
          A trained gradient boosted trees model for classification tasks.

    References
    ----------

    - `Wikipedia - Gradient tree boosting
      <http://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting>`_
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    BoostedTreesClassifier, turicreate.logistic_classifier.LogisticClassifier, turicreate.svm_classifier.SVMClassifier

    Examples
    --------

    .. sourcecode:: python

      >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
      >>> data = turicreate.SFrame.read_csv(url)

      >>> train, test = data.random_split(0.8)
      >>> model = turicreate.boosted_trees_classifier.create(train, target='label')

      >>> predictions = model.classify(test)
      >>> results = model.evaluate(test)
    """
    if random_seed is not None:
        kwargs["random_seed"] = random_seed
    if "model_checkpoint_path" in kwargs:
        kwargs["model_checkpoint_path"] = _make_internal_url(
            kwargs["model_checkpoint_path"])
    if "resume_from_checkpoint" in kwargs:
        kwargs["resume_from_checkpoint"] = _make_internal_url(
            kwargs["resume_from_checkpoint"])

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name="boosted_trees_classifier",
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       class_weights=class_weights,
                       max_depth=max_depth,
                       step_size=step_size,
                       min_loss_reduction=min_loss_reduction,
                       min_child_weight=min_child_weight,
                       row_subsample=row_subsample,
                       column_subsample=column_subsample,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return BoostedTreesClassifier(model.__proxy__)
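An explicit class_weights dictionary gives finer control than 'auto'. A hedged
sketch, assuming the mushroom labels are 'p' and 'e' as in the docstring
example; the weights themselves are illustrative:

import turicreate as tc

data = tc.SFrame.read_csv('https://static.turi.com/datasets/xgboost/mushroom.csv')
train, test = data.random_split(0.8)

# One key per label value; only the ratio between weights matters, so
# {'p': 2.0, 'e': 1.0} behaves the same as {'p': 20.0, 'e': 10.0}.
model = tc.boosted_trees_classifier.create(
    train, target='label',
    class_weights={'p': 2.0, 'e': 1.0},
    max_iterations=10)
print(model.evaluate(test)['accuracy'])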