Example 1
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~turicreate.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        --------
        turicreate.load_model

        Examples
        --------
        >>> model.save('my_model_file')
        >>> loaded_model = tc.load_model('my_model_file')

        """
        import copy

        state = copy.copy(self._get_native_state())
        state["model_version"] = self._get_version()
        return glconnect.get_unity().save_model2(self.__class__._native_name(),
                                                 _make_internal_url(location),
                                                 state)
Example 2
    def save(self, location):
        """
        Save the model. The model is saved as a directory which can then be
        loaded using the :py:func:`~turicreate.load_model` method.

        Parameters
        ----------
        location : string
            Target destination for the model. Can be a local path or remote URL.

        See Also
        --------
        turicreate.load_model

        Examples
        --------
        >>> model.save('my_model_file')
        >>> loaded_model = turicreate.load_model('my_model_file')
        """
        return glconnect.get_unity().save_model(self, _make_internal_url(location))
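Both variants persist the model as a directory that can be restored with
turicreate.load_model. A minimal round-trip sketch, assuming turicreate is
installed; the toy data, model type, and path below are illustrative:

import turicreate as tc

# Any trainable model works for the round trip; a tiny linear regression
# keeps the example fast.
sf = tc.SFrame({'x': [1.0, 2.0, 3.0, 4.0], 'y': [2.0, 4.0, 6.0, 8.0]})
model = tc.linear_regression.create(sf, target='y', features=['x'], verbose=False)

model.save('/tmp/my_model')              # written as a directory, not a single file
loaded = tc.load_model('/tmp/my_model')  # restores the same model class
print(loaded.predict(sf))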
Example 3
def create(dataset,
           target,
           features=None,
           max_iterations=10,
           validation_set="auto",
           max_depth=6,
           step_size=0.3,
           min_loss_reduction=0.0,
           min_child_weight=0.1,
           row_subsample=1.0,
           column_subsample=1.0,
           verbose=True,
           random_seed=None,
           metric="auto",
           **kwargs):
    """
    Create a :class:`~turicreate.boosted_trees_regression.BoostedTreesRegression` to predict
    a scalar target variable using one or more features. In addition to standard
    numeric and categorical types, features can also be extracted automatically
    from list- or dictionary-type SFrame columns.


    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.
        The target column must be of numeric type (int or float).

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, using all columns.

    max_iterations : int, optional
        The number of iterations for boosting. It is also the number of trees
        in the model.

    validation_set : SFrame, optional
        The validation set used to monitor validation metrics as boosting
        progresses.

    max_depth : float, optional
        Maximum depth of a tree. Must be at least 1.

    step_size : float, [0,1],  optional
        Step size (shrinkage) used in each update to prevent overfitting. It
        shrinks the prediction of each weak learner to make the boosting
        process more conservative. The smaller the step size, the more
        conservative the algorithm will be. A smaller step_size is usually
        used together with a larger max_iterations.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, [0,1], optional
        The fraction of rows sampled from the training set in each iteration
        of tree construction. This is known as the bagging trick and can help
        prevent overfitting. Setting it to 0.5 means the model randomly
        samples half of the examples (rows) to grow each tree.

    column_subsample : float, [0,1], optional
        Subsample ratio of the columns in each iteration of tree
        construction. Like row_subsample, this can also help prevent
        overfitting. Setting it to 0.5 means the model randomly samples
        half of the columns to grow each tree.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed: int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'rmse', 'max_error'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``early_stopping_rounds`` : int, default None
            If the validation metric does not improve after <early_stopping_rounds>,
            stop training and return the best model.
            If multiple metrics are being tracked, the last one is used.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must be
            trained on exactly the same training data as the checkpointed model.

    Returns
    -------
      out : BoostedTreesRegression
          A trained gradient boosted trees model.

    References
    ----------
    - `Wikipedia - Gradient tree boosting
      <http://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting>`_
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    BoostedTreesRegression, turicreate.linear_regression.LinearRegression, turicreate.regression.create

    Examples
    --------

    Setup the data:

    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = turicreate.SFrame.read_csv(url)
    >>> data['label'] = data['label'] == 'p'

    Split the data into training and test data:

    >>> train, test = data.random_split(0.8)

    Create the model:

    >>> model = turicreate.boosted_trees_regression.create(train, target='label')

    Make predictions and evaluate the model:

    >>> predictions = model.predict(test)
    >>> results = model.evaluate(test)

    """
    if random_seed is not None:
        kwargs["random_seed"] = random_seed
    if "model_checkpoint_path" in kwargs:
        kwargs["model_checkpoint_path"] = _make_internal_url(
            kwargs["model_checkpoint_path"])
    if "resume_from_checkpoint" in kwargs:
        kwargs["resume_from_checkpoint"] = _make_internal_url(
            kwargs["resume_from_checkpoint"])

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name="boosted_trees_regression",
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       max_depth=max_depth,
                       step_size=step_size,
                       min_loss_reduction=min_loss_reduction,
                       min_child_weight=min_child_weight,
                       row_subsample=row_subsample,
                       column_subsample=column_subsample,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return BoostedTreesRegression(model.__proxy__)
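The early stopping and checkpointing kwargs documented above can be combined.
A hedged sketch, reusing the mushroom dataset from the docstring example; the
checkpoint path and iteration counts are illustrative:

import turicreate as tc

data = tc.SFrame.read_csv('https://static.turi.com/datasets/xgboost/mushroom.csv')
data['label'] = data['label'] == 'p'
train, valid = data.random_split(0.8)

# Stop once validation RMSE has not improved for 5 rounds, and checkpoint
# to /tmp/model_tmp every 5 iterations.
model = tc.boosted_trees_regression.create(
    train, target='label',
    validation_set=valid,
    max_iterations=50,
    metric='rmse',
    early_stopping_rounds=5,
    model_checkpoint_path='/tmp/model_tmp',
    model_checkpoint_interval=5)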
Example 4
def load_model(location):
    """
    Load any Turi Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    Turi Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = tc.load_model('my_model_file')
    """

    # Check whether the location is a dir_archive; if not, use GLUnpickler to
    # load it as a pure Python model.
    # If the location is an http location, skip the check and proceed directly
    # to loading the model as a dir_archive. This is because
    # 1) exists() does not work with the http protocol, and
    # 2) GLUnpickler does not support http
    protocol = file_util.get_protocol(location)
    dir_archive_exists = False
    if protocol == '':
        model_path = file_util.expand_full_path(location)
        dir_archive_exists = file_util.exists(
            os.path.join(model_path, 'dir_archive.ini'))
    else:
        model_path = location
        if protocol in ['http', 'https']:
            dir_archive_exists = True
        else:
            import posixpath
            dir_archive_exists = file_util.exists(
                posixpath.join(model_path, 'dir_archive.ini'))
    if not dir_archive_exists:
        raise IOError("Directory %s does not exist" % location)

    _internal_url = _make_internal_url(location)
    saved_state = glconnect.get_unity().load_model(_internal_url)
    # The archive version key could be either bytes or unicode
    key = u'archive_version'
    archive_version = saved_state[key] if key in saved_state else saved_state[
        key.encode()]
    if archive_version < 0:
        raise ToolkitError("File does not appear to be a Turi Create model.")
    elif archive_version > 1:
        raise ToolkitError(
            "Unable to load model.\n\n"
            "This model looks to have been saved with a future version of Turi Create.\n"
            "Please upgrade Turi Create before attempting to load this model file."
        )
    elif archive_version == 1:
        cls = MODEL_NAME_MAP[saved_state['model_name']]
        if 'model' in saved_state:
            # this is a native model
            return cls(saved_state['model'])
        else:
            # this is a CustomModel
            model_data = saved_state['side_data']
            model_version = model_data['model_version']
            del model_data['model_version']
            return cls._load_version(model_data, model_version)
    else:
        # Very old legacy model format; attempt pickle loading.
        import sys
        sys.stderr.write(
            "This model was saved in a legacy model format. Compatibility cannot be guaranteed in future versions.\n"
        )
        if _six.PY3:
            raise ToolkitError(
                "Unable to load legacy model in Python 3.\n\n"
                "To migrate a model, try loading it using Turi Create 4.0 or\n"
                "later in Python 2 and then re-save it. The re-saved model should\n"
                "work in Python 3.")

        if 'graphlab' not in sys.modules:
            sys.modules['graphlab'] = sys.modules['turicreate']
            # backward compatibility. Otherwise old pickles will not load
            sys.modules["turicreate_util"] = sys.modules['turicreate.util']
            sys.modules["graphlab_util"] = sys.modules['turicreate.util']

            # More backwards compatibility with the turicreate namespace code.
            for k, v in list(sys.modules.items()):
                if 'turicreate' in k:
                    sys.modules[k.replace('turicreate', 'graphlab')] = v
        # legacy loader
        import pickle
        model_wrapper = pickle.loads(saved_state[b'model_wrapper'])
        return model_wrapper(saved_state[b'model_base'])
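The protocol check above determines how the dir_archive.ini path is built:
local paths go through os.path.join, while remote URLs must always use '/'.
A standalone sketch of that idea (the helper name is hypothetical):

import os
import posixpath

def archive_ini_path(model_path, protocol):
    # Local paths use the OS separator; remote URLs always use '/'.
    join = os.path.join if protocol == '' else posixpath.join
    return join(model_path, 'dir_archive.ini')

print(archive_ini_path('/tmp/my_model', ''))           # /tmp/my_model/dir_archive.ini
print(archive_ini_path('s3://bucket/my_model', 's3'))  # s3://bucket/my_model/dir_archive.ini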
Example 5
def create(dataset,
           target,
           features=None,
           max_iterations=10,
           validation_set='auto',
           verbose=True,
           class_weights=None,
           random_seed=None,
           metric='auto',
           **kwargs):
    """
    Create a (binary or multi-class) classifier model of type
    :class:`~turicreate.random_forest_classifier.RandomForestClassifier` using
    an ensemble of decision trees trained on subsets of the data.

    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.  String target variables are
        automatically mapped to integers in alphabetical order of the variable values.
        For example, a target variable with 'cat', 'dog', and 'foosa' as possible
        values is mapped to 0, 1, and 2, respectively.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, which uses all columns in the SFrame ``dataset``
        except the target column.

    max_iterations : int, optional
        The maximum number of iterations to perform. For multi-class
        classification with K classes, each iteration will create K-1 trees.

    max_depth : float, optional
        Maximum depth of a tree.

    class_weights : {dict, `auto`}, optional
        Weights the examples in the training data according to the given class
        weights. If set to `None`, all classes are assumed to have weight one.
        The `auto` mode sets the class weight to be inversely proportional to
        the number of examples in the training data with the given class.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition on a
        leaf node of the tree. The larger it is, the more conservative the
        algorithm will be. Must be non-negative.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, optional
        The fraction of rows sampled from the training set in each iteration
        of tree construction. This is known as the bagging trick and can help
        prevent overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the examples (rows) to grow each tree.

    column_subsample : float, optional
        Subsample ratio of the columns in each iteration of tree
        construction.  Like row_subsample, this can also help prevent
        model overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the columns to grow each tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. This is computed once per full iteration. Large
        differences in model accuracy between the training data and validation
        data is indicative of overfitting. The default value is 'auto'.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, so that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'accuracy', 'auc', 'log_loss'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must be
            trained on exactly the same training data as the checkpointed model.


    Returns
    -------
      out : RandomForestClassifier
          A trained random forest model for classification tasks.

    References
    ----------
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    RandomForestClassifier, turicreate.logistic_classifier.LogisticClassifier, turicreate.svm_classifier.SVMClassifier


    Examples
    --------

    .. sourcecode:: python

      >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
      >>> data = turicreate.SFrame.read_csv(url)

      >>> train, test = data.random_split(0.8)
      >>> model = turicreate.random_forest_classifier.create(train, target='label')

      >>> predictions = model.classify(test)
      >>> results = model.evaluate(test)
    """

    if random_seed is not None:
        kwargs['random_seed'] = random_seed
    if 'model_checkpoint_path' in kwargs:
        kwargs['model_checkpoint_path'] = _make_internal_url(
            kwargs['model_checkpoint_path'])
    if 'resume_from_checkpoint' in kwargs:
        kwargs['resume_from_checkpoint'] = _make_internal_url(
            kwargs['resume_from_checkpoint'])
    if 'num_trees' in kwargs:
        logger = _logging.getLogger(__name__)
        logger.warning(
            "The `num_trees` keyword argument is deprecated. Please "
            "use the `max_iterations` argument instead. Any value provided "
            "for `num_trees` will be used in place of `max_iterations`.")
        max_iterations = kwargs['num_trees']
        del kwargs['num_trees']

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name='random_forest_classifier',
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       class_weights=class_weights,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return RandomForestClassifier(model.__proxy__)
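Class imbalance is the main reason to reach for class_weights. A hedged sketch
using the mushroom data from the docstring example; the parameter values are
illustrative, not recommendations:

import turicreate as tc

data = tc.SFrame.read_csv('https://static.turi.com/datasets/xgboost/mushroom.csv')
train, test = data.random_split(0.8)

# 'auto' weights each class inversely to its frequency in the training data,
# which can help when one class is much rarer than the other.
model = tc.random_forest_classifier.create(
    train, target='label',
    class_weights='auto',
    max_iterations=20,
    metric=['accuracy', 'auc'])
print(model.evaluate(test))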
Example 6
def load_model(location):
    """
    Load any Turi Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    Turi Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = tc.load_model('my_model_file')
    """

    # Check whether the location is a dir_archive; if not, use GLUnpickler to
    # load it as a pure Python model.
    # If the location is an http location, skip the check and proceed directly
    # to loading the model as a dir_archive. This is because
    # 1) exists() does not work with the http protocol, and
    # 2) GLUnpickler does not support http
    protocol = file_util.get_protocol(location)
    dir_archive_exists = False
    if protocol == "":
        model_path = file_util.expand_full_path(location)
        dir_archive_exists = file_util.exists(
            os.path.join(model_path, "dir_archive.ini"))
    else:
        model_path = location
        if protocol in ["http", "https", "s3"]:
            dir_archive_exists = True
        else:
            import posixpath

            dir_archive_exists = file_util.exists(
                posixpath.join(model_path, "dir_archive.ini"))
    if not dir_archive_exists:
        raise IOError("Directory %s does not exist" % location)

    _internal_url = _make_internal_url(location)
    saved_state = glconnect.get_unity().load_model(_internal_url)
    saved_state = _wrap_function_return(saved_state)
    # The archive version key could be either bytes or unicode
    key = u"archive_version"
    archive_version = (saved_state[key]
                       if key in saved_state else saved_state[key.encode()])
    if archive_version < 0:
        raise ToolkitError("File does not appear to be a Turi Create model.")
    elif archive_version > 1:
        raise ToolkitError(
            "Unable to load model.\n\n"
            "This model looks to have been saved with a future version of Turi Create.\n"
            "Please upgrade Turi Create before attempting to load this model file."
        )
    elif archive_version == 1:
        name = saved_state["model_name"]
        if name in MODEL_NAME_MAP:
            cls = MODEL_NAME_MAP[name]
            if "model" in saved_state:
                if name in [
                        "activity_classifier",
                        "object_detector",
                        "style_transfer",
                        "drawing_classifier",
                ]:
                    import turicreate.toolkits.libtctensorflow
                # this is a native model
                return cls(saved_state["model"])
            else:
                # this is a CustomModel
                model_data = saved_state["side_data"]
                model_version = model_data["model_version"]
                del model_data["model_version"]

                if name == "activity_classifier":
                    import turicreate.toolkits.libtctensorflow

                    model = _extensions.activity_classifier()
                    model.import_from_custom_model(model_data, model_version)
                    return cls(model)

                if name == "object_detector":
                    import turicreate.toolkits.libtctensorflow

                    model = _extensions.object_detector()
                    model.import_from_custom_model(model_data, model_version)
                    return cls(model)

                if name == "style_transfer":
                    import turicreate.toolkits.libtctensorflow

                    model = _extensions.style_transfer()
                    model.import_from_custom_model(model_data, model_version)
                    return cls(model)

                if name == "drawing_classifier":
                    import turicreate.toolkits.libtctensorflow

                    model = _extensions.drawing_classifier()
                    model.import_from_custom_model(model_data, model_version)
                    return cls(model)

                if name == "one_shot_object_detector":
                    import turicreate.toolkits.libtctensorflow

                    od_cls = MODEL_NAME_MAP["object_detector"]
                    if "detector_model" in model_data["detector"]:
                        model_data["detector"] = od_cls(
                            model_data["detector"]["detector_model"])
                    else:
                        model = _extensions.object_detector()
                        model.import_from_custom_model(
                            model_data["detector"],
                            model_data["_detector_version"])
                        model_data["detector"] = od_cls(model)
                    return cls(model_data)

                return cls._load_version(model_data, model_version)

        elif hasattr(_extensions, name):
            return saved_state["model"]
        else:
            raise ToolkitError(
                "Unable to load model of name '%s'; model name not registered."
                % name)
    else:
        # Very old legacy model format; attempt pickle loading.
        import sys

        sys.stderr.write(
            "This model was saved in a legacy model format. Compatibility cannot be guaranteed in future versions.\n"
        )
        if _six.PY3:
            raise ToolkitError(
                "Unable to load legacy model in Python 3.\n\n"
                "To migrate a model, try loading it using Turi Create 4.0 or\n"
                "later in Python 2 and then re-save it. The re-saved model should\n"
                "work in Python 3.")

        if "graphlab" not in sys.modules:
            sys.modules["graphlab"] = sys.modules["turicreate"]
            # backward compatibility. Otherwise old pickles will not load
            sys.modules["turicreate_util"] = sys.modules["turicreate.util"]
            sys.modules["graphlab_util"] = sys.modules["turicreate.util"]

            # More backwards compatibility with the turicreate namespace code.
            for k, v in list(sys.modules.items()):
                if "turicreate" in k:
                    sys.modules[k.replace("turicreate", "graphlab")] = v
        # legacy loader
        import pickle

        model_wrapper = pickle.loads(saved_state[b"model_wrapper"])
        return model_wrapper(saved_state[b"model_base"])
Example 7
def create(dataset,
           target,
           features=None,
           max_iterations=10,
           validation_set='auto',
           verbose=True,
           random_seed=None,
           metric='auto',
           **kwargs):
    """
    Create a :class:`~turicreate.random_forest_regression.RandomForestRegression` to predict
    a scalar target variable using one or more features. In addition to standard
    numeric and categorical types, features can also be extracted automatically
    from list- or dictionary-type SFrame columns.


    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.
        The target column must be of numeric type (int or float).

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, using all columns.

    max_iterations : int, optional
        The number of iterations to perform.

    max_depth : float, optional
        Maximum depth of a tree.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, optional
        The fraction of rows sampled from the training set in each iteration
        of tree construction. This is known as the bagging trick and can help
        prevent overfitting. Setting it to 0.5 means the model randomly
        samples half of the examples (rows) to grow each tree.

    column_subsample : float, optional
        Subsample ratio of the columns in each iteration of tree
        construction. Like row_subsample, this can also help prevent
        overfitting. Setting it to 0.5 means the model randomly samples
        half of the columns to grow each tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. This is computed once per full iteration. Large
        differences in model accuracy between the training data and validation
        data is indicative of overfitting. The default value is 'auto'.


    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, so that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'rmse', 'max_error'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must be
            trained on exactly the same training data as the checkpointed model.

    Returns
    -------
      out : RandomForestRegression
          A trained random forest model for regression tasks.

    References
    ----------
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    RandomForestRegression, turicreate.linear_regression.LinearRegression, turicreate.regression.create

    Examples
    --------

    Setup the data:

    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = turicreate.SFrame.read_csv(url)
    >>> data['label'] = data['label'] == 'p'

    Split the data into training and test data:

    >>> train, test = data.random_split(0.8)

    Create the model:

    >>> model = turicreate.random_forest_regression.create(train, target='label')

    Make predictions and evaluate the model:

    >>> predictions = model.predict(test)
    >>> results = model.evaluate(test)

    """
    if random_seed is not None:
        kwargs['random_seed'] = random_seed
    if 'model_checkpoint_path' in kwargs:
        kwargs['model_checkpoint_path'] = _make_internal_url(
            kwargs['model_checkpoint_path'])
    if 'resume_from_checkpoint' in kwargs:
        kwargs['resume_from_checkpoint'] = _make_internal_url(
            kwargs['resume_from_checkpoint'])
    if 'num_trees' in kwargs:
        logger = _logging.getLogger(__name__)
        logger.warning(
            "The `num_trees` keyword argument is deprecated. Please "
            "use the `max_iterations` argument instead. Any value provided "
            "for `num_trees` will be used in place of `max_iterations`.")
        max_iterations = kwargs['num_trees']
        del kwargs['num_trees']

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name='random_forest_regression',
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return RandomForestRegression(model.__proxy__)
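Checkpointed training can be picked up again with resume_from_checkpoint. A
hedged sketch; the checkpoint path is illustrative, and per the docstring the
resumed run must see exactly the same training data:

import turicreate as tc

data = tc.SFrame.read_csv('https://static.turi.com/datasets/xgboost/mushroom.csv')
data['label'] = data['label'] == 'p'
# A fixed seed keeps the split reproducible so the resumed run sees the same rows.
train, _ = data.random_split(0.8, seed=1)

model = tc.random_forest_regression.create(
    train, target='label',
    max_iterations=20,
    resume_from_checkpoint='/tmp/model_tmp/model_checkpoint_5')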
Example 8
def create(dataset,
           target,
           features=None,
           max_iterations=10,
           validation_set="auto",
           class_weights=None,
           max_depth=6,
           step_size=0.3,
           min_loss_reduction=0.0,
           min_child_weight=0.1,
           row_subsample=1.0,
           column_subsample=1.0,
           verbose=True,
           random_seed=None,
           metric="auto",
           **kwargs):
    """
    Create a (binary or multi-class) classifier model of type
    :class:`~turicreate.boosted_trees_classifier.BoostedTreesClassifier` using
    gradient boosted trees (sometimes known as GBMs).

    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.  String target variables are
        automatically mapped to integers in alphabetical order of the variable values.
        For example, a target variable with 'cat', 'dog', and 'foosa' as possible
        values is mapped to 0, 1, and 2, respectively.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, which uses all columns in the SFrame ``dataset``
        except the target column.

    max_iterations : int, optional
        The maximum number of iterations for boosting. Each iteration results
        in the creation of an extra tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. This is computed once per full iteration. Large
        differences in model accuracy between the training data and validation
        data is indicative of overfitting. The default value is 'auto'.

    class_weights : {dict, `auto`}, optional
        Weights the examples in the training data according to the given class
        weights. If provided, the dictionary must contain a key for each class
        label. The value can be any positive number greater than 1e-20. Weights
        are interpreted as relative to each other. So setting the weights to be
        2.0 for the positive class and 1.0 for the negative class has the same
        effect as setting them to be 20.0 and 10.0, respectively. If set to
        `None`, all classes are taken to have weight 1.0. The `auto` mode sets
        the class weight to be inversely proportional to the number of examples
        in the training data with the given class.

    max_depth : float, optional
        Maximum depth of a tree. Must be at least 1.

    step_size : float, [0,1], optional
        Step size (shrinkage) used in each update to prevent overfitting. It
        shrinks the prediction of each weak learner to make the boosting
        process more conservative. The smaller the step size, the more
        conservative the algorithm will be. A smaller step_size works well
        when `max_iterations` is large.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split a
        node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result in
        more conservative tree learning and help prevent overfitting.
        Formally, this is minimum sum of instance weights (hessians) in each
        node. If the tree learning algorithm results in a leaf node with the
        sum of instance weights less than `min_child_weight`, tree building
        will terminate.

    row_subsample : float, [0,1], optional
        Subsample the ratio of the training set in each iteration of tree
        construction.  This is called the bagging trick and can usually help
        prevent overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the examples (rows) to grow each tree.

    column_subsample : float, [0,1], optional
        Subsample ratio of the columns in each iteration of tree
        construction.  Like row_subsample, this can also help prevent
        model overfitting.  Setting this to a value of 0.5 results in the
        model randomly sampling half of the columns to grow each tree.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, so that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When specified,
        the progress table will display the tracked metric(s) on training and
        validation set.
        Supported metrics are: {'accuracy', 'auc', 'log_loss'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``early_stopping_rounds`` : int, default None
            If the validation metric does not improve after <early_stopping_rounds>,
            stop training and return the best model.
            If multiple metrics are being tracked, the last one is used.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path every n iterations,
            where n is specified by ``model_checkpoint_interval``.
            For instance, if `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``, ``/tmp/model_tmp/model_checkpoint_10``, ... etc.
            Training can be resumed by setting ``resume_from_checkpoint`` to one of these checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified,
            save the model to the given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must be
            trained on exactly the same training data as the checkpointed model.

    Returns
    -------
      out : BoostedTreesClassifier
          A trained gradient boosted trees model for classification tasks.

    References
    ----------

    - `Wikipedia - Gradient tree boosting
      <http://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting>`_
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    BoostedTreesClassifier, turicreate.logistic_classifier.LogisticClassifier, turicreate.svm_classifier.SVMClassifier

    Examples
    --------

    .. sourcecode:: python

      >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
      >>> data = turicreate.SFrame.read_csv(url)

      >>> train, test = data.random_split(0.8)
      >>> model = turicreate.boosted_trees_classifier.create(train, target='label')

      >>> predictions = model.classify(test)
      >>> results = model.evaluate(test)
    """
    if random_seed is not None:
        kwargs["random_seed"] = random_seed
    if "model_checkpoint_path" in kwargs:
        kwargs["model_checkpoint_path"] = _make_internal_url(
            kwargs["model_checkpoint_path"])
    if "resume_from_checkpoint" in kwargs:
        kwargs["resume_from_checkpoint"] = _make_internal_url(
            kwargs["resume_from_checkpoint"])

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name="boosted_trees_classifier",
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       class_weights=class_weights,
                       max_depth=max_depth,
                       step_size=step_size,
                       min_loss_reduction=min_loss_reduction,
                       min_child_weight=min_child_weight,
                       row_subsample=row_subsample,
                       column_subsample=column_subsample,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return BoostedTreesClassifier(model.__proxy__)
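An explicit class_weights dictionary gives finer control than 'auto'. A hedged
sketch, assuming the mushroom labels are 'p' and 'e' as in the docstring
example; the weights themselves are illustrative:

import turicreate as tc

data = tc.SFrame.read_csv('https://static.turi.com/datasets/xgboost/mushroom.csv')
train, test = data.random_split(0.8)

# One key per label value; only the ratio between weights matters, so
# {'p': 2.0, 'e': 1.0} behaves the same as {'p': 20.0, 'e': 10.0}.
model = tc.boosted_trees_classifier.create(
    train, target='label',
    class_weights={'p': 2.0, 'e': 1.0},
    max_iterations=10)
print(model.evaluate(test)['accuracy'])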