def save(self, location):
    """
    Save the model. The model is saved as a directory which can then be
    loaded using the :py:func:`~turicreate.load_model` method.

    Parameters
    ----------
    location : string
        Target destination for the model. Can be a local path or remote URL.

    See Also
    --------
    turicreate.load_model

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = tc.load_model('my_model_file')
    """
    import copy

    state = copy.copy(self._get_native_state())
    state["model_version"] = self._get_version()
    return glconnect.get_unity().save_model2(
        self.__class__._native_name(), _make_internal_url(location), state)
def save(self, location):
    """
    Save the model. The model is saved as a directory which can then be
    loaded using the :py:func:`~turicreate.load_model` method.

    Parameters
    ----------
    location : string
        Target destination for the model. Can be a local path or remote URL.

    See Also
    --------
    turicreate.load_model

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = turicreate.load_model('my_model_file')
    """
    return glconnect.get_unity().save_model(self, _make_internal_url(location))
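
# Illustrative sketch (not part of the library source): a minimal save/load
# round trip using only ``model.save`` and ``turicreate.load_model`` as
# documented above. The model object and the temporary path are assumptions
# made for this example.
def _example_save_load_round_trip(model):
    import os
    import tempfile

    import turicreate as tc

    # Models are saved as directories, so any writable path works and no file
    # extension is needed.
    path = os.path.join(tempfile.mkdtemp(), "my_model")
    model.save(path)

    # Reload the model from the same location.
    return tc.load_model(path)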
def create(dataset, target, features=None, max_iterations=10,
           validation_set="auto", max_depth=6, step_size=0.3,
           min_loss_reduction=0.0, min_child_weight=0.1, row_subsample=1.0,
           column_subsample=1.0, verbose=True, random_seed=None,
           metric="auto", **kwargs):
    """
    Create a :class:`~turicreate.boosted_trees_regression.BoostedTreesRegression`
    to predict a scalar target variable using one or more features. In
    addition to standard numeric and categorical types, features can also be
    extracted automatically from list- or dictionary-type SFrame columns.

    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.
        Only a numerically typed (int, float) target column is allowed.

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, using all columns.

    max_iterations : int, optional
        The number of iterations for boosting. It is also the number of trees
        in the model.

    validation_set : SFrame, optional
        The validation set that is used to watch the validation result as
        boosting progresses.

    max_depth : float, optional
        Maximum depth of a tree. Must be at least 1.

    step_size : float, [0,1], optional
        Step size (shrinkage) used in each update to prevent overfitting. It
        shrinks the prediction of each weak learner to make the boosting
        process more conservative. The smaller the step size, the more
        conservative the algorithm will be. A smaller step_size is usually
        used together with a larger max_iterations.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split on
        a node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result
        in more conservative tree learning and help prevent overfitting.
        Formally, this is the minimum sum of instance weights (hessians) in
        each node. If the tree learning algorithm results in a leaf node with
        the sum of instance weights less than `min_child_weight`, tree
        building will terminate.

    row_subsample : float, [0,1], optional
        Subsample ratio of the training set in each iteration of tree
        construction. This is called the bagging trick and usually can help
        prevent overfitting. Setting it to 0.5 means that the model randomly
        samples half of the examples (rows) to grow each tree.

    column_subsample : float, [0,1], optional
        Subsample ratio of the columns in each iteration of tree
        construction. Like row_subsample, this also usually can help prevent
        overfitting. Setting it to 0.5 means that the model randomly samples
        half of the columns to grow each tree.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When
        specified, the progress table will display the tracked metric(s) on
        the training and validation set. Supported metrics are:
        {'rmse', 'max_error'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``early_stopping_rounds`` : int, default None
            If the validation metric does not improve after
            <early_stopping_rounds>, stop training and return the best model.
            If multiple metrics are being tracked, the last one is used.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path
            every n iterations, where n is specified by
            ``model_checkpoint_interval``. For instance, if
            `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``,
            ``/tmp/model_tmp/model_checkpoint_10``, ... etc. Training can be
            resumed by setting ``resume_from_checkpoint`` to one of these
            checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified, save the model to the
            given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must take
            exactly the same training data as the checkpointed model.

    Returns
    -------
    out : BoostedTreesRegression
        A trained gradient boosted trees model

    References
    ----------
    - `Wikipedia - Gradient tree boosting
      <http://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting>`_
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    BoostedTreesRegression, turicreate.linear_regression.LinearRegression,
    turicreate.regression.create

    Examples
    --------

    Setup the data:

    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = turicreate.SFrame.read_csv(url)
    >>> data['label'] = data['label'] == 'p'

    Split the data into training and test data:

    >>> train, test = data.random_split(0.8)

    Create the model:

    >>> model = turicreate.boosted_trees_regression.create(train, target='label')

    Make predictions and evaluate the model:

    >>> predictions = model.predict(test)
    >>> results = model.evaluate(test)
    """
    if random_seed is not None:
        kwargs["random_seed"] = random_seed
    if "model_checkpoint_path" in kwargs:
        kwargs["model_checkpoint_path"] = _make_internal_url(
            kwargs["model_checkpoint_path"])
    if "resume_from_checkpoint" in kwargs:
        kwargs["resume_from_checkpoint"] = _make_internal_url(
            kwargs["resume_from_checkpoint"])

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name="boosted_trees_regression",
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       max_depth=max_depth,
                       step_size=step_size,
                       min_loss_reduction=min_loss_reduction,
                       min_child_weight=min_child_weight,
                       row_subsample=row_subsample,
                       column_subsample=column_subsample,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return BoostedTreesRegression(model.__proxy__)
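
# Illustrative sketch (not part of the library source): training with the
# optional ``early_stopping_rounds`` and checkpointing keyword arguments
# documented above. The SFrames ``train`` and ``valid``, the 'label' target,
# and the checkpoint directory are assumptions made for this example.
def _example_boosted_regression_with_early_stopping(train, valid):
    import turicreate as tc

    return tc.boosted_trees_regression.create(
        train,
        target='label',
        max_iterations=200,
        validation_set=valid,
        metric='rmse',
        # Stop when validation RMSE has not improved for 10 rounds and return
        # the best model seen so far.
        early_stopping_rounds=10,
        # Checkpoint every 25 iterations so training can later be resumed via
        # ``resume_from_checkpoint``.
        model_checkpoint_path='/tmp/model_tmp',
        model_checkpoint_interval=25)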
def load_model(location):
    """
    Load any Turi Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved
    in Turi Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = tc.load_model('my_model_file')
    """
    # Check if the location is a dir_archive; if not, use GLUnpickler to load
    # it as a pure Python model.
    # If the location is an http location, skip the check and directly
    # proceed to load the model as a dir_archive. This is because
    # 1) exists() does not work with the http protocol, and
    # 2) GLUnpickler does not support http.
    protocol = file_util.get_protocol(location)
    dir_archive_exists = False
    if protocol == '':
        model_path = file_util.expand_full_path(location)
        dir_archive_exists = file_util.exists(
            os.path.join(model_path, 'dir_archive.ini'))
    else:
        model_path = location
        if protocol in ['http', 'https']:
            dir_archive_exists = True
        else:
            import posixpath
            dir_archive_exists = file_util.exists(
                posixpath.join(model_path, 'dir_archive.ini'))
    if not dir_archive_exists:
        raise IOError("Directory %s does not exist" % location)

    _internal_url = _make_internal_url(location)
    saved_state = glconnect.get_unity().load_model(_internal_url)
    # The archive version key could be either bytes or unicode.
    key = u'archive_version'
    archive_version = saved_state[key] if key in saved_state else saved_state[
        key.encode()]
    if archive_version < 0:
        raise ToolkitError("File does not appear to be a Turi Create model.")
    elif archive_version > 1:
        raise ToolkitError(
            "Unable to load model.\n\n"
            "This model looks to have been saved with a future version of Turi Create.\n"
            "Please upgrade Turi Create before attempting to load this model file."
        )
    elif archive_version == 1:
        cls = MODEL_NAME_MAP[saved_state['model_name']]
        if 'model' in saved_state:
            # This is a native model.
            return cls(saved_state['model'])
        else:
            # This is a CustomModel.
            model_data = saved_state['side_data']
            model_version = model_data['model_version']
            del model_data['model_version']
            return cls._load_version(model_data, model_version)
    else:
        # Very legacy model format. Attempt pickle loading.
        import sys
        sys.stderr.write(
            "This model was saved in a legacy model format. Compatibility cannot be guaranteed in future versions.\n"
        )
        if _six.PY3:
            raise ToolkitError(
                "Unable to load legacy model in Python 3.\n\n"
                "To migrate a model, try loading it using Turi Create 4.0 or\n"
                "later in Python 2 and then re-save it. The re-saved model should\n"
                "work in Python 3.")

        if 'graphlab' not in sys.modules:
            sys.modules['graphlab'] = sys.modules['turicreate']
            # Backward compatibility; otherwise old pickles will not load.
            sys.modules["turicreate_util"] = sys.modules['turicreate.util']
            sys.modules["graphlab_util"] = sys.modules['turicreate.util']

            # More backwards compatibility with the turicreate namespace code.
            for k, v in list(sys.modules.items()):
                if 'turicreate' in k:
                    sys.modules[k.replace('turicreate', 'graphlab')] = v

        # Legacy loader.
        import pickle
        model_wrapper = pickle.loads(saved_state[b'model_wrapper'])
        return model_wrapper(saved_state[b'model_base'])
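
# Illustrative sketch (not part of the library source): loading a previously
# saved model and handling the errors raised above for missing directories or
# incompatible archives. The path and the import location of ToolkitError are
# assumptions made for this example.
def _example_load_saved_model(path='my_model_file'):
    import turicreate as tc
    # Assumed import path; ToolkitError may live elsewhere in other releases.
    from turicreate.toolkits._main import ToolkitError

    try:
        return tc.load_model(path)
    except (IOError, ToolkitError) as e:
        # IOError: the directory does not exist. ToolkitError: the archive is
        # not a Turi Create model or was written by a newer release.
        print("Could not load model: %s" % e)
        return None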
def create(dataset, target, features=None, max_iterations=10,
           validation_set='auto', verbose=True, class_weights=None,
           random_seed=None, metric='auto', **kwargs):
    """
    Create a (binary or multi-class) classifier model of type
    :class:`~turicreate.random_forest_classifier.RandomForestClassifier` using
    an ensemble of decision trees trained on subsets of the data.

    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. String target variables are
        automatically mapped to integers in alphabetical order of the
        variable values. For example, a target variable with 'cat', 'dog',
        and 'foosa' as possible values is mapped to 0, 1, and 2 respectively.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, which uses all columns in the SFrame ``dataset``
        excepting the target column.

    max_iterations : int, optional
        The maximum number of iterations to perform. For multi-class
        classification with K classes, each iteration will create K-1 trees.

    max_depth : float, optional
        Maximum depth of a tree.

    class_weights : {dict, `auto`}, optional
        Weights the examples in the training data according to the given
        class weights. If set to `None`, all classes are taken to have weight
        one. The `auto` mode sets the class weight to be inversely
        proportional to the number of examples in the training data with the
        given class.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition on a leaf
        node of the tree. The larger it is, the more conservative the
        algorithm will be. Must be non-negative.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result
        in more conservative tree learning and help prevent overfitting.
        Formally, this is the minimum sum of instance weights (hessians) in
        each node. If the tree learning algorithm results in a leaf node with
        the sum of instance weights less than `min_child_weight`, tree
        building will terminate.

    row_subsample : float, optional
        Subsample ratio of the training set in each iteration of tree
        construction. This is called the bagging trick and can usually help
        prevent overfitting. Setting this to a value of 0.5 results in the
        model randomly sampling half of the examples (rows) to grow each
        tree.

    column_subsample : float, optional
        Subsample ratio of the columns in each iteration of tree
        construction. Like row_subsample, this can also help prevent model
        overfitting. Setting this to a value of 0.5 results in the model
        randomly sampling half of the columns to grow each tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance. For
        each row of the progress table, the chosen metrics are computed for
        both the provided training dataset and the validation_set. The format
        of this SFrame must be the same as the training set. By default this
        argument is set to 'auto' and a validation set is automatically
        sampled and used for progress printing. If validation_set is set to
        None, then no additional metrics are computed. This is computed once
        per full iteration. Large differences in model accuracy between the
        training data and validation data are indicative of overfitting. The
        default value is 'auto'.

    verbose : boolean, optional
        Print progress information during training (if set to true).

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When
        specified, the progress table will display the tracked metric(s) on
        the training and validation set. Supported metrics are:
        {'accuracy', 'auc', 'log_loss'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path
            every n iterations, where n is specified by
            ``model_checkpoint_interval``. For instance, if
            `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``,
            ``/tmp/model_tmp/model_checkpoint_10``, ... etc. Training can be
            resumed by setting ``resume_from_checkpoint`` to one of these
            checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified, save the model to the
            given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must take
            exactly the same training data as the checkpointed model.

    Returns
    -------
    out : RandomForestClassifier
        A trained random forest model for classification tasks.

    References
    ----------
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    RandomForestClassifier, turicreate.logistic_classifier.LogisticClassifier,
    turicreate.svm_classifier.SVMClassifier

    Examples
    --------

    .. sourcecode:: python

      >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
      >>> data = turicreate.SFrame.read_csv(url)

      >>> train, test = data.random_split(0.8)
      >>> model = turicreate.random_forest_classifier.create(train, target='label')

      >>> predictions = model.classify(test)
      >>> results = model.evaluate(test)
    """
    if random_seed is not None:
        kwargs['random_seed'] = random_seed
    if 'model_checkpoint_path' in kwargs:
        kwargs['model_checkpoint_path'] = _make_internal_url(
            kwargs['model_checkpoint_path'])
    if 'resume_from_checkpoint' in kwargs:
        kwargs['resume_from_checkpoint'] = _make_internal_url(
            kwargs['resume_from_checkpoint'])
    if 'num_trees' in kwargs:
        logger = _logging.getLogger(__name__)
        logger.warning(
            "The `num_trees` keyword argument is deprecated. Please "
            "use the `max_iterations` argument instead. Any value provided "
            "for `num_trees` will be used in place of `max_iterations`.")
        max_iterations = kwargs['num_trees']
        del kwargs['num_trees']

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name='random_forest_classifier',
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       class_weights=class_weights,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return RandomForestClassifier(model.__proxy__)
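
# Illustrative sketch (not part of the library source): passing explicit
# per-class weights to counter class imbalance. The SFrame ``train``, the
# 'label' target, and the label values 'p' / 'e' (from the mushroom dataset
# used in the example above) are assumptions made for this example.
def _example_random_forest_with_class_weights(train):
    import turicreate as tc

    # Upweight the 'p' class relative to the 'e' class; weights are
    # interpreted relative to each other. Track two metrics during training.
    return tc.random_forest_classifier.create(
        train,
        target='label',
        class_weights={'p': 2.0, 'e': 1.0},
        metric=['accuracy', 'auc'])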
def load_model(location):
    """
    Load any Turi Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved
    in Turi Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = tc.load_model('my_model_file')
    """
    # Check if the location is a dir_archive; if not, use GLUnpickler to load
    # it as a pure Python model.
    # If the location is an http location, skip the check and directly
    # proceed to load the model as a dir_archive. This is because
    # 1) exists() does not work with the http protocol, and
    # 2) GLUnpickler does not support http.
    protocol = file_util.get_protocol(location)
    dir_archive_exists = False
    if protocol == "":
        model_path = file_util.expand_full_path(location)
        dir_archive_exists = file_util.exists(
            os.path.join(model_path, "dir_archive.ini"))
    else:
        model_path = location
        if protocol in ["http", "https", "s3"]:
            dir_archive_exists = True
        else:
            import posixpath
            dir_archive_exists = file_util.exists(
                posixpath.join(model_path, "dir_archive.ini"))
    if not dir_archive_exists:
        raise IOError("Directory %s does not exist" % location)

    _internal_url = _make_internal_url(location)
    saved_state = glconnect.get_unity().load_model(_internal_url)
    saved_state = _wrap_function_return(saved_state)
    # The archive version key could be either bytes or unicode.
    key = u"archive_version"
    archive_version = (saved_state[key]
                       if key in saved_state else saved_state[key.encode()])
    if archive_version < 0:
        raise ToolkitError("File does not appear to be a Turi Create model.")
    elif archive_version > 1:
        raise ToolkitError(
            "Unable to load model.\n\n"
            "This model looks to have been saved with a future version of Turi Create.\n"
            "Please upgrade Turi Create before attempting to load this model file."
        )
    elif archive_version == 1:
        name = saved_state["model_name"]
        if name in MODEL_NAME_MAP:
            cls = MODEL_NAME_MAP[name]

            if "model" in saved_state:
                if name in [
                    "activity_classifier",
                    "object_detector",
                    "style_transfer",
                    "drawing_classifier",
                ]:
                    import turicreate.toolkits.libtctensorflow

                # This is a native model.
                return cls(saved_state["model"])
            else:
                # This is a CustomModel.
                model_data = saved_state["side_data"]
                model_version = model_data["model_version"]
                del model_data["model_version"]

                if name == "activity_classifier":
                    import turicreate.toolkits.libtctensorflow

                    model = _extensions.activity_classifier()
                    model.import_from_custom_model(model_data, model_version)
                    return cls(model)

                if name == "object_detector":
                    import turicreate.toolkits.libtctensorflow

                    model = _extensions.object_detector()
                    model.import_from_custom_model(model_data, model_version)
                    return cls(model)

                if name == "style_transfer":
                    import turicreate.toolkits.libtctensorflow

                    model = _extensions.style_transfer()
                    model.import_from_custom_model(model_data, model_version)
                    return cls(model)

                if name == "drawing_classifier":
                    import turicreate.toolkits.libtctensorflow

                    model = _extensions.drawing_classifier()
                    model.import_from_custom_model(model_data, model_version)
                    return cls(model)

                if name == "one_shot_object_detector":
                    import turicreate.toolkits.libtctensorflow

                    od_cls = MODEL_NAME_MAP["object_detector"]
                    if "detector_model" in model_data["detector"]:
                        model_data["detector"] = od_cls(
                            model_data["detector"]["detector_model"])
                    else:
                        model = _extensions.object_detector()
                        model.import_from_custom_model(
                            model_data["detector"],
                            model_data["_detector_version"])
                        model_data["detector"] = od_cls(model)
                    return cls(model_data)

                return cls._load_version(model_data, model_version)

        elif hasattr(_extensions, name):
            return saved_state["model"]
        else:
            raise ToolkitError(
                "Unable to load model of name '%s'; model name not registered."
                % name)
    else:
        # Very legacy model format. Attempt pickle loading.
        import sys
        sys.stderr.write(
            "This model was saved in a legacy model format. Compatibility cannot be guaranteed in future versions.\n"
        )
        if _six.PY3:
            raise ToolkitError(
                "Unable to load legacy model in Python 3.\n\n"
                "To migrate a model, try loading it using Turi Create 4.0 or\n"
                "later in Python 2 and then re-save it. The re-saved model should\n"
                "work in Python 3.")

        if "graphlab" not in sys.modules:
            sys.modules["graphlab"] = sys.modules["turicreate"]
            # Backward compatibility; otherwise old pickles will not load.
            sys.modules["turicreate_util"] = sys.modules["turicreate.util"]
            sys.modules["graphlab_util"] = sys.modules["turicreate.util"]

            # More backwards compatibility with the turicreate namespace code.
            for k, v in list(sys.modules.items()):
                if "turicreate" in k:
                    sys.modules[k.replace("turicreate", "graphlab")] = v

        # Legacy loader.
        import pickle
        model_wrapper = pickle.loads(saved_state[b"model_wrapper"])
        return model_wrapper(saved_state[b"model_base"])
def create(dataset, target, features=None, max_iterations=10,
           validation_set='auto', verbose=True, random_seed=None,
           metric='auto', **kwargs):
    """
    Create a
    :class:`~turicreate.random_forest_regression.RandomForestRegression` to
    predict a scalar target variable using one or more features. In addition
    to standard numeric and categorical types, features can also be extracted
    automatically from list- or dictionary-type SFrame columns.

    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.
        Only a numerically typed (int, float) target column is allowed.

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, using all columns.

    max_iterations : int, optional
        The number of iterations to perform.

    max_depth : float, optional
        Maximum depth of a tree.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split on
        a node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result
        in more conservative tree learning and help prevent overfitting.
        Formally, this is the minimum sum of instance weights (hessians) in
        each node. If the tree learning algorithm results in a leaf node with
        the sum of instance weights less than `min_child_weight`, tree
        building will terminate.

    row_subsample : float, optional
        Subsample ratio of the training set in each iteration of tree
        construction. This is called the bagging trick and usually can help
        prevent overfitting. Setting it to 0.5 means that the model randomly
        samples half of the examples (rows) to grow each tree.

    column_subsample : float, optional
        Subsample ratio of the columns in each iteration of tree
        construction. Like row_subsample, this also usually can help prevent
        overfitting. Setting it to 0.5 means that the model randomly samples
        half of the columns to grow each tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance. For
        each row of the progress table, the chosen metrics are computed for
        both the provided training dataset and the validation_set. The format
        of this SFrame must be the same as the training set. By default this
        argument is set to 'auto' and a validation set is automatically
        sampled and used for progress printing. If validation_set is set to
        None, then no additional metrics are computed. This is computed once
        per full iteration. Large differences in model accuracy between the
        training data and validation data are indicative of overfitting. The
        default value is 'auto'.

    verbose : boolean, optional
        If True, print progress information during training.

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When
        specified, the progress table will display the tracked metric(s) on
        the training and validation set. Supported metrics are:
        {'rmse', 'max_error'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path
            every n iterations, where n is specified by
            ``model_checkpoint_interval``. For instance, if
            `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``,
            ``/tmp/model_tmp/model_checkpoint_10``, ... etc. Training can be
            resumed by setting ``resume_from_checkpoint`` to one of these
            checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified, save the model to the
            given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must take
            exactly the same training data as the checkpointed model.

    Returns
    -------
    out : RandomForestRegression
        A trained random forest model for regression tasks.

    References
    ----------
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    RandomForestRegression, turicreate.linear_regression.LinearRegression,
    turicreate.regression.create

    Examples
    --------

    Setup the data:

    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> data = turicreate.SFrame.read_csv(url)
    >>> data['label'] = data['label'] == 'p'

    Split the data into training and test data:

    >>> train, test = data.random_split(0.8)

    Create the model:

    >>> model = turicreate.random_forest_regression.create(train, target='label')

    Make predictions and evaluate the model:

    >>> predictions = model.predict(test)
    >>> results = model.evaluate(test)
    """
    if random_seed is not None:
        kwargs['random_seed'] = random_seed
    if 'model_checkpoint_path' in kwargs:
        kwargs['model_checkpoint_path'] = _make_internal_url(
            kwargs['model_checkpoint_path'])
    if 'resume_from_checkpoint' in kwargs:
        kwargs['resume_from_checkpoint'] = _make_internal_url(
            kwargs['resume_from_checkpoint'])
    if 'num_trees' in kwargs:
        logger = _logging.getLogger(__name__)
        logger.warning(
            "The `num_trees` keyword argument is deprecated. Please "
            "use the `max_iterations` argument instead. Any value provided "
            "for `num_trees` will be used in place of `max_iterations`.")
        max_iterations = kwargs['num_trees']
        del kwargs['num_trees']

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name='random_forest_regression',
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return RandomForestRegression(model.__proxy__)
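
# Illustrative sketch (not part of the library source): resuming training of
# a random forest regression model from a checkpoint written by an earlier
# call with ``model_checkpoint_path``. The SFrame ``train``, the 'label'
# target, and the checkpoint directory are assumptions made for this example.
def _example_resume_random_forest_from_checkpoint(train):
    import turicreate as tc

    # First run: checkpoint every 5 iterations (the default interval) under
    # /tmp/rf_tmp, e.g. /tmp/rf_tmp/model_checkpoint_5.
    tc.random_forest_regression.create(
        train, target='label', max_iterations=10,
        model_checkpoint_path='/tmp/rf_tmp')

    # Later run: continue from one of the saved checkpoints. The training
    # data must be exactly the same as for the checkpointed model.
    return tc.random_forest_regression.create(
        train, target='label', max_iterations=20,
        resume_from_checkpoint='/tmp/rf_tmp/model_checkpoint_5')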
def create(dataset, target, features=None, max_iterations=10,
           validation_set="auto", class_weights=None, max_depth=6,
           step_size=0.3, min_loss_reduction=0.0, min_child_weight=0.1,
           row_subsample=1.0, column_subsample=1.0, verbose=True,
           random_seed=None, metric="auto", **kwargs):
    """
    Create a (binary or multi-class) classifier model of type
    :class:`~turicreate.boosted_trees_classifier.BoostedTreesClassifier` using
    gradient boosted trees (sometimes known as GBMs).

    Parameters
    ----------
    dataset : SFrame
        A training dataset containing feature columns and a target column.

    target : str
        Name of the column containing the target variable. The values in this
        column must be of string or integer type. String target variables are
        automatically mapped to integers in alphabetical order of the
        variable values. For example, a target variable with 'cat', 'dog',
        and 'foosa' as possible values is mapped to 0, 1, and 2 respectively.

    features : list[str], optional
        A list of column names of features used for training the model.
        Defaults to None, which uses all columns in the SFrame ``dataset``
        excepting the target column.

    max_iterations : int, optional
        The maximum number of iterations for boosting. Each iteration results
        in the creation of an extra tree.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance. For
        each row of the progress table, the chosen metrics are computed for
        both the provided training dataset and the validation_set. The format
        of this SFrame must be the same as the training set. By default this
        argument is set to 'auto' and a validation set is automatically
        sampled and used for progress printing. If validation_set is set to
        None, then no additional metrics are computed. This is computed once
        per full iteration. Large differences in model accuracy between the
        training data and validation data are indicative of overfitting. The
        default value is 'auto'.

    class_weights : {dict, `auto`}, optional
        Weights the examples in the training data according to the given
        class weights. If provided, the dictionary must contain a key for
        each class label. The value can be any positive number greater than
        1e-20. Weights are interpreted as relative to each other. So setting
        the weights to be 2.0 for the positive class and 1.0 for the negative
        class has the same effect as setting them to be 20.0 and 10.0,
        respectively. If set to `None`, all classes are taken to have weight
        1.0. The `auto` mode sets the class weight to be inversely
        proportional to the number of examples in the training data with the
        given class.

    max_depth : float, optional
        Maximum depth of a tree. Must be at least 1.

    step_size : float, [0,1], optional
        Step size (shrinkage) used in each update to prevent overfitting. It
        shrinks the prediction of each weak learner to make the boosting
        process more conservative. The smaller the step size, the more
        conservative the algorithm will be. A smaller step_size works well
        when `max_iterations` is large.

    min_loss_reduction : float, optional (non-negative)
        Minimum loss reduction required to make a further partition/split on
        a node during the tree learning phase. Larger (more positive) values
        can help prevent overfitting by avoiding splits that do not
        sufficiently reduce the loss function.

    min_child_weight : float, optional (non-negative)
        Controls the minimum weight of each leaf node. Larger values result
        in more conservative tree learning and help prevent overfitting.
        Formally, this is the minimum sum of instance weights (hessians) in
        each node. If the tree learning algorithm results in a leaf node with
        the sum of instance weights less than `min_child_weight`, tree
        building will terminate.

    row_subsample : float, [0,1], optional
        Subsample ratio of the training set in each iteration of tree
        construction. This is called the bagging trick and can usually help
        prevent overfitting. Setting this to a value of 0.5 results in the
        model randomly sampling half of the examples (rows) to grow each
        tree.

    column_subsample : float, [0,1], optional
        Subsample ratio of the columns in each iteration of tree
        construction. Like row_subsample, this can also help prevent model
        overfitting. Setting this to a value of 0.5 results in the model
        randomly sampling half of the columns to grow each tree.

    verbose : boolean, optional
        Print progress information during training (if set to true).

    random_seed : int, optional
        Seeds random operations such as column and row subsampling, such that
        results are reproducible.

    metric : str or list[str], optional
        Performance metric(s) that are tracked during training. When
        specified, the progress table will display the tracked metric(s) on
        the training and validation set. Supported metrics are:
        {'accuracy', 'auc', 'log_loss'}

    kwargs : dict, optional
        Additional arguments for training the model.

        - ``early_stopping_rounds`` : int, default None
            If the validation metric does not improve after
            <early_stopping_rounds>, stop training and return the best model.
            If multiple metrics are being tracked, the last one is used.

        - ``model_checkpoint_path`` : str, default None
            If specified, checkpoint the model training to the given path
            every n iterations, where n is specified by
            ``model_checkpoint_interval``. For instance, if
            `model_checkpoint_interval` is 5, and `model_checkpoint_path` is
            set to ``/tmp/model_tmp``, the checkpoints will be saved into
            ``/tmp/model_tmp/model_checkpoint_5``,
            ``/tmp/model_tmp/model_checkpoint_10``, ... etc. Training can be
            resumed by setting ``resume_from_checkpoint`` to one of these
            checkpoints.

        - ``model_checkpoint_interval`` : int, default 5
            If ``model_checkpoint_path`` is specified, save the model to the
            given path every n iterations.

        - ``resume_from_checkpoint`` : str, default None
            Continues training from a model checkpoint. The model must take
            exactly the same training data as the checkpointed model.

    Returns
    -------
    out : BoostedTreesClassifier
        A trained gradient boosted trees model for classification tasks.

    References
    ----------
    - `Wikipedia - Gradient tree boosting
      <http://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting>`_
    - `Trevor Hastie's slides on Boosted Trees and Random Forest
      <http://jessica2.msri.org/attachments/10778/10778-boost.pdf>`_

    See Also
    --------
    BoostedTreesClassifier, turicreate.logistic_classifier.LogisticClassifier,
    turicreate.svm_classifier.SVMClassifier

    Examples
    --------

    .. sourcecode:: python

      >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
      >>> data = turicreate.SFrame.read_csv(url)

      >>> train, test = data.random_split(0.8)
      >>> model = turicreate.boosted_trees_classifier.create(train, target='label')

      >>> predictions = model.classify(test)
      >>> results = model.evaluate(test)
    """
    if random_seed is not None:
        kwargs["random_seed"] = random_seed
    if "model_checkpoint_path" in kwargs:
        kwargs["model_checkpoint_path"] = _make_internal_url(
            kwargs["model_checkpoint_path"])
    if "resume_from_checkpoint" in kwargs:
        kwargs["resume_from_checkpoint"] = _make_internal_url(
            kwargs["resume_from_checkpoint"])

    model = _sl.create(dataset=dataset,
                       target=target,
                       features=features,
                       model_name="boosted_trees_classifier",
                       max_iterations=max_iterations,
                       validation_set=validation_set,
                       class_weights=class_weights,
                       max_depth=max_depth,
                       step_size=step_size,
                       min_loss_reduction=min_loss_reduction,
                       min_child_weight=min_child_weight,
                       row_subsample=row_subsample,
                       column_subsample=column_subsample,
                       verbose=verbose,
                       metric=metric,
                       **kwargs)
    return BoostedTreesClassifier(model.__proxy__)
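
# Illustrative sketch (not part of the library source): training a boosted
# trees classifier with automatically balanced class weights while tracking
# two metrics. The SFrames ``train`` and ``valid`` and the 'label' target are
# assumptions made for this example.
def _example_boosted_classifier_auto_weights(train, valid):
    import turicreate as tc

    return tc.boosted_trees_classifier.create(
        train,
        target='label',
        validation_set=valid,
        max_iterations=50,
        # 'auto' makes each class weight inversely proportional to its number
        # of training examples, which helps with imbalanced classes.
        class_weights='auto',
        metric=['accuracy', 'log_loss'])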