def create(dataset, target, model_name, features=None,
           validation_set='auto', distributed='auto',
           verbose=True, seed=None, **kwargs):
    """
    Create a :class:`~turicreate.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. It is normally not called directly;
    call the specific model's ``create`` function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable.

    model_name : string
        Name of the model. It is used as the key into
        ``_turicreate.extensions`` to find the model to instantiate.

    features : list[string], optional
        List of feature column names. Forwarded to ``_validate_data``.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        The format of this SFrame must be the same as the training set.

        - ``'auto'`` (the default): if ``dataset`` has at least 100 rows,
          ``dataset.random_split(.95, seed=seed)`` is used to hold out a
          validation set; otherwise an empty SFrame is used.
        - ``None``: an empty SFrame is passed to training.

    distributed : env
        The distributed environment. Accepted for interface compatibility;
        this function's body does not read it.

    verbose : boolean
        Whether to print messages during training. Controls the
        validation-split notification and is handed to ``QuietProgress``
        around the training call.

    seed : int, optional
        Seed used for the automatic train/validation split. It is not added
        to the training options by this function.

    kwargs : dict
        Additional model-specific options. Keys are lower-cased and the
        resulting dict is passed to the model's ``train`` method.

    Returns
    -------
    out : SupervisedLearningModel
        Wrapper around the trained model instance and ``model_name``.

    Raises
    ------
    TypeError
        If ``validation_set`` is a string other than ``'auto'``.
    KeyError
        If ``model_name`` is not present in ``_turicreate.extensions``.
    """
    # Perform error-checking and trim inputs to specified columns
    dataset, validation_set = _validate_data(dataset, target, features,
                                             validation_set)

    # Sample a validation set from the training data if requested
    if isinstance(validation_set, str):
        # Explicit raise rather than ``assert``: assertions are removed under
        # ``python -O`` and an arbitrary string would be treated as 'auto'.
        if validation_set != 'auto':
            raise TypeError('Unrecognized value for validation_set.')
        if dataset.num_rows() >= 100:
            if verbose:
                print_validation_track_notification()
            dataset, validation_set = dataset.random_split(.95, seed=seed)
        else:
            validation_set = _turicreate.SFrame()
    elif validation_set is None:
        validation_set = _turicreate.SFrame()

    # Sanitize model-specific options (option names are case-insensitive)
    options = {key.lower(): value for key, value in kwargs.items()}

    # Create a model instance and train it
    model = _turicreate.extensions.__dict__[model_name]()
    with QuietProgress(verbose):
        model.train(dataset, target, validation_set, options)

    return SupervisedLearningModel(model, model_name)
def create_classification_with_model_selector(dataset, target, model_selector,
                                              features=None,
                                              validation_set='auto',
                                              verbose=True):
    """
    Train every classifier proposed by ``model_selector`` and return the one
    with the best accuracy.

    This is a generic function that is normally not called directly; call a
    specific model's ``create`` function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The number of
        unique values in this column is passed to ``model_selector``.

    model_selector : function
        Called as ``model_selector(num_classes, features_sframe)`` where
        ``features_sframe`` is the validated dataset, randomly down-sampled
        (fixed seed) when it has more than 1e5 rows. It must return a sized
        iterable of model names; the names recognised for display are
        'boosted_trees_classifier', 'random_forest_classifier',
        'decision_tree_classifier', 'classifier_logistic_regression' and
        'classifier_svm'.

    features : list[string], optional
        List of feature column names. Forwarded to ``_validate_data`` and to
        each model's creation call.

    validation_set : SFrame, optional
        A dataset for monitoring generalization performance. With ``'auto'``
        (the default), ``dataset.random_split(.95)`` holds out a validation
        set when ``dataset`` has at least 100 rows; with fewer rows no
        validation set is used (``None``).

    verbose : boolean
        Whether to print progress messages during training and selection.

    Returns
    -------
    out
        The trained model (as returned by ``create_selected``) whose metric
        was the highest. Ties are resolved in favour of the model listed
        first by ``model_selector``.

    Raises
    ------
    TypeError
        If ``validation_set`` is a string other than ``'auto'``.
    ValueError
        If a trained model exposes none of the fields used for selection
        ('validation_accuracy', 'training_accuracy', 'progress').
    """
    # Perform error-checking and trim inputs to specified columns
    dataset, validation_set = _validate_data(dataset, target, features,
                                             validation_set)

    # Sample the data shown to the selector so it stays cheap on large inputs
    features_sframe = dataset
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed=0)

    # Get available models for this dataset
    num_classes = len(dataset[target].unique())
    selected_model_names = model_selector(num_classes, features_sframe)

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Match C++ model names with user model names
    python_names = {
        'boosted_trees_classifier': 'BoostedTreesClassifier',
        'random_forest_classifier': 'RandomForestClassifier',
        'decision_tree_classifier': 'DecisionTreeClassifier',
        'classifier_logistic_regression': 'LogisticClassifier',
        'classifier_svm': 'SVMClassifier',
    }

    # Print useful user-facing progress messages
    if verbose:
        print('PROGRESS: The following methods are available for this type of problem.')
        print('PROGRESS: ' + ', '.join([python_names[x] for x in selected_model_names]))
        if len(selected_model_names) > 1:
            print('PROGRESS: The returned model will be chosen according to validation accuracy.')

    models = {}
    metrics = {}
    for model_name in selected_model_names:
        # Fit each of the available models
        m = create_selected(model_name, dataset, target, features,
                            validation_set, verbose)
        models[model_name] = m

        # Pick the selection metric: prefer validation accuracy, then
        # training accuracy, then the last row of the progress table.
        fields = m._list_fields()
        if 'validation_accuracy' in fields:
            metrics[model_name] = m.validation_accuracy
        elif 'training_accuracy' in fields:
            metrics[model_name] = m.training_accuracy
        # Most models have this.
        elif 'progress' in fields:
            prog = m.progress
            validation_column = 'Validation Accuracy'
            accuracy_column = 'Training Accuracy'
            if validation_column in prog.column_names():
                metrics[model_name] = float(prog[validation_column].tail(1)[0])
            else:
                metrics[model_name] = float(prog[accuracy_column].tail(1)[0])
        else:
            raise ValueError("Model does not have metrics that can be used for model selection.")

    # Choose the model with the highest metric. A metric of None never
    # displaces an existing candidate and is never compared with ``<``
    # (which would raise TypeError on Python 3); the first model is taken
    # as the initial candidate even if its metric is None.
    best_model = None
    best_acc = None
    for model_name in selected_model_names:
        acc = metrics[model_name]
        if best_acc is None or (acc is not None and best_acc < acc):
            best_model = model_name
            best_acc = acc

    # Build the selection summary (only meaningful with several candidates)
    ret = []
    width = 32
    if len(selected_model_names) > 1:
        ret.append('PROGRESS: Model selection based on validation accuracy:')
        ret.append('---------------------------------------------')
        key_str = '{:<{}}: {}'
        for model_name in selected_model_names:
            name = python_names[model_name]
            row = key_str.format(name, width, str(metrics[model_name]))
            ret.append(row)
        ret.append('---------------------------------------------')
        ret.append('Selecting ' + python_names[best_model] + ' based on validation set performance.')

    if verbose:
        print('\nPROGRESS: '.join(ret))
    return models[best_model]
def create(dataset, target, features=None, validation_set="auto", verbose=True):
    """
    Automatically create a suitable regression model based on the provided
    training data.

    To use specific options of a desired model, use the ``create`` function
    of the corresponding model.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type (int/float).

    features : list[string], optional
        Names of the columns containing features. 'None' (the default)
        indicates that all columns except the target variable should be used
        as features.

        The features are columns in the input SFrame that can be of the
        following types:

        - *Numeric*: values of numeric type integer or float.

        - *Categorical*: values of type string.

        - *Array*: list of numeric (integer or float) values. Each list
          element is treated as a separate feature in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float)
          values. Each key of a dictionary is treated as a separate feature
          and the value in the dictionary corresponds to the value of the
          feature. Dictionaries are ideal for representing sparse data.

        Columns of type *list* are not supported. Convert such feature
        columns to type array if all entries in the list are of numeric
        types. If the lists contain data of mixed types, separate them out
        into different columns.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance. The
        format of this SFrame must be the same as the training set. The
        default value is 'auto', which requests an automatically sampled
        validation set (the sampling is not performed in this function body;
        the value returned by ``_validate_data`` is forwarded as-is). If
        ``None``, an empty SFrame is forwarded and no additional metrics are
        expected to be computed.

    verbose : boolean, optional
        Intended to toggle progress information during training.
        NOTE: this function's body does not currently read this flag.

    Returns
    -------
    out : A trained regression model.

    See Also
    --------
    turicreate.linear_regression.LinearRegression,
    turicreate.boosted_trees_regression.BoostedTreesRegression

    Examples
    --------
    .. sourcecode:: python

      # Setup the data
      >>> import turicreate as tc
      >>> data = tc.SFrame('https://static.turi.com/datasets/regression/houses.csv')

      # Selects the best model based on your data.
      >>> model = tc.regression.create(data, target='price',
      ...                              features=['bath', 'bedroom', 'size'])

      # Make predictions and evaluate results.
      >>> predictions = model.predict(data)
      >>> results = model.evaluate(data)
    """
    # Validate the inputs and restrict the data to the requested columns.
    training_data, holdout = _validate_data(
        dataset, target, features, validation_set
    )

    # The extension call needs an SFrame, so "no validation" is an empty one.
    holdout = _turicreate.SFrame() if holdout is None else holdout

    # Model choice and training happen in the extension; no options are set.
    build_model = _turicreate.extensions.create_automatic_regression_model
    proxy = build_model(training_data, target, holdout, {})

    return _sl.wrap_model_proxy(proxy)