コード例 #1
0
def create(dataset, target, model_name, features=None,
           validation_set='auto', distributed='auto',
           verbose=True, seed=None, **kwargs):
    """
    Create a :class:`~turicreate.toolkits.SupervisedLearningModel`.

    This is a generic function that allows you to create any model that
    implements SupervisedLearningModel. This function is normally not called
    directly; call the specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_name : string
        Name of the model. It is used as the key to look the model class up
        in ``turicreate.extensions``.

    features : list[string], optional
        List of feature names used by feature column

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        For each row of the progress table, the chosen metrics are computed
        for both the provided training dataset and the validation_set. The
        format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    distributed : env
        The distributed environment. This function does not read the value.

    verbose : boolean
        Whether to print out messages during training.

    seed : int, optional
        Seed for random number generation. Set this value to ensure that the
        same model is created every time.

    kwargs : dict
        Additional parameter options that can be passed. Option names are
        lower-cased before they are handed to the model's ``train`` call.

    Returns
    -------
    out : SupervisedLearningModel
        Wrapper around the trained model.

    Raises
    ------
    TypeError
        If ``validation_set`` is a string other than 'auto'.
    """

    # Perform error-checking and trim inputs to specified columns
    dataset, validation_set = _validate_data(dataset, target, features,
                                             validation_set)

    # Sample a validation set from the training data if requested
    if isinstance(validation_set, str):
        # Raise explicitly instead of asserting: assert statements are
        # stripped when Python runs with -O, which would let a bad string
        # silently fall through to the sampling logic below.
        if validation_set != 'auto':
            raise TypeError('Unrecognized value for validation_set.')
        if dataset.num_rows() >= 100:
            if verbose:
                print_validation_track_notification()
            dataset, validation_set = dataset.random_split(.95, seed=seed)
        else:
            # Fewer than 100 rows: train without a validation set.
            validation_set = _turicreate.SFrame()
    elif validation_set is None:
        # The train call below is always handed an SFrame, so "no validation
        # set" is represented by an empty one.
        validation_set = _turicreate.SFrame()

    # Sanitize model-specific options
    options = {name.lower(): value for name, value in kwargs.items()}

    # Create a model instance and train it
    model = _turicreate.extensions.__dict__[model_name]()
    with QuietProgress(verbose):
        model.train(dataset, target, validation_set, options)

    return SupervisedLearningModel(model, model_name)
コード例 #2
0
def create_classification_with_model_selector(dataset, target, model_selector,
    features=None, validation_set='auto', verbose=True):
    """
    Train every classifier proposed by ``model_selector`` and return the one
    with the best accuracy.

    This is a generic function; it is normally not called directly. Call a
    specific model's create function instead.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : string
        Name of the column containing the target variable. The values in this
        column must be 0 or 1, of integer type.

    model_selector : function
        Provide a model selector. It is called as
        ``model_selector(num_classes, features_sframe)`` and must return the
        names of the models to train.

    features : list[string], optional
        List of feature names used by feature column

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        With the default 'auto', 5% of ``dataset`` is split off as the
        validation set when ``dataset`` has at least 100 rows; otherwise no
        validation set is used. Any other string is rejected.

    verbose : boolean
        Whether to print out messages during training.

    Returns
    -------
    out : The trained model with the highest accuracy metric. Validation
        accuracy is used when the model reports it, training accuracy
        otherwise.

    Raises
    ------
    TypeError
        If ``validation_set`` is a string other than 'auto'.
    ValueError
        If ``model_selector`` returns no models, or if a trained model
        exposes no accuracy metric that can be used for model selection.
    """

    # Perform error-checking and trim inputs to specified columns
    dataset, validation_set = _validate_data(dataset, target, features,
                                             validation_set)

    # Sample the data: the selector is handed roughly 1e5 rows at most.
    features_sframe = dataset
    if features_sframe.num_rows() > 1e5:
        fraction = 1.0 * 1e5 / features_sframe.num_rows()
        features_sframe = features_sframe.sample(fraction, seed = 0)

    # Get available models for this dataset
    num_classes = len(dataset[target].unique())
    selected_model_names = model_selector(num_classes, features_sframe)

    # Fail early with a clear message; otherwise the function would only
    # break at the very end with an opaque ``KeyError: None``.
    if len(selected_model_names) == 0:
        raise ValueError('The model selector did not return any models for this dataset.')

    # Create a validation set
    if isinstance(validation_set, str):
        if validation_set == 'auto':
            if dataset.num_rows() >= 100:
                if verbose:
                    print_validation_track_notification()
                dataset, validation_set = dataset.random_split(.95)
            else:
                validation_set = None
        else:
            raise TypeError('Unrecognized value for validation_set.')

    # Match C++ model names with user model names
    python_names = {'boosted_trees_classifier': 'BoostedTreesClassifier',
                    'random_forest_classifier': 'RandomForestClassifier',
                    'decision_tree_classifier': 'DecisionTreeClassifier',
                    'classifier_logistic_regression': 'LogisticClassifier',
                    'classifier_svm': 'SVMClassifier'}

    # Print useful user-facing progress messages
    if verbose:
        print('PROGRESS: The following methods are available for this type of problem.')
        print('PROGRESS: ' + ', '.join([python_names[x] for x in selected_model_names]))
        if len(selected_model_names) > 1:
            print('PROGRESS: The returned model will be chosen according to validation accuracy.')

    models = {}
    metrics = {}
    for model_name in selected_model_names:

        # Fit each of the available models
        m = create_selected(model_name, dataset, target, features, validation_set, verbose)
        models[model_name] = m

        # Look for an accuracy figure, preferring validation over training.
        fields = m._list_fields()
        if 'validation_accuracy' in fields:
            metrics[model_name] = m.validation_accuracy
        elif 'training_accuracy' in fields:
            metrics[model_name] = m.training_accuracy

        # Most models have this: read the last row of the progress table.
        elif 'progress' in fields:
            prog = m.progress
            validation_column = 'Validation Accuracy'
            accuracy_column = 'Training Accuracy'
            if validation_column in prog.column_names():
                metrics[model_name] = float(prog[validation_column].tail(1)[0])
            else:
                metrics[model_name] = float(prog[accuracy_column].tail(1)[0])
        else:
            raise ValueError("Model does not have metrics that can be used for model selection.")

    # Choose the model with the highest metric; ties keep the earlier model.
    # While best_acc is still None the current model is taken unconditionally.
    best_model = None
    best_acc = None
    for model_name in selected_model_names:
        candidate_acc = metrics[model_name]
        if best_acc is None or best_acc < candidate_acc:
            best_model = model_name
            best_acc = candidate_acc

    # Report the comparison only when there was an actual choice to make, so
    # a single-model run does not emit a stray blank line.
    if verbose and len(selected_model_names) > 1:
        width = 32
        key_str = '{:<{}}: {}'
        ret = ['PROGRESS: Model selection based on validation accuracy:',
               '---------------------------------------------']
        for model_name in selected_model_names:
            name = python_names[model_name]
            ret.append(key_str.format(name, width, str(metrics[model_name])))
        ret.append('---------------------------------------------')
        ret.append('Selecting ' + python_names[best_model] + ' based on validation set performance.')
        print('\nPROGRESS: '.join(ret))

    return models[best_model]
コード例 #3
0
def create(dataset, target, features=None, validation_set="auto", verbose=True):
    """
    Automatically create a suitable regression model based on the provided
    training data.

    To use specific options of a desired model, use the ``create`` function
    of the corresponding model.

    Parameters
    ----------
    dataset : SFrame
        Dataset for training the model.

    target : str
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type (int/float).

    features : list[string], optional
        Names of the columns containing features. 'None' (the default) indicates
        that all columns except the target variable should be used as features.

        The features are columns in the input SFrame that can be of the
        following types:

        - *Numeric*: values of numeric type integer or float.

        - *Categorical*: values of type string.

        - *Array*: list of numeric (integer or float) values. Each list element
          is treated as a separate feature in the model.

        - *Dictionary*: key-value pairs with numeric (integer or float) values
          Each key of a dictionary is treated as a separate feature and the
          value in the dictionary corresponds to the value of the feature.
          Dictionaries are ideal for representing sparse data.

        Columns of type *list* are not supported. Convert such feature
        columns to type array if all entries in the list are of numeric
        types. If the lists contain data of mixed types, separate
        them out into different columns.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.  For
        each row of the progress table, the chosen metrics are computed for
        both the provided training dataset and the validation_set. The format
        of this SFrame must be the same as the training set.  By default this
        argument is set to 'auto' and a validation set is automatically sampled
        and used for progress printing. If validation_set is set to None, then
        no additional metrics are computed. The default value is 'auto'.

    verbose : boolean, optional
        If True, print progress information during training.

    Returns
    -------
      out : A trained regression model.

    See Also
    --------
    turicreate.linear_regression.LinearRegression,
    turicreate.boosted_trees_regression.BoostedTreesRegression

    Examples
    --------
    .. sourcecode:: python

      # Setup the data
      >>> import turicreate as tc
      >>> data =  tc.SFrame('https://static.turi.com/datasets/regression/houses.csv')

      # Selects the best model based on your data.
      >>> model = tc.regression.create(data, target='price',
      ...                                  features=['bath', 'bedroom', 'size'])

      # Make predictions and evaluate results.
      >>> predictions = model.predict(data)
      >>> results = model.evaluate(data)

    """

    # Perform error-checking and trim inputs to specified columns
    dataset, validation_set = _validate_data(dataset, target, features, validation_set)

    # The extension call below is always given a validation_set argument, so
    # "no validation set" is represented by an empty SFrame.
    if validation_set is None:
        validation_set = _turicreate.SFrame()

    # Honor ``verbose``: it was previously accepted but never read. The
    # training call is wrapped in the same QuietProgress context manager the
    # generic supervised-learning create uses around ``model.train``.
    # NOTE(review): presumably QuietProgress silences progress output when
    # verbose is False - confirm it is imported in this module.
    with QuietProgress(verbose):
        model_proxy = _turicreate.extensions.create_automatic_regression_model(
            dataset, target, validation_set, {}
        )

    return _sl.wrap_model_proxy(model_proxy)